From 01604cdd966faa1a9460e44d2e0bda5e551b8fcf Mon Sep 17 00:00:00 2001 From: Benni Mack Date: Wed, 11 Dec 2024 13:48:17 +0100 Subject: [PATCH] [BUGFIX] Ensure Indexed Search can handle PPTX and XLSX files Thanks to Xavier Perseguers for providing this fix which properly indexes XLSX and PPTX files. Problem for XLSX was that the wrong "unzipped" file was indexed, a file that contains some kind of metadata which are basically only pointers (integers). The content extracted is plain wrong and useless. Problem for PPTX was that only the content of Slide 1 was extracted, it missed content from all other slides. Resolves: #99527 Releases: main, 13.4 Change-Id: I8293ad8a247b243bec80188471ee2cee3a5151f7 Reviewed-on: https://review.typo3.org/c/Packages/TYPO3.CMS/+/87441 Reviewed-by: Christian Kuhn Tested-by: Garvin Hicking Tested-by: Christian Kuhn Reviewed-by: Garvin Hicking Reviewed-by: Benni Mack Tested-by: Benni Mack Tested-by: core-ci --- Classes/FileContentParser.php | 44 ++++++++++++++++++++++++++++------- 1 file changed, 36 insertions(+), 8 deletions(-) diff --git a/Classes/FileContentParser.php b/Classes/FileContentParser.php index 623938b..0040f91 100644 --- a/Classes/FileContentParser.php +++ b/Classes/FileContentParser.php @@ -522,6 +522,8 @@ public function readFileContent(string $ext, string $absFile, string|int $cPKey) case 'xltx': if ($this->app['unzip']) { $this->setLocaleForServerFileSystem(); + $utf8_content = null; + $cmd = ''; switch ($ext) { case 'docx': case 'dotx': @@ -531,22 +533,23 @@ public function readFileContent(string $ext, string $absFile, string|int $cPKey) case 'ppsx': case 'pptx': case 'potx': - // Read slide1.xml: - $cmd = $this->app['unzip'] . ' -p ' . escapeshellarg($absFile) . ' ppt/slides/slide1.xml'; + $utf8_content = $this->extractPptxContent($absFile); break; case 'xlsx': case 'xltx': - // Read sheet1.xml: - $cmd = $this->app['unzip'] . ' -p ' . escapeshellarg($absFile) . 
' xl/worksheets/sheet1.xml';
+                            // Read sharedStrings.xml:
+                            $cmd = $this->app['unzip'] . ' -p ' . escapeshellarg($absFile) . ' xl/sharedStrings.xml';
                             break;
                         default:
                             $cmd = '';
                             break;
                     }
-                    CommandUtility::exec($cmd, $res);
-                    $content_xml = implode(LF, $res);
-                    unset($res);
-                    $utf8_content = trim(strip_tags(str_replace('<', ' <', $content_xml)));
+                    if ($utf8_content === null) {
+                        CommandUtility::exec($cmd, $res);
+                        $content_xml = implode(LF, $res);
+                        unset($res);
+                        $utf8_content = trim(strip_tags(str_replace('<', ' <', $content_xml)));
+                    }
                     $indexingDataDto = $this->pObj->splitRegularContent($utf8_content);
                     // Make sure the title doesn't expose the absolute path!
                     $indexingDataDto->title = PathUtility::basename($absFile);
@@ -789,6 +792,31 @@ public function removeEndJunk(string $string): string
         return trim((string)preg_replace('/[' . LF . chr(12) . ']*$/', '', $string));
     }
 
+    /**
+     * Collects the text of every "ppt/slides/slideN.xml" entry listed by "unzip -l" for the given archive.
+     * @param string $absFile Absolute filename of file (must exist and be validated OK before calling function)
+     * @return string Tag-stripped slide texts joined by LF; empty string if no slide entry is listed
+     */
+    protected function extractPptxContent(string $absFile): string
+    {
+        $res = [];
+        CommandUtility::exec($this->app['unzip'] . ' -l ' . escapeshellarg($absFile), $res);
+
+        $buffer = [];
+        foreach ($res as $line) {
+            // The dot is escaped so only real "slideN.xml" entries match, not any character followed by "xml".
+            if (!preg_match('#\s+(ppt/slides/slide\d+\.xml)$#', $line, $matches)) {
+                continue;
+            }
+            $xml = [];
+            // The entry name originates from the archive listing, so it is quoted for the shell as well.
+            CommandUtility::exec($this->app['unzip'] . ' -p ' . escapeshellarg($absFile) . ' ' . escapeshellarg($matches[1]), $xml);
+            $buffer[] = trim(strip_tags(str_replace('<', ' <', implode(LF, $xml))));
+        }
+
+        return trim(implode(LF, $buffer));
+    }
+
     /************************
      *
      * Backend analyzer