[BUGFIX] Ensure Indexed Search can handle PPTX and XLSX files

Thanks to Xavier Perseguers for providing this fix, which properly indexes XLSX and PPTX files.

Problem for XLSX was that the wrong "unzipped" file was indexed: xl/worksheets/sheet1.xml mostly contains metadata that is basically only pointers (integers) to the actual strings, so the content extracted from it is plain wrong and useless. xl/sharedStrings.xml is indexed instead.

Problem for PPTX was that only the content of slide 1 was extracted; the content of all other slides was missed.

Resolves: #99527
Releases: main, 13.4
Change-Id: I8293ad8a247b243bec80188471ee2cee3a5151f7
Reviewed-on: https://review.typo3.org/c/Packages/TYPO3.CMS/+/87441
Reviewed-by: Christian Kuhn <lolli@schwarzbu.ch>
Tested-by: Garvin Hicking <gh@faktor-e.de>
Tested-by: Christian Kuhn <lolli@schwarzbu.ch>
Reviewed-by: Garvin Hicking <gh@faktor-e.de>
Reviewed-by: Benni Mack <benni@typo3.org>
Tested-by: Benni Mack <benni@typo3.org>
Tested-by: core-ci <typo3@b13.com>
TYPO3-CMS · Dec 19, 2024 · 01604cd · 01604cd
1 parent 18f9d9d
commit 01604cd
Showing 1 changed file with 36 additions and 8 deletions.
diff --git a/Classes/FileContentParser.php b/Classes/FileContentParser.php
@@ -522,6 +522,8 @@ public function readFileContent(string $ext, string $absFile, string|int $cPKey)
             case 'xltx':
                 if ($this->app['unzip']) {
                     $this->setLocaleForServerFileSystem();
+                    $utf8_content = null;
+                    $cmd = '';
                     switch ($ext) {
                         case 'docx':
                         case 'dotx':
@@ -531,22 +533,23 @@ public function readFileContent(string $ext, string $absFile, string|int $cPKey)
                         case 'ppsx':
                         case 'pptx':
                         case 'potx':
-                            // Read slide1.xml:
-                            $cmd = $this->app['unzip'] . ' -p ' . escapeshellarg($absFile) . ' ppt/slides/slide1.xml';
+                            $utf8_content = $this->extractPptxContent($absFile);
                             break;
                         case 'xlsx':
                         case 'xltx':
-                            // Read sheet1.xml:
-                            $cmd = $this->app['unzip'] . ' -p ' . escapeshellarg($absFile) . ' xl/worksheets/sheet1.xml';
+                            // Read sharedStrings.xml:
+                            $cmd = $this->app['unzip'] . ' -p ' . escapeshellarg($absFile) . ' xl/sharedStrings.xml';
                             break;
                         default:
                             $cmd = '';
                             break;
                     }
-                    CommandUtility::exec($cmd, $res);
-                    $content_xml = implode(LF, $res);
-                    unset($res);
-                    $utf8_content = trim(strip_tags(str_replace('<', ' <', $content_xml)));
+                    if ($utf8_content === null) {
+                        CommandUtility::exec($cmd, $res);
+                        $content_xml = implode(LF, $res);
+                        unset($res);
+                        $utf8_content = trim(strip_tags(str_replace('<', ' <', $content_xml)));
+                    }
                     $indexingDataDto = $this->pObj->splitRegularContent($utf8_content);
                     // Make sure the title doesn't expose the absolute path!
                     $indexingDataDto->title = PathUtility::basename($absFile);
@@ -789,6 +792,31 @@ public function removeEndJunk(string $string): string
         return trim((string)preg_replace('/[' . LF . chr(12) . ']*$/', '', $string));
     }
 
+    /**
+     * Extracts the text of every slide contained in a PowerPoint (Open XML) archive.
+     *
+     * The archive listing printed by "unzip -l" is scanned for "ppt/slides/slideN.xml" entries.
+     * Each matching entry is read with "unzip -p", its tags are stripped, and the resulting
+     * fragments are joined with LF in the order in which the listing reports them.
+     *
+     * @param string $absFile Absolute filename of file (must exist and be validated OK before calling function)
+     * @return string Trimmed text of all slides; empty string if the listing contains no slide entry
+     */
+    protected function extractPptxContent(string $absFile): string
+    {
+        // Extract the list of slides:
+        $listing = [];
+        $cmd = $this->app['unzip'] . ' -l ' . escapeshellarg($absFile);
+        CommandUtility::exec($cmd, $listing);
+
+        $buffer = [];
+        foreach ($listing as $line) {
+            // The dot is escaped so that only genuine "slideN.xml" entries match. With a bare "."
+            // any character was accepted there, e.g. an entry named "ppt/slides/slide1;xml".
+            if (!preg_match('#\s+(ppt/slides/slide\d+\.xml)$#', $line, $matches)) {
+                continue;
+            }
+            // Extract the content of the slide. The entry name comes from inside the (untrusted)
+            // archive, so it is shell-quoted like the archive path itself.
+            $xml = [];
+            $cmd = $this->app['unzip'] . ' -p ' . escapeshellarg($absFile) . ' ' . escapeshellarg($matches[1]);
+            CommandUtility::exec($cmd, $xml);
+            // Put a space in front of every tag before stripping, so that the text of adjacent
+            // elements does not get glued together into one word.
+            $buffer[] = trim(strip_tags(str_replace('<', ' <', implode(LF, $xml))));
+        }
+
+        return trim(implode(LF, $buffer));
+    }
+
     /************************
      *
      * Backend analyzer