Skip to content

Commit

Permalink
[BUGFIX] Ensure Indexed Search can handle PPTX and XLSX files
Browse files Browse the repository at this point in the history
Thanks to Xavier Perseguers for providing this fix
which properly indexes XLSX and PPTX files.

The problem for XLSX was that the wrong file inside the archive was indexed:
xl/worksheets/sheet1.xml mostly holds metadata that is basically only
pointers (integers), so the extracted content was plain wrong and useless.
The text is now read from xl/sharedStrings.xml instead.

The problem for PPTX was that only the content of slide 1 was extracted;
the content of all other slides was missed.

Resolves: #99527
Releases: main, 13.4
Change-Id: I8293ad8a247b243bec80188471ee2cee3a5151f7
Reviewed-on: https://review.typo3.org/c/Packages/TYPO3.CMS/+/87441
Reviewed-by: Christian Kuhn <lolli@schwarzbu.ch>
Tested-by: Garvin Hicking <gh@faktor-e.de>
Tested-by: Christian Kuhn <lolli@schwarzbu.ch>
Reviewed-by: Garvin Hicking <gh@faktor-e.de>
Reviewed-by: Benni Mack <benni@typo3.org>
Tested-by: Benni Mack <benni@typo3.org>
Tested-by: core-ci <typo3@b13.com>
  • Loading branch information
bmack committed Dec 19, 2024
1 parent 18f9d9d commit 01604cd
Showing 1 changed file with 36 additions and 8 deletions.
44 changes: 36 additions & 8 deletions Classes/FileContentParser.php
Original file line number Diff line number Diff line change
Expand Up @@ -522,6 +522,8 @@ public function readFileContent(string $ext, string $absFile, string|int $cPKey)
case 'xltx':
if ($this->app['unzip']) {
$this->setLocaleForServerFileSystem();
$utf8_content = null;
$cmd = '';
switch ($ext) {
case 'docx':
case 'dotx':
Expand All @@ -531,22 +533,23 @@ public function readFileContent(string $ext, string $absFile, string|int $cPKey)
case 'ppsx':
case 'pptx':
case 'potx':
// Read slide1.xml:
$cmd = $this->app['unzip'] . ' -p ' . escapeshellarg($absFile) . ' ppt/slides/slide1.xml';
$utf8_content = $this->extractPptxContent($absFile);
break;
case 'xlsx':
case 'xltx':
// Read sheet1.xml:
$cmd = $this->app['unzip'] . ' -p ' . escapeshellarg($absFile) . ' xl/worksheets/sheet1.xml';
// Read sharedStrings.xml:
$cmd = $this->app['unzip'] . ' -p ' . escapeshellarg($absFile) . ' xl/sharedStrings.xml';
break;
default:
$cmd = '';
break;
}
CommandUtility::exec($cmd, $res);
$content_xml = implode(LF, $res);
unset($res);
$utf8_content = trim(strip_tags(str_replace('<', ' <', $content_xml)));
if ($utf8_content === null) {
CommandUtility::exec($cmd, $res);
$content_xml = implode(LF, $res);
unset($res);
$utf8_content = trim(strip_tags(str_replace('<', ' <', $content_xml)));
}
$indexingDataDto = $this->pObj->splitRegularContent($utf8_content);
// Make sure the title doesn't expose the absolute path!
$indexingDataDto->title = PathUtility::basename($absFile);
Expand Down Expand Up @@ -789,6 +792,31 @@ public function removeEndJunk(string $string): string
return trim((string)preg_replace('/[' . LF . chr(12) . ']*$/', '', $string));
}

/**
 * Extracts the plain text of every slide found in a PPTX-style archive.
 *
 * The archive is listed with "unzip -l"; every listed entry named
 * ppt/slides/slide<N>.xml is then read with "unzip -p", stripped of its
 * markup and collected. Slides are returned in the order in which the
 * archive listing reports them, separated by line feeds.
 *
 * @param string $absFile Absolute filename of file (must exist and be validated OK before calling function)
 * @return string Concatenated text of all slides, empty string if no slide entry was found
 */
protected function extractPptxContent(string $absFile): string
{
    // Extract the list of slides. The output array is initialized up front so
    // the loop below still receives an array if the command yields nothing.
    $listing = [];
    $cmd = $this->app['unzip'] . ' -l ' . escapeshellarg($absFile);
    CommandUtility::exec($cmd, $listing);

    $buffer = [];
    foreach ($listing as $line) {
        // The dot is escaped on purpose: an unescaped "." would accept any
        // character in front of "xml" (e.g. "slide1;xml").
        if (!preg_match('#\s+(ppt/slides/slide\d+\.xml)$#', $line, $matches)) {
            continue;
        }

        // Entry names originate from the archive itself, so they are quoted
        // like any other externally supplied shell argument.
        $slideXml = [];
        $cmd = $this->app['unzip'] . ' -p ' . escapeshellarg($absFile) . ' ' . escapeshellarg($matches[1]);
        CommandUtility::exec($cmd, $slideXml);

        // Put a space in front of every tag so that text from adjacent
        // elements does not get glued together once the tags are stripped.
        $buffer[] = trim(strip_tags(str_replace('<', ' <', implode(LF, $slideXml))));
    }

    return trim(implode(LF, $buffer));
}

/************************
*
* Backend analyzer
Expand Down

0 comments on commit 01604cd

Please sign in to comment.