From 4d3a9d957ad94f5ac93f0eca26b88d6c3714adcf Mon Sep 17 00:00:00 2001 From: Colin O'Dell Date: Sat, 7 Dec 2024 10:17:00 -0500 Subject: [PATCH 01/29] Optimize line handling in table parser --- CHANGELOG.md | 4 ++++ src/Extension/Table/TableStartParser.php | 8 ++++---- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 917e5e80c7..8c82aafe29 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,10 @@ Updates should follow the [Keep a CHANGELOG](https://keepachangelog.com/) princi ## [Unreleased][unreleased] +### Changed + +- Several small performance optimizations + ## [2.5.3] - 2024-08-16 ### Changed diff --git a/src/Extension/Table/TableStartParser.php b/src/Extension/Table/TableStartParser.php index 12206d289a..08b4878f74 100644 --- a/src/Extension/Table/TableStartParser.php +++ b/src/Extension/Table/TableStartParser.php @@ -35,8 +35,8 @@ public function tryStart(Cursor $cursor, MarkdownParserStateInterface $parserSta return BlockStart::none(); } - $lines = \explode("\n", $paragraph); - $lastLine = \array_pop($lines); + $lastLineBreak = \strrpos($paragraph, "\n"); + $lastLine = $lastLineBreak === false ? $paragraph : \substr($paragraph, $lastLineBreak + 1); $headerCells = TableParser::split($lastLine); if (\count($headerCells) > \count($columns)) { @@ -47,9 +47,9 @@ public function tryStart(Cursor $cursor, MarkdownParserStateInterface $parserSta $parsers = []; - if (\count($lines) > 0) { + if ($lastLineBreak !== false) { $p = new ParagraphParser(); - $p->addLine(\implode("\n", $lines)); + $p->addLine(\substr($paragraph, 0, $lastLineBreak)); $parsers[] = $p; } From b73dd0b41bedf93c16f83503bbc83871b1e58a57 Mon Sep 17 00:00:00 2001 From: Colin O'Dell Date: Sat, 7 Dec 2024 10:17:00 -0500 Subject: [PATCH 02/29] Add pathological test suite --- .github/workflows/tests.yml | 18 ++ composer.json | 9 +- tests/pathological/convert.php | 58 ++++++ tests/pathological/test.php | 360 +++++++++++++++++++++++++++++++++ 4 files changed, 442 insertions(+), 3 deletions(-) create mode 100755 tests/pathological/convert.php create mode 100755 tests/pathological/test.php diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index d9c08637ef..1546d4f67e 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -109,6 +109,24 @@ jobs: - run: vendor/bin/psalm --no-progress --stats --threads=$(nproc) --output-format=github --shepherd + pathological: + name: Pathological Tests + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v3 + + - uses: shivammathur/setup-php@v2 + with: + php-version: 8.1 + extensions: curl, mbstring, yaml + coverage: none + tools: composer:v2 + + - run: composer update --no-progress + + - run: php tests/pathological/test.php + docs-lint: permissions: contents: read # for actions/checkout to fetch code diff --git a/composer.json b/composer.json index 9b90662034..d74b0b4b11 100644 --- a/composer.json +++ b/composer.json @@ -42,8 +42,9 @@ "phpstan/phpstan": "^1.8.2", "phpunit/phpunit": "^9.5.21 || ^10.5.9 || ^11.0.0", "scrutinizer/ocular": "^1.8.1", - "symfony/finder": "^5.3 | ^6.0 || ^7.0", - "symfony/yaml": "^2.3 | ^3.0 | ^4.0 | ^5.0 | ^6.0 || ^7.0", + "symfony/finder": "^5.3 | ^6.0 | ^7.0", + "symfony/process": "^5.4 | ^6.0 | ^7.0", + "symfony/yaml": "^2.3 | ^3.0 | ^4.0 | ^5.0 | ^6.0 | ^7.0", "unleashedtech/php-coding-standard": "^3.1.1", "vimeo/psalm": "^4.24.0 || ^5.0.0" }, @@ -103,11 +104,13 @@ "phpstan": "phpstan analyse", "phpunit": "phpunit --no-coverage", "psalm": "psalm --stats", + "pathological": "tests/pathological/test.php", "test": [ "@phpcs", "@phpstan", "@psalm", - "@phpunit" + "@phpunit", + "@pathological" ] }, "extra": { diff --git a/tests/pathological/convert.php b/tests/pathological/convert.php new file mode 100755 index 0000000000..0768db3a9b --- /dev/null +++ b/tests/pathological/convert.php @@ -0,0 +1,58 @@ +#!/usr/bin/env php + + * + * For the full copyright and license information, please view the LICENSE + * file that was distributed with this source code. + */ + +ini_set('memory_limit', '1024M'); + +use League\CommonMark\Environment\Environment; +use League\CommonMark\Extension\CommonMark\CommonMarkCoreExtension; +use League\CommonMark\Extension\Footnote\FootnoteExtension; +use League\CommonMark\Extension\GithubFlavoredMarkdownExtension; +use League\CommonMark\Extension\Table\TableExtension; +use League\CommonMark\MarkdownConverter; + +require_once __DIR__ . '/../../vendor/autoload.php'; + +ini_set('display_errors', 'stderr'); +ini_set('xdebug.max_nesting_level', '999999'); + +$stdin = fopen('php://stdin', 'r'); +if (stream_set_blocking($stdin, true)) { + $markdown = stream_get_contents($stdin); +} +fclose($stdin); + +if (empty($markdown)) { + fwrite(STDERR, "No input provided\n"); + exit(1); +} + +$environment = new Environment(); +$environment->addExtension(new CommonMarkCoreExtension()); + +// Enable additional extensions if requested +$extension = $argv[1] ?? null; +switch ($argv[1] ?? null) { + case 'table': + $environment->addExtension(new TableExtension()); + break; + case 'footnotes': + $environment->addExtension(new FootnoteExtension()); + break; + case 'gfm': + default: + $environment->addExtension(new GithubFlavoredMarkdownExtension()); + break; +} + +$converter = new MarkdownConverter($environment); + +echo $converter->convert($markdown)->getContent(); diff --git a/tests/pathological/test.php b/tests/pathological/test.php new file mode 100755 index 0000000000..3b9ee2229a --- /dev/null +++ b/tests/pathological/test.php @@ -0,0 +1,360 @@ +#!/usr/bin/env php + + * + * For the full copyright and license information, please view the LICENSE + * file that was distributed with this source code. + */ + +use Symfony\Component\Process\Exception\ProcessSignaledException; +use Symfony\Component\Process\Exception\ProcessTimedOutException; +use Symfony\Component\Process\Process; + +require_once __DIR__ . '/../../vendor/autoload.php'; + +$cases = [ + 'Alternate line endings' => [ + 'sizes' => [1], + 'input' => static fn($n) => "- a\n- b\r- c\r\n- d", + 'expected' => static fn($n) => "\n", + ], + 'Nested strong emphasis' => [ + 'sizes' => [50, 500], + 'input' => static fn($n) => \str_repeat('*a **a ', $n) . 'b' . \str_repeat(' a** a*', $n), + 'expected' => static fn($n) => '

' . \str_repeat('a a ', $n) . 'b' . \str_repeat(' a a', $n) . '

', + ], + 'Emphasis closers with no openers' => [ + 'sizes' => [1_000, 10_000, 100_000], + 'input' => static fn($n) => \str_repeat('a_ ', $n), + 'expected' => static fn($n) => '

' . \str_repeat('a_ ', $n - 1) . 'a_

', + ], + 'Emphasis openers with no closers' => [ + 'sizes' => [1_000, 10_000, 100_000], + 'input' => static fn($n) => \str_repeat('_a ', $n), + 'expected' => static fn($n) => '

' . \str_repeat('_a ', $n - 1) . '_a

', + ], + 'Openers and closers multiple of 3' => [ + 'sizes' => [1_000, 10_000, 100_000], + 'input' => static fn($n) => 'a**b' . \str_repeat('c* ', $n), + 'expected' => static fn($n) => '

a**b' . \str_repeat('c* ', $n - 1) . 'c*

', + ], + 'Delimiters that cannot open or close' => [ + 'ref' => 'https://github.com/commonmark/commonmark.js/issues/172', + 'sizes' => [1_000, 10_000, 100_000], + 'input' => static fn($n) => \str_repeat('*_* _ ', $n), + 'expected' => static fn($n) => '

' . \str_repeat('_ _ ', $n - 1) . '_ _

', + ], + 'Link closers with no openers' => [ + 'sizes' => [1_000, 10_000, 100_000], + 'input' => static fn($n) => \str_repeat('a] ', $n), + 'expected' => static fn($n) => '

' . \str_repeat('a] ', $n - 1) . 'a]

', + ], + 'Link openers with no closers' => [ + 'sizes' => [1_000, 10_000, 100_000], + 'input' => static fn($n) => \str_repeat('[a ', $n), + 'expected' => static fn($n) => '

' . \str_repeat('[a ', $n - 1) . '[a

', + ], + 'Link openers and emphasis closers' => [ + 'sizes' => [1_000, 10_000, 100_000], + 'input' => static fn($n) => \str_repeat('[ a_ ', $n), + 'expected' => static fn($n) => '

' . \str_repeat('[ a_ ', $n - 1) . '[ a_

', + ], + 'Mismatched openers and closers' => [ + 'sizes' => [1_000, 10_000, 100_000], + 'input' => static fn($n) => \str_repeat('*a_ ', $n), + 'expected' => static fn($n) => '

' . \str_repeat('*a_ ', $n - 1) . '*a_

', + ], + 'Pattern [ (](' => [ + 'sizes' => [500, 5_000, 50_000], + 'input' => static fn($n) => \str_repeat('[ (](', $n), + 'expected' => static fn($n) => '

' . \str_repeat('[ (](', $n) . '

', + ], + 'Nested brackets' => [ + 'sizes' => [1_000, 10_000, 100_000], + 'input' => static fn($n) => \str_repeat('[', $n) . 'a' . \str_repeat(']', $n), + 'expected' => static fn($n) => '

' . \str_repeat('[', $n) . 'a' . \str_repeat(']', $n) . '

', + ], + 'Backslash in link' => [ + 'ref' => 'https://github.com/commonmark/commonmark.js/issues/157', + 'sizes' => [1_000, 10_000, 100_000], + 'input' => static fn($n) => '[' . \str_repeat('\\', $n) . "\n", + 'expected' => static fn($n) => '

[' . \str_repeat('\\', $n / 2) . '

', + ], + 'Backslash in unclosed link title' => [ + 'sizes' => [10, 100, 1_000], + 'input' => static fn($n) => '[test](\\url "' . \str_repeat('\\', $n) . "\n", + 'expected' => static fn($n) => '

[test](\\url "' . \str_repeat('\\', $n / 2) . '

', + ], + 'Unclosed inline links (1)' => [ + 'ref' => 'https://github.com/commonmark/commonmark.js/issues/129', + 'sizes' => [1_000, 10_000, 100_000], + 'input' => static fn($n) => \str_repeat('[](', $n), + 'expected' => static fn($n) => '

' . \str_repeat('[](', $n) . '

', + ], + 'Unclosed inline links (2)' => [ + 'ref' => 'https://github.com/commonmark/commonmark.js/issues/129', + 'sizes' => [1_000, 10_000, 100_000], + 'input' => static fn($n) => \str_repeat('[a](b', $n), + 'expected' => static fn($n) => '

' . \str_repeat('[a](b', $n) . '

', + ], + 'Unclosed inline links (3)' => [ + 'ref' => 'https://github.com/commonmark/commonmark.js/issues/129', + 'sizes' => [1_000, 10_000, 100_000], + 'input' => static fn($n) => \str_repeat('[a]( static fn($n) => '

' . \str_repeat('[a](<b', $n) . '

', + ], + 'Nested blockquotes' => [ + 'ref' => 'https://github.com/commonmark/commonmark.js/issues/129', + 'sizes' => [100, 1_000], + 'input' => static fn($n) => \str_repeat('> ', $n) . "a\n", + 'expected' => static fn($n) => \str_repeat("
\n", $n) . "

a

\n" . \str_repeat("
\n", $n), + ], + 'Backticks' => [ + 'ref' => 'https://github.com/commonmark/commonmark.js/issues/129', + 'sizes' => [500, 1_000, 2_000, 4_000], + 'input' => static fn($n) => \implode('', \array_map(static fn($i) => 'e' . \str_repeat('`', $i), \range(1, $n))), + ], + 'Many ref. definitions' => [ + 'ref' => 'https://github.com/commonmark/commonmark.js/issues/129', + 'sizes' => [1_000, 10_000, 100_000], + 'input' => static fn($n) => \str_repeat("[a]: u\n", $n), + ], + 'Huge horizontal rule' => [ + 'sizes' => [500, 5_000], + 'input' => static fn($n) => \str_repeat('*', $n) . "\n", + 'expected' => static fn($n) => '
', + ], + 'CVE-2022-39209' => [ + 'ref' => 'https://github.com/github/cmark-gfm/security/advisories/GHSA-cgh3-p57x-9q7q', + 'extension' => 'autolink', + 'sizes' => [1_000, 10_000, 100_000], + 'input' => static fn($n) => \str_repeat('![l', $n) . "\n", + 'expected' => static fn($n) => '

' . \str_repeat('![l', $n) . '

', + ], + 'CVE-2023-22486' => [ + 'ref' => 'https://github.com/github/cmark-gfm/security/advisories/GHSA-r572-jvj2-3m8p', + 'sizes' => [1_000, 10_000, 100_000], + 'input' => static fn($n) => \str_repeat('![[]()', $n) . "\n", + 'expected' => static fn($n) => '

' . \str_repeat('![', $n) . '

', + ], + 'CVE-2023-22484' => [ + 'ref' => 'https://github.com/github/cmark-gfm/security/advisories/GHSA-24f7-9frr-5h2r', + 'sizes' => [1_000, 10_000, 100_000], + 'input' => static fn($n) => ' static fn($n) => '

</' . \str_repeat('<!--', $n) . '

', + ], + 'CVE-2023-22483 test 1' => [ + 'ref' => 'https://github.com/github/cmark-gfm/security/advisories/GHSA-29g3-96g3-jg6c', + 'sizes' => [1_000, 10_000, 100_000], + 'input' => static fn($n) => "-1" . \str_repeat(" static fn($n) => '

-1' . \str_repeat('<?x', $n) . 'y

', + ], + 'CVE-2023-22483 test 2 (tables)' => [ + 'ref' => 'https://github.com/github/cmark-gfm/security/advisories/GHSA-29g3-96g3-jg6c', + 'extension' => 'table', + 'sizes' => [1_000, 10_000, 100_000], + 'input' => static fn($n) => \str_repeat("|-\nt>|-\n", $n) . "y\n", + 'expected' => static fn($n) => '

' . \str_repeat("|-\nt>|-\n", $n) . 'y

', + ], + 'CVE-2023-22483 test 3' => [ + 'ref' => 'https://github.com/github/cmark-gfm/security/advisories/GHSA-29g3-96g3-jg6c', + 'sizes' => [1_000, 10_000, 20_000, 40_000], + 'input' => static fn($n) => \str_repeat('*t ', $n) . \str_repeat('_t*_ ', $n) . "\n", + 'expected' => static fn($n) => '

' . \str_repeat('t ', $n) . \str_repeat('_t_ ', $n - 1) . '_t_

', + ], + 'CVE-2023-22483 test 4 (tables)' => [ + 'ref' => 'https://github.com/github/cmark-gfm/security/advisories/GHSA-29g3-96g3-jg6c', + 'extension' => 'table', + 'sizes' => [1_000, 10_000, 100_000], + 'input' => static fn($n) => "x\n| - |\n" . \str_repeat("|", $n) . "y\n", + 'expected' => static fn($n) => "

x\n| - |\n" . \str_repeat('|', $n) . 'y

', + ], + 'CVE-2023-22483 test 5 (autolink)' => [ + 'ref' => 'https://github.com/github/cmark-gfm/security/advisories/GHSA-29g3-96g3-jg6c', + 'sizes' => [1_000, 10_000, 100_000], + 'input' => static fn($n) => \str_repeat('z_www.', $n) . "\n", + ], + 'CVE-2023-22483 test 6' => [ + 'ref' => 'https://github.com/github/cmark-gfm/security/advisories/GHSA-29g3-96g3-jg6c', + 'sizes' => [100, 1_000, 10_000, 100_000], + 'input' => static fn($n) => "[f]:u\n\"\n" . \str_repeat("[f]\n", $n) . "[f]:u \"t\n", + ], + 'CVE-2023-22483 test 7 (autolink)' => [ + 'ref' => 'https://github.com/github/cmark-gfm/security/advisories/GHSA-29g3-96g3-jg6c', + 'sizes' => [1_000, 10_000, 100_000], + 'input' => static fn($n) => "www." . \str_repeat(")", $n) . "\n", + 'expected' => static fn($n) => '

www.' . \str_repeat(')', $n) . '

', + ], + 'CVE-2023-22483 test 8 (tables)' => [ + 'ref' => 'https://github.com/github/cmark-gfm/security/advisories/GHSA-29g3-96g3-jg6c', + 'extension' => 'table', + 'sizes' => [1_000, 10_000, 100_000], + 'input' => static fn($n) => ":-\n" . \str_repeat(":\n", $n) . ":-\n", + 'expected' => static fn($n) => "

:-\n" . \str_repeat(":\n", $n) . ':-

', + ], + 'CVE-2023-22483 test 9 (autolink)' => [ + 'ref' => 'https://github.com/github/cmark-gfm/security/advisories/GHSA-29g3-96g3-jg6c', + 'sizes' => [1_000, 10_000, 100_000], + 'input' => static fn($n) => \str_repeat(" static fn($n) => '

' . \str_repeat('<http://s', $n) . '

', + ], + 'CVE-2023-22483 test 10 (strikethrough)' => [ + 'ref' => 'https://github.com/github/cmark-gfm/security/advisories/GHSA-29g3-96g3-jg6c', + 'sizes' => [1_000, 10_000, 100_000], + 'input' => static fn($n) => '~e' . \str_repeat(',z~~', $n) . "\n", + 'expected' => static fn($n) => '

~e' . \str_repeat(',z~~', $n) . '

', + ], + 'CVE-2023-22483 test 11 (deeply-nested blockquotes)' => [ + 'ref' => 'https://github.com/github/cmark-gfm/security/advisories/GHSA-29g3-96g3-jg6c', + 'sizes' => [100, 1_000, 2_000, 3_000], + 'input' => static fn($n) => \str_repeat(">", $n) . \str_repeat(".", $n) . "\n", + 'expected' => static fn($n) => \str_repeat("
\n", $n) . '

' . \str_repeat('.', $n) . "

\n" . \str_repeat("
\n", $n), + ], + 'CVE-2023-26485 test 1' => [ + 'ref' => 'https://github.com/github/cmark-gfm/security/advisories/GHSA-r8vr-c48j-fcc5', + 'sizes' => [50, 500, 5_000], // ideally should be 1000, 10_000, 100_000 but recursive rendering makes large sizes fail + 'input' => static fn($n) => \str_repeat('_', $n) . '.' . \str_repeat('_', $n) . "\n", + 'expected' => static fn($n) => '

' . \str_repeat('', $n/2) . '.' . \str_repeat('', $n/2) . '

', + ], + 'CVE-2023-26485 test 2' => [ + 'ref' => 'https://github.com/github/cmark-gfm/security/advisories/GHSA-r8vr-c48j-fcc5', + 'sizes' => [1_000, 10_000, 100_000], + 'input' => static fn($n) => "1.\n" . \str_repeat(" 2.\n", $n), + 'expected' => static fn($n) => "
    \n
  1. \n" . \str_repeat("
  2. \n", $n) . "
\n", + ], + 'CVE-2023-26485 test 3' => [ + 'ref' => 'https://github.com/github/cmark-gfm/security/advisories/GHSA-r8vr-c48j-fcc5', + 'sizes' => [1_000, 10_000, 100_000], + 'input' => static fn($n) => \str_repeat(" -\n", $n) . "x\n", + 'expected' => static fn($n) => "
    \n" . \str_repeat("
  • \n", $n) . "
\n

x

", + ], + 'CVE-2023-37463 test 1' => [ + 'ref' => 'https://github.com/github/cmark-gfm/security/advisories/GHSA-w4qg-3vf7-m9x5', + 'extension' => 'table', + 'sizes' => [1_000, 10_000, 100_000], + 'input' => static fn($n) => '|' . \str_repeat('x|', $n) . "\n|" . \str_repeat('-|', $n) . "\n", + 'expected' => static fn($n) => "\n\n\n" . \str_repeat("\n", $n) . "\n\n
x
\n", + ], + 'CVE-2023-37463 test 2' => [ + 'ref' => 'https://github.com/github/cmark-gfm/security/advisories/GHSA-w4qg-3vf7-m9x5', + 'extension' => 'table', + 'sizes' => [1_000, 10_000, 100_000], + 'input' => static fn($n) => '|' . \str_repeat('x|', $n) . "\n|" . \str_repeat('-|', $n) . "\n" . \str_repeat("a\n", $n), + ], + 'CVE-2023-37463 test 3' => [ + 'ref' => 'https://github.com/github/cmark-gfm/security/advisories/GHSA-w4qg-3vf7-m9x5', + 'extension' => 'footnotes', + 'sizes' => [1_000, 10_000, 100_000], + 'input' => static fn($n) => \str_repeat("[^1]:\n", $n) . \str_repeat("\n", $n), + 'expected' => static fn($n) => '', + ], +]; + +print("Running " . \count($cases) . " pathological test cases\n\n"); + +$failures = []; +foreach ($cases as $name => $case) { + print("\033[1m$name\033[0m"); + if (isset($case['ref'])) { + print(" (\033[4;34m{$case['ref']}\033[0m)"); + } + print("\n"); + + $lastRunTime = null; + $lastInputSize = null; + $lastOutputSize = null; + foreach ($case['sizes'] as $size) { + printf(' - %s: ', number_format($size)); + + $input = $case['input']($size); + $inputSize = \strlen($input); + + if ($lastInputSize === null) { + $timeout = 5; // 5 seconds + } else { + // Ideally, these cases should run in linear time or better, + // but we'll allow a 50% margin of error. + $timeout = \ceil($lastRunTime * $inputSize / $lastInputSize * 1.5); + // But regardless of this, we always want to wait at least 5 seconds, + // and at most 60 seconds. + $timeout = \max(5, \min(60, $timeout)); + } + + $command = ['php', 'convert.php']; + if (isset($case['extension'])) { + $command[] = $case['extension']; + } + + $process = new Process($command, __DIR__, null, $input, $timeout); + + $startTime = \microtime(true); + try { + $result = $process->run(); + $executionTime = \round(\microtime(true) - $startTime, 3); + + if ($lastRunTime !== null) { + $relativeDifference = \round($executionTime / $lastRunTime, 1); + printf("%.3f seconds (%sx slower); ", $executionTime, $relativeDifference); + } else { + printf("%.3f seconds; ", $executionTime); + } + + $lastRunTime = $executionTime; + + $actual = $process->getOutput(); + $actualOutputSize = strlen($actual); + + if ($lastOutputSize !== null && $lastOutputSize !== 0 && $lastInputSize !== null) { + $relativeDifference = \round($actualOutputSize / $lastOutputSize, 1); + printf("%.1fkb output (%sx larger)", $actualOutputSize / 1024, $relativeDifference); + if ($actualOutputSize > ($lastOutputSize * $inputSize / $lastInputSize * 1.5)) { + + printf(" \033[31;31m%s\033[0m", 'Max allowed size exceeded'); + $failures[$name] = 'Max allowed size exceeded'; + } + } else { + printf("%.1fkb output", $actualOutputSize / 1024); + } + echo "\n"; + + $lastOutputSize = $actualOutputSize; + + if (isset($case['expected'])) { + $expected = $case['expected']($size); + if (\trim($expected) !== \trim($actual)) { + printf(" \033[31;31m%s\033[0m\n", 'Unexpected output'); + $failures[$name] = 'Unexpected output'; + } + } + + if ($result !== 0) { + printf(" \033[31;31m%s\033[0m\n", 'Process failed: ' . $process->getErrorOutput()); + $failures[$name] = 'Process failed'; + } + } catch (ProcessTimedOutException $e) { + printf("\033[31;31m%s\033[0m\n", 'Max execution time ('.$timeout.'s) exceeded'); + $failures[$name] = 'Max execution time exceeded'; + } catch (ProcessSignaledException $e) { + printf("\033[31;31m%s\033[0m\n", 'Process signaled: ' . $e->getSignal()); + $failures[$name] = 'Process signaled'; + } finally { + $lastInputSize = \strlen($input); + } + } + print("\n"); +} + +print("\n"); +print("--------------------\n"); +print("\n"); +print(\count($failures) . " failure(s) out of " . \count($cases) . " test(s)\n"); + +if (\count($failures) > 0) { + exit(1); +} From 26c9ee36edc50dbc15aaa110e3c9241a25f596e8 Mon Sep 17 00:00:00 2001 From: Colin O'Dell Date: Sat, 7 Dec 2024 10:17:00 -0500 Subject: [PATCH 03/29] Fix NUL characters not being replaced in the input --- CHANGELOG.md | 4 ++++ src/Parser/MarkdownParser.php | 3 +++ tests/pathological/test.php | 5 +++++ 3 files changed, 12 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8c82aafe29..0d2d9b9365 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -81,6 +81,10 @@ Updates should follow the [Keep a CHANGELOG](https://keepachangelog.com/) princi - Fixed declaration parser being too strict - `FencedCodeRenderer`: don't add `language-` to class if already prefixed +### Fixed + +- Fixed NUL characters not being replaced in the input + ## [2.4.1] - 2023-08-30 ### Fixed diff --git a/src/Parser/MarkdownParser.php b/src/Parser/MarkdownParser.php index 2fecb9ba47..82ebb6682d 100644 --- a/src/Parser/MarkdownParser.php +++ b/src/Parser/MarkdownParser.php @@ -115,6 +115,9 @@ public function parse(string $input): Document */ private function parseLine(string $line): void { + // replace NUL characters for security + $line = \str_replace("\0", "\u{FFFD}", $line); + $this->cursor = new Cursor($line); $matches = $this->parseBlockContinuation(); diff --git a/tests/pathological/test.php b/tests/pathological/test.php index 3b9ee2229a..ec7f5e9d3d 100755 --- a/tests/pathological/test.php +++ b/tests/pathological/test.php @@ -19,6 +19,11 @@ require_once __DIR__ . '/../../vendor/autoload.php'; $cases = [ + 'U+0000 in input' => [ + 'sizes' => [1], + 'input' => static fn($n) => "abc\u{0000}def\u{0000}\n", + 'expected' => static fn($n) => "

abc\u{FFFD}def\u{FFFD}

", + ], 'Alternate line endings' => [ 'sizes' => [1], 'input' => static fn($n) => "- a\n- b\r- c\r\n- d", From 98caa5f27cc84e3e6d32dc263beabdd4285980bc Mon Sep 17 00:00:00 2001 From: Colin O'Dell Date: Sat, 7 Dec 2024 10:17:00 -0500 Subject: [PATCH 04/29] Fix quadratic complexity parsing unclosed inline links --- CHANGELOG.md | 5 +++++ src/Util/LinkParserHelper.php | 32 ++++++++++++++--------------- src/Util/RegexHelper.php | 6 ++++++ tests/unit/Util/RegexHelperTest.php | 10 +++++++++ 4 files changed, 36 insertions(+), 17 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0d2d9b9365..61226cb889 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,10 @@ Updates should follow the [Keep a CHANGELOG](https://keepachangelog.com/) princi ## [Unreleased][unreleased] +### Added + +- Added `RegexHelper::isWhitespace()` method to check if a given character is an ASCII whitespace character + ### Changed - Several small performance optimizations @@ -84,6 +88,7 @@ Updates should follow the [Keep a CHANGELOG](https://keepachangelog.com/) princi ### Fixed - Fixed NUL characters not being replaced in the input +- Fixed quadratic complexity parsing unclosed inline links ## [2.4.1] - 2023-08-30 diff --git a/src/Util/LinkParserHelper.php b/src/Util/LinkParserHelper.php index e329669bdd..edf82b28e2 100644 --- a/src/Util/LinkParserHelper.php +++ b/src/Util/LinkParserHelper.php @@ -100,27 +100,27 @@ public static function parsePartialLinkTitle(Cursor $cursor, string $endDelimite private static function manuallyParseLinkDestination(Cursor $cursor): ?string { - $oldPosition = $cursor->getPosition(); - $oldState = $cursor->saveState(); - + $remainder = $cursor->getRemainder(); $openParens = 0; - while (($c = $cursor->getCurrentCharacter()) !== null) { - if ($c === '\\' && ($peek = $cursor->peek()) !== null && RegexHelper::isEscapable($peek)) { - $cursor->advanceBy(2); + $len = \strlen($remainder); + for ($i = 0; $i < $len; $i++) { + $c = $remainder[$i]; + if ($c === '\\' && $i + 1 < $len && RegexHelper::isEscapable($remainder[$i + 1])) { + $i++; } elseif ($c === '(') { - $cursor->advanceBy(1); $openParens++; + // Limit to 32 nested parens for pathological cases + if ($openParens > 32) { + return null; + } } elseif ($c === ')') { if ($openParens < 1) { break; } - $cursor->advanceBy(1); $openParens--; - } elseif (\preg_match(RegexHelper::REGEX_WHITESPACE_CHAR, $c)) { + } elseif (\ord($c) <= 32 && RegexHelper::isWhitespace($c)) { break; - } else { - $cursor->advanceBy(1); } } @@ -128,15 +128,13 @@ private static function manuallyParseLinkDestination(Cursor $cursor): ?string return null; } - if ($cursor->getPosition() === $oldPosition && (! isset($c) || $c !== ')')) { + if ($i === 0 && (! isset($c) || $c !== ')')) { return null; } - $newPos = $cursor->getPosition(); - $cursor->restoreState($oldState); - - $cursor->advanceBy($newPos - $cursor->getPosition()); + $destination = \substr($remainder, 0, $i); + $cursor->advanceBy(\mb_strlen($destination, 'UTF-8')); - return $cursor->getPreviousText(); + return $destination; } } diff --git a/src/Util/RegexHelper.php b/src/Util/RegexHelper.php index a89e7bda66..1767e3a4fd 100644 --- a/src/Util/RegexHelper.php +++ b/src/Util/RegexHelper.php @@ -83,6 +83,12 @@ public static function isEscapable(string $character): bool return \preg_match('/' . self::PARTIAL_ESCAPABLE . '/', $character) === 1; } + public static function isWhitespace(string $character): bool + { + /** @psalm-suppress InvalidLiteralArgument */ + return $character !== '' && \strpos(" \t\n\x0b\x0c\x0d", $character) !== false; + } + /** * @psalm-pure */ diff --git a/tests/unit/Util/RegexHelperTest.php b/tests/unit/Util/RegexHelperTest.php index c71a6e079c..ed9c5af6c6 100644 --- a/tests/unit/Util/RegexHelperTest.php +++ b/tests/unit/Util/RegexHelperTest.php @@ -281,6 +281,16 @@ public function testIsEscapable(): void $this->assertTrue(RegexHelper::isEscapable('\\')); } + public function testIsWhitespace(): void + { + $this->assertFalse(RegexHelper::isWhitespace('')); + $this->assertFalse(RegexHelper::isWhitespace('A')); + $this->assertTrue(RegexHelper::isWhitespace(' ')); + $this->assertTrue(RegexHelper::isWhitespace("\t")); + $this->assertTrue(RegexHelper::isWhitespace("\n")); + $this->assertTrue(RegexHelper::isWhitespace("\r")); + } + /** * @dataProvider dataForTestMatchAt */ From 9efa9145e3c1b1e2139896abfc278211eacfb40c Mon Sep 17 00:00:00 2001 From: Colin O'Dell Date: Sat, 7 Dec 2024 10:17:01 -0500 Subject: [PATCH 05/29] Fix quadratic complexity finding the bottom opener for multiple-of-3 delimiters --- CHANGELOG.md | 7 +++ .../2.5/customization/delimiter-processing.md | 2 + docs/2.6/upgrading.md | 17 +++++++ src/Delimiter/DelimiterStack.php | 37 ++++++++------- .../CacheableDelimiterProcessorInterface.php | 46 +++++++++++++++++++ .../Processor/DelimiterProcessorInterface.php | 3 ++ .../Processor/EmphasisDelimiterProcessor.php | 15 +++++- .../StrikethroughDelimiterProcessor.php | 9 +++- 8 files changed, 115 insertions(+), 21 deletions(-) create mode 100644 src/Delimiter/Processor/CacheableDelimiterProcessorInterface.php diff --git a/CHANGELOG.md b/CHANGELOG.md index 61226cb889..ef2a89032e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,7 @@ Updates should follow the [Keep a CHANGELOG](https://keepachangelog.com/) princi ### Added - Added `RegexHelper::isWhitespace()` method to check if a given character is an ASCII whitespace character +- Added `CacheableDelimiterProcessorInterface` to ensure linear complexity for dynamic delimiter processing ### Changed @@ -85,10 +86,16 @@ Updates should follow the [Keep a CHANGELOG](https://keepachangelog.com/) princi - Fixed declaration parser being too strict - `FencedCodeRenderer`: don't add `language-` to class if already prefixed +### Deprecated + +- Returning dynamic values from `DelimiterProcessorInterface::getDelimiterUse()` is deprecated + - You should instead implement `CacheableDelimiterProcessorInterface` to help the engine perform caching to avoid performance issues. + ### Fixed - Fixed NUL characters not being replaced in the input - Fixed quadratic complexity parsing unclosed inline links +- Fixed quadratic complexity finding the bottom opener for emphasis and strikethrough delimiters ## [2.4.1] - 2023-08-30 diff --git a/docs/2.5/customization/delimiter-processing.md b/docs/2.5/customization/delimiter-processing.md index 9d172b256c..8748d4ef08 100644 --- a/docs/2.5/customization/delimiter-processing.md +++ b/docs/2.5/customization/delimiter-processing.md @@ -48,6 +48,8 @@ public function getDelimiterUse(DelimiterInterface $opener, DelimiterInterface $ This method is used to tell the engine how many characters from the matching delimiters should be consumed. For simple processors you'll likely return `1` (or whatever your minimum length is). In more advanced cases, you can examine the opening and closing delimiters and perform additional logic to determine whether they should be fully or partially consumed. You can also return `0` if you'd like. +**Note:** Unless you're returning a hard-coded value, you should probably implement `CacheableDelimiterProcessorInterface` instead of `DelimiterProcessorInterface` - this will allow the engine to perform additional caching for better performance. + ### `process()` ```php diff --git a/docs/2.6/upgrading.md b/docs/2.6/upgrading.md index 4e84686028..d19d1760a5 100644 --- a/docs/2.6/upgrading.md +++ b/docs/2.6/upgrading.md @@ -6,3 +6,20 @@ redirect_from: /upgrading/ --- # Upgrading from 2.5 to 2.6 + +## Custom Delimiter Processors + +If you're implementing a custom delimiter processor, and `getDelimiterUse()` has more logic than just a +simple `return` statement, you should implement `CacheableDelimiterProcessorInterface` instead of +`DelimiterProcessorInterface` to improve performance and avoid possible quadratic performance issues. + +`DelimiterProcessorInterface` has a `getDelimiterUse()` method that tells the engine how many characters from the +matching delimiters should be consumed. Simple processors usually always return a hard-coded integer like `1` or `2`. +However, some more advanced processors may need to examine the opening and closing delimiters and perform additional +logic to determine whether they should be fully or partially consumed. Previously, these results could not be safely +cached, resulting in possible quadratic performance issues. + +In 2.6, the `CacheableDelimiterProcessorInterface` was introduced to allow these "dynamic" processors to be safely +cached. It requires a new `getCacheKey()` method that returns a string that uniquely identifies the combination of +factors considered when determining the delimiter use. This key is then used to cache the results of the search for +a matching delimiter. diff --git a/src/Delimiter/DelimiterStack.php b/src/Delimiter/DelimiterStack.php index fb95b907cf..b2cdcd9bc4 100644 --- a/src/Delimiter/DelimiterStack.php +++ b/src/Delimiter/DelimiterStack.php @@ -19,6 +19,7 @@ namespace League\CommonMark\Delimiter; +use League\CommonMark\Delimiter\Processor\CacheableDelimiterProcessorInterface; use League\CommonMark\Delimiter\Processor\DelimiterProcessorCollection; use League\CommonMark\Node\Inline\AdjacentTextMerger; @@ -129,21 +130,27 @@ public function processDelimiters(?DelimiterInterface $stackBottom, DelimiterPro // Move forward, looking for closers, and handling each while ($closer !== null) { - $delimiterChar = $closer->getChar(); + $closingDelimiterChar = $closer->getChar(); - $delimiterProcessor = $processors->getDelimiterProcessor($delimiterChar); + $delimiterProcessor = $processors->getDelimiterProcessor($closingDelimiterChar); if (! $closer->canClose() || $delimiterProcessor === null) { $closer = $closer->getNext(); continue; } + if ($delimiterProcessor instanceof CacheableDelimiterProcessorInterface) { + $openersBottomCacheKey = $delimiterProcessor->getCacheKey($closer); + } else { + $openersBottomCacheKey = $closingDelimiterChar; + } + $openingDelimiterChar = $delimiterProcessor->getOpeningCharacter(); $useDelims = 0; $openerFound = false; $potentialOpenerFound = false; $opener = $closer->getPrevious(); - while ($opener !== null && $opener !== $stackBottom && $opener !== ($openersBottom[$delimiterChar] ?? null)) { + while ($opener !== null && $opener !== $stackBottom && $opener !== ($openersBottom[$openersBottomCacheKey] ?? null)) { if ($opener->canOpen() && $opener->getChar() === $openingDelimiterChar) { $potentialOpenerFound = true; $useDelims = $delimiterProcessor->getDelimiterUse($opener, $closer); @@ -157,20 +164,16 @@ public function processDelimiters(?DelimiterInterface $stackBottom, DelimiterPro } if (! $openerFound) { - if (! $potentialOpenerFound) { - // Only do this when we didn't even have a potential - // opener (one that matches the character and can open). - // If an opener was rejected because of the number of - // delimiters (e.g. because of the "multiple of 3" - // Set lower bound for future searches for openersrule), - // we want to consider it next time because the number - // of delimiters can change as we continue processing. - $openersBottom[$delimiterChar] = $closer->getPrevious(); - if (! $closer->canOpen()) { - // We can remove a closer that can't be an opener, - // once we've seen there's no matching opener. - $this->removeDelimiter($closer); - } + // Set lower bound for future searches + // TODO: Remove this conditional check in 3.0. It only exists to prevent behavioral BC breaks in 2.x. + if ($potentialOpenerFound === false || $delimiterProcessor instanceof CacheableDelimiterProcessorInterface) { + $openersBottom[$openersBottomCacheKey] = $closer->getPrevious(); + } + + if (! $potentialOpenerFound && ! $closer->canOpen()) { + // We can remove a closer that can't be an opener, + // once we've seen there's no matching opener. + $this->removeDelimiter($closer); } $closer = $closer->getNext(); diff --git a/src/Delimiter/Processor/CacheableDelimiterProcessorInterface.php b/src/Delimiter/Processor/CacheableDelimiterProcessorInterface.php new file mode 100644 index 0000000000..a2a9b7ef39 --- /dev/null +++ b/src/Delimiter/Processor/CacheableDelimiterProcessorInterface.php @@ -0,0 +1,46 @@ + + * + * For the full copyright and license information, please view the LICENSE + * file that was distributed with this source code. + */ + +namespace League\CommonMark\Delimiter\Processor; + +use League\CommonMark\Delimiter\DelimiterInterface; + +/** + * Special marker interface for delimiter processors that return dynamic values from getDelimiterUse() + * + * In order to guarantee linear performance of delimiter processing, the delimiter stack must be able to + * cache the lower bound when searching for a matching opener. This gets complicated for delimiter processors + * that use a dynamic number of characters (like with emphasis and its "multiple of 3" rule). + */ +interface CacheableDelimiterProcessorInterface extends DelimiterProcessorInterface +{ + /** + * Returns a cache key of the factors that determine the number of characters to use. + * + * In order to guarantee linear performance of delimiter processing, the delimiter stack must be able to + * cache the lower bound when searching for a matching opener. This lower bound is usually quite simple; + * for example, with quotes, it's just the last opener with that characted. However, this gets complicated + * for delimiter processors that use a dynamic number of characters (like with emphasis and its "multiple + * of 3" rule), because the delimiter length being considered may change during processing because of that + * dynamic logic in getDelimiterUse(). Therefore, we cannot safely cache the lower bound for these dynamic + * processors without knowing the factors that determine the number of characters to use. + * + * At a minimum, this should include the delimiter character, plus any other factors used to determine + * the result of getDelimiterUse(). The format of the string is not important so long as it is unique + * (compared to other processors) and consistent for a given set of factors. + * + * If getDelimiterUse() always returns the same hard-coded value, this method should return just + * the delimiter character. + */ + public function getCacheKey(DelimiterInterface $closer): string; +} diff --git a/src/Delimiter/Processor/DelimiterProcessorInterface.php b/src/Delimiter/Processor/DelimiterProcessorInterface.php index 465378c390..5e88ddc7ad 100644 --- a/src/Delimiter/Processor/DelimiterProcessorInterface.php +++ b/src/Delimiter/Processor/DelimiterProcessorInterface.php @@ -58,6 +58,9 @@ public function getMinLength(): int; * return 0 when it doesn't want to allow this particular combination of * delimiter runs. * + * IMPORTANT: Unless this method returns the same hard-coded value in all cases, + * you MUST implement the CacheableDelimiterProcessorInterface interface instead. + * * @param DelimiterInterface $opener The opening delimiter run * @param DelimiterInterface $closer The closing delimiter run */ diff --git a/src/Extension/CommonMark/Delimiter/Processor/EmphasisDelimiterProcessor.php b/src/Extension/CommonMark/Delimiter/Processor/EmphasisDelimiterProcessor.php index 84b46ee6d9..9a6be13491 100644 --- a/src/Extension/CommonMark/Delimiter/Processor/EmphasisDelimiterProcessor.php +++ b/src/Extension/CommonMark/Delimiter/Processor/EmphasisDelimiterProcessor.php @@ -20,14 +20,14 @@ namespace League\CommonMark\Extension\CommonMark\Delimiter\Processor; use League\CommonMark\Delimiter\DelimiterInterface; -use League\CommonMark\Delimiter\Processor\DelimiterProcessorInterface; +use League\CommonMark\Delimiter\Processor\CacheableDelimiterProcessorInterface; use League\CommonMark\Extension\CommonMark\Node\Inline\Emphasis; use League\CommonMark\Extension\CommonMark\Node\Inline\Strong; use League\CommonMark\Node\Inline\AbstractStringContainer; use League\Config\ConfigurationAwareInterface; use League\Config\ConfigurationInterface; -final class EmphasisDelimiterProcessor implements DelimiterProcessorInterface, ConfigurationAwareInterface +final class EmphasisDelimiterProcessor implements CacheableDelimiterProcessorInterface, ConfigurationAwareInterface { /** @psalm-readonly */ private string $char; @@ -105,4 +105,15 @@ public function setConfiguration(ConfigurationInterface $configuration): void { $this->config = $configuration; } + + public function getCacheKey(DelimiterInterface $closer): string + { + return \sprintf( + '%s-%s-%d-%d', + $this->char, + $closer->canOpen() ? 'canOpen' : 'cannotOpen', + $closer->getOriginalLength() % 3, + $closer->getLength(), + ); + } } diff --git a/src/Extension/Strikethrough/StrikethroughDelimiterProcessor.php b/src/Extension/Strikethrough/StrikethroughDelimiterProcessor.php index 978e75a43f..a6c8d3889f 100644 --- a/src/Extension/Strikethrough/StrikethroughDelimiterProcessor.php +++ b/src/Extension/Strikethrough/StrikethroughDelimiterProcessor.php @@ -14,10 +14,10 @@ namespace League\CommonMark\Extension\Strikethrough; use League\CommonMark\Delimiter\DelimiterInterface; -use League\CommonMark\Delimiter\Processor\DelimiterProcessorInterface; +use League\CommonMark\Delimiter\Processor\CacheableDelimiterProcessorInterface; use League\CommonMark\Node\Inline\AbstractStringContainer; -final class StrikethroughDelimiterProcessor implements DelimiterProcessorInterface +final class StrikethroughDelimiterProcessor implements CacheableDelimiterProcessorInterface { public function getOpeningCharacter(): string { @@ -61,4 +61,9 @@ public function process(AbstractStringContainer $opener, AbstractStringContainer $opener->insertAfter($strikethrough); } + + public function getCacheKey(DelimiterInterface $closer): string + { + return '~' . $closer->getLength(); + } } From 4a0842c2a3d9075d4184e95d4c055bccadc54252 Mon Sep 17 00:00:00 2001 From: Colin O'Dell Date: Sat, 7 Dec 2024 10:17:01 -0500 Subject: [PATCH 06/29] Only set 'delim' attribute when we actually have an associated delimiter --- src/Delimiter/DelimiterParser.php | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/src/Delimiter/DelimiterParser.php b/src/Delimiter/DelimiterParser.php index 3f96addf85..fdfe093c90 100644 --- a/src/Delimiter/DelimiterParser.php +++ b/src/Delimiter/DelimiterParser.php @@ -62,16 +62,20 @@ public function parse(InlineParserContext $inlineContext): bool [$canOpen, $canClose] = self::determineCanOpenOrClose($charBefore, $charAfter, $character, $processor); + if (! ($canOpen || $canClose)) { + $inlineContext->getContainer()->appendChild(new Text(\str_repeat($character, $numDelims))); + + return true; + } + $node = new Text(\str_repeat($character, $numDelims), [ 'delim' => true, ]); $inlineContext->getContainer()->appendChild($node); // Add entry to stack to this opener - if ($canOpen || $canClose) { - $delimiter = new Delimiter($character, $numDelims, $node, $canOpen, $canClose); - $inlineContext->getDelimiterStack()->push($delimiter); - } + $delimiter = new Delimiter($character, $numDelims, $node, $canOpen, $canClose, $inlineContext->getCursor()->getPosition()); + $inlineContext->getDelimiterStack()->push($delimiter); return true; } From 1ced64623b47873dee5812419787afc902c2813a Mon Sep 17 00:00:00 2001 From: Colin O'Dell Date: Sat, 7 Dec 2024 10:17:01 -0500 Subject: [PATCH 07/29] Avoid triggering PHP segfault issue when dealing with tons of delimiters --- CHANGELOG.md | 1 + src/Delimiter/DelimiterStack.php | 35 ++++++++++++++++++++++++++++++++ 2 files changed, 36 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index ef2a89032e..91fa508743 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -96,6 +96,7 @@ Updates should follow the [Keep a CHANGELOG](https://keepachangelog.com/) princi - Fixed NUL characters not being replaced in the input - Fixed quadratic complexity parsing unclosed inline links - Fixed quadratic complexity finding the bottom opener for emphasis and strikethrough delimiters +- Fixed issue where having 500,000+ delimiters could trigger a [known segmentation fault issue in PHP's garbage collection](https://bugs.php.net/bug.php?id=68606) ## [2.4.1] - 2023-08-30 diff --git a/src/Delimiter/DelimiterStack.php b/src/Delimiter/DelimiterStack.php index b2cdcd9bc4..b20d55053f 100644 --- a/src/Delimiter/DelimiterStack.php +++ b/src/Delimiter/DelimiterStack.php @@ -28,6 +28,9 @@ final class DelimiterStack /** @psalm-readonly-allow-private-mutation */ private ?DelimiterInterface $top = null; + /** @var DelimiterInterface[] */ + private array $removedDelimiters = []; + public function push(DelimiterInterface $newDelimiter): void { $newDelimiter->setPrevious($this->top); @@ -63,6 +66,9 @@ public function removeDelimiter(DelimiterInterface $delimiter): void /** @psalm-suppress PossiblyNullReference */ $delimiter->getNext()->setPrevious($delimiter->getPrevious()); } + + // Mark this delimiter as removed so we can help garbage collect references from it later + $this->removedDelimiters[] = $delimiter; } private function removeDelimiterAndNode(DelimiterInterface $delimiter): void @@ -130,6 +136,8 @@ public function processDelimiters(?DelimiterInterface $stackBottom, DelimiterPro // Move forward, looking for closers, and handling each while ($closer !== null) { + $this->garbageCollectRemovedDelimiterReferences(); + $closingDelimiterChar = $closer->getChar(); $delimiterProcessor = $processors->getDelimiterProcessor($closingDelimiterChar); @@ -214,4 +222,31 @@ public function processDelimiters(?DelimiterInterface $stackBottom, DelimiterPro // Remove all delimiters $this->removeAll($stackBottom); } + + /** + * Nullify all references from removed delimiters to other delimiters. + * + * All references to this particular delimiter in the linked list should be gone, + * but it's possible we're still hanging on to other references to things that + * have been (or soon will be) removed, which may interfere with efficient + * garbage collection by the PHP runtime. + * + * Explicitly releasing these references should help to avoid possible + * segfaults like in https://bugs.php.net/bug.php?id=68606. + * + * Note that we cannot do this in the `removeDelimiter()` method, because + * that method is called from within a loop that iterates over the linked + * list. If we were to nullify the references there, we'd be modifying the + * linked list while we're iterating over it, which would cause the loop to + * skip over some elements. + */ + private function garbageCollectRemovedDelimiterReferences(): void + { + foreach ($this->removedDelimiters as $delimiter) { + $delimiter->setPrevious(null); + $delimiter->setNext(null); + } + + $this->removedDelimiters = []; + } } From ce6ce98d42bd038f1dde0a5b4941e9319b783b6a Mon Sep 17 00:00:00 2001 From: Colin O'Dell Date: Sat, 7 Dec 2024 10:17:01 -0500 Subject: [PATCH 08/29] Better segfault workaround --- src/Delimiter/DelimiterStack.php | 46 +++++++++++--------------------- 1 file changed, 16 insertions(+), 30 deletions(-) diff --git a/src/Delimiter/DelimiterStack.php b/src/Delimiter/DelimiterStack.php index b20d55053f..60e24d6707 100644 --- a/src/Delimiter/DelimiterStack.php +++ b/src/Delimiter/DelimiterStack.php @@ -28,9 +28,6 @@ final class DelimiterStack /** @psalm-readonly-allow-private-mutation */ private ?DelimiterInterface $top = null; - /** @var DelimiterInterface[] */ - private array $removedDelimiters = []; - public function push(DelimiterInterface $newDelimiter): void { $newDelimiter->setPrevious($this->top); @@ -67,8 +64,15 @@ public function removeDelimiter(DelimiterInterface $delimiter): void $delimiter->getNext()->setPrevious($delimiter->getPrevious()); } - // Mark this delimiter as removed so we can help garbage collect references from it later - $this->removedDelimiters[] = $delimiter; + // Nullify all references from the removed delimiter to other delimiters. + // All references to this particular delimiter in the linked list should be gone, + // but it's possible we're still hanging on to other references to things that + // have been (or soon will be) removed, which may interfere with efficient + // garbage collection by the PHP runtime. + // Explicitly releasing these references should help to avoid possible + // segfaults like in https://bugs.php.net/bug.php?id=68606. + $delimiter->setPrevious(null); + $delimiter->setNext(null); } private function removeDelimiterAndNode(DelimiterInterface $delimiter): void @@ -136,8 +140,6 @@ public function processDelimiters(?DelimiterInterface $stackBottom, DelimiterPro // Move forward, looking for closers, and handling each while ($closer !== null) { - $this->garbageCollectRemovedDelimiterReferences(); - $closingDelimiterChar = $closer->getChar(); $delimiterProcessor = $processors->getDelimiterProcessor($closingDelimiterChar); @@ -181,10 +183,13 @@ public function processDelimiters(?DelimiterInterface $stackBottom, DelimiterPro if (! $potentialOpenerFound && ! $closer->canOpen()) { // We can remove a closer that can't be an opener, // once we've seen there's no matching opener. + $next = $closer->getNext(); $this->removeDelimiter($closer); + $closer = $next; + } else { + $closer = $closer->getNext(); } - $closer = $closer->getNext(); continue; } @@ -224,29 +229,10 @@ public function processDelimiters(?DelimiterInterface $stackBottom, DelimiterPro } /** - * Nullify all references from removed delimiters to other delimiters. - * - * All references to this particular delimiter in the linked list should be gone, - * but it's possible we're still hanging on to other references to things that - * have been (or soon will be) removed, which may interfere with efficient - * garbage collection by the PHP runtime. - * - * Explicitly releasing these references should help to avoid possible - * segfaults like in https://bugs.php.net/bug.php?id=68606. - * - * Note that we cannot do this in the `removeDelimiter()` method, because - * that method is called from within a loop that iterates over the linked - * list. If we were to nullify the references there, we'd be modifying the - * linked list while we're iterating over it, which would cause the loop to - * skip over some elements. + * @internal */ - private function garbageCollectRemovedDelimiterReferences(): void + public function __destruct() { - foreach ($this->removedDelimiters as $delimiter) { - $delimiter->setPrevious(null); - $delimiter->setNext(null); - } - - $this->removedDelimiters = []; + $this->removeAll(); } } From dd62b76f92cdb01a28bc48442820d9f75919231d Mon Sep 17 00:00:00 2001 From: Colin O'Dell Date: Sat, 7 Dec 2024 10:17:01 -0500 Subject: [PATCH 09/29] Avoid calling getRemainder() until we know we need that full substring --- src/Parser/Cursor.php | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/Parser/Cursor.php b/src/Parser/Cursor.php index faae75bc1d..598cd75b52 100644 --- a/src/Parser/Cursor.php +++ b/src/Parser/Cursor.php @@ -322,17 +322,17 @@ public function advanceToNextNonSpaceOrTab(): int */ public function advanceToNextNonSpaceOrNewline(): int { - $remainder = $this->getRemainder(); + $currentCharacter = $this->getCurrentCharacter(); // Optimization: Avoid the regex if we know there are no spaces or newlines - if ($remainder === '' || ($remainder[0] !== ' ' && $remainder[0] !== "\n")) { + if ($currentCharacter !== ' ' && $currentCharacter !== "\n") { $this->previousPosition = $this->currentPosition; return 0; } $matches = []; - \preg_match('/^ *(?:\n *)?/', $remainder, $matches, \PREG_OFFSET_CAPTURE); + \preg_match('/^ *(?:\n *)?/', $this->getRemainder(), $matches, \PREG_OFFSET_CAPTURE); // [0][0] contains the matched text // [0][1] contains the index of that match From 15982df26f0554ac47bff3141d16cb1375cd27c5 Mon Sep 17 00:00:00 2001 From: Colin O'Dell Date: Sat, 7 Dec 2024 10:17:02 -0500 Subject: [PATCH 10/29] Optimize case folding for ASCII-only strings --- src/Normalizer/TextNormalizer.php | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/Normalizer/TextNormalizer.php b/src/Normalizer/TextNormalizer.php index 7860f1b939..43eb117453 100644 --- a/src/Normalizer/TextNormalizer.php +++ b/src/Normalizer/TextNormalizer.php @@ -34,6 +34,11 @@ public function normalize(string $text, array $context = []): string $text = \preg_replace('/[ \t\r\n]+/', ' ', \trim($text)); \assert(\is_string($text)); + // Is it strictly ASCII? If so, we can use strtolower() instead (faster) + if (\mb_check_encoding($text, 'ASCII')) { + return \strtolower($text); + } + return \mb_convert_case($text, \MB_CASE_FOLD, 'UTF-8'); } } From 02eeb63cefd7ba9b1f7133dd89b76577e9f2a034 Mon Sep 17 00:00:00 2001 From: Colin O'Dell Date: Sat, 7 Dec 2024 10:17:02 -0500 Subject: [PATCH 11/29] Avoid unnecessary normalization when refmap is empty --- src/Reference/ReferenceMap.php | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/Reference/ReferenceMap.php b/src/Reference/ReferenceMap.php index 982cb1253a..97a167dcf2 100644 --- a/src/Reference/ReferenceMap.php +++ b/src/Reference/ReferenceMap.php @@ -48,6 +48,10 @@ public function add(ReferenceInterface $reference): void public function contains(string $label): bool { + if ($this->references === []) { + return false; + } + $label = $this->normalizer->normalize($label); return isset($this->references[$label]); @@ -55,6 +59,10 @@ public function contains(string $label): bool public function get(string $label): ?ReferenceInterface { + if ($this->references === []) { + return null; + } + $label = $this->normalizer->normalize($label); return $this->references[$label] ?? null; From 0f53cf1f3d97a46e369d6189d8f1bb9be8f3b0a3 Mon Sep 17 00:00:00 2001 From: Colin O'Dell Date: Sat, 7 Dec 2024 10:17:02 -0500 Subject: [PATCH 12/29] Optimize link destination parsing --- src/Util/LinkParserHelper.php | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/Util/LinkParserHelper.php b/src/Util/LinkParserHelper.php index edf82b28e2..af4d5e15c0 100644 --- a/src/Util/LinkParserHelper.php +++ b/src/Util/LinkParserHelper.php @@ -30,14 +30,14 @@ final class LinkParserHelper */ public static function parseLinkDestination(Cursor $cursor): ?string { - if ($res = $cursor->match(RegexHelper::REGEX_LINK_DESTINATION_BRACES)) { - // Chop off surrounding <..>: - return UrlEncoder::unescapeAndEncode( - RegexHelper::unescape(\substr($res, 1, -1)) - ); - } - if ($cursor->getCurrentCharacter() === '<') { + if ($res = $cursor->match(RegexHelper::REGEX_LINK_DESTINATION_BRACES)) { + // Chop off surrounding <..>: + return UrlEncoder::unescapeAndEncode( + RegexHelper::unescape(\substr($res, 1, -1)) + ); + } + return null; } @@ -100,9 +100,9 @@ public static function parsePartialLinkTitle(Cursor $cursor, string $endDelimite private static function manuallyParseLinkDestination(Cursor $cursor): ?string { - $remainder = $cursor->getRemainder(); + $remainder = $cursor->getRemainder(); $openParens = 0; - $len = \strlen($remainder); + $len = \strlen($remainder); for ($i = 0; $i < $len; $i++) { $c = $remainder[$i]; if ($c === '\\' && $i + 1 < $len && RegexHelper::isEscapable($remainder[$i + 1])) { From 7ce82c66841412e311fced9c967bf94f91aebd73 Mon Sep 17 00:00:00 2001 From: Colin O'Dell Date: Sat, 7 Dec 2024 10:17:02 -0500 Subject: [PATCH 13/29] Add config option to prevent denial-of-service with large tables --- .phpstorm.meta.php | 1 + CHANGELOG.md | 1 + docs/2.5/extensions/tables.md | 9 +++++++++ src/Extension/Table/TableExtension.php | 3 ++- src/Extension/Table/TableParser.php | 24 +++++++++++++++++++----- src/Extension/Table/TableStartParser.php | 9 ++++++++- 6 files changed, 40 insertions(+), 7 deletions(-) diff --git a/.phpstorm.meta.php b/.phpstorm.meta.php index ca1bec7ee1..0b56ab530d 100644 --- a/.phpstorm.meta.php +++ b/.phpstorm.meta.php @@ -89,6 +89,7 @@ 'table/alignment_attributes/left', 'table/alignment_attributes/center', 'table/alignment_attributes/right', + 'table/max_autocompleted_cells', 'table_of_contents', 'table_of_contents/html_class', 'table_of_contents/max_heading_level', diff --git a/CHANGELOG.md b/CHANGELOG.md index 91fa508743..d2d544b137 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,7 @@ Updates should follow the [Keep a CHANGELOG](https://keepachangelog.com/) princi - Added `RegexHelper::isWhitespace()` method to check if a given character is an ASCII whitespace character - Added `CacheableDelimiterProcessorInterface` to ensure linear complexity for dynamic delimiter processing +- Added `table/max_autocompleted_cells` config option to prevent denial of service attacks when parsing large tables ### Changed diff --git a/docs/2.5/extensions/tables.md b/docs/2.5/extensions/tables.md index c98bd07264..5f838a675e 100644 --- a/docs/2.5/extensions/tables.md +++ b/docs/2.5/extensions/tables.md @@ -44,6 +44,7 @@ $config = [ 'center' => ['align' => 'center'], 'right' => ['align' => 'right'], ], + 'max_autocompleted_cells' => 10_000, ], ]; @@ -159,6 +160,14 @@ $config = [ Or any other HTML attributes you'd like! +### Limiting Auto-Completed Cells + +The GFM specification says that the: + +> table’s rows may vary in the number of cells. If there are a number of cells fewer than the number of cells in the header row, empty cells are inserted. + +This feature could be abused to create very large tables. To prevent this, you can configure the `max_autocompleted_cells` option to limit the number of empty cells that will be autocompleted. If the limit is reached, further parsing of the table will be aborted. + ## Credits The Table functionality was originally built by [Martin Hasoň](https://github.com/hason) and [Webuni s.r.o.](https://www.webuni.cz) before it was merged into the core parser. diff --git a/src/Extension/Table/TableExtension.php b/src/Extension/Table/TableExtension.php index 27a58bbdc6..0a8db3ed2a 100644 --- a/src/Extension/Table/TableExtension.php +++ b/src/Extension/Table/TableExtension.php @@ -41,6 +41,7 @@ public function configureSchema(ConfigurationBuilderInterface $builder): void 'center' => (clone $attributeArraySchema)->default(['align' => 'center']), 'right' => (clone $attributeArraySchema)->default(['align' => 'right']), ]), + 'max_autocompleted_cells' => Expect::int()->min(0)->default(TableParser::DEFAULT_MAX_AUTOCOMPLETED_CELLS), ])); } @@ -52,7 +53,7 @@ public function register(EnvironmentBuilderInterface $environment): void } $environment - ->addBlockStartParser(new TableStartParser()) + ->addBlockStartParser(new TableStartParser($environment->getConfiguration()->get('table/max_autocompleted_cells'))) ->addRenderer(Table::class, $tableRenderer) ->addRenderer(TableSection::class, new TableSectionRenderer()) diff --git a/src/Extension/Table/TableParser.php b/src/Extension/Table/TableParser.php index ca340a31f2..8ceb2ad5eb 100644 --- a/src/Extension/Table/TableParser.php +++ b/src/Extension/Table/TableParser.php @@ -25,6 +25,11 @@ final class TableParser extends AbstractBlockContinueParser implements BlockContinueParserWithInlinesInterface { + /** + * @internal + */ + public const DEFAULT_MAX_AUTOCOMPLETED_CELLS = 10_000; + /** @psalm-readonly */ private Table $block; @@ -54,6 +59,8 @@ final class TableParser extends AbstractBlockContinueParser implements BlockCont /** @psalm-readonly-allow-private-mutation */ private bool $nextIsSeparatorLine = true; + private int $remainingAutocompletedCells; + /** * @param array $columns * @param array $headerCells @@ -62,12 +69,13 @@ final class TableParser extends AbstractBlockContinueParser implements BlockCont * * @phpstan-param array $columns */ - public function __construct(array $columns, array $headerCells) + public function __construct(array $columns, array $headerCells, int $remainingAutocompletedCells = self::DEFAULT_MAX_AUTOCOMPLETED_CELLS) { - $this->block = new Table(); - $this->bodyLines = new ArrayCollection(); - $this->columns = $columns; - $this->headerCells = $headerCells; + $this->block = new Table(); + $this->bodyLines = new ArrayCollection(); + $this->columns = $columns; + $this->headerCells = $headerCells; + $this->remainingAutocompletedCells = $remainingAutocompletedCells; } public function canHaveLazyContinuationLines(): bool @@ -121,6 +129,12 @@ public function parseInlines(InlineParserEngineInterface $inlineParser): void // Body can not have more columns than head for ($i = 0; $i < $headerColumns; $i++) { + // It can have less columns though, in which case we'll autocomplete the empty ones (up to some limit) + if (! isset($cells[$i]) && $this->remainingAutocompletedCells-- <= 0) { + // Too many cells were auto-completed, so we'll just stop here + return; + } + $cell = $cells[$i] ?? ''; $tableCell = $this->parseCell($cell, $i, $inlineParser); $row->appendChild($tableCell); diff --git a/src/Extension/Table/TableStartParser.php b/src/Extension/Table/TableStartParser.php index 08b4878f74..7411951c3d 100644 --- a/src/Extension/Table/TableStartParser.php +++ b/src/Extension/Table/TableStartParser.php @@ -23,6 +23,13 @@ final class TableStartParser implements BlockStartParserInterface { + private int $maxAutocompletedCells; + + public function __construct(int $maxAutocompletedCells = TableParser::DEFAULT_MAX_AUTOCOMPLETED_CELLS) + { + $this->maxAutocompletedCells = $maxAutocompletedCells; + } + public function tryStart(Cursor $cursor, MarkdownParserStateInterface $parserState): ?BlockStart { $paragraph = $parserState->getParagraphContent(); @@ -53,7 +60,7 @@ public function tryStart(Cursor $cursor, MarkdownParserStateInterface $parserSta $parsers[] = $p; } - $parsers[] = new TableParser($columns, $headerCells); + $parsers[] = new TableParser($columns, $headerCells, $this->maxAutocompletedCells); return BlockStart::of(...$parsers) ->at($cursor) From 4f9261f0ff520352211c92cf9224670c88f2673a Mon Sep 17 00:00:00 2001 From: Colin O'Dell Date: Sat, 7 Dec 2024 10:17:02 -0500 Subject: [PATCH 14/29] Optimize TableParser::parseCell() for empty cells --- src/Extension/Table/TableParser.php | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/src/Extension/Table/TableParser.php b/src/Extension/Table/TableParser.php index 8ceb2ad5eb..a005f8a97e 100644 --- a/src/Extension/Table/TableParser.php +++ b/src/Extension/Table/TableParser.php @@ -152,14 +152,12 @@ public function parseInlines(InlineParserEngineInterface $inlineParser): void private function parseCell(string $cell, int $column, InlineParserEngineInterface $inlineParser): TableCell { - $tableCell = new TableCell(); + $tableCell = new TableCell(TableCell::TYPE_DATA, $this->columns[$column] ?? null); - if ($column < \count($this->columns)) { - $tableCell->setAlign($this->columns[$column]); + if ($cell !== '') { + $inlineParser->parse(\trim($cell), $tableCell); } - $inlineParser->parse(\trim($cell), $tableCell); - return $tableCell; } From 67cd33be3a861e40f88298e6903e4f3709e1eac7 Mon Sep 17 00:00:00 2001 From: Colin O'Dell Date: Sat, 7 Dec 2024 10:17:02 -0500 Subject: [PATCH 15/29] Limit number of subdomains allowed in an autolink --- CHANGELOG.md | 1 + src/Extension/Autolink/UrlAutolinkParser.php | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d2d544b137..e8e3fa61d5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,6 +14,7 @@ Updates should follow the [Keep a CHANGELOG](https://keepachangelog.com/) princi ### Changed +- `UrlAutolinkParser` no longer parses URLs with more than 127 subdomains - Several small performance optimizations ## [2.5.3] - 2024-08-16 diff --git a/src/Extension/Autolink/UrlAutolinkParser.php b/src/Extension/Autolink/UrlAutolinkParser.php index 1ef270fe85..f487616552 100644 --- a/src/Extension/Autolink/UrlAutolinkParser.php +++ b/src/Extension/Autolink/UrlAutolinkParser.php @@ -34,7 +34,7 @@ final class UrlAutolinkParser implements InlineParserInterface (?: (?:xn--[a-z0-9-]++\.)*+xn--[a-z0-9-]++ # a domain name using punycode | - (?:[\pL\pN\pS\pM\-\_]++\.)+[\pL\pN\pM]++ # a multi-level domain name + (?:[\pL\pN\pS\pM\-\_]++\.){1,127}[\pL\pN\pM]++ # a multi-level domain name; total length must be 253 bytes or less | [a-z0-9\-\_]++ # a single-level domain name )\.? From 00f06e53900397bfc40be04ca49c333b69499a67 Mon Sep 17 00:00:00 2001 From: Colin O'Dell Date: Sat, 7 Dec 2024 10:17:03 -0500 Subject: [PATCH 16/29] Avoid quadratic output growth when expanding reference links Keep track of the number bytes added through expansion of reference links and limit the total to the size of the input document. Always allow a minimum of 100KB. See https://github.com/github/cmark-gfm/commit/b4f0106289d0ae41b591c91ee340d455b5049c2a --- CHANGELOG.md | 1 + src/Parser/MarkdownParser.php | 7 ++- src/Reference/MemoryLimitedReferenceMap.php | 68 +++++++++++++++++++++ 3 files changed, 73 insertions(+), 3 deletions(-) create mode 100644 src/Reference/MemoryLimitedReferenceMap.php diff --git a/CHANGELOG.md b/CHANGELOG.md index e8e3fa61d5..68dfad6d3e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,6 +15,7 @@ Updates should follow the [Keep a CHANGELOG](https://keepachangelog.com/) princi ### Changed - `UrlAutolinkParser` no longer parses URLs with more than 127 subdomains +- Expanded reference links can no longer exceed 100kb, or the size of the input document (whichever is greater) - Several small performance optimizations ## [2.5.3] - 2024-08-16 diff --git a/src/Parser/MarkdownParser.php b/src/Parser/MarkdownParser.php index 82ebb6682d..904c7c45b4 100644 --- a/src/Parser/MarkdownParser.php +++ b/src/Parser/MarkdownParser.php @@ -32,6 +32,7 @@ use League\CommonMark\Parser\Block\BlockStartParserInterface; use League\CommonMark\Parser\Block\DocumentBlockParser; use League\CommonMark\Parser\Block\ParagraphParser; +use League\CommonMark\Reference\MemoryLimitedReferenceMap; use League\CommonMark\Reference\ReferenceInterface; use League\CommonMark\Reference\ReferenceMap; @@ -102,7 +103,7 @@ public function parse(string $input): Document // finalizeAndProcess $this->closeBlockParsers(\count($this->activeBlockParsers), $this->lineNumber); - $this->processInlines(); + $this->processInlines(\strlen($input)); $this->environment->dispatch(new DocumentParsedEvent($documentParser->getBlock())); @@ -266,9 +267,9 @@ private function finalize(BlockContinueParserInterface $blockParser, int $endLin /** * Walk through a block & children recursively, parsing string content into inline content where appropriate. */ - private function processInlines(): void + private function processInlines(int $inputSize): void { - $p = new InlineParserEngine($this->environment, $this->referenceMap); + $p = new InlineParserEngine($this->environment, new MemoryLimitedReferenceMap($this->referenceMap, $inputSize)); foreach ($this->closedBlockParsers as $blockParser) { $blockParser->parseInlines($p); diff --git a/src/Reference/MemoryLimitedReferenceMap.php b/src/Reference/MemoryLimitedReferenceMap.php new file mode 100644 index 0000000000..d47bd6a6de --- /dev/null +++ b/src/Reference/MemoryLimitedReferenceMap.php @@ -0,0 +1,68 @@ + + * + * For the full copyright and license information, please view the LICENSE + * file that was distributed with this source code. + */ + +namespace League\CommonMark\Reference; + +final class MemoryLimitedReferenceMap implements ReferenceMapInterface +{ + private ReferenceMapInterface $decorated; + + private const MINIMUM_SIZE = 100_000; + + private int $remaining; + + public function __construct(ReferenceMapInterface $decorated, int $maxSize) + { + $this->decorated = $decorated; + $this->remaining = \max(self::MINIMUM_SIZE, $maxSize); + } + + public function add(ReferenceInterface $reference): void + { + $this->decorated->add($reference); + } + + public function contains(string $label): bool + { + return $this->decorated->contains($label); + } + + public function get(string $label): ?ReferenceInterface + { + $reference = $this->decorated->get($label); + if ($reference === null) { + return null; + } + + // Check for expansion limit + $this->remaining -= \strlen($reference->getDestination()) + \strlen($reference->getTitle()); + if ($this->remaining < 0) { + return null; + } + + return $reference; + } + + /** + * @return \Traversable + */ + public function getIterator(): \Traversable + { + return $this->decorated->getIterator(); + } + + public function count(): int + { + return $this->decorated->count(); + } +} From af0d839b49a1c7c86d55bd0dbfc2d9c3234ea2c8 Mon Sep 17 00:00:00 2001 From: Colin O'Dell Date: Sat, 7 Dec 2024 10:17:03 -0500 Subject: [PATCH 17/29] Add tests for DelimiterStack --- tests/unit/Delimiter/DelimiterStackTest.php | 248 ++++++++++++++++++++ 1 file changed, 248 insertions(+) create mode 100644 tests/unit/Delimiter/DelimiterStackTest.php diff --git a/tests/unit/Delimiter/DelimiterStackTest.php b/tests/unit/Delimiter/DelimiterStackTest.php new file mode 100644 index 0000000000..93d3e12210 --- /dev/null +++ b/tests/unit/Delimiter/DelimiterStackTest.php @@ -0,0 +1,248 @@ + + * + * For the full copyright and license information, please view the LICENSE + * file that was distributed with this source code. + */ + +namespace League\CommonMark\Tests\Unit\Delimiter; + +use League\CommonMark\Delimiter\Delimiter; +use League\CommonMark\Delimiter\DelimiterStack; +use League\CommonMark\Node\Inline\Text; +use PHPUnit\Framework\TestCase; + +final class DelimiterStackTest extends TestCase +{ + public function testPush(): void + { + $text1 = new Text('*'); + $text2 = new Text('_'); + $text3 = new Text('*'); + + $delim1 = new Delimiter('*', 1, $text1, true, false); + $delim2 = new Delimiter('_', 1, $text2, false, false); + $delim3 = new Delimiter('*', 1, $text3, false, true); + + $stack = new DelimiterStack(); + + $this->assertNull($stack->searchByCharacter('*')); + $this->assertNull($stack->searchByCharacter('_')); + + $stack->push($delim1); + + $this->assertSame($delim1, $stack->searchByCharacter('*')); + $this->assertNull($stack->searchByCharacter('_')); + $this->assertNull($delim1->getPrevious()); + $this->assertNull($delim1->getNext()); + $this->assertNull($delim2->getPrevious()); + $this->assertNull($delim2->getNext()); + $this->assertNull($delim3->getPrevious()); + $this->assertNull($delim3->getNext()); + + $stack->push($delim2); + + $this->assertSame($delim1, $stack->searchByCharacter('*')); + $this->assertSame($delim2, $stack->searchByCharacter('_')); + $this->assertNull($delim1->getPrevious()); + $this->assertSame($delim2, $delim1->getNext()); + $this->assertSame($delim1, $delim2->getPrevious()); + $this->assertNull($delim2->getNext()); + $this->assertNull($delim3->getPrevious()); + $this->assertNull($delim3->getNext()); + + $stack->push($delim3); + + $this->assertSame($delim3, $stack->searchByCharacter('*')); + $this->assertSame($delim2, $stack->searchByCharacter('_')); + $this->assertNull($delim1->getPrevious()); + $this->assertSame($delim2, $delim1->getNext()); + $this->assertSame($delim1, $delim2->getPrevious()); + $this->assertSame($delim3, $delim2->getNext()); + $this->assertSame($delim2, $delim3->getPrevious()); + $this->assertNull($delim3->getNext()); + } + + public function testRemoveDelimiter(): void + { + $text1 = new Text('*'); + $text2 = new Text('_'); + $text3 = new Text('*'); + + $stack = new DelimiterStack(); + $stack->push($delim1 = new Delimiter('*', 1, $text1, true, false)); + $stack->push($delim2 = new Delimiter('_', 1, $text2, false, false)); + $stack->push($delim3 = new Delimiter('*', 1, $text3, false, true)); + + $this->assertNull($delim1->getPrevious()); + $this->assertSame($delim2, $delim1->getNext()); + + $this->assertSame($delim1, $delim2->getPrevious()); + $this->assertSame($delim3, $delim2->getNext()); + + $this->assertSame($delim2, $delim3->getPrevious()); + $this->assertNull($delim3->getNext()); + + $stack->removeDelimiter($delim2); + + $this->assertNull($delim1->getPrevious()); + $this->assertSame($delim3, $delim1->getNext()); + + $this->assertNull($delim2->getPrevious()); + $this->assertNull($delim2->getNext()); + + $this->assertSame($delim1, $delim3->getPrevious()); + $this->assertNull($delim3->getNext()); + + $stack->removeDelimiter($delim3); + + $this->assertNull($delim1->getPrevious()); + $this->assertNull($delim1->getNext()); + + $this->assertNull($delim3->getPrevious()); + $this->assertNull($delim3->getNext()); + } + + public function testRemoveDelimitersBetween(): void + { + $text1 = new Text('*'); + $text2 = new Text('_'); + $text3 = new Text('*'); + $text4 = new Text('_'); + $text5 = new Text('*'); + + $stack = new DelimiterStack(); + $stack->push($delim1 = new Delimiter('*', 1, $text1, false, false)); + $stack->push($delim2 = new Delimiter('_', 1, $text2, false, false)); + $stack->push($delim3 = new Delimiter('*', 1, $text3, false, false)); + $stack->push($delim4 = new Delimiter('_', 1, $text4, false, false)); + $stack->push($delim5 = new Delimiter('*', 1, $text5, false, false)); + + // Make removeDelimiterBetween() callable using reflection + $reflection = new \ReflectionClass($stack); + $method = $reflection->getMethod('removeDelimitersBetween'); + $method->setAccessible(true); + + $method->invoke($stack, $delim1, $delim5); + + $this->assertNull($delim1->getPrevious()); + $this->assertSame($delim5, $delim1->getNext()); + + $this->assertNull($delim2->getPrevious()); + $this->assertNull($delim2->getNext()); + + $this->assertNull($delim3->getPrevious()); + $this->assertNull($delim3->getNext()); + + $this->assertNull($delim4->getPrevious()); + $this->assertNull($delim4->getNext()); + + $this->assertSame($delim1, $delim5->getPrevious()); + $this->assertNull($delim5->getNext()); + } + + public function testFindEarliest(): void + { + $text1 = new Text('*'); + $text2 = new Text('_'); + $text3 = new Text('*'); + + $stack = new DelimiterStack(); + $stack->push($delim1 = new Delimiter('*', 1, $text1, true, false)); + $stack->push($delim2 = new Delimiter('_', 1, $text2, false, false)); + $stack->push($delim3 = new Delimiter('*', 1, $text3, false, true)); + + // Use reflection to make findEarliest() callable + $reflection = new \ReflectionClass($stack); + $method = $reflection->getMethod('findEarliest'); + $method->setAccessible(true); + + $this->assertSame($delim1, $method->invoke($stack)); + $this->assertSame($delim2, $method->invoke($stack, $delim1)); + $this->assertSame($delim3, $method->invoke($stack, $delim2)); + $this->assertNull($method->invoke($stack, $delim3)); + } + + public function testRemoveAll(): void + { + $text1 = new Text('*'); + $text2 = new Text('_'); + $text3 = new Text('*'); + + $stack = new DelimiterStack(); + $stack->push($delim1 = new Delimiter('*', 1, $text1, true, false)); + $stack->push($delim2 = new Delimiter('_', 1, $text2, false, false)); + $stack->push($delim3 = new Delimiter('*', 1, $text3, false, true)); + + $stack->removeAll(); + + $this->assertNull($delim1->getPrevious()); + $this->assertNull($delim1->getNext()); + + $this->assertNull($delim2->getPrevious()); + $this->assertNull($delim2->getNext()); + + $this->assertNull($delim3->getPrevious()); + $this->assertNull($delim3->getNext()); + + $this->assertNull($stack->searchByCharacter('*')); + $this->assertNull($stack->searchByCharacter('_')); + } + + public function testRemoveAllWithStackBottomGiven(): void + { + $text1 = new Text('*'); + $text2 = new Text('_'); + $text3 = new Text('*'); + + $stack = new DelimiterStack(); + $stack->push($delim1 = new Delimiter('*', 1, $text1, true, false)); + $stack->push($delim2 = new Delimiter('_', 1, $text2, false, false)); + $stack->push($delim3 = new Delimiter('*', 1, $text3, false, true)); + + $stack->removeAll($delim2); + + $this->assertSame($delim1, $stack->searchByCharacter('*')); + $this->assertSame($delim2, $stack->searchByCharacter('_')); + + $this->assertNull($delim1->getPrevious()); + $this->assertSame($delim2, $delim1->getNext()); + + $this->assertSame($delim1, $delim2->getPrevious()); + $this->assertNull($delim2->getNext()); + + $this->assertNull($delim3->getPrevious()); + $this->assertNull($delim3->getNext()); + } + + public function testRemoveEarlierMatches(): void + { + $text1 = new Text('*'); + $text2 = new Text('_'); + $text3 = new Text('*'); + + $stack = new DelimiterStack(); + $stack->push($delim1 = new Delimiter('*', 1, $text1, true, false)); + $stack->push($delim2 = new Delimiter('_', 1, $text2, false, false)); + $stack->push($delim3 = new Delimiter('*', 1, $text3, false, true)); + + $this->assertTrue($delim1->isActive()); + $this->assertTrue($delim2->isActive()); + $this->assertTrue($delim3->isActive()); + + $stack->removeEarlierMatches('*'); + + $this->assertFalse($delim1->isActive()); + $this->assertTrue($delim2->isActive()); + $this->assertFalse($delim3->isActive()); + + $stack->removeEarlierMatches('_'); + $this->assertFalse($delim2->isActive()); + } +} From a8df15c6bc234c8c961a61c2a04b4031aefa6e04 Mon Sep 17 00:00:00 2001 From: Colin O'Dell Date: Sat, 7 Dec 2024 10:17:03 -0500 Subject: [PATCH 18/29] Use a separate stack for brackets See https://github.com/commonmark/commonmark.js/pull/101 --- CHANGELOG.md | 4 + src/Delimiter/Bracket.php | 87 +++++++++++++++++++ src/Delimiter/DelimiterInterface.php | 6 ++ src/Delimiter/DelimiterStack.php | 67 +++++++++++++- .../CommonMark/Parser/Inline/BangParser.php | 4 +- .../Parser/Inline/CloseBracketParser.php | 46 +++++----- .../Parser/Inline/OpenBracketParser.php | 4 +- tests/unit/Delimiter/DelimiterStackTest.php | 82 +++++++++++++++++ 8 files changed, 271 insertions(+), 29 deletions(-) create mode 100644 src/Delimiter/Bracket.php diff --git a/CHANGELOG.md b/CHANGELOG.md index 68dfad6d3e..c7b1089347 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,9 +11,11 @@ Updates should follow the [Keep a CHANGELOG](https://keepachangelog.com/) princi - Added `RegexHelper::isWhitespace()` method to check if a given character is an ASCII whitespace character - Added `CacheableDelimiterProcessorInterface` to ensure linear complexity for dynamic delimiter processing - Added `table/max_autocompleted_cells` config option to prevent denial of service attacks when parsing large tables +- Added `Bracket` delimiter type to optimize bracket parsing ### Changed +- `[` and `]` are no longer added as `Delimiter` objects on the stack; a new `Bracket` type with its own stack is used instead - `UrlAutolinkParser` no longer parses URLs with more than 127 subdomains - Expanded reference links can no longer exceed 100kb, or the size of the input document (whichever is greater) - Several small performance optimizations @@ -93,6 +95,8 @@ Updates should follow the [Keep a CHANGELOG](https://keepachangelog.com/) princi - Returning dynamic values from `DelimiterProcessorInterface::getDelimiterUse()` is deprecated - You should instead implement `CacheableDelimiterProcessorInterface` to help the engine perform caching to avoid performance issues. +- Deprecated `DelimiterInterface::isActive()` and `DelimiterInterface::setActive()`, as these are no longer used by the engine +- Deprecated `DelimiterStack::removeEarlierMatches()` and `DelimiterStack::searchByCharacter()`, as these are no longer used by the engine ### Fixed diff --git a/src/Delimiter/Bracket.php b/src/Delimiter/Bracket.php new file mode 100644 index 0000000000..182cf60bfb --- /dev/null +++ b/src/Delimiter/Bracket.php @@ -0,0 +1,87 @@ + + * + * For the full copyright and license information, please view the LICENSE + * file that was distributed with this source code. + */ + +namespace League\CommonMark\Delimiter; + +use League\CommonMark\Node\Node; + +final class Bracket +{ + private Node $node; + private ?Bracket $previous; + private ?DelimiterInterface $previousDelimiter; + private bool $hasNext = false; + private int $index; + private bool $image; + private bool $active = true; + + public function __construct(Node $node, ?Bracket $previous, ?DelimiterInterface $previousDelimiter, int $index, bool $image) + { + $this->node = $node; + $this->previous = $previous; + $this->previousDelimiter = $previousDelimiter; + $this->index = $index; + $this->image = $image; + } + + public function getNode(): Node + { + return $this->node; + } + + public function getPrevious(): ?Bracket + { + return $this->previous; + } + + public function getPreviousDelimiter(): ?DelimiterInterface + { + return $this->previousDelimiter; + } + + public function hasNext(): bool + { + return $this->hasNext; + } + + public function getIndex(): int + { + return $this->index; + } + + public function isImage(): bool + { + return $this->image; + } + + public function isActive(): bool + { + return $this->active; + } + + /** + * @internal + */ + public function setHasNext(bool $hasNext): void + { + $this->hasNext = $hasNext; + } + + /** + * @internal + */ + public function setActive(bool $active): void + { + $this->active = $active; + } +} diff --git a/src/Delimiter/DelimiterInterface.php b/src/Delimiter/DelimiterInterface.php index 6bfa32e48e..0cefba7e60 100644 --- a/src/Delimiter/DelimiterInterface.php +++ b/src/Delimiter/DelimiterInterface.php @@ -24,8 +24,14 @@ public function canClose(): bool; public function canOpen(): bool; + /** + * @deprecated This method is no longer used internally and will be removed in 3.0 + */ public function isActive(): bool; + /** + * @deprecated This method is no longer used internally and will be removed in 3.0 + */ public function setActive(bool $active): void; public function getChar(): string; diff --git a/src/Delimiter/DelimiterStack.php b/src/Delimiter/DelimiterStack.php index 60e24d6707..63abca207d 100644 --- a/src/Delimiter/DelimiterStack.php +++ b/src/Delimiter/DelimiterStack.php @@ -22,12 +22,16 @@ use League\CommonMark\Delimiter\Processor\CacheableDelimiterProcessorInterface; use League\CommonMark\Delimiter\Processor\DelimiterProcessorCollection; use League\CommonMark\Node\Inline\AdjacentTextMerger; +use League\CommonMark\Node\Node; final class DelimiterStack { /** @psalm-readonly-allow-private-mutation */ private ?DelimiterInterface $top = null; + /** @psalm-readonly-allow-private-mutation */ + private ?Bracket $brackets = null; + public function push(DelimiterInterface $newDelimiter): void { $newDelimiter->setPrevious($this->top); @@ -39,6 +43,26 @@ public function push(DelimiterInterface $newDelimiter): void $this->top = $newDelimiter; } + /** + * @internal + */ + public function addBracket(Node $node, int $index, bool $image): void + { + if ($this->brackets !== null) { + $this->brackets->setHasNext(true); + } + + $this->brackets = new Bracket($node, $this->brackets, $this->top, $index, $image); + } + + /** + * @psalm-immutable + */ + public function getLastBracket(): ?Bracket + { + return $this->brackets; + } + private function findEarliest(?DelimiterInterface $stackBottom = null): ?DelimiterInterface { $delimiter = $this->top; @@ -49,6 +73,22 @@ private function findEarliest(?DelimiterInterface $stackBottom = null): ?Delimit return $delimiter; } + /** + * @internal + */ + public function removeBracket(): void + { + if ($this->brackets === null) { + return; + } + + $this->brackets = $this->brackets->getPrevious(); + + if ($this->brackets !== null) { + $this->brackets->setHasNext(false); + } + } + public function removeDelimiter(DelimiterInterface $delimiter): void { if ($delimiter->getPrevious() !== null) { @@ -98,6 +138,9 @@ public function removeAll(?DelimiterInterface $stackBottom = null): void } } + /** + * @deprecated This method is no longer used internally and will be removed in 3.0 + */ public function removeEarlierMatches(string $character): void { $opener = $this->top; @@ -111,6 +154,22 @@ public function removeEarlierMatches(string $character): void } /** + * @internal + */ + public function deactivateLinkOpeners(): void + { + $opener = $this->brackets; + while ($opener !== null) { + if (! $opener->isImage()) { + $opener->setActive(false); + } + $opener = $opener->getPrevious(); + } + } + + /** + * @deprecated This method is no longer used internally and will be removed in 3.0 + * * @param string|string[] $characters */ public function searchByCharacter($characters): ?DelimiterInterface @@ -233,6 +292,12 @@ public function processDelimiters(?DelimiterInterface $stackBottom, DelimiterPro */ public function __destruct() { - $this->removeAll(); + while ($this->top) { + $this->removeDelimiter($this->top); + } + + while ($this->brackets) { + $this->removeBracket(); + } } } diff --git a/src/Extension/CommonMark/Parser/Inline/BangParser.php b/src/Extension/CommonMark/Parser/Inline/BangParser.php index 8a9e1bd65c..cbf6ca3828 100644 --- a/src/Extension/CommonMark/Parser/Inline/BangParser.php +++ b/src/Extension/CommonMark/Parser/Inline/BangParser.php @@ -16,7 +16,6 @@ namespace League\CommonMark\Extension\CommonMark\Parser\Inline; -use League\CommonMark\Delimiter\Delimiter; use League\CommonMark\Node\Inline\Text; use League\CommonMark\Parser\Inline\InlineParserInterface; use League\CommonMark\Parser\Inline\InlineParserMatch; @@ -38,8 +37,7 @@ public function parse(InlineParserContext $inlineContext): bool $inlineContext->getContainer()->appendChild($node); // Add entry to stack for this opener - $delimiter = new Delimiter('!', 1, $node, true, false, $cursor->getPosition()); - $inlineContext->getDelimiterStack()->push($delimiter); + $inlineContext->getDelimiterStack()->addBracket($node, $cursor->getPosition(), true); return true; } diff --git a/src/Extension/CommonMark/Parser/Inline/CloseBracketParser.php b/src/Extension/CommonMark/Parser/Inline/CloseBracketParser.php index 16f24dc2aa..654f08ed42 100644 --- a/src/Extension/CommonMark/Parser/Inline/CloseBracketParser.php +++ b/src/Extension/CommonMark/Parser/Inline/CloseBracketParser.php @@ -16,6 +16,7 @@ namespace League\CommonMark\Extension\CommonMark\Parser\Inline; +use League\CommonMark\Delimiter\Bracket; use League\CommonMark\Environment\EnvironmentAwareInterface; use League\CommonMark\Environment\EnvironmentInterface; use League\CommonMark\Extension\CommonMark\Node\Inline\AbstractWebResource; @@ -46,14 +47,14 @@ public function getMatchDefinition(): InlineParserMatch public function parse(InlineParserContext $inlineContext): bool { // Look through stack of delimiters for a [ or ! - $opener = $inlineContext->getDelimiterStack()->searchByCharacter(['[', '!']); + $opener = $inlineContext->getDelimiterStack()->getLastBracket(); if ($opener === null) { return false; } if (! $opener->isActive()) { - // no matched opener; remove from emphasis stack - $inlineContext->getDelimiterStack()->removeDelimiter($opener); + // no matched opener; remove from stack + $inlineContext->getDelimiterStack()->removeBracket(); return false; } @@ -70,21 +71,19 @@ public function parse(InlineParserContext $inlineContext): bool // Inline link? if ($result = $this->tryParseInlineLinkAndTitle($cursor)) { $link = $result; - } elseif ($link = $this->tryParseReference($cursor, $inlineContext->getReferenceMap(), $opener->getIndex(), $startPos)) { + } elseif ($link = $this->tryParseReference($cursor, $inlineContext->getReferenceMap(), $opener, $startPos)) { $reference = $link; $link = ['url' => $link->getDestination(), 'title' => $link->getTitle()]; } else { - // No match - $inlineContext->getDelimiterStack()->removeDelimiter($opener); // Remove this opener from stack + // No match; remove this opener from stack + $inlineContext->getDelimiterStack()->removeBracket(); $cursor->restoreState($previousState); return false; } - $isImage = $opener->getChar() === '!'; - - $inline = $this->createInline($link['url'], $link['title'], $isImage, $reference ?? null); - $opener->getInlineNode()->replaceWith($inline); + $inline = $this->createInline($link['url'], $link['title'], $opener->isImage(), $reference ?? null); + $opener->getNode()->replaceWith($inline); while (($label = $inline->next()) !== null) { // Is there a Mention or Link contained within this link? // CommonMark does not allow nested links, so we'll restore the original text. @@ -104,8 +103,9 @@ public function parse(InlineParserContext $inlineContext): bool // Process delimiters such as emphasis inside link/image $delimiterStack = $inlineContext->getDelimiterStack(); - $stackBottom = $opener->getPrevious(); + $stackBottom = $opener->getPreviousDelimiter(); $delimiterStack->processDelimiters($stackBottom, $this->environment->getDelimiterProcessors()); + $delimiterStack->removeBracket(); $delimiterStack->removeAll($stackBottom); // Merge any adjacent Text nodes together @@ -113,8 +113,8 @@ public function parse(InlineParserContext $inlineContext): bool // processEmphasis will remove this and later delimiters. // Now, for a link, we also remove earlier link openers (no links in links) - if (! $isImage) { - $inlineContext->getDelimiterStack()->removeEarlierMatches('['); + if (! $opener->isImage()) { + $inlineContext->getDelimiterStack()->deactivateLinkOpeners(); } return true; @@ -168,21 +168,23 @@ private function tryParseInlineLinkAndTitle(Cursor $cursor): ?array return ['url' => $dest, 'title' => $title]; } - private function tryParseReference(Cursor $cursor, ReferenceMapInterface $referenceMap, ?int $openerIndex, int $startPos): ?ReferenceInterface + private function tryParseReference(Cursor $cursor, ReferenceMapInterface $referenceMap, Bracket $opener, int $startPos): ?ReferenceInterface { - if ($openerIndex === null) { - return null; - } - $savePos = $cursor->saveState(); $beforeLabel = $cursor->getPosition(); $n = LinkParserHelper::parseLinkLabel($cursor); - if ($n === 0 || $n === 2) { - $start = $openerIndex; - $length = $startPos - $openerIndex; - } else { + if ($n > 2) { $start = $beforeLabel + 1; $length = $n - 2; + } elseif (! $opener->hasNext()) { + // Empty or missing second label means to use the first label as the reference. + // The reference must not contain a bracket. If we know there's a bracket, we don't even bother checking it. + $start = $opener->getIndex(); + $length = $startPos - $start; + } else { + $cursor->restoreState($savePos); + + return null; } $referenceLabel = $cursor->getSubstring($start, $length); diff --git a/src/Extension/CommonMark/Parser/Inline/OpenBracketParser.php b/src/Extension/CommonMark/Parser/Inline/OpenBracketParser.php index 2b52d1cdc6..1ba8c133ab 100644 --- a/src/Extension/CommonMark/Parser/Inline/OpenBracketParser.php +++ b/src/Extension/CommonMark/Parser/Inline/OpenBracketParser.php @@ -16,7 +16,6 @@ namespace League\CommonMark\Extension\CommonMark\Parser\Inline; -use League\CommonMark\Delimiter\Delimiter; use League\CommonMark\Node\Inline\Text; use League\CommonMark\Parser\Inline\InlineParserInterface; use League\CommonMark\Parser\Inline\InlineParserMatch; @@ -36,8 +35,7 @@ public function parse(InlineParserContext $inlineContext): bool $inlineContext->getContainer()->appendChild($node); // Add entry to stack for this opener - $delimiter = new Delimiter('[', 1, $node, true, false, $inlineContext->getCursor()->getPosition()); - $inlineContext->getDelimiterStack()->push($delimiter); + $inlineContext->getDelimiterStack()->addBracket($node, $inlineContext->getCursor()->getPosition(), false); return true; } diff --git a/tests/unit/Delimiter/DelimiterStackTest.php b/tests/unit/Delimiter/DelimiterStackTest.php index 93d3e12210..55c3791c55 100644 --- a/tests/unit/Delimiter/DelimiterStackTest.php +++ b/tests/unit/Delimiter/DelimiterStackTest.php @@ -13,6 +13,7 @@ namespace League\CommonMark\Tests\Unit\Delimiter; +use League\CommonMark\Delimiter\Bracket; use League\CommonMark\Delimiter\Delimiter; use League\CommonMark\Delimiter\DelimiterStack; use League\CommonMark\Node\Inline\Text; @@ -245,4 +246,85 @@ public function testRemoveEarlierMatches(): void $stack->removeEarlierMatches('_'); $this->assertFalse($delim2->isActive()); } + + public function testAddBracket(): void + { + $text1 = new Text('['); + $text2 = new Text('*'); + $text3 = new Text(']'); + + $stack = new DelimiterStack(); + + $stack->addBracket($text1, 0, false); + + $firstBracket = $stack->getLastBracket(); + $this->assertSame($text1, $firstBracket->getNode()); + $this->assertSame(0, $firstBracket->getIndex()); + $this->assertFalse($firstBracket->isImage()); + $this->assertNull($firstBracket->getPrevious()); + $this->assertNull($firstBracket->getPreviousDelimiter()); + $this->assertFalse($firstBracket->hasNext()); + + $stack->push(new Delimiter('*', 1, $text2, true, false)); + $stack->addBracket($text3, 2, true); + + $this->assertSame($text3, $stack->getLastBracket()->getNode()); + $this->assertSame(2, $stack->getLastBracket()->getIndex()); + $this->assertTrue($stack->getLastBracket()->isImage()); + $this->assertSame($firstBracket, $stack->getLastBracket()->getPrevious()); + $this->assertSame($text2, $stack->getLastBracket()->getPreviousDelimiter()->getInlineNode()); + $this->assertFalse($stack->getLastBracket()->hasNext()); + + $this->assertTrue($firstBracket->hasNext()); + } + + public function testRemoveBracket(): void + { + $text1 = new Text('['); + $text2 = new Text(']'); + + $stack = new DelimiterStack(); + + $stack->addBracket($text1, 0, false); + $firstBracket = $stack->getLastBracket(); + + $stack->addBracket($text2, 1, false); + + $this->assertSame($text2, $stack->getLastBracket()->getNode()); + $this->assertTrue($firstBracket->hasNext()); + + $stack->removeBracket(); + + $this->assertSame($text1, $stack->getLastBracket()->getNode()); + $this->assertFalse($firstBracket->hasNext()); + } + + public function testDeactivateLinkOpeners(): void + { + $text1 = new Text('['); + $text2 = new Text('['); + $text3 = new Text('['); + + $stack = new DelimiterStack(); + + $stack->addBracket($text1, 0, false); + $bracket1 = $stack->getLastBracket(); + $this->assertTrue($bracket1->isActive()); + + $stack->addBracket($text2, 1, true); + $bracket2 = $stack->getLastBracket(); + $this->assertTrue($bracket2->isActive()); + + $stack->addBracket($text3, 2, false); + $bracket3 = $stack->getLastBracket(); + $this->assertTrue($bracket3->isActive()); + + $stack->deactivateLinkOpeners(); + + $this->assertSame($bracket3, $stack->getLastBracket()); + + $this->assertFalse($bracket1->isActive()); + $this->assertTrue($bracket2->isActive()); + $this->assertFalse($bracket3->isActive()); + } } From d5c27e70779db52f6f6593d26fda24e92c079701 Mon Sep 17 00:00:00 2001 From: Colin O'Dell Date: Sat, 7 Dec 2024 10:17:03 -0500 Subject: [PATCH 19/29] Disable extensions (like xdebug and blackfire) when running pathological tests --- tests/pathological/test.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/pathological/test.php b/tests/pathological/test.php index ec7f5e9d3d..83936ec731 100755 --- a/tests/pathological/test.php +++ b/tests/pathological/test.php @@ -291,7 +291,7 @@ $timeout = \max(5, \min(60, $timeout)); } - $command = ['php', 'convert.php']; + $command = ['php', '-n', 'convert.php']; if (isset($case['extension'])) { $command[] = $case['extension']; } From 71f63e9ae57715d81a4f5ec340500a842b55d335 Mon Sep 17 00:00:00 2001 From: Colin O'Dell Date: Sat, 7 Dec 2024 10:17:03 -0500 Subject: [PATCH 20/29] Fix quadratic complexity deactivating link openers --- CHANGELOG.md | 1 + src/Delimiter/Bracket.php | 3 +++ src/Delimiter/DelimiterStack.php | 6 ++---- .../CommonMark/Parser/Inline/CloseBracketParser.php | 2 +- tests/unit/Delimiter/DelimiterStackTest.php | 2 +- 5 files changed, 8 insertions(+), 6 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c7b1089347..e39fb21c4b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -104,6 +104,7 @@ Updates should follow the [Keep a CHANGELOG](https://keepachangelog.com/) princi - Fixed quadratic complexity parsing unclosed inline links - Fixed quadratic complexity finding the bottom opener for emphasis and strikethrough delimiters - Fixed issue where having 500,000+ delimiters could trigger a [known segmentation fault issue in PHP's garbage collection](https://bugs.php.net/bug.php?id=68606) +- Fixed quadratic complexity deactivating link openers ## [2.4.1] - 2023-08-30 diff --git a/src/Delimiter/Bracket.php b/src/Delimiter/Bracket.php index 182cf60bfb..3a7f103d9d 100644 --- a/src/Delimiter/Bracket.php +++ b/src/Delimiter/Bracket.php @@ -64,6 +64,9 @@ public function isImage(): bool return $this->image; } + /** + * Only valid in the context of non-images (links) + */ public function isActive(): bool { return $this->active; diff --git a/src/Delimiter/DelimiterStack.php b/src/Delimiter/DelimiterStack.php index 63abca207d..5fd62f8127 100644 --- a/src/Delimiter/DelimiterStack.php +++ b/src/Delimiter/DelimiterStack.php @@ -159,10 +159,8 @@ public function removeEarlierMatches(string $character): void public function deactivateLinkOpeners(): void { $opener = $this->brackets; - while ($opener !== null) { - if (! $opener->isImage()) { - $opener->setActive(false); - } + while ($opener !== null && $opener->isActive()) { + $opener->setActive(false); $opener = $opener->getPrevious(); } } diff --git a/src/Extension/CommonMark/Parser/Inline/CloseBracketParser.php b/src/Extension/CommonMark/Parser/Inline/CloseBracketParser.php index 654f08ed42..4ecac71135 100644 --- a/src/Extension/CommonMark/Parser/Inline/CloseBracketParser.php +++ b/src/Extension/CommonMark/Parser/Inline/CloseBracketParser.php @@ -52,7 +52,7 @@ public function parse(InlineParserContext $inlineContext): bool return false; } - if (! $opener->isActive()) { + if (! $opener->isImage() && ! $opener->isActive()) { // no matched opener; remove from stack $inlineContext->getDelimiterStack()->removeBracket(); diff --git a/tests/unit/Delimiter/DelimiterStackTest.php b/tests/unit/Delimiter/DelimiterStackTest.php index 55c3791c55..976b51f17a 100644 --- a/tests/unit/Delimiter/DelimiterStackTest.php +++ b/tests/unit/Delimiter/DelimiterStackTest.php @@ -324,7 +324,7 @@ public function testDeactivateLinkOpeners(): void $this->assertSame($bracket3, $stack->getLastBracket()); $this->assertFalse($bracket1->isActive()); - $this->assertTrue($bracket2->isActive()); + $this->assertFalse($bracket2->isActive()); $this->assertFalse($bracket3->isActive()); } } From e7584cf38e7b33799789abfd21127aba2fa1634c Mon Sep 17 00:00:00 2001 From: Colin O'Dell Date: Sat, 7 Dec 2024 10:17:03 -0500 Subject: [PATCH 21/29] Fix catastrophic backtracking when parsing link labels/titles --- CHANGELOG.md | 1 + src/Util/LinkParserHelper.php | 2 +- src/Util/RegexHelper.php | 6 +++--- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e39fb21c4b..30b487cb64 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -105,6 +105,7 @@ Updates should follow the [Keep a CHANGELOG](https://keepachangelog.com/) princi - Fixed quadratic complexity finding the bottom opener for emphasis and strikethrough delimiters - Fixed issue where having 500,000+ delimiters could trigger a [known segmentation fault issue in PHP's garbage collection](https://bugs.php.net/bug.php?id=68606) - Fixed quadratic complexity deactivating link openers +- Fixed catastrophic backtracking when parsing link labels/titles ## [2.4.1] - 2023-08-30 diff --git a/src/Util/LinkParserHelper.php b/src/Util/LinkParserHelper.php index af4d5e15c0..ae722f7a98 100644 --- a/src/Util/LinkParserHelper.php +++ b/src/Util/LinkParserHelper.php @@ -69,7 +69,7 @@ public static function parseLinkLabel(Cursor $cursor): int public static function parsePartialLinkLabel(Cursor $cursor): ?string { - return $cursor->match('/^(?:[^\\\\\[\]]+|\\\\.?)*/'); + return $cursor->match('/^(?:[^\\\\\[\]]++|\\\\.?)*+/'); } /** diff --git a/src/Util/RegexHelper.php b/src/Util/RegexHelper.php index 1767e3a4fd..603631f294 100644 --- a/src/Util/RegexHelper.php +++ b/src/Util/RegexHelper.php @@ -61,9 +61,9 @@ final class RegexHelper self::PARTIAL_PROCESSINGINSTRUCTION . '|' . self::PARTIAL_DECLARATION . '|' . self::PARTIAL_CDATA . ')'; public const PARTIAL_HTMLBLOCKOPEN = '<(?:' . self::PARTIAL_BLOCKTAGNAME . '(?:[\s\/>]|$)' . '|' . '\/' . self::PARTIAL_BLOCKTAGNAME . '(?:[\s>]|$)' . '|' . '[?!])'; - public const PARTIAL_LINK_TITLE = '^(?:"(' . self::PARTIAL_ESCAPED_CHAR . '|[^"\x00])*"' . - '|' . '\'(' . self::PARTIAL_ESCAPED_CHAR . '|[^\'\x00])*\'' . - '|' . '\((' . self::PARTIAL_ESCAPED_CHAR . '|[^()\x00])*\))'; + public const PARTIAL_LINK_TITLE = '^(?:"(' . self::PARTIAL_ESCAPED_CHAR . '|[^"\x00])*+"' . + '|' . '\'(' . self::PARTIAL_ESCAPED_CHAR . '|[^\'\x00])*+\'' . + '|' . '\((' . self::PARTIAL_ESCAPED_CHAR . '|[^()\x00])*+\))'; public const REGEX_PUNCTUATION = '/^[!"#$%&\'()*+,\-.\\/:;<=>?@\\[\\]\\\\^_`{|}~\p{P}\p{S}]/u'; public const REGEX_UNSAFE_PROTOCOL = '/^javascript:|vbscript:|file:|data:/i'; From b61bbd4b09c0a061d1b63a75b1515c3809a797c4 Mon Sep 17 00:00:00 2001 From: Colin O'Dell Date: Sat, 7 Dec 2024 10:17:04 -0500 Subject: [PATCH 22/29] Use delimiter position to optimize processing Delimiters may be deleted, so we store delimiter positions instead of pointers. This also allows us to optimize searches within the stack, avoiding quadratic behavior when parsing emphasis. See https://github.com/github/cmark-gfm/commit/75008f1e46580748edb971bc0784ca185cfecef9 --- CHANGELOG.md | 7 +- phpstan.neon.dist | 2 + src/Delimiter/Bracket.php | 23 ++-- src/Delimiter/DelimiterStack.php | 129 ++++++++++++++++-- .../Parser/Inline/CloseBracketParser.php | 4 +- src/Extension/SmartPunct/QuoteParser.php | 3 +- src/Extension/Table/TableParser.php | 4 + ...lockContinueParserWithInlinesInterface.php | 3 + src/Parser/Inline/InlineParserInterface.php | 4 + src/Parser/InlineParserEngineInterface.php | 3 + src/Parser/MarkdownParser.php | 3 + tests/unit/Delimiter/DelimiterStackTest.php | 58 +++++--- 12 files changed, 192 insertions(+), 51 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 30b487cb64..e5c79e7f07 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -18,6 +18,9 @@ Updates should follow the [Keep a CHANGELOG](https://keepachangelog.com/) princi - `[` and `]` are no longer added as `Delimiter` objects on the stack; a new `Bracket` type with its own stack is used instead - `UrlAutolinkParser` no longer parses URLs with more than 127 subdomains - Expanded reference links can no longer exceed 100kb, or the size of the input document (whichever is greater) +- Delimiters should always provide a non-null value via `DelimiterInterface::getIndex()` + - We'll attempt to infer the index based on surrounding delimiters where possible +- The `DelimiterStack` now accepts integer positions for any `$stackBottom` argument - Several small performance optimizations ## [2.5.3] - 2024-08-16 @@ -95,14 +98,16 @@ Updates should follow the [Keep a CHANGELOG](https://keepachangelog.com/) princi - Returning dynamic values from `DelimiterProcessorInterface::getDelimiterUse()` is deprecated - You should instead implement `CacheableDelimiterProcessorInterface` to help the engine perform caching to avoid performance issues. +- Failing to set a delimiter's index (or returning `null` from `DelimiterInterface::getIndex()`) is deprecated and will not be supported in 3.0 - Deprecated `DelimiterInterface::isActive()` and `DelimiterInterface::setActive()`, as these are no longer used by the engine - Deprecated `DelimiterStack::removeEarlierMatches()` and `DelimiterStack::searchByCharacter()`, as these are no longer used by the engine +- Passing a `DelimiterInterface` as the `$stackBottom` argument to `DelimiterStack::processDelimiters()` or `::removeAll()` is deprecated and will not be supported in 3.0; pass the integer position instead. ### Fixed - Fixed NUL characters not being replaced in the input - Fixed quadratic complexity parsing unclosed inline links -- Fixed quadratic complexity finding the bottom opener for emphasis and strikethrough delimiters +- Fixed quadratic complexity parsing emphasis and strikethrough delimiters - Fixed issue where having 500,000+ delimiters could trigger a [known segmentation fault issue in PHP's garbage collection](https://bugs.php.net/bug.php?id=68606) - Fixed quadratic complexity deactivating link openers - Fixed catastrophic backtracking when parsing link labels/titles diff --git a/phpstan.neon.dist b/phpstan.neon.dist index e25d706246..54d7b4b6f5 100644 --- a/phpstan.neon.dist +++ b/phpstan.neon.dist @@ -7,6 +7,8 @@ parameters: message: '#Parameter .+ of class .+Reference constructor expects string, string\|null given#' - path: src/Util/RegexHelper.php message: '#Method .+RegexHelper::unescape\(\) should return string but returns string\|null#' + - path: src/Delimiter/DelimiterStack.php + message: '#unknown class WeakMap#' exceptions: uncheckedExceptionClasses: # Exceptions caused by bad developer logic that should always bubble up: diff --git a/src/Delimiter/Bracket.php b/src/Delimiter/Bracket.php index 3a7f103d9d..3a86859c71 100644 --- a/src/Delimiter/Bracket.php +++ b/src/Delimiter/Bracket.php @@ -19,19 +19,17 @@ final class Bracket { private Node $node; private ?Bracket $previous; - private ?DelimiterInterface $previousDelimiter; private bool $hasNext = false; - private int $index; + private int $position; private bool $image; private bool $active = true; - public function __construct(Node $node, ?Bracket $previous, ?DelimiterInterface $previousDelimiter, int $index, bool $image) + public function __construct(Node $node, ?Bracket $previous, int $position, bool $image) { - $this->node = $node; - $this->previous = $previous; - $this->previousDelimiter = $previousDelimiter; - $this->index = $index; - $this->image = $image; + $this->node = $node; + $this->previous = $previous; + $this->position = $position; + $this->image = $image; } public function getNode(): Node @@ -44,19 +42,14 @@ public function getPrevious(): ?Bracket return $this->previous; } - public function getPreviousDelimiter(): ?DelimiterInterface - { - return $this->previousDelimiter; - } - public function hasNext(): bool { return $this->hasNext; } - public function getIndex(): int + public function getPosition(): int { - return $this->index; + return $this->position; } public function isImage(): bool diff --git a/src/Delimiter/DelimiterStack.php b/src/Delimiter/DelimiterStack.php index 5fd62f8127..25003f942d 100644 --- a/src/Delimiter/DelimiterStack.php +++ b/src/Delimiter/DelimiterStack.php @@ -21,6 +21,7 @@ use League\CommonMark\Delimiter\Processor\CacheableDelimiterProcessorInterface; use League\CommonMark\Delimiter\Processor\DelimiterProcessorCollection; +use League\CommonMark\Exception\LogicException; use League\CommonMark\Node\Inline\AdjacentTextMerger; use League\CommonMark\Node\Node; @@ -32,6 +33,23 @@ final class DelimiterStack /** @psalm-readonly-allow-private-mutation */ private ?Bracket $brackets = null; + /** + * @deprecated This property will be removed in 3.0 once all delimiters MUST have an index/position + * + * @var \SplObjectStorage|\WeakMap + */ + private $missingIndexCache; + + public function __construct() + { + if (\PHP_VERSION_ID >= 80000) { + /** @psalm-suppress PropertyTypeCoercion */ + $this->missingIndexCache = new \WeakMap(); // @phpstan-ignore-line + } else { + $this->missingIndexCache = new \SplObjectStorage(); // @phpstan-ignore-line + } + } + public function push(DelimiterInterface $newDelimiter): void { $newDelimiter->setPrevious($this->top); @@ -52,7 +70,7 @@ public function addBracket(Node $node, int $index, bool $image): void $this->brackets->setHasNext(true); } - $this->brackets = new Bracket($node, $this->brackets, $this->top, $index, $image); + $this->brackets = new Bracket($node, $this->brackets, $index, $image); } /** @@ -63,14 +81,21 @@ public function getLastBracket(): ?Bracket return $this->brackets; } - private function findEarliest(?DelimiterInterface $stackBottom = null): ?DelimiterInterface + /** + * @throws LogicException + */ + private function findEarliest(int $stackBottom): ?DelimiterInterface { - $delimiter = $this->top; - while ($delimiter !== null && $delimiter->getPrevious() !== $stackBottom) { - $delimiter = $delimiter->getPrevious(); + // Move back to first relevant delim. + $delimiter = $this->top; + $lastChecked = null; + + while ($delimiter !== null && self::getIndex($delimiter) > $stackBottom) { + $lastChecked = $delimiter; + $delimiter = $delimiter->getPrevious(); } - return $delimiter; + return $lastChecked; } /** @@ -113,6 +138,9 @@ public function removeDelimiter(DelimiterInterface $delimiter): void // segfaults like in https://bugs.php.net/bug.php?id=68606. $delimiter->setPrevious(null); $delimiter->setNext(null); + + // TODO: Remove the line below once PHP 7.4 support is dropped, as WeakMap won't hold onto the reference, making this unnecessary + unset($this->missingIndexCache[$delimiter]); } private function removeDelimiterAndNode(DelimiterInterface $delimiter): void @@ -121,19 +149,30 @@ private function removeDelimiterAndNode(DelimiterInterface $delimiter): void $this->removeDelimiter($delimiter); } + /** + * @throws LogicException + */ private function removeDelimitersBetween(DelimiterInterface $opener, DelimiterInterface $closer): void { - $delimiter = $closer->getPrevious(); - while ($delimiter !== null && $delimiter !== $opener) { + $delimiter = $closer->getPrevious(); + $openerPosition = self::getIndex($opener); + while ($delimiter !== null && self::getIndex($delimiter) > $openerPosition) { $previous = $delimiter->getPrevious(); $this->removeDelimiter($delimiter); $delimiter = $previous; } } - public function removeAll(?DelimiterInterface $stackBottom = null): void + /** + * @param DelimiterInterface|int|null $stackBottom + * + * @throws LogicException if the index/position cannot be determined for some delimiter + */ + public function removeAll($stackBottom = null): void { - while ($this->top && $this->top !== $stackBottom) { + $stackBottomPosition = \is_int($stackBottom) ? $stackBottom : self::getIndex($stackBottom); + + while ($this->top && $this->getIndex($this->top) > $stackBottomPosition) { $this->removeDelimiter($this->top); } } @@ -188,12 +227,22 @@ public function searchByCharacter($characters): ?DelimiterInterface return $opener; } - public function processDelimiters(?DelimiterInterface $stackBottom, DelimiterProcessorCollection $processors): void + /** + * @param DelimiterInterface|int|null $stackBottom + * + * @throws LogicException if the index/position cannot be determined for any delimiter + * + * @todo change $stackBottom to an int in 3.0 + */ + public function processDelimiters($stackBottom, DelimiterProcessorCollection $processors): void { + /** @var array $openersBottom */ $openersBottom = []; + $stackBottomPosition = \is_int($stackBottom) ? $stackBottom : self::getIndex($stackBottom); + // Find first closer above stackBottom - $closer = $this->findEarliest($stackBottom); + $closer = $this->findEarliest($stackBottomPosition); // Move forward, looking for closers, and handling each while ($closer !== null) { @@ -217,7 +266,7 @@ public function processDelimiters(?DelimiterInterface $stackBottom, DelimiterPro $openerFound = false; $potentialOpenerFound = false; $opener = $closer->getPrevious(); - while ($opener !== null && $opener !== $stackBottom && $opener !== ($openersBottom[$openersBottomCacheKey] ?? null)) { + while ($opener !== null && ($openerPosition = self::getIndex($opener)) > $stackBottomPosition && $openerPosition >= ($openersBottom[$openersBottomCacheKey] ?? 0)) { if ($opener->canOpen() && $opener->getChar() === $openingDelimiterChar) { $potentialOpenerFound = true; $useDelims = $delimiterProcessor->getDelimiterUse($opener, $closer); @@ -234,7 +283,7 @@ public function processDelimiters(?DelimiterInterface $stackBottom, DelimiterPro // Set lower bound for future searches // TODO: Remove this conditional check in 3.0. It only exists to prevent behavioral BC breaks in 2.x. if ($potentialOpenerFound === false || $delimiterProcessor instanceof CacheableDelimiterProcessorInterface) { - $openersBottom[$openersBottomCacheKey] = $closer->getPrevious(); + $openersBottom[$openersBottomCacheKey] = self::getIndex($closer); } if (! $potentialOpenerFound && ! $closer->canOpen()) { @@ -282,7 +331,7 @@ public function processDelimiters(?DelimiterInterface $stackBottom, DelimiterPro } // Remove all delimiters - $this->removeAll($stackBottom); + $this->removeAll($stackBottomPosition); } /** @@ -298,4 +347,54 @@ public function __destruct() $this->removeBracket(); } } + + /** + * @deprecated This method will be dropped in 3.0 once all delimiters MUST have an index/position + * + * @throws LogicException if no index was defined on this delimiter, and no reasonable guess could be made based on its neighbors + */ + private function getIndex(?DelimiterInterface $delimiter): int + { + if ($delimiter === null) { + return -1; + } + + if (($index = $delimiter->getIndex()) !== null) { + return $index; + } + + if (isset($this->missingIndexCache[$delimiter])) { + return $this->missingIndexCache[$delimiter]; + } + + $prev = $delimiter->getPrevious(); + $next = $delimiter->getNext(); + + $i = 0; + do { + $i++; + if ($prev === null) { + break; + } + + if ($prev->getIndex() !== null) { + return $this->missingIndexCache[$delimiter] = $prev->getIndex() + $i; + } + } while ($prev = $prev->getPrevious()); + + $j = 0; + do { + $j++; + if ($next === null) { + break; + } + + if ($next->getIndex() !== null) { + return $this->missingIndexCache[$delimiter] = $next->getIndex() - $j; + } + } while ($next = $next->getNext()); + + // No index was defined on this delimiter, and none could be guesstimated based on the stack. + throw new LogicException('No index was defined on this delimiter, and none could be guessed based on the stack. Ensure you are passing the index when instantiating the Delimiter.'); + } } diff --git a/src/Extension/CommonMark/Parser/Inline/CloseBracketParser.php b/src/Extension/CommonMark/Parser/Inline/CloseBracketParser.php index 4ecac71135..f3b83fd129 100644 --- a/src/Extension/CommonMark/Parser/Inline/CloseBracketParser.php +++ b/src/Extension/CommonMark/Parser/Inline/CloseBracketParser.php @@ -103,7 +103,7 @@ public function parse(InlineParserContext $inlineContext): bool // Process delimiters such as emphasis inside link/image $delimiterStack = $inlineContext->getDelimiterStack(); - $stackBottom = $opener->getPreviousDelimiter(); + $stackBottom = $opener->getPosition(); $delimiterStack->processDelimiters($stackBottom, $this->environment->getDelimiterProcessors()); $delimiterStack->removeBracket(); $delimiterStack->removeAll($stackBottom); @@ -179,7 +179,7 @@ private function tryParseReference(Cursor $cursor, ReferenceMapInterface $refere } elseif (! $opener->hasNext()) { // Empty or missing second label means to use the first label as the reference. // The reference must not contain a bracket. If we know there's a bracket, we don't even bother checking it. - $start = $opener->getIndex(); + $start = $opener->getPosition(); $length = $startPos - $start; } else { $cursor->restoreState($savePos); diff --git a/src/Extension/SmartPunct/QuoteParser.php b/src/Extension/SmartPunct/QuoteParser.php index 959930b3ee..31ba8c7738 100644 --- a/src/Extension/SmartPunct/QuoteParser.php +++ b/src/Extension/SmartPunct/QuoteParser.php @@ -46,6 +46,7 @@ public function parse(InlineParserContext $inlineContext): bool { $char = $inlineContext->getFullMatch(); $cursor = $inlineContext->getCursor(); + $index = $cursor->getPosition(); $charBefore = $cursor->peek(-1); if ($charBefore === null) { @@ -67,7 +68,7 @@ public function parse(InlineParserContext $inlineContext): bool $inlineContext->getContainer()->appendChild($node); // Add entry to stack to this opener - $inlineContext->getDelimiterStack()->push(new Delimiter($char, 1, $node, $canOpen, $canClose)); + $inlineContext->getDelimiterStack()->push(new Delimiter($char, 1, $node, $canOpen, $canClose, $index)); return true; } diff --git a/src/Extension/Table/TableParser.php b/src/Extension/Table/TableParser.php index a005f8a97e..c4f441d123 100644 --- a/src/Extension/Table/TableParser.php +++ b/src/Extension/Table/TableParser.php @@ -15,6 +15,7 @@ namespace League\CommonMark\Extension\Table; +use League\CommonMark\Exception\LogicException; use League\CommonMark\Parser\Block\AbstractBlockContinueParser; use League\CommonMark\Parser\Block\BlockContinue; use League\CommonMark\Parser\Block\BlockContinueParserInterface; @@ -150,6 +151,9 @@ public function parseInlines(InlineParserEngineInterface $inlineParser): void } } + /** + * @throws LogicException + */ private function parseCell(string $cell, int $column, InlineParserEngineInterface $inlineParser): TableCell { $tableCell = new TableCell(TableCell::TYPE_DATA, $this->columns[$column] ?? null); diff --git a/src/Parser/Block/BlockContinueParserWithInlinesInterface.php b/src/Parser/Block/BlockContinueParserWithInlinesInterface.php index 6f826c9a91..595b73768c 100644 --- a/src/Parser/Block/BlockContinueParserWithInlinesInterface.php +++ b/src/Parser/Block/BlockContinueParserWithInlinesInterface.php @@ -13,12 +13,15 @@ namespace League\CommonMark\Parser\Block; +use League\CommonMark\Exception\LogicException; use League\CommonMark\Parser\InlineParserEngineInterface; interface BlockContinueParserWithInlinesInterface extends BlockContinueParserInterface { /** * Parse any inlines inside of the current block + * + * @throws LogicException */ public function parseInlines(InlineParserEngineInterface $inlineParser): void; } diff --git a/src/Parser/Inline/InlineParserInterface.php b/src/Parser/Inline/InlineParserInterface.php index fd13435bcf..322cf9849e 100644 --- a/src/Parser/Inline/InlineParserInterface.php +++ b/src/Parser/Inline/InlineParserInterface.php @@ -13,11 +13,15 @@ namespace League\CommonMark\Parser\Inline; +use League\CommonMark\Exception\LogicException; use League\CommonMark\Parser\InlineParserContext; interface InlineParserInterface { public function getMatchDefinition(): InlineParserMatch; + /** + * @throws LogicException + */ public function parse(InlineParserContext $inlineContext): bool; } diff --git a/src/Parser/InlineParserEngineInterface.php b/src/Parser/InlineParserEngineInterface.php index 8a0986dc13..cb48903162 100644 --- a/src/Parser/InlineParserEngineInterface.php +++ b/src/Parser/InlineParserEngineInterface.php @@ -13,6 +13,7 @@ namespace League\CommonMark\Parser; +use League\CommonMark\Exception\LogicException; use League\CommonMark\Node\Block\AbstractBlock; /** @@ -22,6 +23,8 @@ interface InlineParserEngineInterface { /** * Parse the given contents as inlines and insert them into the given block + * + * @throws LogicException */ public function parse(string $contents, AbstractBlock $block): void; } diff --git a/src/Parser/MarkdownParser.php b/src/Parser/MarkdownParser.php index 904c7c45b4..8412a95492 100644 --- a/src/Parser/MarkdownParser.php +++ b/src/Parser/MarkdownParser.php @@ -23,6 +23,7 @@ use League\CommonMark\Event\DocumentParsedEvent; use League\CommonMark\Event\DocumentPreParsedEvent; use League\CommonMark\Exception\CommonMarkException; +use League\CommonMark\Exception\LogicException; use League\CommonMark\Input\MarkdownInput; use League\CommonMark\Node\Block\Document; use League\CommonMark\Node\Block\Paragraph; @@ -266,6 +267,8 @@ private function finalize(BlockContinueParserInterface $blockParser, int $endLin /** * Walk through a block & children recursively, parsing string content into inline content where appropriate. + * + * @throws LogicException */ private function processInlines(int $inputSize): void { diff --git a/tests/unit/Delimiter/DelimiterStackTest.php b/tests/unit/Delimiter/DelimiterStackTest.php index 976b51f17a..d3c7a25860 100644 --- a/tests/unit/Delimiter/DelimiterStackTest.php +++ b/tests/unit/Delimiter/DelimiterStackTest.php @@ -13,7 +13,6 @@ namespace League\CommonMark\Tests\Unit\Delimiter; -use League\CommonMark\Delimiter\Bracket; use League\CommonMark\Delimiter\Delimiter; use League\CommonMark\Delimiter\DelimiterStack; use League\CommonMark\Node\Inline\Text; @@ -119,11 +118,11 @@ public function testRemoveDelimitersBetween(): void $text5 = new Text('*'); $stack = new DelimiterStack(); - $stack->push($delim1 = new Delimiter('*', 1, $text1, false, false)); - $stack->push($delim2 = new Delimiter('_', 1, $text2, false, false)); - $stack->push($delim3 = new Delimiter('*', 1, $text3, false, false)); - $stack->push($delim4 = new Delimiter('_', 1, $text4, false, false)); - $stack->push($delim5 = new Delimiter('*', 1, $text5, false, false)); + $stack->push($delim1 = new Delimiter('*', 1, $text1, false, false, 0)); + $stack->push($delim2 = new Delimiter('_', 1, $text2, false, false, 1)); + $stack->push($delim3 = new Delimiter('*', 1, $text3, false, false, 2)); + $stack->push($delim4 = new Delimiter('_', 1, $text4, false, false, 3)); + $stack->push($delim5 = new Delimiter('*', 1, $text5, false, false, 4)); // Make removeDelimiterBetween() callable using reflection $reflection = new \ReflectionClass($stack); @@ -155,19 +154,19 @@ public function testFindEarliest(): void $text3 = new Text('*'); $stack = new DelimiterStack(); - $stack->push($delim1 = new Delimiter('*', 1, $text1, true, false)); - $stack->push($delim2 = new Delimiter('_', 1, $text2, false, false)); - $stack->push($delim3 = new Delimiter('*', 1, $text3, false, true)); + $stack->push($delim1 = new Delimiter('*', 1, $text1, true, false, 0)); + $stack->push($delim2 = new Delimiter('_', 1, $text2, false, false, 17)); + $stack->push($delim3 = new Delimiter('*', 1, $text3, false, true, 24)); // Use reflection to make findEarliest() callable $reflection = new \ReflectionClass($stack); $method = $reflection->getMethod('findEarliest'); $method->setAccessible(true); - $this->assertSame($delim1, $method->invoke($stack)); - $this->assertSame($delim2, $method->invoke($stack, $delim1)); - $this->assertSame($delim3, $method->invoke($stack, $delim2)); - $this->assertNull($method->invoke($stack, $delim3)); + $this->assertSame($delim1, $method->invoke($stack, -1)); + $this->assertSame($delim2, $method->invoke($stack, $delim1->getIndex())); + $this->assertSame($delim3, $method->invoke($stack, $delim2->getIndex())); + $this->assertNull($method->invoke($stack, $delim3->getIndex())); } public function testRemoveAll(): void @@ -259,20 +258,18 @@ public function testAddBracket(): void $firstBracket = $stack->getLastBracket(); $this->assertSame($text1, $firstBracket->getNode()); - $this->assertSame(0, $firstBracket->getIndex()); + $this->assertSame(0, $firstBracket->getPosition()); $this->assertFalse($firstBracket->isImage()); $this->assertNull($firstBracket->getPrevious()); - $this->assertNull($firstBracket->getPreviousDelimiter()); $this->assertFalse($firstBracket->hasNext()); $stack->push(new Delimiter('*', 1, $text2, true, false)); $stack->addBracket($text3, 2, true); $this->assertSame($text3, $stack->getLastBracket()->getNode()); - $this->assertSame(2, $stack->getLastBracket()->getIndex()); + $this->assertSame(2, $stack->getLastBracket()->getPosition()); $this->assertTrue($stack->getLastBracket()->isImage()); $this->assertSame($firstBracket, $stack->getLastBracket()->getPrevious()); - $this->assertSame($text2, $stack->getLastBracket()->getPreviousDelimiter()->getInlineNode()); $this->assertFalse($stack->getLastBracket()->hasNext()); $this->assertTrue($firstBracket->hasNext()); @@ -327,4 +324,31 @@ public function testDeactivateLinkOpeners(): void $this->assertFalse($bracket2->isActive()); $this->assertFalse($bracket3->isActive()); } + + public function testGetIndex(): void + { + $delim1 = new Delimiter('*', 1, new Text('*'), true, false); + $delim2 = new Delimiter('_', 1, new Text('_'), true, false, 10); + $delim3 = new Delimiter('*', 1, new Text('*'), true, true, 20); + $delim4 = new Delimiter('_', 1, new Text('_'), false, true); + $delim5 = new Delimiter('*', 1, new Text('*'), false, true); + + $stack = new DelimiterStack(); + + $stack->push($delim1); + $stack->push($delim2); + $stack->push($delim3); + $stack->push($delim4); + $stack->push($delim5); + + $reflection = new \ReflectionClass($stack); + $method = $reflection->getMethod('getIndex'); + $method->setAccessible(true); + + $this->assertSame(0, $method->invoke($stack, $delim1)); + $this->assertSame(10, $method->invoke($stack, $delim2)); + $this->assertSame(20, $method->invoke($stack, $delim3)); + $this->assertSame(21, $method->invoke($stack, $delim4)); + $this->assertSame(22, $method->invoke($stack, $delim5)); + } } From e1cfa8deb0f55c795d4bb7056803dcc843f81685 Mon Sep 17 00:00:00 2001 From: Colin O'Dell Date: Sat, 7 Dec 2024 10:17:04 -0500 Subject: [PATCH 23/29] Use recursive lookup as a last resort to avoid throwing --- src/Delimiter/DelimiterStack.php | 15 +-------------- src/Extension/Table/TableParser.php | 4 ---- .../BlockContinueParserWithInlinesInterface.php | 3 --- src/Parser/Inline/InlineParserInterface.php | 4 ---- src/Parser/InlineParserEngineInterface.php | 3 --- src/Parser/MarkdownParser.php | 3 --- tests/unit/Delimiter/DelimiterStackTest.php | 2 +- 7 files changed, 2 insertions(+), 32 deletions(-) diff --git a/src/Delimiter/DelimiterStack.php b/src/Delimiter/DelimiterStack.php index 25003f942d..665b03799e 100644 --- a/src/Delimiter/DelimiterStack.php +++ b/src/Delimiter/DelimiterStack.php @@ -21,7 +21,6 @@ use League\CommonMark\Delimiter\Processor\CacheableDelimiterProcessorInterface; use League\CommonMark\Delimiter\Processor\DelimiterProcessorCollection; -use League\CommonMark\Exception\LogicException; use League\CommonMark\Node\Inline\AdjacentTextMerger; use League\CommonMark\Node\Node; @@ -81,9 +80,6 @@ public function getLastBracket(): ?Bracket return $this->brackets; } - /** - * @throws LogicException - */ private function findEarliest(int $stackBottom): ?DelimiterInterface { // Move back to first relevant delim. @@ -149,9 +145,6 @@ private function removeDelimiterAndNode(DelimiterInterface $delimiter): void $this->removeDelimiter($delimiter); } - /** - * @throws LogicException - */ private function removeDelimitersBetween(DelimiterInterface $opener, DelimiterInterface $closer): void { $delimiter = $closer->getPrevious(); @@ -165,8 +158,6 @@ private function removeDelimitersBetween(DelimiterInterface $opener, DelimiterIn /** * @param DelimiterInterface|int|null $stackBottom - * - * @throws LogicException if the index/position cannot be determined for some delimiter */ public function removeAll($stackBottom = null): void { @@ -230,8 +221,6 @@ public function searchByCharacter($characters): ?DelimiterInterface /** * @param DelimiterInterface|int|null $stackBottom * - * @throws LogicException if the index/position cannot be determined for any delimiter - * * @todo change $stackBottom to an int in 3.0 */ public function processDelimiters($stackBottom, DelimiterProcessorCollection $processors): void @@ -350,8 +339,6 @@ public function __destruct() /** * @deprecated This method will be dropped in 3.0 once all delimiters MUST have an index/position - * - * @throws LogicException if no index was defined on this delimiter, and no reasonable guess could be made based on its neighbors */ private function getIndex(?DelimiterInterface $delimiter): int { @@ -395,6 +382,6 @@ private function getIndex(?DelimiterInterface $delimiter): int } while ($next = $next->getNext()); // No index was defined on this delimiter, and none could be guesstimated based on the stack. - throw new LogicException('No index was defined on this delimiter, and none could be guessed based on the stack. Ensure you are passing the index when instantiating the Delimiter.'); + return $this->missingIndexCache[$delimiter] = $this->getIndex($delimiter->getPrevious()) + 1; } } diff --git a/src/Extension/Table/TableParser.php b/src/Extension/Table/TableParser.php index c4f441d123..a005f8a97e 100644 --- a/src/Extension/Table/TableParser.php +++ b/src/Extension/Table/TableParser.php @@ -15,7 +15,6 @@ namespace League\CommonMark\Extension\Table; -use League\CommonMark\Exception\LogicException; use League\CommonMark\Parser\Block\AbstractBlockContinueParser; use League\CommonMark\Parser\Block\BlockContinue; use League\CommonMark\Parser\Block\BlockContinueParserInterface; @@ -151,9 +150,6 @@ public function parseInlines(InlineParserEngineInterface $inlineParser): void } } - /** - * @throws LogicException - */ private function parseCell(string $cell, int $column, InlineParserEngineInterface $inlineParser): TableCell { $tableCell = new TableCell(TableCell::TYPE_DATA, $this->columns[$column] ?? null); diff --git a/src/Parser/Block/BlockContinueParserWithInlinesInterface.php b/src/Parser/Block/BlockContinueParserWithInlinesInterface.php index 595b73768c..6f826c9a91 100644 --- a/src/Parser/Block/BlockContinueParserWithInlinesInterface.php +++ b/src/Parser/Block/BlockContinueParserWithInlinesInterface.php @@ -13,15 +13,12 @@ namespace League\CommonMark\Parser\Block; -use League\CommonMark\Exception\LogicException; use League\CommonMark\Parser\InlineParserEngineInterface; interface BlockContinueParserWithInlinesInterface extends BlockContinueParserInterface { /** * Parse any inlines inside of the current block - * - * @throws LogicException */ public function parseInlines(InlineParserEngineInterface $inlineParser): void; } diff --git a/src/Parser/Inline/InlineParserInterface.php b/src/Parser/Inline/InlineParserInterface.php index 322cf9849e..fd13435bcf 100644 --- a/src/Parser/Inline/InlineParserInterface.php +++ b/src/Parser/Inline/InlineParserInterface.php @@ -13,15 +13,11 @@ namespace League\CommonMark\Parser\Inline; -use League\CommonMark\Exception\LogicException; use League\CommonMark\Parser\InlineParserContext; interface InlineParserInterface { public function getMatchDefinition(): InlineParserMatch; - /** - * @throws LogicException - */ public function parse(InlineParserContext $inlineContext): bool; } diff --git a/src/Parser/InlineParserEngineInterface.php b/src/Parser/InlineParserEngineInterface.php index cb48903162..8a0986dc13 100644 --- a/src/Parser/InlineParserEngineInterface.php +++ b/src/Parser/InlineParserEngineInterface.php @@ -13,7 +13,6 @@ namespace League\CommonMark\Parser; -use League\CommonMark\Exception\LogicException; use League\CommonMark\Node\Block\AbstractBlock; /** @@ -23,8 +22,6 @@ interface InlineParserEngineInterface { /** * Parse the given contents as inlines and insert them into the given block - * - * @throws LogicException */ public function parse(string $contents, AbstractBlock $block): void; } diff --git a/src/Parser/MarkdownParser.php b/src/Parser/MarkdownParser.php index 8412a95492..904c7c45b4 100644 --- a/src/Parser/MarkdownParser.php +++ b/src/Parser/MarkdownParser.php @@ -23,7 +23,6 @@ use League\CommonMark\Event\DocumentParsedEvent; use League\CommonMark\Event\DocumentPreParsedEvent; use League\CommonMark\Exception\CommonMarkException; -use League\CommonMark\Exception\LogicException; use League\CommonMark\Input\MarkdownInput; use League\CommonMark\Node\Block\Document; use League\CommonMark\Node\Block\Paragraph; @@ -267,8 +266,6 @@ private function finalize(BlockContinueParserInterface $blockParser, int $endLin /** * Walk through a block & children recursively, parsing string content into inline content where appropriate. - * - * @throws LogicException */ private function processInlines(int $inputSize): void { diff --git a/tests/unit/Delimiter/DelimiterStackTest.php b/tests/unit/Delimiter/DelimiterStackTest.php index d3c7a25860..8c7804425e 100644 --- a/tests/unit/Delimiter/DelimiterStackTest.php +++ b/tests/unit/Delimiter/DelimiterStackTest.php @@ -345,7 +345,7 @@ public function testGetIndex(): void $method = $reflection->getMethod('getIndex'); $method->setAccessible(true); - $this->assertSame(0, $method->invoke($stack, $delim1)); + $this->assertSame(9, $method->invoke($stack, $delim1)); $this->assertSame(10, $method->invoke($stack, $delim2)); $this->assertSame(20, $method->invoke($stack, $delim3)); $this->assertSame(21, $method->invoke($stack, $delim4)); From 540d850394a9eb1ba9acef6ee6f0e1e9571cc50a Mon Sep 17 00:00:00 2001 From: Colin O'Dell Date: Sat, 7 Dec 2024 10:17:04 -0500 Subject: [PATCH 24/29] Fix quadratic complexity parsing long backtick code spans with no matching closers --- CHANGELOG.md | 1 + .../Parser/Inline/BacktickParser.php | 72 +++++++++++++++++-- 2 files changed, 68 insertions(+), 5 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e5c79e7f07..7024144cb4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -110,6 +110,7 @@ Updates should follow the [Keep a CHANGELOG](https://keepachangelog.com/) princi - Fixed quadratic complexity parsing emphasis and strikethrough delimiters - Fixed issue where having 500,000+ delimiters could trigger a [known segmentation fault issue in PHP's garbage collection](https://bugs.php.net/bug.php?id=68606) - Fixed quadratic complexity deactivating link openers +- Fixed quadratic complexity parsing long backtick code spans with no matching closers - Fixed catastrophic backtracking when parsing link labels/titles ## [2.4.1] - 2023-08-30 diff --git a/src/Extension/CommonMark/Parser/Inline/BacktickParser.php b/src/Extension/CommonMark/Parser/Inline/BacktickParser.php index 9618f2e676..3324fe39d0 100644 --- a/src/Extension/CommonMark/Parser/Inline/BacktickParser.php +++ b/src/Extension/CommonMark/Parser/Inline/BacktickParser.php @@ -18,12 +18,27 @@ use League\CommonMark\Extension\CommonMark\Node\Inline\Code; use League\CommonMark\Node\Inline\Text; +use League\CommonMark\Parser\Cursor; use League\CommonMark\Parser\Inline\InlineParserInterface; use League\CommonMark\Parser\Inline\InlineParserMatch; use League\CommonMark\Parser\InlineParserContext; final class BacktickParser implements InlineParserInterface { + /** + * Max bound for backtick code span delimiters. + * + * @see https://github.com/commonmark/cmark/commit/8ed5c9d + */ + private const MAX_BACKTICKS = 1000; + + /** @var \WeakReference|null */ + private ?\WeakReference $lastCursor = null; + private bool $lastCursorScanned = false; + + /** @var array backtick count => position of known ender */ + private array $seenBackticks = []; + public function getMatchDefinition(): InlineParserMatch { return InlineParserMatch::regex('`+'); @@ -38,11 +53,7 @@ public function parse(InlineParserContext $inlineContext): bool $currentPosition = $cursor->getPosition(); $previousState = $cursor->saveState(); - while ($matchingTicks = $cursor->match('/`+/m')) { - if ($matchingTicks !== $ticks) { - continue; - } - + if ($this->findMatchingTicks(\strlen($ticks), $cursor)) { $code = $cursor->getSubstring($currentPosition, $cursor->getPosition() - $currentPosition - \strlen($ticks)); $c = \preg_replace('/\n/m', ' ', $code) ?? ''; @@ -67,4 +78,55 @@ public function parse(InlineParserContext $inlineContext): bool return true; } + + /** + * Locates the matching closer for a backtick code span. + * + * Leverages some caching to avoid traversing the same cursor multiple times when + * we've already seen all the potential backtick closers. + * + * @see https://github.com/commonmark/cmark/commit/8ed5c9d + * + * @param int $openTickLength Number of backticks in the opening sequence + * @param Cursor $cursor Cursor to scan + * + * @return bool True if a matching closer was found, false otherwise + */ + private function findMatchingTicks(int $openTickLength, Cursor $cursor): bool + { + // Reset the seenBackticks cache if this is a new cursor + if ($this->lastCursor === null || $this->lastCursor->get() !== $cursor) { + $this->seenBackticks = []; + $this->lastCursor = \WeakReference::create($cursor); + $this->lastCursorScanned = false; + } + + if ($openTickLength > self::MAX_BACKTICKS) { + return false; + } + + // Return if we already know there's no closer + if ($this->lastCursorScanned && isset($this->seenBackticks[$openTickLength]) && $this->seenBackticks[$openTickLength] <= $cursor->getPosition()) { + return false; + } + + while ($ticks = $cursor->match('/`{1,' . self::MAX_BACKTICKS . '}/m')) { + $numTicks = \strlen($ticks); + + // Did we find the closer? + if ($numTicks === $openTickLength) { + return true; + } + + // Store position of closer + if ($numTicks <= self::MAX_BACKTICKS) { + $this->seenBackticks[$numTicks] = $cursor->getPosition() - $numTicks; + } + } + + // Got through whole input without finding closer + $this->lastCursorScanned = true; + + return false; + } } From 5ce491fff9bec6baaf8b0591fb092e9cabfa0729 Mon Sep 17 00:00:00 2001 From: Colin O'Dell Date: Sat, 7 Dec 2024 10:17:04 -0500 Subject: [PATCH 25/29] Optimize repeated parsing of links without closing brace --- src/Util/LinkParserHelper.php | 41 ++++++++++++++++++++++++++++------- 1 file changed, 33 insertions(+), 8 deletions(-) diff --git a/src/Util/LinkParserHelper.php b/src/Util/LinkParserHelper.php index ae722f7a98..3e76c28faa 100644 --- a/src/Util/LinkParserHelper.php +++ b/src/Util/LinkParserHelper.php @@ -31,14 +31,7 @@ final class LinkParserHelper public static function parseLinkDestination(Cursor $cursor): ?string { if ($cursor->getCurrentCharacter() === '<') { - if ($res = $cursor->match(RegexHelper::REGEX_LINK_DESTINATION_BRACES)) { - // Chop off surrounding <..>: - return UrlEncoder::unescapeAndEncode( - RegexHelper::unescape(\substr($res, 1, -1)) - ); - } - - return null; + return self::parseDestinationBraces($cursor); } $destination = self::manuallyParseLinkDestination($cursor); @@ -137,4 +130,36 @@ private static function manuallyParseLinkDestination(Cursor $cursor): ?string return $destination; } + + /** @var \WeakReference|null */ + private static ?\WeakReference $lastCursor = null; + private static bool $lastCursorLacksClosingBrace = false; + + private static function parseDestinationBraces(Cursor $cursor): ?string + { + // Optimization: If we've previously parsed this cursor and returned `null`, we know + // that no closing brace exists, so we can skip the regex entirely. This helps avoid + // certain pathological cases where the regex engine can take a very long time to + // determine that no match exists. + if (self::$lastCursor !== null && self::$lastCursor->get() === $cursor) { + if (self::$lastCursorLacksClosingBrace) { + return null; + } + } else { + self::$lastCursor = \WeakReference::create($cursor); + } + + if ($res = $cursor->match(RegexHelper::REGEX_LINK_DESTINATION_BRACES)) { + self::$lastCursorLacksClosingBrace = false; + + // Chop off surrounding <..>: + return UrlEncoder::unescapeAndEncode( + RegexHelper::unescape(\substr($res, 1, -1)) + ); + } + + self::$lastCursorLacksClosingBrace = true; + + return null; + } } From 51567969678a3b83ecdd83e3b1c834b577fd79a2 Mon Sep 17 00:00:00 2001 From: Colin O'Dell Date: Sat, 7 Dec 2024 10:17:04 -0500 Subject: [PATCH 26/29] Add max_delimiters_per_line config option --- .phpstorm.meta.php | 1 + CHANGELOG.md | 7 ++- docs/2.5/configuration.md | 2 + docs/2.5/security.md | 22 +++++++- docs/2.6/upgrading.md | 7 +++ src/Delimiter/DelimiterStack.php | 11 +++- src/Environment/Environment.php | 1 + src/Parser/InlineParserContext.php | 4 +- src/Parser/InlineParserEngine.php | 2 +- tests/functional/MaxDelimitersPerLineTest.php | 50 +++++++++++++++++++ 10 files changed, 101 insertions(+), 6 deletions(-) create mode 100644 tests/functional/MaxDelimitersPerLineTest.php diff --git a/.phpstorm.meta.php b/.phpstorm.meta.php index 0b56ab530d..5eb9270da0 100644 --- a/.phpstorm.meta.php +++ b/.phpstorm.meta.php @@ -31,6 +31,7 @@ 'html_input', 'allow_unsafe_links', 'max_nesting_level', + 'max_delimiters_per_line', 'renderer', 'renderer/block_separator', 'renderer/inner_separator', diff --git a/CHANGELOG.md b/CHANGELOG.md index 7024144cb4..7fd3207bcd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,9 +8,14 @@ Updates should follow the [Keep a CHANGELOG](https://keepachangelog.com/) princi ### Added +- Added `max_delimiters_per_line` config option to prevent denial of service attacks when parsing malicious input +- Added `table/max_autocompleted_cells` config option to prevent denial of service attacks when parsing large tables +- The `AttributesExtension` now supports attributes without values (#985, #986) +- The `AutolinkExtension` exposes two new configuration options to override the default behavior (#969, #987): + - `autolink/allowed_protocols` - an array of protocols to allow autolinking for + - `autolink/default_protocol` - the default protocol to use when none is specified - Added `RegexHelper::isWhitespace()` method to check if a given character is an ASCII whitespace character - Added `CacheableDelimiterProcessorInterface` to ensure linear complexity for dynamic delimiter processing -- Added `table/max_autocompleted_cells` config option to prevent denial of service attacks when parsing large tables - Added `Bracket` delimiter type to optimize bracket parsing ### Changed diff --git a/docs/2.5/configuration.md b/docs/2.5/configuration.md index 472b99ff56..5991f9e1cf 100644 --- a/docs/2.5/configuration.md +++ b/docs/2.5/configuration.md @@ -27,6 +27,7 @@ $config = [ 'html_input' => 'escape', 'allow_unsafe_links' => false, 'max_nesting_level' => PHP_INT_MAX, + 'max_delimiters_per_line' => PHP_INT_MAX, 'slug_normalizer' => [ 'max_length' => 255, ], @@ -73,6 +74,7 @@ Here's a list of the core configuration options available: - `escape` - Escape all HTML - `allow_unsafe_links` - Remove risky link and image URLs by setting this to `false` (default: `true`) - `max_nesting_level` - The maximum nesting level for blocks (default: `PHP_INT_MAX`). Setting this to a positive integer can help protect against long parse times and/or segfaults if blocks are too deeply-nested. +- `max_delimiters_per_line` - The maximum number of delimiters (e.g. `*` or `_`) allowed in a single line (default: `PHP_INT_MAX`). Setting this to a positive integer can help protect against long parse times and/or segfaults if lines are too long. - `slug_normalizer` - Array of options for configuring how URL-safe slugs are created; see [the slug normalizer docs](/2.5/customization/slug-normalizer/#configuration) for more details - `instance` - An alternative normalizer to use (defaults to the included `SlugNormalizer`) - `max_length` - Limits the size of generated slugs (defaults to 255 characters) diff --git a/docs/2.5/security.md b/docs/2.5/security.md index b4bd8ab598..8573e69352 100644 --- a/docs/2.5/security.md +++ b/docs/2.5/security.md @@ -11,7 +11,8 @@ In order to be fully compliant with the CommonMark spec, certain security settin - `html_input`: How to handle raw HTML - `allow_unsafe_links`: Whether unsafe links are permitted -- `max_nesting_level`: Protected against long render times or segfaults +- `max_nesting_level`: Protect against long render times or segfaults +- `max_delimiters_per_line`: Protect against long parse times or rendering segfaults Further information about each option can be found below. @@ -88,6 +89,25 @@ echo $converter->convert($markdown); See the [configuration](/2.5/configuration/) section for more information. +## Max Delimiters Per Line + +Similarly to the maximum nesting level, **no maximum number of delimiters per line is enforced by default.** Delimiters can be nested (like `*a **b** c*`) or un-nested (like `*a* *b* *c*`) - in either case, having too many in a single line can result in long parse times. We therefore have a separate option to limit the number of delimiters per line. + +If you need to parse untrusted input, consider setting a reasonable `max_delimiters_per_line` (perhaps 100-1000) depending on your needs. Once this level is hit, any subsequent delimiters on that line will be rendered as plain text. + +### Example - Prevent too many delimiters + +```php +use League\CommonMark\CommonMarkConverter; + +$markdown = '*a* **b *c **d** c* b**'; // 8 delimiters (* and **) + +$converter = new CommonMarkConverter(['max_delimiters_per_line' => 6]); +echo $converter->convert($markdown); + +//

a **b *c d c* b**

+``` + ## Additional Filtering Although this library does offer these security features out-of-the-box, some users may opt to also run the HTML output through additional filtering layers (like HTMLPurifier). If you do this, make sure you **thoroughly** test your additional post-processing steps and configure them to work properly with the types of HTML elements and attributes that converted Markdown might produce, otherwise, you may end up with weird behavior like missing images, broken links, mismatched HTML tags, etc. diff --git a/docs/2.6/upgrading.md b/docs/2.6/upgrading.md index d19d1760a5..5d6e91c3c6 100644 --- a/docs/2.6/upgrading.md +++ b/docs/2.6/upgrading.md @@ -7,6 +7,13 @@ redirect_from: /upgrading/ # Upgrading from 2.5 to 2.6 +## `max_delimiters_per_line` Configuration Option + +The `max_delimiters_per_line` configuration option was added in 2.6 to help protect against malicious input that could +cause excessive memory usage or denial of service attacks. It defaults to `PHP_INT_MAX` (no limit) for backwards +compatibility, which is safe when parsing trusted input. However, if you're parsing untrusted input from users, you +should probably set this to a reasonable value (somewhere between `100` and `1000`) to protect against malicious inputs. + ## Custom Delimiter Processors If you're implementing a custom delimiter processor, and `getDelimiterUse()` has more logic than just a diff --git a/src/Delimiter/DelimiterStack.php b/src/Delimiter/DelimiterStack.php index 665b03799e..cf2a41e524 100644 --- a/src/Delimiter/DelimiterStack.php +++ b/src/Delimiter/DelimiterStack.php @@ -39,8 +39,13 @@ final class DelimiterStack */ private $missingIndexCache; - public function __construct() + + private int $remainingDelimiters = 0; + + public function __construct(int $maximumStackSize = PHP_INT_MAX) { + $this->remainingDelimiters = $maximumStackSize; + if (\PHP_VERSION_ID >= 80000) { /** @psalm-suppress PropertyTypeCoercion */ $this->missingIndexCache = new \WeakMap(); // @phpstan-ignore-line @@ -51,6 +56,10 @@ public function __construct() public function push(DelimiterInterface $newDelimiter): void { + if ($this->remainingDelimiters-- <= 0) { + return; + } + $newDelimiter->setPrevious($this->top); if ($this->top !== null) { diff --git a/src/Environment/Environment.php b/src/Environment/Environment.php index 3c24749bbb..a8112967e4 100644 --- a/src/Environment/Environment.php +++ b/src/Environment/Environment.php @@ -432,6 +432,7 @@ public static function createDefaultConfiguration(): Configuration 'html_input' => Expect::anyOf(HtmlFilter::STRIP, HtmlFilter::ALLOW, HtmlFilter::ESCAPE)->default(HtmlFilter::ALLOW), 'allow_unsafe_links' => Expect::bool(true), 'max_nesting_level' => Expect::type('int')->default(PHP_INT_MAX), + 'max_delimiters_per_line' => Expect::type('int')->default(PHP_INT_MAX), 'renderer' => Expect::structure([ 'block_separator' => Expect::string("\n"), 'inner_separator' => Expect::string("\n"), diff --git a/src/Parser/InlineParserContext.php b/src/Parser/InlineParserContext.php index 796f2f388c..9372904281 100644 --- a/src/Parser/InlineParserContext.php +++ b/src/Parser/InlineParserContext.php @@ -42,12 +42,12 @@ final class InlineParserContext */ private array $matches; - public function __construct(Cursor $contents, AbstractBlock $container, ReferenceMapInterface $referenceMap) + public function __construct(Cursor $contents, AbstractBlock $container, ReferenceMapInterface $referenceMap, int $maxDelimitersPerLine = PHP_INT_MAX) { $this->referenceMap = $referenceMap; $this->container = $container; $this->cursor = $contents; - $this->delimiterStack = new DelimiterStack(); + $this->delimiterStack = new DelimiterStack($maxDelimitersPerLine); } public function getContainer(): AbstractBlock diff --git a/src/Parser/InlineParserEngine.php b/src/Parser/InlineParserEngine.php index b91a63f72f..6a26979329 100644 --- a/src/Parser/InlineParserEngine.php +++ b/src/Parser/InlineParserEngine.php @@ -59,7 +59,7 @@ public function parse(string $contents, AbstractBlock $block): void $contents = \trim($contents); $cursor = new Cursor($contents); - $inlineParserContext = new InlineParserContext($cursor, $block, $this->referenceMap); + $inlineParserContext = new InlineParserContext($cursor, $block, $this->referenceMap, $this->environment->getConfiguration()->get('max_delimiters_per_line')); // Have all parsers look at the line to determine what they might want to parse and what positions they exist at foreach ($this->matchParsers($contents) as $matchPosition => $parsers) { diff --git a/tests/functional/MaxDelimitersPerLineTest.php b/tests/functional/MaxDelimitersPerLineTest.php new file mode 100644 index 0000000000..1de9697dc5 --- /dev/null +++ b/tests/functional/MaxDelimitersPerLineTest.php @@ -0,0 +1,50 @@ + + * + * For the full copyright and license information, please view the LICENSE + * file that was distributed with this source code. + */ + +namespace League\CommonMark\Tests\Functional; + +use League\CommonMark\CommonMarkConverter; +use PHPUnit\Framework\TestCase; + +final class MaxDelimitersPerLineTest extends TestCase +{ + /** + * @dataProvider provideTestCases + */ + public function testIt(string $input, int $maxDelimsPerLine, string $expectedOutput): void + { + $converter = new CommonMarkConverter(['max_delimiters_per_line' => $maxDelimsPerLine]); + + $this->assertEquals($expectedOutput, \trim($converter->convert($input)->getContent())); + } + + /** + * @return iterable> + */ + public function provideTestCases(): iterable + { + yield ['*a* **b *c* b**', 6, '

a b c b

']; + + yield ['*a* **b *c **d** c* b**', 0, '

*a* **b *c **d** c* b**

']; + yield ['*a* **b *c **d** c* b**', 1, '

*a* **b *c **d** c* b**

']; + yield ['*a* **b *c **d** c* b**', 2, '

a **b *c **d** c* b**

']; + yield ['*a* **b *c **d** c* b**', 3, '

a **b *c **d** c* b**

']; + yield ['*a* **b *c **d** c* b**', 4, '

a **b *c **d** c* b**

']; + yield ['*a* **b *c **d** c* b**', 5, '

a **b *c **d** c* b**

']; + yield ['*a* **b *c **d** c* b**', 6, '

a **b *c d c* b**

']; + yield ['*a* **b *c **d** c* b**', 7, '

a **b c d c b**

']; + yield ['*a* **b *c **d** c* b**', 8, '

a b c d c b

']; + yield ['*a* **b *c **d** c* b**', 9, '

a b c d c b

']; + yield ['*a* **b *c **d** c* b**', 100, '

a b c d c b

']; + } +} From 8b9d95e8b5a13cb8fc2714adc20bbba49fb17d16 Mon Sep 17 00:00:00 2001 From: Colin O'Dell Date: Sat, 7 Dec 2024 10:17:04 -0500 Subject: [PATCH 27/29] Fix pathological test suite failing not loading extensions in CI --- tests/pathological/test.php | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tests/pathological/test.php b/tests/pathological/test.php index 83936ec731..a911f3a336 100755 --- a/tests/pathological/test.php +++ b/tests/pathological/test.php @@ -291,7 +291,12 @@ $timeout = \max(5, \min(60, $timeout)); } - $command = ['php', '-n', 'convert.php']; + if (isset($_ENV['CI']) || isset($_SERVER['CI'])) { + $command = ['php', 'convert.php']; + } else { + $command = ['php', '-n', 'convert.php']; + } + if (isset($case['extension'])) { $command[] = $case['extension']; } From 92dbad9f833783f9ad885f1a3a3796577d529981 Mon Sep 17 00:00:00 2001 From: Colin O'Dell Date: Sat, 7 Dec 2024 10:17:05 -0500 Subject: [PATCH 28/29] Flag this as a security release --- CHANGELOG.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7fd3207bcd..9846fe0c97 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,9 @@ Updates should follow the [Keep a CHANGELOG](https://keepachangelog.com/) princi ## [Unreleased][unreleased] +This is a **security release** to address potential denial of service attacks when parsing specially crafted, +malicious input from untrusted sources (like user input). + ### Added - Added `max_delimiters_per_line` config option to prevent denial of service attacks when parsing malicious input From d777db8d71a4ebf84ddf93905af9a6f1a861a1ff Mon Sep 17 00:00:00 2001 From: Colin O'Dell Date: Sat, 7 Dec 2024 10:17:05 -0500 Subject: [PATCH 29/29] Add pathological test for deeply-nested blocks --- tests/pathological/convert.php | 10 +++++++--- tests/pathological/test.php | 18 +++++++++++++++++- 2 files changed, 24 insertions(+), 4 deletions(-) diff --git a/tests/pathological/convert.php b/tests/pathological/convert.php index 0768db3a9b..51c6d42330 100755 --- a/tests/pathological/convert.php +++ b/tests/pathological/convert.php @@ -35,12 +35,16 @@ exit(1); } -$environment = new Environment(); +$config = []; +if (isset($argv[1])) { + $config = \json_decode($argv[1], true); +} + +$environment = new Environment($config); $environment->addExtension(new CommonMarkCoreExtension()); // Enable additional extensions if requested -$extension = $argv[1] ?? null; -switch ($argv[1] ?? null) { +switch ($argv[2] ?? null) { case 'table': $environment->addExtension(new TableExtension()); break; diff --git a/tests/pathological/test.php b/tests/pathological/test.php index a911f3a336..fea326a449 100755 --- a/tests/pathological/test.php +++ b/tests/pathological/test.php @@ -221,6 +221,22 @@ 'input' => static fn($n) => \str_repeat(">", $n) . \str_repeat(".", $n) . "\n", 'expected' => static fn($n) => \str_repeat("
\n", $n) . '

' . \str_repeat('.', $n) . "

\n" . \str_repeat("
\n", $n), ], + 'CVE-2023-24824 test 1' => [ + 'ref' => 'https://github.com/github/cmark-gfm/security/advisories/GHSA-66g8-4hjf-77xh', + 'sizes' => [1_000, 10_000, 100_000], + 'input' => static fn($n) => \str_repeat(">", $n) . \str_repeat("a*", $n) . "\n", + 'configuration' => [ + 'max_nesting_level' => 1_000, + ], + ], + 'CVE-2023-24824 test 2' => [ + 'ref' => 'https://github.com/github/cmark-gfm/security/advisories/GHSA-66g8-4hjf-77xh', + 'sizes' => [500, 5_000, 50_000], + 'input' => static fn($n) => \str_repeat(" -", $n) . 'x' . \str_repeat("\n", $n), + 'configuration' => [ + 'max_nesting_level' => 500, + ], + ], 'CVE-2023-26485 test 1' => [ 'ref' => 'https://github.com/github/cmark-gfm/security/advisories/GHSA-r8vr-c48j-fcc5', 'sizes' => [50, 500, 5_000], // ideally should be 1000, 10_000, 100_000 but recursive rendering makes large sizes fail @@ -294,7 +310,7 @@ if (isset($_ENV['CI']) || isset($_SERVER['CI'])) { $command = ['php', 'convert.php']; } else { - $command = ['php', '-n', 'convert.php']; + $command = ['php', '-n', 'convert.php', \json_encode($case['configuration'] ?? [])]; } if (isset($case['extension'])) {