From 25d7e150462d6dcc8aec8489af2dfbd0f9cd22bb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Robert=20Hus=C3=A1k?=
Date: Wed, 6 May 2020 19:29:10 +0200
Subject: [PATCH] Transform 2 and 3 byte UTF-8 ranges to UTF-16

---
 .../RegexParser.cs                            |   3 +
 .../RegexUtf8RangeTransformer.cs              | 226 ++++++++++++++++++
 .../PcreTests.cs                              |  41 ++-
 3 files changed, 269 insertions(+), 1 deletion(-)
 create mode 100644 src/Peachpie.Library.RegularExpressions/RegexUtf8RangeTransformer.cs

diff --git a/src/Peachpie.Library.RegularExpressions/RegexParser.cs b/src/Peachpie.Library.RegularExpressions/RegexParser.cs
index 1db0acf..693cc84 100644
--- a/src/Peachpie.Library.RegularExpressions/RegexParser.cs
+++ b/src/Peachpie.Library.RegularExpressions/RegexParser.cs
@@ -2732,6 +2732,9 @@ private void AddUnitType(int type)
          */
         private void AddGroup()
         {
+            // Try to identify common patterns for matching UTF-8 ranges and convert them to UTF-16
+            RegexUtf8RangeTransformer.TryTransformRanges(_concatenation);
+
             if (_group.Type() == RegexNode.Testgroup || _group.Type() == RegexNode.Testref)
             {
                 _group.AddChild(_concatenation.ReverseLeft());
diff --git a/src/Peachpie.Library.RegularExpressions/RegexUtf8RangeTransformer.cs b/src/Peachpie.Library.RegularExpressions/RegexUtf8RangeTransformer.cs
new file mode 100644
index 0000000..2b0dc91
--- /dev/null
+++ b/src/Peachpie.Library.RegularExpressions/RegexUtf8RangeTransformer.cs
@@ -0,0 +1,226 @@
+using System;
+using System.Collections.Generic;
+using System.Diagnostics;
+using System.Text;
+
+namespace Peachpie.Library.RegularExpressions
+{
+    /// <summary>
+    /// Helper class to transform typical UTF-8 ranges expressed as sequences of byte ranges to UTF-16 ranges,
+    /// e.g. [\xC2-\xDF][\x80-\xBF] to [\u0080-\u07FF].
+    /// </summary>
+    internal static class RegexUtf8RangeTransformer
+    {
+        /// <summary>
+        /// State of a helper automaton to identify the sequence of ranges and supply the resulting UTF-16 range.
+        /// All its states are immutable and static, the user traverses it by navigating through their references.
+        /// </summary>
+        private sealed class MatchState
+        {
+            /// <summary>
+            /// Serialized UTF-16 character class to emit, or <c>null</c> if the state is not final.
+            /// </summary>
+            public string Utf16Range { get; }
+
+            /// <summary>
+            /// Whether a whole UTF-8 sequence has been recognized.
+            /// </summary>
+            public bool IsFinal => Utf16Range != null;
+
+            /// <summary>
+            /// Selects the state following the given inclusive range, or <c>null</c> if there is none.
+            /// </summary>
+            private delegate MatchState NextStateMatcher(char lower, char upper);
+
+            private NextStateMatcher NextMatcher { get; }
+
+            private MatchState(string utf16Range, NextStateMatcher next)
+            {
+                this.Utf16Range = utf16Range;
+                this.NextMatcher = next;
+            }
+
+            private static MatchState CreateIntermediate(NextStateMatcher nextMatcher) => new MatchState(null, nextMatcher);
+
+            private static MatchState CreateFinal(char utf16RangeFirst, char utf16RangeLast)
+            {
+                var charClass = new RegexCharClass();
+                charClass.AddRange(utf16RangeFirst, utf16RangeLast);
+
+                // Nothing can follow a completed sequence
+                return new MatchState(charClass.ToStringClass(), (f, l) => null);
+            }
+
+            /// <summary>
+            /// Moves over a single character, which is equivalent to the range [c-c].
+            /// Returns <see cref="Start"/> if the character cannot continue the current sequence.
+            /// </summary>
+            public MatchState MatchNextState(char singleChar) => NextMatcher(singleChar, singleChar) ?? Start;
+
+            /// <summary>
+            /// Moves over a serialized character class.
+            /// Returns <see cref="Start"/> if the class cannot continue the current sequence.
+            /// </summary>
+            public MatchState MatchNextState(string range)
+            {
+                // We are interested only in single ranges, e.g. [\x80-\xBF]. The prefix check has to be ordinal,
+                // a culture-sensitive comparison may skip control characters and accept a different prefix.
+                if (range.Length == 5 && range.StartsWith("\x00\x02\x00", StringComparison.Ordinal))
+                {
+                    // The upper bound is taken as one less than the last stored character
+                    return NextMatcher(range[3], (char)(range[4] - 1)) ?? Start;
+                }
+                else
+                {
+                    return Start;
+                }
+            }
+
+            /// <summary>
+            /// Initial state of the automaton waiting for the first range input.
+            /// </summary>
+            public static readonly MatchState Start = CreateIntermediate((f, l) =>
+                (f, l) switch
+                {
+                    ('\xC2', '\xDF') => TwoByte1,                   // [\xC2-\xDF][\x80-\xBF]       => [\u0080-\u07FF]
+                    ('\xE0', '\xE0') => ThreeByteNoOverlongs1,      // \xE0[\xA0-\xBF][\x80-\xBF]   => [\u0800-\u0FFF]
+                    ('\xE1', '\xEC') => ThreeByteStraight1,         // [\xE1-\xEC][\x80-\xBF]{2}    => [\u1000-\uCFFF]
+                    ('\xED', '\xED') => ThreeBytePresurrogates1,    // \xED[\x80-\x9F][\x80-\xBF]   => [\uD000-\uD7FF]
+                    ('\xEE', '\xEF') => ThreeBytePostsurrogates1,   // [\xEE-\xEF][\x80-\xBF]{2}    => [\uE000-\uFFFF]
+                    _ => null
+                });
+
+            private static readonly MatchState TwoByte1 = CreateIntermediate((f, l) => (f, l) == ('\x80', '\xBF') ? TwoByte2 : null);
+
+            private static readonly MatchState TwoByte2 = CreateFinal('\u0080', '\u07FF');
+
+
+            private static readonly MatchState ThreeByteNoOverlongs1 = CreateIntermediate((f, l) => (f, l) == ('\xA0', '\xBF') ? ThreeByteNoOverlongs2 : null);
+
+            private static readonly MatchState ThreeByteNoOverlongs2 = CreateIntermediate((f, l) => (f, l) == ('\x80', '\xBF') ? ThreeByteNoOverlongs3 : null);
+
+            private static readonly MatchState ThreeByteNoOverlongs3 = CreateFinal('\u0800', '\u0FFF');
+
+
+            private static readonly MatchState ThreeByteStraight1 = CreateIntermediate((f, l) => (f, l) == ('\x80', '\xBF') ? ThreeByteStraight2 : null);
+
+            private static readonly MatchState ThreeByteStraight2 = CreateIntermediate((f, l) => (f, l) == ('\x80', '\xBF') ? ThreeByteStraight3 : null);
+
+            private static readonly MatchState ThreeByteStraight3 = CreateFinal('\u1000', '\uCFFF');
+
+
+            private static readonly MatchState ThreeBytePresurrogates1 = CreateIntermediate((f, l) => (f, l) == ('\x80', '\x9F') ? ThreeBytePresurrogates2 : null);
+
+            private static readonly MatchState ThreeBytePresurrogates2 = CreateIntermediate((f, l) => (f, l) == ('\x80', '\xBF') ? ThreeBytePresurrogates3 : null);
+
+            private static readonly MatchState ThreeBytePresurrogates3 = CreateFinal('\uD000', '\uD7FF');
+
+
+            private static readonly MatchState ThreeBytePostsurrogates1 = CreateIntermediate((f, l) => (f, l) == ('\x80', '\xBF') ? ThreeBytePostsurrogates2 : null);
+
+            private static readonly MatchState ThreeBytePostsurrogates2 = CreateIntermediate((f, l) => (f, l) == ('\x80', '\xBF') ? ThreeBytePostsurrogates3 : null);
+
+            private static readonly MatchState ThreeBytePostsurrogates3 = CreateFinal('\uE000', '\uFFFF');
+        }
+
+        /// <summary>
+        /// Attempts to identify common patterns for matching UTF-8 ranges and convert them to UTF-16 ranges
+        /// by modifying the children of the given concatenation.
+        /// </summary>
+        /// <param name="concatenation">Node whose children are inspected and replaced in place.</param>
+        public static void TryTransformRanges(RegexNode concatenation)
+        {
+            if (concatenation.Children == null)
+            {
+                return;
+            }
+
+            // Mark the state of the current search and its start in the node list
+            var matchState = MatchState.Start;
+            int iMatchStart = -1;
+
+            for (int i = 0; i < concatenation.Children.Count; i++)
+            {
+                var child = concatenation.Children[i];
+
+                var nextState = Advance(matchState, child);
+                if (nextState == MatchState.Start && matchState != MatchState.Start)
+                {
+                    // The node broke the sequence in progress but it may still begin a new one,
+                    // e.g. the middle node of \xE0[\xC2-\xDF][\x80-\xBF]
+                    iMatchStart = -1;
+                    nextState = Advance(MatchState.Start, child);
+                }
+
+                matchState = nextState;
+
+                if (matchState != MatchState.Start)
+                {
+                    if (iMatchStart == -1)
+                    {
+                        iMatchStart = i;
+                    }
+
+                    if (matchState.IsFinal)
+                    {
+                        // Replace the matched sequence by a single range
+                        concatenation.Children[iMatchStart] = new RegexNode(RegexNode.Set, concatenation.Options, matchState.Utf16Range);
+                        concatenation.Children.RemoveRange(iMatchStart + 1, i - iMatchStart);
+
+                        // Fix iteration variable after the range removal
+                        i = iMatchStart;
+
+                        // Reset the found match
+                        iMatchStart = -1;
+                        matchState = MatchState.Start;
+                    }
+                }
+                else
+                {
+                    // No sequence is in progress after this node
+                    iMatchStart = -1;
+                }
+            }
+        }
+
+        /// <summary>
+        /// Moves the automaton over a single node of a concatenation.
+        /// Returns <see cref="MatchState.Start"/> if the node cannot continue the sequence represented by <paramref name="state"/>.
+        /// </summary>
+        private static MatchState Advance(MatchState state, RegexNode node)
+        {
+            switch (node.Type())
+            {
+                case RegexNode.One:
+                    // Single character is equivalent to an interval [c-c]
+                    return state.MatchNextState(node.Ch);
+
+                case RegexNode.Set:
+                    return state.MatchNextState(node.Str);
+
+                case RegexNode.Setloop:
+                    if (node.M != node.N || node.M > 3)
+                    {
+                        goto default;
+                    }
+
+                    // Either the whole loop is accepted or nothing (there's no splitting the loop),
+                    // hence a failed iteration must not be followed by a restart within the same node
+                    for (int j = 0; j < node.N; j++)
+                    {
+                        state = state.MatchNextState(node.Str);
+                        if (state == MatchState.Start)
+                        {
+                            break;
+                        }
+                    }
+
+                    return state;
+
+                default:
+                    // Any other node type resets the matching logic
+                    return MatchState.Start;
+            }
+        }
+    }
+}
diff --git a/tests/Peachpie.Library.RegularExpressions.Tests/PcreTests.cs b/tests/Peachpie.Library.RegularExpressions.Tests/PcreTests.cs
index 73e54de..1f3e153 100644
--- a/tests/Peachpie.Library.RegularExpressions.Tests/PcreTests.cs
+++ b/tests/Peachpie.Library.RegularExpressions.Tests/PcreTests.cs
@@ -1,4 +1,4 @@
-using System;
+using System;
 using Xunit;
 
 namespace Peachpie.Library.RegularExpressions.Tests
@@ -123,5 +123,44 @@ public void TestSkipVerb()
             Assert.Equal("ab", match(@"/(*SKIP)ab/", "aab").Value);
             Assert.False(match(@"/(ab(*SKIP)(*F)|abc)/", "abc").Success);
         }
+
+        [Fact]
+        public void TestUtf8Ranges1()
+        {
+            Assert.Equal("\u0080", match(@"/[\xC2-\xDF][\x80-\xBF]/", "\u0080").Value);
+            Assert.Equal("ř", match(@"/[\xC2-\xDF][\x80-\xBF]/", "ř").Value);
+            Assert.Equal("\u07FF", match(@"/[\xC2-\xDF][\x80-\xBF]/", "\u07FF").Value);
+
+            Assert.Equal("\u0800", match(@"/\xE0[\xA0-\xBF][\x80-\xBF]/", "\u0800").Value);
+            Assert.Equal("\u0FFF", match(@"/\xE0[\xA0-\xBF][\x80-\xBF]/", "\u0FFF").Value);
+
+            Assert.Equal("\u1000", match(@"/[\xE1-\xEC][\x80-\xBF]{2}/", "\u1000").Value);
+            Assert.Equal("\uCFFF", match(@"/[\xE1-\xEC][\x80-\xBF]{2}/", "\uCFFF").Value);
+
+            Assert.Equal("\uD000", match(@"/\xED[\x80-\x9F][\x80-\xBF]/", "\uD000").Value);
+            Assert.Equal("\uD7FF", match(@"/\xED[\x80-\x9F][\x80-\xBF]/", "\uD7FF").Value);
+
+            Assert.Equal("\uE000", match(@"/[\xEE-\xEF][\x80-\xBF]{2}/", "\uE000").Value);
+            Assert.Equal("\uFFFF", match(@"/[\xEE-\xEF][\x80-\xBF]{2}/", "\uFFFF").Value);
+        }
+
+        [Fact]
+        public void TestUtf8Ranges2()
+        {
+            string czechSentence = "Příliš žluťoučký kůň úpěl ďábelské ódy";
+
+            Assert.Equal(czechSentence, match(@"/([\x00-\x7F]|[\xC2-\xDF][\x80-\xBF])*/", czechSentence).Value);
+        }
+
+        [Fact]
+        public void TestUtf8Ranges3()
+        {
+            // A node that breaks a partial sequence may still start another one
+            Assert.Equal("\u00E0ř", match(@"/\xE0[\xC2-\xDF][\x80-\xBF]/", "\u00E0ř").Value);
+
+            // Repeated or negated sets must not be mistaken for a part of a UTF-8 sequence
+            Assert.False(match(@"/[\xC2-\xDF]{3}[\x80-\xBF]/", "ř").Success);
+            Assert.False(match(@"/[\xC2-\xDF][^\x80-\xBF]/", "ř").Success);
+        }
     }
 }