From 11748169b5125a2c46e0f4339907d8436211c05c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Robert=20Hus=C3=A1k?= Date: Wed, 6 May 2020 23:00:26 +0200 Subject: [PATCH] Transform 4 byte UTF-8 ranges to UTF-16 --- .../RegexNode.cs | 3 + .../RegexParser.cs | 3 - .../RegexUtf8RangeTransformer.cs | 84 ++++++++++++++++--- .../PcreTests.cs | 26 ++++++ 4 files changed, 101 insertions(+), 15 deletions(-) diff --git a/src/Peachpie.Library.RegularExpressions/RegexNode.cs b/src/Peachpie.Library.RegularExpressions/RegexNode.cs index 6e743a4..59d416b 100644 --- a/src/Peachpie.Library.RegularExpressions/RegexNode.cs +++ b/src/Peachpie.Library.RegularExpressions/RegexNode.cs @@ -437,6 +437,9 @@ private RegexNode ReduceConcatenation() if (Children == null) return new RegexNode(Empty, Options); + // Try to identify common patterns for matching UTF-8 ranges and convert them to UTF-16 + RegexUtf8RangeTransformer.TryTransformRanges(this); + bool wasLastString = false; RegexOptions optionsLast = 0; RegexOptions optionsAt; diff --git a/src/Peachpie.Library.RegularExpressions/RegexParser.cs b/src/Peachpie.Library.RegularExpressions/RegexParser.cs index 693cc84..1db0acf 100644 --- a/src/Peachpie.Library.RegularExpressions/RegexParser.cs +++ b/src/Peachpie.Library.RegularExpressions/RegexParser.cs @@ -2732,9 +2732,6 @@ private void AddUnitType(int type) */ private void AddGroup() { - // Try to identify common patterns for matching UTF-8 ranges and convert them to UTF-16 - RegexUtf8RangeTransformer.TryTransformRanges(_concatenation); - if (_group.Type() == RegexNode.Testgroup || _group.Type() == RegexNode.Testref) { _group.AddChild(_concatenation.ReverseLeft()); diff --git a/src/Peachpie.Library.RegularExpressions/RegexUtf8RangeTransformer.cs b/src/Peachpie.Library.RegularExpressions/RegexUtf8RangeTransformer.cs index 2b0dc91..c50d146 100644 --- a/src/Peachpie.Library.RegularExpressions/RegexUtf8RangeTransformer.cs +++ b/src/Peachpie.Library.RegularExpressions/RegexUtf8RangeTransformer.cs @@ -19,26 +19,38 @@ private class MatchState { public string Utf16Range { get; } + public string Utf16Range2 { get; } + public bool IsFinal => Utf16Range != null; private delegate MatchState NextStateMatcher(char lower, char upper); private NextStateMatcher NextMatcher { get; } - private MatchState(string utf16Range, NextStateMatcher next) + private MatchState(string utf16Range, string utf16Range2, NextStateMatcher next) { this.Utf16Range = utf16Range; + this.Utf16Range2 = utf16Range2; this.NextMatcher = next; } - private static MatchState CreateIntermediate(NextStateMatcher nextMatcher) => new MatchState(null, nextMatcher); + private static MatchState CreateIntermediate(NextStateMatcher nextMatcher) => new MatchState(null, null, nextMatcher); - private static MatchState CreateFinal(char utf16RangeFirst, char utf16RangeLast) + private static MatchState CreateFinal(char utf16RangeFirst, char utf16RangeLast, char? utf16Range2First = null, char? utf16Range2Last = null) { var charClass = new RegexCharClass(); charClass.AddRange(utf16RangeFirst, utf16RangeLast); - return new MatchState(charClass.ToStringClass(), (f, l) => null); + RegexCharClass charClass2 = null; + if (utf16Range2First != null) + { + Debug.Assert(utf16Range2Last != null); + + charClass2 = new RegexCharClass(); + charClass2.AddRange(utf16Range2First.Value, utf16Range2Last.Value); + } + + return new MatchState(charClass.ToStringClass(), charClass2?.ToStringClass(), (f, l) => null); } public MatchState MatchNextState(char singleChar) => NextMatcher(singleChar, singleChar) ?? Start; @@ -62,14 +74,23 @@ public MatchState MatchNextState(string range) public static MatchState Start = CreateIntermediate((f, l) => (f, l) switch { - ('\xC2', '\xDF') => TwoByte1, // [\xC2-\xDF][\x80-\xBF] => [\u0080-\u07FF] - ('\xE0', '\xE0') => ThreeByteNoOverlongs1, // \xE0[\xA0-\xBF][\x80-\xBF] => [\u0800-\u0FFF] - ('\xE1', '\xEC') => ThreeByteStraight1, // [\xE1-\xEC][\x80-\xBF]{2} => [\u1000-\uCFFF] - ('\xED', '\xED') => ThreeBytePresurrogates1, // \xED[\x80-\x9F][\x80-\xBF] => [\uD000-\uD7FF] - ('\xEE', '\xEF') => ThreeBytePostsurrogates1, // [\xEE-\xEF][\x80-\xBF]{2} => [\uE000-\uFFFF] + // 2 bytes + ('\xC2', '\xDF') => TwoByte1, // [\xC2-\xDF][\x80-\xBF] => [\u0080-\u07FF] + + // 3 bytes + ('\xE0', '\xE0') => ThreeByteNoOverlongs1, // \xE0[\xA0-\xBF][\x80-\xBF] => [\u0800-\u0FFF] + ('\xE1', '\xEC') => ThreeByteStraight1, // [\xE1-\xEC][\x80-\xBF]{2} => [\u1000-\uCFFF] + ('\xED', '\xED') => ThreeBytePresurrogates1, // \xED[\x80-\x9F][\x80-\xBF] => [\uD000-\uD7FF] + ('\xEE', '\xEF') => ThreeBytePostsurrogates1, // [\xEE-\xEF][\x80-\xBF]{2} => [\uE000-\uFFFF] + + // 4 bytes - conversion to surrogate pairs + ('\xF0', '\xF0') => FourByteFirst1, // \xF0[\x90-\xBF][\x80-\xBF]{2} => [\u10000-\u3FFFF] => [\uD800-\uD8BF][\uDC00-\uDFFF] + ('\xF1', '\xF3') => FourByteSecond1, // [\xF1-\xF3][\x80-\xBF]{3} => [\u40000-\uFFFFF] => [\uD8C0-\uDBBF][\uDC00-\uDFFF] + ('\xF4', '\xF4') => FourByteThird1, // \xF4[\x80-\x8F][\x80-\xBF]{2} => [\u100000-\u10FFFF] => [\uDBC0-\uDBFF][\uDC00-\uDFFF] _ => null }); + private static MatchState TwoByte1 = CreateIntermediate((f, l) => (f, l) == ('\x80', '\xBF') ? TwoByte2 : null); private static MatchState TwoByte2 = CreateFinal('\u0080', '\u07FF'); @@ -101,6 +122,33 @@ public MatchState MatchNextState(string range) private static MatchState ThreeBytePostsurrogates2 = CreateIntermediate((f, l) => (f, l) == ('\x80', '\xBF') ? ThreeBytePostsurrogates3 : null); private static MatchState ThreeBytePostsurrogates3 = CreateFinal('\uE000', '\uFFFF'); + + + private static MatchState FourByteFirst1 = CreateIntermediate((f, l) => (f, l) == ('\x90', '\xBF') ? FourByteFirst2 : null); + + private static MatchState FourByteFirst2 = CreateIntermediate((f, l) => (f, l) == ('\x80', '\xBF') ? FourByteFirst3 : null); + + private static MatchState FourByteFirst3 = CreateIntermediate((f, l) => (f, l) == ('\x80', '\xBF') ? FourByteFirst4 : null); + + private static MatchState FourByteFirst4 = CreateFinal('\uD800', '\uD8BF', '\uDC00', '\uDFFF'); + + + private static MatchState FourByteSecond1 = CreateIntermediate((f, l) => (f, l) == ('\x80', '\xBF') ? FourByteSecond2 : null); + + private static MatchState FourByteSecond2 = CreateIntermediate((f, l) => (f, l) == ('\x80', '\xBF') ? FourByteSecond3 : null); + + private static MatchState FourByteSecond3 = CreateIntermediate((f, l) => (f, l) == ('\x80', '\xBF') ? FourByteSecond4 : null); + + private static MatchState FourByteSecond4 = CreateFinal('\uD8C0', '\uDBBF', '\uDC00', '\uDFFF'); + + + private static MatchState FourByteThird1 = CreateIntermediate((f, l) => (f, l) == ('\x80', '\x8F') ? FourByteThird2 : null); + + private static MatchState FourByteThird2 = CreateIntermediate((f, l) => (f, l) == ('\x80', '\xBF') ? FourByteThird3 : null); + + private static MatchState FourByteThird3 = CreateIntermediate((f, l) => (f, l) == ('\x80', '\xBF') ? FourByteThird4 : null); + + private static MatchState FourByteThird4 = CreateFinal('\uDBC0', '\uDBFF', '\uDC00', '\uDFFF'); } /// @@ -165,11 +213,23 @@ public static void TryTransformRanges(RegexNode concatenation) if (matchState.IsFinal) { // Replace the matched sequence by a single range - concatenation.Children[iMatchStart] = new RegexNode(RegexNode.Set, concatenation.Options, matchState.Utf16Range); - concatenation.Children.RemoveRange(iMatchStart + 1, i - iMatchStart); + + concatenation.Children[iMatchStart] = new RegexNode(RegexNode.Set, concatenation.Options, matchState.Utf16Range) { Next = concatenation }; + int replacedItems = 1; + + if (matchState.Utf16Range2 != null) + { + // We expect there are at least two nodes to be replaced for a 4-byte UTF-8 range match + Debug.Assert(i - iMatchStart >= 1); + + concatenation.Children[iMatchStart + 1] = new RegexNode(RegexNode.Set, concatenation.Options, matchState.Utf16Range2) { Next = concatenation }; + replacedItems++; + } + + concatenation.Children.RemoveRange(iMatchStart + replacedItems, i - (iMatchStart + replacedItems - 1)); // Fix iteration variable after the range removal - i = iMatchStart; + i = iMatchStart + (replacedItems - 1); // Reset the found match iMatchStart = -1; diff --git a/tests/Peachpie.Library.RegularExpressions.Tests/PcreTests.cs b/tests/Peachpie.Library.RegularExpressions.Tests/PcreTests.cs index 1f3e153..329fb14 100644 --- a/tests/Peachpie.Library.RegularExpressions.Tests/PcreTests.cs +++ b/tests/Peachpie.Library.RegularExpressions.Tests/PcreTests.cs @@ -142,6 +142,15 @@ public void TestUtf8Ranges1() Assert.Equal("\uE000", match(@"/[\xEE-\xEF][\x80-\xBF]{2}/", "\uE000").Value); Assert.Equal("\uFFFF", match(@"/[\xEE-\xEF][\x80-\xBF]{2}/", "\uFFFF").Value); + + Assert.Equal("\U00010000", match(@"/\xF0[\x90-\xBF][\x80-\xBF]{2}/", "\U00010000").Value); + Assert.Equal("\U0003FFFF", match(@"/\xF0[\x90-\xBF][\x80-\xBF]{2}/", "\U0003FFFF").Value); + + Assert.Equal("\U00040000", match(@"/[\xF1-\xF3][\x80-\xBF]{3}/", "\U00040000").Value); + Assert.Equal("\U000FFFFF", match(@"/[\xF1-\xF3][\x80-\xBF]{3}/", "\U000FFFFF").Value); + + Assert.Equal("\U00100000", match(@"/\xF4[\x80-\x8F][\x80-\xBF]{2}/", "\U00100000").Value); + Assert.Equal("\U0010FFFF", match(@"/\xF4[\x80-\x8F][\x80-\xBF]{2}/", "\U0010FFFF").Value); } [Fact] @@ -150,6 +159,23 @@ public void TestUtf8Ranges2() string czechSentence = "Příliš žluťoučký kůň úpěl ďábelské ódy"; Assert.Equal(czechSentence, match(@"/([\x00-\x7F]|[\xC2-\xDF][\x80-\xBF])*/", czechSentence).Value); + + string utf8pattern = @"/ + ( + (?: [\x00-\x7F] # single-byte sequences 0xxxxxxx + | [\xC2-\xDF][\x80-\xBF] # double-byte sequences 110xxxxx 10xxxxxx + | \xE0[\xA0-\xBF][\x80-\xBF] # triple-byte sequences 1110xxxx 10xxxxxx * 2 + | [\xE1-\xEC][\x80-\xBF]{2} + | \xED[\x80-\x9F][\x80-\xBF] + | [\xEE-\xEF][\x80-\xBF]{2} + | \xF0[\x90-\xBF][\x80-\xBF]{2} # four-byte sequences 11110xxx 10xxxxxx * 3 + | [\xF1-\xF3][\x80-\xBF]{3} + | \xF4[\x80-\x8F][\x80-\xBF]{2} + ){1,40} # ...one or more times + ) | . # anything else + /x"; + + Assert.Equal(czechSentence, replace(utf8pattern, "$1", czechSentence)); } } }