Skip to content

Commit

Permalink
Transform 2 and 3 byte UTF-8 ranges to UTF-16
Browse files Browse the repository at this point in the history
  • Loading branch information
roberthusak committed May 6, 2020
1 parent bc4f5e2 commit 25d7e15
Show file tree
Hide file tree
Showing 3 changed files with 219 additions and 1 deletion.
3 changes: 3 additions & 0 deletions src/Peachpie.Library.RegularExpressions/RegexParser.cs
Original file line number Diff line number Diff line change
Expand Up @@ -2732,6 +2732,9 @@ private void AddUnitType(int type)
*/
private void AddGroup()
{
// Try to identify common patterns for matching UTF-8 ranges and convert them to UTF-16
RegexUtf8RangeTransformer.TryTransformRanges(_concatenation);

if (_group.Type() == RegexNode.Testgroup || _group.Type() == RegexNode.Testref)
{
_group.AddChild(_concatenation.ReverseLeft());
Expand Down
187 changes: 187 additions & 0 deletions src/Peachpie.Library.RegularExpressions/RegexUtf8RangeTransformer.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,187 @@
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.Text;

namespace Peachpie.Library.RegularExpressions
{
/// <summary>
/// Helper class to transform typical UTF-8 ranges expressed as sequences of byte ranges to UTF-16 ranges,
/// e.g. <c>[\xC2-\xDF][\x80-\xBF]</c> to <c>[\u0080-\u07FF]</c>.
/// </summary>
internal static class RegexUtf8RangeTransformer
{
/// <summary>
/// State of a helper automaton to identify the sequence of ranges and supply the resulting UTF-16 range.
/// All its states are immutable and static, the user traverses it by navigating through their references.
/// </summary>
private class MatchState
{
public string Utf16Range { get; }

public bool IsFinal => Utf16Range != null;

private delegate MatchState NextStateMatcher(char lower, char upper);

private NextStateMatcher NextMatcher { get; }

private MatchState(string utf16Range, NextStateMatcher next)
{
this.Utf16Range = utf16Range;
this.NextMatcher = next;
}

private static MatchState CreateIntermediate(NextStateMatcher nextMatcher) => new MatchState(null, nextMatcher);

private static MatchState CreateFinal(char utf16RangeFirst, char utf16RangeLast)
{
var charClass = new RegexCharClass();
charClass.AddRange(utf16RangeFirst, utf16RangeLast);

return new MatchState(charClass.ToStringClass(), (f, l) => null);
}

public MatchState MatchNextState(char singleChar) => NextMatcher(singleChar, singleChar) ?? Start;

public MatchState MatchNextState(string range)
{
// We are interested only in single ranges, e.g. [\x80-\xBF]
if (range.Length == 5 && range.StartsWith("\x00\x02\x00"))
{
return NextMatcher(range[3], (char)(range[4] - 1)) ?? Start;
}
else
{
return Start;
}
}

/// <summary>
/// Initial state of the automaton waiting for the first range input.
/// </summary>
public static MatchState Start = CreateIntermediate((f, l) =>
(f, l) switch
{
('\xC2', '\xDF') => TwoByte1, // [\xC2-\xDF][\x80-\xBF] => [\u0080-\u07FF]
('\xE0', '\xE0') => ThreeByteNoOverlongs1, // \xE0[\xA0-\xBF][\x80-\xBF] => [\u0800-\u0FFF]
('\xE1', '\xEC') => ThreeByteStraight1, // [\xE1-\xEC][\x80-\xBF]{2} => [\u1000-\uCFFF]
('\xED', '\xED') => ThreeBytePresurrogates1, // \xED[\x80-\x9F][\x80-\xBF] => [\uD000-\uD7FF]
('\xEE', '\xEF') => ThreeBytePostsurrogates1, // [\xEE-\xEF][\x80-\xBF]{2} => [\uE000-\uFFFF]
_ => null
});

private static MatchState TwoByte1 = CreateIntermediate((f, l) => (f, l) == ('\x80', '\xBF') ? TwoByte2 : null);

private static MatchState TwoByte2 = CreateFinal('\u0080', '\u07FF');


private static MatchState ThreeByteNoOverlongs1 = CreateIntermediate((f, l) => (f, l) == ('\xA0', '\xBF') ? ThreeByteNoOverlongs2 : null);

private static MatchState ThreeByteNoOverlongs2 = CreateIntermediate((f, l) => (f, l) == ('\x80', '\xBF') ? ThreeByteNoOverlongs3 : null);

private static MatchState ThreeByteNoOverlongs3 = CreateFinal('\u0800', '\u0FFF');


private static MatchState ThreeByteStraight1 = CreateIntermediate((f, l) => (f, l) == ('\x80', '\xBF') ? ThreeByteStraight2 : null);

private static MatchState ThreeByteStraight2 = CreateIntermediate((f, l) => (f, l) == ('\x80', '\xBF') ? ThreeByteStraight3 : null);

private static MatchState ThreeByteStraight3 = CreateFinal('\u1000', '\uCFFF');


private static MatchState ThreeBytePresurrogates1 = CreateIntermediate((f, l) => (f, l) == ('\x80', '\x9F') ? ThreeBytePresurrogates2 : null);

private static MatchState ThreeBytePresurrogates2 = CreateIntermediate((f, l) => (f, l) == ('\x80', '\xBF') ? ThreeBytePresurrogates3 : null);

private static MatchState ThreeBytePresurrogates3 = CreateFinal('\uD000', '\uD7FF');


private static MatchState ThreeBytePostsurrogates1 = CreateIntermediate((f, l) => (f, l) == ('\x80', '\xBF') ? ThreeBytePostsurrogates2 : null);

private static MatchState ThreeBytePostsurrogates2 = CreateIntermediate((f, l) => (f, l) == ('\x80', '\xBF') ? ThreeBytePostsurrogates3 : null);

private static MatchState ThreeBytePostsurrogates3 = CreateFinal('\uE000', '\uFFFF');
}

/// <summary>
/// Attempts to identify common patterns for matching UTF-8 ranges and convert them to UTF-16 ranges
/// by modifying the children of the given concatenation.
/// </summary>
/// <param name="concatenation"></param>
public static void TryTransformRanges(RegexNode concatenation)
{
if (concatenation.Children == null)
{
return;
}

// Mark the state of the current search and its start in the node list
var matchState = MatchState.Start;
int iMatchStart = -1;

for (int i = 0; i < concatenation.Children.Count; i++)
{
var child = concatenation.Children[i];
switch (child.Type())
{
case RegexNode.One:
// Single character is equivalent to an interval [c-c]
matchState = matchState.MatchNextState(child.Ch);
break;

case RegexNode.Set:
matchState = matchState.MatchNextState(child.Str);
break;

case RegexNode.Setloop:
if (child.M != child.N || child.M > 3)
{
goto default;
}
else
{
// Either the whole loop is accepted or nothing (there's no splitting the loop)
for (int j = 0; j < child.N; j++)
{
matchState = matchState.MatchNextState(child.Str);
}
}
break;

default:
// Any other node type resets the matching logic
iMatchStart = -1;
matchState = MatchState.Start;
break;
}

if (matchState != MatchState.Start)
{
if (iMatchStart == -1)
{
iMatchStart = i;
}

if (matchState.IsFinal)
{
// Replace the matched sequence by a single range
concatenation.Children[iMatchStart] = new RegexNode(RegexNode.Set, concatenation.Options, matchState.Utf16Range);
concatenation.Children.RemoveRange(iMatchStart + 1, i - iMatchStart);

// Fix iteration variable after the range removal
i = iMatchStart;

// Reset the found match
iMatchStart = -1;
matchState = MatchState.Start;
}
}
else
{
// Reset the current match start in case of a partial but unsuccessful match
iMatchStart = -1;
}
}
}
}
}
30 changes: 29 additions & 1 deletion tests/Peachpie.Library.RegularExpressions.Tests/PcreTests.cs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
using System;
using System;
using Xunit;

namespace Peachpie.Library.RegularExpressions.Tests
Expand Down Expand Up @@ -123,5 +123,33 @@ public void TestSkipVerb()
Assert.Equal("ab", match(@"/(*SKIP)ab/", "aab").Value);
Assert.False(match(@"/(ab(*SKIP)(*F)|abc)/", "abc").Success);
}

[Fact]
public void TestUtf8Ranges1()
{
Assert.Equal("\u0080", match(@"/[\xC2-\xDF][\x80-\xBF]/", "\u0080").Value);
Assert.Equal("ř", match(@"/[\xC2-\xDF][\x80-\xBF]/", "ř").Value);
Assert.Equal("\u07FF", match(@"/[\xC2-\xDF][\x80-\xBF]/", "\u07FF").Value);

Assert.Equal("\u0800", match(@"/\xE0[\xA0-\xBF][\x80-\xBF]/", "\u0800").Value);
Assert.Equal("\u0FFF", match(@"/\xE0[\xA0-\xBF][\x80-\xBF]/", "\u0FFF").Value);

Assert.Equal("\u1000", match(@"/[\xE1-\xEC][\x80-\xBF]{2}/", "\u1000").Value);
Assert.Equal("\uCFFF", match(@"/[\xE1-\xEC][\x80-\xBF]{2}/", "\uCFFF").Value);

Assert.Equal("\uD000", match(@"/\xED[\x80-\x9F][\x80-\xBF]/", "\uD000").Value);
Assert.Equal("\uD7FF", match(@"/\xED[\x80-\x9F][\x80-\xBF]/", "\uD7FF").Value);

Assert.Equal("\uE000", match(@"/[\xEE-\xEF][\x80-\xBF]{2}/", "\uE000").Value);
Assert.Equal("\uFFFF", match(@"/[\xEE-\xEF][\x80-\xBF]{2}/", "\uFFFF").Value);
}

[Fact]
public void TestUtf8Ranges2()
{
string czechSentence = "Příliš žluťoučký kůň úpěl ďábelské ódy";

Assert.Equal(czechSentence, match(@"/([\x00-\x7F]|[\xC2-\xDF][\x80-\xBF])*/", czechSentence).Value);
}
}
}

0 comments on commit 25d7e15

Please # to comment.