Skip to content

Commit ae163da

Browse files
GrabYourPitchforks authored and dotnet-bot committed
Add Utf8String skeleton (dotnet/coreclr#23209)
Utf8String is an experimental type that is string-like (heap-allocated, immutable, variable-length, null-terminated) but whose inner representation is UTF-8, not UTF-16. This is a skeleton implementation of the basic API shape. The ecosystem of APIs has not yet been built around it. All Utf8String-related code is currently surrounded by ifdefs to allow easy identification and removal from release branches. Signed-off-by: dotnet-bot <dotnet-bot@microsoft.com>
1 parent e9e902b commit ae163da

File tree

9 files changed

+202
-19
lines changed

9 files changed

+202
-19
lines changed

netcore/System.Private.CoreLib/shared/System.Private.CoreLib.Shared.projitems

+1
Original file line numberDiff line numberDiff line change
@@ -801,6 +801,7 @@
801801
<Compile Include="$(MSBuildThisFileDirectory)System\Text\UTF8Encoding.cs" />
802802
<Compile Include="$(MSBuildThisFileDirectory)System\Text\ValueStringBuilder.cs" />
803803
<Compile Include="$(MSBuildThisFileDirectory)System\Text\Unicode\Utf8.cs" />
804+
<Compile Include="$(MSBuildThisFileDirectory)System\Text\Unicode\Utf8Utility.cs" />
804805
<Compile Include="$(MSBuildThisFileDirectory)System\TimeSpan.cs" />
805806
<Compile Include="$(MSBuildThisFileDirectory)System\ThreadAttributes.cs" />
806807
<Compile Include="$(MSBuildThisFileDirectory)System\Threading\AbandonedMutexException.cs" />

netcore/System.Private.CoreLib/shared/System/Memory.cs

+31-1
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
using System.Diagnostics;
77
using System.Runtime.CompilerServices;
88
using System.Runtime.InteropServices;
9+
using System.Text;
910
using EditorBrowsableAttribute = System.ComponentModel.EditorBrowsableAttribute;
1011
using EditorBrowsableState = System.ComponentModel.EditorBrowsableState;
1112

@@ -164,7 +165,13 @@ internal Memory(object obj, int start, int length)
164165
// No validation performed in release builds; caller must provide any necessary validation.
165166

166167
// 'obj is T[]' below also handles things like int[] <-> uint[] being convertible
167-
Debug.Assert((obj == null) || (typeof(T) == typeof(char) && obj is string) || (obj is T[]) || (obj is MemoryManager<T>));
168+
Debug.Assert((obj == null)
169+
|| (typeof(T) == typeof(char) && obj is string)
170+
#if FEATURE_UTF8STRING
171+
|| ((typeof(T) == typeof(byte) || typeof(T) == typeof(Char8)) && obj is Utf8String)
172+
#endif // FEATURE_UTF8STRING
173+
|| (obj is T[])
174+
|| (obj is MemoryManager<T>));
168175

169176
_object = obj;
170177
_index = start;
@@ -212,6 +219,14 @@ public override string ToString()
212219
{
213220
return (_object is string str) ? str.Substring(_index, _length) : Span.ToString();
214221
}
222+
#if FEATURE_UTF8STRING
223+
else if (typeof(T) == typeof(Char8))
224+
{
225+
// TODO_UTF8STRING: Call into optimized transcoding routine when it's available.
226+
Span<T> span = Span;
227+
return Encoding.UTF8.GetString(new ReadOnlySpan<byte>(ref Unsafe.As<T, byte>(ref MemoryMarshal.GetReference(span)), span.Length));
228+
}
229+
#endif // FEATURE_UTF8STRING
215230
return string.Format("System.Memory<{0}>[{1}]", typeof(T).Name, _length);
216231
}
217232

@@ -317,6 +332,13 @@ public unsafe Span<T> Span
317332
refToReturn = ref Unsafe.As<char, T>(ref Unsafe.As<string>(tmpObject).GetRawStringData());
318333
lengthOfUnderlyingSpan = Unsafe.As<string>(tmpObject).Length;
319334
}
335+
#if FEATURE_UTF8STRING
336+
else if ((typeof(T) == typeof(byte) || typeof(T) == typeof(Char8)) && tmpObject.GetType() == typeof(Utf8String))
337+
{
338+
refToReturn = ref Unsafe.As<byte, T>(ref Unsafe.As<Utf8String>(tmpObject).DangerousGetMutableReference());
339+
lengthOfUnderlyingSpan = Unsafe.As<Utf8String>(tmpObject).Length;
340+
}
341+
#endif // FEATURE_UTF8STRING
320342
else if (RuntimeHelpers.ObjectHasComponentSize(tmpObject))
321343
{
322344
// We know the object is not null, it's not a string, and it is variable-length. The only
@@ -427,6 +449,14 @@ public unsafe MemoryHandle Pin()
427449
ref char stringData = ref Unsafe.Add(ref s.GetRawStringData(), _index);
428450
return new MemoryHandle(Unsafe.AsPointer(ref stringData), handle);
429451
}
452+
#if FEATURE_UTF8STRING
453+
else if ((typeof(T) == typeof(byte) || typeof(T) == typeof(Char8)) && tmpObject is Utf8String utf8String)
454+
{
455+
GCHandle handle = GCHandle.Alloc(tmpObject, GCHandleType.Pinned);
456+
ref byte stringData = ref utf8String.DangerousGetMutableReference(_index);
457+
return new MemoryHandle(Unsafe.AsPointer(ref stringData), handle);
458+
}
459+
#endif // FEATURE_UTF8STRING
430460
else if (RuntimeHelpers.ObjectHasComponentSize(tmpObject))
431461
{
432462
// 'tmpObject is T[]' below also handles things like int[] <-> uint[] being convertible

netcore/System.Private.CoreLib/shared/System/ReadOnlyMemory.cs

+31-1
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
using System.Diagnostics;
77
using System.Runtime.CompilerServices;
88
using System.Runtime.InteropServices;
9+
using System.Text;
910
using EditorBrowsableAttribute = System.ComponentModel.EditorBrowsableAttribute;
1011
using EditorBrowsableState = System.ComponentModel.EditorBrowsableState;
1112

@@ -99,7 +100,13 @@ internal ReadOnlyMemory(object obj, int start, int length)
99100
// No validation performed in release builds; caller must provide any necessary validation.
100101

101102
// 'obj is T[]' below also handles things like int[] <-> uint[] being convertible
102-
Debug.Assert((obj == null) || (typeof(T) == typeof(char) && obj is string) || (obj is T[]) || (obj is MemoryManager<T>));
103+
Debug.Assert((obj == null)
104+
|| (typeof(T) == typeof(char) && obj is string)
105+
#if FEATURE_UTF8STRING
106+
|| ((typeof(T) == typeof(byte) || typeof(T) == typeof(Char8)) && obj is Utf8String)
107+
#endif // FEATURE_UTF8STRING
108+
|| (obj is T[])
109+
|| (obj is MemoryManager<T>));
103110

104111
_object = obj;
105112
_index = start;
@@ -141,6 +148,14 @@ public override string ToString()
141148
{
142149
return (_object is string str) ? str.Substring(_index, _length) : Span.ToString();
143150
}
151+
#if FEATURE_UTF8STRING
152+
else if (typeof(T) == typeof(Char8))
153+
{
154+
// TODO_UTF8STRING: Call into optimized transcoding routine when it's available.
155+
ReadOnlySpan<T> span = Span;
156+
return Encoding.UTF8.GetString(new ReadOnlySpan<byte>(ref Unsafe.As<T, byte>(ref MemoryMarshal.GetReference(span)), span.Length));
157+
}
158+
#endif // FEATURE_UTF8STRING
144159
return string.Format("System.ReadOnlyMemory<{0}>[{1}]", typeof(T).Name, _length);
145160
}
146161

@@ -239,6 +254,13 @@ public unsafe ReadOnlySpan<T> Span
239254
refToReturn = ref Unsafe.As<char, T>(ref Unsafe.As<string>(tmpObject).GetRawStringData());
240255
lengthOfUnderlyingSpan = Unsafe.As<string>(tmpObject).Length;
241256
}
257+
#if FEATURE_UTF8STRING
258+
else if ((typeof(T) == typeof(byte) || typeof(T) == typeof(Char8)) && tmpObject.GetType() == typeof(Utf8String))
259+
{
260+
refToReturn = ref Unsafe.As<byte, T>(ref Unsafe.As<Utf8String>(tmpObject).DangerousGetMutableReference());
261+
lengthOfUnderlyingSpan = Unsafe.As<Utf8String>(tmpObject).Length;
262+
}
263+
#endif // FEATURE_UTF8STRING
242264
else if (RuntimeHelpers.ObjectHasComponentSize(tmpObject))
243265
{
244266
// We know the object is not null, it's not a string, and it is variable-length. The only
@@ -342,6 +364,14 @@ public unsafe MemoryHandle Pin()
342364
ref char stringData = ref Unsafe.Add(ref s.GetRawStringData(), _index);
343365
return new MemoryHandle(Unsafe.AsPointer(ref stringData), handle);
344366
}
367+
#if FEATURE_UTF8STRING
368+
else if ((typeof(T) == typeof(byte) || typeof(T) == typeof(Char8)) && tmpObject is Utf8String utf8String)
369+
{
370+
GCHandle handle = GCHandle.Alloc(tmpObject, GCHandleType.Pinned);
371+
ref byte stringData = ref utf8String.DangerousGetMutableReference(_index);
372+
return new MemoryHandle(Unsafe.AsPointer(ref stringData), handle);
373+
}
374+
#endif // FEATURE_UTF8STRING
345375
else if (RuntimeHelpers.ObjectHasComponentSize(tmpObject))
346376
{
347377
// 'tmpObject is T[]' below also handles things like int[] <-> uint[] being convertible

netcore/System.Private.CoreLib/shared/System/ReadOnlySpan.Fast.cs

+9-5
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
using System.Diagnostics;
66
using System.Runtime.CompilerServices;
77
using System.Runtime.Versioning;
8+
using System.Text;
89
using EditorBrowsableAttribute = System.ComponentModel.EditorBrowsableAttribute;
910
using EditorBrowsableState = System.ComponentModel.EditorBrowsableState;
1011
using Internal.Runtime.CompilerServices;
@@ -240,12 +241,15 @@ public override string ToString()
240241
{
241242
if (typeof(T) == typeof(char))
242243
{
243-
unsafe
244-
{
245-
fixed (char* src = &Unsafe.As<T, char>(ref _pointer.Value))
246-
return new string(src, 0, _length);
247-
}
244+
return new string(new ReadOnlySpan<char>(ref Unsafe.As<T, char>(ref _pointer.Value), _length));
248245
}
246+
#if FEATURE_UTF8STRING
247+
else if (typeof(T) == typeof(Char8))
248+
{
249+
// TODO_UTF8STRING: Call into optimized transcoding routine when it's available.
250+
return Encoding.UTF8.GetString(new ReadOnlySpan<byte>(ref Unsafe.As<T, byte>(ref _pointer.Value), _length));
251+
}
252+
#endif // FEATURE_UTF8STRING
249253
return string.Format("System.ReadOnlySpan<{0}>[{1}]", typeof(T).Name, _length);
250254
}
251255

netcore/System.Private.CoreLib/shared/System/Runtime/InteropServices/MemoryMarshal.cs

+6-1
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,12 @@ public static bool TryGetArray<T>(ReadOnlyMemory<T> memory, out ArraySegment<T>
2828
// As an optimization, we skip the "is string?" check below if typeof(T) is not char,
2929
// as Memory<T> / ROM<T> can't possibly contain a string instance in this case.
3030

31-
if (obj != null && (typeof(T) != typeof(char) || obj.GetType() != typeof(string)))
31+
if (obj != null && !(
32+
(typeof(T) == typeof(char) && obj.GetType() == typeof(string))
33+
#if FEATURE_UTF8STRING
34+
|| ((typeof(T) == typeof(byte) || typeof(T) == typeof(Char8)) && obj.GetType() == typeof(Utf8String))
35+
#endif // FEATURE_UTF8STRING
36+
))
3237
{
3338
if (RuntimeHelpers.ObjectHasComponentSize(obj))
3439
{

netcore/System.Private.CoreLib/shared/System/Span.Fast.cs

+9-5
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
using System.Diagnostics;
66
using System.Runtime.CompilerServices;
77
using System.Runtime.Versioning;
8+
using System.Text;
89
using EditorBrowsableAttribute = System.ComponentModel.EditorBrowsableAttribute;
910
using EditorBrowsableState = System.ComponentModel.EditorBrowsableState;
1011
using Internal.Runtime.CompilerServices;
@@ -319,12 +320,15 @@ public override string ToString()
319320
{
320321
if (typeof(T) == typeof(char))
321322
{
322-
unsafe
323-
{
324-
fixed (char* src = &Unsafe.As<T, char>(ref _pointer.Value))
325-
return new string(src, 0, _length);
326-
}
323+
return new string(new ReadOnlySpan<char>(ref Unsafe.As<T, char>(ref _pointer.Value), _length));
324+
}
325+
#if FEATURE_UTF8STRING
326+
else if (typeof(T) == typeof(Char8))
327+
{
328+
// TODO_UTF8STRING: Call into optimized transcoding routine when it's available.
329+
return Encoding.UTF8.GetString(new ReadOnlySpan<byte>(ref Unsafe.As<T, byte>(ref _pointer.Value), _length));
327330
}
331+
#endif // FEATURE_UTF8STRING
328332
return string.Format("System.Span<{0}>[{1}]", typeof(T).Name, _length);
329333
}
330334

netcore/System.Private.CoreLib/shared/System/String.cs

+8-5
Original file line numberDiff line numberDiff line change
@@ -24,9 +24,13 @@ namespace System
2424
[System.Runtime.CompilerServices.TypeForwardedFrom("mscorlib, Version=4.0.0.0, Culture=neutral, PublicKeyToken=b77a5c561934e089")]
2525
public sealed partial class String : IComparable, IEnumerable, IConvertible, IEnumerable<char>, IComparable<string>, IEquatable<string>, ICloneable
2626
{
27-
// String constructors
28-
// These are special. The implementation methods for these have a different signature from the
29-
// declared constructors.
27+
/*
28+
* CONSTRUCTORS
29+
*
30+
* Defining a new constructor for string-like types (like String) requires changes both
31+
* to the managed code below and to the native VM code. See the comment at the top of
32+
* src/vm/ecall.cpp for instructions on how to add new overloads.
33+
*/
3034

3135
[MethodImplAttribute(MethodImplOptions.InternalCall)]
3236
public extern String(char[] value);
@@ -335,8 +339,7 @@ private unsafe string Ctor(ReadOnlySpan<char> value)
335339
return Empty;
336340

337341
string result = FastAllocateString(value.Length);
338-
fixed (char* dest = &result._firstChar, src = &MemoryMarshal.GetReference(value))
339-
wstrcpy(dest, src, value.Length);
342+
Buffer.Memmove(ref result._firstChar, ref MemoryMarshal.GetReference(value), (uint)value.Length);
340343
return result;
341344
}
342345

Original file line numberDiff line numberDiff line change
@@ -0,0 +1,106 @@
1+
// Licensed to the .NET Foundation under one or more agreements.
2+
// The .NET Foundation licenses this file to you under the MIT license.
3+
// See the LICENSE file in the project root for more information.
4+
5+
using System.Buffers;
6+
using System.Diagnostics;
7+
using System.IO;
8+
using System.Runtime.CompilerServices;
9+
10+
namespace System.Text.Unicode
11+
{
12+
internal static class Utf8Utility
13+
{
14+
/// <summary>
15+
/// The maximum number of bytes that can result from UTF-8 transcoding
16+
/// any Unicode scalar value.
17+
/// </summary>
18+
internal const int MaxBytesPerScalar = 4;
19+
20+
/// <summary>
21+
/// The UTF-8 representation of <see cref="UnicodeUtility.ReplacementChar"/>.
22+
/// </summary>
23+
private static ReadOnlySpan<byte> ReplacementCharSequence => new byte[] { 0xEF, 0xBF, 0xBD };
24+
25+
/// <summary>
26+
/// Returns the byte index in <paramref name="utf8Data"/> where the first invalid UTF-8 sequence begins,
27+
/// or -1 if the buffer contains no invalid sequences. Also outs the <paramref name="isAscii"/> parameter
28+
/// stating whether all data observed (up to the first invalid sequence or the end of the buffer, whichever
29+
/// comes first) is ASCII.
30+
/// </summary>
31+
[MethodImpl(MethodImplOptions.AggressiveInlining)]
32+
public static int GetIndexOfFirstInvalidUtf8Sequence(ReadOnlySpan<byte> utf8Data, out bool isAscii)
33+
{
34+
// TODO_UTF8STRING: Replace this with the faster drop-in replacement when it's available (coreclr #21948).
35+
36+
bool tempIsAscii = true;
37+
int originalDataLength = utf8Data.Length;
38+
39+
while (!utf8Data.IsEmpty)
40+
{
41+
if (Rune.DecodeFromUtf8(utf8Data, out Rune result, out int bytesConsumed) != OperationStatus.Done)
42+
{
43+
break;
44+
}
45+
46+
tempIsAscii &= result.IsAscii;
47+
utf8Data = utf8Data.Slice(bytesConsumed);
48+
}
49+
50+
isAscii = tempIsAscii;
51+
return (utf8Data.IsEmpty) ? -1 : (originalDataLength - utf8Data.Length);
52+
}
53+
54+
#if FEATURE_UTF8STRING
55+
/// <summary>
56+
/// Returns <paramref name="value"/> if it is null or contains only well-formed UTF-8 data;
57+
/// otherwise allocates a new <see cref="Utf8String"/> instance containing the same data as
58+
/// <paramref name="value"/> but where all invalid UTF-8 sequences have been replaced
59+
/// with U+FFFD.
60+
/// </summary>
61+
public static Utf8String ValidateAndFixupUtf8String(Utf8String value)
62+
{
63+
if (Utf8String.IsNullOrEmpty(value))
64+
{
65+
return value;
66+
}
67+
68+
ReadOnlySpan<byte> valueAsBytes = value.AsBytes();
69+
70+
int idxOfFirstInvalidData = GetIndexOfFirstInvalidUtf8Sequence(valueAsBytes, out _);
71+
if (idxOfFirstInvalidData < 0)
72+
{
73+
return value;
74+
}
75+
76+
// TODO_UTF8STRING: Replace this with the faster implementation once it's available.
77+
// (The faster implementation is in the dev/utf8string_bak branch currently.)
78+
79+
MemoryStream memStream = new MemoryStream();
80+
memStream.Write(valueAsBytes.Slice(0, idxOfFirstInvalidData));
81+
82+
valueAsBytes = valueAsBytes.Slice(idxOfFirstInvalidData);
83+
do
84+
{
85+
if (Rune.DecodeFromUtf8(valueAsBytes, out _, out int bytesConsumed) == OperationStatus.Done)
86+
{
87+
// Valid scalar value - copy data as-is to MemoryStream
88+
memStream.Write(valueAsBytes.Slice(0, bytesConsumed));
89+
}
90+
else
91+
{
92+
// Invalid scalar value - copy U+FFFD to MemoryStream
93+
memStream.Write(ReplacementCharSequence);
94+
}
95+
96+
valueAsBytes = valueAsBytes.Slice(bytesConsumed);
97+
} while (!valueAsBytes.IsEmpty);
98+
99+
bool success = memStream.TryGetBuffer(out ArraySegment<byte> memStreamBuffer);
100+
Debug.Assert(success, "Couldn't get underlying MemoryStream buffer.");
101+
102+
return Utf8String.DangerousCreateWithoutValidation(memStreamBuffer, assumeWellFormed: true);
103+
}
104+
#endif // FEATURE_UTF8STRING
105+
}
106+
}

netcore/System.Private.CoreLib/shared/System/Text/UnicodeUtility.cs

+1-1
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ internal static class UnicodeUtility
1111
/// <summary>
1212
/// The Unicode replacement character U+FFFD.
1313
/// </summary>
14-
public const uint ReplacementChar = 0xFFFDU;
14+
public const uint ReplacementChar = 0xFFFD;
1515

1616
/// <summary>
1717
/// Returns the Unicode plane (0 through 16, inclusive) which contains this code point.

0 commit comments

Comments
 (0)