From 289e9b1a5c0c4506668e8edb587ac9eef5e1016b Mon Sep 17 00:00:00 2001 From: anatawa12 Date: Thu, 1 Feb 2024 16:01:36 +0900 Subject: [PATCH] chore: throw exception when encounter unpaired surrogate instead of replace with U+FFFD --- LiteDB/Document/BsonValue.cs | 4 ++-- LiteDB/Engine/Disk/Serializer/BufferReader.cs | 8 ++++---- LiteDB/Engine/Disk/Serializer/BufferWriter.cs | 12 ++++++------ LiteDB/Engine/Structures/CollectionIndex.cs | 4 ++-- LiteDB/Utils/Encoding.cs | 12 ++++++++++++ 5 files changed, 26 insertions(+), 14 deletions(-) create mode 100644 LiteDB/Utils/Encoding.cs diff --git a/LiteDB/Document/BsonValue.cs b/LiteDB/Document/BsonValue.cs index ad4261b1a..572e549a9 100644 --- a/LiteDB/Document/BsonValue.cs +++ b/LiteDB/Document/BsonValue.cs @@ -648,7 +648,7 @@ internal virtual int GetBytesCount(bool recalc) case BsonType.Double: return 8; case BsonType.Decimal: return 16; - case BsonType.String: return Encoding.UTF8.GetByteCount(this.AsString); + case BsonType.String: return StringEncoding.UTF8.GetByteCount(this.AsString); case BsonType.Binary: return this.AsBinary.Length; case BsonType.ObjectId: return 12; @@ -674,7 +674,7 @@ protected int GetBytesCountElement(string key, BsonValue value) return 1 + // element type - Encoding.UTF8.GetByteCount(key) + // CString + StringEncoding.UTF8.GetByteCount(key) + // CString 1 + // CString \0 value.GetBytesCount(true) + (variant ? 5 : 0); // bytes.Length + 0x?? 
diff --git a/LiteDB/Engine/Disk/Serializer/BufferReader.cs b/LiteDB/Engine/Disk/Serializer/BufferReader.cs index 255ca4cb8..0118763bb 100644 --- a/LiteDB/Engine/Disk/Serializer/BufferReader.cs +++ b/LiteDB/Engine/Disk/Serializer/BufferReader.cs @@ -154,7 +154,7 @@ public string ReadString(int count) // if fits in current segment, use inner array - otherwise copy from multiples segments if (_currentPosition + count <= _current.Count) { - value = Encoding.UTF8.GetString(_current.Array, _current.Offset + _currentPosition, count); + value = StringEncoding.UTF8.GetString(_current.Array, _current.Offset + _currentPosition, count); this.MoveForward(count); } @@ -165,7 +165,7 @@ public string ReadString(int count) this.Read(buffer, 0, count); - value = Encoding.UTF8.GetString(buffer, 0, count); + value = StringEncoding.UTF8.GetString(buffer, 0, count); BufferPool.Return(buffer); } @@ -204,7 +204,7 @@ public string ReadCString() this.MoveForward(1); // +1 to '\0' - return Encoding.UTF8.GetString(mem.ToArray()); + return StringEncoding.UTF8.GetString(mem.ToArray()); } } } @@ -220,7 +220,7 @@ private bool TryReadCStringCurrentSegment(out string value) { if (_current[pos] == 0x00) { - value = Encoding.UTF8.GetString(_current.Array, _current.Offset + _currentPosition, count); + value = StringEncoding.UTF8.GetString(_current.Array, _current.Offset + _currentPosition, count); this.MoveForward(count + 1); // +1 means '\0' return true; } diff --git a/LiteDB/Engine/Disk/Serializer/BufferWriter.cs b/LiteDB/Engine/Disk/Serializer/BufferWriter.cs index 093aa66b2..caa43bf3e 100644 --- a/LiteDB/Engine/Disk/Serializer/BufferWriter.cs +++ b/LiteDB/Engine/Disk/Serializer/BufferWriter.cs @@ -152,13 +152,13 @@ public void WriteCString(string value) { if (value.IndexOf('\0') > -1) throw LiteException.InvalidNullCharInString(); - var bytesCount = Encoding.UTF8.GetByteCount(value); + var bytesCount = StringEncoding.UTF8.GetByteCount(value); var available = _current.Count - _currentPosition; // 
avaiable in current segment // can write direct in current segment (use < because need +1 \0) if (bytesCount < available) { - Encoding.UTF8.GetBytes(value, 0, value.Length, _current.Array, _current.Offset + _currentPosition); + StringEncoding.UTF8.GetBytes(value, 0, value.Length, _current.Array, _current.Offset + _currentPosition); _current[_currentPosition + bytesCount] = 0x00; @@ -168,7 +168,7 @@ public void WriteCString(string value) { var buffer = BufferPool.Rent(bytesCount); - Encoding.UTF8.GetBytes(value, 0, value.Length, buffer, 0); + StringEncoding.UTF8.GetBytes(value, 0, value.Length, buffer, 0); this.Write(buffer, 0, bytesCount); @@ -186,7 +186,7 @@ public void WriteCString(string value) /// public void WriteString(string value, bool specs) { - var count = Encoding.UTF8.GetByteCount(value); + var count = StringEncoding.UTF8.GetByteCount(value); if (specs) { @@ -195,7 +195,7 @@ public void WriteString(string value, bool specs) if (count <= _current.Count - _currentPosition) { - Encoding.UTF8.GetBytes(value, 0, value.Length, _current.Array, _current.Offset + _currentPosition); + StringEncoding.UTF8.GetBytes(value, 0, value.Length, _current.Array, _current.Offset + _currentPosition); this.MoveForward(count); } @@ -204,7 +204,7 @@ public void WriteString(string value, bool specs) // rent a buffer to be re-usable var buffer = BufferPool.Rent(count); - Encoding.UTF8.GetBytes(value, 0, value.Length, buffer, 0); + StringEncoding.UTF8.GetBytes(value, 0, value.Length, buffer, 0); this.Write(buffer, 0, count); diff --git a/LiteDB/Engine/Structures/CollectionIndex.cs b/LiteDB/Engine/Structures/CollectionIndex.cs index fce53b032..6ea3a0316 100644 --- a/LiteDB/Engine/Structures/CollectionIndex.cs +++ b/LiteDB/Engine/Structures/CollectionIndex.cs @@ -121,8 +121,8 @@ public static int GetLength(string name, string expr) return 1 + // Slot 1 + // IndexType - Encoding.UTF8.GetByteCount(name) + 1 + // Name + \0 - Encoding.UTF8.GetByteCount(expr) + 1 + // Expression + \0 + 
StringEncoding.UTF8.GetByteCount(name) + 1 + // Name + \0 + StringEncoding.UTF8.GetByteCount(expr) + 1 + // Expression + \0 1 + // Unique PageAddress.SIZE + // Head PageAddress.SIZE + // Tail
diff --git a/LiteDB/Utils/Encoding.cs b/LiteDB/Utils/Encoding.cs
new file mode 100644
index 000000000..ce28bea2f
--- /dev/null
+++ b/LiteDB/Utils/Encoding.cs
@@ -0,0 +1,12 @@
+using System.Text;
+
+namespace LiteDB
+{
+    internal class StringEncoding
+    {
+        // Encoding.UTF8 silently replaces an unpaired surrogate with U+FFFD, which is not acceptable for a database.
+        // UTF8Encoding(false, true) emits no BOM and throws an exception when an unpaired surrogate is found.
+        // readonly: BsonValue, BufferReader, BufferWriter and CollectionIndex share this instance; it must not be reassigned.
+        public static readonly Encoding UTF8 = new UTF8Encoding(false, true);
+    }
+}