From 4b2c26ed4360215a93a6d3e62eed4bcc9cbb8e23 Mon Sep 17 00:00:00 2001 From: Brant Burnett Date: Mon, 6 Feb 2023 22:59:35 -0500 Subject: [PATCH] Switch decompression input and scratch to use ref byte (#50) Motivation ---------- This is a step towards code that doesn't require pinning, which can help with GC when compression/decompression is run a lot. The GC will be able to move memory even in the middle of a compression or decompression run and update the ref pointers. Modifications ------------- Switch all usages of input, inputEnd, and scratch to be `ref byte` within the decompression logic. Redesign so that inputEnd points to the last byte in the input rather than just past the end of the input. Redesign RefillTag to return information rather than directly manipulating the pointer parameters, since you can't pass a ref to a ref (except in .NET 7). Results ------- Smaller code size across all platforms, and a slight gain on modern frameworks. .NET 4.8 regresses a bit; hopefully we can improve that with tuning later.
BenchmarkDotNet=v0.13.4, OS=Windows 11 (10.0.22000.1455/21H2) Intel Core i7-10850H CPU 2.70GHz, 1 CPU, 12 logical and 6 physical cores .NET SDK=7.0.102 [Host] : .NET 7.0.2 (7.0.222.60605), X64 RyuJIT AVX2 MediumRun-.NET 6.0 : .NET 6.0.13 (6.0.1322.58009), X64 RyuJIT AVX2 MediumRun-.NET 7.0 : .NET 7.0.2 (7.0.222.60605), X64 RyuJIT AVX2 MediumRun-.NET Framework 4.8 : .NET Framework 4.8 (4.8.4515.0), X64 RyuJIT VectorSize=256 IterationCount=15 LaunchCount=2 WarmupCount=10 | Method | Job | Runtime | Mean | Error | StdDev | Median | Ratio | RatioSD | Code Size | |-------- |----------------------------- |------------------- |----------:|---------:|---------:|----------:|------:|--------:|----------:| | Pointer | MediumRun-.NET 6.0 | .NET 6.0 | 103.15 us | 1.167 us | 1.674 us | 102.30 us | 1.00 | 0.00 | 6,034 B | | Ref | MediumRun-.NET 6.0 | .NET 6.0 | 102.52 us | 0.386 us | 0.516 us | 102.45 us | 0.99 | 0.02 | 5,784 B | | | | | | | | | | | | | Pointer | MediumRun-.NET 7.0 | .NET 7.0 | 91.47 us | 0.698 us | 1.045 us | 91.13 us | 1.00 | 0.00 | 5,197 B | | Ref | MediumRun-.NET 7.0 | .NET 7.0 | 89.62 us | 1.262 us | 1.888 us | 90.80 us | 0.98 | 0.03 | 4,609 B | | | | | | | | | | | | | Pointer | MediumRun-.NET Framework 4.8 | .NET Framework 4.8 | 104.69 us | 0.138 us | 0.203 us | 104.77 us | 1.00 | 0.00 | 6,213 B | | Ref | MediumRun-.NET Framework 4.8 | .NET Framework 4.8 | 116.17 us | 3.613 us | 5.408 us | 117.59 us | 1.11 | 0.05 | 5,846 B | --- Snappier/Internal/Constants.cs | 2 +- Snappier/Internal/Helpers.cs | 12 + Snappier/Internal/SnappyDecompressor.cs | 416 +++++++++++++----------- 3 files changed, 234 insertions(+), 196 deletions(-) diff --git a/Snappier/Internal/Constants.cs b/Snappier/Internal/Constants.cs index 5e3966b..944fed6 100644 --- a/Snappier/Internal/Constants.cs +++ b/Snappier/Internal/Constants.cs @@ -16,7 +16,7 @@ public enum ChunkType : byte public const byte Copy2ByteOffset = 2; public const byte Copy4ByteOffset = 3; - public const long 
MaximumTagLength = 5; + public const int MaximumTagLength = 5; public const int BlockLog = 16; public const long BlockSize = 1 << BlockLog; diff --git a/Snappier/Internal/Helpers.cs b/Snappier/Internal/Helpers.cs index 82a3915..1411db2 100644 --- a/Snappier/Internal/Helpers.cs +++ b/Snappier/Internal/Helpers.cs @@ -105,6 +105,18 @@ public static unsafe uint UnsafeReadUInt32(void* ptr) return result; } + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static uint UnsafeReadUInt32(ref byte ptr) + { + var result = Unsafe.ReadUnaligned(ref ptr); + if (!BitConverter.IsLittleEndian) + { + result = BinaryPrimitives.ReverseEndianness(result); + } + + return result; + } + [MethodImpl(MethodImplOptions.AggressiveInlining)] public static unsafe ulong UnsafeReadUInt64(void* ptr) { diff --git a/Snappier/Internal/SnappyDecompressor.cs b/Snappier/Internal/SnappyDecompressor.cs index ee00b24..73e0806 100644 --- a/Snappier/Internal/SnappyDecompressor.cs +++ b/Snappier/Internal/SnappyDecompressor.cs @@ -185,236 +185,256 @@ internal unsafe void DecompressAllTags(ReadOnlySpan inputSpan) unchecked { - fixed (byte* inputStart = inputSpan) + ref byte input = ref Unsafe.AsRef(in inputSpan[0]); + + // The reference Snappy implementation uses inputEnd as a pointer one byte past the end of the buffer. + // However, this is not safe when using ref locals. The ref must point to somewhere within the array + // so that GC can adjust the ref if the memory is moved. 
+ ref byte inputEnd = ref Unsafe.Add(ref input, inputSpan.Length - 1); + + // Track the point in the input before which input is guaranteed to have at least Constants.MaxTagLength bytes left + ref byte inputLimitMinMaxTagLength = ref Unsafe.Subtract(ref inputEnd, Math.Min(inputSpan.Length, Constants.MaximumTagLength - 1) - 1); + + fixed (byte* buffer = _lookbackBuffer.Span) { - byte* inputEnd = inputStart + inputSpan.Length; - byte* input = inputStart; + byte* bufferEnd = buffer + _lookbackBuffer.Length; + byte* op = buffer + _lookbackPosition; - // Track the point in the input before which input is guaranteed to have at least Constants.MaxTagLength bytes left - byte* inputLimitMinMaxTagLength = inputEnd - Math.Min(inputEnd - input, Constants.MaximumTagLength - 1); + // Get a reference to the first byte in the scratch buffer, we'll reuse this so that we don't repeat range checks every time + ref byte scratch = ref _scratch[0]; - fixed (byte* buffer = _lookbackBuffer.Span) + if (_scratchLength > 0) { - byte* bufferEnd = buffer + _lookbackBuffer.Length; - byte* op = buffer + _lookbackPosition; + // Have partial tag remaining from a previous decompress run + // Get the combined tag in the scratch buffer, then run through + // special case processing that gets the tag from the scratch buffer + // and any literal data from the _input buffer + + // scratch will be the scratch buffer with only the tag if true is returned + (bool sufficientData, uint inputUsed) = RefillTagFromScratch(ref input, ref inputEnd, ref scratch); + input = ref Unsafe.Add(ref input, inputUsed); + if (!sufficientData) + { + return; + } - fixed (byte* scratchStart = _scratch) + // No more scratch for next cycle, we have a full buffer we're about to use + _scratchLength = 0; + + byte c = scratch; + scratch = ref Unsafe.Add(ref scratch, 1); + + if ((c & 0x03) == Constants.Literal) { - byte* scratch = scratchStart; + nint literalLength = (c >> 2) + 1; + if (literalLength >= 61) + { + // Long literal. 
+ nint literalLengthLength = literalLength - 60; + uint literalLengthTemp = Helpers.UnsafeReadUInt32(ref scratch); + + literalLength = (nint) Helpers.ExtractLowBytes(literalLengthTemp, + (int) literalLengthLength) + 1; + } - if (_scratchLength > 0) + nint inputRemaining = Unsafe.ByteOffset(ref input, ref inputEnd) + 1; + if (inputRemaining < literalLength) { - // Have partial tag remaining from a previous decompress run - // Get the combined tag in the scratch buffer, then run through - // special case processing that gets the tag from the scratch buffer - // and any literal data from the _input buffer - - // scratch will be the scratch buffer with only the tag if true is returned - (bool sufficientData, uint inputUsed) = RefillTagFromScratch(ref Unsafe.AsRef(input), - ref Unsafe.AsRef(inputEnd), ref Unsafe.AsRef(scratch)); - input += inputUsed; - if (!sufficientData) - { - return; - } + Append(ref Unsafe.AsRef(op), ref Unsafe.AsRef(bufferEnd), in input, inputRemaining); + op += inputRemaining; + _remainingLiteral = (int) (literalLength - inputRemaining); + _lookbackPosition = (int)(op - buffer); + return; + } + else + { + Append(ref Unsafe.AsRef(op), ref Unsafe.AsRef(bufferEnd), in input, literalLength); + op += literalLength; + input = ref Unsafe.Add(ref input, literalLength); + } + } + else if ((c & 3) == Constants.Copy4ByteOffset) + { + uint copyOffset = Helpers.UnsafeReadUInt32(ref scratch); - // No more scratch for next cycle, we have a full buffer we're about to use - _scratchLength = 0; + nint length = (c >> 2) + 1; - byte c = scratch[0]; - scratch++; + AppendFromSelf(ref Unsafe.AsRef(op), ref Unsafe.AsRef(buffer), ref Unsafe.AsRef(bufferEnd), copyOffset, length); + op += length; + } + else + { + ushort entry = charTable[c]; + uint data = Helpers.UnsafeReadUInt32(ref scratch); - if ((c & 0x03) == Constants.Literal) - { - nint literalLength = (c >> 2) + 1; - if (literalLength >= 61) - { - // Long literal. 
- nint literalLengthLength = literalLength - 60; - uint literalLengthTemp = Helpers.UnsafeReadUInt32(scratch); + uint trailer = Helpers.ExtractLowBytes(data, c & 3); + nint length = entry & 0xff; - literalLength = (nint) Helpers.ExtractLowBytes(literalLengthTemp, - (int) literalLengthLength) + 1; - } + // copy_offset/256 is encoded in bits 8..10. By just fetching + // those bits, we get copy_offset (since the bit-field starts at + // bit 8). + uint copyOffset = (entry & 0x700u) + trailer; - nint inputRemaining = (nint)(inputEnd - input); - if (inputRemaining < literalLength) - { - Append(ref Unsafe.AsRef(op), ref Unsafe.AsRef(bufferEnd), in Unsafe.AsRef(input), inputRemaining); - op += inputRemaining; - _remainingLiteral = (int) (literalLength - inputRemaining); - _lookbackPosition = (int)(op - buffer); - return; - } - else - { - Append(ref Unsafe.AsRef(op), ref Unsafe.AsRef(bufferEnd), in Unsafe.AsRef(input), literalLength); - op += literalLength; - input += literalLength; - } - } - else if ((c & 3) == Constants.Copy4ByteOffset) - { - uint copyOffset = Helpers.UnsafeReadUInt32(scratch); + AppendFromSelf(ref Unsafe.AsRef(op), ref Unsafe.AsRef(buffer), ref Unsafe.AsRef(bufferEnd), copyOffset, length); + op += length; + } - nint length = (c >> 2) + 1; + // Make sure scratch is reset + scratch = ref _scratch[0]; + } - AppendFromSelf(ref Unsafe.AsRef(op), ref Unsafe.AsRef(buffer), ref Unsafe.AsRef(bufferEnd), copyOffset, length); - op += length; - } - else - { - ushort entry = charTable[c]; - uint data = Helpers.UnsafeReadUInt32(scratch); + if (!Unsafe.IsAddressLessThan(ref input, ref inputLimitMinMaxTagLength)) + { + uint newScratchLength = RefillTag(ref input, ref inputEnd, ref scratch); + if (newScratchLength == uint.MaxValue) + { + goto exit; + } - uint trailer = Helpers.ExtractLowBytes(data, c & 3); - nint length = entry & 0xff; + if (newScratchLength > 0) + { + // Data has been moved to the scratch buffer + input = ref scratch; + inputEnd = ref Unsafe.Add(ref 
input, newScratchLength - 1); + inputLimitMinMaxTagLength = ref Unsafe.Subtract(ref inputEnd, + Math.Min(newScratchLength, Constants.MaximumTagLength - 1) - 1); + } + } - // copy_offset/256 is encoded in bits 8..10. By just fetching - // those bits, we get copy_offset (since the bit-field starts at - // bit 8). - uint copyOffset = (entry & 0x700u) + trailer; + uint preload = Helpers.UnsafeReadUInt32(ref input); - AppendFromSelf(ref Unsafe.AsRef(op), ref Unsafe.AsRef(buffer), ref Unsafe.AsRef(bufferEnd), copyOffset, length); - op += length; - } + while (true) + { + byte c = (byte) preload; + input = ref Unsafe.Add(ref input, 1); - // Make sure scratch is reset - scratch = scratchStart; + if ((c & 0x03) == Constants.Literal) + { + nint literalLength = unchecked((c >> 2) + 1); + + if (TryFastAppend(ref Unsafe.AsRef(op), ref Unsafe.AsRef(bufferEnd), in input, Unsafe.ByteOffset(ref input, ref inputEnd) + 1, literalLength)) + { + Debug.Assert(literalLength < 61); + op += literalLength; + input = ref Unsafe.Add(ref input, literalLength); + // NOTE: There is no RefillTag here, as TryFastAppend() + // will not return true unless there's already at least five spare + // bytes in addition to the literal. + preload = Helpers.UnsafeReadUInt32(ref input); + continue; } - if (input >= inputLimitMinMaxTagLength) + if (literalLength >= 61) { - if (!RefillTag(ref input, ref inputEnd, scratch)) - { - goto exit; - } + // Long literal. 
+ nint literalLengthLength = literalLength - 60; + uint literalLengthTemp = Helpers.UnsafeReadUInt32(ref input); - inputLimitMinMaxTagLength = inputEnd - Math.Min(inputEnd - input, - Constants.MaximumTagLength - 1); - } + literalLength = (nint) Helpers.ExtractLowBytes(literalLengthTemp, + (int) literalLengthLength) + 1; - uint preload = Helpers.UnsafeReadUInt32(input); + input = ref Unsafe.Add(ref input, literalLengthLength); + } - while (true) + nint inputRemaining = Unsafe.ByteOffset(ref input, ref inputEnd) + 1; + if (inputRemaining < literalLength) + { + Append(ref Unsafe.AsRef(op), ref Unsafe.AsRef(bufferEnd), in input, inputRemaining); + op += inputRemaining; + _remainingLiteral = (int) (literalLength - inputRemaining); + goto exit; + } + else { - byte c = (byte) preload; - input++; + Append(ref Unsafe.AsRef(op), ref Unsafe.AsRef(bufferEnd), in input, literalLength); + op += literalLength; + input = ref Unsafe.Add(ref input, literalLength); - if ((c & 0x03) == Constants.Literal) + if (!Unsafe.IsAddressLessThan(ref input, ref inputLimitMinMaxTagLength)) { - nint literalLength = unchecked((c >> 2) + 1); - - if (TryFastAppend(ref Unsafe.AsRef(op), ref Unsafe.AsRef(bufferEnd), in Unsafe.AsRef(input), (nint)(inputEnd - input), literalLength)) + uint newScratchLength = RefillTag(ref input, ref inputEnd, ref scratch); + if (newScratchLength == uint.MaxValue) { - Debug.Assert(literalLength < 61); - op += literalLength; - input += literalLength; - // NOTE: There is no RefillTag here, as TryFastAppend() - // will not return true unless there's already at least five spare - // bytes in addition to the literal. - preload = Helpers.UnsafeReadUInt32(input); - continue; + goto exit; } - if (literalLength >= 61) + if (newScratchLength > 0) { - // Long literal. 
- nint literalLengthLength = literalLength - 60; - uint literalLengthTemp = Helpers.UnsafeReadUInt32(input); - - literalLength = (nint) Helpers.ExtractLowBytes(literalLengthTemp, - (int) literalLengthLength) + 1; - - input += literalLengthLength; - } + // Data has been moved to the scratch buffer + input = ref scratch; + inputEnd = ref Unsafe.Add(ref input, newScratchLength - 1); + inputLimitMinMaxTagLength = ref Unsafe.Subtract(ref inputEnd, + Math.Min(newScratchLength, Constants.MaximumTagLength - 1) - 1); - nint inputRemaining = (nint)(inputEnd - input); - if (inputRemaining < literalLength) - { - Append(ref Unsafe.AsRef(op), ref Unsafe.AsRef(bufferEnd), in Unsafe.AsRef(input), inputRemaining); - op += inputRemaining; - _remainingLiteral = (int) (literalLength - inputRemaining); - goto exit; - } - else - { - Append(ref Unsafe.AsRef(op), ref Unsafe.AsRef(bufferEnd), in Unsafe.AsRef(input), literalLength); - op += literalLength; - input += literalLength; - - if (input >= inputLimitMinMaxTagLength) - { - if (!RefillTag(ref input, ref inputEnd, scratch)) - { - goto exit; - } - - inputLimitMinMaxTagLength = inputEnd - Math.Min(inputEnd - input, - Constants.MaximumTagLength - 1); - } - - preload = Helpers.UnsafeReadUInt32(input); } } - else - { - if ((c & 3) == Constants.Copy4ByteOffset) - { - uint copyOffset = Helpers.UnsafeReadUInt32(input); - input += 4; - nint length = (c >> 2) + 1; - AppendFromSelf(ref Unsafe.AsRef(op), ref Unsafe.AsRef(buffer), ref Unsafe.AsRef(bufferEnd), copyOffset, length); - op += length; - } - else - { - ushort entry = charTable[c]; + preload = Helpers.UnsafeReadUInt32(ref input); + } + } + else + { + if ((c & 3) == Constants.Copy4ByteOffset) + { + uint copyOffset = Helpers.UnsafeReadUInt32(ref input); + input = ref Unsafe.Add(ref input, 4); - // We don't use BitConverter to read because we might be reading past the end of the span - // But we know that's safe because we'll be doing it in _scratch with extra data on the end. 
- // This reduces this step by several operations - preload = Helpers.UnsafeReadUInt32(input); + nint length = (c >> 2) + 1; + AppendFromSelf(ref Unsafe.AsRef(op), ref Unsafe.AsRef(buffer), ref Unsafe.AsRef(bufferEnd), copyOffset, length); + op += length; + } + else + { + ushort entry = charTable[c]; - uint trailer = Helpers.ExtractLowBytes(preload, c & 3); - nint length = entry & 0xff; + // We don't use BitConverter to read because we might be reading past the end of the span + // But we know that's safe because we'll be doing it in _scratch with extra data on the end. + // This reduces this step by several operations + preload = Helpers.UnsafeReadUInt32(ref input); - // copy_offset/256 is encoded in bits 8..10. By just fetching - // those bits, we get copy_offset (since the bit-field starts at - // bit 8). - uint copyOffset = (entry & 0x700u) + trailer; + uint trailer = Helpers.ExtractLowBytes(preload, c & 3); + nint length = entry & 0xff; - AppendFromSelf(ref Unsafe.AsRef(op), ref Unsafe.AsRef(buffer), ref Unsafe.AsRef(bufferEnd), copyOffset, length); - op += length; + // copy_offset/256 is encoded in bits 8..10. By just fetching + // those bits, we get copy_offset (since the bit-field starts at + // bit 8). + uint copyOffset = (entry & 0x700u) + trailer; - input += c & 3; + AppendFromSelf(ref Unsafe.AsRef(op), ref Unsafe.AsRef(buffer), ref Unsafe.AsRef(bufferEnd), copyOffset, length); + op += length; - // By using the result of the previous load we reduce the critical - // dependency chain of ip to 4 cycles. - preload >>= (c & 3) * 8; - if (input < inputLimitMinMaxTagLength) continue; - } + input = ref Unsafe.Add(ref input, c & 3); - if (input >= inputLimitMinMaxTagLength) - { - if (!RefillTag(ref input, ref inputEnd, scratch)) - { - goto exit; - } + // By using the result of the previous load we reduce the critical + // dependency chain of ip to 4 cycles. 
+ preload >>= (c & 3) * 8; + if (Unsafe.IsAddressLessThan(ref input, ref inputLimitMinMaxTagLength)) continue; + } - inputLimitMinMaxTagLength = inputEnd - Math.Min(inputEnd - input, - Constants.MaximumTagLength - 1); - } + if (!Unsafe.IsAddressLessThan(ref input, ref inputLimitMinMaxTagLength)) + { + uint newScratchLength = RefillTag(ref input, ref inputEnd, ref scratch); + if (newScratchLength == uint.MaxValue) + { + goto exit; + } - preload = Helpers.UnsafeReadUInt32(input); + if (newScratchLength > 0) + { + // Data has been moved to the scratch buffer + input = ref scratch; + inputEnd = ref Unsafe.Add(ref input, newScratchLength - 1); + inputLimitMinMaxTagLength = ref Unsafe.Subtract(ref inputEnd, + Math.Min(newScratchLength, Constants.MaximumTagLength - 1) - 1); } } - exit: ; // All input data is processed - _lookbackPosition = (int)(op - buffer); + preload = Helpers.UnsafeReadUInt32(ref input); } } + + exit: ; // All input data is processed + _lookbackPosition = (int)(op - buffer); } } } @@ -423,7 +443,7 @@ internal unsafe void DecompressAllTags(ReadOnlySpan inputSpan) { Debug.Assert(_scratchLength > 0); - if (!Unsafe.IsAddressLessThan(ref input, ref inputEnd)) + if (Unsafe.IsAddressGreaterThan(ref input, ref inputEnd)) { return (false, 0); } @@ -432,7 +452,7 @@ internal unsafe void DecompressAllTags(ReadOnlySpan inputSpan) uint entry = Constants.CharTable[scratch]; uint needed = (entry >> 11) + 1; // +1 byte for 'c' - uint toCopy = Math.Min((uint)Unsafe.ByteOffset(ref input, ref inputEnd), needed - _scratchLength); + uint toCopy = Math.Min((uint)Unsafe.ByteOffset(ref input, ref inputEnd) + 1, needed - _scratchLength); Unsafe.CopyBlockUnaligned(ref Unsafe.Add(ref scratch, _scratchLength), ref input, toCopy); _scratchLength += toCopy; @@ -446,40 +466,46 @@ internal unsafe void DecompressAllTags(ReadOnlySpan inputSpan) return (true, toCopy); } - private unsafe bool RefillTag(ref byte* input, ref byte* inputEnd, byte* scratch) + // Returns 0 if there is 
sufficient data available in the input buffer for the next tag AND enough extra padding to + // safely read preload without overrunning the buffer. + // + // Returns uint.MaxValue if there is insufficient data and the decompression should stop until more data is available. + // In this case any dangling unused bytes will be moved to scratch and _scratchLength for the next iteration. + // + // Returns a small number if we have enough data for this tag but not enough to safely load preload without a buffer + // overrun. In this case, further reads should be from scratch with a length up to the returned number. Scratch will + // always have some extra bytes on the end so we don't risk buffer overruns. + private uint RefillTag(ref byte input, ref byte inputEnd, ref byte scratch) { - if (input >= inputEnd) + if (Unsafe.IsAddressGreaterThan(ref input, ref inputEnd)) { - return false; + return uint.MaxValue; } // Read the tag character - byte c = *input; - uint entry = Constants.CharTable[c]; + uint entry = Constants.CharTable[input]; uint needed = (entry >> 11) + 1; // +1 byte for 'c' - uint inputLength = unchecked((uint)(inputEnd - input)); + uint inputLength = (uint)Unsafe.ByteOffset(ref input, ref inputEnd) + 1; if (inputLength < needed) { // Data is insufficient, copy to scratch - Unsafe.CopyBlockUnaligned(scratch, input, inputLength); + Unsafe.CopyBlockUnaligned(ref scratch, ref input, inputLength); _scratchLength = inputLength; - input = inputEnd; - return false; + return uint.MaxValue; } if (inputLength < Constants.MaximumTagLength) { // Have enough bytes, but copy to scratch so that we do not // read past end of input - Unsafe.CopyBlockUnaligned(scratch, input, inputLength); + Unsafe.CopyBlockUnaligned(ref scratch, ref input, inputLength); - input = scratch; - inputEnd = input + inputLength; + return inputLength; } - return true; + return 0; } #region Loopback Writer