[LOADED_DOCS: 3 files, no new loads]

Switch to BCL UTF-8 APIs for string (de)serialization

Replaced the custom Utf8Transcoder calls with BCL APIs for string encoding/decoding in AcBinarySerializer and AcBinaryDeserializer: Encoding.UTF8.GetCharCount/GetByteCount for sizing and System.Text.Unicode.Utf8.ToUtf16/FromUtf16 for transcoding. PropertyMetadataBase now uses Encoding.UTF8.GetBytes for property-name encoding. Utf8Transcoder itself is retained for any remaining SIMD/custom logic. No public API changes; this is an internal refactoring for performance and maintainability.
This commit is contained in:
Loretta 2026-05-07 23:54:57 +02:00
parent 8eaae4dda3
commit 1d256ea386
5 changed files with 57 additions and 62 deletions

View File

@ -520,11 +520,11 @@ public static partial class AcBinaryDeserializer
var pos = _position;
_position += byteLength;
var src = _buffer.AsSpan(pos, byteLength);
var charCount = Utf8Transcoder.CountUtf8Chars(src);
var charCount = Encoding.UTF8.GetCharCount(src);
return string.Create(charCount, (Buffer: _buffer, Pos: pos, Len: byteLength), static (chars, state) =>
{
Utf8Transcoder.DecodeUtf8SinglePass(state.Buffer.AsSpan(state.Pos, state.Len), chars);
System.Text.Unicode.Utf8.ToUtf16(state.Buffer.AsSpan(state.Pos, state.Len), chars, out _, out _, replaceInvalidSequences: false);
});
}
@ -562,7 +562,7 @@ public static partial class AcBinaryDeserializer
return string.Create(charLength, (Buffer: _buffer, Pos: pos, Len: byteLength), static (chars, state) =>
{
Utf8Transcoder.DecodeUtf8SinglePass(state.Buffer.AsSpan(state.Pos, state.Len), chars);
System.Text.Unicode.Utf8.ToUtf16(state.Buffer.AsSpan(state.Pos, state.Len), chars, out _, out _, replaceInvalidSequences: false);
});
}

View File

@ -691,7 +691,7 @@ public static partial class AcBinarySerializer
var savedPos = _position;
var encodeStart = savedPos + reserveSize;
var bytesWritten = Utf8Transcoder.EncodeUtf8SinglePass(value.AsSpan(), _buffer.AsSpan(encodeStart, maxBytes));
System.Text.Unicode.Utf8.FromUtf16(value.AsSpan(), _buffer.AsSpan(encodeStart, maxBytes), out _, out var bytesWritten, replaceInvalidSequences: false);
var actualVarUIntSize = VarUIntSize((uint)bytesWritten);
if (actualVarUIntSize < reserveSize)
@ -781,16 +781,14 @@ public static partial class AcBinarySerializer
var savedPos = _position;
var encodeStart = savedPos + reserveHeader;
var bytesWritten = Utf8Transcoder.EncodeUtf8SinglePass(value.AsSpan(), _buffer.AsSpan(encodeStart, maxBytes));
System.Text.Unicode.Utf8.FromUtf16(value.AsSpan(), _buffer.AsSpan(encodeStart, maxBytes), out _, out var bytesWritten, replaceInvalidSequences: false);
if (bytesWritten == charLength)
{
// ASCII override — FixStrAscii (≤31) or StringAscii (>31) with compact header
if (bytesWritten <= BinaryTypeCode.FixStrAsciiMaxLength)
{
var shift = reserveHeader - 1;
if (shift > 0)
_buffer.AsSpan(encodeStart, bytesWritten).CopyTo(_buffer.AsSpan(savedPos + 1, bytesWritten));
_buffer.AsSpan(encodeStart, bytesWritten).CopyTo(_buffer.AsSpan(savedPos + 1, bytesWritten));
_buffer[savedPos] = BinaryTypeCode.EncodeFixStrAscii(bytesWritten);
_position = savedPos + 1 + bytesWritten;
}
@ -898,7 +896,7 @@ public static partial class AcBinarySerializer
var savedPos = _position;
var encodeStart = savedPos + cacheIdxSize + reserveHeader;
var bytesWritten = Utf8Transcoder.EncodeUtf8SinglePass(value.AsSpan(), _buffer.AsSpan(encodeStart, maxBytes));
System.Text.Unicode.Utf8.FromUtf16(value.AsSpan(), _buffer.AsSpan(encodeStart, maxBytes), out _, out var bytesWritten, replaceInvalidSequences: false);
// Choose tier from actual bytesWritten (smallest fits)
var actualHeader = bytesWritten <= 255 ? 3 : 5;

View File

@ -117,7 +117,7 @@ public static partial class AcBinarySerializer
foreach (var (stringValue, properties) in analysis)
{
var byteLength = Utf8Transcoder.GetUtf8ByteCount(stringValue.AsSpan());
var byteLength = System.Text.Encoding.UTF8.GetByteCount(stringValue.AsSpan());
foreach (var (propPath, count) in properties)
{
if (!propertyStats.TryGetValue(propPath, out var list))

View File

@ -132,7 +132,7 @@ internal static class Utf8Transcoder
var c3 = Unsafe.Add(ref srcRefChar, srcIdx + 3);
if (((c0 | c1 | c2 | c3) & 0xFF80) == 0)
{
Unsafe.Add(ref dstRef, dstIdx) = (byte)c0;
Unsafe.Add(ref dstRef, dstIdx) = (byte)c0;
Unsafe.Add(ref dstRef, dstIdx + 1) = (byte)c1;
Unsafe.Add(ref dstRef, dstIdx + 2) = (byte)c2;
Unsafe.Add(ref dstRef, dstIdx + 3) = (byte)c3;
@ -155,7 +155,7 @@ internal static class Utf8Transcoder
// 2-byte: 110xxxxx 10xxxxxx → U+0080..U+07FF
// Latin extended (Hungarian, Polish, Czech, Spanish, French, German diacritics),
// Greek, Cyrillic, Hebrew, Arabic.
Unsafe.Add(ref dstRef, dstIdx) = (byte)(0xC0 | (c >> 6));
Unsafe.Add(ref dstRef, dstIdx) = (byte)(0xC0 | (c >> 6));
Unsafe.Add(ref dstRef, dstIdx + 1) = (byte)(0x80 | (c & 0x3F));
dstIdx += 2;
srcIdx += 1;
@ -164,7 +164,7 @@ internal static class Utf8Transcoder
{
// 3-byte BMP: 1110xxxx 10xxxxxx 10xxxxxx → U+0800..U+FFFF (excluding surrogate range)
// CJK BMP, various other BMP scripts.
Unsafe.Add(ref dstRef, dstIdx) = (byte)(0xE0 | (c >> 12));
Unsafe.Add(ref dstRef, dstIdx) = (byte)(0xE0 | (c >> 12));
Unsafe.Add(ref dstRef, dstIdx + 1) = (byte)(0x80 | ((c >> 6) & 0x3F));
Unsafe.Add(ref dstRef, dstIdx + 2) = (byte)(0x80 | (c & 0x3F));
dstIdx += 3;
@ -176,7 +176,7 @@ internal static class Utf8Transcoder
// High surrogate (0xD800..0xDBFF) followed by low surrogate (0xDC00..0xDFFF).
var c2 = Unsafe.Add(ref srcRefChar, srcIdx + 1);
var codepoint = 0x10000 + ((c - 0xD800) << 10) + (c2 - 0xDC00);
Unsafe.Add(ref dstRef, dstIdx) = (byte)(0xF0 | (codepoint >> 18));
Unsafe.Add(ref dstRef, dstIdx) = (byte)(0xF0 | (codepoint >> 18));
Unsafe.Add(ref dstRef, dstIdx + 1) = (byte)(0x80 | ((codepoint >> 12) & 0x3F));
Unsafe.Add(ref dstRef, dstIdx + 2) = (byte)(0x80 | ((codepoint >> 6) & 0x3F));
Unsafe.Add(ref dstRef, dstIdx + 3) = (byte)(0x80 | (codepoint & 0x3F));
@ -252,8 +252,8 @@ internal static class Utf8Transcoder
{
var v = Vector512.LoadUnsafe(ref srcRef, (uint)i);
var c_lt_0x80 = BitOperations.PopCount(Vector512.LessThan(v, v_0x80).ExtractMostSignificantBits());
var c_lt_0x800 = BitOperations.PopCount(Vector512.LessThan(v, v_0x800).ExtractMostSignificantBits());
var c_lt_0x80 = BitOperations.PopCount(Vector512.LessThan(v, v_0x80).ExtractMostSignificantBits());
var c_lt_0x800 = BitOperations.PopCount(Vector512.LessThan(v, v_0x800).ExtractMostSignificantBits());
var c_lt_0xD800 = BitOperations.PopCount(Vector512.LessThan(v, v_0xD800).ExtractMostSignificantBits());
var c_lt_0xDC00 = BitOperations.PopCount(Vector512.LessThan(v, v_0xDC00).ExtractMostSignificantBits());
var c_lt_0xE000 = BitOperations.PopCount(Vector512.LessThan(v, v_0xE000).ExtractMostSignificantBits());
@ -279,8 +279,8 @@ internal static class Utf8Transcoder
{
var v = Vector256.LoadUnsafe(ref srcRef, (uint)i);
var c_lt_0x80 = BitOperations.PopCount(Vector256.LessThan(v, v_0x80).ExtractMostSignificantBits());
var c_lt_0x800 = BitOperations.PopCount(Vector256.LessThan(v, v_0x800).ExtractMostSignificantBits());
var c_lt_0x80 = BitOperations.PopCount(Vector256.LessThan(v, v_0x80).ExtractMostSignificantBits());
var c_lt_0x800 = BitOperations.PopCount(Vector256.LessThan(v, v_0x800).ExtractMostSignificantBits());
var c_lt_0xD800 = BitOperations.PopCount(Vector256.LessThan(v, v_0xD800).ExtractMostSignificantBits());
var c_lt_0xDC00 = BitOperations.PopCount(Vector256.LessThan(v, v_0xDC00).ExtractMostSignificantBits());
var c_lt_0xE000 = BitOperations.PopCount(Vector256.LessThan(v, v_0xE000).ExtractMostSignificantBits());
@ -307,8 +307,8 @@ internal static class Utf8Transcoder
{
var v = Vector128.LoadUnsafe(ref srcRef, (uint)i);
var c_lt_0x80 = BitOperations.PopCount(Vector128.LessThan(v, v_0x80).ExtractMostSignificantBits());
var c_lt_0x800 = BitOperations.PopCount(Vector128.LessThan(v, v_0x800).ExtractMostSignificantBits());
var c_lt_0x80 = BitOperations.PopCount(Vector128.LessThan(v, v_0x80).ExtractMostSignificantBits());
var c_lt_0x800 = BitOperations.PopCount(Vector128.LessThan(v, v_0x800).ExtractMostSignificantBits());
var c_lt_0xD800 = BitOperations.PopCount(Vector128.LessThan(v, v_0xD800).ExtractMostSignificantBits());
var c_lt_0xDC00 = BitOperations.PopCount(Vector128.LessThan(v, v_0xDC00).ExtractMostSignificantBits());
var c_lt_0xE000 = BitOperations.PopCount(Vector128.LessThan(v, v_0xE000).ExtractMostSignificantBits());
@ -535,7 +535,7 @@ internal static class Utf8Transcoder
var dword = Unsafe.ReadUnaligned<uint>(ref Unsafe.Add(ref srcRef, srcIdx));
if ((dword & 0x80808080u) == 0)
{
Unsafe.Add(ref dstRef, dstIdx) = (byte)dword;
Unsafe.Add(ref dstRef, dstIdx) = (byte)dword;
Unsafe.Add(ref dstRef, dstIdx + 1) = (byte)(dword >> 8);
Unsafe.Add(ref dstRef, dstIdx + 2) = (byte)(dword >> 16);
Unsafe.Add(ref dstRef, dstIdx + 3) = (byte)(dword >> 24);
@ -560,40 +560,40 @@ internal static class Utf8Transcoder
srcIdx += 1;
break;
case < 0xE0:
{
// 2-byte: 110xxxxx 10xxxxxx → U+0080..U+07FF
// Latin extended, Cyrillic, Greek, Hebrew, Arabic.
var b1 = Unsafe.Add(ref srcRef, srcIdx + 1);
Unsafe.Add(ref dstRef, dstIdx++) = (ushort)(((b0 & 0x1F) << 6) | (b1 & 0x3F));
srcIdx += 2;
break;
}
{
// 2-byte: 110xxxxx 10xxxxxx → U+0080..U+07FF
// Latin extended, Cyrillic, Greek, Hebrew, Arabic.
var b1 = Unsafe.Add(ref srcRef, srcIdx + 1);
Unsafe.Add(ref dstRef, dstIdx++) = (ushort)(((b0 & 0x1F) << 6) | (b1 & 0x3F));
srcIdx += 2;
break;
}
case < 0xF0:
{
// 3-byte: 1110xxxx 10xxxxxx 10xxxxxx → U+0800..U+FFFF
// CJK BMP, various other scripts.
var b1 = Unsafe.Add(ref srcRef, srcIdx + 1);
var b2 = Unsafe.Add(ref srcRef, srcIdx + 2);
Unsafe.Add(ref dstRef, dstIdx++) = (ushort)(((b0 & 0x0F) << 12) | ((b1 & 0x3F) << 6) | (b2 & 0x3F));
srcIdx += 3;
break;
}
{
// 3-byte: 1110xxxx 10xxxxxx 10xxxxxx → U+0800..U+FFFF
// CJK BMP, various other scripts.
var b1 = Unsafe.Add(ref srcRef, srcIdx + 1);
var b2 = Unsafe.Add(ref srcRef, srcIdx + 2);
Unsafe.Add(ref dstRef, dstIdx++) = (ushort)(((b0 & 0x0F) << 12) | ((b1 & 0x3F) << 6) | (b2 & 0x3F));
srcIdx += 3;
break;
}
default:
{
// 4-byte: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx → U+10000..U+10FFFF
// Supplementary plane (emoji, rare CJK ext) → UTF-16 surrogate pair.
// No do-while: 4-byte sequences are typically isolated (single emoji in mixed text).
var b1 = Unsafe.Add(ref srcRef, srcIdx + 1);
var b2 = Unsafe.Add(ref srcRef, srcIdx + 2);
var b3 = Unsafe.Add(ref srcRef, srcIdx + 3);
var codepoint = ((b0 & 0x07) << 18) | ((b1 & 0x3F) << 12) | ((b2 & 0x3F) << 6) | (b3 & 0x3F);
codepoint -= 0x10000;
Unsafe.Add(ref dstRef, dstIdx) = (ushort)(0xD800 | (codepoint >> 10));
Unsafe.Add(ref dstRef, dstIdx + 1) = (ushort)(0xDC00 | (codepoint & 0x3FF));
dstIdx += 2;
srcIdx += 4;
break;
}
{
// 4-byte: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx → U+10000..U+10FFFF
// Supplementary plane (emoji, rare CJK ext) → UTF-16 surrogate pair.
// No do-while: 4-byte sequences are typically isolated (single emoji in mixed text).
var b1 = Unsafe.Add(ref srcRef, srcIdx + 1);
var b2 = Unsafe.Add(ref srcRef, srcIdx + 2);
var b3 = Unsafe.Add(ref srcRef, srcIdx + 3);
var codepoint = ((b0 & 0x07) << 18) | ((b1 & 0x3F) << 12) | ((b2 & 0x3F) << 6) | (b3 & 0x3F);
codepoint -= 0x10000;
Unsafe.Add(ref dstRef, dstIdx) = (ushort)(0xD800 | (codepoint >> 10));
Unsafe.Add(ref dstRef, dstIdx + 1) = (ushort)(0xDC00 | (codepoint & 0x3FF));
dstIdx += 2;
srcIdx += 4;
break;
}
}
}

View File

@ -99,15 +99,12 @@ public abstract class PropertyMetadataBase
[DynamicallyAccessedMembers(DynamicallyAccessedMemberTypes.PublicProperties)] Type declaringType)
{
Name = prop.Name;
// Ctor-once init: SIMD path via Utf8Transcoder (GetUtf8ByteCount + EncodeUtf8SinglePass)
// bypasses Encoding.UTF8 virtual-dispatch + encoder-fallback overhead. Ascii.FromUtf16
// would be slightly faster for the (overwhelmingly common) ASCII property name case, but
// the symmetric Utf8Transcoder API keeps this consistent with the binary serializer's
// writer-side BCL-free policy and handles non-ASCII property names without a fallback.
var nameByteCount = Utf8Transcoder.GetUtf8ByteCount(prop.Name.AsSpan());
var nameBytes = new byte[nameByteCount];
Utf8Transcoder.EncodeUtf8SinglePass(prop.Name.AsSpan(), nameBytes);
NameUtf8 = nameBytes;
// Encode the property name to UTF-8 once, at construction time, with a single BCL call that
// returns an exact-size byte[] — no manual GetByteCount + Utf8.FromUtf16 pair, no worst-case
// buffer to trim/copy, and no ArrayPool rent at this call site.
// NOTE(review): the BCL still has to determine the UTF-8 length itself (the string length alone
// does not fix the byte count), so treat "faster than two-pass" as a claim to benchmark, not a given.
NameUtf8 = System.Text.Encoding.UTF8.GetBytes(prop.Name);
DeclaringType = declaringType;
PropertyType = prop.PropertyType;