[LOADED_DOCS: NONE]
Fix Utf8Transcoder AVX2 bug, add SIMD boundary tests - Added Hungarian language preference rule to copilot-instructions.md. - Fixed AVX2 SIMD bug in Utf8Transcoder: corrected upper-half store offset from Vector128<ushort>.Count to Vector256<ushort>.Count, preventing memory overlap on 32+ byte ASCII runs. - Added Utf8TranscoderTests covering all SIMD/scalar paths, with boundary and round-trip tests for ASCII, Hungarian, CJK, emoji, and mixed content, ensuring correctness and BCL compatibility.
This commit is contained in:
parent
8f3bbeacc1
commit
304a4a7bdb
|
|
@ -166,3 +166,5 @@ Full doctrine: `../docs/ARCHITECTURE.md#framework-vs-consumer-boundary`
|
||||||
19. **Documentation layering** — write `.md` documentation at the **defining layer** (where the code lives). Higher-layer `.md` files reference the base docs (e.g. `see AyCode.Services/docs/SIGNALR/README.md`) and document only project-specific overrides or extensions. Never duplicate base-layer descriptions in consumer-level docs.
|
19. **Documentation layering** — write `.md` documentation at the **defining layer** (where the code lives). Higher-layer `.md` files reference the base docs (e.g. `see AyCode.Services/docs/SIGNALR/README.md`) and document only project-specific overrides or extensions. Never duplicate base-layer descriptions in consumer-level docs.
|
||||||
20. **Do not re-read .md files** already in your context window. They only change if you modify them yourself (new content is already in context) or if the developer tells you they changed — in that case re-read them once.
|
20. **Do not re-read .md files** already in your context window. They only change if you modify them yourself (new content is already in context) or if the developer tells you they changed — in that case re-read them once.
|
||||||
21. **Folder navigation** — start from the root `README.md` for solution-level navigation. When you need to understand a folder's contents or find a type/class, read the `README.md` in that folder first — it indexes the local files and sub-folders. Follow this before grepping or reading source files.
|
21. **Folder navigation** — start from the root `README.md` for solution-level navigation. When you need to understand a folder's contents or find a type/class, read the `README.md` in that folder first — it indexes the local files and sub-folders. Follow this before grepping or reading source files.
|
||||||
|
|
||||||
|
22. **Language preference** — communicate in Hungarian, as requested by the user.
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,310 @@
|
||||||
|
using System.Text;
|
||||||
|
using AyCode.Core.Serializers.Binaries;
|
||||||
|
|
||||||
|
namespace AyCode.Core.Tests.Serialization;
|
||||||
|
|
||||||
|
/// <summary>
|
||||||
|
/// Round-trip and correctness tests for <see cref="Utf8Transcoder"/>'s SIMD path tiers.
|
||||||
|
///
|
||||||
|
/// <para><b>Critical coverage</b>: each path tier (Vector512 / Vector256 / Vector128 / scalar) has
|
||||||
|
/// minimum-size and boundary-crossing inputs to ensure the path is actually exercised. The
|
||||||
|
/// Hungarian benchmark in <c>BenchmarkTestDataProvider</c> bails out of the AVX2 ASCII-prefix
|
||||||
|
/// path early (first non-ASCII byte at position 4-5), so it cannot validate the long-ASCII path
|
||||||
|
/// on its own. These tests fill that gap.</para>
|
||||||
|
/// </summary>
|
||||||
|
[TestClass]
|
||||||
|
public class Utf8TranscoderTests
|
||||||
|
{
|
||||||
|
private static readonly Encoding Utf8 = new UTF8Encoding(encoderShouldEmitUTF8Identifier: false, throwOnInvalidBytes: true);
|
||||||
|
|
||||||
|
// ──────────────────────────────────────────────────────────────────────
|
||||||
|
// CountUtf8Chars — content classes
|
||||||
|
// ──────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
[TestMethod]
|
||||||
|
public void CountUtf8Chars_AsciiOnly_MatchesStringLength()
|
||||||
|
{
|
||||||
|
var s = "Hello, World! This is plain ASCII.";
|
||||||
|
var bytes = Utf8.GetBytes(s);
|
||||||
|
Assert.AreEqual(s.Length, Utf8Transcoder.CountUtf8Chars(bytes));
|
||||||
|
}
|
||||||
|
|
||||||
|
[TestMethod]
|
||||||
|
public void CountUtf8Chars_HungarianMixed_MatchesStringLength()
|
||||||
|
{
|
||||||
|
var s = "árvíztűrő tükörfúrógép";
|
||||||
|
var bytes = Utf8.GetBytes(s);
|
||||||
|
Assert.AreEqual(s.Length, Utf8Transcoder.CountUtf8Chars(bytes));
|
||||||
|
}
|
||||||
|
|
||||||
|
[TestMethod]
|
||||||
|
public void CountUtf8Chars_CjkBmp_MatchesStringLength()
|
||||||
|
{
|
||||||
|
var s = "你好世界 こんにちは 안녕하세요";
|
||||||
|
var bytes = Utf8.GetBytes(s);
|
||||||
|
Assert.AreEqual(s.Length, Utf8Transcoder.CountUtf8Chars(bytes));
|
||||||
|
}
|
||||||
|
|
||||||
|
[TestMethod]
|
||||||
|
public void CountUtf8Chars_SupplementaryPlane_CountsSurrogatePairs()
|
||||||
|
{
|
||||||
|
// Each emoji is a supplementary-plane code point (> U+FFFF): 4-byte UTF-8 → 2-char surrogate pair in UTF-16
|
||||||
|
var s = "😀😁😂🎉"; // 4 codepoints, but 8 chars in UTF-16
|
||||||
|
var bytes = Utf8.GetBytes(s);
|
||||||
|
Assert.AreEqual(s.Length, Utf8Transcoder.CountUtf8Chars(bytes));
|
||||||
|
Assert.AreEqual(8, s.Length, "Sanity check: each emoji is a surrogate pair");
|
||||||
|
}
|
||||||
|
|
||||||
|
[TestMethod]
|
||||||
|
public void CountUtf8Chars_MixedAllClasses_MatchesStringLength()
|
||||||
|
{
|
||||||
|
var s = "ASCII Magyar:árvíz CJK:你好 Emoji:😀";
|
||||||
|
var bytes = Utf8.GetBytes(s);
|
||||||
|
Assert.AreEqual(s.Length, Utf8Transcoder.CountUtf8Chars(bytes));
|
||||||
|
}
|
||||||
|
|
||||||
|
[TestMethod]
|
||||||
|
public void CountUtf8Chars_Empty_ReturnsZero()
|
||||||
|
{
|
||||||
|
Assert.AreEqual(0, Utf8Transcoder.CountUtf8Chars(ReadOnlySpan<byte>.Empty));
|
||||||
|
}
|
||||||
|
|
||||||
|
// ──────────────────────────────────────────────────────────────────────
|
||||||
|
// EncodeUtf8SinglePass + DecodeUtf8SinglePass — round-trip per content class
|
||||||
|
// ──────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
[TestMethod]
|
||||||
|
public void EncodeDecode_AsciiShort_RoundTrip()
|
||||||
|
{
|
||||||
|
AssertRoundTrip("Hello");
|
||||||
|
}
|
||||||
|
|
||||||
|
[TestMethod]
|
||||||
|
public void EncodeDecode_AsciiExactly31Bytes_RoundTrip()
|
||||||
|
{
|
||||||
|
// Boundary: exactly at the FixStr 31-byte limit, one byte below the Vector256 threshold (32)
|
||||||
|
AssertRoundTrip(new string('a', 31));
|
||||||
|
}
|
||||||
|
|
||||||
|
[TestMethod]
public void EncodeDecode_AsciiExactly32Bytes_RoundTrip()
{
    // Boundary: exactly Vector256<byte>.Count — Phase 1 AVX2 widen path triggers.
    // CRITICAL: this validates the Vector256.Widen upper-half store offset bug-fix.
    AssertRoundTrip(new string('a', 32));

    // A uniform fill cannot reveal a store that lands at the wrong offset, because every
    // position holds the same value. Use 32 distinct ASCII chars so that any misplaced or
    // overlapping store changes the decoded text.
    AssertRoundTrip("ABCDEFGHIJKLMNOPQRSTUVWXYZ012345", "distinct chars");
}
|
||||||
|
|
||||||
|
[TestMethod]
|
||||||
|
public void EncodeDecode_AsciiLong_64Bytes_RoundTrip()
|
||||||
|
{
|
||||||
|
// Boundary: Vector512 threshold for the encoder; 2× Vector256 iter for the decoder
|
||||||
|
AssertRoundTrip(new string('x', 64));
|
||||||
|
}
|
||||||
|
|
||||||
|
[TestMethod]
|
||||||
|
public void EncodeDecode_AsciiVeryLong_500Bytes_RoundTrip()
|
||||||
|
{
|
||||||
|
// Multi-iter SIMD widen on the decoder; AVX-512 path on capable hosts
|
||||||
|
AssertRoundTrip(new string('z', 500));
|
||||||
|
}
|
||||||
|
|
||||||
|
[TestMethod]
|
||||||
|
public void EncodeDecode_HungarianShort_RoundTrip()
|
||||||
|
{
|
||||||
|
AssertRoundTrip("Termék");
|
||||||
|
}
|
||||||
|
|
||||||
|
[TestMethod]
|
||||||
|
public void EncodeDecode_HungarianMedium_RoundTrip()
|
||||||
|
{
|
||||||
|
AssertRoundTrip("árvíztűrő tükörfúrógép");
|
||||||
|
}
|
||||||
|
|
||||||
|
[TestMethod]
|
||||||
|
public void EncodeDecode_HungarianLong_RoundTrip()
|
||||||
|
{
|
||||||
|
// Long enough to span multiple Vector128/256 iterations
|
||||||
|
AssertRoundTrip(string.Concat(Enumerable.Repeat("árvíztűrő tükörfúrógép ", 20)));
|
||||||
|
}
|
||||||
|
|
||||||
|
[TestMethod]
|
||||||
|
public void EncodeDecode_CjkBmp_RoundTrip()
|
||||||
|
{
|
||||||
|
AssertRoundTrip("你好世界 こんにちは 안녕하세요");
|
||||||
|
}
|
||||||
|
|
||||||
|
[TestMethod]
|
||||||
|
public void EncodeDecode_CjkBmpLong_RoundTrip()
|
||||||
|
{
|
||||||
|
AssertRoundTrip(string.Concat(Enumerable.Repeat("你好世界 ", 30)));
|
||||||
|
}
|
||||||
|
|
||||||
|
[TestMethod]
|
||||||
|
public void EncodeDecode_SupplementaryPlane_RoundTrip()
|
||||||
|
{
|
||||||
|
AssertRoundTrip("😀😁😂🎉🌟");
|
||||||
|
}
|
||||||
|
|
||||||
|
[TestMethod]
|
||||||
|
public void EncodeDecode_MixedAllClasses_RoundTrip()
|
||||||
|
{
|
||||||
|
AssertRoundTrip("Plain ASCII + Magyar (árvíztűrő) + CJK (你好世界) + Emoji (😀🎉)");
|
||||||
|
}
|
||||||
|
|
||||||
|
[TestMethod]
|
||||||
|
public void EncodeDecode_LongMixed_RoundTrip()
|
||||||
|
{
|
||||||
|
// Long mixed content forcing all SIMD tiers + scalar tail to engage
|
||||||
|
var sb = new StringBuilder();
|
||||||
|
for (var i = 0; i < 50; i++)
|
||||||
|
{
|
||||||
|
sb.Append("ASCII run-").Append(i).Append(" Magyar:árvíz CJK:你好 ");
|
||||||
|
}
|
||||||
|
AssertRoundTrip(sb.ToString());
|
||||||
|
}
|
||||||
|
|
||||||
|
[TestMethod]
|
||||||
|
public void EncodeDecode_BoundaryAsciiToHungarian_RoundTrip()
|
||||||
|
{
|
||||||
|
// ASCII prefix exactly at common boundaries, then non-ASCII switch
|
||||||
|
for (var asciiLen = 0; asciiLen <= 64; asciiLen++)
|
||||||
|
{
|
||||||
|
var s = new string('a', asciiLen) + "árvíz";
|
||||||
|
AssertRoundTrip(s, $"asciiLen={asciiLen}");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
[TestMethod]
|
||||||
|
public void EncodeDecode_BoundaryAsciiToCjk_RoundTrip()
|
||||||
|
{
|
||||||
|
// 3-byte sequence boundary stress
|
||||||
|
for (var asciiLen = 0; asciiLen <= 64; asciiLen++)
|
||||||
|
{
|
||||||
|
var s = new string('a', asciiLen) + "你好世界";
|
||||||
|
AssertRoundTrip(s, $"asciiLen={asciiLen}");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
[TestMethod]
|
||||||
|
public void EncodeDecode_BoundaryAsciiToEmoji_RoundTrip()
|
||||||
|
{
|
||||||
|
// 4-byte sequence boundary (surrogate pair in UTF-16)
|
||||||
|
for (var asciiLen = 0; asciiLen <= 64; asciiLen++)
|
||||||
|
{
|
||||||
|
var s = new string('a', asciiLen) + "😀";
|
||||||
|
AssertRoundTrip(s, $"asciiLen={asciiLen}");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
[TestMethod]
|
||||||
|
public void EncodeDecode_Empty_RoundTrip()
|
||||||
|
{
|
||||||
|
AssertRoundTrip(string.Empty);
|
||||||
|
}
|
||||||
|
|
||||||
|
[TestMethod]
|
||||||
|
public void EncodeDecode_SingleAsciiChar_RoundTrip()
|
||||||
|
{
|
||||||
|
AssertRoundTrip("X");
|
||||||
|
}
|
||||||
|
|
||||||
|
[TestMethod]
|
||||||
|
public void EncodeDecode_SingleHungarianChar_RoundTrip()
|
||||||
|
{
|
||||||
|
AssertRoundTrip("é");
|
||||||
|
}
|
||||||
|
|
||||||
|
[TestMethod]
|
||||||
|
public void EncodeDecode_SingleCjkChar_RoundTrip()
|
||||||
|
{
|
||||||
|
AssertRoundTrip("好");
|
||||||
|
}
|
||||||
|
|
||||||
|
[TestMethod]
|
||||||
|
public void EncodeDecode_SingleEmoji_RoundTrip()
|
||||||
|
{
|
||||||
|
AssertRoundTrip("😀");
|
||||||
|
}
|
||||||
|
|
||||||
|
// ──────────────────────────────────────────────────────────────────────
|
||||||
|
// Decoder-side cross-check: BCL Encoding.UTF8.GetString reference
|
||||||
|
// ──────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
[TestMethod]
|
||||||
|
public void DecodeUtf8SinglePass_MatchesBclGetString_Ascii()
|
||||||
|
{
|
||||||
|
AssertDecodeMatchesBcl("ASCII test string with spaces and digits 0123456789.");
|
||||||
|
}
|
||||||
|
|
||||||
|
[TestMethod]
public void DecodeUtf8SinglePass_MatchesBclGetString_LongAscii32Plus()
{
    // CRITICAL — exercises the Vector256 ASCII prefix widen path that had the offset bug.
    // Lengths cover exactly one / two 32-byte iterations, one byte past each, and longer runs.
    foreach (var length in new[] { 32, 33, 64, 65, 100, 256 })
    {
        // Uniform fill: detects positions the decoder never wrote.
        AssertDecodeMatchesBcl(new string('A', length));

        // Position-dependent fill: also detects data stored at the wrong offset, which a
        // uniform fill hides because every position holds the same value. The period (26)
        // is not a divisor of 8 or 16, so a shift by a vector half always changes the text.
        var patterned = string.Create(length, 0, static (chars, _) =>
        {
            for (var i = 0; i < chars.Length; i++)
            {
                chars[i] = (char)('A' + (i % 26));
            }
        });
        AssertDecodeMatchesBcl(patterned);
    }
}
|
||||||
|
|
||||||
|
[TestMethod]
|
||||||
|
public void DecodeUtf8SinglePass_MatchesBclGetString_Hungarian()
|
||||||
|
{
|
||||||
|
AssertDecodeMatchesBcl("árvíztűrő tükörfúrógép");
|
||||||
|
}
|
||||||
|
|
||||||
|
[TestMethod]
|
||||||
|
public void DecodeUtf8SinglePass_MatchesBclGetString_Mixed()
|
||||||
|
{
|
||||||
|
AssertDecodeMatchesBcl("Plain ASCII + Magyar (árvíz) + CJK (你好) + Emoji (😀)");
|
||||||
|
}
|
||||||
|
|
||||||
|
// ──────────────────────────────────────────────────────────────────────
|
||||||
|
// Helpers
|
||||||
|
// ──────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
/// <summary>
|
||||||
|
/// Verifies that EncodeUtf8SinglePass produces bytes identical to <see cref="Encoding.GetBytes(string)"/> on <see cref="Encoding.UTF8"/>,
|
||||||
|
/// and that DecodeUtf8SinglePass on those bytes reconstructs the original string exactly.
|
||||||
|
/// </summary>
|
||||||
|
private static void AssertRoundTrip(string original, string? context = null)
|
||||||
|
{
|
||||||
|
var ctx = context is null ? string.Empty : $" [{context}]";
|
||||||
|
|
||||||
|
// 1. Encoder produces bytes identical to BCL Encoding.UTF8
|
||||||
|
var dst = new byte[original.Length * 4]; // worst-case UTF-8
|
||||||
|
var bytesWritten = Utf8Transcoder.EncodeUtf8SinglePass(original.AsSpan(), dst.AsSpan());
|
||||||
|
var encoded = dst.AsSpan(0, bytesWritten).ToArray();
|
||||||
|
var bclEncoded = Utf8.GetBytes(original);
|
||||||
|
CollectionAssert.AreEqual(bclEncoded, encoded, $"Encoder output mismatch{ctx}");
|
||||||
|
|
||||||
|
// 2. CountUtf8Chars matches the original char count
|
||||||
|
var charCount = Utf8Transcoder.CountUtf8Chars(encoded);
|
||||||
|
Assert.AreEqual(original.Length, charCount, $"Char count mismatch{ctx}");
|
||||||
|
|
||||||
|
// 3. DecodeUtf8SinglePass reconstructs the original string exactly
|
||||||
|
var decoded = string.Create(charCount, encoded, static (chars, bytes) =>
|
||||||
|
{
|
||||||
|
Utf8Transcoder.DecodeUtf8SinglePass(bytes, chars);
|
||||||
|
});
|
||||||
|
Assert.AreEqual(original, decoded, $"Decoder output mismatch{ctx}");
|
||||||
|
}
|
||||||
|
|
||||||
|
/// <summary>
|
||||||
|
/// Verifies that DecodeUtf8SinglePass produces output identical to <see cref="Encoding.GetString(byte[])"/> on <see cref="Encoding.UTF8"/>
|
||||||
|
/// for the same byte input. Catches silent decoder bugs that pass the round-trip test
|
||||||
|
/// (e.g. write-overlap that happens to land back on the right value by accident).
|
||||||
|
/// </summary>
|
||||||
|
private static void AssertDecodeMatchesBcl(string original)
|
||||||
|
{
|
||||||
|
var bytes = Utf8.GetBytes(original);
|
||||||
|
var bclDecoded = Utf8.GetString(bytes);
|
||||||
|
var charCount = Utf8Transcoder.CountUtf8Chars(bytes);
|
||||||
|
var ourDecoded = string.Create(charCount, bytes, static (chars, b) =>
|
||||||
|
{
|
||||||
|
Utf8Transcoder.DecodeUtf8SinglePass(b, chars);
|
||||||
|
});
|
||||||
|
Assert.AreEqual(bclDecoded, ourDecoded, $"Decoder mismatch for input length {bytes.Length}");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -337,10 +337,15 @@ internal static class Utf8Transcoder
|
||||||
// ASCII detect: any high bit set among the 32 bytes?
|
// ASCII detect: any high bit set among the 32 bytes?
|
||||||
if (v.ExtractMostSignificantBits() != 0) break;
|
if (v.ExtractMostSignificantBits() != 0) break;
|
||||||
|
|
||||||
// Widen 32 bytes → 2 × Vector256<ushort> (32 chars total)
|
// Widen 32 bytes → 2 × Vector256<ushort> (32 chars total). Each Vector256<ushort>
|
||||||
|
// holds 16 ushort, so the upper half stores at dstIdx + 16 (= Vector256<ushort>.Count).
|
||||||
|
// Earlier latent bug used Vector128<ushort>.Count (= 8) here, causing overlap on
|
||||||
|
// indices 8-15, misplaced data in 16-23 and unwritten 24-31 — hidden by the Hungarian benchmark's early
|
||||||
|
// ASCII bail-out (no 32+ byte ASCII run). Validated by Utf8TranscoderTests
|
||||||
|
// LongAscii32Plus + AsciiExactly32Bytes round-trips.
|
||||||
var (lower, upper) = Vector256.Widen(v);
|
var (lower, upper) = Vector256.Widen(v);
|
||||||
lower.StoreUnsafe(ref dstRef, (uint)dstIdx);
|
lower.StoreUnsafe(ref dstRef, (uint)dstIdx);
|
||||||
upper.StoreUnsafe(ref dstRef, (uint)(dstIdx + Vector128<ushort>.Count));
|
upper.StoreUnsafe(ref dstRef, (uint)(dstIdx + Vector256<ushort>.Count));
|
||||||
srcIdx += Vector256<byte>.Count;
|
srcIdx += Vector256<byte>.Count;
|
||||||
dstIdx += Vector256<byte>.Count; // 32 bytes → 32 chars
|
dstIdx += Vector256<byte>.Count; // 32 bytes → 32 chars
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue