From 304a4a7bdbd9cc5edf724b7a286993af40c3f629 Mon Sep 17 00:00:00 2001
From: Loretta
Date: Tue, 5 May 2026 23:08:11 +0200
Subject: [PATCH] [LOADED_DOCS: NONE] Fix Utf8Transcoder AVX2 bug, add SIMD boundary tests

- Added Hungarian language preference rule to copilot-instructions.md.
- Fixed AVX2 SIMD bug in Utf8Transcoder: corrected upper-half store offset
  from Vector128<ushort>.Count to Vector256<ushort>.Count, preventing memory
  overlap on 32+ byte ASCII runs.
- Added Utf8TranscoderTests covering all SIMD/scalar paths, with boundary and
  round-trip tests for ASCII, Hungarian, CJK, emoji, and mixed content,
  ensuring correctness and BCL compatibility.
---
NOTE(review): line structure and generic/XML-doc angle-bracket tokens were
reconstructed from a whitespace-collapsed copy of this patch. Indentation of the
Utf8Transcoder.cs context lines is assumed; if the hunk does not apply, retry
with `git apply --ignore-whitespace` or regenerate the patch from the commit.

 .github/copilot-instructions.md                    |   2 +
 .../Serialization/Utf8TranscoderTests.cs           | 310 ++++++++++++++++++
 .../Serializers/Binaries/Utf8Transcoder.cs         |   9 +-
 3 files changed, 319 insertions(+), 2 deletions(-)
 create mode 100644 AyCode.Core.Tests/Serialization/Utf8TranscoderTests.cs

diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md
index 1c77125..e7d72af 100644
--- a/.github/copilot-instructions.md
+++ b/.github/copilot-instructions.md
@@ -166,3 +166,5 @@ Full doctrine: `../docs/ARCHITECTURE.md#framework-vs-consumer-boundary`
 19. **Documentation layering** — write `.md` documentation at the **defining layer** (where the code lives). Higher-layer `.md` files reference the base docs (e.g. `see AyCode.Services/docs/SIGNALR/README.md`) and document only project-specific overrides or extensions. Never duplicate base-layer descriptions in consumer-level docs.
 20. **Do not re-read .md files** already in your context window. They only change if you modify them yourself (new content is already in context) or if the developer tells you they changed — in that case re-read them once.
 21. **Folder navigation** — start from the root `README.md` for solution-level navigation. When you need to understand a folder's contents or find a type/class, read the `README.md` in that folder first — it indexes the local files and sub-folders. Follow this before grepping or reading source files.
+
+22. **Language Preference**: Communicate in Hungarian as requested by the user.
diff --git a/AyCode.Core.Tests/Serialization/Utf8TranscoderTests.cs b/AyCode.Core.Tests/Serialization/Utf8TranscoderTests.cs
new file mode 100644
index 0000000..0318c29
--- /dev/null
+++ b/AyCode.Core.Tests/Serialization/Utf8TranscoderTests.cs
@@ -0,0 +1,310 @@
+using System.Text;
+using AyCode.Core.Serializers.Binaries;
+
+namespace AyCode.Core.Tests.Serialization;
+
+/// <summary>
+/// Round-trip and correctness tests for <see cref="Utf8Transcoder"/>'s SIMD path tiers.
+///
+/// Critical coverage: each path tier (Vector512 / Vector256 / Vector128 / scalar) has
+/// minimum-size and boundary-crossing inputs to ensure the path is actually exercised. The
+/// Hungarian benchmark in <c>BenchmarkTestDataProvider</c> bails out of the AVX2 ASCII-prefix
+/// path early (first non-ASCII byte at position 4-5), so it cannot validate the long-ASCII path
+/// on its own. These tests fill that gap.
+/// </summary>
+[TestClass]
+public class Utf8TranscoderTests
+{
+    private static readonly Encoding Utf8 = new UTF8Encoding(encoderShouldEmitUTF8Identifier: false, throwOnInvalidBytes: true);
+
+    // ──────────────────────────────────────────────────────────────────────
+    // CountUtf8Chars — content classes
+    // ──────────────────────────────────────────────────────────────────────
+
+    [TestMethod]
+    public void CountUtf8Chars_AsciiOnly_MatchesStringLength()
+    {
+        var s = "Hello, World! This is plain ASCII.";
+        var bytes = Utf8.GetBytes(s);
+        Assert.AreEqual(s.Length, Utf8Transcoder.CountUtf8Chars(bytes));
+    }
+
+    [TestMethod]
+    public void CountUtf8Chars_HungarianMixed_MatchesStringLength()
+    {
+        var s = "árvíztűrő tükörfúrógép";
+        var bytes = Utf8.GetBytes(s);
+        Assert.AreEqual(s.Length, Utf8Transcoder.CountUtf8Chars(bytes));
+    }
+
+    [TestMethod]
+    public void CountUtf8Chars_CjkBmp_MatchesStringLength()
+    {
+        var s = "你好世界 こんにちは 안녕하세요";
+        var bytes = Utf8.GetBytes(s);
+        Assert.AreEqual(s.Length, Utf8Transcoder.CountUtf8Chars(bytes));
+    }
+
+    [TestMethod]
+    public void CountUtf8Chars_SupplementaryPlane_CountsSurrogatePairs()
+    {
+        // Each emoji is U+1F600-range (4-byte UTF-8 → 2-char surrogate pair in UTF-16)
+        var s = "😀😁😂🎉"; // 4 codepoints, but 8 chars in UTF-16
+        var bytes = Utf8.GetBytes(s);
+        Assert.AreEqual(s.Length, Utf8Transcoder.CountUtf8Chars(bytes));
+        Assert.AreEqual(8, s.Length, "Sanity check: each emoji is a surrogate pair");
+    }
+
+    [TestMethod]
+    public void CountUtf8Chars_MixedAllClasses_MatchesStringLength()
+    {
+        var s = "ASCII Magyar:árvíz CJK:你好 Emoji:😀";
+        var bytes = Utf8.GetBytes(s);
+        Assert.AreEqual(s.Length, Utf8Transcoder.CountUtf8Chars(bytes));
+    }
+
+    [TestMethod]
+    public void CountUtf8Chars_Empty_ReturnsZero()
+    {
+        Assert.AreEqual(0, Utf8Transcoder.CountUtf8Chars(ReadOnlySpan<byte>.Empty));
+    }
+
+    // ──────────────────────────────────────────────────────────────────────
+    // EncodeUtf8SinglePass + DecodeUtf8SinglePass — round-trip per content class
+    // ──────────────────────────────────────────────────────────────────────
+
+    [TestMethod]
+    public void EncodeDecode_AsciiShort_RoundTrip()
+    {
+        AssertRoundTrip("Hello");
+    }
+
+    [TestMethod]
+    public void EncodeDecode_AsciiExactly31Bytes_RoundTrip()
+    {
+        // Boundary: just below FixStr 31-byte limit, just below Vector256 threshold (32)
+        AssertRoundTrip(new string('a', 31));
+    }
+
+    [TestMethod]
+    public void EncodeDecode_AsciiExactly32Bytes_RoundTrip()
+    {
+        // Boundary: exactly Vector256<byte>.Count — Phase 1 AVX2 widen path triggers
+        // CRITICAL: this validates the Vector256.Widen upper-half store offset bug-fix.
+        AssertRoundTrip(new string('a', 32));
+    }
+
+    [TestMethod]
+    public void EncodeDecode_AsciiLong_64Bytes_RoundTrip()
+    {
+        // Boundary: Vector512 threshold for the encoder; 2× Vector256 iter for the decoder
+        AssertRoundTrip(new string('x', 64));
+    }
+
+    [TestMethod]
+    public void EncodeDecode_AsciiVeryLong_500Bytes_RoundTrip()
+    {
+        // Multi-iter SIMD widen on the decoder; AVX-512 path on capable hosts
+        AssertRoundTrip(new string('z', 500));
+    }
+
+    [TestMethod]
+    public void EncodeDecode_HungarianShort_RoundTrip()
+    {
+        AssertRoundTrip("Termék");
+    }
+
+    [TestMethod]
+    public void EncodeDecode_HungarianMedium_RoundTrip()
+    {
+        AssertRoundTrip("árvíztűrő tükörfúrógép");
+    }
+
+    [TestMethod]
+    public void EncodeDecode_HungarianLong_RoundTrip()
+    {
+        // Long enough to span multiple Vector128/256 iterations
+        AssertRoundTrip(string.Concat(Enumerable.Repeat("árvíztűrő tükörfúrógép ", 20)));
+    }
+
+    [TestMethod]
+    public void EncodeDecode_CjkBmp_RoundTrip()
+    {
+        AssertRoundTrip("你好世界 こんにちは 안녕하세요");
+    }
+
+    [TestMethod]
+    public void EncodeDecode_CjkBmpLong_RoundTrip()
+    {
+        AssertRoundTrip(string.Concat(Enumerable.Repeat("你好世界 ", 30)));
+    }
+
+    [TestMethod]
+    public void EncodeDecode_SupplementaryPlane_RoundTrip()
+    {
+        AssertRoundTrip("😀😁😂🎉🌟");
+    }
+
+    [TestMethod]
+    public void EncodeDecode_MixedAllClasses_RoundTrip()
+    {
+        AssertRoundTrip("Plain ASCII + Magyar (árvíztűrő) + CJK (你好世界) + Emoji (😀🎉)");
+    }
+
+    [TestMethod]
+    public void EncodeDecode_LongMixed_RoundTrip()
+    {
+        // Long mixed content forcing all SIMD tiers + scalar tail to engage
+        var sb = new StringBuilder();
+        for (var i = 0; i < 50; i++)
+        {
+            sb.Append("ASCII run-").Append(i).Append(" Magyar:árvíz CJK:你好 ");
+        }
+        AssertRoundTrip(sb.ToString());
+    }
+
+    [TestMethod]
+    public void EncodeDecode_BoundaryAsciiToHungarian_RoundTrip()
+    {
+        // ASCII prefix exactly at common boundaries, then non-ASCII switch
+        for (var asciiLen = 0; asciiLen <= 64; asciiLen++)
+        {
+            var s = new string('a', asciiLen) + "árvíz";
+            AssertRoundTrip(s, $"asciiLen={asciiLen}");
+        }
+    }
+
+    [TestMethod]
+    public void EncodeDecode_BoundaryAsciiToCjk_RoundTrip()
+    {
+        // 3-byte sequence boundary stress
+        for (var asciiLen = 0; asciiLen <= 64; asciiLen++)
+        {
+            var s = new string('a', asciiLen) + "你好世界";
+            AssertRoundTrip(s, $"asciiLen={asciiLen}");
+        }
+    }
+
+    [TestMethod]
+    public void EncodeDecode_BoundaryAsciiToEmoji_RoundTrip()
+    {
+        // 4-byte sequence boundary (surrogate pair in UTF-16)
+        for (var asciiLen = 0; asciiLen <= 64; asciiLen++)
+        {
+            var s = new string('a', asciiLen) + "😀";
+            AssertRoundTrip(s, $"asciiLen={asciiLen}");
+        }
+    }
+
+    [TestMethod]
+    public void EncodeDecode_Empty_RoundTrip()
+    {
+        AssertRoundTrip(string.Empty);
+    }
+
+    [TestMethod]
+    public void EncodeDecode_SingleAsciiChar_RoundTrip()
+    {
+        AssertRoundTrip("X");
+    }
+
+    [TestMethod]
+    public void EncodeDecode_SingleHungarianChar_RoundTrip()
+    {
+        AssertRoundTrip("é");
+    }
+
+    [TestMethod]
+    public void EncodeDecode_SingleCjkChar_RoundTrip()
+    {
+        AssertRoundTrip("好");
+    }
+
+    [TestMethod]
+    public void EncodeDecode_SingleEmoji_RoundTrip()
+    {
+        AssertRoundTrip("😀");
+    }
+
+    // ──────────────────────────────────────────────────────────────────────
+    // Decoder-side cross-check: BCL Encoding.UTF8.GetString reference
+    // ──────────────────────────────────────────────────────────────────────
+
+    [TestMethod]
+    public void DecodeUtf8SinglePass_MatchesBclGetString_Ascii()
+    {
+        AssertDecodeMatchesBcl("ASCII test string with spaces and digits 0123456789.");
+    }
+
+    [TestMethod]
+    public void DecodeUtf8SinglePass_MatchesBclGetString_LongAscii32Plus()
+    {
+        // CRITICAL — exercises the Vector256 ASCII prefix widen path that had the offset bug
+        AssertDecodeMatchesBcl(new string('A', 32));
+        AssertDecodeMatchesBcl(new string('A', 33));
+        AssertDecodeMatchesBcl(new string('A', 64));
+        AssertDecodeMatchesBcl(new string('A', 65));
+        AssertDecodeMatchesBcl(new string('B', 100));
+        AssertDecodeMatchesBcl(new string('C', 256));
+    }
+
+    [TestMethod]
+    public void DecodeUtf8SinglePass_MatchesBclGetString_Hungarian()
+    {
+        AssertDecodeMatchesBcl("árvíztűrő tükörfúrógép");
+    }
+
+    [TestMethod]
+    public void DecodeUtf8SinglePass_MatchesBclGetString_Mixed()
+    {
+        AssertDecodeMatchesBcl("Plain ASCII + Magyar (árvíz) + CJK (你好) + Emoji (😀)");
+    }
+
+    // ──────────────────────────────────────────────────────────────────────
+    // Helpers
+    // ──────────────────────────────────────────────────────────────────────
+
+    /// <summary>
+    /// Verifies that EncodeUtf8SinglePass produces bytes identical to <see cref="Encoding.UTF8"/>,
+    /// and that DecodeUtf8SinglePass on those bytes reconstructs the original string exactly.
+    /// </summary>
+    private static void AssertRoundTrip(string original, string? context = null)
+    {
+        var ctx = context is null ? string.Empty : $" [{context}]";
+
+        // 1. Encoder produces bytes identical to BCL Encoding.UTF8
+        var dst = new byte[original.Length * 4]; // worst-case UTF-8
+        var bytesWritten = Utf8Transcoder.EncodeUtf8SinglePass(original.AsSpan(), dst.AsSpan());
+        var encoded = dst.AsSpan(0, bytesWritten).ToArray();
+        var bclEncoded = Utf8.GetBytes(original);
+        CollectionAssert.AreEqual(bclEncoded, encoded, $"Encoder output mismatch{ctx}");
+
+        // 2. CountUtf8Chars matches the original char count
+        var charCount = Utf8Transcoder.CountUtf8Chars(encoded);
+        Assert.AreEqual(original.Length, charCount, $"Char count mismatch{ctx}");
+
+        // 3. DecodeUtf8SinglePass reconstructs the original string exactly
+        var decoded = string.Create(charCount, encoded, static (chars, bytes) =>
+        {
+            Utf8Transcoder.DecodeUtf8SinglePass(bytes, chars);
+        });
+        Assert.AreEqual(original, decoded, $"Decoder output mismatch{ctx}");
+    }
+
+    /// <summary>
+    /// Verifies that DecodeUtf8SinglePass produces output identical to <see cref="Encoding.GetString(byte[])"/>
+    /// for the same byte input. Catches silent decoder bugs that pass the round-trip test
+    /// (e.g. write-overlap that happens to land back on the right value by accident).
+    /// </summary>
+    private static void AssertDecodeMatchesBcl(string original)
+    {
+        var bytes = Utf8.GetBytes(original);
+        var bclDecoded = Utf8.GetString(bytes);
+        var charCount = Utf8Transcoder.CountUtf8Chars(bytes);
+        var ourDecoded = string.Create(charCount, bytes, static (chars, b) =>
+        {
+            Utf8Transcoder.DecodeUtf8SinglePass(b, chars);
+        });
+        Assert.AreEqual(bclDecoded, ourDecoded, $"Decoder mismatch for input length {bytes.Length}");
+    }
+}
diff --git a/AyCode.Core/Serializers/Binaries/Utf8Transcoder.cs b/AyCode.Core/Serializers/Binaries/Utf8Transcoder.cs
index a69adab..da833e8 100644
--- a/AyCode.Core/Serializers/Binaries/Utf8Transcoder.cs
+++ b/AyCode.Core/Serializers/Binaries/Utf8Transcoder.cs
@@ -337,10 +337,15 @@ internal static class Utf8Transcoder
                 // ASCII detect: any high bit set among the 32 bytes?
                 if (v.ExtractMostSignificantBits() != 0) break;
 
-                // Widen 32 bytes → 2 × Vector256<ushort> (32 chars total)
+                // Widen 32 bytes → 2 × Vector256<ushort> (32 chars total). Each Vector256<ushort>
+                // holds 16 ushort, so the upper half stores at dstIdx + 16 (= Vector256<ushort>.Count).
+                // Earlier latent bug used Vector128<ushort>.Count (= 8) here, causing overlap on
+                // indices 8-15 and uninitialized 24-31 — hidden by the Hungarian benchmark's early
+                // ASCII bail-out (no 32+ byte ASCII run). Validated by Utf8TranscoderTests
+                // LongAscii32Plus + AsciiExactly32Bytes round-trips.
                 var (lower, upper) = Vector256.Widen(v);
                 lower.StoreUnsafe(ref dstRef, (uint)dstIdx);
-                upper.StoreUnsafe(ref dstRef, (uint)(dstIdx + Vector128<ushort>.Count));
+                upper.StoreUnsafe(ref dstRef, (uint)(dstIdx + Vector256<ushort>.Count));
                 srcIdx += Vector256<byte>.Count;
                 dstIdx += Vector256<byte>.Count; // 32 bytes → 32 chars
             }