using System.Text;
using AyCode.Core.Serializers.Binaries;

namespace AyCode.Core.Tests.Serialization;

/// <summary>
/// Round-trip and correctness tests for <see cref="Utf8Transcoder"/>'s SIMD path tiers.
/// <para>
/// Critical coverage: each path tier (Vector512 / Vector256 / Vector128 / scalar) has
/// minimum-size and boundary-crossing inputs to ensure the path is actually exercised. The
/// Hungarian benchmark in BenchmarkTestDataProvider bails out of the AVX2 ASCII-prefix
/// path early (first non-ASCII byte at position 4-5), so it cannot validate the long-ASCII path
/// on its own. These tests fill that gap.
/// </para>
/// </summary>
[TestClass]
public class Utf8TranscoderTests
{
    /// <summary>
    /// Strict BCL reference encoding: no BOM on output, throws on invalid byte sequences.
    /// </summary>
    private static readonly Encoding Utf8 = new UTF8Encoding(encoderShouldEmitUTF8Identifier: false, throwOnInvalidBytes: true);

    // ──────────────────────────────────────────────────────────────────────
    // CountUtf8Chars — content classes
    // ──────────────────────────────────────────────────────────────────────

    [TestMethod]
    public void CountUtf8Chars_AsciiOnly_MatchesStringLength()
    {
        var s = "Hello, World! This is plain ASCII.";
        var bytes = Utf8.GetBytes(s);

        Assert.AreEqual(s.Length, Utf8Transcoder.CountUtf8Chars(bytes));
    }

    [TestMethod]
    public void CountUtf8Chars_HungarianMixed_MatchesStringLength()
    {
        var s = "árvíztűrő tükörfúrógép";
        var bytes = Utf8.GetBytes(s);

        Assert.AreEqual(s.Length, Utf8Transcoder.CountUtf8Chars(bytes));
    }

    [TestMethod]
    public void CountUtf8Chars_CjkBmp_MatchesStringLength()
    {
        var s = "你好世界 こんにちは 안녕하세요";
        var bytes = Utf8.GetBytes(s);

        Assert.AreEqual(s.Length, Utf8Transcoder.CountUtf8Chars(bytes));
    }

    [TestMethod]
    public void CountUtf8Chars_SupplementaryPlane_CountsSurrogatePairs()
    {
        // Each emoji is U+1F600-range (4-byte UTF-8 → 2-char surrogate pair in UTF-16)
        var s = "😀😁😂🎉"; // 4 codepoints, but 8 chars in UTF-16

        // Validate the fixture first so a wrong literal is reported as such,
        // not as a transcoder failure.
        Assert.AreEqual(8, s.Length, "Sanity check: each emoji is a surrogate pair");

        var bytes = Utf8.GetBytes(s);

        Assert.AreEqual(s.Length, Utf8Transcoder.CountUtf8Chars(bytes));
    }

    [TestMethod]
    public void CountUtf8Chars_MixedAllClasses_MatchesStringLength()
    {
        var s = "ASCII Magyar:árvíz CJK:你好 Emoji:😀";
        var bytes = Utf8.GetBytes(s);

        Assert.AreEqual(s.Length, Utf8Transcoder.CountUtf8Chars(bytes));
    }

    [TestMethod]
    public void CountUtf8Chars_Empty_ReturnsZero()
    {
        Assert.AreEqual(0, Utf8Transcoder.CountUtf8Chars(ReadOnlySpan<byte>.Empty));
    }

    // ──────────────────────────────────────────────────────────────────────
    // EncodeUtf8SinglePass + DecodeUtf8SinglePass — round-trip per content class
    // ──────────────────────────────────────────────────────────────────────

    [TestMethod]
    public void EncodeDecode_AsciiShort_RoundTrip()
    {
        AssertRoundTrip("Hello");
    }

    [TestMethod]
    public void EncodeDecode_AsciiExactly31Bytes_RoundTrip()
    {
        // Boundary: just below FixStr 31-byte limit, just below Vector256 threshold (32)
        AssertRoundTrip(new string('a', 31));
    }

    [TestMethod]
    public void EncodeDecode_AsciiExactly32Bytes_RoundTrip()
    {
        // Boundary: exactly one Vector256 of bytes (32) — Phase 1 AVX2 widen path triggers
        // CRITICAL: this validates the Vector256.Widen upper-half store offset bug-fix.
        AssertRoundTrip(new string('a', 32));
    }

    [TestMethod]
    public void EncodeDecode_AsciiLong_64Bytes_RoundTrip()
    {
        // Boundary: Vector512 threshold for the encoder; 2× Vector256 iter for the decoder
        AssertRoundTrip(new string('x', 64));
    }

    [TestMethod]
    public void EncodeDecode_AsciiVeryLong_500Bytes_RoundTrip()
    {
        // Multi-iter SIMD widen on the decoder; AVX-512 path on capable hosts
        AssertRoundTrip(new string('z', 500));
    }

    [TestMethod]
    public void EncodeDecode_HungarianShort_RoundTrip()
    {
        AssertRoundTrip("Termék");
    }

    [TestMethod]
    public void EncodeDecode_HungarianMedium_RoundTrip()
    {
        AssertRoundTrip("árvíztűrő tükörfúrógép");
    }

    [TestMethod]
    public void EncodeDecode_HungarianLong_RoundTrip()
    {
        // Long enough to span multiple Vector128/256 iterations
        AssertRoundTrip(string.Concat(Enumerable.Repeat("árvíztűrő tükörfúrógép ", 20)));
    }

    [TestMethod]
    public void EncodeDecode_CjkBmp_RoundTrip()
    {
        AssertRoundTrip("你好世界 こんにちは 안녕하세요");
    }

    [TestMethod]
    public void EncodeDecode_CjkBmpLong_RoundTrip()
    {
        AssertRoundTrip(string.Concat(Enumerable.Repeat("你好世界 ", 30)));
    }

    [TestMethod]
    public void EncodeDecode_SupplementaryPlane_RoundTrip()
    {
        AssertRoundTrip("😀😁😂🎉🌟");
    }

    [TestMethod]
    public void EncodeDecode_MixedAllClasses_RoundTrip()
    {
        AssertRoundTrip("Plain ASCII + Magyar (árvíztűrő) + CJK (你好世界) + Emoji (😀🎉)");
    }

    [TestMethod]
    public void EncodeDecode_LongMixed_RoundTrip()
    {
        // Long mixed content forcing all SIMD tiers + scalar tail to engage
        var sb = new StringBuilder();
        for (var i = 0; i < 50; i++)
        {
            sb.Append("ASCII run-").Append(i).Append(" Magyar:árvíz CJK:你好 ");
        }

        AssertRoundTrip(sb.ToString());
    }

    [TestMethod]
    public void EncodeDecode_BoundaryAsciiToHungarian_RoundTrip()
    {
        // ASCII prefix exactly at common boundaries, then non-ASCII switch
        for (var asciiLen = 0; asciiLen <= 64; asciiLen++)
        {
            var s = new string('a', asciiLen) + "árvíz";
            AssertRoundTrip(s, $"asciiLen={asciiLen}");
        }
    }

    [TestMethod]
    public void EncodeDecode_BoundaryAsciiToCjk_RoundTrip()
    {
        // 3-byte sequence boundary stress
        for (var asciiLen = 0; asciiLen <= 64; asciiLen++)
        {
            var s = new string('a', asciiLen) + "你好世界";
            AssertRoundTrip(s, $"asciiLen={asciiLen}");
        }
    }

    [TestMethod]
    public void EncodeDecode_BoundaryAsciiToEmoji_RoundTrip()
    {
        // 4-byte sequence boundary (surrogate pair in UTF-16)
        for (var asciiLen = 0; asciiLen <= 64; asciiLen++)
        {
            var s = new string('a', asciiLen) + "😀";
            AssertRoundTrip(s, $"asciiLen={asciiLen}");
        }
    }

    [TestMethod]
    public void EncodeDecode_Empty_RoundTrip()
    {
        AssertRoundTrip(string.Empty);
    }

    [TestMethod]
    public void EncodeDecode_SingleAsciiChar_RoundTrip()
    {
        AssertRoundTrip("X");
    }

    [TestMethod]
    public void EncodeDecode_SingleHungarianChar_RoundTrip()
    {
        AssertRoundTrip("é");
    }

    [TestMethod]
    public void EncodeDecode_SingleCjkChar_RoundTrip()
    {
        AssertRoundTrip("好");
    }

    [TestMethod]
    public void EncodeDecode_SingleEmoji_RoundTrip()
    {
        AssertRoundTrip("😀");
    }

    // ──────────────────────────────────────────────────────────────────────
    // GetUtf8ByteCount — content classes
    // ──────────────────────────────────────────────────────────────────────

    [TestMethod]
    public void GetUtf8ByteCount_AsciiOnly_MatchesBcl()
    {
        AssertGetUtf8ByteCountMatchesBcl("Hello, World! Plain ASCII text.");
    }

    [TestMethod]
    public void GetUtf8ByteCount_AsciiExactly7Bytes_MatchesBcl()
    {
        // Boundary: just below one Vector128 of 16-bit lanes (8 chars) — scalar tail only
        AssertGetUtf8ByteCountMatchesBcl(new string('a', 7));
    }

    [TestMethod]
    public void GetUtf8ByteCount_AsciiExactly8Bytes_MatchesBcl()
    {
        // Boundary: exactly one Vector128 of 16-bit lanes (8 chars) — Vector128 path triggers
        AssertGetUtf8ByteCountMatchesBcl(new string('a', 8));
    }

    [TestMethod]
    public void GetUtf8ByteCount_AsciiExactly16Bytes_MatchesBcl()
    {
        // Boundary: exactly one Vector256 of 16-bit lanes (16 chars) — Vector256 path triggers
        AssertGetUtf8ByteCountMatchesBcl(new string('a', 16));
    }

    [TestMethod]
    public void GetUtf8ByteCount_AsciiExactly32Bytes_MatchesBcl()
    {
        // Boundary: exactly one Vector512 of 16-bit lanes (32 chars) — Vector512 path triggers on AVX-512BW
        AssertGetUtf8ByteCountMatchesBcl(new string('a', 32));
    }

    [TestMethod]
    public void GetUtf8ByteCount_AsciiVeryLong_500Chars_MatchesBcl()
    {
        AssertGetUtf8ByteCountMatchesBcl(new string('z', 500));
    }

    [TestMethod]
    public void GetUtf8ByteCount_HungarianShort_MatchesBcl()
    {
        AssertGetUtf8ByteCountMatchesBcl("Termék");
    }

    [TestMethod]
    public void GetUtf8ByteCount_HungarianMedium_MatchesBcl()
    {
        AssertGetUtf8ByteCountMatchesBcl("árvíztűrő tükörfúrógép");
    }

    [TestMethod]
    public void GetUtf8ByteCount_HungarianLong_MatchesBcl()
    {
        AssertGetUtf8ByteCountMatchesBcl(string.Concat(Enumerable.Repeat("árvíztűrő tükörfúrógép ", 20)));
    }

    [TestMethod]
    public void GetUtf8ByteCount_CjkBmp_MatchesBcl()
    {
        AssertGetUtf8ByteCountMatchesBcl("你好世界 こんにちは 안녕하세요");
    }

    [TestMethod]
    public void GetUtf8ByteCount_CjkBmpLong_MatchesBcl()
    {
        AssertGetUtf8ByteCountMatchesBcl(string.Concat(Enumerable.Repeat("你好世界 ", 30)));
    }

    [TestMethod]
    public void GetUtf8ByteCount_SupplementaryPlane_MatchesBcl()
    {
        // Each emoji is 2 UTF-16 chars (surrogate pair) → 4 UTF-8 bytes total
        AssertGetUtf8ByteCountMatchesBcl("😀😁😂🎉🌟");
    }

    [TestMethod]
    public void GetUtf8ByteCount_MixedAllClasses_MatchesBcl()
    {
        AssertGetUtf8ByteCountMatchesBcl("ASCII Magyar:árvíz CJK:你好 Emoji:😀");
    }

    [TestMethod]
    public void GetUtf8ByteCount_LongMixed_MatchesBcl()
    {
        var sb = new StringBuilder();
        for (var i = 0; i < 50; i++)
        {
            sb.Append("ASCII run-").Append(i).Append(" Magyar:árvíz CJK:你好 ");
        }

        AssertGetUtf8ByteCountMatchesBcl(sb.ToString());
    }

    [TestMethod]
    public void GetUtf8ByteCount_Empty_ReturnsZero()
    {
        Assert.AreEqual(0, Utf8Transcoder.GetUtf8ByteCount(ReadOnlySpan<char>.Empty));
    }

    [TestMethod]
    public void GetUtf8ByteCount_SingleAsciiChar_MatchesBcl()
    {
        AssertGetUtf8ByteCountMatchesBcl("X");
    }

    [TestMethod]
    public void GetUtf8ByteCount_SingleHungarianChar_MatchesBcl()
    {
        AssertGetUtf8ByteCountMatchesBcl("é");
    }

    [TestMethod]
    public void GetUtf8ByteCount_SingleCjkChar_MatchesBcl()
    {
        AssertGetUtf8ByteCountMatchesBcl("好");
    }

    [TestMethod]
    public void GetUtf8ByteCount_SingleEmoji_MatchesBcl()
    {
        // Single emoji = surrogate pair, exact 4 bytes
        AssertGetUtf8ByteCountMatchesBcl("😀");
    }

    [TestMethod]
    public void GetUtf8ByteCount_BoundaryAsciiToHungarian_MatchesBcl()
    {
        // Exercises split between SIMD ASCII region and 2-byte tail
        for (var asciiLen = 0; asciiLen <= 64; asciiLen++)
        {
            var s = new string('a', asciiLen) + "árvíz";
            AssertGetUtf8ByteCountMatchesBcl(s, $"asciiLen={asciiLen}");
        }
    }

    [TestMethod]
    public void GetUtf8ByteCount_BoundaryAsciiToCjk_MatchesBcl()
    {
        // 3-byte sequence boundary stress
        for (var asciiLen = 0; asciiLen <= 64; asciiLen++)
        {
            var s = new string('a', asciiLen) + "你好世界";
            AssertGetUtf8ByteCountMatchesBcl(s, $"asciiLen={asciiLen}");
        }
    }

    [TestMethod]
    public void GetUtf8ByteCount_BoundaryAsciiToEmoji_MatchesBcl()
    {
        // CRITICAL: tests that surrogate pairs split across SIMD chunks still produce correct count.
        // High surrogate may land in chunk N, low surrogate in chunk N+1; total must remain 4 bytes.
        for (var asciiLen = 0; asciiLen <= 64; asciiLen++)
        {
            var s = new string('a', asciiLen) + "😀";
            AssertGetUtf8ByteCountMatchesBcl(s, $"asciiLen={asciiLen}");
        }
    }

    [TestMethod]
    public void GetUtf8ByteCount_MultipleEmojiBoundary_MatchesBcl()
    {
        // Surrogate pair split-stress: many emojis at varying offsets
        for (var prefixLen = 0; prefixLen <= 32; prefixLen++)
        {
            var s = new string('a', prefixLen) + "😀😁😂🎉🌟😀😁😂🎉🌟";
            AssertGetUtf8ByteCountMatchesBcl(s, $"prefixLen={prefixLen}");
        }
    }

    [TestMethod]
    public void GetUtf8ByteCount_AgreesWithEncodeUtf8SinglePass_AllContentClasses()
    {
        // Round-trip contract: the byte count returned must equal the bytesWritten by EncodeUtf8SinglePass.
        // This is the load-bearing invariant for two-pass [VarUInt][bytes] writes in cold-fallback paths.
        var samples = new[]
        {
            "Hello",
            "árvíztűrő tükörfúrógép",
            "你好世界",
            "😀🎉🌟",
            "ASCII Magyar:árvíz CJK:你好 Emoji:😀",
            new string('z', 500),
            string.Concat(Enumerable.Repeat("árvíztűrő tükörfúrógép ", 20))
        };

        foreach (var s in samples)
        {
            var byteCountFromCounter = Utf8Transcoder.GetUtf8ByteCount(s.AsSpan());

            var dst = new byte[s.Length * 4];
            var bytesWritten = Utf8Transcoder.EncodeUtf8SinglePass(s.AsSpan(), dst.AsSpan());

            // Only the first 20 chars go into the message to keep failures readable.
            var preview = s.Substring(0, Math.Min(20, s.Length));
            Assert.AreEqual(bytesWritten, byteCountFromCounter, $"GetUtf8ByteCount disagrees with EncodeUtf8SinglePass for [{preview}...]");
        }
    }

    // ──────────────────────────────────────────────────────────────────────
    // Decoder-side cross-check: BCL Encoding.UTF8.GetString reference
    // ──────────────────────────────────────────────────────────────────────

    [TestMethod]
    public void DecodeUtf8SinglePass_MatchesBclGetString_Ascii()
    {
        AssertDecodeMatchesBcl("ASCII test string with spaces and digits 0123456789.");
    }

    [TestMethod]
    public void DecodeUtf8SinglePass_MatchesBclGetString_LongAscii32Plus()
    {
        // CRITICAL — exercises the Vector256 ASCII prefix widen path that had the offset bug
        AssertDecodeMatchesBcl(new string('A', 32));
        AssertDecodeMatchesBcl(new string('A', 33));
        AssertDecodeMatchesBcl(new string('A', 64));
        AssertDecodeMatchesBcl(new string('A', 65));
        AssertDecodeMatchesBcl(new string('B', 100));
        AssertDecodeMatchesBcl(new string('C', 256));
    }

    [TestMethod]
    public void DecodeUtf8SinglePass_MatchesBclGetString_Hungarian()
    {
        AssertDecodeMatchesBcl("árvíztűrő tükörfúrógép");
    }

    [TestMethod]
    public void DecodeUtf8SinglePass_MatchesBclGetString_Mixed()
    {
        AssertDecodeMatchesBcl("Plain ASCII + Magyar (árvíz) + CJK (你好) + Emoji (😀)");
    }

    // ──────────────────────────────────────────────────────────────────────
    // Helpers
    // ──────────────────────────────────────────────────────────────────────

    /// <summary>
    /// Verifies that <c>EncodeUtf8SinglePass</c> produces bytes identical to
    /// <see cref="Encoding.GetBytes(string)"/>, that <c>CountUtf8Chars</c> on those bytes equals
    /// the original char count, and that <c>DecodeUtf8SinglePass</c> on those bytes reconstructs
    /// the original string exactly.
    /// </summary>
    /// <param name="original">The string to encode and decode.</param>
    /// <param name="context">Optional label appended to failure messages (e.g. the loop variable).</param>
    private static void AssertRoundTrip(string original, string? context = null)
    {
        var ctx = context is null ? string.Empty : $" [{context}]";

        // 1. Encoder produces bytes identical to BCL Encoding.UTF8
        var dst = new byte[original.Length * 4]; // worst-case UTF-8
        var bytesWritten = Utf8Transcoder.EncodeUtf8SinglePass(original.AsSpan(), dst.AsSpan());
        var encoded = dst.AsSpan(0, bytesWritten).ToArray();
        var bclEncoded = Utf8.GetBytes(original);
        CollectionAssert.AreEqual(bclEncoded, encoded, $"Encoder output mismatch{ctx}");

        // 2. CountUtf8Chars matches the original char count
        var charCount = Utf8Transcoder.CountUtf8Chars(encoded);
        Assert.AreEqual(original.Length, charCount, $"Char count mismatch{ctx}");

        // 3. DecodeUtf8SinglePass reconstructs the original string exactly
        var decoded = string.Create(charCount, encoded, static (chars, bytes) =>
        {
            Utf8Transcoder.DecodeUtf8SinglePass(bytes, chars);
        });
        Assert.AreEqual(original, decoded, $"Decoder output mismatch{ctx}");
    }

    /// <summary>
    /// Verifies that <c>Utf8Transcoder.GetUtf8ByteCount</c> matches
    /// <see cref="Encoding.GetByteCount(string)"/> for the same input. This is the BCL parity
    /// invariant — any divergence means the SIMD byte counter is producing wrong values that
    /// would corrupt VarUInt length prefixes in WriteStringUtf8Internal.
    /// </summary>
    /// <param name="original">The string whose UTF-8 byte count is checked.</param>
    /// <param name="context">Optional label appended to the failure message (e.g. the loop variable).</param>
    private static void AssertGetUtf8ByteCountMatchesBcl(string original, string? context = null)
    {
        var ctx = context is null ? string.Empty : $" [{context}]";

        var expected = Utf8.GetByteCount(original);
        var actual = Utf8Transcoder.GetUtf8ByteCount(original.AsSpan());

        Assert.AreEqual(expected, actual, $"GetUtf8ByteCount mismatch for input length {original.Length}{ctx}");
    }

    /// <summary>
    /// Verifies that <c>DecodeUtf8SinglePass</c> produces output identical to
    /// <see cref="Encoding.GetString(byte[])"/> for the same byte input. Catches silent decoder
    /// bugs that pass the round-trip test (e.g. write-overlap that happens to land back on the
    /// right value by accident).
    /// </summary>
    /// <param name="original">The string that is BCL-encoded to produce the decoder input.</param>
    private static void AssertDecodeMatchesBcl(string original)
    {
        var bytes = Utf8.GetBytes(original);
        var bclDecoded = Utf8.GetString(bytes);

        var charCount = Utf8Transcoder.CountUtf8Chars(bytes);
        var ourDecoded = string.Create(charCount, bytes, static (chars, b) =>
        {
            Utf8Transcoder.DecodeUtf8SinglePass(b, chars);
        });

        Assert.AreEqual(bclDecoded, ourDecoded, $"Decoder mismatch for input length {bytes.Length}");
    }
}