using System.Text;
using AyCode.Core.Serializers.Binaries;
namespace AyCode.Core.Tests.Serialization;
/// <summary>
/// Round-trip and correctness tests for <see cref="Utf8Transcoder"/>'s SIMD path tiers.
/// </summary>
/// <remarks>
/// Critical coverage: each path tier (Vector512 / Vector256 / Vector128 / scalar) has
/// minimum-size and boundary-crossing inputs to ensure the path is actually exercised. The
/// Hungarian benchmark in BenchmarkTestDataProvider bails out of the AVX2 ASCII-prefix
/// path early (first non-ASCII byte at position 4-5), so it cannot validate the long-ASCII path
/// on its own. These tests fill that gap.
/// </remarks>
[TestClass]
public class Utf8TranscoderTests
{
    // Strict BCL reference encoding: no BOM is emitted, and invalid byte sequences throw instead
    // of being silently replaced, so a malformed fixture fails loudly.
    private static readonly Encoding Utf8 = new UTF8Encoding(encoderShouldEmitUTF8Identifier: false, throwOnInvalidBytes: true);

    // ──────────────────────────────────────────────────────────────────────
    // CountUtf8Chars — content classes
    // ──────────────────────────────────────────────────────────────────────

    [TestMethod]
    public void CountUtf8Chars_AsciiOnly_MatchesStringLength()
    {
        var s = "Hello, World! This is plain ASCII.";
        var bytes = Utf8.GetBytes(s);
        Assert.AreEqual(s.Length, Utf8Transcoder.CountUtf8Chars(bytes));
    }

    [TestMethod]
    public void CountUtf8Chars_HungarianMixed_MatchesStringLength()
    {
        var s = "árvíztűrő tükörfúrógép";
        var bytes = Utf8.GetBytes(s);
        Assert.AreEqual(s.Length, Utf8Transcoder.CountUtf8Chars(bytes));
    }

    [TestMethod]
    public void CountUtf8Chars_CjkBmp_MatchesStringLength()
    {
        var s = "你好世界 こんにちは 안녕하세요";
        var bytes = Utf8.GetBytes(s);
        Assert.AreEqual(s.Length, Utf8Transcoder.CountUtf8Chars(bytes));
    }

    [TestMethod]
    public void CountUtf8Chars_SupplementaryPlane_CountsSurrogatePairs()
    {
        // Each emoji is U+1F600-range (4-byte UTF-8 → 2-char surrogate pair in UTF-16)
        var s = "😀😁😂🎉"; // 4 codepoints, but 8 chars in UTF-16
        var bytes = Utf8.GetBytes(s);

        // Validate the fixture first so a broken literal is reported as a fixture problem
        // rather than as a transcoder defect.
        Assert.AreEqual(8, s.Length, "Sanity check: each emoji is a surrogate pair");
        Assert.AreEqual(s.Length, Utf8Transcoder.CountUtf8Chars(bytes));
    }

    [TestMethod]
    public void CountUtf8Chars_MixedAllClasses_MatchesStringLength()
    {
        var s = "ASCII Magyar:árvíz CJK:你好 Emoji:😀";
        var bytes = Utf8.GetBytes(s);
        Assert.AreEqual(s.Length, Utf8Transcoder.CountUtf8Chars(bytes));
    }

    [TestMethod]
    public void CountUtf8Chars_Empty_ReturnsZero()
    {
        // CountUtf8Chars consumes UTF-8 bytes, so the empty input is an empty byte span.
        Assert.AreEqual(0, Utf8Transcoder.CountUtf8Chars(ReadOnlySpan<byte>.Empty));
    }

    // ──────────────────────────────────────────────────────────────────────
    // EncodeUtf8SinglePass + DecodeUtf8SinglePass — round-trip per content class
    // ──────────────────────────────────────────────────────────────────────

    [TestMethod]
    public void EncodeDecode_AsciiShort_RoundTrip()
    {
        AssertRoundTrip("Hello");
    }

    [TestMethod]
    public void EncodeDecode_AsciiExactly31Bytes_RoundTrip()
    {
        // Boundary: exactly at the FixStr 31-byte limit, one byte below the Vector256 threshold (32)
        AssertRoundTrip(new string('a', 31));
    }

    [TestMethod]
    public void EncodeDecode_AsciiExactly32Bytes_RoundTrip()
    {
        // Boundary: exactly Vector256.Count — Phase 1 AVX2 widen path triggers
        // CRITICAL: this validates the Vector256.Widen upper-half store offset bug-fix.
        AssertRoundTrip(new string('a', 32));
    }

    [TestMethod]
    public void EncodeDecode_AsciiLong_64Bytes_RoundTrip()
    {
        // Boundary: Vector512 threshold for the encoder; 2× Vector256 iter for the decoder
        AssertRoundTrip(new string('x', 64));
    }

    [TestMethod]
    public void EncodeDecode_AsciiVeryLong_500Bytes_RoundTrip()
    {
        // Multi-iter SIMD widen on the decoder; AVX-512 path on capable hosts
        AssertRoundTrip(new string('z', 500));
    }

    [TestMethod]
    public void EncodeDecode_HungarianShort_RoundTrip()
    {
        AssertRoundTrip("Termék");
    }

    [TestMethod]
    public void EncodeDecode_HungarianMedium_RoundTrip()
    {
        AssertRoundTrip("árvíztűrő tükörfúrógép");
    }

    [TestMethod]
    public void EncodeDecode_HungarianLong_RoundTrip()
    {
        // Long enough to span multiple Vector128/256 iterations
        AssertRoundTrip(string.Concat(Enumerable.Repeat("árvíztűrő tükörfúrógép ", 20)));
    }

    [TestMethod]
    public void EncodeDecode_CjkBmp_RoundTrip()
    {
        AssertRoundTrip("你好世界 こんにちは 안녕하세요");
    }

    [TestMethod]
    public void EncodeDecode_CjkBmpLong_RoundTrip()
    {
        AssertRoundTrip(string.Concat(Enumerable.Repeat("你好世界 ", 30)));
    }

    [TestMethod]
    public void EncodeDecode_SupplementaryPlane_RoundTrip()
    {
        AssertRoundTrip("😀😁😂🎉🌟");
    }

    [TestMethod]
    public void EncodeDecode_MixedAllClasses_RoundTrip()
    {
        AssertRoundTrip("Plain ASCII + Magyar (árvíztűrő) + CJK (你好世界) + Emoji (😀🎉)");
    }

    [TestMethod]
    public void EncodeDecode_LongMixed_RoundTrip()
    {
        // Long mixed content forcing all SIMD tiers + scalar tail to engage
        var sb = new StringBuilder();
        for (var i = 0; i < 50; i++)
        {
            sb.Append("ASCII run-").Append(i).Append(" Magyar:árvíz CJK:你好 ");
        }

        AssertRoundTrip(sb.ToString());
    }

    [TestMethod]
    public void EncodeDecode_BoundaryAsciiToHungarian_RoundTrip()
    {
        // ASCII prefix exactly at common boundaries, then non-ASCII switch
        for (var asciiLen = 0; asciiLen <= 64; asciiLen++)
        {
            var s = new string('a', asciiLen) + "árvíz";
            AssertRoundTrip(s, $"asciiLen={asciiLen}");
        }
    }

    [TestMethod]
    public void EncodeDecode_BoundaryAsciiToCjk_RoundTrip()
    {
        // 3-byte sequence boundary stress
        for (var asciiLen = 0; asciiLen <= 64; asciiLen++)
        {
            var s = new string('a', asciiLen) + "你好世界";
            AssertRoundTrip(s, $"asciiLen={asciiLen}");
        }
    }

    [TestMethod]
    public void EncodeDecode_BoundaryAsciiToEmoji_RoundTrip()
    {
        // 4-byte sequence boundary (surrogate pair in UTF-16)
        for (var asciiLen = 0; asciiLen <= 64; asciiLen++)
        {
            var s = new string('a', asciiLen) + "😀";
            AssertRoundTrip(s, $"asciiLen={asciiLen}");
        }
    }

    [TestMethod]
    public void EncodeDecode_Empty_RoundTrip()
    {
        AssertRoundTrip(string.Empty);
    }

    [TestMethod]
    public void EncodeDecode_SingleAsciiChar_RoundTrip()
    {
        AssertRoundTrip("X");
    }

    [TestMethod]
    public void EncodeDecode_SingleHungarianChar_RoundTrip()
    {
        AssertRoundTrip("é");
    }

    [TestMethod]
    public void EncodeDecode_SingleCjkChar_RoundTrip()
    {
        AssertRoundTrip("好");
    }

    [TestMethod]
    public void EncodeDecode_SingleEmoji_RoundTrip()
    {
        AssertRoundTrip("😀");
    }

    // ──────────────────────────────────────────────────────────────────────
    // GetUtf8ByteCount — content classes
    // ──────────────────────────────────────────────────────────────────────

    [TestMethod]
    public void GetUtf8ByteCount_AsciiOnly_MatchesBcl()
    {
        AssertGetUtf8ByteCountMatchesBcl("Hello, World! Plain ASCII text.");
    }

    [TestMethod]
    public void GetUtf8ByteCount_AsciiExactly7Bytes_MatchesBcl()
    {
        // Boundary: just below Vector128.Count (8) — scalar tail only
        AssertGetUtf8ByteCountMatchesBcl(new string('a', 7));
    }

    [TestMethod]
    public void GetUtf8ByteCount_AsciiExactly8Bytes_MatchesBcl()
    {
        // Boundary: exactly Vector128.Count — Vector128 path triggers
        AssertGetUtf8ByteCountMatchesBcl(new string('a', 8));
    }

    [TestMethod]
    public void GetUtf8ByteCount_AsciiExactly16Bytes_MatchesBcl()
    {
        // Boundary: exactly Vector256.Count — Vector256 path triggers
        AssertGetUtf8ByteCountMatchesBcl(new string('a', 16));
    }

    [TestMethod]
    public void GetUtf8ByteCount_AsciiExactly32Bytes_MatchesBcl()
    {
        // Boundary: exactly Vector512.Count — Vector512 path triggers on AVX-512BW
        AssertGetUtf8ByteCountMatchesBcl(new string('a', 32));
    }

    [TestMethod]
    public void GetUtf8ByteCount_AsciiVeryLong_500Chars_MatchesBcl()
    {
        AssertGetUtf8ByteCountMatchesBcl(new string('z', 500));
    }

    [TestMethod]
    public void GetUtf8ByteCount_HungarianShort_MatchesBcl()
    {
        AssertGetUtf8ByteCountMatchesBcl("Termék");
    }

    [TestMethod]
    public void GetUtf8ByteCount_HungarianMedium_MatchesBcl()
    {
        AssertGetUtf8ByteCountMatchesBcl("árvíztűrő tükörfúrógép");
    }

    [TestMethod]
    public void GetUtf8ByteCount_HungarianLong_MatchesBcl()
    {
        AssertGetUtf8ByteCountMatchesBcl(string.Concat(Enumerable.Repeat("árvíztűrő tükörfúrógép ", 20)));
    }

    [TestMethod]
    public void GetUtf8ByteCount_CjkBmp_MatchesBcl()
    {
        AssertGetUtf8ByteCountMatchesBcl("你好世界 こんにちは 안녕하세요");
    }

    [TestMethod]
    public void GetUtf8ByteCount_CjkBmpLong_MatchesBcl()
    {
        AssertGetUtf8ByteCountMatchesBcl(string.Concat(Enumerable.Repeat("你好世界 ", 30)));
    }

    [TestMethod]
    public void GetUtf8ByteCount_SupplementaryPlane_MatchesBcl()
    {
        // Each emoji is 2 UTF-16 chars (surrogate pair) → 4 UTF-8 bytes total
        AssertGetUtf8ByteCountMatchesBcl("😀😁😂🎉🌟");
    }

    [TestMethod]
    public void GetUtf8ByteCount_MixedAllClasses_MatchesBcl()
    {
        AssertGetUtf8ByteCountMatchesBcl("ASCII Magyar:árvíz CJK:你好 Emoji:😀");
    }

    [TestMethod]
    public void GetUtf8ByteCount_LongMixed_MatchesBcl()
    {
        var sb = new StringBuilder();
        for (var i = 0; i < 50; i++)
        {
            sb.Append("ASCII run-").Append(i).Append(" Magyar:árvíz CJK:你好 ");
        }

        AssertGetUtf8ByteCountMatchesBcl(sb.ToString());
    }

    [TestMethod]
    public void GetUtf8ByteCount_Empty_ReturnsZero()
    {
        // GetUtf8ByteCount consumes UTF-16 chars, so the empty input is an empty char span.
        Assert.AreEqual(0, Utf8Transcoder.GetUtf8ByteCount(ReadOnlySpan<char>.Empty));
    }

    [TestMethod]
    public void GetUtf8ByteCount_SingleAsciiChar_MatchesBcl()
    {
        AssertGetUtf8ByteCountMatchesBcl("X");
    }

    [TestMethod]
    public void GetUtf8ByteCount_SingleHungarianChar_MatchesBcl()
    {
        AssertGetUtf8ByteCountMatchesBcl("é");
    }

    [TestMethod]
    public void GetUtf8ByteCount_SingleCjkChar_MatchesBcl()
    {
        AssertGetUtf8ByteCountMatchesBcl("好");
    }

    [TestMethod]
    public void GetUtf8ByteCount_SingleEmoji_MatchesBcl()
    {
        // Single emoji = surrogate pair, exact 4 bytes
        AssertGetUtf8ByteCountMatchesBcl("😀");
    }

    [TestMethod]
    public void GetUtf8ByteCount_BoundaryAsciiToHungarian_MatchesBcl()
    {
        // Exercises split between SIMD ASCII region and 2-byte tail
        for (var asciiLen = 0; asciiLen <= 64; asciiLen++)
        {
            var s = new string('a', asciiLen) + "árvíz";
            AssertGetUtf8ByteCountMatchesBcl(s, $"asciiLen={asciiLen}");
        }
    }

    [TestMethod]
    public void GetUtf8ByteCount_BoundaryAsciiToCjk_MatchesBcl()
    {
        // 3-byte sequence boundary stress
        for (var asciiLen = 0; asciiLen <= 64; asciiLen++)
        {
            var s = new string('a', asciiLen) + "你好世界";
            AssertGetUtf8ByteCountMatchesBcl(s, $"asciiLen={asciiLen}");
        }
    }

    [TestMethod]
    public void GetUtf8ByteCount_BoundaryAsciiToEmoji_MatchesBcl()
    {
        // CRITICAL: tests that surrogate pairs split across SIMD chunks still produce correct count.
        // High surrogate may land in chunk N, low surrogate in chunk N+1; total must remain 4 bytes.
        for (var asciiLen = 0; asciiLen <= 64; asciiLen++)
        {
            var s = new string('a', asciiLen) + "😀";
            AssertGetUtf8ByteCountMatchesBcl(s, $"asciiLen={asciiLen}");
        }
    }

    [TestMethod]
    public void GetUtf8ByteCount_MultipleEmojiBoundary_MatchesBcl()
    {
        // Surrogate pair split-stress: many emojis at varying offsets
        for (var prefixLen = 0; prefixLen <= 32; prefixLen++)
        {
            var s = new string('a', prefixLen) + "😀😁😂🎉🌟😀😁😂🎉🌟";
            AssertGetUtf8ByteCountMatchesBcl(s, $"prefixLen={prefixLen}");
        }
    }

    [TestMethod]
    public void GetUtf8ByteCount_AgreesWithEncodeUtf8SinglePass_AllContentClasses()
    {
        // Round-trip contract: the byte count returned must equal the bytesWritten by EncodeUtf8SinglePass.
        // This is the load-bearing invariant for two-pass [VarUInt][bytes] writes in cold-fallback paths.
        var samples = new[]
        {
            "Hello",
            "árvíztűrő tükörfúrógép",
            "你好世界",
            "😀🎉🌟",
            "ASCII Magyar:árvíz CJK:你好 Emoji:😀",
            new string('z', 500),
            string.Concat(Enumerable.Repeat("árvíztűrő tükörfúrógép ", 20))
        };

        foreach (var s in samples)
        {
            var byteCountFromCounter = Utf8Transcoder.GetUtf8ByteCount(s.AsSpan());

            // 4 bytes per UTF-16 char is a generous upper bound for the destination buffer.
            var dst = new byte[s.Length * 4];
            var bytesWritten = Utf8Transcoder.EncodeUtf8SinglePass(s.AsSpan(), dst.AsSpan());

            Assert.AreEqual(bytesWritten, byteCountFromCounter,
                $"GetUtf8ByteCount disagrees with EncodeUtf8SinglePass for [{s.Substring(0, Math.Min(20, s.Length))}...]");
        }
    }

    // ──────────────────────────────────────────────────────────────────────
    // Decoder-side cross-check: BCL Encoding.UTF8.GetString reference
    // ──────────────────────────────────────────────────────────────────────

    [TestMethod]
    public void DecodeUtf8SinglePass_MatchesBclGetString_Ascii()
    {
        AssertDecodeMatchesBcl("ASCII test string with spaces and digits 0123456789.");
    }

    [TestMethod]
    public void DecodeUtf8SinglePass_MatchesBclGetString_LongAscii32Plus()
    {
        // CRITICAL — exercises the Vector256 ASCII prefix widen path that had the offset bug
        AssertDecodeMatchesBcl(new string('A', 32));
        AssertDecodeMatchesBcl(new string('A', 33));
        AssertDecodeMatchesBcl(new string('A', 64));
        AssertDecodeMatchesBcl(new string('A', 65));
        AssertDecodeMatchesBcl(new string('B', 100));
        AssertDecodeMatchesBcl(new string('C', 256));
    }

    [TestMethod]
    public void DecodeUtf8SinglePass_MatchesBclGetString_Hungarian()
    {
        AssertDecodeMatchesBcl("árvíztűrő tükörfúrógép");
    }

    [TestMethod]
    public void DecodeUtf8SinglePass_MatchesBclGetString_Mixed()
    {
        AssertDecodeMatchesBcl("Plain ASCII + Magyar (árvíz) + CJK (你好) + Emoji (😀)");
    }

    // ──────────────────────────────────────────────────────────────────────
    // Helpers
    // ──────────────────────────────────────────────────────────────────────

    /// <summary>
    /// Verifies that <c>EncodeUtf8SinglePass</c> produces bytes identical to
    /// <see cref="Encoding.GetBytes(string)"/> on the strict UTF-8 reference encoding,
    /// that <c>CountUtf8Chars</c> on those bytes equals the original char count,
    /// and that <c>DecodeUtf8SinglePass</c> on those bytes reconstructs the original string exactly.
    /// </summary>
    /// <param name="original">The string to encode and decode.</param>
    /// <param name="context">Optional label appended in brackets to every failure message.</param>
    private static void AssertRoundTrip(string original, string? context = null)
    {
        var ctx = context is null ? string.Empty : $" [{context}]";

        // 1. Encoder produces bytes identical to BCL Encoding.UTF8
        var dst = new byte[original.Length * 4]; // worst-case UTF-8
        var bytesWritten = Utf8Transcoder.EncodeUtf8SinglePass(original.AsSpan(), dst.AsSpan());
        var encoded = dst.AsSpan(0, bytesWritten).ToArray();
        var bclEncoded = Utf8.GetBytes(original);
        CollectionAssert.AreEqual(bclEncoded, encoded, $"Encoder output mismatch{ctx}");

        // 2. CountUtf8Chars matches the original char count
        var charCount = Utf8Transcoder.CountUtf8Chars(encoded);
        Assert.AreEqual(original.Length, charCount, $"Char count mismatch{ctx}");

        // 3. DecodeUtf8SinglePass reconstructs the original string exactly
        var decoded = string.Create(charCount, encoded, static (chars, bytes) =>
        {
            Utf8Transcoder.DecodeUtf8SinglePass(bytes, chars);
        });
        Assert.AreEqual(original, decoded, $"Decoder output mismatch{ctx}");
    }

    /// <summary>
    /// Verifies that <c>Utf8Transcoder.GetUtf8ByteCount</c> matches
    /// <see cref="Encoding.GetByteCount(string)"/> for the same input. This is the BCL parity
    /// invariant — any divergence means the SIMD byte counter is producing wrong values that
    /// would corrupt VarUInt length prefixes in WriteStringUtf8Internal.
    /// </summary>
    /// <param name="original">The string whose UTF-8 byte count is checked.</param>
    /// <param name="message">
    /// Optional failure message that replaces the default one; boundary loops use it to
    /// report the iteration that failed.
    /// </param>
    private static void AssertGetUtf8ByteCountMatchesBcl(string original, string? message = null)
    {
        var expected = Utf8.GetByteCount(original);
        var actual = Utf8Transcoder.GetUtf8ByteCount(original.AsSpan());
        Assert.AreEqual(expected, actual, message ?? $"GetUtf8ByteCount mismatch for input length {original.Length}");
    }

    /// <summary>
    /// Verifies that <c>DecodeUtf8SinglePass</c> produces output identical to
    /// <see cref="Encoding.GetString(byte[])"/> for the same byte input. Catches silent decoder
    /// bugs that pass the round-trip test (e.g. write-overlap that happens to land back on the
    /// right value by accident).
    /// </summary>
    /// <param name="original">The string whose BCL-encoded bytes are fed to both decoders.</param>
    private static void AssertDecodeMatchesBcl(string original)
    {
        var bytes = Utf8.GetBytes(original);
        var bclDecoded = Utf8.GetString(bytes);
        var charCount = Utf8Transcoder.CountUtf8Chars(bytes);
        var ourDecoded = string.Create(charCount, bytes, static (chars, b) =>
        {
            Utf8Transcoder.DecodeUtf8SinglePass(b, chars);
        });
        Assert.AreEqual(bclDecoded, ourDecoded, $"Decoder mismatch for input length {bytes.Length}");
    }
}