AyCode.Core/AyCode.Core.Tests/Serialization/Utf8TranscoderTests.cs

311 lines
12 KiB
C#
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

using System.Text;
using AyCode.Core.Serializers.Binaries;
namespace AyCode.Core.Tests.Serialization;
/// <summary>
/// Round-trip and correctness tests for the SIMD path tiers of <see cref="Utf8Transcoder"/>.
///
/// <para><b>Critical coverage</b>: every path tier (Vector512 / Vector256 / Vector128 / scalar) gets
/// minimum-size and boundary-crossing inputs so that the tier is actually exercised. The
/// Hungarian benchmark in <c>BenchmarkTestDataProvider</c> leaves the AVX2 ASCII-prefix path
/// early (first non-ASCII byte at position 4-5), so on its own it cannot validate the
/// long-ASCII path. These tests close that gap.</para>
/// </summary>
[TestClass]
public class Utf8TranscoderTests
{
    /// <summary>
    /// Longest ASCII prefix (inclusive) used by the boundary tests; every length from 0 up to
    /// this value is placed in front of a multi-byte suffix.
    /// </summary>
    private const int MaxAsciiPrefixLength = 64;

    /// <summary>Strict BOM-less UTF-8 reference encoding; throws on invalid byte sequences.</summary>
    private static readonly Encoding Utf8 = new UTF8Encoding(encoderShouldEmitUTF8Identifier: false, throwOnInvalidBytes: true);

    // ──────────────────────────────────────────────────────────────────────
    // CountUtf8Chars — content classes
    // ──────────────────────────────────────────────────────────────────────

    [TestMethod]
    public void CountUtf8Chars_AsciiOnly_MatchesStringLength()
    {
        const string text = "Hello, World! This is plain ASCII.";

        var actual = Utf8Transcoder.CountUtf8Chars(Utf8.GetBytes(text));

        Assert.AreEqual(text.Length, actual);
    }

    [TestMethod]
    public void CountUtf8Chars_HungarianMixed_MatchesStringLength()
    {
        const string text = "árvíztűrő tükörfúrógép";

        var actual = Utf8Transcoder.CountUtf8Chars(Utf8.GetBytes(text));

        Assert.AreEqual(text.Length, actual);
    }

    [TestMethod]
    public void CountUtf8Chars_CjkBmp_MatchesStringLength()
    {
        const string text = "你好世界 こんにちは 안녕하세요";

        var actual = Utf8Transcoder.CountUtf8Chars(Utf8.GetBytes(text));

        Assert.AreEqual(text.Length, actual);
    }

    [TestMethod]
    public void CountUtf8Chars_SupplementaryPlane_CountsSurrogatePairs()
    {
        // Each emoji is in the U+1F3xx-U+1F6xx range: 4 bytes in UTF-8, a 2-char surrogate pair in UTF-16.
        const string text = "😀😁😂🎉"; // 4 code points, but 8 chars in UTF-16

        // Validate the fixture first so a broken literal is reported as such,
        // not as a transcoder failure.
        Assert.AreEqual(8, text.Length, "Sanity check: each emoji is a surrogate pair");

        var actual = Utf8Transcoder.CountUtf8Chars(Utf8.GetBytes(text));

        Assert.AreEqual(text.Length, actual);
    }

    [TestMethod]
    public void CountUtf8Chars_MixedAllClasses_MatchesStringLength()
    {
        const string text = "ASCII Magyar:árvíz CJK:你好 Emoji:😀";

        var actual = Utf8Transcoder.CountUtf8Chars(Utf8.GetBytes(text));

        Assert.AreEqual(text.Length, actual);
    }

    [TestMethod]
    public void CountUtf8Chars_Empty_ReturnsZero()
        => Assert.AreEqual(0, Utf8Transcoder.CountUtf8Chars(ReadOnlySpan<byte>.Empty));

    // ──────────────────────────────────────────────────────────────────────
    // EncodeUtf8SinglePass + DecodeUtf8SinglePass — round-trip per content class
    // ──────────────────────────────────────────────────────────────────────

    [TestMethod]
    public void EncodeDecode_AsciiShort_RoundTrip() => AssertRoundTrip("Hello");

    // Boundary: exactly at the 31-byte FixStr limit, one byte short of the Vector256 threshold (32).
    [TestMethod]
    public void EncodeDecode_AsciiExactly31Bytes_RoundTrip() => AssertRoundTrip(new string('a', 31));

    // Boundary: exactly Vector256<byte>.Count — the Phase 1 AVX2 widen path triggers.
    // CRITICAL: this validates the Vector256.Widen upper-half store offset bug-fix.
    [TestMethod]
    public void EncodeDecode_AsciiExactly32Bytes_RoundTrip() => AssertRoundTrip(new string('a', 32));

    // Boundary: Vector512 threshold for the encoder; 2× Vector256 iterations for the decoder.
    [TestMethod]
    public void EncodeDecode_AsciiLong_64Bytes_RoundTrip() => AssertRoundTrip(new string('x', 64));

    // Multi-iteration SIMD widen on the decoder; AVX-512 path on capable hosts.
    [TestMethod]
    public void EncodeDecode_AsciiVeryLong_500Bytes_RoundTrip() => AssertRoundTrip(new string('z', 500));

    [TestMethod]
    public void EncodeDecode_HungarianShort_RoundTrip() => AssertRoundTrip("Termék");

    [TestMethod]
    public void EncodeDecode_HungarianMedium_RoundTrip() => AssertRoundTrip("árvíztűrő tükörfúrógép");

    [TestMethod]
    public void EncodeDecode_HungarianLong_RoundTrip()
    {
        // Long enough to span multiple Vector128/256 iterations.
        var text = string.Concat(Enumerable.Repeat("árvíztűrő tükörfúrógép ", 20));

        AssertRoundTrip(text);
    }

    [TestMethod]
    public void EncodeDecode_CjkBmp_RoundTrip() => AssertRoundTrip("你好世界 こんにちは 안녕하세요");

    [TestMethod]
    public void EncodeDecode_CjkBmpLong_RoundTrip()
    {
        var text = string.Concat(Enumerable.Repeat("你好世界 ", 30));

        AssertRoundTrip(text);
    }

    [TestMethod]
    public void EncodeDecode_SupplementaryPlane_RoundTrip() => AssertRoundTrip("😀😁😂🎉🌟");

    [TestMethod]
    public void EncodeDecode_MixedAllClasses_RoundTrip()
        => AssertRoundTrip("Plain ASCII + Magyar (árvíztűrő) + CJK (你好世界) + Emoji (😀🎉)");

    [TestMethod]
    public void EncodeDecode_LongMixed_RoundTrip()
    {
        // Long mixed content forcing all SIMD tiers plus the scalar tail to engage.
        var builder = new StringBuilder();
        for (var run = 0; run < 50; run++)
        {
            builder.Append("ASCII run-");
            builder.Append(run);
            builder.Append(" Magyar:árvíz CJK:你好 ");
        }

        AssertRoundTrip(builder.ToString());
    }

    // ASCII prefix of every length up to the common vector widths, then a switch to 2-byte sequences.
    [TestMethod]
    public void EncodeDecode_BoundaryAsciiToHungarian_RoundTrip() => AssertRoundTripAfterAsciiPrefixes("árvíz");

    // 3-byte sequence boundary stress.
    [TestMethod]
    public void EncodeDecode_BoundaryAsciiToCjk_RoundTrip() => AssertRoundTripAfterAsciiPrefixes("你好世界");

    // 4-byte sequence boundary (surrogate pair in UTF-16).
    [TestMethod]
    public void EncodeDecode_BoundaryAsciiToEmoji_RoundTrip() => AssertRoundTripAfterAsciiPrefixes("😀");

    [TestMethod]
    public void EncodeDecode_Empty_RoundTrip() => AssertRoundTrip(string.Empty);

    [TestMethod]
    public void EncodeDecode_SingleAsciiChar_RoundTrip() => AssertRoundTrip("X");

    [TestMethod]
    public void EncodeDecode_SingleHungarianChar_RoundTrip() => AssertRoundTrip("é");

    [TestMethod]
    public void EncodeDecode_SingleCjkChar_RoundTrip() => AssertRoundTrip("好");

    [TestMethod]
    public void EncodeDecode_SingleEmoji_RoundTrip() => AssertRoundTrip("😀");

    // ──────────────────────────────────────────────────────────────────────
    // Decoder-side cross-check: BCL Encoding.UTF8.GetString reference
    // ──────────────────────────────────────────────────────────────────────

    [TestMethod]
    public void DecodeUtf8SinglePass_MatchesBclGetString_Ascii()
        => AssertDecodeMatchesBcl("ASCII test string with spaces and digits 0123456789.");

    [TestMethod]
    public void DecodeUtf8SinglePass_MatchesBclGetString_LongAscii32Plus()
    {
        // CRITICAL — exercises the Vector256 ASCII prefix widen path that had the offset bug.
        // Lengths sit on and just past the 32- and 64-byte vector widths, plus longer runs.
        AssertDecodeMatchesBcl(new string('A', 32));
        AssertDecodeMatchesBcl(new string('A', 33));
        AssertDecodeMatchesBcl(new string('A', 64));
        AssertDecodeMatchesBcl(new string('A', 65));
        AssertDecodeMatchesBcl(new string('B', 100));
        AssertDecodeMatchesBcl(new string('C', 256));
    }

    [TestMethod]
    public void DecodeUtf8SinglePass_MatchesBclGetString_Hungarian()
        => AssertDecodeMatchesBcl("árvíztűrő tükörfúrógép");

    [TestMethod]
    public void DecodeUtf8SinglePass_MatchesBclGetString_Mixed()
        => AssertDecodeMatchesBcl("Plain ASCII + Magyar (árvíz) + CJK (你好) + Emoji (😀)");

    // ──────────────────────────────────────────────────────────────────────
    // Helpers
    // ──────────────────────────────────────────────────────────────────────

    /// <summary>
    /// Runs <see cref="AssertRoundTrip"/> for <paramref name="suffix"/> preceded by an ASCII run
    /// of every length from 0 to <see cref="MaxAsciiPrefixLength"/> (inclusive), so the switch
    /// from ASCII to multi-byte content lands on each offset in that range.
    /// </summary>
    /// <param name="suffix">Non-ASCII text appended after the ASCII prefix.</param>
    private static void AssertRoundTripAfterAsciiPrefixes(string suffix)
    {
        for (var asciiLen = 0; asciiLen <= MaxAsciiPrefixLength; asciiLen++)
        {
            var text = new string('a', asciiLen) + suffix;
            AssertRoundTrip(text, $"asciiLen={asciiLen}");
        }
    }

    /// <summary>
    /// Verifies that EncodeUtf8SinglePass produces bytes identical to <see cref="Encoding.GetBytes(string)"/>
    /// (on the strict UTF-8 reference encoding), that CountUtf8Chars reports the original char count,
    /// and that DecodeUtf8SinglePass on those bytes reconstructs the original string exactly.
    /// </summary>
    /// <param name="original">Text to encode and decode.</param>
    /// <param name="context">Optional tag appended to failure messages to identify the failing case.</param>
    private static void AssertRoundTrip(string original, string? context = null)
    {
        var ctx = context is null ? string.Empty : $" [{context}]";

        // 1. Encoder produces bytes identical to the BCL UTF-8 encoding.
        //    4 bytes per UTF-16 char is a generous upper bound (the true worst case is 3).
        var destination = new byte[original.Length * 4];
        var bytesWritten = Utf8Transcoder.EncodeUtf8SinglePass(original.AsSpan(), destination.AsSpan());
        var encoded = destination.AsSpan(0, bytesWritten).ToArray();
        var expectedBytes = Utf8.GetBytes(original);
        CollectionAssert.AreEqual(expectedBytes, encoded, $"Encoder output mismatch{ctx}");

        // 2. CountUtf8Chars matches the original char count.
        var charCount = Utf8Transcoder.CountUtf8Chars(encoded);
        Assert.AreEqual(original.Length, charCount, $"Char count mismatch{ctx}");

        // 3. DecodeUtf8SinglePass reconstructs the original string exactly.
        var decoded = string.Create(charCount, encoded, static (chars, bytes) =>
        {
            Utf8Transcoder.DecodeUtf8SinglePass(bytes, chars);
        });
        Assert.AreEqual(original, decoded, $"Decoder output mismatch{ctx}");
    }

    /// <summary>
    /// Verifies that DecodeUtf8SinglePass produces output identical to <see cref="Encoding.GetString(byte[])"/>
    /// for the same byte input. Catches silent decoder bugs that pass the round-trip test
    /// (e.g. write-overlap that happens to land back on the right value by accident).
    /// </summary>
    /// <param name="original">Text whose BCL-encoded UTF-8 bytes are fed to the decoder.</param>
    private static void AssertDecodeMatchesBcl(string original)
    {
        var bytes = Utf8.GetBytes(original);
        var bclDecoded = Utf8.GetString(bytes);

        // Check the count up front: a wrong count would otherwise show up only as an
        // opaque decoder failure or a mis-sized result string.
        var charCount = Utf8Transcoder.CountUtf8Chars(bytes);
        Assert.AreEqual(bclDecoded.Length, charCount, $"Char count mismatch for input length {bytes.Length}");

        var ourDecoded = string.Create(charCount, bytes, static (chars, b) =>
        {
            Utf8Transcoder.DecodeUtf8SinglePass(b, chars);
        });
        Assert.AreEqual(bclDecoded, ourDecoded, $"Decoder mismatch for input length {bytes.Length}");
    }
}