// AyCode.Core/AyCode.Core.Tests/Serialization/Utf8TranscoderTests.cs

using System.Text;
using AyCode.Core.Serializers.Binaries;

namespace AyCode.Core.Tests.Serialization;

/// <summary>
/// Round-trip and correctness tests for <see cref="Utf8Transcoder"/>'s SIMD path tiers.
///
/// <para><b>Critical coverage</b>: each path tier (Vector512 / Vector256 / Vector128 / scalar) has
/// minimum-size and boundary-crossing inputs to ensure the path is actually exercised. The
/// Hungarian benchmark in <c>BenchmarkTestDataProvider</c> bails out of the AVX2 ASCII-prefix
/// path early (first non-ASCII byte at position 4-5), so it cannot validate the long-ASCII path
/// on its own. These tests fill that gap.</para>
/// </summary>
[TestClass]
public class Utf8TranscoderTests
{
    /// <summary>
    /// Reference BCL encoding: no BOM on encode, and invalid byte sequences throw instead of
    /// being silently replaced, so a malformed expectation can never pass unnoticed.
    /// </summary>
    private static readonly Encoding Utf8 = new UTF8Encoding(encoderShouldEmitUTF8Identifier: false, throwOnInvalidBytes: true);

    /// <summary>
    /// Longest ASCII prefix (inclusive) used by the boundary sweeps. 64 reaches the 32-byte and
    /// 64-byte vector widths named in the tests below.
    /// </summary>
    private const int MaxAsciiPrefixLength = 64;

    // ──────────────────────────────────────────────────────────────────────
    // CountUtf8Chars — content classes
    // ──────────────────────────────────────────────────────────────────────

    [TestMethod]
    public void CountUtf8Chars_AsciiOnly_MatchesStringLength()
        => AssertCharCountMatchesLength("Hello, World! This is plain ASCII.");

    [TestMethod]
    public void CountUtf8Chars_HungarianMixed_MatchesStringLength()
        => AssertCharCountMatchesLength("árvíztűrő tükörfúrógép");

    [TestMethod]
    public void CountUtf8Chars_CjkBmp_MatchesStringLength()
        => AssertCharCountMatchesLength("你好世界 こんにちは 안녕하세요");

    [TestMethod]
    public void CountUtf8Chars_SupplementaryPlane_CountsSurrogatePairs()
    {
        // Each emoji is U+1F600-range (4-byte UTF-8 → 2-char surrogate pair in UTF-16)
        var s = "😀😁😂🎉";  // 4 codepoints, but 8 chars in UTF-16

        // Validate the fixture first: if the literal is ever mangled, the failure should point
        // at the test data rather than at the transcoder.
        Assert.AreEqual(8, s.Length, "Sanity check: each emoji is a surrogate pair");

        AssertCharCountMatchesLength(s);
    }

    [TestMethod]
    public void CountUtf8Chars_MixedAllClasses_MatchesStringLength()
        => AssertCharCountMatchesLength("ASCII Magyar:árvíz CJK:你好 Emoji:😀");

    [TestMethod]
    public void CountUtf8Chars_Empty_ReturnsZero()
    {
        Assert.AreEqual(0, Utf8Transcoder.CountUtf8Chars(ReadOnlySpan<byte>.Empty));
    }

    // ──────────────────────────────────────────────────────────────────────
    // EncodeUtf8SinglePass + DecodeUtf8SinglePass — round-trip per content class
    // ──────────────────────────────────────────────────────────────────────

    [TestMethod]
    public void EncodeDecode_AsciiShort_RoundTrip()
        => AssertRoundTrip("Hello");

    [TestMethod]
    public void EncodeDecode_AsciiExactly31Bytes_RoundTrip()
    {
        // Boundary: 31 bytes — the FixStr 31-byte limit, and one short of the Vector256 threshold (32)
        AssertRoundTrip(new string('a', 31));
    }

    [TestMethod]
    public void EncodeDecode_AsciiExactly32Bytes_RoundTrip()
    {
        // Boundary: exactly Vector256<byte>.Count — Phase 1 AVX2 widen path triggers
        // CRITICAL: this validates the Vector256.Widen upper-half store offset bug-fix.
        AssertRoundTrip(new string('a', 32));
    }

    [TestMethod]
    public void EncodeDecode_AsciiLong_64Bytes_RoundTrip()
    {
        // Boundary: Vector512 threshold for the encoder; 2× Vector256 iter for the decoder
        AssertRoundTrip(new string('x', 64));
    }

    [TestMethod]
    public void EncodeDecode_AsciiVeryLong_500Bytes_RoundTrip()
    {
        // Multi-iter SIMD widen on the decoder; AVX-512 path on capable hosts
        AssertRoundTrip(new string('z', 500));
    }

    [TestMethod]
    public void EncodeDecode_HungarianShort_RoundTrip()
        => AssertRoundTrip("Termék");

    [TestMethod]
    public void EncodeDecode_HungarianMedium_RoundTrip()
        => AssertRoundTrip("árvíztűrő tükörfúrógép");

    [TestMethod]
    public void EncodeDecode_HungarianLong_RoundTrip()
    {
        // Long enough to span multiple Vector128/256 iterations
        AssertRoundTrip(string.Concat(Enumerable.Repeat("árvíztűrő tükörfúrógép ", 20)));
    }

    [TestMethod]
    public void EncodeDecode_CjkBmp_RoundTrip()
        => AssertRoundTrip("你好世界 こんにちは 안녕하세요");

    [TestMethod]
    public void EncodeDecode_CjkBmpLong_RoundTrip()
        => AssertRoundTrip(string.Concat(Enumerable.Repeat("你好世界 ", 30)));

    [TestMethod]
    public void EncodeDecode_SupplementaryPlane_RoundTrip()
        => AssertRoundTrip("😀😁😂🎉🌟");

    [TestMethod]
    public void EncodeDecode_MixedAllClasses_RoundTrip()
        => AssertRoundTrip("Plain ASCII + Magyar (árvíztűrő) + CJK (你好世界) + Emoji (😀🎉)");

    [TestMethod]
    public void EncodeDecode_LongMixed_RoundTrip()
    {
        // Long mixed content forcing all SIMD tiers + scalar tail to engage
        var sb = new StringBuilder();
        for (var i = 0; i < 50; i++)
        {
            sb.Append("ASCII run-").Append(i).Append(" Magyar:árvíz CJK:你好 ");
        }

        AssertRoundTrip(sb.ToString());
    }

    [TestMethod]
    public void EncodeDecode_BoundaryAsciiToHungarian_RoundTrip()
    {
        // ASCII prefix exactly at common boundaries, then non-ASCII switch
        AssertRoundTripAfterEveryAsciiPrefix("árvíz");
    }

    [TestMethod]
    public void EncodeDecode_BoundaryAsciiToCjk_RoundTrip()
    {
        // 3-byte sequence boundary stress
        AssertRoundTripAfterEveryAsciiPrefix("你好世界");
    }

    [TestMethod]
    public void EncodeDecode_BoundaryAsciiToEmoji_RoundTrip()
    {
        // 4-byte sequence boundary (surrogate pair in UTF-16)
        AssertRoundTripAfterEveryAsciiPrefix("😀");
    }

    [TestMethod]
    public void EncodeDecode_Empty_RoundTrip()
    {
        // NOTE: string.Create returns string.Empty for length 0 without invoking its callback,
        // so this case covers the encoder and CountUtf8Chars but never reaches the decoder.
        AssertRoundTrip(string.Empty);
    }

    [TestMethod]
    public void EncodeDecode_SingleAsciiChar_RoundTrip()
        => AssertRoundTrip("X");

    [TestMethod]
    public void EncodeDecode_SingleHungarianChar_RoundTrip()
        => AssertRoundTrip("é");

    [TestMethod]
    public void EncodeDecode_SingleCjkChar_RoundTrip()
        => AssertRoundTrip("好");

    [TestMethod]
    public void EncodeDecode_SingleEmoji_RoundTrip()
        => AssertRoundTrip("😀");

    // ──────────────────────────────────────────────────────────────────────
    // Decoder-side cross-check: BCL Encoding.UTF8.GetString reference
    // ──────────────────────────────────────────────────────────────────────

    [TestMethod]
    public void DecodeUtf8SinglePass_MatchesBclGetString_Ascii()
        => AssertDecodeMatchesBcl("ASCII test string with spaces and digits 0123456789.");

    [TestMethod]
    public void DecodeUtf8SinglePass_MatchesBclGetString_LongAscii32Plus()
    {
        // CRITICAL — exercises the Vector256 ASCII prefix widen path that had the offset bug.
        // Lengths sit on and just past the 32- and 64-byte widths, plus two longer runs.
        foreach (var (fill, length) in new[] { ('A', 32), ('A', 33), ('A', 64), ('A', 65), ('B', 100), ('C', 256) })
        {
            AssertDecodeMatchesBcl(new string(fill, length));
        }
    }

    [TestMethod]
    public void DecodeUtf8SinglePass_MatchesBclGetString_Hungarian()
        => AssertDecodeMatchesBcl("árvíztűrő tükörfúrógép");

    [TestMethod]
    public void DecodeUtf8SinglePass_MatchesBclGetString_Mixed()
        => AssertDecodeMatchesBcl("Plain ASCII + Magyar (árvíz) + CJK (你好) + Emoji (😀)");

    // ──────────────────────────────────────────────────────────────────────
    // Helpers
    // ──────────────────────────────────────────────────────────────────────

    /// <summary>
    /// Asserts that <c>CountUtf8Chars</c>, applied to the BCL UTF-8 encoding of
    /// <paramref name="text"/>, reports the same number of UTF-16 code units as
    /// <see cref="string.Length"/>.
    /// </summary>
    /// <param name="text">The string whose UTF-8 form is counted.</param>
    private static void AssertCharCountMatchesLength(string text)
    {
        var bytes = Utf8.GetBytes(text);
        Assert.AreEqual(text.Length, Utf8Transcoder.CountUtf8Chars(bytes));
    }

    /// <summary>
    /// Runs <see cref="AssertRoundTrip"/> on <paramref name="suffix"/> preceded by every ASCII
    /// prefix length from 0 through <see cref="MaxAsciiPrefixLength"/>, so the first non-ASCII
    /// character lands on each offset in that range.
    /// </summary>
    /// <param name="suffix">Non-ASCII tail appended after the run of <c>'a'</c> characters.</param>
    private static void AssertRoundTripAfterEveryAsciiPrefix(string suffix)
    {
        for (var asciiLen = 0; asciiLen <= MaxAsciiPrefixLength; asciiLen++)
        {
            AssertRoundTrip(new string('a', asciiLen) + suffix, $"asciiLen={asciiLen}");
        }
    }

    /// <summary>
    /// Decodes <paramref name="utf8Bytes"/> with <c>DecodeUtf8SinglePass</c> directly into a new
    /// string of exactly <paramref name="charCount"/> chars.
    /// </summary>
    /// <param name="utf8Bytes">UTF-8 input handed to the decoder.</param>
    /// <param name="charCount">Length of the string buffer the decoder writes into.</param>
    /// <returns>The decoded string.</returns>
    private static string DecodeToString(byte[] utf8Bytes, int charCount)
    {
        return string.Create(charCount, utf8Bytes, static (chars, bytes) =>
        {
            Utf8Transcoder.DecodeUtf8SinglePass(bytes, chars);
        });
    }

    /// <summary>
    /// Verifies that EncodeUtf8SinglePass produces bytes identical to <see cref="Encoding.GetBytes(string)"/>
    /// (on the strict UTF-8 reference encoding), that CountUtf8Chars on those bytes equals the
    /// original length, and that DecodeUtf8SinglePass reconstructs the original string exactly.
    /// </summary>
    /// <param name="original">The string to encode and decode.</param>
    /// <param name="context">Optional label appended to failure messages (e.g. the loop index).</param>
    private static void AssertRoundTrip(string original, string? context = null)
    {
        var ctx = context is null ? string.Empty : $" [{context}]";

        // 1. Encoder produces bytes identical to BCL Encoding.UTF8.
        // The buffer is over-provisioned: UTF-8 needs at most 3 bytes per UTF-16 code unit
        // (a surrogate pair is 2 code units → 4 bytes), so 4× always fits.
        var dst = new byte[original.Length * 4];
        var bytesWritten = Utf8Transcoder.EncodeUtf8SinglePass(original.AsSpan(), dst.AsSpan());
        var encoded = dst.AsSpan(0, bytesWritten).ToArray();
        var bclEncoded = Utf8.GetBytes(original);
        CollectionAssert.AreEqual(bclEncoded, encoded, $"Encoder output mismatch{ctx}");

        // 2. CountUtf8Chars matches the original char count
        var charCount = Utf8Transcoder.CountUtf8Chars(encoded);
        Assert.AreEqual(original.Length, charCount, $"Char count mismatch{ctx}");

        // 3. DecodeUtf8SinglePass reconstructs the original string exactly
        var decoded = DecodeToString(encoded, charCount);
        Assert.AreEqual(original, decoded, $"Decoder output mismatch{ctx}");
    }

    /// <summary>
    /// Verifies that DecodeUtf8SinglePass produces output identical to <see cref="Encoding.GetString(byte[])"/>
    /// for the same byte input. Catches silent decoder bugs that pass the round-trip test
    /// (e.g. write-overlap that happens to land back on the right value by accident).
    /// </summary>
    /// <param name="original">Text that is BCL-encoded to produce the decoder input.</param>
    private static void AssertDecodeMatchesBcl(string original)
    {
        var bytes = Utf8.GetBytes(original);
        var bclDecoded = Utf8.GetString(bytes);
        var charCount = Utf8Transcoder.CountUtf8Chars(bytes);
        var ourDecoded = DecodeToString(bytes, charCount);
        Assert.AreEqual(bclDecoded, ourDecoded, $"Decoder mismatch for input length {bytes.Length}");
    }
}