[LOADED_DOCS: 3 files, no new loads]

Move DrainFromAsync to tests, add in-memory benchmarks - Moved AsyncPipeReaderInputExtensions.DrainFromAsync from the main framework to test-only assembly; no longer public API. - Removed AcBinaryDeserializer.DeserializeFromPipeReaderAsync<T> from public API; tests now inline drain+deserialize logic. - Added AcBinaryInMemoryPipeBenchmark and AcBinaryInMemoryRawByteArrayBenchmark to complete 2x2 transport × wire-format benchmark matrix. - Refactored benchmark runner for interactive menu, settings, and CLI parsing. - Expanded XML docs for AsyncPipeReaderInput and AsyncPipeWriterOutput to clarify push-pattern and real-world usage. - Updated BINARY_ASYNCPIPE_TODO.md and related docs to reflect these changes.
2026-05-02 15:51:07 +02:00 · 2026-05-02 15:51:07 +02:00 · 67589f6b6f
parent 05f90a5639
commit 67589f6b6f
8 changed files with 672 additions and 169 deletions
--- a/AyCode.Core.Serializers.Console/Program.cs
+++ b/AyCode.Core.Serializers.Console/Program.cs
@ -1,6 +1,7 @@
 using AyCode.Core.Compression;
 using AyCode.Core.Serializers.Attributes;
 using AyCode.Core.Serializers.Binaries;
+using AyCode.Core.Tests.Serialization;   // DrainFromAsync extension (test-only, used by benchmark)
 using AyCode.Core.Tests.TestModels;
 using MemoryPack;
 using MessagePack;
@ -42,8 +43,8 @@ public static class Program
    private static int TestIterations = 1;
    private static int BenchmarkSamples = 1;       // Debug: single sample, fast iteration
 #else
-    private static int WarmupIterations = 5000; //5000
-    private static int TestIterations = 1000; //1000
+    private static int WarmupIterations = 100; //5000
+    private static int TestIterations = 10; //1000
    private static int BenchmarkSamples = 3;
 #endif

@ -61,6 +62,15 @@ public static class Program
    private const string IoString = "String";
    private const string IoNamedPipe = "NamedPipe";
    private const string IoNamedPipeRaw = "NamedPipe";
+    private const string IoInMemoryPipe = "Pipe(in-mem)";
+    private const string IoInMemoryRaw = "Bytes(in-mem)";
+
+    // Single source of truth for the chunk size used by ALL pipe-related benchmarks (NamedPipe PipeChunk,
+    // NamedPipe PipeRaw, in-memory Pipe, in-memory RawMem) AND the NamedPipe server's inBufferSize/outBufferSize.
+    // Same value across both layers ensures apples-to-apples comparison: chunked-streaming chunk-on-wire size
+    // matches the kernel pipe-buffer slot exactly. Tweak HERE when experimenting; do NOT scatter chunkSize
+    // overrides across individual benchmark rows.
+    private const int PipeChunkSize = 4096;

    // Dispatch mode identifiers — describes how property access / type dispatch happens for a given run.
    // SGen   = compile-time source generator path (Unsafe.As<T> direct fields, slot-array wrapper lookup).
@ -152,68 +162,97 @@ public static class Program
        // Done early so user is told immediately, not after warmup.
        ValidateMemoryPackSetup();

-        // Determine layer (which test data to run), opMode (ser/des/all), and serializerMode (standard/asyncpipe).
-        // CLI args take precedence; if no args, show interactive menu.
-        // serializerMode: "standard" = all serializers EXCEPT AsyncPipe; "asyncpipe" = ONLY the AsyncPipe streaming benchmark.
-        // The two are mutually exclusive — AsyncPipe never runs alongside the standard set, so its long-lived pipe
-        // setup / kernel-buffer overhead does not skew the steady-state Byte[] / IBufferWriter measurements.
-        string layer;
-        var opMode = "all";
-        var serializerMode = "standard";
+        // CLI mode (args provided): run once, parse args, exit. Backward-compatible behaviour.
+        if (args.Length > 0)
+        {
+            if (!TryParseCliArgs(args, out var layer, out var opMode, out var serializerMode))
+                return;  // profiler mode (already ran) or invalid args
+            RunBenchmark(layer, opMode, serializerMode);
+            return;
+        }

-        if (args.Length == 0)
+        // Interactive mode (no args): loop the menu so the user doesn't have to restart between runs.
+        // Q exits the menu (and the application).
+        while (true)
        {
            var selection = ShowInteractiveMenu();
            if (selection == null) return;  // user pressed Q
-            layer = selection.Value.layer;
-            serializerMode = selection.Value.serializerMode;
+
+            RunBenchmark(selection.Value.layer, "all", selection.Value.serializerMode);
+
+            System.Console.WriteLine();
+            System.Console.WriteLine("─────────────────────────────────────────────────────────────────────");
+            System.Console.WriteLine("Returning to menu — press any key to continue, or Q to quit...");
+            var key = System.Console.ReadKey(intercept: true);
+            if (key.Key == ConsoleKey.Q) return;
+            System.Console.WriteLine();
+        }
+    }
+
+    /// <summary>
+    /// Parses the first CLI argument (only <c>args[0]</c> is read; the caller guarantees a non-empty array) into
+    /// (layer, opMode, serializerMode). Returns <c>false</c> only for <c>profiler</c>, which has already run by then,
+    /// so the caller should exit. Unrecognised keywords are NOT rejected: they pass through as the layer keyword.
+    /// </summary>
+    private static bool TryParseCliArgs(string[] args, out string layer, out string opMode, out string serializerMode)
+    {
+        layer = "all";
+        opMode = "all";
+        serializerMode = "standard";
+
+        var arg = args[0].ToLowerInvariant();  // invariant casing: keyword matching must not depend on the OS culture (e.g. Turkish dotless i)
+
+        // Profiler mode: warmup only, then exit (for memory profiler analysis)
+        if (arg == "profiler")
+        {
+            RunProfilerMode();
+            return false;
+        }
+
+        // Quick mode: short warmup, few iterations, small sample count
+        if (arg == "quick")
+        {
+            WarmupIterations = 5;
+            TestIterations = 100;
+            BenchmarkSamples = 3;
+            layer = "all";
+        }
+        else if (arg is "core" or "comprehensive" or "edge" or "all")
+        {
+            layer = arg;
+        }
+        else if (arg is "asyncpipe" or "pipe")
+        {
+            // AsyncPipe-only mode: streaming I/O isolation across all test data.
+            layer = "all";
+            serializerMode = "asyncpipe";
+        }
+        else if (arg is "ser" or "serialize")
+        {
+            opMode = "serialize";
+            layer = "all";
+        }
+        else if (arg is "des" or "deserialize")
+        {
+            opMode = "deserialize";
+            layer = "all";
         }
         else
         {
-            var arg = args[0].ToLower();
-
-            // Profiler mode: warmup only, then exit (for memory profiler analysis)
-            if (arg == "profiler")
-            {
-                RunProfilerMode();
-                return;
-            }
-
-            // Quick mode: short warmup, few iterations, small sample count
-            if (arg == "quick")
-            {
-                WarmupIterations = 5;
-                TestIterations = 100;
-                BenchmarkSamples = 3;
-                layer = "all";
-            }
-            else if (arg is "core" or "comprehensive" or "edge" or "all")
-            {
-                layer = arg;
-            }
-            else if (arg is "asyncpipe" or "pipe")
-            {
-                // AsyncPipe-only mode: streaming I/O isolation across all test data.
-                layer = "all";
-                serializerMode = "asyncpipe";
-            }
-            else if (arg is "ser" or "serialize")
-            {
-                opMode = "serialize";
-                layer = "all";
-            }
-            else if (arg is "des" or "deserialize")
-            {
-                opMode = "deserialize";
-                layer = "all";
-            }
-            else
-            {
-                // Backwards compat: unknown arg → treat as layer keyword
-                layer = arg;
-            }
+            // Backwards compat: unknown arg → treat as layer keyword
+            layer = arg;
         }

+        return true;
+    }
+
+    /// <summary>
+    /// Runs the benchmark suite end-to-end for the given configuration: pre-warmup → per-cell warmup
+    /// + measurement → grouped results print → save to disk. Used by both the CLI and interactive
+    /// menu paths; the interactive loop calls this repeatedly without restarting the process.
+    /// </summary>
+    private static void RunBenchmark(string layer, string opMode, string serializerMode)
+    {
        System.Console.WriteLine("╔══════════════════════════════════════════════════════════════════════╗");
        System.Console.WriteLine("║          COMPREHENSIVE SERIALIZER BENCHMARK SUITE                    ║");
        System.Console.WriteLine("╚══════════════════════════════════════════════════════════════════════╝");
@ -449,7 +488,7 @@ public static class Program
            // fits blocking-free in one kernel pipe-buffer slot. Single source of truth for both app-level
            // wire chunk AND kernel transfer unit; change ONLY this line when tuning.
            var binaryFastModePipeChunkOnly = AcBinarySerializerOptions.FastMode;
-            binaryFastModePipeChunkOnly.BufferWriterChunkSize = 4096; //AsyncPipeWriterOutput.MaxChunkSize;
+            binaryFastModePipeChunkOnly.BufferWriterChunkSize = PipeChunkSize;

            return new List<ISerializerBenchmark>
            {
@ -463,6 +502,19 @@ public static class Program
                // the chunked-row above this isolates AsyncPipe-framework-overhead (Δ vs raw) from
                // kernel-transport-overhead (raw vs in-process Byte[]).
                new AcBinaryNamedPipeRawByteArrayBenchmark(testData.Order, binaryFastModePipeChunkOnly, "FastMode (PipeRaw)"),
+                // Chunked-framed AsyncPipe over an IN-MEMORY System.IO.Pipelines.Pipe (NO NamedPipe, NO kernel).
+                // Same chunked-streaming code path (SerializeChunkedFramed → AsyncPipeReaderInput) but with the
+                // kernel-pipe replaced by a managed-only Pipe. Eliminates per-chunk syscall overhead (~30 µs/chunk
+                // on NamedPipe → ~1-2 µs/chunk on in-memory Pipe). Side-by-side with the NamedPipe row above this
+                // isolates pure CPU cost of the chunked-streaming framework (vs kernel-pipe transport cost) — the
+                // in-memory Pipe row should be much closer to the raw-byte[] row, validating that NamedPipe loopback
+                // is the worst-case benchmark scenario for chunked-streaming and not representative of real network
+                // / file / cross-thread Pipe scenarios.
+                new AcBinaryInMemoryPipeBenchmark(testData.Order, binaryFastModePipeChunkOnly, "FastMode (PipeChunk)"),
+                // Raw byte[] over IN-MEMORY direct cross-thread handoff (no transport at all). Apples-to-apples
+                // baseline for the in-memory chunked row above: same in-memory transport (zero kernel), but raw
+                // byte[] vs chunked-streaming wire format. Completes the 2x2 matrix [chunked,raw] × [kernel,memory].
+                new AcBinaryInMemoryRawByteArrayBenchmark(testData.Order, binaryFastModePipeChunkOnly, "FastMode (PipeRaw)"),
            };
        }

@ -484,7 +536,13 @@ public static class Program
        // allocates a fresh ABW. Independent of the AsyncPipe profile (different mechanism: alloc overhead
        // vs syscall count).
        var binaryFastModeBufWrChunk = AcBinarySerializerOptions.FastMode;
-        binaryFastModeBufWrChunk.BufferWriterChunkSize = 4096;
+        binaryFastModeBufWrChunk.BufferWriterChunkSize = PipeChunkSize;
+
+        // In-memory Pipe variant — same PipeChunkSize (currently 4 KB) as the AsyncPipe mode, no kernel-pipe alignment
+        // concern (managed slabs are not page-aligned anyway). Drives SerializeChunkedFramed via the in-memory
+        // System.IO.Pipelines.Pipe (zero-copy slab handoff between producer and drain task).
+        var binaryFastModePipeChunkInMem = AcBinarySerializerOptions.FastMode;
+        binaryFastModePipeChunkInMem.BufferWriterChunkSize = PipeChunkSize;

        var defaultOptions = AcBinarySerializerOptions.Default;
        defaultOptions.UseStringInterning = StringInterningMode.None;
@ -517,8 +575,20 @@ public static class Program
            // allocation. Optimum for this scenario.
            new AcBinaryFreshBufferWriterBenchmark(testData.Order, binaryFastModeBufWrChunk, "FastMode (4KB)"),

-            // AsyncPipe streaming (AcBinaryNamedPipeBenchmark) is intentionally OMITTED here — run it via
-            // the dedicated AsyncPipe menu / CLI mode for isolated streaming-I/O measurements.
+            // AcBinary chunked-streaming over an IN-MEMORY Pipe (no kernel transport). Side-by-side with the
+            // Byte[] / IBufferWriter rows above this shows the chunked-streaming framework's pure CPU cost
+            // (no NamedPipe loopback noise) vs the simpler in-process serialize-then-deserialize patterns.
+            // The IO column shows "Pipe(in-mem)" — distinct from the NamedPipe AsyncPipe rows in [P] mode.
+            new AcBinaryInMemoryPipeBenchmark(testData.Order, binaryFastModePipeChunkInMem, "FastMode (PipeChunk)"),
+
+            // Raw byte[] over IN-MEMORY direct cross-thread handoff (no transport, no kernel, no Pipe). Apples-to-
+            // apples baseline for the in-memory chunked row above: same in-memory pattern, but raw byte[] vs
+            // chunked-streaming wire format. The IO column shows "Bytes(in-mem)".
+            new AcBinaryInMemoryRawByteArrayBenchmark(testData.Order, binaryFastModePipeChunkInMem, "FastMode (PipeRaw)"),
+
+            // AsyncPipe streaming over kernel NamedPipe (AcBinaryNamedPipeBenchmark) is intentionally OMITTED
+            // here — run it via the dedicated AsyncPipe menu [P] / CLI mode for isolated kernel-transport
+            // measurements.

            // ============================================================
            // MemoryPack — three I/O modes for apples-to-apples comparison
@ -724,37 +794,80 @@ public static class Program

    /// <summary>
    /// Interactive menu shown when no CLI args. Returns the layer keyword (core/comprehensive/edge/all) or null on Quit.
+    /// Loops on settings-changes ([S]) — user is returned to this menu after modifying iteration counts. The value returned is a (layer, serializerMode) tuple: serializerMode is "asyncpipe" for [P] and "standard" for every other choice.
    /// </summary>
    private static (string layer, string serializerMode)? ShowInteractiveMenu()
    {
-        System.Console.WriteLine();
-        System.Console.WriteLine("╔══════════════════════════════════════════════════════════╗");
-        System.Console.WriteLine("║          AcBinary Benchmark Suite                        ║");
-        System.Console.WriteLine("╚══════════════════════════════════════════════════════════╝");
-        System.Console.WriteLine();
-        System.Console.WriteLine("Select benchmark layer:");
-        System.Console.WriteLine();
-        System.Console.WriteLine("  [1] Core           — daily iteration");
-        System.Console.WriteLine("  [2] Comprehensive  — release validation");
-        System.Console.WriteLine("  [3] Edge cases     — refactor verification");
-        System.Console.WriteLine("  [A] All layers");
-        System.Console.WriteLine("  [P] AsyncPipe      — streaming I/O isolation (only AsyncPipe, all test data)");
-        System.Console.WriteLine("  [Q] Quit");
-        System.Console.Write("\nSelection: ");
-
-        var key = System.Console.ReadKey(intercept: false).KeyChar;
-        System.Console.WriteLine();
-
-        return char.ToLower(key) switch
+        while (true)
        {
-            '1' => ("core", "standard"),
-            '2' => ("comprehensive", "standard"),
-            '3' => ("edge", "standard"),
-            'a' => ("all", "standard"),
-            'p' => ("all", "asyncpipe"),
-            'q' => null,
-            _ => ("all", "standard")
-        };
+            System.Console.WriteLine();
+            System.Console.WriteLine("╔══════════════════════════════════════════════════════════╗");
+            System.Console.WriteLine("║          AcBinary Benchmark Suite                        ║");
+            System.Console.WriteLine("╚══════════════════════════════════════════════════════════╝");
+            System.Console.WriteLine();
+            System.Console.WriteLine("Select benchmark layer:");
+            System.Console.WriteLine();
+            System.Console.WriteLine("  [1] Core           — daily iteration");
+            System.Console.WriteLine("  [2] Comprehensive  — release validation");
+            System.Console.WriteLine("  [3] Edge cases     — refactor verification");
+            System.Console.WriteLine("  [A] All layers");
+            System.Console.WriteLine("  [P] AsyncPipe      — streaming I/O isolation (only AsyncPipe, all test data)");
+            System.Console.WriteLine($"  [S] Settings       — modify Warmup ({WarmupIterations}) / Iterations ({TestIterations}) / Samples ({BenchmarkSamples})");
+            System.Console.WriteLine("  [Q] Quit");
+            System.Console.Write("\nSelection: ");
+
+            var key = System.Console.ReadKey(intercept: false).KeyChar;  // intercept: false → the pressed key is echoed to the console
+            System.Console.WriteLine();
+
+            switch (char.ToLower(key))
+            {
+                case '1': return ("core", "standard");
+                case '2': return ("comprehensive", "standard");
+                case '3': return ("edge", "standard");
+                case 'a': return ("all", "standard");
+                case 'p': return ("all", "asyncpipe");
+                case 's':
+                    ShowSettingsMenu();
+                    continue;  // re-display the main menu after settings update
+                case 'q': return null;
+                default: return ("all", "standard");  // any unrecognised key runs all layers in standard mode
+            }
+        }
+    }
+
+    /// <summary>
+    /// Settings sub-menu — prompts for Warmup / Iterations / Samples values. Empty input keeps the
+    /// current value. Validation: WarmupIterations ≥ 0; TestIterations ≥ 1; BenchmarkSamples ≥ 1.
+    /// Returns to the caller (which re-displays the main menu). The closing summary line is printed on every call, including when all three values were kept.
+    /// </summary>
+    private static void ShowSettingsMenu()
+    {
+        System.Console.WriteLine();
+        System.Console.WriteLine("─────────────────────────────────────────────");
+        System.Console.WriteLine("Settings — press Enter to keep current value");
+        System.Console.WriteLine("─────────────────────────────────────────────");
+        System.Console.WriteLine();
+
+        WarmupIterations = PromptInt("WarmupIterations", WarmupIterations, min: 0);  // 0 is accepted for warmup
+        TestIterations = PromptInt("TestIterations  ", TestIterations, min: 1);  // name is space-padded so the three prompts line up
+        BenchmarkSamples = PromptInt("BenchmarkSamples", BenchmarkSamples, min: 1);
+
+        System.Console.WriteLine();
+        System.Console.WriteLine($"✓ Settings updated: Warmup={WarmupIterations} | Iterations={TestIterations} | Samples={BenchmarkSamples}");
+    }
+
+    /// <summary>
+    /// Prompts the user for an integer with a default (current value). Returns the current value if
+    /// the user presses Enter on empty input or if parsing fails / value is below the minimum (a rejection notice is printed in the latter two cases).
+    /// </summary>
+    private static int PromptInt(string name, int currentValue, int min)
+    {
+        System.Console.Write($"  {name} [{currentValue}]: ");
+        var input = System.Console.ReadLine()?.Trim() ?? "";  // a null ReadLine result is treated the same as empty input
+        if (input.Length == 0) return currentValue;
+        if (int.TryParse(input, out var newValue) && newValue >= min) return newValue;  // accept only parseable values at or above the floor
+        System.Console.WriteLine($"    ! Invalid value (need int ≥ {min}) — keeping {currentValue}");
+        return currentValue;
    }

    /// <summary>
@ -1264,6 +1377,185 @@ public static class Program
        }
    }

+    /// <summary>
+    /// Same chunked-framed AsyncPipe code path as <see cref="AcBinaryNamedPipeBenchmark"/>, but the transport
+    /// is an in-memory <see cref="System.IO.Pipelines.Pipe"/> instead of a kernel <c>NamedPipe</c>. The Pipe's
+    /// <c>Writer</c>/<c>Reader</c> pair is a managed-only zero-copy slab handoff — no syscalls, no kernel
+    /// buffer copy, no IRP queueing.
+    ///
+    /// <para><b>Why this benchmark matters</b>: by holding ALL other variables constant (same SerializeChunkedFramed,
+    /// same AsyncPipeReaderInput, same drain task, same consumer task, same multi-message wire format), this
+    /// row isolates the <b>kernel-NamedPipe transport overhead</b> from the chunked-streaming framework's pure
+    /// CPU cost. The expected delta vs <see cref="AcBinaryNamedPipeBenchmark"/>: per-chunk overhead drops from
+    /// ~25-30 µs (kernel-syscall pair + IRP) to ~1-2 µs (managed slab handoff). Multi-chunk Large-message rows
+    /// should converge dramatically toward <see cref="AcBinaryNamedPipeRawByteArrayBenchmark"/>.</para>
+    ///
+    /// <para><b>Real-world relevance</b>: in-memory Pipe is the typical primitive used for cross-thread serializer
+    /// pipelines inside a single process (e.g. SignalR's Kestrel transport adapter, gRPC framework internals,
+    /// custom message brokers). The numbers from this row reflect that scenario, NOT the kernel-pipe loopback
+    /// of the NamedPipe benchmark.</para>
+    /// </summary>
+    private sealed class AcBinaryInMemoryPipeBenchmark : ISerializerBenchmark, IDisposable
+    {
+        private readonly TestOrder _order;
+        private readonly AcBinarySerializerOptions _options;
+        private readonly byte[] _serialized; // for SerializedSize reporting only
+
+        // Long-lived in-memory pipe lifecycle (set up once in ctor — NOT timed).
+        private readonly Pipe _pipe;
+        private readonly PipeWriter _pipeWriter;
+        private readonly PipeReader _pipeReader;
+
+        // Long-lived multi-message receive infrastructure (set up once in ctor) — same pattern as the NamedPipe
+        // variant: drain pumps reader into AsyncPipeReaderInput, consumer task drives Deserialize<T>(input).
+        private readonly AsyncPipeReaderInput _input;
+        private readonly CancellationTokenSource _cts;
+        private readonly Task _drainTask;
+        private readonly Task _consumerTask;
+        private readonly ManualResetEventSlim _consumeRequest = new(false);
+        private readonly ManualResetEventSlim _consumeDone = new(false);
+        private object? _lastResult;   // assigned by ConsumeLoop only while _captureResult is set; read and cleared by VerifyRoundTrip
+        private bool _captureResult;   // toggled by VerifyRoundTrip around a single Serialize() call
+        private bool _disposed;        // makes Dispose idempotent
+
+        public string Engine => EngineAcBinary;
+        public string IoMode => IoInMemoryPipe;
+        public string DispatchMode => _options.UseGeneratedCode ? ModeSGen : ModeRuntime;
+        public string OptionsPreset { get; }
+        public int SerializedSize => _serialized.Length;
+        public long SetupSerializeAllocBytes { get; }
+        public long SetupDeserializeAllocBytes { get; }
+        public bool IsRoundTripOnly => true;  // Serialize() performs the whole round trip; Deserialize() is a no-op
+        public string OptionsDescription => BuildAcBinaryOptionsDescription(_options, $", BufferSize={_options.BufferWriterChunkSize}B, Transport=Pipe(in-memory,multiMessage,2-task)");
+
+        public AcBinaryInMemoryPipeBenchmark(TestOrder order, AcBinarySerializerOptions options, string optionsPreset)
+        {
+            _order = order;
+            _options = options;
+            OptionsPreset = optionsPreset;
+
+            _serialized = AcBinarySerializer.Serialize(order, _options);
+
+            // === SERIALIZE-side setup measurement ===
+            // In-memory Pipe construction. NO kernel-pipe pair, NO Connect handshake — just a managed Pipe object
+            // and a reference to its Writer side. PipeWriterImpl (parallel-flush capable, NOT StreamPipeWriter).
+            GC.Collect(); GC.WaitForPendingFinalizers(); GC.Collect();
+            var beforeSer = GC.GetAllocatedBytesForCurrentThread();
+            _pipe = new Pipe();
+            _pipeWriter = _pipe.Writer;
+            var afterSer = GC.GetAllocatedBytesForCurrentThread();
+            SetupSerializeAllocBytes = afterSer - beforeSer;
+
+            // === DESERIALIZE-side setup measurement ===
+            // PipeReader reference + AsyncPipeReaderInput (ArrayPool rent + ManualResetEventSlim) + drain task +
+            // consumer task scaffolding. Identical to the NamedPipe variant on the receive side.
+            GC.Collect(); GC.WaitForPendingFinalizers(); GC.Collect();
+            var beforeDes = GC.GetAllocatedBytesForCurrentThread();  // per-thread counter: covers the constructing thread only
+            _pipeReader = _pipe.Reader;
+            _input = new AsyncPipeReaderInput(_options.BufferWriterChunkSize * 2, multiMessage: true);
+            _cts = new CancellationTokenSource();
+            _drainTask = Task.Run(() => _input.DrainFromAsync(_pipeReader, _cts.Token));
+            _consumerTask = Task.Run(ConsumeLoop);
+            var afterDes = GC.GetAllocatedBytesForCurrentThread();
+            SetupDeserializeAllocBytes = afterDes - beforeDes;
+        }
+
+        // BG consumer: parks on _consumeRequest, runs Deserialize<T>(_input) when signaled, signals _consumeDone.
+        // Mirror of AcBinaryNamedPipeBenchmark.ConsumeLoop — same pattern, same MRES protocol.
+        private void ConsumeLoop()
+        {
+            var ct = _cts.Token;
+            try
+            {
+                while (true)
+                {
+                    _consumeRequest.Wait(ct);
+                    if (ct.IsCancellationRequested) return;  // Dispose also pulses _consumeRequest, so re-check before doing work
+                    _consumeRequest.Reset();
+
+                    try
+                    {
+                        var result = AcBinaryDeserializer.Deserialize<TestOrder>(_input, _options);
+                        if (_captureResult) _lastResult = result;
+                    }
+                    catch
+                    {
+                        // Swallow — see ConsumeLoop in NamedPipe variant for rationale. The finally below still signals _consumeDone.
+                    }
+                    finally
+                    {
+                        _consumeDone.Set();
+                    }
+                }
+            }
+            catch (OperationCanceledException)
+            {
+                // Cooperative cancel — Dispose path. Swallow.
+            }
+        }
+
+        public void Warmup(int iterations)
+        {
+            for (var i = 0; i < iterations; i++) Serialize();
+        }
+
+        [MethodImpl(MethodImplOptions.NoInlining)]
+        public void Serialize()
+        {
+            // Same 2-task streaming pipeline as NamedPipe variant — only the transport differs (in-memory Pipe
+            // instead of kernel NamedPipe). Per-chunk SerializeChunkedFramed → PipeWriter slab → drain task
+            // reads from PipeReader → input.Feed → consumer Deserialize<T> consumes byte-by-byte.
+            _consumeDone.Reset();    // clear the previous iteration's completion signal before arming the consumer
+            _consumeRequest.Set();   // release the parked consumer before the producer starts writing
+
+            AcBinarySerializer.SerializeChunkedFramed(_order, _pipeWriter, _options);
+
+            _consumeDone.Wait();     // returns once the consumer's finally block has signalled (success or swallowed failure)
+        }
+
+        [MethodImpl(MethodImplOptions.NoInlining)]
+        public void Deserialize()
+        {
+            // No-op: per-iter round-trip is captured in Serialize(). See IsRoundTripOnly contract.
+        }
+
+        public bool VerifyRoundTrip()
+        {
+            _captureResult = true;
+            try
+            {
+                Serialize();
+                var result = _lastResult as TestOrder;  // stays null when the consumer's deserialize threw and was swallowed
+                return result != null && DeepEqualsViaJson(_order, result);
+            }
+            finally
+            {
+                _captureResult = false;
+                _lastResult = null;
+            }
+        }
+
+        public void Dispose()
+        {
+            if (_disposed) return;
+            _disposed = true;
+
+            // Cancel drain + consumer tasks → both exit. Pulse _consumeRequest in case consumer is parked.
+            try { _cts.Cancel(); } catch { /* swallow on teardown */ }
+            try { _consumeRequest.Set(); } catch { /* nudge in case consumer Wait is parked */ }
+            try { _drainTask.Wait(TimeSpan.FromSeconds(2)); } catch { /* swallow on teardown */ }
+            try { _consumerTask.Wait(TimeSpan.FromSeconds(2)); } catch { /* swallow on teardown */ }
+
+            // Complete writer + reader (in-memory Pipe — no underlying stream to dispose).
+            try { _pipeWriter.CompleteAsync().AsTask().Wait(TimeSpan.FromSeconds(2)); } catch { /* swallow on teardown */ }
+            try { _pipeReader.Complete(); } catch { /* swallow on teardown */ }
+            try { _input.Dispose(); } catch { /* swallow on teardown */ }
+            try { _consumeRequest.Dispose(); } catch { /* swallow on teardown */ }
+            try { _consumeDone.Dispose(); } catch { /* swallow on teardown */ }
+            try { _cts.Dispose(); } catch { /* swallow on teardown */ }
+        }
+    }
+
    /// <summary>
    /// Raw <c>byte[]</c> over a long-lived NamedPipe — NO chunk-framing, NO <c>AsyncPipeReaderInput</c>,
    /// NO sliding-window buffer. Calling thread serialises + writes; a long-lived background consumer task
@ -1479,6 +1771,174 @@ public static class Program
        }
    }

+    /// <summary>
+    /// Raw <c>byte[]</c> over an in-memory cross-thread handoff — NO transport (no NamedPipe, no Pipe, no
+    /// <c>Channel&lt;T&gt;</c>). Calling thread serialises into a fresh <c>byte[]</c>, hands it to a
+    /// background consumer task via a single <c>byte[]</c> slot + MRES pair; the consumer deserialises and signals done.
+    ///
+    /// <para><b>Why this benchmark matters</b>: completes the 2x2 transport × wire-format matrix:</para>
+    /// <list type="bullet">
+    ///   <item><description><b>NamedPipe + Chunked</b> = <see cref="AcBinaryNamedPipeBenchmark"/></description></item>
+    ///   <item><description><b>NamedPipe + Raw</b> = <see cref="AcBinaryNamedPipeRawByteArrayBenchmark"/></description></item>
+    ///   <item><description><b>In-memory Pipe + Chunked</b> = <see cref="AcBinaryInMemoryPipeBenchmark"/></description></item>
+    ///   <item><description><b>In-memory + Raw</b> = THIS row — apples-to-apples baseline for the in-memory chunked row</description></item>
+    /// </list>
+    /// <para>Side-by-side with <see cref="AcBinaryInMemoryPipeBenchmark"/> this isolates the chunked-streaming
+    /// framework's pure CPU cost, with the same in-memory transport (zero kernel involvement) on both sides.
+    /// Side-by-side with <see cref="AcBinaryNamedPipeRawByteArrayBenchmark"/> this isolates the kernel-NamedPipe
+    /// overhead on the raw-byte[] side.</para>
+    /// </summary>
+    private sealed class AcBinaryInMemoryRawByteArrayBenchmark : ISerializerBenchmark, IDisposable
+    {
+        private readonly TestOrder _order;
+        private readonly AcBinarySerializerOptions _options;
+        private readonly byte[] _serialized; // for SerializedSize reporting only
+
+        // Long-lived consumer-task infrastructure (Deserialize on BG thread, signaled per iter).
+        // No transport — just a byte[] slot for handoff between calling thread and consumer task.
+        private readonly CancellationTokenSource _cts;
+        private readonly Task _consumerTask;
+        private readonly ManualResetEventSlim _consumeRequest = new(false);
+        private readonly ManualResetEventSlim _consumeDone = new(false);
+        private byte[]? _pendingBytes;             // calling thread → consumer task handoff slot
+        private object? _lastResult;               // captured during VerifyRoundTrip; null in benchmark iters
+        private bool _captureResult;               // true only inside VerifyRoundTrip; tells the consumer to store its result
+        private bool _disposed;                    // makes Dispose idempotent
+
+        public string Engine => EngineAcBinary;
+        public string IoMode => IoInMemoryRaw;
+        public string DispatchMode => _options.UseGeneratedCode ? ModeSGen : ModeRuntime;
+        public string OptionsPreset { get; }
+        public int SerializedSize => _serialized.Length;
+        public long SetupSerializeAllocBytes { get; }
+        public long SetupDeserializeAllocBytes { get; }
+        public bool IsRoundTripOnly => true;
+        public string OptionsDescription => BuildAcBinaryOptionsDescription(_options, $", BufferSize={_options.BufferWriterChunkSize}B, Transport=in-memory(raw,2-task)");
+
+        public AcBinaryInMemoryRawByteArrayBenchmark(TestOrder order, AcBinarySerializerOptions options, string optionsPreset)
+        {
+            _order = order;
+            _options = options;
+            OptionsPreset = optionsPreset;
+
+            _serialized = AcBinarySerializer.Serialize(order, _options);
+
+            // === SERIALIZE-side setup measurement ===
+            // Nothing to set up — calling thread allocates byte[] per iter via AcBinarySerializer.Serialize.
+            SetupSerializeAllocBytes = 0;
+
+            // === DESERIALIZE-side setup measurement ===
+            // Calling-thread allocations of the cancellation source + Task.Run(ConsumerLoop); the 2× MRES are field initialisers (allocated before this window), so they are NOT counted.
+            GC.Collect(); GC.WaitForPendingFinalizers(); GC.Collect();
+            var beforeDes = GC.GetAllocatedBytesForCurrentThread();
+            _cts = new CancellationTokenSource();
+            _consumerTask = Task.Run(ConsumerLoop);
+            var afterDes = GC.GetAllocatedBytesForCurrentThread();
+            SetupDeserializeAllocBytes = afterDes - beforeDes;
+        }
+
+        // BG consumer: parks on _consumeRequest, picks up the byte[] from _pendingBytes, runs Deserialize<T>(bytes),
+        // signals _consumeDone. Direct in-process handoff — no transport syscall, no buffer copy beyond the byte[]
+        // reference itself (zero-copy by reference).
+        private void ConsumerLoop()
+        {
+            var ct = _cts.Token;
+            try
+            {
+                while (true)
+                {
+                    _consumeRequest.Wait(ct);
+                    if (ct.IsCancellationRequested) return;   // woken by Dispose's nudge after cancellation — exit without processing
+                    _consumeRequest.Reset();
+
+                    try
+                    {
+                        var bytes = _pendingBytes;
+                        if (bytes != null)
+                        {
+                            var result = AcBinaryDeserializer.Deserialize<TestOrder>(bytes, _options);
+                            if (_captureResult) _lastResult = result;
+                        }
+                    }
+                    catch
+                    {
+                        // Swallow — see ConsumerLoop in NamedPipe variant for rationale. The finally below still signals _consumeDone, so Serialize() does not hang on a failed deserialise.
+                    }
+                    finally
+                    {
+                        _consumeDone.Set();
+                    }
+                }
+            }
+            catch (OperationCanceledException)
+            {
+                // Cooperative cancel — Dispose path. Swallow.
+            }
+        }
+
+        public void Warmup(int iterations)
+        {
+            for (var i = 0; i < iterations; i++) Serialize();
+        }
+
+        [MethodImpl(MethodImplOptions.NoInlining)]
+        public void Serialize()
+        {
+            // 2-task in-memory pipeline:
+            // 1. Calling thread serialises → fresh byte[] (per-iter alloc, matches AcBinaryBenchmark contract).
+            // 2. Calling thread parks the byte[] into _pendingBytes and signals consumer task. Consumer task
+            //    picks up the reference (zero-copy) and runs Deserialize<T>(bytes).
+            // 3. Calling thread waits for _consumeDone (consumer task finished Des).
+            //
+            // Same architectural limitation as the NamedPipe-raw variant: Des cannot start until full bytes
+            // are available. Only the per-iter Ser↔Des thread-handoff overlaps slightly (calling thread starts
+            // signalling and waiting while consumer thread takes the byte[]).
+            var bytes = AcBinarySerializer.Serialize(_order, _options);
+
+            _pendingBytes = bytes;
+            _consumeDone.Reset();
+            _consumeRequest.Set();
+
+            _consumeDone.Wait();
+        }
+
+        [MethodImpl(MethodImplOptions.NoInlining)]
+        public void Deserialize()
+        {
+            // No-op: per-iter round-trip is captured in Serialize(). See IsRoundTripOnly contract.
+        }
+
+        public bool VerifyRoundTrip()
+        {
+            _captureResult = true;
+            try
+            {
+                Serialize();
+                var result = _lastResult as TestOrder;
+                return result != null && DeepEqualsViaJson(_order, result);
+            }
+            finally
+            {
+                _captureResult = false;
+                _lastResult = null;
+            }
+        }
+
+        public void Dispose()
+        {
+            if (_disposed) return;
+            _disposed = true;
+
+            try { _cts.Cancel(); } catch { /* swallow on teardown */ }
+            try { _consumeRequest.Set(); } catch { /* swallow on teardown — Set() is only a nudge for a parked consumer Wait */ }
+            try { _consumerTask.Wait(TimeSpan.FromSeconds(2)); } catch { /* swallow on teardown; a 2 s timeout just returns false and is ignored */ }
+
+            try { _consumeRequest.Dispose(); } catch { /* swallow on teardown */ }
+            try { _consumeDone.Dispose(); } catch { /* swallow on teardown */ }
+            try { _cts.Dispose(); } catch { /* swallow on teardown */ }
+        }
+    }
+
    /// <summary>
    /// Benchmarks MemoryPack via the IBufferWriter overload, allocating a FRESH ArrayBufferWriter on EVERY call.
    /// Apples-to-apples counterpart to AcBinaryFreshBufferWriterBenchmark.
--- a/AyCode.Core.Tests/Serialization/AcBinarySerializerNamedPipeTests.cs
+++ b/AyCode.Core.Tests/Serialization/AcBinarySerializerNamedPipeTests.cs
@ -14,10 +14,11 @@ namespace AyCode.Core.Tests.Serialization;
 /// the tests own the <see cref="NamedPipeServerStream"/> / <see cref="NamedPipeClientStream"/>
 /// lifecycle directly and call the generic
 /// <see cref="AcBinarySerializer.SerializeChunked{T}(T, PipeWriter, AcBinarySerializerOptions)"/> +
-/// <see cref="AcBinaryDeserializer.DeserializeFromPipeReaderAsync{T}"/> primitives. This proves
-/// the streaming framework works on arbitrary <c>PipeWriter</c>/<c>PipeReader</c> sources
-/// (NamedPipe, FileStream, NetworkStream, custom transports) without per-transport adapters in
-/// the framework.</para>
+/// <see cref="AcBinaryDeserializer.Deserialize{T}(AsyncPipeReaderInput, AcBinarySerializerOptions)"/>
+/// primitives, with the receive-side drain implemented via the test-only
+/// <see cref="AsyncPipeReaderInputExtensions.DrainFromAsync"/> extension. This proves the streaming
+/// framework works on arbitrary <c>PipeWriter</c>/<c>PipeReader</c> sources (NamedPipe, FileStream,
+/// NetworkStream, custom transports) without per-transport adapters in the framework.</para>
 ///
 /// <para>With <c>BufferWriterChunkSize = 256</c>, even small test payloads cross multiple chunk
 /// boundaries on the wire — exercises the real chunking + sliding-window cycling behavior.</para>
@ -104,8 +105,10 @@ public class AcBinarySerializerNamedPipeTests
    /// <summary>
    /// Owns the full NamedPipe lifecycle: binds server, accepts connect, drives the generic
    /// <see cref="AcBinarySerializer.SerializeChunked{T}(T, PipeWriter, AcBinarySerializerOptions)"/> on
-    /// the client side and <see cref="AcBinaryDeserializer.DeserializeFromPipeReaderAsync{T}"/>
-    /// on the server side. The framework helpers know nothing about NamedPipe — only PipeWriter /
+    /// the client side, and on the server side runs the canonical drain+deserialize pair
+    /// (test-only <see cref="AsyncPipeReaderInputExtensions.DrainFromAsync"/> on the calling thread,
+    /// <see cref="AcBinaryDeserializer.Deserialize{T}(AsyncPipeReaderInput, AcBinarySerializerOptions)"/>
+    /// on a Task.Run BG thread). The framework helpers know nothing about NamedPipe — only PipeWriter /
    /// PipeReader.
    /// </summary>
    private static async Task<T?> RunNamedPipeRoundTripAsync<T>(string pipeName, T original, AcBinarySerializerOptions opts)
@ -119,7 +122,12 @@ public class AcBinarySerializerNamedPipeTests
            await pipeServer.WaitForConnectionAsync().ConfigureAwait(false);
            var pipeReader = PipeReader.Create(pipeServer);

-            return await AcBinaryDeserializer.DeserializeFromPipeReaderAsync<T>(pipeReader, opts).ConfigureAwait(false);
+            // Inlined version of what the removed DeserializeFromPipeReaderAsync used to do:
+            // single-message mode + drain on calling thread + deserialize on Task.Run BG.
+            using var input = new AsyncPipeReaderInput(initialCapacity: opts.BufferWriterChunkSize * 2, multiMessage: false);
+            var deserTask = Task.Run(() => AcBinaryDeserializer.Deserialize<T>(input, opts));
+            await input.DrainFromAsync(pipeReader).ConfigureAwait(false);
+            return await deserTask.ConfigureAwait(false);
        });

        await using var pipeClient = new NamedPipeClientStream(".", pipeName, PipeDirection.Out, System.IO.Pipes.PipeOptions.Asynchronous);
--- a/AyCode.Core/Serializers/Binaries/AsyncPipeReaderInputExtensions.cs
+++ b/AyCode.Core/Serializers/Binaries/AsyncPipeReaderInputExtensions.cs
@ -1,18 +1,22 @@
+using AyCode.Core.Serializers.Binaries;
 using System;
 using System.IO.Pipelines;
 using System.Threading;
 using System.Threading.Tasks;

-namespace AyCode.Core.Serializers.Binaries;
+namespace AyCode.Core.Tests.Serialization;

 /// <summary>
-/// Extension methods for populating <see cref="AsyncPipeReaderInput"/> from
-/// <see cref="System.IO.Pipelines.PipeReader"/>-backed transports (NamedPipe, FileStream,
+/// Test/benchmark-only extension methods for populating <see cref="AsyncPipeReaderInput"/>
+/// from <see cref="System.IO.Pipelines.PipeReader"/>-backed transports (NamedPipe, FileStream,
 /// custom pipe sources).
 ///
-/// Lives in a separate file from the core class so <see cref="AsyncPipeReaderInput"/> does not
-/// import <c>System.IO.Pipelines</c> in its primary surface — the optional pull-mode is visible
-/// at use-sites (per ADR-0003 Decision §3 at <c>docs/adr/0003-acbinary-streaming-receive-architecture.md</c>).
+/// <para><b>Why test-only:</b> in real production, the consuming application already has its own
+/// reader-task that reads from the pipe and pushes bytes via <c>AsyncPipeReaderInput.Feed</c>
+/// — providing this drain extension publicly would duplicate that responsibility and confuse
+/// the canonical push-pattern. The extension is kept here for unit-test scaffolding and the
+/// streaming benchmark; production NuGet consumers should write their own drain logic in their
+/// own reader-task following the application's threading model.</para>
 /// </summary>
 public static class AsyncPipeReaderInputExtensions
 {
@ -21,9 +25,9 @@ public static class AsyncPipeReaderInputExtensions
    /// calls <see cref="AsyncPipeReaderInput.Feed"/> on each segment and
    /// <see cref="AsyncPipeReaderInput.Complete"/> when the pipe completes.
    ///
-    /// <para>Typical usage: NamedPipe IPC and FileStream-via-PipeReader transports schedule this
-    /// on a background task while the deserialization context reads from the same input on
-    /// another thread.</para>
+    /// <para>Typical usage (test-only): NamedPipe IPC and FileStream-via-PipeReader transports run
+    /// this drain concurrently with the deserialization context, which reads from the same input
+    /// on another thread or task (either side may be the one scheduled in the background).</para>
    ///
    /// <para><see cref="AsyncPipeReaderInput.Complete"/> is invoked in a <c>finally</c> block —
    /// ensures the consumer always wakes up even if the pipe read throws or the operation is
--- a/AyCode.Core/Serializers/Binaries/AcBinaryDeserializer.cs
+++ b/AyCode.Core/Serializers/Binaries/AcBinaryDeserializer.cs
@ -337,52 +337,6 @@ public static partial class AcBinaryDeserializer
        }
    }

-    /// <summary>
-    /// Deserialize from a <see cref="System.IO.Pipelines.PipeReader"/> with full streaming pipeline
-    /// parallelism — drains the reader on the calling thread, while a background <c>Task.Run</c>
-    /// deserializes incrementally from the same shared <see cref="AsyncPipeReaderInput"/>.
-    ///
-    /// <para>Transport-agnostic: works with any <c>PipeReader</c> source — NamedPipe IPC
-    /// (<c>PipeReader.Create(namedPipeServerStream)</c>), file-stream
-    /// (<c>PipeReader.Create(fileStream)</c>), TCP (<c>PipeReader.Create(networkStream)</c>),
-    /// or custom <c>PipeReader</c> implementations. Reads <b>raw AcBinary bytes</b> verbatim from
-    /// the pipe — no wire-format unwrapping. Pair with the producer-side
-    /// <see cref="AcBinarySerializer.SerializeChunked{T}(T, System.IO.Pipelines.PipeWriter, AcBinarySerializerOptions)"/>
-    /// (or its <see cref="System.IO.Pipelines.Pipe"/> overload), which writes the same raw byte
-    /// stream as <see cref="AcBinarySerializer.Serialize{T}(T, AcBinarySerializerOptions)"/>'s
-    /// <c>byte[]</c> output.</para>
-    ///
-    /// <para>Receive buffer initial capacity is derived from <c>options.BufferWriterChunkSize × 2</c>
-    /// — two-chunks-worth of headroom plus reset-to-0 cycling reuses the same buffer for the
-    /// message's lifetime regardless of total payload size.</para>
-    ///
-    /// <para><b>For the multiplexed wire format</b> (per-chunk <c>[201][UINT16][data]</c> headers,
-    /// produced by <c>SerializeChunkedFramed</c> or SignalR's AsyncSegment mode): the parser
-    /// strips framing on its own (e.g. <c>AcBinaryHubProtocol.TryParseChunkData</c>) and feeds
-    /// only the data bytes here.</para>
-    /// </summary>
-    /// <param name="reader">Source pipe reader. Caller owns lifecycle (creation + completion).</param>
-    /// <param name="options">Serializer options. Defaults to <see cref="AcBinarySerializerOptions.Default"/>.
-    /// <c>BufferWriterChunkSize</c> controls the receive-side initial buffer (× 2 headroom).</param>
-    /// <param name="ct">Cancellation token. For connect-timeout, pass the token of a
-    /// <c>new CancellationTokenSource(timeout)</c>.</param>
-    public static async Task<T?> DeserializeFromPipeReaderAsync<T>(System.IO.Pipelines.PipeReader reader, AcBinarySerializerOptions? options = null, CancellationToken ct = default)
-    {
-        if (reader is null) throw new ArgumentNullException(nameof(reader));
-
-        var opts = options ?? AcBinarySerializerOptions.Default;
-
-        // Single-message mode (multiMessage: false) — bytes drained from the PipeReader are forwarded
-        // verbatim to the deserialization buffer. Pair with AcBinarySerializer.SerializeChunked
-        // (raw byte stream) on the producer side; for multi-message framed wire formats the parser
-        // strips framing upstream and feeds only data bytes here.
-        using var input = new AsyncPipeReaderInput(initialCapacity: opts.BufferWriterChunkSize * 2, multiMessage: false);
-        var deserTask = Task.Run(() => Deserialize<T>(input, opts), ct);
-
-        await input.DrainFromAsync(reader, ct).ConfigureAwait(false);
-        return await deserTask.ConfigureAwait(false);
-    }
-
    /// <summary>
    /// Internal: Deserialize with any TInput (multi-segment or other future input types).
    /// </summary>
--- a/AyCode.Core/Serializers/Binaries/AcBinarySerializer.cs
+++ b/AyCode.Core/Serializers/Binaries/AcBinarySerializer.cs
@ -426,9 +426,10 @@ public static partial class AcBinarySerializer
    /// Serialize to a <see cref="System.IO.Pipelines.Pipe"/> as a chunked stream — pure AcBinary
    /// bytes are written via <see cref="AsyncPipeWriterOutput"/> in raw mode (no per-chunk header).
    /// The output is byte-compatible with <see cref="Serialize{T}(T, AcBinarySerializerOptions)"/>'s
-    /// <c>byte[]</c> result; a consumer can drain <c>pipe.Reader</c> and feed the bytes directly to
-    /// <see cref="AcBinaryDeserializer"/> (or pipe-them through <c>DeserializeFromPipeReaderAsync</c>)
-    /// with no extra parser.
+    /// <c>byte[]</c> result; a consumer drains <c>pipe.Reader</c> in its own reader-task and pushes
+    /// bytes via <see cref="AsyncPipeReaderInput.Feed"/>, then calls
+    /// <see cref="AcBinaryDeserializer.Deserialize{T}(AsyncPipeReaderInput, AcBinarySerializerOptions)"/>
+    /// — no extra parser, no special transport adapter.
    ///
    /// <para><b>Why <see cref="System.IO.Pipelines.Pipe"/> instead of <see cref="System.IO.Pipelines.PipeWriter"/>?</b>
    /// <c>Pipe.Writer</c> is always the BCL <c>PipeWriterImpl</c>, which is parallel-capable
--- a/AyCode.Core/Serializers/Binaries/AsyncPipeReaderInput.cs
+++ b/AyCode.Core/Serializers/Binaries/AsyncPipeReaderInput.cs
@ -24,15 +24,49 @@ namespace AyCode.Core.Serializers.Binaries;
 /// verbatim (matches <c>AcBinarySerializer.SerializeChunked</c> raw output drained from a
 /// <see cref="System.IO.Pipelines.PipeReader"/>); single-message scenario, no auto-reset.</para>
 ///
-/// <para>Usage modes:</para>
+/// <para>Usage: <b>push pattern only</b>. The consumer's reader-task reads bytes from any
+/// underlying transport (the framework knows nothing about which) and pushes them via
+/// <see cref="Feed"/>; a separate consumer thread (or task) calls
+/// <see cref="AcBinaryDeserializer.Deserialize{T}(AsyncPipeReaderInput, AcBinarySerializerOptions)"/>.
+/// The framework does NOT own the transport — the consumer's reader-task does, following the
+/// application's threading model.</para>
+///
+/// <para><b>When chunked-streaming is the right fit</b> (vs raw <c>byte[]</c> /
+/// <see cref="AcBinaryDeserializer.Deserialize{T}(byte[], AcBinarySerializerOptions)"/>):</para>
 /// <list type="bullet">
-///   <item><b>Push (Feed-API)</b>: producer thread calls <see cref="Feed"/> with chunk bytes
-///         (typical for SignalR <c>TryParseChunkData</c>).</item>
-///   <item><b>Pull (DrainFromAsync extension)</b>: helper drains a
-///         <see cref="System.IO.Pipelines.PipeReader"/> into the input via repeated
-///         <see cref="Feed"/> calls (typical for NamedPipe / FileStream / NetworkStream).</item>
+///   <item><b>Network transports</b> — TCP / UDP / WebSocket / SSE / HTTP/2 streams. Per-chunk
+///         CPU overhead (~30 µs / chunk) is invisible next to ms-scale RTT; the streaming
+///         pipeline lets sender, transport, and receiver work in parallel on different parts of
+///         the message.</item>
+///   <item><b>Multi-connection servers</b> — Kestrel-style (SignalR), gRPC servers, custom RPC
+///         hosts. Per-connection peak memory bounded by buffer-size (e.g. 32 KB), not by max
+///         message size — 1000 concurrent connections × 1 MB messages = 32 MB peak (vs 1 GB
+///         with raw <c>byte[]</c>). LOH allocation pressure (≥ 85 KB messages) is also avoided.</item>
+///   <item><b>Message brokers / queues</b> — Kafka / Redis Streams / Azure Service Bus clients
+///         that expose <see cref="System.Buffers.IBufferWriter{T}"/> sinks. Streaming serialize
+///         writes directly into the transport buffer — no intermediate <c>byte[]</c> allocation.</item>
+///   <item><b>File streaming</b> — <c>FileStream</c> behind a
+///         <see cref="System.IO.Pipelines.PipeReader"/>. 100 MB+ payloads from disk with constant
+///         32 KB peak memory.</item>
+///   <item><b>In-memory <see cref="System.IO.Pipelines.Pipe"/> cross-thread handoff</b> —
+///         producer + consumer threads coordinate over a shared <c>Pipe</c>; zero-copy slab handoff.</item>
+///   <item><b>Custom transport adapters</b> — anything where the consumer wants to push bytes
+///         from a transport-specific reader-task.</item>
 /// </list>
 ///
+/// <para><b>When raw <c>byte[]</c> is the right fit</b>: same-process loopback IPC where transport
+/// latency is near zero, single-producer/single-consumer batch operations where peak memory is
+/// not a constraint, sub-LOH messages (&lt; 85 KB) with no GC-pressure concerns. The chunked-streaming
+/// per-chunk CPU overhead is fully visible in these scenarios — raw is faster end-to-end.</para>
+///
+/// <para><b>Performance characteristic</b>: per-chunk overhead is roughly constant (~25-30 µs —
+/// FlushAsync syscall + ReadAsync syscall + framing-parse + sliding-window bookkeeping). Total
+/// chunk-overhead = <c>(messageSize / chunkSize) × ~30 µs</c>. The streaming benefit is pipeline
+/// parallelism + bounded peak memory — both of which require a non-trivial transport stage to
+/// surface (network, file, cross-thread queue). On same-process loopback NamedPipe (the worst-case
+/// benchmark scenario), the per-chunk cost dominates and chunked measures slower than raw — a
+/// property of that near-zero-latency transport, not of production network or file transports.</para>
+///
 /// Backed by a single contiguous <c>byte[]</c> from <see cref="ArrayPool{T}"/>. Positions reset
 /// to 0 when the consumer catches up (sliding-window cycling — peak buffer memory bounded by
 /// chunk size, NOT message size). Grow is the absolute last resort and practically never fires
@ -312,8 +346,8 @@ public sealed class AsyncPipeReaderInput : IBinaryInputBase, IDisposable
    }

    /// <summary>
-    /// Whether <see cref="Complete"/> has been called (or <see cref="DrainFromAsync"/>'s underlying
-    /// stream signalled EOF and the finally block closed the input). Once <c>true</c>, the session
+    /// Whether <see cref="Complete"/> has been called (typically by the consumer's reader-task
+    /// finally-block after the underlying transport signals EOF). Once <c>true</c>, the session
    /// has ended — any pending <see cref="AcBinaryDeserializer.Deserialize{T}(AsyncPipeReaderInput, AcBinarySerializerOptions)"/>
    /// call returns whatever partial buffer is left, and subsequent calls return immediately.
    /// </summary>
--- a/AyCode.Core/Serializers/Binaries/AsyncPipeWriterOutput.cs
+++ b/AyCode.Core/Serializers/Binaries/AsyncPipeWriterOutput.cs
@ -53,7 +53,33 @@ namespace AyCode.Core.Serializers.Binaries;
 /// <see cref="AcBinaryHubProtocol"/> passes 10 s from its options). A <see cref="TimeoutException"/>
 /// propagates to the caller, allowing the connection to abort instead of blocking forever.</para>
 ///
-/// Maximum chunk data size (in framed mode): 65535 bytes (UINT16 max).
+/// <para>Maximum chunk data size (in framed mode): 65535 bytes (UINT16 max).</para>
+///
+/// <para><b>When chunked-streaming is the right fit</b> (vs raw <c>byte[]</c> output via
+/// <see cref="AcBinarySerializer.Serialize{T}(T, AcBinarySerializerOptions)"/>):</para>
+/// <list type="bullet">
+///   <item><b>Network transports</b> — TCP / UDP / WebSocket / SSE / HTTP/2 streams. Per-chunk
+///         CPU overhead is invisible next to ms-scale RTT; pipeline parallelism lets sender,
+///         transport, and receiver work in parallel on different parts of the message.</item>
+///   <item><b>Multi-connection servers</b> — Kestrel-style (SignalR), gRPC / proprietary RPC.
+///         Per-connection peak memory bounded by chunk-size; LOH allocation pressure (≥ 85 KB
+///         messages) is avoided.</item>
+///   <item><b>Message brokers / queues</b> — Kafka / Redis Streams / Service Bus clients exposing
+///         <see cref="System.Buffers.IBufferWriter{T}"/> or <c>PipeWriter</c> sinks. Streaming
+///         serialize writes directly into the transport buffer.</item>
+///   <item><b>File streaming</b> — <c>FileStream</c>-backed <see cref="System.IO.Pipelines.PipeWriter"/>.
+///         100 MB+ payloads to disk with constant peak memory.</item>
+///   <item><b>In-memory cross-thread <see cref="System.IO.Pipelines.Pipe"/></b> — producer thread
+///         serialises while consumer thread deserialises in parallel.</item>
+///   <item><b>Custom transport adapters</b> — anything where the application owns a
+///         <c>PipeWriter</c> or <c>IBufferWriter</c> sink and wants incremental output.</item>
+/// </list>
+///
+/// <para><b>When raw <c>byte[]</c> output is the right fit</b>: same-process loopback IPC where
+/// transport latency is near zero, single batch-style operations where peak memory is not a
+/// constraint, sub-LOH messages (&lt; 85 KB) with no GC-pressure concerns. The chunked-streaming
+/// per-chunk CPU overhead is fully visible in these scenarios — raw is faster end-to-end. Pick the
+/// chunked path only when the transport stage is non-trivial (network, file, cross-thread queue).</para>
 /// </summary>
 public struct AsyncPipeWriterOutput : IBinaryOutputBase
 {
--- a/AyCode.Core/docs/BINARY/BINARY_ASYNCPIPE_TODO.md
+++ b/AyCode.Core/docs/BINARY/BINARY_ASYNCPIPE_TODO.md