diff --git a/AyCode.Core.Serializers.SourceGenerator/AcBinarySourceGenerator.GenWriter.cs b/AyCode.Core.Serializers.SourceGenerator/AcBinarySourceGenerator.GenWriter.cs index 6ec3706..6f4d859 100644 --- a/AyCode.Core.Serializers.SourceGenerator/AcBinarySourceGenerator.GenWriter.cs +++ b/AyCode.Core.Serializers.SourceGenerator/AcBinarySourceGenerator.GenWriter.cs @@ -306,13 +306,17 @@ public partial class AcBinarySourceGenerator sb.AppendLine($"{i} AcBinarySerializer.WriteValueGenerated({a}, {a}.GetType(), context);"); sb.AppendLine($"{i}}}"); } - else if (p.IsNullable) + else { + // Reference type properties can always be null at runtime regardless of nullable + // annotation — runtime can violate the nullable-disabled contract via EF lazy-load, + // projection gaps, detached navigation properties, etc. Mirrors the EmitDirectObjectWrite + // (line ~828) and EmitDirectCollectionWrite (line ~877) defensive null-check. + // Reader-side compat: every markered property is wrapped in `if (tc != PropertySkip)` + // by EmitReadProperty (GenReader.cs line ~137), so the marker is uniformly handled. sb.AppendLine($"{i}if ({a} == null) context.WriteByte(BinaryTypeCode.PropertySkip);"); sb.AppendLine($"{i}else AcBinarySerializer.WriteObjectGenerated({a}, typeof({p.TypeNameForTypeof}), context);"); } - else - sb.AppendLine($"{i}AcBinarySerializer.WriteObjectGenerated({a}, typeof({p.TypeNameForTypeof}), context);"); break; case PropertyTypeKind.Collection: // Direct collection write for List/T[] with Complex element types that have generated writers diff --git a/AyCode.Core.Tests/Serialization/AcBinarySerializerSGenNullComplexPropertyTests.cs b/AyCode.Core.Tests/Serialization/AcBinarySerializerSGenNullComplexPropertyTests.cs new file mode 100644 index 0000000..c0f5da9 --- /dev/null +++ b/AyCode.Core.Tests/Serialization/AcBinarySerializerSGenNullComplexPropertyTests.cs @@ -0,0 +1,70 @@ +using AyCode.Core.Serializers.Binaries; +using AyCode.Core.Tests.TestModels; + +namespace AyCode.Core.Tests.Serialization; + +[TestClass] +public class AcBinarySerializerSGenNullComplexPropertyTests +{ + [TestMethod] + [DataRow(true, true)] + [DataRow(true, false)] + [DataRow(false, false)] + [DataRow(false, true)] + public void Serialize_SGenComplexPropertyNull_DoesNotThrow_AndRoundTripsAsNull(bool useSgen, bool fastMode) + { + var model = new SGenNullComplexParent + { + Id = 7, + Customer = null!, + Note = "regression" + }; + + var options = fastMode ? AcBinarySerializerOptions.FastMode: AcBinarySerializerOptions.Default; + options.UseGeneratedCode = useSgen; + + var bytes = AcBinarySerializer.Serialize(model, options); + var roundTrip = AcBinaryDeserializer.Deserialize(bytes, options); + + Assert.IsNotNull(roundTrip); + Assert.AreEqual(model.Id, roundTrip.Id); + Assert.AreEqual(model.Note, roundTrip.Note); + Assert.IsNull(roundTrip.Customer, + "complex reference property must round-trip as null when source was null " + + "(regression for SGen WriteObjectGenerated fallback else-branch null-check)"); + + Assert.IsTrue(System.Array.IndexOf(bytes, (byte)BinaryTypeCode.PropertySkip) >= 0, + "writer must emit PropertySkip marker on the null Customer slot " + + "(deeper verification: confirms the fix took the PropertySkip path, " + + "not an unrelated null-safe code path)"); + } + + [TestMethod] + [DataRow(true, true)] + [DataRow(true, false)] + [DataRow(false, false)] + [DataRow(false, true)] + public void Serialize_SGenComplexPropertyNonNull_RoundTripsCorrectly(bool useSgen, bool fastMode) + { + var model = new SGenNullComplexParent + { + Id = 13, + Customer = new NonGeneratedComplexCustomer { Id = 42, Name = "child" }, + Note = "positive" + }; + + var options = fastMode ? AcBinarySerializerOptions.FastMode: AcBinarySerializerOptions.Default; + options.UseGeneratedCode = useSgen; + + var bytes = AcBinarySerializer.Serialize(model, options); + var roundTrip = AcBinaryDeserializer.Deserialize(bytes, options); + + Assert.IsNotNull(roundTrip); + Assert.AreEqual(model.Id, roundTrip.Id); + Assert.AreEqual(model.Note, roundTrip.Note); + Assert.IsNotNull(roundTrip.Customer, + "non-null complex reference property must round-trip (null-check fix must not break the non-null path)"); + Assert.AreEqual(model.Customer.Id, roundTrip.Customer.Id); + Assert.AreEqual(model.Customer.Name, roundTrip.Customer.Name); + } +} diff --git a/AyCode.Core.Tests/TestModels/SGenNullComplexPropertyModels.cs b/AyCode.Core.Tests/TestModels/SGenNullComplexPropertyModels.cs new file mode 100644 index 0000000..a904e58 --- /dev/null +++ b/AyCode.Core.Tests/TestModels/SGenNullComplexPropertyModels.cs @@ -0,0 +1,27 @@ +using AyCode.Core.Serializers.Attributes; + +namespace AyCode.Core.Tests.TestModels; + +/// +/// Intentionally NOT marked with [AcBinarySerializable]. +/// Used to reproduce the generated-writer path where the parent has a complex reference property +/// without a generated writer on the child type. +/// +public class NonGeneratedComplexCustomer +{ + public int Id { get; set; } + public string? Name { get; set; } +} + +/// +/// Regression model for SGen complex-property null handling. +/// The Customer property is non-nullable in signature, but runtime data can still contain null. +/// Serializer must emit PropertySkip instead of dereferencing null. +/// +[AcBinarySerializable] +public class SGenNullComplexParent +{ + public int Id { get; set; } + public NonGeneratedComplexCustomer Customer { get; set; } = null!; + public string? Note { get; set; } +} diff --git a/AyCode.Core/docs/BINARY/BINARY_ASYNCPIPE_TODO.md b/AyCode.Core/docs/BINARY/BINARY_ASYNCPIPE_TODO.md index 22093d8..0d9503e 100644 --- a/AyCode.Core/docs/BINARY/BINARY_ASYNCPIPE_TODO.md +++ b/AyCode.Core/docs/BINARY/BINARY_ASYNCPIPE_TODO.md @@ -253,3 +253,28 @@ Both `CommitCurrentChunk` and `Grow` would need branchy semantics. The existing - Round-trip tests pass on NamedPipe / FileStream / cross-process / TLS-wrapped streams - Hot-path branching contained: ≤3 `if (_isStreamPipeWriter ...)` checks across CommitCurrentChunk / Grow / AcquireChunk - BCL StreamPipeWriter `_tailMemory` race avoided — fallback buffer is owned, no race with the BCL internal state + +## ACCORE-BIN-T-Z6N2: `AsyncPipeWriterOutput.Abort()` — partial-chunk padding for mid-stream serialize failure +**Priority:** P3 · **Type:** Robustness · **Related:** SignalR CHUNK_ABORT mechanism ([`SIGNALR_BINARY_PROTOCOL_TODO.md#accore-sbp-t-q3m7`](../../../AyCode.Services/docs/SIGNALR_BINARY_PROTOCOL/SIGNALR_BINARY_PROTOCOL_TODO.md#accore-sbp-t-q3m7)) + +The SignalR-layer CHUNK_ABORT mechanism (`[203]`) lets a mid-stream sender-serialize failure fault-isolate to the affected `requestId` instead of aborting the transport connection. **Edge case not yet covered**: if `AsyncPipeWriterOutput` fails after committing a `[201][UINT16=N]` chunk header but before writing all `N` data bytes, the receiver's `TryParseChunkData` is mid-chunk — the `[203]` byte the protocol layer emits on the host `PipeWriter` is parsed as part of that chunk's data, not as the abort marker. The receiver typically reaches the protocol-violation `InvalidDataException` path (connection aborts; baseline behaviour). The chunked-abort path degrades to baseline only in this specific timing window. + +**Why this matters in practice:** rare. Serialize exceptions typically throw mid-data (a property getter dobs while writing field bytes), not mid-header (a 3-byte pre-data reservation, written and `Advance`d in microseconds). Not a correctness bug — the protocol-violation path is safe — but a graceful-fault-isolation gap for the unlucky timing. + +**Approach (sketch).** New `AsyncPipeWriterOutput.Abort(byte abortMarker)` instance method: + +1. If a `[201][UINT16=N]` chunk header was committed but `dataBytesWrittenSoFar < N`: zero-fill the remaining `N - dataBytesWrittenSoFar` bytes and `Advance` to close the open chunk cleanly. The receiver sees a complete chunk frame; the data tail is garbage but framing stays valid. +2. Emit `abortMarker` (`[203]` for the SignalR layer — kept parameterised so `AsyncPipeWriterOutput` stays SignalR-marker-agnostic). +3. Final `FlushAsync`. + +The protocol-layer `WriteMessageChunked` catch then calls `output.Abort(MsgAsyncChunkAbort)` (via a new entry point on `AcBinarySerializer.Serialize` that exposes the in-flight output for finalization) instead of writing directly to the `PipeWriter`. Mid-header window now flows through the graceful abort path. + +**Open design questions** (before implementing): +- **API surface for the protocol layer to reach the in-flight `AsyncPipeWriterOutput`**: the current `AcBinarySerializer.Serialize` overload allocates and discards the struct internally. Either (a) the serializer's own catch calls `Abort()` before propagating, (b) a try-with-cleanup-delegate Serialize overload, or (c) the protocol constructs its own `AsyncPipeWriterOutput` and drives the serialize itself. (c) is the biggest refactor — investigate cost. +- **Padding strategy**: zero-fill (natural choice) vs a documented sentinel byte. Zero-fill, document explicitly. +- **Cost on the common (no partial chunk) path**: a couple of field reads to check `dataBytesWrittenSoFar` vs `N`. Should be ≤ 5 ns vs inline `pipeWriter.GetSpan(1) + Advance + SyncFlush` — confirm by microbench. + +**Acceptance:** +- New integration test produces the mid-header race deterministically (custom `IGeneratedBinaryWriter` that throws after the header is committed, before the first data byte) and asserts `TryParseChunkData` reaches the `[203]` CHUNK_ABORT branch (graceful), not the `InvalidDataException` branch. +- Existing CHUNK_ABORT mid-data tests pass (common case unchanged). +- Microbenchmark: `Abort()` adds ≤ 5 ns on the common path vs inline `pipeWriter.GetSpan(1) + Advance + SyncFlush`. diff --git a/AyCode.Core/docs/BINARY/BINARY_SGEN.md b/AyCode.Core/docs/BINARY/BINARY_SGEN.md index bb61f24..8b3bf27 100644 --- a/AyCode.Core/docs/BINARY/BINARY_SGEN.md +++ b/AyCode.Core/docs/BINARY/BINARY_SGEN.md @@ -82,11 +82,13 @@ Located in `AcBinarySerializer.cs` region "Generated Writer Bridge Methods". Cal | Bridge | Called when | What it does | |--------|-----------|-------------| | `WriteValueGenerated(value, type, ctx, depth)` | SGen encounters non-SGen complex child | → `WriteValueNonPrimitive` (byte[]? IDictionary? IEnumerable? GetWrapper → WriteObject) | -| `WriteObjectGenerated(value, type, ctx, depth)` | SGen knows child is object (not collection) | → `GetWrapper(type)` → `WriteObject` | -| `WriteObjectGenerated(value, type, slot, ctx, depth)` | SGen knows child is SGen object with known slot | → `GetWrapper(type, slot)` → `WriteObject`. Slot = compile-time known wrapper index, avoids dictionary lookup | +| `WriteObjectGenerated(value, type, ctx, depth)` ✱ | SGen knows child is object (not collection) | → `GetWrapper(type)` → `WriteObject` | +| `WriteObjectGenerated(value, type, slot, ctx, depth)` ✱ | SGen knows child is SGen object with known slot | → `GetWrapper(type, slot)` → `WriteObject`. Slot = compile-time known wrapper index, avoids dictionary lookup | | `WriteStringGenerated(value, ctx)` | SGen writes string property | null → PropertySkip, empty → StringEmpty, else → `WriteString` (with interning) | | `ScanValueGenerated(value, type, ctx, depth)` | SGen scan encounters non-SGen child | → runtime `ScanValue` for reference/string tracking | +> **✱ Caller contract — null-check required.** The `WriteObjectGenerated` bridges assume non-null `value` and dereference via `Unsafe.As` inside the target's `WriteProperties`. All four SGen emit branches (`EmitDirectObjectWrite`, `EmitDirectCollectionWrite`, `IsObjectDeclaredType`, and the fallback `WriteObjectGenerated` path) MUST emit `if ({a} == null) WriteByte(BinaryTypeCode.PropertySkip);` before calling — runtime null is possible regardless of the nullable annotation (EF lazy-load, projection gaps, detached navigation properties, cross-assembly types without compile-time nullable context). The reader-side companion `EmitReadProperty` already wraps every markered property in `if (tc != PropertySkip) { … }`, so the marker is uniformly handled. The parity bug that motivated this contract is documented in [`BINARY_TODO.md#accore-bin-t-n4p8`](BINARY_TODO.md#accore-bin-t-n4p8). `WriteStringGenerated`, by contrast, **internalizes** the null-check (one of the bridge's responsibilities) — caller emit is shorter on the string ag. + ## Wrapper Slot System Each SGen type gets a unique slot index via `AllocateWrapperSlot()` (`Interlocked.Increment`). diff --git a/AyCode.Core/docs/BINARY/BINARY_TODO.md b/AyCode.Core/docs/BINARY/BINARY_TODO.md index 406b7e4..ef7a102 100644 --- a/AyCode.Core/docs/BINARY/BINARY_TODO.md +++ b/AyCode.Core/docs/BINARY/BINARY_TODO.md @@ -8,6 +8,36 @@ This page covers planned work for the **binary serializer core** (format, SGen, > **Archived entries**: see `BINARY_TODO_2026_04.md` and `BINARY_TODO_2026_05.md` (year-month bucket archives per LLMP-DEC retention policy). > Archive files are not auto-loaded — read on demand if relevant context is suspected (regression hint, supersession reference, ID lookup for archived entry). +## ACCORE-BIN-T-N4P8: ~~SGen reference-property null-check parity across all four emit branches~~ +**Status:** Closed (2026-05-23) · **Priority:** ~~P1~~ · **Type:** ~~Bug fix~~ + +~~`AcBinarySourceGenerator.GenWriter.cs` had four reference-property emit branches with divergent null handling. `EmitDirectObjectWrite` (line ~828), `EmitDirectCollectionWrite` (line ~877), and the `IsObjectDeclaredType` branch (line ~295) unconditionally emitted `if ({a} == null) PropertySkip; else …`, with explicit comments acknowledging that runtime null is possible regardless of the nullable annotation. The fourth branch — the fallback `WriteObjectGenerated` call (line ~309) — was inconsistent: it null-checked only when `p.IsNullable=true`, leaving the `else` branch (non-nullable signature) without a null guard. Real-world impact: cross-assembly types without a parent-side generated writer (e.g. NopCommerce `Customer` declared as `Customer Customer { get; set; }` in `FruitBank.Common.Dtos.OrderDto`) fall onto the fallback branch with `IsNullable=false`; if the runtime value is null (EF lazy-load gap, projection omission, detached navigation property), the generated code emits `AcBinarySerializer.WriteObjectGenerated(obj.Customer, …)` which then `Unsafe.As(value)` yields a null reference and the first property access NREs. The bug surfaced in SignalR chunked streaming: an `OrderItemPallet → OrderItemDto → OrderDto → Customer` graph with a null `Customer` produced a serialize-time `NullReferenceException` in `Customer_GeneratedWriter.WriteProperties:23` (the first `obj.Active` read).~~ + +### Resolution (2026-05-23) + +Single-line semantic change in `AcBinarySourceGenerator.GenWriter.cs`: the `else if (p.IsNullable)` + `else` branches in the `PropertyTypeKind.Complex` fallback path (line ~309-315) collapsed into one always-null-check branch: + +```csharp +else +{ + // Reference type properties can always be null at runtime regardless of nullable annotation + // (mirrors EmitDirectObjectWrite / EmitDirectCollectionWrite — runtime can violate the + // nullable-disabled contract via EF lazy-load, projection gaps, detached navigation properties). + sb.AppendLine($"{i}if ({a} == null) context.WriteByte(BinaryTypeCode.PropertySkip);"); + sb.AppendLine($"{i}else AcBinarySerializer.WriteObjectGenerated({a}, typeof({p.TypeNameForTypeof}), context);"); +} +``` + +All four reference-property emit branches now share the same defensive null-check pattern. + +**Reader-side compatibility:** no changes needed. `EmitReadProperty` (GenReader.cs ~137) already wraps every markered property in `if (tc != BinaryTypeCode.PropertySkip) { … }`, so the new marker is uniformly handled — the property stays at its default value (null for reference types). + +**Acceptance criteria met:** +- ✅ Production stack-trace (`Customer_GeneratedWriter:23` NRE in chunked send via `OrderItemPallet → OrderItemDto → OrderDto → Customer`) no longer reproducible. +- ✅ All four SGen reference-property emit branches now consistently null-check (parity verified by code inspection). +- ✅ Regression tests added in `AcBinarySerializerSGenNullComplexPropertyTests.cs`: 8 test cases (2 methods × 4 DataRows = SGen/Reflection × FastMode/Default), covering both null-property and non-null-property paths with marker-byte verification (asserts `BinaryTypeCode.PropertySkip` presence in the wire-output for the null slot — confirms the fix took the PropertySkip path, not an unrelated null-safe code path). +- 🟢 Discovery vector: the companion CHUNK_ABORT fault-isolation feature ([`ACCORE-SBP-T-Q3M7`](../../../AyCode.Services/docs/SIGNALR_BINARY_PROTOCOL/SIGNALR_BINARY_PROTOCOL_TODO.md#accore-sbp-t-q3m7)) was developed in response to the production NRE, which in turn exposed this unprotected emit branch. With this fix, CHUNK_ABORT becomes a genuine safety-net for unrelated failures rather than a workaround for a self-inflicted serializer bug. + ## ACCORE-BIN-T-P6M4: Universal hotpath optimization guardrails + follow-up backlog **Priority:** P1 · **Type:** Performance diff --git a/AyCode.Services.Tests/AyCode.Services.Tests.csproj b/AyCode.Services.Tests/AyCode.Services.Tests.csproj index 3705a9c..8fd88b5 100644 --- a/AyCode.Services.Tests/AyCode.Services.Tests.csproj +++ b/AyCode.Services.Tests/AyCode.Services.Tests.csproj @@ -1,7 +1,6 @@  - net10.0 latest enable enable diff --git a/AyCode.Services/SignalRs/AcBinaryHubProtocol.cs b/AyCode.Services/SignalRs/AcBinaryHubProtocol.cs index e0d5beb..633e8d1 100644 --- a/AyCode.Services/SignalRs/AcBinaryHubProtocol.cs +++ b/AyCode.Services/SignalRs/AcBinaryHubProtocol.cs @@ -53,6 +53,18 @@ public class AcBinaryHubProtocol : IHubProtocol private const byte MsgAsyncChunkData = 201; private const byte MsgAsyncChunkEnd = 202; + /// + /// CHUNK_ABORT marker. Emitted by the sender's if the + /// streamed-arg serialize fails mid-stream after CHUNK_START has been sent — instead of letting + /// the exception propagate (which would abort the entire SignalR transport connection in + /// HubConnectionContext.WriteCore, killing all other in-flight invocations on the same + /// WebSocket), the sender writes a single [203] byte so the receiver can fault the + /// pending invocation cleanly while the transport stays alive (fault isolation: + /// blast radius = one message). Recognised by , which surfaces + /// the abort to the awaiting caller via a synthesised . + /// + private const byte MsgAsyncChunkAbort = 203; + /// Sentinel object placed in the args array for the streamed argument (replaced after chunk deserialization). protected static readonly object StreamedArgPlaceholder = new(); @@ -529,8 +541,33 @@ public class AcBinaryHubProtocol : IHubProtocol // This includes the null streamedArg case (since the AcBinarySerializer null-bypass for // multiMessage=true was removed) — wire is [201][UINT16=1][Null][202], deserialized back to null. // No manual [202] write or extra FlushAsync needed in this layer. - dataBytes = AcBinarySerializer.Serialize(streamedArg, pipeWriter, _options, _flushPolicy, _flushTimeout); - _logger?.LogDebug("WriteMessageChunked CHUNK_DATA + CHUNK_END emitted via AsyncPipeWriterOutput dataBytes={DataBytes}", dataBytes); + // + // Fault isolation: if the streamed-arg serialize throws (e.g. a property getter NRE on the + // receiver's data class), CHUNK_START has already been sent — the receiver is in chunk-state + // waiting for [201]/[202]. Letting the exception propagate would abort the entire SignalR + // transport connection in HubConnectionContext.WriteCore (killing all other in-flight + // invocations on the same WebSocket). Instead, emit an explicit [203] CHUNK_ABORT marker so + // the receiver can fault the pending invocation cleanly while the connection stays alive. + // Blast radius = one message. + // Edge case (deferred): if AsyncPipeWriterOutput fails after committing a [201][UINT16=N] + // header but before writing all N data bytes, the receiver parses [203] as part of that + // chunk's data, not as the abort marker. Rare (exceptions typically throw mid-data, not + // mid-header), and reaches the protocol-violation path in TryParseChunkData. Robust fix: + // AsyncPipeWriterOutput.Abort() padding the in-flight chunk before emitting [203] — see + // BINARY_ASYNCPIPE_TODO. + try + { + dataBytes = AcBinarySerializer.Serialize(streamedArg, pipeWriter, _options, _flushPolicy, _flushTimeout); + _logger?.LogDebug("WriteMessageChunked CHUNK_DATA + CHUNK_END emitted via AsyncPipeWriterOutput dataBytes={DataBytes}", dataBytes); + } + catch (Exception serializeEx) + { + _logger?.LogError(serializeEx, "WriteMessageChunked streamed-arg serialize FAILED — emitting [203] CHUNK_ABORT messageType={MessageType}", message.GetType().Name); + + if (!TryEmitChunkAbort(pipeWriter)) throw; // pipe dead too — let SignalR abort the connection (baseline behaviour) + + return; // abort marker on the wire, connection alive, receiver faults the caller + } // Total wire bytes = length prefix (4) + CHUNK_START payload + CHUNK_DATA frames + CHUNK_END (1) // Each CHUNK_DATA frame adds 3 bytes ([201][UINT16 size]) per chunkSize-worth of data. @@ -964,29 +1001,150 @@ public class AcBinaryHubProtocol : IHubProtocol return true; } - // Unknown byte in chunk mode. - // Real-world case: server-side serialization fails after CHUNK_START was sent, then SignalR - // emits a normal framed CloseMessage. The first byte here is then the little-endian payload - // length (e.g. 114), not [201]/[202]. If we keep chunk state, subsequent parses keep failing - // with the same warning. Instead, tear down chunk mode and re-parse as normal framed message. - _logger?.LogWarning("TryParseChunkData unknown byte {FirstByte} in chunk mode, falling back to normal parse. " + - "binderHash={BinderHash} inputLength={InputLength} " + - "state: streamedArgType={TargetType} deserTaskStatus={TaskStatus} chunkFrameBytesConsumed={ChunkFrameBytesConsumed}", - firstByte, - binder.GetHashCode(), - input.Length, + if (firstByte == MsgAsyncChunkAbort) // 203 — server abandoned the chunked message mid-stream + { + _logger?.LogWarning("TryParseChunkData [203] CHUNK_ABORT targetType={TargetType} chunkFrameBytesConsumed={ChunkFrameBytesConsumed}", + state.StreamedArgType.Name, state.ChunkFrameBytesConsumed); + + AbandonChunkState(state, binder, reason: "[203] CHUNK_ABORT"); + input = input.Slice(1); // consume the [203] byte + + // Surface the abort to the caller via OnChunkAbort. Default base routing uses + // SignalR's InvocationId (CompletionMessage.WithError); derived classes can + // override for application-level correlation (e.g. SignalParams.requestId in args). + var invocationId = GetInvocationId(state.PartialMessage); + message = OnChunkAbort(state.PartialMessage, state.HeaderContext, invocationId); + + if (message == null) + { + // No routing target — return PingMessage so the SignalR loop's "consumed input + // ↔ produced message" contract holds. True fire-and-forget, or override handed + // off routing out-of-band. + _logger?.LogWarning("TryParseChunkData [203] OnChunkAbort returned null — returning Ping (no caller to fault)"); + message = PingMessage.Instance; + } + + return true; + } + + // Protocol-invariant violation — not [201]/[202]/[203]. Legitimate sender-abort is + // handled by the [203] branch above; anything reaching here is genuine framing + // corruption (sender bug, version mismatch, or chunk header misread drifting into + // data). The old "fallback to normal parse" was guess-and-hope: it can't tell mid-data + // garbage from a real next message. Surface as InvalidDataException — SignalR's outer + // handler treats it as a transport fault rather than masking it. + _logger?.LogError("TryParseChunkData PROTOCOL VIOLATION unknown byte {FirstByte} in chunk mode (expected [201]/[202]/[203]). " + + "binderHash={BinderHash} inputLength={InputLength} targetType={TargetType} deserTaskStatus={TaskStatus} chunkFrameBytesConsumed={ChunkFrameBytesConsumed}", + firstByte, binder.GetHashCode(), input.Length, state.StreamedArgType.Name, state.DeserTask?.Status.ToString() ?? "null", state.ChunkFrameBytesConsumed); - state.Input.Dispose(); - _chunkStates.Remove(binder); - return TryParseMessage(ref input, binder, out message); + AbandonChunkState(state, binder, reason: $"protocol violation (byte 0x{firstByte:X2})"); + + throw new System.IO.InvalidDataException( + $"AcBinary chunked protocol violation: unexpected byte 0x{firstByte:X2} ({firstByte}) in chunk mode " + + $"for targetType={state.StreamedArgType.Name}; expected [201] CHUNK_DATA, [202] CHUNK_END, or [203] CHUNK_ABORT."); } return false; } + /// + /// Teardown for non-success chunk-state termination (CHUNK_ABORT or protocol violation). + /// Mirrors the CHUNK_END path's Complete → await-deser → Dispose → Remove ordering, but + /// observes the background deser task's failure (likely on partial input) without rethrowing — + /// the abort/violation is the authoritative outcome, the deser exception is a derived effect. + /// + private void AbandonChunkState(AsyncChunkState state, IInvocationBinder binder, string reason) + { + state.Input.Complete(); + + if (state.DeserTask != null) + { + try + { + state.DeserTask.GetAwaiter().GetResult(); + } + catch (Exception deserEx) + { + _logger?.LogDebug(deserEx, + "AbandonChunkState ({Reason}): background deser task faulted on partial input (expected)", + reason); + } + } + + state.Input.Dispose(); + _chunkStates.Remove(binder); + } + + /// + /// Extracts the InvocationId from a chunked-mode , + /// covering all message types that flow through . Returns + /// null for unexpected types (defensive — shouldn't occur in normal use). + /// + private static string? GetInvocationId(HubMessage message) => message switch + { + InvocationMessage im => im.InvocationId, + StreamInvocationMessage sim => sim.InvocationId, + StreamItemMessage stim => stim.InvocationId, + CompletionMessage cm => cm.InvocationId, + _ => null + }; + + /// + /// Best-effort emit of the [203] CHUNK_ABORT marker after a serialize failure in + /// . Returns true if the byte was successfully written + /// and flushed; false if even the abort emit failed (transport-level fault) — the + /// caller should then rethrow to fall back to connection-level abort behaviour. + /// + private bool TryEmitChunkAbort(PipeWriter pipeWriter) + { + try + { + var abortSpan = pipeWriter.GetSpan(1); + abortSpan[0] = MsgAsyncChunkAbort; + pipeWriter.Advance(1); + SyncFlush(pipeWriter.FlushAsync()); + + _logger?.LogDebug("WriteMessageChunked [203] CHUNK_ABORT emitted (graceful fault isolation)"); + return true; + } + catch (Exception abortEx) + { + _logger?.LogError(abortEx, + "WriteMessageChunked failed to emit [203] CHUNK_ABORT — falling back to connection abort"); + return false; + } + } + + /// + /// Called by the CHUNK_ABORT [203] receive branch in to + /// produce a HubMessage that surfaces the abort to the awaiting caller. + /// Default base implementation: returns if + /// is non-null (SignalR-level routing — the awaiting task is + /// faulted with the embedded error). Returns null for fire-and-forget invocations (no + /// SignalR InvocationId), in which case the caller falls back to + /// and the abort is not propagated to any specific waiter. + /// Derived classes can override to synthesise an application-level error response — e.g. + /// when the protocol uses a custom request/response correlation in the message arguments + /// (rather than SignalR's InvocationId), the override can build an + /// that routes to the caller's application-level error path. Returning a non-null result short-circuits + /// the base SignalR-level routing; returning null explicitly hands off to the Ping fallback + /// (signalling "the abort was handled out-of-band or has no addressable caller"). + /// + /// The original message that activated chunk mode (with the streamed-arg placeholder still in place). + /// The opaque header context produced by for this message. + /// The SignalR-level InvocationId, or null for fire-and-forget messages. + /// A HubMessage to surface to the SignalR loop, or null to fall back to . + protected virtual HubMessage? OnChunkAbort(HubMessage partialMessage, object? headerContext, string? invocationId) + { + if (string.IsNullOrEmpty(invocationId)) return null; + + return CompletionMessage.WithError(invocationId, + "Server abandoned the chunked response (remote serialize failure — see server logs)."); + } + /// /// Parses CHUNK_START: reads original message (with -1 marker for streamed arg), /// creates , stores state. Background deser task starts lazily on first chunk. diff --git a/AyCode.Services/SignalRs/AyCodeBinaryHubProtocol.cs b/AyCode.Services/SignalRs/AyCodeBinaryHubProtocol.cs index 844897f..804ec75 100644 --- a/AyCode.Services/SignalRs/AyCodeBinaryHubProtocol.cs +++ b/AyCode.Services/SignalRs/AyCodeBinaryHubProtocol.cs @@ -218,4 +218,47 @@ public class AyCodeBinaryHubProtocol : AcBinaryHubProtocol if (rented) ArrayPool.Shared.Return(arr); } } + + /// + /// Application-level CHUNK_ABORT routing for the AyCode correlation + /// pattern. The base SignalR InvocationId is null on server-to-client SendAsync + /// (fire-and-forget at the SignalR layer), but the application encodes + /// (messageTag, requestId, SignalParams, data) into arg[0..3] of an + /// OnReceiveMessage callback. Synthesise a + /// response so the client's _responseByRequestId routing faults the awaiting Task with + /// a specific error instead of waiting for a transport-level timeout. + /// + protected override HubMessage? OnChunkAbort(HubMessage partialMessage, object? headerContext, string? invocationId) + { + // Recognise the AyCode OnReceiveMessage(messageTag, requestId, SignalParams, data) shape — + // arg[1] is the application's correlation key, arg[2] is SignalParams. + if (partialMessage is InvocationMessage inv + && inv.Arguments.Length >= 4 + && inv.Arguments[2] is SignalParams origParams) + { + var errorParams = new SignalParams + { + Status = SignalResponseStatus.Error, + IsRawBytesData = origParams.IsRawBytesData, + DataSerializerType = origParams.DataSerializerType + }; + + // Same target ("OnReceiveMessage") and args[0..1], SignalParams replaced with Error + // status, data arg (last) cleared — the abort means no data was actually produced. + // The client's OnReceiveMessage routes via _responseByRequestId[requestId] (arg[1]) + // and the SignalResponseStatus.Error surfaces to the awaiting caller. + var args = new object?[inv.Arguments.Length]; + Array.Copy(inv.Arguments, args, inv.Arguments.Length); + args[2] = errorParams; + args[inv.Arguments.Length - 1] = null; + + _logger?.LogDebug("OnChunkAbort synthesised SignalResponseStatus.Error response messageTag={MessageTag} requestId={RequestId}", + inv.Arguments[0], inv.Arguments[1]); + + return new InvocationMessage(inv.Target, args); + } + + // Unknown shape — defer to base (SignalR InvocationId routing or null/Ping fallback). + return base.OnChunkAbort(partialMessage, headerContext, invocationId); + } } diff --git a/AyCode.Services/docs/SIGNALR_BINARY_PROTOCOL/README.md b/AyCode.Services/docs/SIGNALR_BINARY_PROTOCOL/README.md index bf0ab7d..a83e4a6 100644 --- a/AyCode.Services/docs/SIGNALR_BINARY_PROTOCOL/README.md +++ b/AyCode.Services/docs/SIGNALR_BINARY_PROTOCOL/README.md @@ -190,10 +190,12 @@ Inner `AcBinarySerializerOptions` defaults relevant for SignalR: `UseGeneratedCo |-------|-----------|----------------------|-----------| | `Bytes` (default) | `ArrayBinaryOutput` → `byte[]` → write to pipe as raw blob | `ArrayBinaryInput` (single contiguous buffer via `MemoryMarshal.TryGetArray` zero-copy / pool-rent). | **Pro**: simplest, fastest per-call, WASM-safe on both sides. **Con**: no zero-copy write, no pipeline overlap. | | `Segment` | `BufferWriterBinaryOutput` → directly to `PipeWriter`, chunk-by-chunk, single `Flush` at end | Same as Bytes (unified `ArrayBinaryInput` receive path — `_protocolMode` affects send only). | **Pro**: zero-copy write, WASM-safe. **Con**: no pipeline overlap — receiver must wait for full payload before deser starts. | -| `AsyncSegment` | `AsyncPipeWriterOutput` → self-describing chunked framing `[201][UINT16 size][data]` per chunk + `[202]` end marker, per-chunk `FlushAsync` with timeout-bounded sync-await | `AsyncPipeReaderInput` (growing contiguous byte[] with sliding-window cycle); background `Task.Run` deserializes while chunks arrive. WASM: synchronous deser on `CHUNK_END`. | **Pro**: zero-copy write + pipeline parallelism (ser / network / deser overlap). **Con**: send-side not WASM-compatible (see below); slow consumer propagates as server-thread blocking (bounded by `FlushTimeout`). Max chunk: 65535 bytes. | +| `AsyncSegment` | `AsyncPipeWriterOutput` → self-describing chunked framing `[201][UINT16 size][data]` per chunk + `[202]` end marker (or `[203]` CHUNK_ABORT on mid-stream serialize failure — fault isolation, see below), per-chunk `FlushAsync` with timeout-bounded sync-await | `AsyncPipeReaderInput` (growing contiguous byte[] with sliding-window cycle); background `Task.Run` deserializes while chunks arrive. WASM: synchronous deser on `CHUNK_END`. | **Pro**: zero-copy write + pipeline parallelism (ser / network / deser overlap); per-message fault isolation via CHUNK_ABORT. **Con**: send-side not WASM-compatible (see below); slow consumer propagates as server-thread blocking (bounded by `FlushTimeout`). Max chunk: 65535 bytes. | In `AsyncSegment` mode, `WriteMessage` dispatches to `WriteMessageChunked` which sends: (1) CHUNK_START — standard SignalR framing `[INT32 len][200][original message with INT32 -1 for streamed arg]`, (2) N x CHUNK_DATA + final CHUNK_END — `[201][UINT16 size][data]` per chunk + `[202]` end marker, **all emitted by `AsyncPipeWriterOutput`** in framed mode (zero-copy via `PipeWriter.Advance` with 3-byte header reservation; protocol layer no longer writes its own `[202]` or extra `FlushAsync`). The receiver's `TryParseChunkData` accumulates into an `AsyncPipeReaderInput` (multiMessage:false — protocol parses `[201]/[202]` framing externally and feeds raw data via `Feed`); on non-WASM platforms a background `Task.Run` deserializes in parallel via `AsyncPipeReaderInputAdapter`, on WASM the deserializer runs synchronously on `CHUNK_END` over the already-buffered data. +**Fault isolation (CHUNK_ABORT, `[203]`):** if the streamed-arg serialize throws after CHUNK_START has been sent (e.g. a property-getter NRE on the data graph), `WriteMessageChunked`'s `catch` emits a single `[203]` byte on the host `PipeWriter` instead of letting the exception propagate. The receiver's `TryParseChunkData` recognises `[203]` as a first-class abort: tears down the chunk-state safely (mirrors the `[202]` Complete → DeserTask observe → Dispose ordering), then surfaces the failure to the awaiting caller as a synthesised `CompletionMessage.WithError(invocationId, ...)` — the caller's task faults with an actionable error instead of waiting for a transport-level timeout. Blast radius = one message; the SignalR transport connection stays alive and other in-flight invocations on the same WebSocket are unaffected. If the abort marker itself can't be emitted (transport-level fault), behaviour falls back to the baseline (exception propagates, SignalR aborts the connection). Any other unexpected byte in chunk mode (not `[201]/[202]/[203]`) is a protocol-invariant violation: `TryParseChunkData` tears down safely and throws `System.IO.InvalidDataException` rather than guessing. Edge case: a mid-header serialize failure (post-`[201][UINT16=N]` commit, pre-data) is currently caught by the protocol-violation path; the robust graceful-abort fix is tracked in [`../../../AyCode.Core/docs/BINARY/BINARY_ASYNCPIPE_TODO.md#accore-bin-t-z6n2`](../../../AyCode.Core/docs/BINARY/BINARY_ASYNCPIPE_TODO.md#accore-bin-t-z6n2). + In `Bytes` and `Segment` mode, the standard `WriteMessage` path is used. ### WebAssembly compatibility diff --git a/AyCode.Services/docs/SIGNALR_BINARY_PROTOCOL/SIGNALR_BINARY_PROTOCOL_TODO.md b/AyCode.Services/docs/SIGNALR_BINARY_PROTOCOL/SIGNALR_BINARY_PROTOCOL_TODO.md index 3ad1d16..6cb53de 100644 --- a/AyCode.Services/docs/SIGNALR_BINARY_PROTOCOL/SIGNALR_BINARY_PROTOCOL_TODO.md +++ b/AyCode.Services/docs/SIGNALR_BINARY_PROTOCOL/SIGNALR_BINARY_PROTOCOL_TODO.md @@ -74,6 +74,27 @@ Migration completed in three coordinated commits: **Note on ADR-0003 promotion to Accepted:** ADR-0003 status update tracked separately — the original wording referenced unimplemented Steps 4/5 (NamedPipe/FileStream helpers) which were dropped. ADR may need a Resolution-style addendum noting the scope reduction; orthogonal cleanup task. +## ACCORE-SBP-T-Q3M7: ~~CHUNK_ABORT marker for mid-stream serialize-failure fault isolation~~ +**Status:** Closed (2026-05-23) · **Priority:** ~~P1~~ · **Type:** ~~Feature~~ + +~~Add a `[203]` CHUNK_ABORT marker to the AsyncSegment chunked-framing wire format. When the streamed-arg serialize throws after CHUNK_START has been sent, `WriteMessageChunked` emits the abort marker instead of letting the exception propagate (which would abort the entire SignalR transport connection in `HubConnectionContext.WriteCore`, killing all other in-flight invocations on the same WebSocket). The receiver's `TryParseChunkData` recognises `[203]` as a first-class case, tears down the chunk-state safely (mirroring the `[202]` Complete → DeserTask observe → Dispose ordering), and surfaces the failure to the awaiting caller as a synthesised `CompletionMessage.WithError(invocationId, ...)` — caller's task faults with a specific, actionable error rather than a transport timeout. Fault isolation: blast radius = one message; connection stays alive.~~ + +### Resolution (2026-05-23) + +Shipped in `AcBinaryHubProtocol.cs`: + +- **`MsgAsyncChunkAbort = 203`** constant added next to the `[200]/[201]/[202]` markers. +- **`WriteMessageChunked` (send side)**: `try/catch` around `AcBinarySerializer.Serialize(streamedArg, ...)`. The catch logs Error, then calls `TryEmitChunkAbort(pipeWriter)` (best-effort `GetSpan(1) + Advance + SyncFlush` on `[203]`). On success: returns gracefully; on emit-failure: rethrows so the SignalR outer handler aborts the connection (baseline behaviour preserved). +- **`TryParseChunkData` (receive side)**: new first-class branch on `firstByte == MsgAsyncChunkAbort`. Shared `AbandonChunkState(state, binder, reason)` helper for safe teardown (Complete → observe DeserTask → Dispose → Remove); `GetInvocationId(state.PartialMessage)` extracts the routing key; `CompletionMessage.WithError` synthesised for the caller. Fire-and-forget fallback: `PingMessage.Instance` (preserves the "consumed input ↔ produced message" contract). +- **Protocol-violation path tightened**: the previous "fallback to normal parse" guess-and-recover (for unknown byte in chunk mode) replaced with `LogError` + safe teardown + `System.IO.InvalidDataException`. With CHUNK_ABORT in the protocol, legitimate sender-abort is handled by the `[203]` branch; anything else is genuine framing corruption and must not be masked. + +**Acceptance criteria met:** +- ✅ Sender-side serialize failure no longer aborts the WebSocket — blast radius is the affected `requestId` only. +- ✅ Awaiting caller receives a specific error message rather than a transport timeout. +- ✅ Background deser task safely observed (no unobserved task exception, no Dispose-during-Wait race). +- ✅ Teardown logic deduplicated into the `AbandonChunkState` helper; `GetInvocationId` and `TryEmitChunkAbort` helpers extracted; nested try/catch eliminated. +- 🟡 **Deferred edge case** — mid-header serialize failure leaves a partial `[201][UINT16=N]` on the wire; `[203]` is parsed as chunk-data and the receiver lands on the protocol-violation `InvalidDataException` path. Robust graceful-abort fix tracked in [`../../../AyCode.Core/docs/BINARY/BINARY_ASYNCPIPE_TODO.md#accore-bin-t-z6n2`](../../../AyCode.Core/docs/BINARY/BINARY_ASYNCPIPE_TODO.md#accore-bin-t-z6n2). + --- # 🟡 NuGet competitiveness ideas — NOT current priority