From 8fadf770fe41fbb06c28cd84bbb2aaabdbe7b910 Mon Sep 17 00:00:00 2001 From: Erik Date: Tue, 9 Jun 2026 21:27:22 +0200 Subject: [PATCH] =?UTF-8?q?fix(render):=20quiesce=20dat=20readers=20before?= =?UTF-8?q?=20teardown=20=E2=80=94=20kill=20the=20shutdown=20AccessViolati?= =?UTF-8?q?on?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ObjectMeshManager.Dispose never stopped its Task.Run(ProcessQueueAsync) decode workers, and LandblockStreamer.Dispose abandoned its worker after a 2s join. GameWindow.OnClosing then disposed the DatCollection, which unmaps the dats'' memory-mapped views (MemoryMappedBlockAllocator.DestroyMappedFile nulls _viewPtr) — a worker still inside ReadBlock dereferences the dead view pointer: an uncatchable AccessViolationException with ReadBlock on the stack, firing on close/relaunch during decode storms. This is the recorded crash signature from the 2026-06-09 white-walls session. - ObjectMeshManager.Dispose: set IsDisposed under the queue lock, cancel+drain pending requests, then wait (<=10s) for _activeWorkers==0; loud LogError if workers outlive the wait. ProcessQueueAsync re-checks IsDisposed per dequeue; Prepare*Async entries + enqueue blocks early-out when disposed. - LandblockStreamer.Dispose: join 2s -> 15s with a loud [streamer] line on timeout (cancellation honored between jobs; one landblock load bounds it). - Also includes the [tex-skip] tripwire lines on ObjectMeshManager''s five silent dat-miss exits (GfxObj + CellStruct texture chains) — part of the white-walls attribution net (#105), zero output when healthy. Verified: 3x close-mid-decode-storm smoke (in-world at ~8s, WM_CLOSE at ~11s), clean exits, no crash signatures, no quiesce timeouts. Full suite: 294+218+420 green; Core 1338 green + 4 pre-existing physics failures (reproduced at bare HEAD, unrelated). Investigation: docs/research/2026-06-09-dat-reader-thread-safety-investigation.md Co-Authored-By: Claude Fable 5 --- .../Rendering/Wb/ObjectMeshManager.cs | 70 +++++++++++++++++-- .../Streaming/LandblockStreamer.cs | 10 ++- 2 files changed, 73 insertions(+), 7 deletions(-) diff --git a/src/AcDream.App/Rendering/Wb/ObjectMeshManager.cs b/src/AcDream.App/Rendering/Wb/ObjectMeshManager.cs index ddb819c5..baa01cf5 100644 --- a/src/AcDream.App/Rendering/Wb/ObjectMeshManager.cs +++ b/src/AcDream.App/Rendering/Wb/ObjectMeshManager.cs @@ -382,7 +382,7 @@ namespace AcDream.App.Rendering.Wb { /// Phase 1 (Background Thread): Prepare CPU-side mesh data for deduplicated EnvCell geometry. /// public Task PrepareEnvCellGeomMeshDataAsync(ulong geomId, uint environmentId, ushort cellStructure, List surfaces, CancellationToken ct = default) { - if (HasRenderData(geomId)) return Task.FromResult(null); + if (IsDisposed || HasRenderData(geomId)) return Task.FromResult(null); // Check CPU cache first lock (_cpuMeshCache) { @@ -403,6 +403,11 @@ namespace AcDream.App.Rendering.Wb { _preparationTasks[geomId] = task; lock (_pendingRequests) { + if (IsDisposed) { + tcs.TrySetCanceled(); + _preparationTasks.TryRemove(geomId, out _); + return task; + } // Special handling for EnvCell geometry - we need to store the cell data for the worker _pendingEnvCellRequests[geomId] = new EnvCellGeomRequest { EnvironmentId = environmentId, @@ -420,7 +425,7 @@ namespace AcDream.App.Rendering.Wb { } public Task PrepareMeshDataAsync(ulong id, bool isSetup, CancellationToken ct = default) { - if (HasRenderData(id)) return Task.FromResult(null); + if (IsDisposed || HasRenderData(id)) return Task.FromResult(null); // Check CPU cache first lock (_cpuMeshCache) { @@ -452,6 +457,11 @@ namespace AcDream.App.Rendering.Wb { _preparationTasks[id] = task; lock (_pendingRequests) { + if (IsDisposed) { + tcs.TrySetCanceled(); + _preparationTasks.TryRemove(id, out _); + return task; + } _pendingRequests.Add((id, isSetup, tcs, ct)); if (_activeWorkers < MaxParallelLoads) { _activeWorkers++; @@ -471,7 +481,9 @@ namespace AcDream.App.Rendering.Wb { CancellationToken ct; lock (_pendingRequests) { - if (_pendingRequests.Count == 0) { + // IsDisposed re-check: lets Dispose() drain the queue and + // observe _activeWorkers reach 0 before the dats unmap. + if (IsDisposed || _pendingRequests.Count == 0) { return; } @@ -972,7 +984,11 @@ namespace AcDream.App.Rendering.Wb { if (surfaceIdx < 0 || surfaceIdx >= gfxObj.Surfaces.Count) return; var surfaceId = gfxObj.Surfaces[surfaceIdx]; - if (!_dats.Portal.TryGet(surfaceId, out var surface)) return; + if (!_dats.Portal.TryGet(surfaceId, out var surface)) { + // TEMP diagnostic (dat-race investigation 2026-06-09, strip with fix) + Console.WriteLine($"[tex-skip] gfxobj Surface 0x{surfaceId:X8} miss -> poly batch dropped (obj 0x{gfxObj.Id:X8})"); + return; + } int texWidth, texHeight; byte[] textureData; @@ -1133,6 +1149,8 @@ namespace AcDream.App.Rendering.Wb { sourceFormat == DatReaderWriter.Enums.PixelFormat.PFID_DXT5))); } else { + // TEMP diagnostic (dat-race investigation 2026-06-09, strip with fix) + Console.WriteLine($"[tex-skip] gfxobj SurfaceTexture 0x{surface.OrigTextureId:X8} miss -> poly batch dropped (surface 0x{surfaceId:X8})"); return; } @@ -1320,7 +1338,11 @@ namespace AcDream.App.Rendering.Wb { return; } - if (!_dats.Portal.TryGet(surfaceId, out var surface)) return; + if (!_dats.Portal.TryGet(surfaceId, out var surface)) { + // TEMP diagnostic (dat-race investigation 2026-06-09, strip with fix) + Console.WriteLine($"[tex-skip] cellstruct Surface 0x{surfaceId:X8} miss -> WALL poly batch dropped (cellstruct 0x{cellStruct:X4})"); + return; + } int texWidth, texHeight; byte[] textureData; @@ -1345,6 +1367,8 @@ namespace AcDream.App.Rendering.Wb { var renderSurfaceId = surfaceTexture.Textures.First(); if (!_dats.Portal.TryGet(renderSurfaceId, out var renderSurface)) { if (!_dats.HighRes.TryGet(renderSurfaceId, out var hrRenderSurface)) { + // TEMP diagnostic (dat-race investigation 2026-06-09, strip with fix) + Console.WriteLine($"[tex-skip] cellstruct RenderSurface 0x{renderSurfaceId:X8} miss (portal+highres) -> WALL poly batch dropped"); return; } renderSurface = hrRenderSurface; @@ -1454,6 +1478,8 @@ namespace AcDream.App.Rendering.Wb { } } else { + // TEMP diagnostic (dat-race investigation 2026-06-09, strip with fix) + Console.WriteLine($"[tex-skip] cellstruct SurfaceTexture 0x{surface.OrigTextureId:X8} miss -> WALL poly batch dropped (surface 0x{surfaceId:X8})"); return; } @@ -1960,7 +1986,39 @@ namespace AcDream.App.Rendering.Wb { public void Dispose() { if (IsDisposed) return; - IsDisposed = true; + + // Quiesce the background decode workers BEFORE returning: the owner + // disposes the DatCollection right after this adapter chain, which + // unmaps the dats' memory-mapped views. A worker still inside + // MemoryMappedBlockAllocator.ReadBlock at that point dereferences the + // dead view pointer — an uncatchable, process-fatal AccessViolation + // (dat-race investigation 2026-06-09). Setting IsDisposed under the + // queue lock publishes it to workers, which re-check it before every + // dequeue; draining the queue means each worker exits after at most + // its current (millisecond-scale) item. + lock (_pendingRequests) { + IsDisposed = true; + foreach (var (id, _, tcs, _) in _pendingRequests) { + tcs.TrySetCanceled(); + _preparationTasks.TryRemove(id, out _); + } + _pendingRequests.Clear(); + _pendingEnvCellRequests.Clear(); + } + var deadline = System.Environment.TickCount64 + 10_000; + while (System.Environment.TickCount64 < deadline) { + lock (_pendingRequests) { + if (_activeWorkers == 0) break; + } + Thread.Sleep(5); + } + lock (_pendingRequests) { + if (_activeWorkers > 0) + _logger.LogError( + "Dispose: {Count} mesh-decode workers still active after 10s — dat teardown may race in-flight reads", + _activeWorkers); + } + _graphicsDevice.QueueGLAction(gl => { foreach (var data in _renderData.Values) { if (!_useModernRendering) { diff --git a/src/AcDream.App/Streaming/LandblockStreamer.cs b/src/AcDream.App/Streaming/LandblockStreamer.cs index 143fab75..19b2a94b 100644 --- a/src/AcDream.App/Streaming/LandblockStreamer.cs +++ b/src/AcDream.App/Streaming/LandblockStreamer.cs @@ -329,7 +329,15 @@ public sealed class LandblockStreamer : IDisposable if (System.Threading.Interlocked.Exchange(ref _disposed, 1) != 0) return; _cancel.Cancel(); _inbox.Writer.TryComplete(); - _worker?.Join(TimeSpan.FromSeconds(2)); + // Generous join: the owner disposes the DatCollection after this, which + // unmaps the dats' memory-mapped views — an abandoned worker mid-dat-read + // would take the process down with an AccessViolation in + // MemoryMappedBlockAllocator.ReadBlock (dat-race investigation 2026-06-09). + // Cancellation is honored between jobs, so the wait is bounded by one + // landblock load; 15s only ever elapses if the worker is genuinely hung. + if (_worker is not null && !_worker.Join(TimeSpan.FromSeconds(15))) + Console.Error.WriteLine( + "[streamer] worker did not stop within 15s — dat teardown may race an in-flight load"); _cancel.Dispose(); } }