fix(render): quiesce dat readers before teardown — kill the shutdown AccessViolation
ObjectMeshManager.Dispose never stopped its Task.Run(ProcessQueueAsync) decode workers, and LandblockStreamer.Dispose abandoned its worker after a 2s join. GameWindow.OnClosing then disposed the DatCollection, which unmaps the dats'' memory-mapped views (MemoryMappedBlockAllocator.DestroyMappedFile nulls _viewPtr) — a worker still inside ReadBlock dereferences the dead view pointer: an uncatchable AccessViolationException with ReadBlock on the stack, firing on close/relaunch during decode storms. This is the recorded crash signature from the 2026-06-09 white-walls session. - ObjectMeshManager.Dispose: set IsDisposed under the queue lock, cancel+drain pending requests, then wait (<=10s) for _activeWorkers==0; loud LogError if workers outlive the wait. ProcessQueueAsync re-checks IsDisposed per dequeue; Prepare*Async entries + enqueue blocks early-out when disposed. - LandblockStreamer.Dispose: join 2s -> 15s with a loud [streamer] line on timeout (cancellation honored between jobs; one landblock load bounds it). - Also includes the [tex-skip] tripwire lines on ObjectMeshManager''s five silent dat-miss exits (GfxObj + CellStruct texture chains) — part of the white-walls attribution net (#105), zero output when healthy. Verified: 3x close-mid-decode-storm smoke (in-world at ~8s, WM_CLOSE at ~11s), clean exits, no crash signatures, no quiesce timeouts. Full suite: 294+218+420 green; Core 1338 green + 4 pre-existing physics failures (reproduced at bare HEAD, unrelated). Investigation: docs/research/2026-06-09-dat-reader-thread-safety-investigation.md Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
This commit is contained in:
parent
d0bd28543b
commit
8fadf770fe
2 changed files with 73 additions and 7 deletions
|
|
@ -382,7 +382,7 @@ namespace AcDream.App.Rendering.Wb {
|
|||
/// Phase 1 (Background Thread): Prepare CPU-side mesh data for deduplicated EnvCell geometry.
|
||||
/// </summary>
|
||||
public Task<ObjectMeshData?> PrepareEnvCellGeomMeshDataAsync(ulong geomId, uint environmentId, ushort cellStructure, List<ushort> surfaces, CancellationToken ct = default) {
|
||||
if (HasRenderData(geomId)) return Task.FromResult<ObjectMeshData?>(null);
|
||||
if (IsDisposed || HasRenderData(geomId)) return Task.FromResult<ObjectMeshData?>(null);
|
||||
|
||||
// Check CPU cache first
|
||||
lock (_cpuMeshCache) {
|
||||
|
|
@ -403,6 +403,11 @@ namespace AcDream.App.Rendering.Wb {
|
|||
_preparationTasks[geomId] = task;
|
||||
|
||||
lock (_pendingRequests) {
|
||||
if (IsDisposed) {
|
||||
tcs.TrySetCanceled();
|
||||
_preparationTasks.TryRemove(geomId, out _);
|
||||
return task;
|
||||
}
|
||||
// Special handling for EnvCell geometry - we need to store the cell data for the worker
|
||||
_pendingEnvCellRequests[geomId] = new EnvCellGeomRequest {
|
||||
EnvironmentId = environmentId,
|
||||
|
|
@ -420,7 +425,7 @@ namespace AcDream.App.Rendering.Wb {
|
|||
}
|
||||
|
||||
public Task<ObjectMeshData?> PrepareMeshDataAsync(ulong id, bool isSetup, CancellationToken ct = default) {
|
||||
if (HasRenderData(id)) return Task.FromResult<ObjectMeshData?>(null);
|
||||
if (IsDisposed || HasRenderData(id)) return Task.FromResult<ObjectMeshData?>(null);
|
||||
|
||||
// Check CPU cache first
|
||||
lock (_cpuMeshCache) {
|
||||
|
|
@ -452,6 +457,11 @@ namespace AcDream.App.Rendering.Wb {
|
|||
_preparationTasks[id] = task;
|
||||
|
||||
lock (_pendingRequests) {
|
||||
if (IsDisposed) {
|
||||
tcs.TrySetCanceled();
|
||||
_preparationTasks.TryRemove(id, out _);
|
||||
return task;
|
||||
}
|
||||
_pendingRequests.Add((id, isSetup, tcs, ct));
|
||||
if (_activeWorkers < MaxParallelLoads) {
|
||||
_activeWorkers++;
|
||||
|
|
@ -471,7 +481,9 @@ namespace AcDream.App.Rendering.Wb {
|
|||
CancellationToken ct;
|
||||
|
||||
lock (_pendingRequests) {
|
||||
if (_pendingRequests.Count == 0) {
|
||||
// IsDisposed re-check: lets Dispose() drain the queue and
|
||||
// observe _activeWorkers reach 0 before the dats unmap.
|
||||
if (IsDisposed || _pendingRequests.Count == 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
|
|
@ -972,7 +984,11 @@ namespace AcDream.App.Rendering.Wb {
|
|||
if (surfaceIdx < 0 || surfaceIdx >= gfxObj.Surfaces.Count) return;
|
||||
|
||||
var surfaceId = gfxObj.Surfaces[surfaceIdx];
|
||||
if (!_dats.Portal.TryGet<Surface>(surfaceId, out var surface)) return;
|
||||
if (!_dats.Portal.TryGet<Surface>(surfaceId, out var surface)) {
|
||||
// TEMP diagnostic (dat-race investigation 2026-06-09, strip with fix)
|
||||
Console.WriteLine($"[tex-skip] gfxobj Surface 0x{surfaceId:X8} miss -> poly batch dropped (obj 0x{gfxObj.Id:X8})");
|
||||
return;
|
||||
}
|
||||
|
||||
int texWidth, texHeight;
|
||||
byte[] textureData;
|
||||
|
|
@ -1133,6 +1149,8 @@ namespace AcDream.App.Rendering.Wb {
|
|||
sourceFormat == DatReaderWriter.Enums.PixelFormat.PFID_DXT5)));
|
||||
}
|
||||
else {
|
||||
// TEMP diagnostic (dat-race investigation 2026-06-09, strip with fix)
|
||||
Console.WriteLine($"[tex-skip] gfxobj SurfaceTexture 0x{surface.OrigTextureId:X8} miss -> poly batch dropped (surface 0x{surfaceId:X8})");
|
||||
return;
|
||||
}
|
||||
|
||||
|
|
@ -1320,7 +1338,11 @@ namespace AcDream.App.Rendering.Wb {
|
|||
return;
|
||||
}
|
||||
|
||||
if (!_dats.Portal.TryGet<Surface>(surfaceId, out var surface)) return;
|
||||
if (!_dats.Portal.TryGet<Surface>(surfaceId, out var surface)) {
|
||||
// TEMP diagnostic (dat-race investigation 2026-06-09, strip with fix)
|
||||
Console.WriteLine($"[tex-skip] cellstruct Surface 0x{surfaceId:X8} miss -> WALL poly batch dropped (cellstruct 0x{cellStruct:X4})");
|
||||
return;
|
||||
}
|
||||
|
||||
int texWidth, texHeight;
|
||||
byte[] textureData;
|
||||
|
|
@ -1345,6 +1367,8 @@ namespace AcDream.App.Rendering.Wb {
|
|||
var renderSurfaceId = surfaceTexture.Textures.First();
|
||||
if (!_dats.Portal.TryGet<RenderSurface>(renderSurfaceId, out var renderSurface)) {
|
||||
if (!_dats.HighRes.TryGet<RenderSurface>(renderSurfaceId, out var hrRenderSurface)) {
|
||||
// TEMP diagnostic (dat-race investigation 2026-06-09, strip with fix)
|
||||
Console.WriteLine($"[tex-skip] cellstruct RenderSurface 0x{renderSurfaceId:X8} miss (portal+highres) -> WALL poly batch dropped");
|
||||
return;
|
||||
}
|
||||
renderSurface = hrRenderSurface;
|
||||
|
|
@ -1454,6 +1478,8 @@ namespace AcDream.App.Rendering.Wb {
|
|||
}
|
||||
}
|
||||
else {
|
||||
// TEMP diagnostic (dat-race investigation 2026-06-09, strip with fix)
|
||||
Console.WriteLine($"[tex-skip] cellstruct SurfaceTexture 0x{surface.OrigTextureId:X8} miss -> WALL poly batch dropped (surface 0x{surfaceId:X8})");
|
||||
return;
|
||||
}
|
||||
|
||||
|
|
@ -1960,7 +1986,39 @@ namespace AcDream.App.Rendering.Wb {
|
|||
|
||||
public void Dispose() {
|
||||
if (IsDisposed) return;
|
||||
|
||||
// Quiesce the background decode workers BEFORE returning: the owner
|
||||
// disposes the DatCollection right after this adapter chain, which
|
||||
// unmaps the dats' memory-mapped views. A worker still inside
|
||||
// MemoryMappedBlockAllocator.ReadBlock at that point dereferences the
|
||||
// dead view pointer — an uncatchable, process-fatal AccessViolation
|
||||
// (dat-race investigation 2026-06-09). Setting IsDisposed under the
|
||||
// queue lock publishes it to workers, which re-check it before every
|
||||
// dequeue; draining the queue means each worker exits after at most
|
||||
// its current (millisecond-scale) item.
|
||||
lock (_pendingRequests) {
|
||||
IsDisposed = true;
|
||||
foreach (var (id, _, tcs, _) in _pendingRequests) {
|
||||
tcs.TrySetCanceled();
|
||||
_preparationTasks.TryRemove(id, out _);
|
||||
}
|
||||
_pendingRequests.Clear();
|
||||
_pendingEnvCellRequests.Clear();
|
||||
}
|
||||
var deadline = System.Environment.TickCount64 + 10_000;
|
||||
while (System.Environment.TickCount64 < deadline) {
|
||||
lock (_pendingRequests) {
|
||||
if (_activeWorkers == 0) break;
|
||||
}
|
||||
Thread.Sleep(5);
|
||||
}
|
||||
lock (_pendingRequests) {
|
||||
if (_activeWorkers > 0)
|
||||
_logger.LogError(
|
||||
"Dispose: {Count} mesh-decode workers still active after 10s — dat teardown may race in-flight reads",
|
||||
_activeWorkers);
|
||||
}
|
||||
|
||||
_graphicsDevice.QueueGLAction(gl => {
|
||||
foreach (var data in _renderData.Values) {
|
||||
if (!_useModernRendering) {
|
||||
|
|
|
|||
|
|
@ -329,7 +329,15 @@ public sealed class LandblockStreamer : IDisposable
|
|||
if (System.Threading.Interlocked.Exchange(ref _disposed, 1) != 0) return;
|
||||
_cancel.Cancel();
|
||||
_inbox.Writer.TryComplete();
|
||||
_worker?.Join(TimeSpan.FromSeconds(2));
|
||||
// Generous join: the owner disposes the DatCollection after this, which
|
||||
// unmaps the dats' memory-mapped views — an abandoned worker mid-dat-read
|
||||
// would take the process down with an AccessViolation in
|
||||
// MemoryMappedBlockAllocator.ReadBlock (dat-race investigation 2026-06-09).
|
||||
// Cancellation is honored between jobs, so the wait is bounded by one
|
||||
// landblock load; 15s only ever elapses if the worker is genuinely hung.
|
||||
if (_worker is not null && !_worker.Join(TimeSpan.FromSeconds(15)))
|
||||
Console.Error.WriteLine(
|
||||
"[streamer] worker did not stop within 15s — dat teardown may race an in-flight load");
|
||||
_cancel.Dispose();
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue