From 8682a8db7021bf91ce8c5aa5d227e9338fc79fa8 Mon Sep 17 00:00:00 2001 From: Erik Date: Sat, 13 Jun 2026 10:27:26 +0200 Subject: [PATCH] close #125: bounded upload retry kills the sticky-drop debt (failed GL uploads were never re-staged) The GL root cause was fixed in fcade06 (the gpu_us query-ring stale errors). This closes the remaining design debt: a genuinely-failed UploadMeshData was dropped permanently. Exact mechanism (traced this session): UploadMeshData's catch returns null, the staged item is already consumed, and _renderData stays empty - but the prepared data lingers in _cpuMeshCache, so the #128 EnsureLoaded re-arm hits PrepareMeshDataAsync's CPU-cache short-circuit (ObjectMeshManager.cs:448-453) which returns the cached data WITHOUT re-staging it for upload. The mesh stays invisible until CPU-cache eviction - session-sticky under low cache pressure (the in-tower scenario). Fix: the per-frame Tick drain (WbMeshAdapter) now re-stages a failed upload for the NEXT frame via ObjectMeshManager.UploadOrRequeue, bounded by MaxUploadRetries (3). The attempt counter lives on the ObjectMeshData object so it resets to 0 naturally on re-prepare. Re-stages are collected and re-enqueued AFTER the drain loop, never inside it, so a deterministic failure cannot spin the queue within a single frame; past the cap it gives up with a loud [up-retry] ... giving up line - a genuine GL defect now surfaces instead of the old silent permanent drop or an unbounded retry storm. Retail loads content synchronously and has no such failure mode; this converges the async pipeline toward that guarantee. The uncaught GenerateMipmaps path (open-question c) is INTENTIONALLY left to surface errors - a blanket catch there would mask future real defects (no-workarounds rule), and its trigger (fcade06) is retired. No visual gate (robustness). Build green; App.Tests 264 + WbMeshAdapter tests green. No GL-context test seam exists for the upload path, so the bounded retry is verified by construction + the regression suite. Co-Authored-By: Claude Fable 5 --- docs/ISSUES.md | 32 +++++++++++--- .../Rendering/Wb/ObjectMeshManager.cs | 44 +++++++++++++++++++ src/AcDream.App/Rendering/Wb/WbMeshAdapter.cs | 13 +++++- 3 files changed, 81 insertions(+), 8 deletions(-) diff --git a/docs/ISSUES.md b/docs/ISSUES.md index 6d5f5552..2ae95c93 100644 --- a/docs/ISSUES.md +++ b/docs/ISSUES.md @@ -4453,8 +4453,9 @@ aperture instead of see-through to the world behind. ## #125 — GL InvalidOperation during staged texture upload: failed uploads are STICKY (never retried) + uncaught crash in GenerateMipmaps -**Status:** ROOT CAUSE FIXED 2026-06-11 (`fcade06`, live-verified) — -remaining: the sticky-drop design debt (below). +**Status:** CLOSED 2026-06-12 — the GL root cause was fixed `fcade06` +(2026-06-11, live-verified); the remaining sticky-drop DESIGN DEBT is now +fixed too (bounded upload retry, below). No visual gate (robustness). **RESOLVED (root cause):** the GL errors were the gpu_us QUERY RING's own — a glGenQueries name isn't a query object until first glBeginQuery, and @@ -4469,11 +4470,28 @@ slot; read only begun queries. Live-verified in-tower: 0 [wb-error] time under pview, meshMissing=0. **Normal runs (WB_DIAG off) never had these errors — this mechanism is RETIRED for #119.** -**Remaining debt (keep open under this number):** UploadMeshData removes -the preparation task BEFORE uploading, so any genuinely-failed upload is -never retried — permanently invisible mesh with one [wb-error] line. -The trigger is gone but the design flaw isn't; add retry/re-prepare -semantics in a maintenance pass. +**Remaining debt — FIXED 2026-06-12 (bounded upload retry):** the exact +stick was the CPU-cache short-circuit, not just the early `TryRemove`: a +failed `UploadMeshData` (catch → null) consumed the staged item and left +`_renderData` empty while the prepared data lingered in `_cpuMeshCache`, +so `PrepareMeshDataAsync`'s cache-hit path (`ObjectMeshManager.cs:448-453`) +returned it WITHOUT re-staging → never re-uploaded until CPU-cache +eviction (effectively session-sticky under low cache pressure). Fix: the +Tick drain (`WbMeshAdapter.cs`) now re-stages a failed upload for the NEXT +frame via `ObjectMeshManager.UploadOrRequeue`, bounded by +`MaxUploadRetries` (3) using a counter on the `ObjectMeshData` object +(resets to 0 on re-prepare). Re-stages are collected and re-enqueued +AFTER the drain loop — never inside it — so a deterministic failure can't +spin the queue in one frame; past the cap it gives up with a loud +`[up-retry] … giving up` line (surfaces a genuine GL defect instead of +the old silent permanent drop). Retail loads synchronously and has no +such failure mode; this converges the async pipeline toward that +guarantee. Build + App.Tests (264) green; no GL-context test seam exists +for the upload path so the retry is verified by construction + the +regression suite. The uncaught `GenerateMipmaps` path (open-question c) +is INTENTIONALLY left to surface errors — adding a blanket catch there +would mask future real defects (no-workarounds rule); its trigger +(`fcade06`) is already retired. **Filed:** 2026-06-11 (in-tower WB_DIAG launch, `tower-wbdiag3.log` — preserved in the worktree root) **Component:** render — WB staged texture pipeline (ObjectMeshManager / ManagedGLTextureArray) diff --git a/src/AcDream.App/Rendering/Wb/ObjectMeshManager.cs b/src/AcDream.App/Rendering/Wb/ObjectMeshManager.cs index d717a934..b9261ad1 100644 --- a/src/AcDream.App/Rendering/Wb/ObjectMeshManager.cs +++ b/src/AcDream.App/Rendering/Wb/ObjectMeshManager.cs @@ -62,6 +62,24 @@ namespace AcDream.App.Rendering.Wb { public VertexPositionNormalTexture[] Vertices { get; set; } = Array.Empty(); public List Batches { get; set; } = new(); + /// + /// #125 (2026-06-12): GL upload-retry counter. A failed + /// (returns null from its + /// catch) used to be dropped permanently — the staged item was consumed, + /// no render data was produced, and the prepared data lingered in the CPU + /// cache where PrepareMeshDataAsync's cache-hit short-circuit + /// returned it without ever re-staging it for upload (session-sticky + /// invisible mesh, one [wb-error] line). The drain loop now re-stages a + /// failed upload for the NEXT frame up to times. The counter lives on the mesh-data object so + /// it resets to 0 naturally whenever the id is re-prepared (fresh object), + /// and bounds a deterministic GL failure to a few loud lines instead of a + /// silent permanent drop OR an unbounded per-frame retry storm. Retail + /// loads content synchronously and has no such failure mode — this + /// converges our async pipeline toward that guarantee. + /// + public int UploadAttempts; + /// For EnvCell: the geometry of the cell itself. public ObjectMeshData? EnvCellGeometry { get; set; } @@ -216,6 +234,32 @@ namespace AcDream.App.Rendering.Wb { private readonly ConcurrentQueue _stagedMeshData = new(); public ConcurrentQueue StagedMeshData => _stagedMeshData; + /// #125: how many times a failed GL upload is re-staged before + /// giving up loudly. Small — a transient GL error clears on the next + /// frame; anything that fails this many times is a genuine defect to + /// surface, not retry forever. See . + public const int MaxUploadRetries = 3; + + /// + /// #125: drain one staged upload, returning whether it should be + /// re-staged for a later frame. The caller (the per-frame Tick drain) + /// collects the re-stages and re-enqueues them AFTER the drain loop — + /// never inside it — so a deterministic failure can't spin the queue in + /// a single frame. Increments the mesh-data's own attempt counter (resets + /// on re-prepare) and gives up loudly past . + /// + public bool UploadOrRequeue(ObjectMeshData meshData) { + if (UploadMeshData(meshData) is not null) + return false; // success (incl. legitimate 0-vertex → empty render data) + if (HasRenderData(meshData.ObjectId)) + return false; // raced to present by another path + meshData.UploadAttempts++; + if (meshData.UploadAttempts < MaxUploadRetries) + return true; // re-stage for next frame + Console.WriteLine($"[up-retry] 0x{meshData.ObjectId:X10} upload failed {meshData.UploadAttempts}x — giving up (was the #125 silent sticky drop; a GL error is being surfaced, not hidden)"); + return false; + } + // Cache for decoded textures to avoid redundant BCn decoding private readonly ConcurrentQueue _decodedTextureLru = new(); private readonly ConcurrentDictionary _decodedTextureCache = new(); diff --git a/src/AcDream.App/Rendering/Wb/WbMeshAdapter.cs b/src/AcDream.App/Rendering/Wb/WbMeshAdapter.cs index af2940ec..8bbdd6bd 100644 --- a/src/AcDream.App/Rendering/Wb/WbMeshAdapter.cs +++ b/src/AcDream.App/Rendering/Wb/WbMeshAdapter.cs @@ -244,10 +244,21 @@ public sealed class WbMeshAdapter : IDisposable, IWbMeshAdapter if (_disposed) return; _graphicsDevice!.ProcessGLQueue(); + // #125: drain staged uploads; a FAILED upload (UploadMeshData returned + // null from its catch) is re-staged for a LATER frame, not dropped. The + // re-stages are collected and re-enqueued AFTER the loop — re-enqueuing + // inside the while would let a deterministic failure spin the queue in a + // single frame. UploadOrRequeue bounds the retries (MaxUploadRetries) so + // a genuine defect surfaces loudly instead of the old silent sticky drop. + List? requeue = null; while (_meshManager!.StagedMeshData.TryDequeue(out var meshData)) { - _meshManager.UploadMeshData(meshData); + if (_meshManager.UploadOrRequeue(meshData)) + (requeue ??= new()).Add(meshData); } + if (requeue is not null) + foreach (var m in requeue) + _meshManager.StagedMeshData.Enqueue(m); bool texProbe = AcDream.Core.Rendering.RenderingDiagnostics.ProbeTexFlushEnabled; var pendingBefore = texProbe