feat(perf): Phase N.6 slice 1 — fix gpu_us double-buffering in WbDrawDispatcher

The dispatcher's GPU TimeElapsed queries were polled in the same frame as the indirect draw, so glGetQueryObject(ResultAvailable) always returned 0 and gpu_us in [WB-DIAG] was stuck at 0m/0p95. Replace the 2 single-handle queries with ring-of-3 arrays and move the result read to BEFORE issuing the next frame's queries into the same slot — at frame N we read slot N%3 which holds frame N-3's queries (oldest in the ring, ~50ms old at 60fps and definitely done across all desktop GL drivers). Vendor-neutral: AMD/NVIDIA/Intel desktop GL all work without driver-specific code. The gpuQuerySlot variable is hoisted to function scope (just before Phase 7 opaque pass) so both the opaque and transparent passes reference the same slot — the plan placed it inside the opaque-pass if-block, which would have been out of scope for the transparent BeginQuery; corrected in the implementation. No new tests — the change is purely a diagnostic readout fix, no observable behavior in the rendering path. Build green; tests at baseline (1711 passing, 8 pre-existing physics/MotionInterpreter failures unchanged). Manual gpu_us verification still pending in-world. Spec: docs/superpowers/specs/2026-05-11-phase-n6-slice1-design.md (§4). Plan: docs/superpowers/plans/2026-05-11-phase-n6-slice1.md (Task 1). Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-11 11:24:26 +02:00 · 2026-05-11 11:24:26 +02:00 · a7c98004bb
commit a7c98004bb
parent a4931eeaa2
1 changed files with 49 additions and 23 deletions
--- a/src/AcDream.App/Rendering/Wb/WbDrawDispatcher.cs
+++ b/src/AcDream.App/Rendering/Wb/WbDrawDispatcher.cs
@ -152,8 +152,16 @@ public sealed unsafe class WbDrawDispatcher : IDisposable
    private readonly System.Diagnostics.Stopwatch _cpuStopwatch = new();
    private readonly long[] _cpuSamples = new long[256];   // microseconds
    private int _cpuSampleCursor;
-    private uint _gpuQueryOpaque;
-    private uint _gpuQueryTransparent;
+    // GPU timing uses a ring of 3 query-pair slots so that frame N-3's result
+    // is read ~3 frames after issue (~50ms at 60fps), when the GPU has normally
+    // finished. Ring of 3 is the vendor-neutral choice: NVIDIA drivers with
+    // triple-buffering+vsync can queue ~3 frames ahead, AMD typically 1-2,
+    // Intel iGPUs vary. ResultAvailable is the safety guard if the GPU is
+    // still working when we try to read.
+    private const int GpuQueryRingDepth = 3;
+    private readonly uint[] _gpuQueryOpaque      = new uint[GpuQueryRingDepth];
+    private readonly uint[] _gpuQueryTransparent = new uint[GpuQueryRingDepth];
+    private int _gpuQueryFrameIndex;
    private readonly long[] _gpuSamples = new long[256];   // microseconds
    private int _gpuSampleCursor;
    private bool _gpuQueriesInitialized;
@ -346,8 +354,11 @@ public sealed unsafe class WbDrawDispatcher : IDisposable

        if (diag && !_gpuQueriesInitialized)
        {
-            _gpuQueryOpaque      = _gl.GenQuery();
-            _gpuQueryTransparent = _gl.GenQuery();
+            for (int i = 0; i < GpuQueryRingDepth; i++)
+            {
+                _gpuQueryOpaque[i]      = _gl.GenQuery();
+                _gpuQueryTransparent[i] = _gl.GenQuery();
+            }
            _gpuQueriesInitialized = true;
        }

@ -754,6 +765,29 @@ public sealed unsafe class WbDrawDispatcher : IDisposable
        if (string.Equals(Environment.GetEnvironmentVariable("ACDREAM_NO_CULL"), "1", StringComparison.Ordinal))
            _gl.Disable(EnableCap.CullFace);

+        // GPU timing: compute this frame's ring slot. We read frame N-3's
+        // result (the oldest data in the ring) before overwriting it with
+        // frame N's queries. Hoisted to function scope so both the opaque
+        // and transparent passes below can reference gpuQuerySlot. See spec
+        // §3 Q1/Q2 + §4 in
+        // docs/superpowers/specs/2026-05-11-phase-n6-slice1-design.md.
+        int gpuQuerySlot = _gpuQueryFrameIndex % GpuQueryRingDepth;
+        if (_gpuQueriesInitialized && _gpuQueryFrameIndex >= GpuQueryRingDepth)
+        {
+            _gl.GetQueryObject(_gpuQueryOpaque[gpuQuerySlot], QueryObjectParameterName.ResultAvailable, out int avail);
+            if (avail != 0)
+            {
+                _gl.GetQueryObject(_gpuQueryOpaque[gpuQuerySlot],      QueryObjectParameterName.Result, out ulong opaqueNs);
+                _gl.GetQueryObject(_gpuQueryTransparent[gpuQuerySlot], QueryObjectParameterName.Result, out ulong transNs);
+                long gpuUs = (long)((opaqueNs + transNs) / 1000UL);
+                _gpuSamples[_gpuSampleCursor] = gpuUs;
+                _gpuSampleCursor = (_gpuSampleCursor + 1) % _gpuSamples.Length;
+            }
+            // If avail==0 the sample is dropped silently. MedianMicros
+            // computes over the non-zero subset, so dropped samples don't
+            // poison the median.
+        }
+
        // ── Phase 7: opaque pass ─────────────────────────────────────────────
        if (_opaqueDrawCount > 0)
        {
@ -771,7 +805,7 @@ public sealed unsafe class WbDrawDispatcher : IDisposable
            // mesh_modern.vert for why this is needed.
            _shader.SetInt("uDrawIDOffset", 0);
            _gl.BindBuffer(BufferTargetARB.DrawIndirectBuffer, _indirectBuffer);
-            if (diag && _gpuQueriesInitialized) _gl.BeginQuery(QueryTarget.TimeElapsed, _gpuQueryOpaque);
+            if (diag && _gpuQueriesInitialized) _gl.BeginQuery(QueryTarget.TimeElapsed, _gpuQueryOpaque[gpuQuerySlot]);
            _gl.MultiDrawElementsIndirect(
                PrimitiveType.Triangles,
                DrawElementsType.UnsignedShort,
@ -820,7 +854,7 @@ public sealed unsafe class WbDrawDispatcher : IDisposable
            _gl.CullFace(TriangleFace.Back);
            _gl.FrontFace(FrontFaceDirection.Ccw);
            _shader.SetInt("uRenderPass", 1);
-            if (diag && _gpuQueriesInitialized) _gl.BeginQuery(QueryTarget.TimeElapsed, _gpuQueryTransparent);
+            if (diag && _gpuQueriesInitialized) _gl.BeginQuery(QueryTarget.TimeElapsed, _gpuQueryTransparent[gpuQuerySlot]);
            _gl.MultiDrawElementsIndirect(
                PrimitiveType.Triangles,
                DrawElementsType.UnsignedShort,
@ -843,21 +877,10 @@ public sealed unsafe class WbDrawDispatcher : IDisposable
            _cpuSamples[_cpuSampleCursor] = cpuUs;
            _cpuSampleCursor = (_cpuSampleCursor + 1) % _cpuSamples.Length;

-            // Read GPU samples non-blocking; the result for the previous frame's
-            // queries should be ready by now. If not, drop the sample (don't stall
-            // the CPU waiting for the GPU).
-            if (_gpuQueriesInitialized)
-            {
-                _gl.GetQueryObject(_gpuQueryOpaque, QueryObjectParameterName.ResultAvailable, out int avail);
-                if (avail != 0)
-                {
-                    _gl.GetQueryObject(_gpuQueryOpaque, QueryObjectParameterName.Result, out ulong opaqueNs);
-                    _gl.GetQueryObject(_gpuQueryTransparent, QueryObjectParameterName.Result, out ulong transNs);
-                    long gpuUs = (long)((opaqueNs + transNs) / 1000UL);
-                    _gpuSamples[_gpuSampleCursor] = gpuUs;
-                    _gpuSampleCursor = (_gpuSampleCursor + 1) % _gpuSamples.Length;
-                }
-            }
+            // The GPU sample is read BEFORE this frame's queries reuse the slot
+            // (see the gpuQuerySlot block ahead of the Phase 7 opaque pass).
+            // Increment the frame counter here so the next call computes a fresh slot.
+            if (_gpuQueriesInitialized) _gpuQueryFrameIndex++;

            _drawsIssued     += _opaqueDrawCount + _transparentDrawCount;
            _instancesIssued += totalInstances;
@ -1139,8 +1162,11 @@ public sealed unsafe class WbDrawDispatcher : IDisposable
        _gl.DeleteBuffer(_indirectBuffer);
        if (_gpuQueriesInitialized)
        {
-            _gl.DeleteQuery(_gpuQueryOpaque);
-            _gl.DeleteQuery(_gpuQueryTransparent);
+            for (int i = 0; i < GpuQueryRingDepth; i++)
+            {
+                _gl.DeleteQuery(_gpuQueryOpaque[i]);
+                _gl.DeleteQuery(_gpuQueryTransparent[i]);
+            }
        }
    }