feat(perf): Phase N.6 slice 1 — fix gpu_us double-buffering in WbDrawDispatcher

The dispatcher's GPU TimeElapsed queries were polled in the same frame
as the indirect draw, so glGetQueryObject(ResultAvailable) always
returned 0 and gpu_us in [WB-DIAG] was stuck at 0m/0p95.

Replace the 2 single-handle queries with ring-of-3 arrays and move the
result read to BEFORE issuing the next frame's queries into the same
slot — at frame N we read slot N%3, which holds frame N-3's queries
(oldest in the ring, ~50ms old at 60fps, by which point desktop GL
drivers have normally finished them; the ResultAvailable check still
guards the read). Vendor-neutral: AMD/NVIDIA/Intel desktop GL all
work without driver-specific code.

The gpuQuerySlot variable is hoisted to function scope (just before
Phase 7 opaque pass) so both the opaque and transparent passes
reference the same slot — the plan placed it inside the opaque-pass
if-block, which would have been out of scope for the transparent
BeginQuery; corrected in the implementation.

No new tests — the change is purely a diagnostic readout fix with no
observable behavior change in the rendering path. Build green; tests at
baseline (1711 passing, 8 pre-existing physics/MotionInterpreter
failures unchanged). Manual in-world gpu_us verification is still pending.

Spec: docs/superpowers/specs/2026-05-11-phase-n6-slice1-design.md (§4).
Plan: docs/superpowers/plans/2026-05-11-phase-n6-slice1.md (Task 1).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Erik 2026-05-11 11:24:26 +02:00
parent a4931eeaa2
commit a7c98004bb

View file

@ -152,8 +152,16 @@ public sealed unsafe class WbDrawDispatcher : IDisposable
private readonly System.Diagnostics.Stopwatch _cpuStopwatch = new();
private readonly long[] _cpuSamples = new long[256]; // microseconds
private int _cpuSampleCursor;
private uint _gpuQueryOpaque;
private uint _gpuQueryTransparent;
// GPU timing uses a ring of 3 query-pair slots so that frame N reads the
// result issued at frame N-3, by which time the GPU has normally finished
// (~50ms after issue at a typical 60fps). Ring of 3 is the vendor-neutral
// choice: NVIDIA drivers with triple-buffering+vsync can queue ~3 frames
// ahead, AMD typically 1-2, Intel iGPUs vary. ResultAvailable is the
// safety guard if the GPU is still working when we try to read.
private const int GpuQueryRingDepth = 3;
private readonly uint[] _gpuQueryOpaque = new uint[GpuQueryRingDepth];
private readonly uint[] _gpuQueryTransparent = new uint[GpuQueryRingDepth];
private int _gpuQueryFrameIndex;
private readonly long[] _gpuSamples = new long[256]; // microseconds
private int _gpuSampleCursor;
private bool _gpuQueriesInitialized;
@ -346,8 +354,11 @@ public sealed unsafe class WbDrawDispatcher : IDisposable
if (diag && !_gpuQueriesInitialized)
{
_gpuQueryOpaque = _gl.GenQuery();
_gpuQueryTransparent = _gl.GenQuery();
for (int i = 0; i < GpuQueryRingDepth; i++)
{
_gpuQueryOpaque[i] = _gl.GenQuery();
_gpuQueryTransparent[i] = _gl.GenQuery();
}
_gpuQueriesInitialized = true;
}
@ -754,6 +765,29 @@ public sealed unsafe class WbDrawDispatcher : IDisposable
if (string.Equals(Environment.GetEnvironmentVariable("ACDREAM_NO_CULL"), "1", StringComparison.Ordinal))
_gl.Disable(EnableCap.CullFace);
// GPU timing: compute this frame's ring slot. We read frame N-3's
// result (the oldest data in the ring) before overwriting it with
// frame N's queries. Hoisted to function scope so both the opaque
// and transparent passes below can reference gpuQuerySlot. See spec
// §3 Q1/Q2 + §4 in
// docs/superpowers/specs/2026-05-11-phase-n6-slice1-design.md.
int gpuQuerySlot = _gpuQueryFrameIndex % GpuQueryRingDepth;
if (_gpuQueriesInitialized && _gpuQueryFrameIndex >= GpuQueryRingDepth)
{
_gl.GetQueryObject(_gpuQueryOpaque[gpuQuerySlot], QueryObjectParameterName.ResultAvailable, out int avail);
if (avail != 0)
{
_gl.GetQueryObject(_gpuQueryOpaque[gpuQuerySlot], QueryObjectParameterName.Result, out ulong opaqueNs);
_gl.GetQueryObject(_gpuQueryTransparent[gpuQuerySlot], QueryObjectParameterName.Result, out ulong transNs);
long gpuUs = (long)((opaqueNs + transNs) / 1000UL);
_gpuSamples[_gpuSampleCursor] = gpuUs;
_gpuSampleCursor = (_gpuSampleCursor + 1) % _gpuSamples.Length;
}
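// If avail==0 the sample is dropped silently: nothing is written to
// _gpuSamples and the cursor does not advance, so a drop leaves no
// zero entry behind. MedianMicros computes over the non-zero subset,
// so dropped samples (and never-filled slots) don't poison the median.
// NOTE(review): avail reflects only the opaque query; the transparent
// query's Result is read without its own availability check.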
}
// ── Phase 7: opaque pass ─────────────────────────────────────────────
if (_opaqueDrawCount > 0)
{
@ -771,7 +805,7 @@ public sealed unsafe class WbDrawDispatcher : IDisposable
// mesh_modern.vert for why this is needed.
_shader.SetInt("uDrawIDOffset", 0);
_gl.BindBuffer(BufferTargetARB.DrawIndirectBuffer, _indirectBuffer);
if (diag && _gpuQueriesInitialized) _gl.BeginQuery(QueryTarget.TimeElapsed, _gpuQueryOpaque);
if (diag && _gpuQueriesInitialized) _gl.BeginQuery(QueryTarget.TimeElapsed, _gpuQueryOpaque[gpuQuerySlot]);
_gl.MultiDrawElementsIndirect(
PrimitiveType.Triangles,
DrawElementsType.UnsignedShort,
@ -820,7 +854,7 @@ public sealed unsafe class WbDrawDispatcher : IDisposable
_gl.CullFace(TriangleFace.Back);
_gl.FrontFace(FrontFaceDirection.Ccw);
_shader.SetInt("uRenderPass", 1);
if (diag && _gpuQueriesInitialized) _gl.BeginQuery(QueryTarget.TimeElapsed, _gpuQueryTransparent);
if (diag && _gpuQueriesInitialized) _gl.BeginQuery(QueryTarget.TimeElapsed, _gpuQueryTransparent[gpuQuerySlot]);
_gl.MultiDrawElementsIndirect(
PrimitiveType.Triangles,
DrawElementsType.UnsignedShort,
@ -843,21 +877,10 @@ public sealed unsafe class WbDrawDispatcher : IDisposable
_cpuSamples[_cpuSampleCursor] = cpuUs;
_cpuSampleCursor = (_cpuSampleCursor + 1) % _cpuSamples.Length;
// Read GPU samples non-blocking; the result for the previous frame's
// queries should be ready by now. If not, drop the sample (don't stall
// the CPU waiting for the GPU).
if (_gpuQueriesInitialized)
{
_gl.GetQueryObject(_gpuQueryOpaque, QueryObjectParameterName.ResultAvailable, out int avail);
if (avail != 0)
{
_gl.GetQueryObject(_gpuQueryOpaque, QueryObjectParameterName.Result, out ulong opaqueNs);
_gl.GetQueryObject(_gpuQueryTransparent, QueryObjectParameterName.Result, out ulong transNs);
long gpuUs = (long)((opaqueNs + transNs) / 1000UL);
_gpuSamples[_gpuSampleCursor] = gpuUs;
_gpuSampleCursor = (_gpuSampleCursor + 1) % _gpuSamples.Length;
}
}
// GPU sample read happens BEFORE issuing the next frame's queries
// (see the ring-slot read just ahead of the Phase 7 opaque pass above).
// Increment the frame counter here so the next call computes a fresh
// slot.
if (_gpuQueriesInitialized) _gpuQueryFrameIndex++;
_drawsIssued += _opaqueDrawCount + _transparentDrawCount;
_instancesIssued += totalInstances;
@ -1139,8 +1162,11 @@ public sealed unsafe class WbDrawDispatcher : IDisposable
_gl.DeleteBuffer(_indirectBuffer);
if (_gpuQueriesInitialized)
{
_gl.DeleteQuery(_gpuQueryOpaque);
_gl.DeleteQuery(_gpuQueryTransparent);
for (int i = 0; i < GpuQueryRingDepth; i++)
{
_gl.DeleteQuery(_gpuQueryOpaque[i]);
_gl.DeleteQuery(_gpuQueryTransparent[i]);
}
}
}