feat(perf): Phase N.6 slice 1 — fix gpu_us double-buffering in WbDrawDispatcher
The dispatcher's GPU TimeElapsed queries were polled in the same frame as the indirect draw, so glGetQueryObject(ResultAvailable) always returned 0 and gpu_us in [WB-DIAG] was stuck at 0m/0p95. Replace the 2 single-handle queries with ring-of-3 arrays and move the result read to BEFORE issuing the next frame's queries into the same slot — at frame N we read slot N%3 which holds frame N-3's queries (oldest in the ring, ~50ms old at 60fps and expected to be done on all desktop GL drivers; ResultAvailable still guards the read). Vendor-neutral: AMD/NVIDIA/Intel desktop GL all work without driver-specific code. The gpuQuerySlot variable is hoisted to function scope (just before Phase 7 opaque pass) so both the opaque and transparent passes reference the same slot — the plan placed it inside the opaque-pass if-block, which would have been out of scope for the transparent BeginQuery; corrected in the implementation. No new tests — the change is purely a diagnostic readout fix, with no observable behavior change in the rendering path. Build green; tests at baseline (1711 passing, 8 pre-existing physics/MotionInterpreter failures unchanged). Manual gpu_us verification still pending in-world. Spec: docs/superpowers/specs/2026-05-11-phase-n6-slice1-design.md (§4). Plan: docs/superpowers/plans/2026-05-11-phase-n6-slice1.md (Task 1). Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
a4931eeaa2
commit
a7c98004bb
1 changed files with 49 additions and 23 deletions
|
|
@ -152,8 +152,16 @@ public sealed unsafe class WbDrawDispatcher : IDisposable
|
||||||
private readonly System.Diagnostics.Stopwatch _cpuStopwatch = new();
|
private readonly System.Diagnostics.Stopwatch _cpuStopwatch = new();
|
||||||
private readonly long[] _cpuSamples = new long[256]; // microseconds
|
private readonly long[] _cpuSamples = new long[256]; // microseconds
|
||||||
private int _cpuSampleCursor;
|
private int _cpuSampleCursor;
|
||||||
private uint _gpuQueryOpaque;
|
// GPU timing uses a ring of 3 query-pair slots so the read of frame N-3's
|
||||||
private uint _gpuQueryTransparent;
|
// result lands when the GPU has finished (~50ms after issue on a typical
|
||||||
|
// 60fps frame). Ring of 3 is the vendor-neutral choice: NVIDIA drivers with
|
||||||
|
// triple-buffering+vsync can queue ~3 frames ahead, AMD typically 1-2,
|
||||||
|
// Intel iGPUs vary. ResultAvailable is the safety guard if the GPU is
|
||||||
|
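// still working when we try to read. NOTE(review): only the opaque query's
// availability is checked; the transparent query's result is read unguarded.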
|
||||||
|
private const int GpuQueryRingDepth = 3;
|
||||||
|
private readonly uint[] _gpuQueryOpaque = new uint[GpuQueryRingDepth];
|
||||||
|
private readonly uint[] _gpuQueryTransparent = new uint[GpuQueryRingDepth];
|
||||||
|
private int _gpuQueryFrameIndex;
|
||||||
private readonly long[] _gpuSamples = new long[256]; // microseconds
|
private readonly long[] _gpuSamples = new long[256]; // microseconds
|
||||||
private int _gpuSampleCursor;
|
private int _gpuSampleCursor;
|
||||||
private bool _gpuQueriesInitialized;
|
private bool _gpuQueriesInitialized;
|
||||||
|
|
@ -346,8 +354,11 @@ public sealed unsafe class WbDrawDispatcher : IDisposable
|
||||||
|
|
||||||
if (diag && !_gpuQueriesInitialized)
|
if (diag && !_gpuQueriesInitialized)
|
||||||
{
|
{
|
||||||
_gpuQueryOpaque = _gl.GenQuery();
|
for (int i = 0; i < GpuQueryRingDepth; i++)
|
||||||
_gpuQueryTransparent = _gl.GenQuery();
|
{
|
||||||
|
_gpuQueryOpaque[i] = _gl.GenQuery();
|
||||||
|
_gpuQueryTransparent[i] = _gl.GenQuery();
|
||||||
|
}
|
||||||
_gpuQueriesInitialized = true;
|
_gpuQueriesInitialized = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -754,6 +765,29 @@ public sealed unsafe class WbDrawDispatcher : IDisposable
|
||||||
if (string.Equals(Environment.GetEnvironmentVariable("ACDREAM_NO_CULL"), "1", StringComparison.Ordinal))
|
if (string.Equals(Environment.GetEnvironmentVariable("ACDREAM_NO_CULL"), "1", StringComparison.Ordinal))
|
||||||
_gl.Disable(EnableCap.CullFace);
|
_gl.Disable(EnableCap.CullFace);
|
||||||
|
|
||||||
|
// GPU timing: compute this frame's ring slot. We read frame N-3's
|
||||||
|
// result (the oldest data in the ring) before overwriting it with
|
||||||
|
// frame N's queries. Hoisted to function scope so both the opaque
|
||||||
|
// and transparent passes below can reference gpuQuerySlot. See spec
|
||||||
|
// §3 Q1/Q2 + §4 in
|
||||||
|
// docs/superpowers/specs/2026-05-11-phase-n6-slice1-design.md.
|
||||||
|
int gpuQuerySlot = _gpuQueryFrameIndex % GpuQueryRingDepth;
|
||||||
|
if (_gpuQueriesInitialized && _gpuQueryFrameIndex >= GpuQueryRingDepth)
|
||||||
|
{
|
||||||
|
_gl.GetQueryObject(_gpuQueryOpaque[gpuQuerySlot], QueryObjectParameterName.ResultAvailable, out int avail);
|
||||||
|
if (avail != 0)
|
||||||
|
{
|
||||||
|
_gl.GetQueryObject(_gpuQueryOpaque[gpuQuerySlot], QueryObjectParameterName.Result, out ulong opaqueNs);
|
||||||
|
_gl.GetQueryObject(_gpuQueryTransparent[gpuQuerySlot], QueryObjectParameterName.Result, out ulong transNs);
|
||||||
|
long gpuUs = (long)((opaqueNs + transNs) / 1000UL);
|
||||||
|
_gpuSamples[_gpuSampleCursor] = gpuUs;
|
||||||
|
_gpuSampleCursor = (_gpuSampleCursor + 1) % _gpuSamples.Length;
|
||||||
|
}
|
||||||
|
// If avail==0 the sample is dropped silently. MedianMicros
|
||||||
|
// computes over the non-zero subset, so dropped samples don't
|
||||||
|
// poison the median.
|
||||||
|
}
|
||||||
|
|
||||||
// ── Phase 7: opaque pass ─────────────────────────────────────────────
|
// ── Phase 7: opaque pass ─────────────────────────────────────────────
|
||||||
if (_opaqueDrawCount > 0)
|
if (_opaqueDrawCount > 0)
|
||||||
{
|
{
|
||||||
|
|
@ -771,7 +805,7 @@ public sealed unsafe class WbDrawDispatcher : IDisposable
|
||||||
// mesh_modern.vert for why this is needed.
|
// mesh_modern.vert for why this is needed.
|
||||||
_shader.SetInt("uDrawIDOffset", 0);
|
_shader.SetInt("uDrawIDOffset", 0);
|
||||||
_gl.BindBuffer(BufferTargetARB.DrawIndirectBuffer, _indirectBuffer);
|
_gl.BindBuffer(BufferTargetARB.DrawIndirectBuffer, _indirectBuffer);
|
||||||
if (diag && _gpuQueriesInitialized) _gl.BeginQuery(QueryTarget.TimeElapsed, _gpuQueryOpaque);
|
if (diag && _gpuQueriesInitialized) _gl.BeginQuery(QueryTarget.TimeElapsed, _gpuQueryOpaque[gpuQuerySlot]);
|
||||||
_gl.MultiDrawElementsIndirect(
|
_gl.MultiDrawElementsIndirect(
|
||||||
PrimitiveType.Triangles,
|
PrimitiveType.Triangles,
|
||||||
DrawElementsType.UnsignedShort,
|
DrawElementsType.UnsignedShort,
|
||||||
|
|
@ -820,7 +854,7 @@ public sealed unsafe class WbDrawDispatcher : IDisposable
|
||||||
_gl.CullFace(TriangleFace.Back);
|
_gl.CullFace(TriangleFace.Back);
|
||||||
_gl.FrontFace(FrontFaceDirection.Ccw);
|
_gl.FrontFace(FrontFaceDirection.Ccw);
|
||||||
_shader.SetInt("uRenderPass", 1);
|
_shader.SetInt("uRenderPass", 1);
|
||||||
if (diag && _gpuQueriesInitialized) _gl.BeginQuery(QueryTarget.TimeElapsed, _gpuQueryTransparent);
|
if (diag && _gpuQueriesInitialized) _gl.BeginQuery(QueryTarget.TimeElapsed, _gpuQueryTransparent[gpuQuerySlot]);
|
||||||
_gl.MultiDrawElementsIndirect(
|
_gl.MultiDrawElementsIndirect(
|
||||||
PrimitiveType.Triangles,
|
PrimitiveType.Triangles,
|
||||||
DrawElementsType.UnsignedShort,
|
DrawElementsType.UnsignedShort,
|
||||||
|
|
@ -843,21 +877,10 @@ public sealed unsafe class WbDrawDispatcher : IDisposable
|
||||||
_cpuSamples[_cpuSampleCursor] = cpuUs;
|
_cpuSamples[_cpuSampleCursor] = cpuUs;
|
||||||
_cpuSampleCursor = (_cpuSampleCursor + 1) % _cpuSamples.Length;
|
_cpuSampleCursor = (_cpuSampleCursor + 1) % _cpuSamples.Length;
|
||||||
|
|
||||||
// Read GPU samples non-blocking; the result for the previous frame's
|
// GPU sample read happens BEFORE issuing the next frame's queries
|
||||||
// queries should be ready by now. If not, drop the sample (don't stall
|
// (just before the Phase 7 opaque pass above). Increment the frame counter here so the
|
||||||
// the CPU waiting for the GPU).
|
// next call computes a fresh slot.
|
||||||
if (_gpuQueriesInitialized)
|
if (_gpuQueriesInitialized) _gpuQueryFrameIndex++;
|
||||||
{
|
|
||||||
_gl.GetQueryObject(_gpuQueryOpaque, QueryObjectParameterName.ResultAvailable, out int avail);
|
|
||||||
if (avail != 0)
|
|
||||||
{
|
|
||||||
_gl.GetQueryObject(_gpuQueryOpaque, QueryObjectParameterName.Result, out ulong opaqueNs);
|
|
||||||
_gl.GetQueryObject(_gpuQueryTransparent, QueryObjectParameterName.Result, out ulong transNs);
|
|
||||||
long gpuUs = (long)((opaqueNs + transNs) / 1000UL);
|
|
||||||
_gpuSamples[_gpuSampleCursor] = gpuUs;
|
|
||||||
_gpuSampleCursor = (_gpuSampleCursor + 1) % _gpuSamples.Length;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
_drawsIssued += _opaqueDrawCount + _transparentDrawCount;
|
_drawsIssued += _opaqueDrawCount + _transparentDrawCount;
|
||||||
_instancesIssued += totalInstances;
|
_instancesIssued += totalInstances;
|
||||||
|
|
@ -1139,8 +1162,11 @@ public sealed unsafe class WbDrawDispatcher : IDisposable
|
||||||
_gl.DeleteBuffer(_indirectBuffer);
|
_gl.DeleteBuffer(_indirectBuffer);
|
||||||
if (_gpuQueriesInitialized)
|
if (_gpuQueriesInitialized)
|
||||||
{
|
{
|
||||||
_gl.DeleteQuery(_gpuQueryOpaque);
|
for (int i = 0; i < GpuQueryRingDepth; i++)
|
||||||
_gl.DeleteQuery(_gpuQueryTransparent);
|
{
|
||||||
|
_gl.DeleteQuery(_gpuQueryOpaque[i]);
|
||||||
|
_gl.DeleteQuery(_gpuQueryTransparent[i]);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue