diff --git a/src/AcDream.App/Rendering/Wb/WbDrawDispatcher.cs b/src/AcDream.App/Rendering/Wb/WbDrawDispatcher.cs
index d0dbd82..605b1e6 100644
--- a/src/AcDream.App/Rendering/Wb/WbDrawDispatcher.cs
+++ b/src/AcDream.App/Rendering/Wb/WbDrawDispatcher.cs
@@ -152,8 +152,22 @@ public sealed unsafe class WbDrawDispatcher : IDisposable
     private readonly System.Diagnostics.Stopwatch _cpuStopwatch = new();
     private readonly long[] _cpuSamples = new long[256]; // microseconds
     private int _cpuSampleCursor;
-    private uint _gpuQueryOpaque;
-    private uint _gpuQueryTransparent;
+    // GPU timing uses a ring of 3 query-pair slots so the read of frame N-3's
+    // result lands when the GPU has finished (~50ms after issue on a typical
+    // 60fps frame). Ring of 3 is the vendor-neutral choice: NVIDIA drivers with
+    // triple-buffering+vsync can queue ~3 frames ahead, AMD typically 1-2,
+    // Intel iGPUs vary. ResultAvailable is the safety guard if the GPU is
+    // still working when we try to read.
+    private const int GpuQueryRingDepth = 3;
+    private readonly uint[] _gpuQueryOpaque = new uint[GpuQueryRingDepth];
+    private readonly uint[] _gpuQueryTransparent = new uint[GpuQueryRingDepth];
+    // Per-slot flags: true while a query begun in that slot has not been
+    // harvested yet. A pass that is skipped leaves its flag false, so the
+    // read-back never touches a stale or never-begun query object.
+    private readonly bool[] _gpuQueryOpaqueIssued = new bool[GpuQueryRingDepth];
+    private readonly bool[] _gpuQueryTransparentIssued = new bool[GpuQueryRingDepth];
+    // Ring slot used by the current frame; always in [0, GpuQueryRingDepth).
+    private int _gpuQueryRingCursor;
     private readonly long[] _gpuSamples = new long[256]; // microseconds
     private int _gpuSampleCursor;
     private bool _gpuQueriesInitialized;
@@ -346,8 +360,11 @@ public sealed unsafe class WbDrawDispatcher : IDisposable
 
         if (diag && !_gpuQueriesInitialized)
         {
-            _gpuQueryOpaque = _gl.GenQuery();
-            _gpuQueryTransparent = _gl.GenQuery();
+            for (int i = 0; i < GpuQueryRingDepth; i++)
+            {
+                _gpuQueryOpaque[i] = _gl.GenQuery();
+                _gpuQueryTransparent[i] = _gl.GenQuery();
+            }
             _gpuQueriesInitialized = true;
         }
 
@@ -754,6 +771,45 @@ public sealed unsafe class WbDrawDispatcher : IDisposable
         if (string.Equals(Environment.GetEnvironmentVariable("ACDREAM_NO_CULL"), "1", StringComparison.Ordinal))
             _gl.Disable(EnableCap.CullFace);
 
+        // GPU timing: pick this frame's ring slot and harvest whatever that
+        // slot still holds from GpuQueryRingDepth frames ago, before the
+        // passes below re-issue its queries. Hoisted to function scope so both
+        // the opaque and transparent passes can reference gpuQuerySlot. See
+        // spec §3 Q1/Q2 + §4 in
+        // docs/superpowers/specs/2026-05-11-phase-n6-slice1-design.md.
+        int gpuQuerySlot = _gpuQueryRingCursor;
+        if (_gpuQueriesInitialized)
+        {
+            bool opaqueIssued = _gpuQueryOpaqueIssued[gpuQuerySlot];
+            bool transIssued = _gpuQueryTransparentIssued[gpuQuerySlot];
+            // Consume the slot whatever happens next, so an old result is
+            // never counted twice (e.g. after a pass is skipped or diag is
+            // switched off).
+            _gpuQueryOpaqueIssued[gpuQuerySlot] = false;
+            _gpuQueryTransparentIssued[gpuQuerySlot] = false;
+
+            // Only query objects that were actually begun are interrogated,
+            // and every one of them must report ResultAvailable: fetching
+            // Result from a query that is still in flight blocks the CPU.
+            int opaqueAvail = 1;
+            int transAvail = 1;
+            if (opaqueIssued) _gl.GetQueryObject(_gpuQueryOpaque[gpuQuerySlot], QueryObjectParameterName.ResultAvailable, out opaqueAvail);
+            if (transIssued) _gl.GetQueryObject(_gpuQueryTransparent[gpuQuerySlot], QueryObjectParameterName.ResultAvailable, out transAvail);
+            if ((opaqueIssued || transIssued) && opaqueAvail != 0 && transAvail != 0)
+            {
+                ulong opaqueNs = 0;
+                ulong transNs = 0;
+                if (opaqueIssued) _gl.GetQueryObject(_gpuQueryOpaque[gpuQuerySlot], QueryObjectParameterName.Result, out opaqueNs);
+                if (transIssued) _gl.GetQueryObject(_gpuQueryTransparent[gpuQuerySlot], QueryObjectParameterName.Result, out transNs);
+                long gpuUs = (long)((opaqueNs + transNs) / 1000UL);
+                _gpuSamples[_gpuSampleCursor] = gpuUs;
+                _gpuSampleCursor = (_gpuSampleCursor + 1) % _gpuSamples.Length;
+            }
+            // Otherwise the sample is dropped silently. MedianMicros
+            // computes over the non-zero subset, so dropped samples don't
+            // poison the median.
+        }
+
         // ── Phase 7: opaque pass ─────────────────────────────────────────────
         if (_opaqueDrawCount > 0)
         {
@@ -771,7 +827,11 @@ public sealed unsafe class WbDrawDispatcher : IDisposable
             // mesh_modern.vert for why this is needed.
             _shader.SetInt("uDrawIDOffset", 0);
             _gl.BindBuffer(BufferTargetARB.DrawIndirectBuffer, _indirectBuffer);
-            if (diag && _gpuQueriesInitialized) _gl.BeginQuery(QueryTarget.TimeElapsed, _gpuQueryOpaque);
+            if (diag && _gpuQueriesInitialized)
+            {
+                _gl.BeginQuery(QueryTarget.TimeElapsed, _gpuQueryOpaque[gpuQuerySlot]);
+                _gpuQueryOpaqueIssued[gpuQuerySlot] = true;
+            }
             _gl.MultiDrawElementsIndirect(
                 PrimitiveType.Triangles,
                 DrawElementsType.UnsignedShort,
@@ -820,7 +880,11 @@ public sealed unsafe class WbDrawDispatcher : IDisposable
             _gl.CullFace(TriangleFace.Back);
             _gl.FrontFace(FrontFaceDirection.Ccw);
             _shader.SetInt("uRenderPass", 1);
-            if (diag && _gpuQueriesInitialized) _gl.BeginQuery(QueryTarget.TimeElapsed, _gpuQueryTransparent);
+            if (diag && _gpuQueriesInitialized)
+            {
+                _gl.BeginQuery(QueryTarget.TimeElapsed, _gpuQueryTransparent[gpuQuerySlot]);
+                _gpuQueryTransparentIssued[gpuQuerySlot] = true;
+            }
             _gl.MultiDrawElementsIndirect(
                 PrimitiveType.Triangles,
                 DrawElementsType.UnsignedShort,
@@ -843,21 +907,10 @@ public sealed unsafe class WbDrawDispatcher : IDisposable
         _cpuSamples[_cpuSampleCursor] = cpuUs;
         _cpuSampleCursor = (_cpuSampleCursor + 1) % _cpuSamples.Length;
 
-        // Read GPU samples non-blocking; the result for the previous frame's
-        // queries should be ready by now. If not, drop the sample (don't stall
-        // the CPU waiting for the GPU).
-        if (_gpuQueriesInitialized)
-        {
-            _gl.GetQueryObject(_gpuQueryOpaque, QueryObjectParameterName.ResultAvailable, out int avail);
-            if (avail != 0)
-            {
-                _gl.GetQueryObject(_gpuQueryOpaque, QueryObjectParameterName.Result, out ulong opaqueNs);
-                _gl.GetQueryObject(_gpuQueryTransparent, QueryObjectParameterName.Result, out ulong transNs);
-                long gpuUs = (long)((opaqueNs + transNs) / 1000UL);
-                _gpuSamples[_gpuSampleCursor] = gpuUs;
-                _gpuSampleCursor = (_gpuSampleCursor + 1) % _gpuSamples.Length;
-            }
-        }
+        // The GPU sample for this slot was read BEFORE the passes above
+        // re-issued its queries. Advance the ring cursor so the next call
+        // uses the following slot; wrapping here keeps it a valid index.
+        if (_gpuQueriesInitialized) _gpuQueryRingCursor = (_gpuQueryRingCursor + 1) % GpuQueryRingDepth;
 
         _drawsIssued += _opaqueDrawCount + _transparentDrawCount;
         _instancesIssued += totalInstances;
@@ -1139,7 +1192,10 @@ public sealed unsafe class WbDrawDispatcher : IDisposable
         _gl.DeleteBuffer(_indirectBuffer);
         if (_gpuQueriesInitialized)
         {
-            _gl.DeleteQuery(_gpuQueryOpaque);
-            _gl.DeleteQuery(_gpuQueryTransparent);
+            for (int i = 0; i < GpuQueryRingDepth; i++)
+            {
+                _gl.DeleteQuery(_gpuQueryOpaque[i]);
+                _gl.DeleteQuery(_gpuQueryTransparent[i]);
+            }
         }
     }