diff --git a/src/AcDream.App/Rendering/Wb/WbDrawDispatcher.cs b/src/AcDream.App/Rendering/Wb/WbDrawDispatcher.cs
index d0dbd82..605b1e6 100644
--- a/src/AcDream.App/Rendering/Wb/WbDrawDispatcher.cs
+++ b/src/AcDream.App/Rendering/Wb/WbDrawDispatcher.cs
@@ -152,8 +152,16 @@ public sealed unsafe class WbDrawDispatcher : IDisposable
     private readonly System.Diagnostics.Stopwatch _cpuStopwatch = new();
     private readonly long[] _cpuSamples = new long[256];   // microseconds
     private int _cpuSampleCursor;
-    private uint _gpuQueryOpaque;
-    private uint _gpuQueryTransparent;
+    // GPU timing uses a ring of 3 query-pair slots so that frame N-3's result
+    // is read only after the GPU has had time to finish it (3 frames, i.e.
+    // ~50ms at 60fps). Ring of 3 is the vendor-neutral choice: NVIDIA drivers
+    // with triple-buffering+vsync can queue ~3 frames ahead, AMD typically 1-2,
+    // Intel iGPUs vary. ResultAvailable is the safety guard if the GPU is
+    // still working when we try to read.
+    private const int GpuQueryRingDepth = 3;
+    private readonly uint[] _gpuQueryOpaque      = new uint[GpuQueryRingDepth];
+    private readonly uint[] _gpuQueryTransparent = new uint[GpuQueryRingDepth];
+    private int _gpuQueryFrameIndex;
     private readonly long[] _gpuSamples = new long[256];   // microseconds
     private int _gpuSampleCursor;
     private bool _gpuQueriesInitialized;
@@ -346,8 +354,11 @@ public sealed unsafe class WbDrawDispatcher : IDisposable
 
         if (diag && !_gpuQueriesInitialized)
         {
-            _gpuQueryOpaque      = _gl.GenQuery();
-            _gpuQueryTransparent = _gl.GenQuery();
+            for (int i = 0; i < GpuQueryRingDepth; i++)
+            {
+                _gpuQueryOpaque[i]      = _gl.GenQuery();
+                _gpuQueryTransparent[i] = _gl.GenQuery();
+            }
             _gpuQueriesInitialized = true;
         }
 
@@ -754,6 +765,29 @@ public sealed unsafe class WbDrawDispatcher : IDisposable
         if (string.Equals(Environment.GetEnvironmentVariable("ACDREAM_NO_CULL"), "1", StringComparison.Ordinal))
             _gl.Disable(EnableCap.CullFace);
 
+        // GPU timing: compute this frame's ring slot. We read frame N-3's
+        // result (the oldest data in the ring) before overwriting it with
+        // frame N's queries. Hoisted to function scope so both the opaque
+        // and transparent passes below can reference gpuQuerySlot. See spec
+        // §3 Q1/Q2 + §4 in
+        // docs/superpowers/specs/2026-05-11-phase-n6-slice1-design.md.
+        int gpuQuerySlot = _gpuQueryFrameIndex % GpuQueryRingDepth;
+        if (_gpuQueriesInitialized && _gpuQueryFrameIndex >= GpuQueryRingDepth)
+        {
+            _gl.GetQueryObject(_gpuQueryOpaque[gpuQuerySlot], QueryObjectParameterName.ResultAvailable, out int avail);
+            if (avail != 0)
+            {
+                _gl.GetQueryObject(_gpuQueryOpaque[gpuQuerySlot],      QueryObjectParameterName.Result, out ulong opaqueNs);
+                _gl.GetQueryObject(_gpuQueryTransparent[gpuQuerySlot], QueryObjectParameterName.Result, out ulong transNs);
+                long gpuUs = (long)((opaqueNs + transNs) / 1000UL);
+                _gpuSamples[_gpuSampleCursor] = gpuUs;
+                _gpuSampleCursor = (_gpuSampleCursor + 1) % _gpuSamples.Length;
+            }
+            // If avail==0 the sample is dropped silently. MedianMicros computes over
+            // the non-zero subset, so dropped samples don't poison the median.
+            // NOTE(review): only the opaque query is polled; the transparent Result read may still block.
+        }
+
         // ── Phase 7: opaque pass ─────────────────────────────────────────────
         if (_opaqueDrawCount > 0)
         {
@@ -771,7 +805,7 @@ public sealed unsafe class WbDrawDispatcher : IDisposable
             // mesh_modern.vert for why this is needed.
             _shader.SetInt("uDrawIDOffset", 0);
             _gl.BindBuffer(BufferTargetARB.DrawIndirectBuffer, _indirectBuffer);
-            if (diag && _gpuQueriesInitialized) _gl.BeginQuery(QueryTarget.TimeElapsed, _gpuQueryOpaque);
+            if (diag && _gpuQueriesInitialized) _gl.BeginQuery(QueryTarget.TimeElapsed, _gpuQueryOpaque[gpuQuerySlot]);
             _gl.MultiDrawElementsIndirect(
                 PrimitiveType.Triangles,
                 DrawElementsType.UnsignedShort,
@@ -820,7 +854,7 @@ public sealed unsafe class WbDrawDispatcher : IDisposable
             _gl.CullFace(TriangleFace.Back);
             _gl.FrontFace(FrontFaceDirection.Ccw);
             _shader.SetInt("uRenderPass", 1);
-            if (diag && _gpuQueriesInitialized) _gl.BeginQuery(QueryTarget.TimeElapsed, _gpuQueryTransparent);
+            if (diag && _gpuQueriesInitialized) _gl.BeginQuery(QueryTarget.TimeElapsed, _gpuQueryTransparent[gpuQuerySlot]);
             _gl.MultiDrawElementsIndirect(
                 PrimitiveType.Triangles,
                 DrawElementsType.UnsignedShort,
@@ -843,21 +877,10 @@ public sealed unsafe class WbDrawDispatcher : IDisposable
             _cpuSamples[_cpuSampleCursor] = cpuUs;
             _cpuSampleCursor = (_cpuSampleCursor + 1) % _cpuSamples.Length;
 
-            // Read GPU samples non-blocking; the result for the previous frame's
-            // queries should be ready by now. If not, drop the sample (don't stall
-            // the CPU waiting for the GPU).
-            if (_gpuQueriesInitialized)
-            {
-                _gl.GetQueryObject(_gpuQueryOpaque, QueryObjectParameterName.ResultAvailable, out int avail);
-                if (avail != 0)
-                {
-                    _gl.GetQueryObject(_gpuQueryOpaque, QueryObjectParameterName.Result, out ulong opaqueNs);
-                    _gl.GetQueryObject(_gpuQueryTransparent, QueryObjectParameterName.Result, out ulong transNs);
-                    long gpuUs = (long)((opaqueNs + transNs) / 1000UL);
-                    _gpuSamples[_gpuSampleCursor] = gpuUs;
-                    _gpuSampleCursor = (_gpuSampleCursor + 1) % _gpuSamples.Length;
-                }
-            }
+            // GPU sample read happens BEFORE issuing the next frame's queries
+            // (see the ring-slot read just ahead of the opaque pass). Advance
+            // the frame counter here so the next call computes a fresh slot.
+            if (_gpuQueriesInitialized) _gpuQueryFrameIndex++;
 
             _drawsIssued     += _opaqueDrawCount + _transparentDrawCount;
             _instancesIssued += totalInstances;
@@ -1139,8 +1162,11 @@ public sealed unsafe class WbDrawDispatcher : IDisposable
         _gl.DeleteBuffer(_indirectBuffer);
         if (_gpuQueriesInitialized)
         {
-            _gl.DeleteQuery(_gpuQueryOpaque);
-            _gl.DeleteQuery(_gpuQueryTransparent);
+            for (int i = 0; i < GpuQueryRingDepth; i++)
+            {
+                _gl.DeleteQuery(_gpuQueryOpaque[i]);
+                _gl.DeleteQuery(_gpuQueryTransparent[i]);
+            }
         }
     }