phase(N.5) Task 10: glMultiDrawElementsIndirect dispatch — visual verified

Replaces WbDrawDispatcher's per-group glDrawElementsInstancedBaseVertexBaseInstance
loop with two glMultiDrawElementsIndirect calls (opaque + transparent).
Per frame, uploads three GPU buffers (two SSBOs plus the indirect command buffer):
- _instanceSsbo @ binding=0 (mat4 per instance, indexed by gl_BaseInstanceARB + gl_InstanceID)
- _batchSsbo @ binding=1 (BatchData per group, indexed by gl_DrawIDARB)
- _indirectBuffer (DrawElementsIndirectCommand[] — opaque first, transparent second)

GameWindow swaps the shader load to mesh_modern when _bindlessSupport
is non-null. Capability detection now runs before both the TextureCache
construction and the mesh shader load, so the shader choice can depend on it.

Deletes the obsolete DrawGroup stub, EnsureInstanceAttribs, _instanceBuffer,
_patchedVaos. ClassifyBatches + ResolveTexture already migrated in
Task 8 to use ulong bindless handles.

BuildIndirectArrays (Task 9) wired in: _opaqueDraws + _translucentDraws
are flattened into IndirectGroupInput[], laid out via the helper into
contiguous indirect commands + parallel BatchData[]. opaqueByteOffset=0,
transparentByteOffset = opaqueCount × DrawCommandStride.

Visual verification (USER GATE) PASS: Holtburg courtyard renders
identical to N.4 — terrain, scenery, characters, NPCs all visible
without artifacts. [N.5] modern path capabilities present + mesh_modern
shader loaded log lines confirm the boot path. [WB-DIAG] hot-path
counters show healthy entity/draw activity.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Erik 2026-05-08 20:51:49 +02:00
parent b163c53622
commit f533414edf
2 changed files with 123 additions and 95 deletions

View file

@ -970,9 +970,9 @@ public sealed class GameWindow : IDisposable
Path.Combine(shadersDir, "terrain.vert"),
Path.Combine(shadersDir, "terrain.frag"));
_meshShader = new Shader(_gl,
Path.Combine(shadersDir, "mesh_instanced.vert"),
Path.Combine(shadersDir, "mesh_instanced.frag"));
// mesh_instanced is the default; Task 10 (N.5) moves the final shader
// selection to after capability detection so mesh_modern can be chosen
// when bindless + ARB_shader_draw_parameters are available. See below.
// Phase G.1/G.2: shared scene-lighting UBO. Stays bound at
// binding=1 for the lifetime of the process — every shader that
@ -1447,6 +1447,23 @@ public sealed class GameWindow : IDisposable
}
}
// N.5 Task 10: load mesh_modern when both extensions are present;
// fall back to mesh_instanced otherwise. Must be after capability
// detection so _bindlessSupport is known.
if (_bindlessSupport is not null)
{
_meshShader = new Shader(_gl,
Path.Combine(shadersDir, "mesh_modern.vert"),
Path.Combine(shadersDir, "mesh_modern.frag"));
Console.WriteLine("[N.5] mesh_modern shader loaded");
}
else
{
_meshShader = new Shader(_gl,
Path.Combine(shadersDir, "mesh_instanced.vert"),
Path.Combine(shadersDir, "mesh_instanced.frag"));
}
_textureCache = new TextureCache(_gl, _dats, _bindlessSupport);
// Two persistent GL sampler objects (Repeat + ClampToEdge) so
// the sky pass can pick wrap mode per submesh without mutating

View file

@ -32,18 +32,19 @@ namespace AcDream.App.Rendering.Wb;
/// </para>
///
/// <para>
/// <b>GL strategy:</b> GROUPED instanced drawing. All visible (entity, batch)
/// pairs are bucketed by <see cref="GroupKey"/>; within a group a single
/// <c>glDrawElementsInstancedBaseVertexBaseInstance</c> renders all instances.
/// All matrices for the frame land in one shared instance VBO via a single
/// <c>BufferData</c> upload. This drops draw calls from O(entities×batches)
/// to O(unique GfxObj×batch×texture) — typically two orders of magnitude fewer.
/// <b>GL strategy (N.5):</b> <c>glMultiDrawElementsIndirect</c> with SSBOs.
/// All visible (entity, batch) pairs are bucketed by <see cref="GroupKey"/>;
/// each group becomes one <c>DrawElementsIndirectCommand</c>. Three GPU buffers
/// are uploaded per frame: instance matrices (SSBO binding 0), per-group batch
/// metadata/texture handles (SSBO binding 1), and the indirect draw commands.
/// Two <c>glMultiDrawElementsIndirect</c> calls cover the opaque and transparent
/// passes respectively — one GL call per pass regardless of group count.
/// </para>
///
/// <para>
/// <b>Shader:</b> reuses <c>mesh_instanced</c> (vert locations 0-2 = Position/
/// Normal/UV from WB's <c>VertexPositionNormalTexture</c>; locations 3-6 = instance
/// matrix from our VBO). WB's 32-byte vertex stride is compatible.
/// <b>Shader:</b> <c>mesh_modern</c> when bindless + ARB_shader_draw_parameters
/// are available (N.5 path). Falls back to <c>mesh_instanced</c> when the GPU
/// lacks those extensions.
/// </para>
///
/// <para>
@ -74,11 +75,9 @@ public sealed unsafe class WbDrawDispatcher : IDisposable
private BatchData[] _batchData = new BatchData[256];
private DrawElementsIndirectCommand[] _indirectCommands = new DrawElementsIndirectCommand[256];
#pragma warning disable CS0169 // Tasks 9-10 wire these counters
private int _opaqueDrawCount;
private int _transparentDrawCount;
private int _transparentByteOffset;
#pragma warning restore CS0169
// std430 layout: ulong TextureHandle (uvec2) at offset 0, uint TextureLayer
// at offset 8, uint Flags at offset 12. Total 16 bytes.
@ -94,13 +93,10 @@ public sealed unsafe class WbDrawDispatcher : IDisposable
public uint Flags;
}
private readonly HashSet<uint> _patchedVaos = new();
// Per-frame scratch — reused across frames to avoid per-frame allocation.
private readonly Dictionary<GroupKey, InstanceGroup> _groups = new();
private readonly List<InstanceGroup> _opaqueDraws = new();
private readonly List<InstanceGroup> _translucentDraws = new();
private float[] _instanceBuffer = new float[256 * 16]; // grow on demand, never shrink
// Per-entity-cull AABB radius. Conservative — covers most entities; large
// outliers (long banners, tall columns) are still landblock-culled.
@ -275,8 +271,7 @@ public sealed unsafe class WbDrawDispatcher : IDisposable
return;
}
// ── Phase 2: lay matrices out contiguously, assign per-group offsets,
// split into opaque/translucent + compute sort keys ─────────
// ── Phase 3: assign FirstInstance per group, lay matrices contiguously, sort opaque ──
int totalInstances = 0;
foreach (var grp in _groups.Values) totalInstances += grp.Matrices.Count;
if (totalInstances == 0)
@ -286,8 +281,8 @@ public sealed unsafe class WbDrawDispatcher : IDisposable
}
int needed = totalInstances * 16;
if (_instanceBuffer.Length < needed)
_instanceBuffer = new float[needed + 256 * 16]; // headroom
if (_instanceData.Length < needed)
_instanceData = new float[needed + 256 * 16];
_opaqueDraws.Clear();
_translucentDraws.Clear();
@ -304,17 +299,17 @@ public sealed unsafe class WbDrawDispatcher : IDisposable
// position for front-to-back sort (perf #2). Cheap heuristic; works
// well when instances of one group are spatially coherent
// (typical for trees in one landblock area, NPCs at one spawn).
var firstM = grp.Matrices[0];
var grpPos = new Vector3(firstM.M41, firstM.M42, firstM.M43);
var first = grp.Matrices[0];
var grpPos = new Vector3(first.M41, first.M42, first.M43);
grp.SortDistance = Vector3.DistanceSquared(camPos, grpPos);
for (int i = 0; i < grp.Matrices.Count; i++)
{
WriteMatrix(_instanceBuffer, cursor * 16, grp.Matrices[i]);
WriteMatrix(_instanceData, cursor * 16, grp.Matrices[i]);
cursor++;
}
if (grp.Translucency == TranslucencyKind.Opaque || grp.Translucency == TranslucencyKind.ClipMap)
if (IsOpaque(grp.Translucency))
_opaqueDraws.Add(grp);
else
_translucentDraws.Add(grp);
@ -326,82 +321,115 @@ public sealed unsafe class WbDrawDispatcher : IDisposable
// Foundry interior).
_opaqueDraws.Sort(static (a, b) => a.SortDistance.CompareTo(b.SortDistance));
// ── Phase 3: one upload of all matrices ─────────────────────────────
// NOTE: _instanceSsbo is temporarily bound as ArrayBuffer for compile
// compatibility. Tasks 9-10 rewrite this to BindBufferBase(SSBO) +
// glMultiDrawElementsIndirect.
_gl.BindBuffer(BufferTargetARB.ArrayBuffer, _instanceSsbo);
fixed (float* p = _instanceBuffer)
_gl.BufferData(BufferTargetARB.ArrayBuffer,
(nuint)(totalInstances * 16 * sizeof(float)), p, BufferUsageARB.DynamicDraw);
// ── Phase 4: build IndirectGroupInput list (opaque sorted, then translucent),
// fill via BuildIndirectArrays ──────────────────────────────────
int totalDraws = _opaqueDraws.Count + _translucentDraws.Count;
if (_batchData.Length < totalDraws)
_batchData = new BatchData[totalDraws + 64];
if (_indirectCommands.Length < totalDraws)
_indirectCommands = new DrawElementsIndirectCommand[totalDraws + 64];
// ── Phase 4: bind VAO once (modern rendering shares one global VAO) ──
EnsureInstanceAttribs(anyVao);
var groupInputs = new List<IndirectGroupInput>(totalDraws);
foreach (var g in _opaqueDraws) groupInputs.Add(ToInput(g));
foreach (var g in _translucentDraws) groupInputs.Add(ToInput(g));
// BuildIndirectArrays writes into BatchDataPublic[] (the public mirror of the private BatchData),
// so it fills a temporary array here; the results are copied back into _batchData below.
// Layout is asserted at test time (BatchDataPublic_LayoutMatchesPrivateBatchData test).
var batchPublic = new BatchDataPublic[totalDraws];
var layout = BuildIndirectArrays(groupInputs, _indirectCommands, batchPublic);
// Copy back into _batchData
for (int i = 0; i < totalDraws; i++)
{
_batchData[i] = new BatchData
{
TextureHandle = batchPublic[i].TextureHandle,
TextureLayer = batchPublic[i].TextureLayer,
Flags = batchPublic[i].Flags,
};
}
_opaqueDrawCount = layout.OpaqueCount;
_transparentDrawCount = layout.TransparentCount;
_transparentByteOffset = layout.TransparentByteOffset;
// ── Phase 5: upload three buffers ───────────────────────────────────
fixed (float* ip = _instanceData)
UploadSsbo(_instanceSsbo, 0, ip, totalInstances * 16 * sizeof(float));
fixed (BatchData* bp = _batchData)
UploadSsbo(_batchSsbo, 1, bp, totalDraws * sizeof(BatchData));
fixed (DrawElementsIndirectCommand* cp = _indirectCommands)
{
_gl.BindBuffer(BufferTargetARB.DrawIndirectBuffer, _indirectBuffer);
_gl.BufferData(BufferTargetARB.DrawIndirectBuffer,
(nuint)(totalDraws * sizeof(DrawElementsIndirectCommand)), cp, BufferUsageARB.DynamicDraw);
}
// ── Phase 6: bind global VAO once ───────────────────────────────────
_gl.BindVertexArray(anyVao);
// ── Phase 5: opaque + ClipMap pass (front-to-back sorted) ───────────
if (string.Equals(Environment.GetEnvironmentVariable("ACDREAM_NO_CULL"), "1", StringComparison.Ordinal))
_gl.Disable(EnableCap.CullFace);
foreach (var grp in _opaqueDraws)
// ── Phase 7: opaque pass ─────────────────────────────────────────────
if (_opaqueDrawCount > 0)
{
_shader.SetInt("uTranslucencyKind", (int)grp.Translucency);
DrawGroup(grp);
_gl.Disable(EnableCap.Blend);
_gl.DepthMask(true);
_shader.SetInt("uRenderPass", 0);
_gl.BindBuffer(BufferTargetARB.DrawIndirectBuffer, _indirectBuffer);
_gl.MultiDrawElementsIndirect(
PrimitiveType.Triangles,
DrawElementsType.UnsignedShort,
(void*)0,
(uint)_opaqueDrawCount,
(uint)DrawCommandStride);
}
// ── Phase 6: translucent pass ───────────────────────────────────────
_gl.Enable(EnableCap.Blend);
_gl.DepthMask(false);
if (string.Equals(Environment.GetEnvironmentVariable("ACDREAM_NO_CULL"), "1", StringComparison.Ordinal))
// ── Phase 8: transparent pass ────────────────────────────────────────
if (_transparentDrawCount > 0)
{
_gl.Disable(EnableCap.CullFace);
}
else
{
_gl.Enable(EnableCap.CullFace);
_gl.CullFace(TriangleFace.Back);
_gl.FrontFace(FrontFaceDirection.Ccw);
_gl.Enable(EnableCap.Blend);
_gl.BlendFunc(BlendingFactor.SrcAlpha, BlendingFactor.OneMinusSrcAlpha);
_gl.DepthMask(false);
_shader.SetInt("uRenderPass", 1);
_gl.MultiDrawElementsIndirect(
PrimitiveType.Triangles,
DrawElementsType.UnsignedShort,
(void*)_transparentByteOffset,
(uint)_transparentDrawCount,
(uint)DrawCommandStride);
_gl.DepthMask(true);
_gl.Disable(EnableCap.Blend);
}
foreach (var grp in _translucentDraws)
{
switch (grp.Translucency)
{
case TranslucencyKind.Additive:
_gl.BlendFunc(BlendingFactor.SrcAlpha, BlendingFactor.One);
break;
case TranslucencyKind.InvAlpha:
_gl.BlendFunc(BlendingFactor.OneMinusSrcAlpha, BlendingFactor.SrcAlpha);
break;
default:
_gl.BlendFunc(BlendingFactor.SrcAlpha, BlendingFactor.OneMinusSrcAlpha);
break;
}
_shader.SetInt("uTranslucencyKind", (int)grp.Translucency);
DrawGroup(grp);
}
_gl.DepthMask(true);
_gl.Disable(EnableCap.Blend);
_gl.Disable(EnableCap.CullFace);
_gl.BindVertexArray(0);
if (diag)
{
_drawsIssued += _opaqueDraws.Count + _translucentDraws.Count;
_drawsIssued += _opaqueDrawCount + _transparentDrawCount;
_instancesIssued += totalInstances;
MaybeFlushDiag();
}
}
private void DrawGroup(InstanceGroup grp)
/// <summary>
/// Flattens one <see cref="InstanceGroup"/> into the <see cref="IndirectGroupInput"/>
/// shape: index range, base vertex, instance range, bindless texture handle/layer
/// and translucency are copied across field-for-field.
/// </summary>
/// <param name="g">The group to convert.</param>
/// <returns>A new <see cref="IndirectGroupInput"/> carrying the group's values.</returns>
private static IndirectGroupInput ToInput(InstanceGroup g)
{
    return new IndirectGroupInput(
        IndexCount: g.IndexCount,
        FirstIndex: g.FirstIndex,
        BaseVertex: g.BaseVertex,
        InstanceCount: g.InstanceCount,
        FirstInstance: g.FirstInstance,
        TextureHandle: g.BindlessTextureHandle,
        TextureLayer: g.TextureLayer,
        Translucency: g.Translucency);
}
/// <summary>
/// Uploads <paramref name="byteCount"/> bytes from <paramref name="data"/> into the buffer
/// <paramref name="ssbo"/> and attaches that buffer to shader-storage binding point
/// <paramref name="binding"/>.
/// </summary>
/// <param name="ssbo">GL buffer object name to fill.</param>
/// <param name="binding">Shader-storage binding index passed to <c>BindBufferBase</c>.</param>
/// <param name="data">Pointer to the source bytes; the caller keeps it pinned for the call.</param>
/// <param name="byteCount">Number of bytes to upload; cast to <c>nuint</c> without a range check.</param>
private unsafe void UploadSsbo(uint ssbo, uint binding, void* data, int byteCount)
{
// NOTE(review): this throw reads like the body of the removed DrawGroup stub, interleaved
// here by the diff view. As displayed it would make the three GL calls below unreachable —
// confirm against the real file before relying on this listing.
throw new NotImplementedException(
"DrawGroup is being removed in Task 10 — the dispatcher rewrites Draw() " +
"to use glMultiDrawElementsIndirect instead of per-group draws. " +
"If this throws at runtime, Task 10 hasn't landed yet.");
// Bind to the generic shader-storage target so BufferData addresses this buffer.
_gl.BindBuffer(BufferTargetARB.ShaderStorageBuffer, ssbo);
// Specify the buffer contents as byteCount bytes copied from data, with the DynamicDraw usage hint.
_gl.BufferData(BufferTargetARB.ShaderStorageBuffer, (nuint)byteCount, data, BufferUsageARB.DynamicDraw);
// Attach the buffer to the indexed binding point given by the caller.
_gl.BindBufferBase(BufferTargetARB.ShaderStorageBuffer, binding, ssbo);
}
private void MaybeFlushDiag()
@ -495,23 +523,6 @@ public sealed unsafe class WbDrawDispatcher : IDisposable
}
}
/// <summary>
/// One-time setup, per VAO, of four per-instance vertex attributes at locations 3-6.
/// Each attribute reads 4 floats from <c>_instanceSsbo</c> (bound as an array buffer)
/// with a 64-byte stride and a divisor of 1. Leaves <paramref name="vao"/> bound.
/// </summary>
/// <param name="vao">Vertex array object to patch; repeat calls for the same VAO are no-ops.</param>
private void EnsureInstanceAttribs(uint vao)
{
// HashSet.Add returns false when this VAO was already patched — nothing more to do.
if (!_patchedVaos.Add(vao)) return;
_gl.BindVertexArray(vao);
// NOTE: temporarily binding _instanceSsbo as ArrayBuffer for compile
// compatibility. Tasks 9-10 replace with BindBufferBase(SSBO).
_gl.BindBuffer(BufferTargetARB.ArrayBuffer, _instanceSsbo);
// Four consecutive 16-byte slices of each 64-byte instance record.
for (uint row = 0; row < 4; row++)
{
// Attribute locations 3, 4, 5, 6.
uint loc = 3 + row;
_gl.EnableVertexAttribArray(loc);
// 4 floats, not normalized, 64-byte stride, starting at byte offset row * 16.
_gl.VertexAttribPointer(loc, 4, VertexAttribPointerType.Float, false, 64, (void*)(row * 16));
// Divisor 1: the attribute advances once per instance instead of once per vertex.
_gl.VertexAttribDivisor(loc, 1);
}
}
private static void WriteMatrix(float[] buf, int offset, in Matrix4x4 m)
{
buf[offset + 0] = m.M11; buf[offset + 1] = m.M12; buf[offset + 2] = m.M13; buf[offset + 3] = m.M14;