using System;
using System.Collections.Generic;
using System.Numerics;
using System.Runtime.InteropServices;
using AcDream.Core.Meshing;
using AcDream.Core.Terrain;
using AcDream.Core.World;
using Chorizite.OpenGLSDLBackend.Lib;
using Silk.NET.OpenGL;

namespace AcDream.App.Rendering.Wb;

/// <summary>
/// Draws entities using WB's mesh pipeline (a single global VAO/VBO/IBO under
/// modern rendering) together with acdream's texture cache for bindless
/// texture resolution and acdream's translucency classification.
/// </summary>
/// <remarks>
/// <para>
/// <b>Atlas-tier</b> entities (ServerGuid == 0): mesh data comes from WB via
/// the mesh adapter. Textures resolve through the bindless-suffixed texture
/// cache variants, returning 64-bit resident handles stored in the per-group
/// SSBO.
/// </para>
/// <para>
/// <b>Per-instance-tier</b> entities (ServerGuid != 0): mesh data also from
/// WB, but textures resolve through the texture cache with palette and
/// surface overrides applied. Per-entity animated-state overrides are
/// currently unused at draw time — GameWindow's spawn path already bakes
/// AnimPartChanges + GfxObjDegradeResolver (Issue #47 close-detail mesh)
/// into MeshRefs.
/// </para>
/// <para>
/// <b>GL strategy (N.5 — mandatory):</b> <c>glMultiDrawElementsIndirect</c>
/// with SSBOs and <c>GL_ARB_bindless_texture</c> +
/// <c>GL_ARB_shader_draw_parameters</c>. All visible (entity, batch) pairs
/// are bucketed by group key; each group becomes one
/// <c>DrawElementsIndirectCommand</c>. Three GPU buffers are uploaded per
/// frame: instance matrices (SSBO binding 0), per-group batch
/// metadata/texture handles (SSBO binding 1), and the indirect draw commands.
/// Two <c>glMultiDrawElementsIndirect</c> calls cover the opaque and
/// transparent passes respectively — one GL call per pass regardless of
/// group count.
/// </para>
/// <para>
/// <b>Shader:</b> <c>mesh_modern</c> (bindless + <c>gl_DrawIDARB</c> /
/// <c>gl_BaseInstanceARB</c>). Missing bindless/draw-parameters throws at
/// startup — there is no legacy fallback.
/// </para>
/// <para>
/// <b>Modern rendering assumption:</b> WB's <c>_useModernRendering</c> path
/// (GL 4.3 + bindless) puts every mesh in a single shared VAO/VBO/IBO and
/// uses <c>FirstIndex</c> + <c>BaseVertex</c> per batch. The dispatcher honors
/// those offsets inside each <c>DrawElementsIndirectCommand</c> via
/// <c>glMultiDrawElementsIndirect</c>.
/// </para>
/// </remarks>
public sealed unsafe class WbDrawDispatcher : IDisposable
{
    private readonly GL _gl;
    private readonly Shader _shader;
    private readonly TextureCache _textures;
    private readonly WbMeshAdapter _meshAdapter;
    private readonly EntitySpawnAdapter _entitySpawnAdapter;
    private readonly BindlessSupport _bindless;

    // Tier 1 cache (#53): per-entity classification results for static
    // entities (those NOT in GameWindow._animatedEntities). Wired here in
    // Task 7 for plumbing only — Tasks 9-10 wire the per-entity
    // miss-populate / hit-fast-path through the loop.
    private readonly EntityClassificationCache _cache;

    // ACDREAM_DISABLE_TIER1_CACHE=1 A/B diagnostic — forces every static
    // entity through the slow path. Evaluated once, when the field
    // initializer runs at construction.
    private readonly bool _tier1CacheDisabled =
        string.Equals(Environment.GetEnvironmentVariable("ACDREAM_DISABLE_TIER1_CACHE"), "1", StringComparison.Ordinal);

    /// <summary>
    /// A.5 T22.5: gate for <c>GL_SAMPLE_ALPHA_TO_COVERAGE</c> around the opaque pass.
    /// Default true matches T20 behavior. Set false for Low/Medium presets that
    /// have MsaaSamples=0 (A2C is a no-op without MSAA, but turning it off
    /// avoids the unnecessary GL state thrash and is cleaner diagnostics).
    /// Can be toggled mid-session.
    /// </summary>
    public bool AlphaToCoverage { get; set; } = true;

    // SSBO buffer ids
    private uint _instanceSsbo;
    private uint _batchSsbo;
    private uint _indirectBuffer;

    // Per-frame scratch arrays — Tasks 9-10 fully wire these.
    private float[] _instanceData = new float[256 * 16]; // mat4 floats per instance
    private BatchData[] _batchData = new BatchData[256];
    private DrawElementsIndirectCommand[] _indirectCommands = new DrawElementsIndirectCommand[256];
    private int _opaqueDrawCount;
    private int _transparentDrawCount;
    private int _transparentByteOffset;

    // std430 layout: ulong TextureHandle (uvec2) at offset 0, uint TextureLayer
    // at offset 8, uint Flags at offset 12. Total 16 bytes.
    // Pack=8 (not 4) because std430's uvec2 requires 8-byte alignment — Pack=4
    // works today by accident (TextureHandle is the first field, so offset 0 is
    // always 8-byte aligned), but adding a 4-byte field before TextureHandle
    // without bumping Pack would silently misalign the GPU struct.
    [StructLayout(LayoutKind.Sequential, Pack = 8)]
    private struct BatchData
    {
        public ulong TextureHandle; // bindless handle (uvec2 in GLSL)
        public uint TextureLayer;
        public uint Flags;
    }

    // Per-frame scratch — reused across frames to avoid per-frame allocation.
    // NOTE(review): the generic type arguments on the Dictionary/List
    // declarations below (and on _populateScratch) appear to be missing from
    // this copy of the file — restore them from version control.
    private readonly Dictionary _groups = new();
    private readonly List _opaqueDraws = new();
    private readonly List _translucentDraws = new();

    // A.5 T26 follow-up (Bug B): WalkEntities populates this scratch list
    // instead of allocating a fresh List<(WorldEntity, int)> per frame. At
    // ~10K entities × ~3 mesh refs = ~30K tuples × 16 bytes = ~480 KB / frame
    // of GC pressure on the render thread under the original T17 shape.
    private readonly List<(WorldEntity Entity, int MeshRefIndex, uint LandblockId)> _walkScratch = new();

    // Tier 1 cache (#53) — per-entity classification collector. Reused across
    // frames; cleared at flush time when the per-entity loop crosses an entity
    // boundary in _walkScratch (and once more at end-of-loop for the last
    // entity). _walkScratch is in entity-order, so all MeshRefs of one entity
    // are contiguous — accumulate them all before flushing one Populate call.
    // Animated entities skip this scratch entirely (collector = null).
    private readonly List _populateScratch = new();

    // Per-entity-cull AABB radius. Conservative — covers most entities; large
    // outliers (long banners, tall columns) are still landblock-culled.
    private const float PerEntityCullRadius = 5.0f;

    private bool _disposed;

    // Diagnostic counters logged once per ~5s under ACDREAM_WB_DIAG=1.
    private int _entitiesSeen;
    private int _entitiesDrawn;
    private int _meshesMissing;
    private int _drawsIssued;
    private int _instancesIssued;
    private long _lastLogTick;

    // CPU + GPU timing for [WB-DIAG] under ACDREAM_WB_DIAG=1.
    private readonly System.Diagnostics.Stopwatch _cpuStopwatch = new();
    private readonly long[] _cpuSamples = new long[256]; // microseconds
    private int _cpuSampleCursor;

    // GPU timing uses a ring of 3 query-pair slots so the read of frame N-3's
    // result lands when the GPU has finished (~50ms after issue on a typical
    // 60fps frame). Ring of 3 is the vendor-neutral choice: NVIDIA drivers with
    // triple-buffering+vsync can queue ~3 frames ahead, AMD typically 1-2,
    // Intel iGPUs vary. ResultAvailable is the safety guard if the GPU is
    // still working when we try to read.
    private const int GpuQueryRingDepth = 3;
    private readonly uint[] _gpuQueryOpaque = new uint[GpuQueryRingDepth];
    private readonly uint[] _gpuQueryTransparent = new uint[GpuQueryRingDepth];
    private int _gpuQueryFrameIndex;
    private readonly long[] _gpuSamples = new long[256]; // microseconds
    private int _gpuSampleCursor;
    private bool _gpuQueriesInitialized;

    // Constructor accessibility is internal because EntityClassificationCache
    // is internal — a public ctor with an internal-typed parameter would be
    // an inconsistent-accessibility error. The dispatcher is constructed
    // exclusively from GameWindow (same assembly), so internal is fine.
internal WbDrawDispatcher( GL gl, Shader shader, TextureCache textures, WbMeshAdapter meshAdapter, EntitySpawnAdapter entitySpawnAdapter, BindlessSupport bindless, EntityClassificationCache classificationCache) { ArgumentNullException.ThrowIfNull(gl); ArgumentNullException.ThrowIfNull(shader); ArgumentNullException.ThrowIfNull(textures); ArgumentNullException.ThrowIfNull(meshAdapter); ArgumentNullException.ThrowIfNull(entitySpawnAdapter); ArgumentNullException.ThrowIfNull(classificationCache); _gl = gl; _shader = shader; _textures = textures; _meshAdapter = meshAdapter; _entitySpawnAdapter = entitySpawnAdapter; _cache = classificationCache; _bindless = bindless ?? throw new ArgumentNullException(nameof(bindless)); _instanceSsbo = _gl.GenBuffer(); _batchSsbo = _gl.GenBuffer(); _indirectBuffer = _gl.GenBuffer(); } public static Matrix4x4 ComposePartWorldMatrix( Matrix4x4 entityWorld, Matrix4x4 animOverride, Matrix4x4 restPose) => restPose * animOverride * entityWorld; /// /// Entry for per-landblock iteration. /// Mirrors the shape yielded by GpuWorldState.LandblockEntries. /// public readonly record struct LandblockEntry( uint LandblockId, Vector3 AabbMin, Vector3 AabbMax, IReadOnlyList Entities, IReadOnlyDictionary? AnimatedById); /// /// Result of — the list of (entity, meshRef index) /// pairs that passed all visibility filters, plus a diagnostic walk count. /// public struct WalkResult { public int EntitiesWalked; public List<(WorldEntity Entity, int MeshRefIndex, uint LandblockId)> ToDraw; } /// /// Pure-CPU visibility filter over . /// Separated from so tests can exercise it without GL state. /// /// /// A.5 T17 Change #1: when an LB is frustum-culled AND /// is non-empty, the OLD path walked /// every entity in the LB just to find the few animated ones. This helper /// fixes that: if the LB is invisible, we iterate /// directly and look each up in /// entry.AnimatedById (typically <50 animated, up to ~10K total). 
/// /// /// /// A.5 T18 Change #2: per-entity AABB cull reads from the cached /// / /// (refreshed lazily if ), instead of /// recomputing Position±5 each frame. /// /// /// /// Test-friendly overload that allocates a fresh ToDraw list per call. /// Production code () uses the no-alloc overload below /// with a caller-provided scratch list. /// internal static WalkResult WalkEntities( IEnumerable landblockEntries, FrustumPlanes? frustum, uint? neverCullLandblockId, HashSet? visibleCellIds, HashSet? animatedEntityIds) { var scratch = new List<(WorldEntity Entity, int MeshRefIndex, uint LandblockId)>(); var result = new WalkResult { ToDraw = scratch }; WalkEntitiesInto( landblockEntries, frustum, neverCullLandblockId, visibleCellIds, animatedEntityIds, scratch, ref result); return result; } /// /// No-alloc overload: clears + populates the caller-provided /// list. reuses a per-dispatcher scratch field across frames to /// avoid the 480+ KB / frame GC pressure that the test-friendly overload incurs. /// Returns walk count via 's EntitiesWalked field. /// internal static void WalkEntitiesInto( IEnumerable landblockEntries, FrustumPlanes? frustum, uint? neverCullLandblockId, HashSet? visibleCellIds, HashSet? animatedEntityIds, List<(WorldEntity Entity, int MeshRefIndex, uint LandblockId)> scratch, ref WalkResult result) { scratch.Clear(); result.EntitiesWalked = 0; result.ToDraw = scratch; foreach (var entry in landblockEntries) { bool landblockVisible = frustum is null || entry.LandblockId == neverCullLandblockId || FrustumCuller.IsAabbVisible(frustum.Value, entry.AabbMin, entry.AabbMax); if (!landblockVisible) { // A.5 T17 Change #1: walk only animated entities, not all entities. // Avoids O(N_entities) scan when only O(N_animated) work is needed. 
if (animatedEntityIds is null || animatedEntityIds.Count == 0) continue; if (entry.AnimatedById is null) continue; foreach (var animatedId in animatedEntityIds) { if (!entry.AnimatedById.TryGetValue(animatedId, out var entity)) continue; if (entity.MeshRefs.Count == 0) continue; if (entity.ParentCellId.HasValue && visibleCellIds is not null && !visibleCellIds.Contains(entity.ParentCellId.Value)) continue; result.EntitiesWalked++; for (int i = 0; i < entity.MeshRefs.Count; i++) scratch.Add((entity, i, entry.LandblockId)); } continue; } foreach (var entity in entry.Entities) { if (entity.MeshRefs.Count == 0) continue; if (entity.ParentCellId.HasValue && visibleCellIds is not null && !visibleCellIds.Contains(entity.ParentCellId.Value)) continue; // Per-entity AABB frustum cull (perf #3). Animated entities bypass — // they're tracked at landblock level + need per-frame work regardless. // A.5 T18 Change #2: read cached AABB, refresh lazily on AabbDirty. bool isAnimated = animatedEntityIds?.Contains(entity.Id) == true; if (frustum is not null && !isAnimated && entry.LandblockId != neverCullLandblockId) { if (entity.AabbDirty) entity.RefreshAabb(); if (!FrustumCuller.IsAabbVisible(frustum.Value, entity.AabbMin, entity.AabbMax)) continue; } result.EntitiesWalked++; for (int i = 0; i < entity.MeshRefs.Count; i++) scratch.Add((entity, i, entry.LandblockId)); } } } public void Draw( ICamera camera, IEnumerable<(uint LandblockId, Vector3 AabbMin, Vector3 AabbMax, IReadOnlyList Entities, IReadOnlyDictionary? AnimatedById)> landblockEntries, FrustumPlanes? frustum = null, uint? neverCullLandblockId = null, HashSet? visibleCellIds = null, HashSet? 
animatedEntityIds = null) { _shader.Use(); var vp = camera.View * camera.Projection; _shader.SetMatrix4("uViewProjection", vp); bool diag = string.Equals(Environment.GetEnvironmentVariable("ACDREAM_WB_DIAG"), "1", StringComparison.Ordinal); if (diag && !_gpuQueriesInitialized) { for (int i = 0; i < GpuQueryRingDepth; i++) { _gpuQueryOpaque[i] = _gl.GenQuery(); _gpuQueryTransparent[i] = _gl.GenQuery(); } _gpuQueriesInitialized = true; } // Always run the CPU stopwatch — cheap; only logged under diag. _cpuStopwatch.Restart(); // Camera world-space position for front-to-back sort (perf #2). The view // matrix is the inverse of the camera's world transform, so the world // translation lives in the inverse's translation row. Vector3 camPos = Vector3.Zero; if (Matrix4x4.Invert(camera.View, out var invView)) camPos = invView.Translation; // ── Phase 1: clear groups, walk entities, build groups ────────────── foreach (var grp in _groups.Values) grp.Matrices.Clear(); var metaTable = _meshAdapter.MetadataTable; uint anyVao = 0; // Project the 5-tuple enumerable into LandblockEntry records for WalkEntities. static IEnumerable ToEntries( IEnumerable<(uint LandblockId, Vector3 AabbMin, Vector3 AabbMax, IReadOnlyList Entities, IReadOnlyDictionary? AnimatedById)> src) { foreach (var e in src) yield return new LandblockEntry(e.LandblockId, e.AabbMin, e.AabbMax, e.Entities, e.AnimatedById); } // A.5 T26 follow-up (Bug B): use the no-alloc WalkEntitiesInto overload // that populates _walkScratch (a per-dispatcher field reused across frames) // instead of allocating a fresh List<(WorldEntity, int)> per frame. var walkResult = default(WalkResult); WalkEntitiesInto( ToEntries(landblockEntries), frustum, neverCullLandblockId, visibleCellIds, animatedEntityIds, _walkScratch, ref walkResult); // Tier 1 cache (#53) flush-tracking locals. _walkScratch holds one tuple // per (entity, MeshRefIndex) and is in entity-order, so all MeshRefs of // a given entity are contiguous. 
We accumulate ALL of an entity's // batches into _populateScratch, then flush exactly once per entity: // either when the iteration crosses to a different entity, or at the // end of the loop for the last entity. Flushing per-tuple would // overwrite earlier MeshRefs (the cache is keyed by entity.Id), so // multi-part Setup-backed entities would only retain their LAST // MeshRef's batches — bug fixed in commit after 2f489a8. uint? populateEntityId = null; uint populateLandblockId = 0; // Tier 1 cache (#53) — fast-path one-shot tracker. The cache stores a // FLAT list of batches across all MeshRefs of an entity, so a single // ApplyCacheHit call already drew every batch. _walkScratch yields // one tuple per (entity, MeshRefIndex), so without this guard a // 3-MeshRef static entity on a frame-2 cache hit would call // ApplyCacheHit 3 times — appending all 6 batches × 3 = 18 instances // to _groups instead of 6. Result: severe Z-fighting + 3× perf hit // on every multi-part static entity (buildings, statues, multi-MeshRef // NPCs). The fast path must fire only on the FIRST tuple of each // entity; subsequent tuples skip via this tracker. uint? lastHitEntityId = null; // Tier 1 cache (#53) — incomplete-entity guard. When any MeshRef of // the current entity has _meshAdapter.TryGetRenderData return null // (mesh still async-decoding via ObjectMeshManager.PrepareMeshDataAsync), // we mark the entity incomplete and DROP the accumulated populate // scratch at entity boundary instead of writing it to the cache. // Otherwise the cache would hold a partial classification (some parts // missing), and frame-2 cache hits would persist that partial render // even after the missing mesh loads — every subsequent frame sees the // cache hit and skips re-classification, so the missing parts never // recover. User-visible symptom: the drudge statue on top of the // Foundry (multi-part Setup entity with AnimPartChange) renders with // some parts missing permanently. 
Reset on entity change. bool currentEntityIncomplete = false; // Per-tuple entity tracker used purely for entity-change detection. // Updated UNCONDITIONALLY at end of every tuple (including tuples that // skip via null renderData), so the flag-reset block below correctly // distinguishes "new entity" from "same entity, different tuple." // populateEntityId can't be used for this because it's only set after // a successful slow-path classification. uint? prevTupleEntityId = null; foreach (var (entity, partIdx, landblockId) in _walkScratch) { if (diag) _entitiesSeen++; // Skip subsequent tuples of an entity that already cache-hit on // its first tuple. ApplyCacheHit drew the full flat batch list; // re-firing here would N-multiply the instance count. Diag // _entitiesDrawn is bumped here to preserve per-tuple parity with // the previous counting semantics. if (lastHitEntityId == entity.Id) { if (diag) _entitiesDrawn++; continue; } // Reset the hit tracker on entity change so the next entity's // first tuple re-checks the cache. (When this iteration is the // FIRST tuple of a new entity after a cache-hit entity, we must // not retain the previous entity's id.) if (lastHitEntityId.HasValue && lastHitEntityId.Value != entity.Id) { lastHitEntityId = null; } // Tier 1 cache (#53) — drop the previous entity's accumulated // populate scratch BEFORE MaybeFlushOnEntityChange runs. If the // previous entity ended incomplete (≥1 null renderData), we MUST // NOT cache its partial classification: clear scratch and null // the tracker so MaybeFlushOnEntityChange sees the cleaned state // and no-ops for this entity. Reset the incomplete flag for the // new entity so each one gets a fresh measurement. // // CRITICAL: the flag reset must fire ONLY on entity change, not // every tuple. 
Resetting per-tuple within the same entity would // undo a null-renderData flag set by a previous tuple of the same // entity → if the missing MeshRef sits in the MIDDLE of the // entity's MeshRefs list, a later valid tuple's reset would // re-mark the entity "complete" and let partial data populate // the cache. Trees with [trunk valid, branches null, leaves // valid] hit this exactly — branches never recover. bool isNewEntity = !prevTupleEntityId.HasValue || prevTupleEntityId.Value != entity.Id; if (isNewEntity) { if (populateEntityId.HasValue && currentEntityIncomplete) { _populateScratch.Clear(); populateEntityId = null; } currentEntityIncomplete = false; } prevTupleEntityId = entity.Id; // Flush-on-entity-change: if the previous entity accumulated any // batches AND this iteration is for a different entity, populate // its cache entry now and reset the scratch buffer. (populateEntityId, populateLandblockId) = MaybeFlushOnEntityChange( populateEntityId, populateLandblockId, entity.Id, _cache, _populateScratch); var entityWorld = Matrix4x4.CreateFromQuaternion(entity.Rotation) * Matrix4x4.CreateTranslation(entity.Position); bool isAnimated = animatedEntityIds?.Contains(entity.Id) == true; // Cache-hit fast path (Task 10): static entity with a populated // cache entry skips classification entirely. Walk the cached // (GroupKey, RestPose) flat list and append cached.RestPose * // entityWorld to each matching group's matrices. Animated entities // bypass the cache (collector is set null below; their entries are // never populated in the first place). // // Placed AFTER the entity-change flush above so that, on a // hit, this iteration also finishes flushing any pending // populate state from a previous entity. Animated entities never // enter this branch — the !isAnimated guard makes that explicit. // // Fires ONCE per entity: the first tuple reaches here, runs // ApplyCacheHit, sets lastHitEntityId, and continues. 
Subsequent // tuples of the same entity short-circuit at the top of the loop // body via the lastHitEntityId == entity.Id check above. if (!isAnimated && !_tier1CacheDisabled && _cache.TryGet(entity.Id, landblockId, out var cachedEntry)) { ApplyCacheHit(cachedEntry!, entityWorld, AppendInstanceToGroup); // anyVao recovery: when the first visible entity in the frame // takes the fast path, no slow-path lookup has populated // anyVao yet. Look up THIS entity's first MeshRef once via // the mesh adapter — cheap dict lookup, not a re-classify. if (anyVao == 0) { var firstMeshRef = entity.MeshRefs[partIdx]; var firstRenderData = _meshAdapter.TryGetRenderData(firstMeshRef.GfxObjId); if (firstRenderData is not null) anyVao = firstRenderData.VAO; } if (diag) _entitiesDrawn++; lastHitEntityId = entity.Id; #if DEBUG // Cross-check guard: assert the membership predicate held at hit time. // The full re-classification cross-check (spec section 6.5) is a stretch // goal; this simpler assert catches the prior Tier 1 bug class — a // static entity that turns out to actually be animated would fire here. // // Structurally redundant with the `if (!isAnimated && ...)` branch // condition, but serves as a TRIPWIRE: a future refactor that // incorrectly relaxes the branch condition (e.g., removes // `!isAnimated` from the guard) would silently allow animated // entities into the fast path; the assert catches that immediately. System.Diagnostics.Debug.Assert( !isAnimated, $"EntityClassificationCache hit on animated entity {entity.Id} — invariant violated"); #endif continue; } // Compute palette-override hash ONCE per entity (perf #4). // Reused across every (part, batch) lookup so the FNV-1a fold // over SubPalettes runs once instead of N times. Zero when the // entity has no palette override (trees, scenery). 
ulong palHash = 0; if (entity.PaletteOverride is not null) palHash = TextureCache.HashPaletteOverride(entity.PaletteOverride); // Note: GameWindow's spawn path already applies // AnimPartChanges + GfxObjDegradeResolver (Issue #47 fix — // close-detail mesh swap for humanoids) to MeshRefs. We // trust MeshRefs as the source of truth here. AnimatedEntityState's // overrides become relevant only for hot-swap (0xF625 // ObjDescEvent) which today rebuilds MeshRefs anyway. var meshRef = entity.MeshRefs[partIdx]; ulong gfxObjId = meshRef.GfxObjId; var renderData = _meshAdapter.TryGetRenderData(gfxObjId); if (renderData is null) { // Tier 1 cache (#53): mesh data is still async-decoding via // WB's ObjectMeshManager.PrepareMeshDataAsync. Flag the entity // as incomplete so the entity-boundary check (or end-of-loop // check) drops the accumulated populate scratch instead of // caching a partial classification. The slow path retries on // the next frame; once all this entity's meshes have loaded, // the populate fires with the complete batch set. currentEntityIncomplete = true; if (diag) _meshesMissing++; continue; } if (anyVao == 0) anyVao = renderData.VAO; // Cache-miss path (animated entities skip cache entirely). // Static entities accumulate into _populateScratch across ALL // their MeshRefs; the flush at next-entity-boundary (or // end-of-loop) commits them as a single Populate call. var collector = isAnimated ? 
null : _populateScratch; bool drewAny = false; if (renderData.IsSetup && renderData.SetupParts.Count > 0) { foreach (var (partGfxObjId, partTransform) in renderData.SetupParts) { var partData = _meshAdapter.TryGetRenderData(partGfxObjId); if (partData is null) continue; var model = ComposePartWorldMatrix( entityWorld, meshRef.PartTransform, partTransform); var restPose = partTransform * meshRef.PartTransform; ClassifyBatches(partData, partGfxObjId, model, entity, meshRef, palHash, metaTable, restPose, collector); drewAny = true; } } else { var model = meshRef.PartTransform * entityWorld; ClassifyBatches(renderData, gfxObjId, model, entity, meshRef, palHash, metaTable, restPose: meshRef.PartTransform, collector: collector); drewAny = true; } // Track THIS entity for the next iteration's flush check. Only // when collector is non-null (entity is static); animated entities // leave the tracker null so we don't try to flush them. if (collector is not null) { populateEntityId = entity.Id; populateLandblockId = landblockId; } if (diag && drewAny) _entitiesDrawn++; } // Tier 1 cache (#53) — drop the accumulated populate scratch if the // LAST entity in the loop ended incomplete (had ≥1 null renderData). // Same reason as the entity-boundary handling above: avoid caching a // partial classification. The slow path will retry on the next frame // and populate correctly once all meshes have loaded. if (currentEntityIncomplete) { _populateScratch.Clear(); populateEntityId = null; } // Final flush: the last entity in _walkScratch has no "next iteration" // to trigger the entity-change flush, so commit its accumulated batches // here. No-op when the last entity was animated (populateEntityId stays // null) or when no entities walked at all. FinalFlushPopulate(populateEntityId, populateLandblockId, _cache, _populateScratch); // Nothing visible — skip the GL pass entirely. 
if (anyVao == 0) { _cpuStopwatch.Stop(); if (diag) MaybeFlushDiag(); return; } // ── Phase 3: assign FirstInstance per group, lay matrices contiguously, sort opaque ── int totalInstances = 0; foreach (var grp in _groups.Values) totalInstances += grp.Matrices.Count; if (totalInstances == 0) { _cpuStopwatch.Stop(); if (diag) MaybeFlushDiag(); return; } int needed = totalInstances * 16; if (_instanceData.Length < needed) _instanceData = new float[needed + 256 * 16]; _opaqueDraws.Clear(); _translucentDraws.Clear(); int cursor = 0; foreach (var grp in _groups.Values) { if (grp.Matrices.Count == 0) continue; grp.FirstInstance = cursor; grp.InstanceCount = grp.Matrices.Count; // Use the first instance's translation as the group's representative // position for front-to-back sort (perf #2). Cheap heuristic; works // well when instances of one group are spatially coherent // (typical for trees in one landblock area, NPCs at one spawn). var first = grp.Matrices[0]; var grpPos = new Vector3(first.M41, first.M42, first.M43); grp.SortDistance = Vector3.DistanceSquared(camPos, grpPos); for (int i = 0; i < grp.Matrices.Count; i++) { WriteMatrix(_instanceData, cursor * 16, grp.Matrices[i]); cursor++; } if (IsOpaque(grp.Translucency)) _opaqueDraws.Add(grp); else _translucentDraws.Add(grp); } // Front-to-back sort for opaque pass: nearer groups draw first so the // depth test rejects fragments hidden behind them, reducing fragment // shader cost from overdraw on dense scenes (Holtburg courtyard, // Foundry interior). 
_opaqueDraws.Sort(static (a, b) => a.SortDistance.CompareTo(b.SortDistance)); // ── Phase 4: build IndirectGroupInput list (opaque sorted, then translucent), // fill via BuildIndirectArrays ────────────────────────────────── int totalDraws = _opaqueDraws.Count + _translucentDraws.Count; if (_batchData.Length < totalDraws) _batchData = new BatchData[totalDraws + 64]; if (_indirectCommands.Length < totalDraws) _indirectCommands = new DrawElementsIndirectCommand[totalDraws + 64]; var groupInputs = new List(totalDraws); foreach (var g in _opaqueDraws) groupInputs.Add(ToInput(g)); foreach (var g in _translucentDraws) groupInputs.Add(ToInput(g)); // Cast _batchData (private BatchData) to public-mirror BatchDataPublic for BuildIndirectArrays. // Layout is asserted at test time (BatchDataPublic_LayoutMatchesPrivateBatchData test). var batchPublic = new BatchDataPublic[totalDraws]; var layout = BuildIndirectArrays(groupInputs, _indirectCommands, batchPublic); // Copy back into _batchData for (int i = 0; i < totalDraws; i++) { _batchData[i] = new BatchData { TextureHandle = batchPublic[i].TextureHandle, TextureLayer = batchPublic[i].TextureLayer, Flags = batchPublic[i].Flags, }; } _opaqueDrawCount = layout.OpaqueCount; _transparentDrawCount = layout.TransparentCount; _transparentByteOffset = layout.TransparentByteOffset; // ── Phase 5: upload three buffers ─────────────────────────────────── fixed (float* ip = _instanceData) UploadSsbo(_instanceSsbo, 0, ip, totalInstances * 16 * sizeof(float)); fixed (BatchData* bp = _batchData) UploadSsbo(_batchSsbo, 1, bp, totalDraws * sizeof(BatchData)); fixed (DrawElementsIndirectCommand* cp = _indirectCommands) { _gl.BindBuffer(BufferTargetARB.DrawIndirectBuffer, _indirectBuffer); _gl.BufferData(BufferTargetARB.DrawIndirectBuffer, (nuint)(totalDraws * sizeof(DrawElementsIndirectCommand)), cp, BufferUsageARB.DynamicDraw); } // ── Phase 6: bind global VAO once ─────────────────────────────────── _gl.BindVertexArray(anyVao); if 
(string.Equals(Environment.GetEnvironmentVariable("ACDREAM_NO_CULL"), "1", StringComparison.Ordinal)) _gl.Disable(EnableCap.CullFace); // GPU timing: compute this frame's ring slot. We read frame N-3's // result (the oldest data in the ring) before overwriting it with // frame N's queries. Hoisted to function scope so both the opaque // and transparent passes below can reference gpuQuerySlot. See spec // §3 Q1/Q2 + §4 in // docs/superpowers/specs/2026-05-11-phase-n6-slice1-design.md. int gpuQuerySlot = _gpuQueryFrameIndex % GpuQueryRingDepth; // diag is part of the gate so the read/issue/increment trio stays // symmetric — without it, toggling ACDREAM_WB_DIAG mid-session would // freeze the frame counter (gated by diag below) while the read kept // re-reading the same slot, producing duplicate stale samples. if (diag && _gpuQueriesInitialized && _gpuQueryFrameIndex >= GpuQueryRingDepth) { _gl.GetQueryObject(_gpuQueryOpaque[gpuQuerySlot], QueryObjectParameterName.ResultAvailable, out int avail); if (avail != 0) { _gl.GetQueryObject(_gpuQueryOpaque[gpuQuerySlot], QueryObjectParameterName.Result, out ulong opaqueNs); _gl.GetQueryObject(_gpuQueryTransparent[gpuQuerySlot], QueryObjectParameterName.Result, out ulong transNs); long gpuUs = (long)((opaqueNs + transNs) / 1000UL); _gpuSamples[_gpuSampleCursor] = gpuUs; _gpuSampleCursor = (_gpuSampleCursor + 1) % _gpuSamples.Length; } // If avail==0 the sample is dropped silently. MedianMicros // computes over the non-zero subset, so dropped samples don't // poison the median. } // ── Phase 7: opaque pass ───────────────────────────────────────────── if (_opaqueDrawCount > 0) { _gl.Disable(EnableCap.Blend); _gl.DepthMask(true); // A.5 T20: enable A2C for ClipMap foliage — GPU derives sample mask // from the alpha written by mesh_modern.frag so foliage edges are // smooth under MSAA 4x. A no-op for fully-opaque (α=1) batches. 
// A.5 T22.5: gated by AlphaToCoverage property so Low/Medium presets // (no MSAA) skip the unnecessary GL state change. if (AlphaToCoverage) _gl.Enable(EnableCap.SampleAlphaToCoverage); _shader.SetInt("uRenderPass", 0); // Phase Post-A.5 (ISSUE #52, 2026-05-10): opaque section of // Batches[] starts at index 0. See uDrawIDOffset comment in // mesh_modern.vert for why this is needed. _shader.SetInt("uDrawIDOffset", 0); _gl.BindBuffer(BufferTargetARB.DrawIndirectBuffer, _indirectBuffer); if (diag && _gpuQueriesInitialized) _gl.BeginQuery(QueryTarget.TimeElapsed, _gpuQueryOpaque[gpuQuerySlot]); _gl.MultiDrawElementsIndirect( PrimitiveType.Triangles, DrawElementsType.UnsignedShort, (void*)0, (uint)_opaqueDrawCount, (uint)DrawCommandStride); if (diag && _gpuQueriesInitialized) _gl.EndQuery(QueryTarget.TimeElapsed); if (AlphaToCoverage) _gl.Disable(EnableCap.SampleAlphaToCoverage); } // ── Phase 8: transparent pass ──────────────────────────────────────── if (_transparentDrawCount > 0) { _gl.Enable(EnableCap.Blend); _gl.BlendFunc(BlendingFactor.SrcAlpha, BlendingFactor.OneMinusSrcAlpha); _gl.DepthMask(false); // Phase Post-A.5 (ISSUE #52, 2026-05-10): transparent section of // Batches[] starts at index _opaqueDrawCount. Without this offset, // each transparent draw reads BatchData[0..transparentCount) — the // OPAQUE section — and the lifestone crystal's apparent texture // flickers to whatever opaque batch sorted first that frame. See // uDrawIDOffset comment in mesh_modern.vert. _shader.SetInt("uDrawIDOffset", _opaqueDrawCount); // Phase Post-A.5 (ISSUE #52, 2026-05-10): re-establish Phase 9.2's // back-face cull setup. The legacy StaticMeshRenderer had this // (commit 6f1971a, 2026-04-11) until the N.5 retirement amendment // (commit dcae2b6, 2026-05-08) deleted that renderer; the new // WbDrawDispatcher never inherited the cull-face state. 
// // Closed-shell translucent meshes — lifestone crystal, glow gems, // any convex blended mesh — NEED back-face culling in the // translucent pass. Without it, back faces composite OVER front // faces in arbitrary iteration order, because DepthMask(false) // means nothing records depth within the translucent set. The // result is the user-visible "one face missing, see into the // hollow interior" + frame-to-frame color flicker as rotation // shifts the triangle order. // // Our fan triangulation emits pos-side polygons as (0, i, i+1) — // CCW in standard OpenGL conventions — so GL_BACK + CCW-front is // the correct state. Matches WorldBuilder's per-batch CullMode // handling. Neg-side polygons (rare on translucent AC content) // use reversed winding and get culled here, matching the opaque // pass and the original Phase 9.2 fix's known limitation. _gl.Enable(EnableCap.CullFace); _gl.CullFace(TriangleFace.Back); _gl.FrontFace(FrontFaceDirection.Ccw); _shader.SetInt("uRenderPass", 1); if (diag && _gpuQueriesInitialized) _gl.BeginQuery(QueryTarget.TimeElapsed, _gpuQueryTransparent[gpuQuerySlot]); _gl.MultiDrawElementsIndirect( PrimitiveType.Triangles, DrawElementsType.UnsignedShort, (void*)_transparentByteOffset, (uint)_transparentDrawCount, (uint)DrawCommandStride); if (diag && _gpuQueriesInitialized) _gl.EndQuery(QueryTarget.TimeElapsed); _gl.DepthMask(true); _gl.Disable(EnableCap.Blend); } _gl.Disable(EnableCap.CullFace); _gl.BindVertexArray(0); _cpuStopwatch.Stop(); if (diag) { long cpuUs = _cpuStopwatch.ElapsedTicks * 1_000_000L / System.Diagnostics.Stopwatch.Frequency; _cpuSamples[_cpuSampleCursor] = cpuUs; _cpuSampleCursor = (_cpuSampleCursor + 1) % _cpuSamples.Length; // GPU sample read happens BEFORE issuing the next frame's queries // (see step 1.3 above). Increment the frame counter here so the // next call computes a fresh slot. 
// Tail of Draw's diag-only bookkeeping (Draw itself begins above this span);
// the statements below are unchanged.
        if (_gpuQueriesInitialized) _gpuQueryFrameIndex++;

        _drawsIssued += _opaqueDrawCount + _transparentDrawCount;
        _instancesIssued += totalInstances;
        MaybeFlushDiag();
    }
}

/// <summary>
/// Copies the draw parameters of a private <see cref="InstanceGroup"/> into the
/// public <see cref="IndirectGroupInput"/> record, field for field.
/// </summary>
private static IndirectGroupInput ToInput(InstanceGroup g) => new(
    IndexCount: g.IndexCount,
    FirstIndex: g.FirstIndex,
    BaseVertex: g.BaseVertex,
    InstanceCount: g.InstanceCount,
    FirstInstance: g.FirstInstance,
    TextureHandle: g.BindlessTextureHandle,
    TextureLayer: g.TextureLayer,
    Translucency: g.Translucency);

/// <summary>
/// Binds <paramref name="ssbo"/> as the shader-storage buffer, re-specifies its
/// data store with <paramref name="byteCount"/> bytes read from
/// <paramref name="data"/> (DynamicDraw usage), and attaches it to the indexed
/// binding point <paramref name="binding"/>.
/// </summary>
private unsafe void UploadSsbo(uint ssbo, uint binding, void* data, int byteCount)
{
    _gl.BindBuffer(BufferTargetARB.ShaderStorageBuffer, ssbo);
    _gl.BufferData(BufferTargetARB.ShaderStorageBuffer, (nuint)byteCount, data, BufferUsageARB.DynamicDraw);
    _gl.BindBufferBase(BufferTargetARB.ShaderStorageBuffer, binding, ssbo);
}

/// <summary>
/// Emits one <c>[WB-DIAG]</c> console line when more than 5000 ms have passed
/// since the previous one, then zeroes the per-interval counters. The CPU/GPU
/// sample rings are left untouched.
/// </summary>
private void MaybeFlushDiag()
{
    long now = Environment.TickCount64;
    if (now - _lastLogTick > 5000)
    {
        long cpuMed = MedianMicros(_cpuSamples);
        long cpuP95 = Percentile95Micros(_cpuSamples);
        long gpuMed = MedianMicros(_gpuSamples);
        long gpuP95 = Percentile95Micros(_gpuSamples);

        // A.5 T23: flag when entity dispatcher median exceeds 2.0ms budget
        // (Phase A.5 spec §2 acceptance criterion 6). Grep-friendly prefix.
        const long BudgetUs = 2000;
        string budgetFlag = cpuMed > BudgetUs ? " BUDGET_OVER" : "";

        Console.WriteLine(
            $"[WB-DIAG]{budgetFlag} entSeen={_entitiesSeen} entDrawn={_entitiesDrawn} meshMissing={_meshesMissing} drawsIssued={_drawsIssued} instances={_instancesIssued} groups={_groups.Count} " +
            $"cpu_us={cpuMed}m/{cpuP95}p95 gpu_us={gpuMed}m/{gpuP95}p95");

        _entitiesSeen = _entitiesDrawn = _meshesMissing = _drawsIssued = _instancesIssued = 0;
        _lastLogTick = now;
        // Don't reset the sample buffers — they're a moving window of the
        // last 256 frames; clearing per 5s flush would lose recent history.
    }
}

/// <summary>
/// Median of the strictly positive entries of <paramref name="samples"/>;
/// entries that are zero (unused or dropped slots) are ignored.
/// </summary>
/// <param name="samples">Sample ring; not modified (a sorted copy is used).</param>
/// <returns>
/// The median positive sample (the upper of the two middle values when the
/// positive count is even), or 0 when there is no positive sample.
/// </returns>
private static long MedianMicros(long[] samples)
{
    var sorted = (long[])samples.Clone();
    Array.Sort(sorted);

    int nz = 0;
    foreach (var v in sorted)
    {
        if (v > 0) nz++;
    }
    if (nz == 0) return 0;

    // After the ascending sort the positive samples occupy the last nz slots,
    // i.e. [Length - nz, Length). Index into THAT window: the previous
    // "Length - nz / 2" was one slot too high for every odd nz and read past
    // the end of the array when nz == 1.
    return sorted[sorted.Length - nz + nz / 2];
}

/// <summary>
/// Approximate 95th percentile of the strictly positive entries of
/// <paramref name="samples"/>: steps <c>(int)(nz * 0.05)</c> slots down from
/// the largest value, where <c>nz</c> is the positive-entry count.
/// </summary>
/// <returns>The selected sample, or 0 when there is no positive sample.</returns>
private static long Percentile95Micros(long[] samples)
{
    var sorted = (long[])samples.Clone();
    Array.Sort(sorted);

    int nz = 0;
    foreach (var v in sorted)
    {
        if (v > 0) nz++;
    }
    if (nz == 0) return 0;

    int idx = sorted.Length - 1 - (int)(nz * 0.05);
    return sorted[idx];
}

// ── Tier 1 cache (#53) helpers extracted for testability ─────────────────
//
// Three pure-CPU static helpers carved out of Draw's per-entity loop so
// unit tests can exercise the populate/flush algorithm + cache-hit fast
// path without needing a real GL context. Production code (Draw) calls
// these helpers; the dispatcher integration tests in
// WbDrawDispatcherBucketingTests use them to drive the same algorithm
// through deterministic inputs.

/// <summary>
/// Apply a cache hit's batches into the per-frame group dictionary by
/// composing <c>cached.RestPose * entityWorld</c> per batch and routing
/// the result through <paramref name="appendInstance"/>. Taking the append
/// step as a delegate keeps this helper GL-free and unit-testable.
/// </summary>
/// <param name="entry">Cache entry whose <c>Batches</c> are replayed.</param>
/// <param name="entityWorld">World transform of the entity for this frame.</param>
/// <param name="appendInstance">Receives each batch key with its composed matrix.</param>
/// <remarks>
/// Matrix multiplication is non-commutative: it MUST be
/// <c>RestPose * entityWorld</c>, not the reverse.
/// </remarks>
internal static void ApplyCacheHit(
    EntityCacheEntry entry,
    Matrix4x4 entityWorld,
    Action<GroupKey, Matrix4x4> appendInstance)
{
    foreach (var cached in entry.Batches)
    {
        appendInstance(cached.Key, cached.RestPose * entityWorld);
    }
}

/// <summary>
/// Per-tuple flush check. If <paramref name="populateEntityId"/> is set
/// AND differs from <paramref name="currentEntityId"/>, the previous
/// entity's accumulated batches are committed to <paramref name="cache"/>
/// and <paramref name="populateScratch"/> is cleared. Returns the
/// updated tracker tuple — pass these back into the field locals in the
/// caller's loop.
/// </summary>
/// <remarks>
/// This is the bug-fix structure from commit 00fa8ae (per-MeshRef
/// Populate would overwrite earlier MeshRefs because the cache is
/// keyed by entity.Id; flushing only on entity boundary preserves all
/// MeshRefs' batches). _walkScratch is in entity-order so all MeshRefs
/// of one entity arrive contiguously.
/// </remarks>
internal static (uint? PopulateEntityId, uint PopulateLandblockId) MaybeFlushOnEntityChange(
    uint? populateEntityId,
    uint populateLandblockId,
    uint currentEntityId,
    EntityClassificationCache cache,
    List<CachedBatch> populateScratch)
{
    // Nothing pending, or still inside the same entity: hand the tracker
    // back untouched so the caller keeps accumulating.
    if (populateEntityId is not uint pendingId || pendingId == currentEntityId)
    {
        return (populateEntityId, populateLandblockId);
    }

    // Entity boundary crossed: commit whatever the previous entity gathered
    // (an empty scratch is not committed), then reset scratch and tracker.
    if (populateScratch.Count != 0)
    {
        cache.Populate(pendingId, populateLandblockId, populateScratch.ToArray());
    }

    populateScratch.Clear();
    return (null, 0u);
}

/// <summary>
/// End-of-loop final flush. The last entity in _walkScratch has
/// no next iteration to trigger <see cref="MaybeFlushOnEntityChange"/>,
/// so its accumulated batches are committed here. No-op when no populate is
/// pending (no tracked entity id, or the scratch is empty).
/// </summary>
/// <remarks>
/// End-of-loop only — does NOT reset the caller's tracker locals
/// (intentional, since they go out of scope immediately after).
/// </remarks>
internal static void FinalFlushPopulate(
    uint? populateEntityId,
    uint populateLandblockId,
    EntityClassificationCache cache,
    List<CachedBatch> populateScratch)
{
    if (populateEntityId is not uint pendingId || populateScratch.Count == 0)
    {
        return;
    }

    cache.Populate(pendingId, populateLandblockId, populateScratch.ToArray());
    populateScratch.Clear();
}

/// <summary>
/// Instance-side helper whose (key, matrix) signature matches the
/// <c>appendInstance</c> delegate taken by <see cref="ApplyCacheHit"/>.
/// Looks up or creates an <see cref="InstanceGroup"/> for the given key in
/// _groups and appends the per-instance world matrix.
/// </summary>
/// <remarks>
/// The group is created on first use, copying its draw parameters from
/// <paramref name="key"/>; later calls with an equal key only append.
/// </remarks>
private void AppendInstanceToGroup(GroupKey key, Matrix4x4 model)
{
    if (!_groups.TryGetValue(key, out var grp))
    {
        grp = new InstanceGroup
        {
            Ibo = key.Ibo,
            FirstIndex = key.FirstIndex,
            BaseVertex = key.BaseVertex,
            IndexCount = key.IndexCount,
            BindlessTextureHandle = key.BindlessTextureHandle,
            TextureLayer = key.TextureLayer,
            Translucency = key.Translucency,
        };
        _groups[key] = grp;
    }

    grp.Matrices.Add(model);
}

/// <summary>
/// Walks every batch of <paramref name="renderData"/>, resolves its
/// translucency class and bindless texture handle, and appends
/// <paramref name="model"/> to the matching instance group.
/// </summary>
/// <param name="renderData">Mesh data whose <c>Batches</c> are classified.</param>
/// <param name="gfxObjId">Id used together with the batch index to query <paramref name="metaTable"/>.</param>
/// <param name="model">Per-instance matrix appended to each group.</param>
/// <param name="entity">Entity being drawn; supplies the palette override for texture resolution.</param>
/// <param name="meshRef">Mesh reference; supplies per-surface overrides for texture resolution.</param>
/// <param name="palHash">Palette hash forwarded to the palette-override texture lookup.</param>
/// <param name="metaTable">Translucency source; batches it does not know fall back to the batch's own flags.</param>
/// <param name="restPose">Stored verbatim in each <see cref="CachedBatch"/> handed to <paramref name="collector"/>.</param>
/// <param name="collector">When non-null, receives one <see cref="CachedBatch"/> per batch that was drawn.</param>
private void ClassifyBatches(
    ObjectRenderData renderData,
    ulong gfxObjId,
    Matrix4x4 model,
    WorldEntity entity,
    MeshRef meshRef,
    ulong palHash,
    AcSurfaceMetadataTable metaTable,
    Matrix4x4 restPose,
    List<CachedBatch>? collector = null)
{
    for (int batchIdx = 0; batchIdx < renderData.Batches.Count; batchIdx++)
    {
        var batch = renderData.Batches[batchIdx];

        TranslucencyKind translucency;
        if (metaTable.TryLookup(gfxObjId, batchIdx, out var meta))
        {
            translucency = meta.Translucency;
        }
        else
        {
            translucency = batch.IsAdditive ? TranslucencyKind.Additive
                : batch.IsTransparent ? TranslucencyKind.AlphaBlend
                : TranslucencyKind.Opaque;
        }

        // A zero handle means no texture could be resolved; the batch is
        // skipped entirely (neither drawn nor cached).
        ulong texHandle = ResolveTexture(entity, meshRef, batch, palHash);
        if (texHandle == 0) continue;

        // TextureLayer is always 0 for per-instance composites; non-zero when
        // WB atlas is adopted in N.6+ and batches reference a shared atlas layer.
        uint texLayer = 0;

        var key = new GroupKey(
            batch.IBO, batch.FirstIndex, (int)batch.BaseVertex,
            batch.IndexCount, texHandle, texLayer, translucency);

        // Same look-up-or-create path the cache-hit replay uses, so both
        // paths build their groups from the key in exactly one place.
        AppendInstanceToGroup(key, model);

        collector?.Add(new CachedBatch(key, texHandle, restPose));
    }
}

/// <summary>
/// Resolves the bindless texture handle for one batch, honouring the mesh
/// reference's surface override and the entity's palette override.
/// </summary>
/// <returns>
/// The handle returned by the texture cache, or 0 when the batch's surface id
/// is 0 or 0xFFFFFFFF.
/// </returns>
private ulong ResolveTexture(WorldEntity entity, MeshRef meshRef, ObjectRenderBatch batch, ulong palHash)
{
    uint surfaceId = batch.Key.SurfaceId;
    if (surfaceId == 0 || surfaceId == 0xFFFFFFFF) return 0;

    uint overrideOrigTex = 0;
    bool hasOrigTexOverride = meshRef.SurfaceOverrides is not null
        && meshRef.SurfaceOverrides.TryGetValue(surfaceId, out overrideOrigTex);
    uint? origTexOverride = hasOrigTexOverride ? overrideOrigTex : (uint?)null;

    // The palette override takes precedence and carries the optional surface
    // override along with it.
    if (entity.PaletteOverride is not null)
    {
        return _textures.GetOrUploadWithPaletteOverrideBindless(
            surfaceId, origTexOverride, entity.PaletteOverride, palHash);
    }
    else if (hasOrigTexOverride)
    {
        return _textures.GetOrUploadWithOrigTextureOverrideBindless(surfaceId, overrideOrigTex);
    }
    else
    {
        return _textures.GetOrUploadBindless(surfaceId);
    }
}

/// <summary>
/// Writes the 16 components of <paramref name="m"/> into
/// <paramref name="buf"/> starting at <paramref name="offset"/>, in the order
/// M11, M12, … M44.
/// </summary>
private static void WriteMatrix(float[] buf, int offset, in Matrix4x4 m)
{
    buf[offset + 0] = m.M11; buf[offset + 1] = m.M12; buf[offset + 2] = m.M13; buf[offset + 3] = m.M14;
    buf[offset + 4] = m.M21; buf[offset + 5] = m.M22; buf[offset + 6] = m.M23; buf[offset + 7] = m.M24;
    buf[offset + 8] = m.M31; buf[offset + 9] = m.M32; buf[offset + 10] = m.M33; buf[offset + 11] = m.M34;
    buf[offset + 12] = m.M41; buf[offset + 13] = m.M42; buf[offset + 14] = m.M43; buf[offset + 15] = m.M44;
}

/// <summary>
/// Deletes the three GL buffers and, when they were created, every query
/// object in both GPU-timing rings. Calls after the first are no-ops.
/// </summary>
public void Dispose()
{
    if (_disposed) return;
    _disposed = true;

    _gl.DeleteBuffer(_instanceSsbo);
    _gl.DeleteBuffer(_batchSsbo);
    _gl.DeleteBuffer(_indirectBuffer);

    if (_gpuQueriesInitialized)
    {
        for (int i = 0; i < GpuQueryRingDepth; i++)
        {
            _gl.DeleteQuery(_gpuQueryOpaque[i]);
            _gl.DeleteQuery(_gpuQueryTransparent[i]);
        }
    }
}

// ── Public types + helpers for BuildIndirectArrays (Task 9) ─────────────
//
// These are public so the pure-CPU unit tests in AcDream.Core.Tests can
// exercise BuildIndirectArrays without needing a GL context.

/// <summary>
/// Stride in bytes of DrawElementsIndirectCommand in the indirect buffer:
/// five 4-byte fields = 20 bytes. Tests and callers reference this
/// symbolically rather than hard-coding 20, so the value lives in one place.
/// </summary>
public const int DrawCommandStride = 20; // sizeof(DrawElementsIndirectCommand): 5 × 4 bytes

/// <summary>
/// Public view of the per-group inputs to <see cref="BuildIndirectArrays"/> — used in tests.
/// </summary>
public readonly record struct IndirectGroupInput(
    int IndexCount,
    uint FirstIndex,
    int BaseVertex,
    int InstanceCount,
    int FirstInstance,
    ulong TextureHandle,
    uint TextureLayer,
    TranslucencyKind Translucency);

/// <summary>
/// Public mirror of the per-group batch record uploaded to the SSBO.
/// Tests verify the layout. Same field shape as the private BatchData.
/// </summary>
[StructLayout(LayoutKind.Sequential, Pack = 8)]
public struct BatchDataPublic
{
    public ulong TextureHandle;
    public uint TextureLayer;
    public uint Flags;
}

/// <summary>Result of <see cref="BuildIndirectArrays"/>.</summary>
public readonly record struct IndirectLayoutResult(
    int OpaqueCount,
    int TransparentCount,
    int TransparentByteOffset);

/// <summary>
/// Lays out the indirect commands + parallel BatchData array contiguously:
/// opaque section first (caller sorts before calling), transparent section second.
/// Pure CPU, no GL state. Caller passes pre-sized scratch arrays.
/// </summary>
/// <remarks>
/// Classification: Opaque + ClipMap → opaque pass (ClipMap uses discard, not
/// blending). Everything else (AlphaBlend, Additive, InvAlpha) → transparent pass.
/// </remarks>
/// <param name="groups">Groups to lay out; their relative order is kept inside each section.</param>
/// <param name="indirectScratch">Receives one draw command per group.</param>
/// <param name="batchScratch">Receives the batch record for each group, at the same index as its command.</param>
/// <returns>Section sizes plus the byte offset of the first transparent command.</returns>
public static IndirectLayoutResult BuildIndirectArrays(
    IReadOnlyList<IndirectGroupInput> groups,
    DrawElementsIndirectCommand[] indirectScratch,
    BatchDataPublic[] batchScratch)
{
    // Pass 1: size the two sections so the transparent cursor can start
    // immediately after the opaque block.
    int opaqueTotal = 0;
    int transparentTotal = 0;
    for (int i = 0; i < groups.Count; i++)
    {
        if (IsOpaque(groups[i].Translucency))
        {
            opaqueTotal++;
        }
        else
        {
            transparentTotal++;
        }
    }

    // Pass 2: each group goes to the next free slot of its own section.
    int opaqueCursor = 0;                // fills [0..opaqueTotal)
    int transparentCursor = opaqueTotal; // fills [opaqueTotal..end)
    for (int i = 0; i < groups.Count; i++)
    {
        var group = groups[i];

        var command = new DrawElementsIndirectCommand
        {
            Count = (uint)group.IndexCount,
            InstanceCount = (uint)group.InstanceCount,
            FirstIndex = group.FirstIndex,
            BaseVertex = group.BaseVertex,
            BaseInstance = (uint)group.FirstInstance,
        };
        var batchData = new BatchDataPublic
        {
            TextureHandle = group.TextureHandle,
            TextureLayer = group.TextureLayer,
            Flags = 0,
        };

        int slot = IsOpaque(group.Translucency) ? opaqueCursor++ : transparentCursor++;
        indirectScratch[slot] = command;
        batchScratch[slot] = batchData;
    }

    int transparentByteOffset = opaqueTotal * DrawCommandStride;
    return new IndirectLayoutResult(opaqueTotal, transparentTotal, transparentByteOffset);
}

/// <summary>
/// Public test shim for <see cref="IsOpaque"/>. Locks in the N.5 Decision 2
/// translucency partition: Opaque + ClipMap → opaque indirect; AlphaBlend +
/// Additive + InvAlpha → transparent indirect.
/// </summary>
public static bool IsOpaquePublic(TranslucencyKind t)
{
    return IsOpaque(t);
}

/// <summary>
/// True for the two kinds drawn in the opaque pass (Opaque and ClipMap);
/// false for every other kind.
/// </summary>
private static bool IsOpaque(TranslucencyKind t) =>
    t is TranslucencyKind.Opaque or TranslucencyKind.ClipMap;

// ────────────────────────────────────────────────────────────────────────

/// <summary>
/// One bucket of instances that share mesh range, texture and translucency.
/// </summary>
private sealed class InstanceGroup
{
    // Mesh range inside the shared buffers.
    public uint Ibo;
    public uint FirstIndex;
    public int BaseVertex;
    public int IndexCount;

    // Texture binding.
    public ulong BindlessTextureHandle; // 64-bit (was uint TextureHandle in N.4)
    public uint TextureLayer;           // 0 for per-instance composites; non-zero when WB atlas is adopted in N.6+

    public TranslucencyKind Translucency;

    // Per-frame layout.
    public int FirstInstance;           // offset into the shared per-instance matrix buffer (in instances, not bytes)
    public int InstanceCount;
    public float SortDistance;          // squared distance from camera to first instance, for opaque sort

    // Per-instance matrices gathered for this group.
    public readonly List<Matrix4x4> Matrices = new();
}
} // end of WbDrawDispatcher