User reported (cache enabled, post-c55acdc): drudge statue renders fully
but many trees are missing branches. Cache-disabled A/B run rendered trees
correctly. So the bug is in the cache wiring.
Root cause: c55acdc's `currentEntityIncomplete = false;` reset fired
UNCONDITIONALLY at the top of every iteration. For a tree with MeshRefs
[trunk valid, branches null, leaves valid], the tuple sequence is:
- tuple 0 (trunk): no flag set
- tuple 1 (branches): TryGetRenderData null → set flag, continue
- tuple 2 (leaves): unconditional reset → flag = false (WRONG)
- end-of-entity: flag is false, scratch has trunk+leaves batches but NOT
branches → MaybeFlushOnEntityChange populates a PARTIAL cache entry
- cache hits forever serve trunk+leaves with no branches
Drudge happened to render correctly because its missing MeshRef was at the
END of its MeshRefs list — no later tuple reset the flag.
Adds a per-tuple `prevTupleEntityId` tracker for entity-change detection,
updated UNCONDITIONALLY at end of each tuple (including tuples that skip
via null renderData). The flag-reset block now fires ONLY on actual entity
change. Within the same entity, the flag accumulates across tuples.
Also includes ACDREAM_DISABLE_TIER1_CACHE=1 diagnostic env-var added
inline (was stashed previously) for future A/B testing.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1276 lines
58 KiB
C#
1276 lines
58 KiB
C#
using System;
|
||
using System.Collections.Generic;
|
||
using System.Numerics;
|
||
using System.Runtime.InteropServices;
|
||
using AcDream.Core.Meshing;
|
||
using AcDream.Core.Terrain;
|
||
using AcDream.Core.World;
|
||
using Chorizite.OpenGLSDLBackend.Lib;
|
||
using Silk.NET.OpenGL;
|
||
|
||
namespace AcDream.App.Rendering.Wb;
|
||
|
||
/// <summary>
|
||
/// Draws entities using WB's <see cref="ObjectRenderData"/> (a single global
|
||
/// VAO/VBO/IBO under modern rendering) with acdream's <see cref="TextureCache"/>
|
||
/// for bindless texture resolution and <see cref="AcSurfaceMetadataTable"/> for
|
||
/// translucency classification.
|
||
///
|
||
/// <para>
|
||
/// <b>Atlas-tier</b> entities (<c>ServerGuid == 0</c>): mesh data comes from WB's
|
||
/// <see cref="ObjectMeshManager"/> via <see cref="WbMeshAdapter.TryGetRenderData"/>.
|
||
/// Textures resolve through the bindless-suffixed
|
||
/// <see cref="TextureCache.GetOrUploadBindless"/> variants, returning 64-bit
|
||
/// resident handles stored in the per-group SSBO.
|
||
/// </para>
|
||
///
|
||
/// <para>
|
||
/// <b>Per-instance-tier</b> entities (<c>ServerGuid != 0</c>): mesh data also from
|
||
/// WB, but textures resolve through
|
||
/// <see cref="TextureCache.GetOrUploadWithPaletteOverrideBindless"/> with palette
|
||
/// and surface overrides applied. <see cref="AnimatedEntityState"/> is currently
|
||
/// unused at draw time — GameWindow's spawn path already bakes AnimPartChanges +
|
||
/// GfxObjDegradeResolver (Issue #47 close-detail mesh) into <c>MeshRefs</c>.
|
||
/// </para>
|
||
///
|
||
/// <para>
|
||
/// <b>GL strategy (N.5 — mandatory):</b> <c>glMultiDrawElementsIndirect</c> with SSBOs
|
||
/// and <c>GL_ARB_bindless_texture</c> + <c>GL_ARB_shader_draw_parameters</c>.
|
||
/// All visible (entity, batch) pairs are bucketed by <see cref="GroupKey"/>;
|
||
/// each group becomes one <c>DrawElementsIndirectCommand</c>. Three GPU buffers
|
||
/// are uploaded per frame: instance matrices (SSBO binding 0), per-group batch
|
||
/// metadata/texture handles (SSBO binding 1), and the indirect draw commands.
|
||
/// Two <c>glMultiDrawElementsIndirect</c> calls cover the opaque and transparent
|
||
/// passes respectively — one GL call per pass regardless of group count.
|
||
/// </para>
|
||
///
|
||
/// <para>
|
||
/// <b>Shader:</b> <c>mesh_modern</c> (bindless + <c>gl_DrawIDARB</c> /
|
||
/// <c>gl_BaseInstanceARB</c>). Missing bindless/draw-parameters throws
|
||
/// <see cref="NotSupportedException"/> at startup — there is no legacy fallback.
|
||
/// </para>
|
||
///
|
||
/// <para>
|
||
/// <b>Modern rendering assumption:</b> WB's <c>_useModernRendering</c> path (GL
|
||
/// 4.3 + bindless) puts every mesh in a single shared VAO/VBO/IBO and uses
|
||
/// <c>FirstIndex</c> + <c>BaseVertex</c> per batch. The dispatcher honors those
|
||
/// offsets inside each <c>DrawElementsIndirectCommand</c> via
|
||
/// <c>glMultiDrawElementsIndirect</c>.
|
||
/// </para>
|
||
/// </summary>
|
||
public sealed unsafe class WbDrawDispatcher : IDisposable
|
||
{
|
||
private readonly GL _gl;
|
||
private readonly Shader _shader;
|
||
private readonly TextureCache _textures;
|
||
private readonly WbMeshAdapter _meshAdapter;
|
||
private readonly EntitySpawnAdapter _entitySpawnAdapter;
|
||
|
||
private readonly BindlessSupport _bindless;
|
||
|
||
// Tier 1 cache (#53): per-entity classification results for static
|
||
// entities (those NOT in GameWindow._animatedEntities). Wired here in
|
||
// Task 7 for plumbing only — Tasks 9-10 wire the per-entity
|
||
// miss-populate / hit-fast-path through the loop.
|
||
private readonly EntityClassificationCache _cache;
|
||
|
||
// ACDREAM_DISABLE_TIER1_CACHE=1 A/B diagnostic — forces every static
|
||
// entity through the slow path. Read once in ctor.
|
||
private readonly bool _tier1CacheDisabled =
|
||
string.Equals(Environment.GetEnvironmentVariable("ACDREAM_DISABLE_TIER1_CACHE"), "1", StringComparison.Ordinal);
|
||
|
||
/// <summary>
|
||
/// A.5 T22.5: gate for GL_SAMPLE_ALPHA_TO_COVERAGE around the opaque pass.
|
||
/// Default true matches T20 behavior. Set false for Low/Medium presets that
|
||
/// have MsaaSamples=0 (A2C is a no-op without MSAA, but turning it off
|
||
/// avoids the unnecessary GL state thrash and is cleaner diagnostics).
|
||
/// Can be toggled mid-session via <see cref="GameWindow.ReapplyQualityPreset"/>.
|
||
/// </summary>
|
||
public bool AlphaToCoverage { get; set; } = true;
|
||
|
||
// SSBO buffer ids
|
||
private uint _instanceSsbo;
|
||
private uint _batchSsbo;
|
||
private uint _indirectBuffer;
|
||
|
||
// Per-frame scratch arrays — Tasks 9-10 fully wire these.
|
||
private float[] _instanceData = new float[256 * 16]; // mat4 floats per instance
|
||
private BatchData[] _batchData = new BatchData[256];
|
||
private DrawElementsIndirectCommand[] _indirectCommands = new DrawElementsIndirectCommand[256];
|
||
|
||
private int _opaqueDrawCount;
|
||
private int _transparentDrawCount;
|
||
private int _transparentByteOffset;
|
||
|
||
// CPU mirror of the per-draw record read by the shader from the batch SSBO.
// std430 layout, 16 bytes total:
//   offset  0 : TextureHandle (ulong here, uvec2 in GLSL)
//   offset  8 : TextureLayer  (uint)
//   offset 12 : Flags         (uint)
// Pack is 8 rather than 4 because std430 aligns a uvec2 to 8 bytes. Pack=4
// would yield the same layout today, but only by accident: TextureHandle is
// the first field, so it sits at offset 0 either way. Inserting a 4-byte
// field ahead of it without Pack=8 would silently misalign the GPU struct.
[StructLayout(LayoutKind.Sequential, Pack = 8)]
private struct BatchData
{
    public ulong TextureHandle; // bindless handle (uvec2 in GLSL)
    public uint TextureLayer;
    public uint Flags;
}
|
||
|
||
// Per-frame scratch — reused across frames to avoid per-frame allocation.
|
||
private readonly Dictionary<GroupKey, InstanceGroup> _groups = new();
|
||
private readonly List<InstanceGroup> _opaqueDraws = new();
|
||
private readonly List<InstanceGroup> _translucentDraws = new();
|
||
// A.5 T26 follow-up (Bug B): WalkEntities populates this scratch list
|
||
// instead of allocating a fresh List<(WorldEntity, int)> per frame. At
|
||
// ~10K entities × ~3 mesh refs = ~30K tuples × 16 bytes = ~480 KB / frame
|
||
// of GC pressure on the render thread under the original T17 shape.
|
||
private readonly List<(WorldEntity Entity, int MeshRefIndex, uint LandblockId)> _walkScratch = new();
|
||
|
||
// Tier 1 cache (#53) — per-entity classification collector. Reused across
|
||
// frames; cleared at flush time when the per-entity loop crosses an entity
|
||
// boundary in _walkScratch (and once more at end-of-loop for the last
|
||
// entity). _walkScratch is in entity-order, so all MeshRefs of one entity
|
||
// are contiguous — accumulate them all before flushing one Populate call.
|
||
// Animated entities skip this scratch entirely (collector = null).
|
||
private readonly List<CachedBatch> _populateScratch = new();
|
||
|
||
// Per-entity-cull AABB radius. Conservative — covers most entities; large
|
||
// outliers (long banners, tall columns) are still landblock-culled.
|
||
private const float PerEntityCullRadius = 5.0f;
|
||
|
||
private bool _disposed;
|
||
|
||
// Diagnostic counters logged once per ~5s under ACDREAM_WB_DIAG=1.
|
||
private int _entitiesSeen;
|
||
private int _entitiesDrawn;
|
||
private int _meshesMissing;
|
||
private int _drawsIssued;
|
||
private int _instancesIssued;
|
||
private long _lastLogTick;
|
||
|
||
// CPU + GPU timing for [WB-DIAG] under ACDREAM_WB_DIAG=1.
|
||
private readonly System.Diagnostics.Stopwatch _cpuStopwatch = new();
|
||
private readonly long[] _cpuSamples = new long[256]; // microseconds
|
||
private int _cpuSampleCursor;
|
||
private uint _gpuQueryOpaque;
|
||
private uint _gpuQueryTransparent;
|
||
private readonly long[] _gpuSamples = new long[256]; // microseconds
|
||
private int _gpuSampleCursor;
|
||
private bool _gpuQueriesInitialized;
|
||
|
||
// Internal, not public: the signature takes an EntityClassificationCache,
// which is an internal type, and a public constructor exposing it would be
// an inconsistent-accessibility error. GameWindow (same assembly) is the
// only code that constructs a dispatcher, so internal is sufficient.
//
// Every reference argument is null-checked before any field is assigned or
// any GL buffer is generated. Guards run in the order
// gl, shader, textures, meshAdapter, entitySpawnAdapter,
// classificationCache, bindless — so the first null argument in that order
// is the one reported by the ArgumentNullException.
internal WbDrawDispatcher(
    GL gl,
    Shader shader,
    TextureCache textures,
    WbMeshAdapter meshAdapter,
    EntitySpawnAdapter entitySpawnAdapter,
    BindlessSupport bindless,
    EntityClassificationCache classificationCache)
{
    ArgumentNullException.ThrowIfNull(gl);
    ArgumentNullException.ThrowIfNull(shader);
    ArgumentNullException.ThrowIfNull(textures);
    ArgumentNullException.ThrowIfNull(meshAdapter);
    ArgumentNullException.ThrowIfNull(entitySpawnAdapter);
    ArgumentNullException.ThrowIfNull(classificationCache);
    ArgumentNullException.ThrowIfNull(bindless);

    _gl = gl;
    _shader = shader;
    _textures = textures;
    _meshAdapter = meshAdapter;
    _entitySpawnAdapter = entitySpawnAdapter;
    _bindless = bindless;
    _cache = classificationCache;

    // One GL buffer object each for the instance matrices, the per-group
    // batch records, and the indirect draw commands.
    _instanceSsbo = gl.GenBuffer();
    _batchSsbo = gl.GenBuffer();
    _indirectBuffer = gl.GenBuffer();
}
|
||
|
||
/// <summary>
/// Composes the world matrix of a single part as
/// <c>restPose * animOverride * entityWorld</c>, evaluated left to right.
/// </summary>
/// <param name="entityWorld">Right-most factor: the entity's world transform.</param>
/// <param name="animOverride">Middle factor, applied after <paramref name="restPose"/>.</param>
/// <param name="restPose">Left-most factor: the part's rest-pose transform.</param>
/// <returns>The product of the three matrices in the order given above.</returns>
public static Matrix4x4 ComposePartWorldMatrix(
    Matrix4x4 entityWorld,
    Matrix4x4 animOverride,
    Matrix4x4 restPose)
{
    // Keep the (restPose * animOverride) grouping: matrix multiplication is
    // not commutative, and regrouping could change float rounding.
    Matrix4x4 partLocal = restPose * animOverride;
    return partLocal * entityWorld;
}
|
||
|
||
/// <summary>
/// One landblock's worth of input for the <see cref="WalkEntities"/>
/// per-landblock iteration. Mirrors the shape yielded by
/// <c>GpuWorldState.LandblockEntries</c>.
/// </summary>
/// <param name="LandblockId">
/// Landblock id; compared against the never-cull id and copied into every
/// tuple emitted for this landblock's entities.
/// </param>
/// <param name="AabbMin">Minimum corner of the landblock AABB used for the landblock-level frustum test.</param>
/// <param name="AabbMax">Maximum corner of the landblock AABB used for the landblock-level frustum test.</param>
/// <param name="Entities">All entities of the landblock; iterated when the landblock is visible.</param>
/// <param name="AnimatedById">
/// Optional id → entity lookup; consulted instead of <paramref name="Entities"/>
/// when the landblock itself is frustum-culled. May be <c>null</c>.
/// </param>
public readonly record struct LandblockEntry(
    uint LandblockId,
    Vector3 AabbMin,
    Vector3 AabbMax,
    IReadOnlyList<WorldEntity> Entities,
    IReadOnlyDictionary<uint, WorldEntity>? AnimatedById);
|
||
|
||
/// <summary>
/// Output of <see cref="WalkEntities"/> / <see cref="WalkEntitiesInto"/>:
/// the tuples that survived every visibility filter plus a diagnostic count.
/// </summary>
public struct WalkResult
{
    /// <summary>
    /// Number of entities that passed all filters. Incremented once per
    /// entity, not once per MeshRef.
    /// </summary>
    public int EntitiesWalked;

    /// <summary>
    /// One (entity, MeshRef index, landblock id) tuple for each MeshRef of
    /// each surviving entity, in walk order.
    /// </summary>
    public List<(WorldEntity Entity, int MeshRefIndex, uint LandblockId)> ToDraw;
}
|
||
|
||
/// <summary>
/// Pure-CPU visibility filter over <paramref name="landblockEntries"/>,
/// separated from <see cref="Draw"/> so tests can exercise it without GL state.
/// This is the test-friendly overload: it allocates a fresh <c>ToDraw</c> list
/// on every call. Production code (<see cref="Draw"/>) uses the no-alloc
/// <see cref="WalkEntitiesInto"/> with a caller-provided scratch list.
/// </summary>
/// <remarks>
/// <para>
/// A.5 T17 Change #1: when a landblock is frustum-culled AND
/// <paramref name="animatedEntityIds"/> is non-empty, the OLD path walked
/// every entity in the landblock just to find the few animated ones. Now,
/// if the landblock is invisible, <paramref name="animatedEntityIds"/> is
/// iterated directly and each id is looked up in <c>entry.AnimatedById</c>
/// (typically fewer than 50 animated, up to ~10K total).
/// </para>
/// <para>
/// A.5 T18 Change #2: the per-entity AABB cull reads the cached
/// <see cref="WorldEntity.AabbMin"/>/<see cref="WorldEntity.AabbMax"/>
/// (refreshed lazily if <see cref="WorldEntity.AabbDirty"/>), instead of
/// recomputing Position±5 each frame.
/// </para>
/// </remarks>
/// <param name="landblockEntries">Landblocks to walk, in the order their tuples should be emitted.</param>
/// <param name="frustum">Culling planes, or <c>null</c> to disable all frustum tests.</param>
/// <param name="neverCullLandblockId">Landblock exempt from landblock-level and per-entity frustum culling, if any.</param>
/// <param name="visibleCellIds">
/// When non-null, entities that have a <c>ParentCellId</c> not contained in this set are dropped.
/// </param>
/// <param name="animatedEntityIds">Ids of animated entities; these bypass the per-entity AABB cull.</param>
/// <returns>The surviving tuples and the number of entities that produced them.</returns>
internal static WalkResult WalkEntities(
    IEnumerable<LandblockEntry> landblockEntries,
    FrustumPlanes? frustum,
    uint? neverCullLandblockId,
    HashSet<uint>? visibleCellIds,
    HashSet<uint>? animatedEntityIds)
{
    // Fresh list per call: the caller owns the returned ToDraw outright.
    List<(WorldEntity Entity, int MeshRefIndex, uint LandblockId)> toDraw = new();
    WalkResult walk = new() { ToDraw = toDraw };

    WalkEntitiesInto(
        landblockEntries,
        frustum,
        neverCullLandblockId,
        visibleCellIds,
        animatedEntityIds,
        scratch: toDraw,
        result: ref walk);

    return walk;
}
|
||
|
||
/// <summary>
/// No-alloc overload: clears and repopulates the caller-provided
/// <paramref name="scratch"/> list. <see cref="Draw"/> passes a per-dispatcher
/// list that is reused across frames, so no list is allocated per frame.
/// </summary>
/// <param name="landblockEntries">Landblocks to walk, in the order their tuples should be emitted.</param>
/// <param name="frustum">Culling planes, or <c>null</c> to disable all frustum tests.</param>
/// <param name="neverCullLandblockId">Landblock exempt from landblock-level and per-entity frustum culling, if any.</param>
/// <param name="visibleCellIds">
/// When non-null, entities that have a <c>ParentCellId</c> not contained in this set are dropped.
/// </param>
/// <param name="animatedEntityIds">
/// Ids of animated entities. They bypass the per-entity AABB cull, and they are the
/// only entities considered inside a frustum-culled landblock.
/// </param>
/// <param name="scratch">Destination list; emptied first, then filled with one tuple per MeshRef.</param>
/// <param name="result">
/// Receives <paramref name="scratch"/> as <c>ToDraw</c> and the number of surviving
/// entities as <c>EntitiesWalked</c>.
/// </param>
internal static void WalkEntitiesInto(
    IEnumerable<LandblockEntry> landblockEntries,
    FrustumPlanes? frustum,
    uint? neverCullLandblockId,
    HashSet<uint>? visibleCellIds,
    HashSet<uint>? animatedEntityIds,
    List<(WorldEntity Entity, int MeshRefIndex, uint LandblockId)> scratch,
    ref WalkResult result)
{
    scratch.Clear();
    result.ToDraw = scratch;
    result.EntitiesWalked = 0;

    foreach (LandblockEntry lb in landblockEntries)
    {
        // Non-null only when this landblock is subject to frustum tests:
        // a frustum was supplied AND the landblock is not the never-cull one.
        FrustumPlanes? cullPlanes = lb.LandblockId == neverCullLandblockId ? null : frustum;

        if (cullPlanes is { } lbPlanes
            && !FrustumCuller.IsAabbVisible(lbPlanes, lb.AabbMin, lb.AabbMax))
        {
            // A.5 T17 Change #1: the landblock is off-screen, so only animated
            // entities matter. Walk the animated id set and look each id up,
            // rather than scanning every entity of the landblock.
            var animatedById = lb.AnimatedById;
            if (animatedEntityIds is null || animatedEntityIds.Count == 0 || animatedById is null)
            {
                continue;
            }

            foreach (uint id in animatedEntityIds)
            {
                if (!animatedById.TryGetValue(id, out var animated))
                {
                    continue;
                }

                if (animated.MeshRefs.Count == 0 || IsInHiddenCell(animated, visibleCellIds))
                {
                    continue;
                }

                result.EntitiesWalked++;
                AppendMeshRefs(scratch, animated, lb.LandblockId);
            }

            continue;
        }

        foreach (var candidate in lb.Entities)
        {
            if (candidate.MeshRefs.Count == 0 || IsInHiddenCell(candidate, visibleCellIds))
            {
                continue;
            }

            // Per-entity AABB frustum cull (perf #3). Animated entities bypass it —
            // they're tracked at landblock level + need per-frame work regardless.
            // A.5 T18 Change #2: read the cached AABB, refreshing lazily on AabbDirty.
            if (cullPlanes is { } planes && animatedEntityIds?.Contains(candidate.Id) != true)
            {
                if (candidate.AabbDirty)
                {
                    candidate.RefreshAabb();
                }

                if (!FrustumCuller.IsAabbVisible(planes, candidate.AabbMin, candidate.AabbMax))
                {
                    continue;
                }
            }

            result.EntitiesWalked++;
            AppendMeshRefs(scratch, candidate, lb.LandblockId);
        }
    }

    // True when the entity belongs to a cell and a visible-cell filter is
    // active that does not list that cell.
    static bool IsInHiddenCell(WorldEntity e, HashSet<uint>? cells)
        => e.ParentCellId is { } cellId && cells is not null && !cells.Contains(cellId);

    // Emits one (entity, MeshRef index, landblock id) tuple per MeshRef.
    static void AppendMeshRefs(
        List<(WorldEntity Entity, int MeshRefIndex, uint LandblockId)> sink,
        WorldEntity e,
        uint landblockId)
    {
        for (int part = 0; part < e.MeshRefs.Count; part++)
        {
            sink.Add((e, part, landblockId));
        }
    }
}
|
||
|
||
public void Draw(
|
||
ICamera camera,
|
||
IEnumerable<(uint LandblockId, Vector3 AabbMin, Vector3 AabbMax,
|
||
IReadOnlyList<WorldEntity> Entities,
|
||
IReadOnlyDictionary<uint, WorldEntity>? AnimatedById)> landblockEntries,
|
||
FrustumPlanes? frustum = null,
|
||
uint? neverCullLandblockId = null,
|
||
HashSet<uint>? visibleCellIds = null,
|
||
HashSet<uint>? animatedEntityIds = null)
|
||
{
|
||
_shader.Use();
|
||
var vp = camera.View * camera.Projection;
|
||
_shader.SetMatrix4("uViewProjection", vp);
|
||
|
||
bool diag = string.Equals(Environment.GetEnvironmentVariable("ACDREAM_WB_DIAG"), "1", StringComparison.Ordinal);
|
||
|
||
if (diag && !_gpuQueriesInitialized)
|
||
{
|
||
_gpuQueryOpaque = _gl.GenQuery();
|
||
_gpuQueryTransparent = _gl.GenQuery();
|
||
_gpuQueriesInitialized = true;
|
||
}
|
||
|
||
// Always run the CPU stopwatch — cheap; only logged under diag.
|
||
_cpuStopwatch.Restart();
|
||
|
||
// Camera world-space position for front-to-back sort (perf #2). The view
|
||
// matrix is the inverse of the camera's world transform, so the world
|
||
// translation lives in the inverse's translation row.
|
||
Vector3 camPos = Vector3.Zero;
|
||
if (Matrix4x4.Invert(camera.View, out var invView))
|
||
camPos = invView.Translation;
|
||
|
||
// ── Phase 1: clear groups, walk entities, build groups ──────────────
|
||
foreach (var grp in _groups.Values) grp.Matrices.Clear();
|
||
|
||
var metaTable = _meshAdapter.MetadataTable;
|
||
uint anyVao = 0;
|
||
|
||
// Project the 5-tuple enumerable into LandblockEntry records for WalkEntities.
|
||
static IEnumerable<LandblockEntry> ToEntries(
|
||
IEnumerable<(uint LandblockId, Vector3 AabbMin, Vector3 AabbMax,
|
||
IReadOnlyList<WorldEntity> Entities,
|
||
IReadOnlyDictionary<uint, WorldEntity>? AnimatedById)> src)
|
||
{
|
||
foreach (var e in src)
|
||
yield return new LandblockEntry(e.LandblockId, e.AabbMin, e.AabbMax, e.Entities, e.AnimatedById);
|
||
}
|
||
|
||
// A.5 T26 follow-up (Bug B): use the no-alloc WalkEntitiesInto overload
|
||
// that populates _walkScratch (a per-dispatcher field reused across frames)
|
||
// instead of allocating a fresh List<(WorldEntity, int)> per frame.
|
||
var walkResult = default(WalkResult);
|
||
WalkEntitiesInto(
|
||
ToEntries(landblockEntries),
|
||
frustum,
|
||
neverCullLandblockId,
|
||
visibleCellIds,
|
||
animatedEntityIds,
|
||
_walkScratch,
|
||
ref walkResult);
|
||
|
||
// Tier 1 cache (#53) flush-tracking locals. _walkScratch holds one tuple
|
||
// per (entity, MeshRefIndex) and is in entity-order, so all MeshRefs of
|
||
// a given entity are contiguous. We accumulate ALL of an entity's
|
||
// batches into _populateScratch, then flush exactly once per entity:
|
||
// either when the iteration crosses to a different entity, or at the
|
||
// end of the loop for the last entity. Flushing per-tuple would
|
||
// overwrite earlier MeshRefs (the cache is keyed by entity.Id), so
|
||
// multi-part Setup-backed entities would only retain their LAST
|
||
// MeshRef's batches — bug fixed in commit after 2f489a8.
|
||
uint? populateEntityId = null;
|
||
uint populateLandblockId = 0;
|
||
|
||
// Tier 1 cache (#53) — fast-path one-shot tracker. The cache stores a
|
||
// FLAT list of batches across all MeshRefs of an entity, so a single
|
||
// ApplyCacheHit call already drew every batch. _walkScratch yields
|
||
// one tuple per (entity, MeshRefIndex), so without this guard a
|
||
// 3-MeshRef static entity on a frame-2 cache hit would call
|
||
// ApplyCacheHit 3 times — appending all 6 batches × 3 = 18 instances
|
||
// to _groups instead of 6. Result: severe Z-fighting + 3× perf hit
|
||
// on every multi-part static entity (buildings, statues, multi-MeshRef
|
||
// NPCs). The fast path must fire only on the FIRST tuple of each
|
||
// entity; subsequent tuples skip via this tracker.
|
||
uint? lastHitEntityId = null;
|
||
|
||
// Tier 1 cache (#53) — incomplete-entity guard. When any MeshRef of
|
||
// the current entity has _meshAdapter.TryGetRenderData return null
|
||
// (mesh still async-decoding via ObjectMeshManager.PrepareMeshDataAsync),
|
||
// we mark the entity incomplete and DROP the accumulated populate
|
||
// scratch at entity boundary instead of writing it to the cache.
|
||
// Otherwise the cache would hold a partial classification (some parts
|
||
// missing), and frame-2 cache hits would persist that partial render
|
||
// even after the missing mesh loads — every subsequent frame sees the
|
||
// cache hit and skips re-classification, so the missing parts never
|
||
// recover. User-visible symptom: the drudge statue on top of the
|
||
// Foundry (multi-part Setup entity with AnimPartChange) renders with
|
||
// some parts missing permanently. Reset on entity change.
|
||
bool currentEntityIncomplete = false;
|
||
|
||
// Per-tuple entity tracker used purely for entity-change detection.
|
||
// Updated UNCONDITIONALLY at end of every tuple (including tuples that
|
||
// skip via null renderData), so the flag-reset block below correctly
|
||
// distinguishes "new entity" from "same entity, different tuple."
|
||
// populateEntityId can't be used for this because it's only set after
|
||
// a successful slow-path classification.
|
||
uint? prevTupleEntityId = null;
|
||
|
||
foreach (var (entity, partIdx, landblockId) in _walkScratch)
|
||
{
|
||
if (diag) _entitiesSeen++;
|
||
|
||
// Skip subsequent tuples of an entity that already cache-hit on
|
||
// its first tuple. ApplyCacheHit drew the full flat batch list;
|
||
// re-firing here would N-multiply the instance count. Diag
|
||
// _entitiesDrawn is bumped here to preserve per-tuple parity with
|
||
// the previous counting semantics.
|
||
if (lastHitEntityId == entity.Id)
|
||
{
|
||
if (diag) _entitiesDrawn++;
|
||
continue;
|
||
}
|
||
|
||
// Reset the hit tracker on entity change so the next entity's
|
||
// first tuple re-checks the cache. (When this iteration is the
|
||
// FIRST tuple of a new entity after a cache-hit entity, we must
|
||
// not retain the previous entity's id.)
|
||
if (lastHitEntityId.HasValue && lastHitEntityId.Value != entity.Id)
|
||
{
|
||
lastHitEntityId = null;
|
||
}
|
||
|
||
// Tier 1 cache (#53) — drop the previous entity's accumulated
|
||
// populate scratch BEFORE MaybeFlushOnEntityChange runs. If the
|
||
// previous entity ended incomplete (≥1 null renderData), we MUST
|
||
// NOT cache its partial classification: clear scratch and null
|
||
// the tracker so MaybeFlushOnEntityChange sees the cleaned state
|
||
// and no-ops for this entity. Reset the incomplete flag for the
|
||
// new entity so each one gets a fresh measurement.
|
||
//
|
||
// CRITICAL: the flag reset must fire ONLY on entity change, not
|
||
// every tuple. Resetting per-tuple within the same entity would
|
||
// undo a null-renderData flag set by a previous tuple of the same
|
||
// entity → if the missing MeshRef sits in the MIDDLE of the
|
||
// entity's MeshRefs list, a later valid tuple's reset would
|
||
// re-mark the entity "complete" and let partial data populate
|
||
// the cache. Trees with [trunk valid, branches null, leaves
|
||
// valid] hit this exactly — branches never recover.
|
||
bool isNewEntity = !prevTupleEntityId.HasValue || prevTupleEntityId.Value != entity.Id;
|
||
if (isNewEntity)
|
||
{
|
||
if (populateEntityId.HasValue && currentEntityIncomplete)
|
||
{
|
||
_populateScratch.Clear();
|
||
populateEntityId = null;
|
||
}
|
||
currentEntityIncomplete = false;
|
||
}
|
||
prevTupleEntityId = entity.Id;
|
||
|
||
// Flush-on-entity-change: if the previous entity accumulated any
|
||
// batches AND this iteration is for a different entity, populate
|
||
// its cache entry now and reset the scratch buffer.
|
||
(populateEntityId, populateLandblockId) = MaybeFlushOnEntityChange(
|
||
populateEntityId, populateLandblockId, entity.Id, _cache, _populateScratch);
|
||
|
||
var entityWorld =
|
||
Matrix4x4.CreateFromQuaternion(entity.Rotation) *
|
||
Matrix4x4.CreateTranslation(entity.Position);
|
||
|
||
bool isAnimated = animatedEntityIds?.Contains(entity.Id) == true;
|
||
|
||
// Cache-hit fast path (Task 10): static entity with a populated
|
||
// cache entry skips classification entirely. Walk the cached
|
||
// (GroupKey, RestPose) flat list and append cached.RestPose *
|
||
// entityWorld to each matching group's matrices. Animated entities
|
||
// bypass the cache (collector is set null below; their entries are
|
||
// never populated in the first place).
|
||
//
|
||
// Placed AFTER the entity-change flush above so that, on a
|
||
// hit, this iteration also finishes flushing any pending
|
||
// populate state from a previous entity. Animated entities never
|
||
// enter this branch — the !isAnimated guard makes that explicit.
|
||
//
|
||
// Fires ONCE per entity: the first tuple reaches here, runs
|
||
// ApplyCacheHit, sets lastHitEntityId, and continues. Subsequent
|
||
// tuples of the same entity short-circuit at the top of the loop
|
||
// body via the lastHitEntityId == entity.Id check above.
|
||
if (!isAnimated && !_tier1CacheDisabled && _cache.TryGet(entity.Id, landblockId, out var cachedEntry))
|
||
{
|
||
ApplyCacheHit(cachedEntry!, entityWorld, AppendInstanceToGroup);
|
||
|
||
// anyVao recovery: when the first visible entity in the frame
|
||
// takes the fast path, no slow-path lookup has populated
|
||
// anyVao yet. Look up THIS entity's first MeshRef once via
|
||
// the mesh adapter — cheap dict lookup, not a re-classify.
|
||
if (anyVao == 0)
|
||
{
|
||
var firstMeshRef = entity.MeshRefs[partIdx];
|
||
var firstRenderData = _meshAdapter.TryGetRenderData(firstMeshRef.GfxObjId);
|
||
if (firstRenderData is not null) anyVao = firstRenderData.VAO;
|
||
}
|
||
|
||
if (diag) _entitiesDrawn++;
|
||
lastHitEntityId = entity.Id;
|
||
|
||
#if DEBUG
|
||
// Cross-check guard: assert the membership predicate held at hit time.
|
||
// The full re-classification cross-check (spec section 6.5) is a stretch
|
||
// goal; this simpler assert catches the prior Tier 1 bug class — a
|
||
// static entity that turns out to actually be animated would fire here.
|
||
//
|
||
// Structurally redundant with the `if (!isAnimated && ...)` branch
|
||
// condition, but serves as a TRIPWIRE: a future refactor that
|
||
// incorrectly relaxes the branch condition (e.g., removes
|
||
// `!isAnimated` from the guard) would silently allow animated
|
||
// entities into the fast path; the assert catches that immediately.
|
||
System.Diagnostics.Debug.Assert(
|
||
!isAnimated,
|
||
$"EntityClassificationCache hit on animated entity {entity.Id} — invariant violated");
|
||
#endif
|
||
|
||
continue;
|
||
}
|
||
|
||
// Compute palette-override hash ONCE per entity (perf #4).
|
||
// Reused across every (part, batch) lookup so the FNV-1a fold
|
||
// over SubPalettes runs once instead of N times. Zero when the
|
||
// entity has no palette override (trees, scenery).
|
||
ulong palHash = 0;
|
||
if (entity.PaletteOverride is not null)
|
||
palHash = TextureCache.HashPaletteOverride(entity.PaletteOverride);
|
||
|
||
// Note: GameWindow's spawn path already applies
|
||
// AnimPartChanges + GfxObjDegradeResolver (Issue #47 fix —
|
||
// close-detail mesh swap for humanoids) to MeshRefs. We
|
||
// trust MeshRefs as the source of truth here. AnimatedEntityState's
|
||
// overrides become relevant only for hot-swap (0xF625
|
||
// ObjDescEvent) which today rebuilds MeshRefs anyway.
|
||
var meshRef = entity.MeshRefs[partIdx];
|
||
ulong gfxObjId = meshRef.GfxObjId;
|
||
|
||
var renderData = _meshAdapter.TryGetRenderData(gfxObjId);
|
||
if (renderData is null)
|
||
{
|
||
// Tier 1 cache (#53): mesh data is still async-decoding via
|
||
// WB's ObjectMeshManager.PrepareMeshDataAsync. Flag the entity
|
||
// as incomplete so the entity-boundary check (or end-of-loop
|
||
// check) drops the accumulated populate scratch instead of
|
||
// caching a partial classification. The slow path retries on
|
||
// the next frame; once all this entity's meshes have loaded,
|
||
// the populate fires with the complete batch set.
|
||
currentEntityIncomplete = true;
|
||
if (diag) _meshesMissing++;
|
||
continue;
|
||
}
|
||
if (anyVao == 0) anyVao = renderData.VAO;
|
||
|
||
// Cache-miss path (animated entities skip cache entirely).
|
||
// Static entities accumulate into _populateScratch across ALL
|
||
// their MeshRefs; the flush at next-entity-boundary (or
|
||
// end-of-loop) commits them as a single Populate call.
|
||
var collector = isAnimated ? null : _populateScratch;
|
||
|
||
bool drewAny = false;
|
||
if (renderData.IsSetup && renderData.SetupParts.Count > 0)
|
||
{
|
||
foreach (var (partGfxObjId, partTransform) in renderData.SetupParts)
|
||
{
|
||
var partData = _meshAdapter.TryGetRenderData(partGfxObjId);
|
||
if (partData is null) continue;
|
||
|
||
var model = ComposePartWorldMatrix(
|
||
entityWorld, meshRef.PartTransform, partTransform);
|
||
|
||
var restPose = partTransform * meshRef.PartTransform;
|
||
ClassifyBatches(partData, partGfxObjId, model, entity, meshRef, palHash, metaTable, restPose, collector);
|
||
drewAny = true;
|
||
}
|
||
}
|
||
else
|
||
{
|
||
var model = meshRef.PartTransform * entityWorld;
|
||
ClassifyBatches(renderData, gfxObjId, model, entity, meshRef, palHash, metaTable, restPose: meshRef.PartTransform, collector: collector);
|
||
drewAny = true;
|
||
}
|
||
|
||
// Track THIS entity for the next iteration's flush check. Only
|
||
// when collector is non-null (entity is static); animated entities
|
||
// leave the tracker null so we don't try to flush them.
|
||
if (collector is not null)
|
||
{
|
||
populateEntityId = entity.Id;
|
||
populateLandblockId = landblockId;
|
||
}
|
||
|
||
if (diag && drewAny) _entitiesDrawn++;
|
||
}
|
||
|
||
// Tier 1 cache (#53) — drop the accumulated populate scratch if the
|
||
// LAST entity in the loop ended incomplete (had ≥1 null renderData).
|
||
// Same reason as the entity-boundary handling above: avoid caching a
|
||
// partial classification. The slow path will retry on the next frame
|
||
// and populate correctly once all meshes have loaded.
|
||
if (currentEntityIncomplete)
|
||
{
|
||
_populateScratch.Clear();
|
||
populateEntityId = null;
|
||
}
|
||
|
||
// Final flush: the last entity in _walkScratch has no "next iteration"
|
||
// to trigger the entity-change flush, so commit its accumulated batches
|
||
// here. No-op when the last entity was animated (populateEntityId stays
|
||
// null) or when no entities walked at all.
|
||
FinalFlushPopulate(populateEntityId, populateLandblockId, _cache, _populateScratch);
|
||
|
||
// Nothing visible — skip the GL pass entirely.
|
||
if (anyVao == 0)
|
||
{
|
||
_cpuStopwatch.Stop();
|
||
if (diag) MaybeFlushDiag();
|
||
return;
|
||
}
|
||
|
||
// ── Phase 3: assign FirstInstance per group, lay matrices contiguously, sort opaque ──
|
||
int totalInstances = 0;
|
||
foreach (var grp in _groups.Values) totalInstances += grp.Matrices.Count;
|
||
if (totalInstances == 0)
|
||
{
|
||
_cpuStopwatch.Stop();
|
||
if (diag) MaybeFlushDiag();
|
||
return;
|
||
}
|
||
|
||
int needed = totalInstances * 16;
|
||
if (_instanceData.Length < needed)
|
||
_instanceData = new float[needed + 256 * 16];
|
||
|
||
_opaqueDraws.Clear();
|
||
_translucentDraws.Clear();
|
||
|
||
int cursor = 0;
|
||
foreach (var grp in _groups.Values)
|
||
{
|
||
if (grp.Matrices.Count == 0) continue;
|
||
|
||
grp.FirstInstance = cursor;
|
||
grp.InstanceCount = grp.Matrices.Count;
|
||
|
||
// Use the first instance's translation as the group's representative
|
||
// position for front-to-back sort (perf #2). Cheap heuristic; works
|
||
// well when instances of one group are spatially coherent
|
||
// (typical for trees in one landblock area, NPCs at one spawn).
|
||
var first = grp.Matrices[0];
|
||
var grpPos = new Vector3(first.M41, first.M42, first.M43);
|
||
grp.SortDistance = Vector3.DistanceSquared(camPos, grpPos);
|
||
|
||
for (int i = 0; i < grp.Matrices.Count; i++)
|
||
{
|
||
WriteMatrix(_instanceData, cursor * 16, grp.Matrices[i]);
|
||
cursor++;
|
||
}
|
||
|
||
if (IsOpaque(grp.Translucency))
|
||
_opaqueDraws.Add(grp);
|
||
else
|
||
_translucentDraws.Add(grp);
|
||
}
|
||
|
||
// Front-to-back sort for opaque pass: nearer groups draw first so the
|
||
// depth test rejects fragments hidden behind them, reducing fragment
|
||
// shader cost from overdraw on dense scenes (Holtburg courtyard,
|
||
// Foundry interior).
|
||
_opaqueDraws.Sort(static (a, b) => a.SortDistance.CompareTo(b.SortDistance));
|
||
|
||
// ── Phase 4: build IndirectGroupInput list (opaque sorted, then translucent),
|
||
// fill via BuildIndirectArrays ──────────────────────────────────
|
||
int totalDraws = _opaqueDraws.Count + _translucentDraws.Count;
|
||
if (_batchData.Length < totalDraws)
|
||
_batchData = new BatchData[totalDraws + 64];
|
||
if (_indirectCommands.Length < totalDraws)
|
||
_indirectCommands = new DrawElementsIndirectCommand[totalDraws + 64];
|
||
|
||
var groupInputs = new List<IndirectGroupInput>(totalDraws);
|
||
foreach (var g in _opaqueDraws) groupInputs.Add(ToInput(g));
|
||
foreach (var g in _translucentDraws) groupInputs.Add(ToInput(g));
|
||
|
||
// Cast _batchData (private BatchData) to public-mirror BatchDataPublic for BuildIndirectArrays.
|
||
// Layout is asserted at test time (BatchDataPublic_LayoutMatchesPrivateBatchData test).
|
||
var batchPublic = new BatchDataPublic[totalDraws];
|
||
var layout = BuildIndirectArrays(groupInputs, _indirectCommands, batchPublic);
|
||
|
||
// Copy back into _batchData
|
||
for (int i = 0; i < totalDraws; i++)
|
||
{
|
||
_batchData[i] = new BatchData
|
||
{
|
||
TextureHandle = batchPublic[i].TextureHandle,
|
||
TextureLayer = batchPublic[i].TextureLayer,
|
||
Flags = batchPublic[i].Flags,
|
||
};
|
||
}
|
||
_opaqueDrawCount = layout.OpaqueCount;
|
||
_transparentDrawCount = layout.TransparentCount;
|
||
_transparentByteOffset = layout.TransparentByteOffset;
|
||
|
||
// ── Phase 5: upload three buffers ───────────────────────────────────
|
||
fixed (float* ip = _instanceData)
|
||
UploadSsbo(_instanceSsbo, 0, ip, totalInstances * 16 * sizeof(float));
|
||
|
||
fixed (BatchData* bp = _batchData)
|
||
UploadSsbo(_batchSsbo, 1, bp, totalDraws * sizeof(BatchData));
|
||
|
||
fixed (DrawElementsIndirectCommand* cp = _indirectCommands)
|
||
{
|
||
_gl.BindBuffer(BufferTargetARB.DrawIndirectBuffer, _indirectBuffer);
|
||
_gl.BufferData(BufferTargetARB.DrawIndirectBuffer,
|
||
(nuint)(totalDraws * sizeof(DrawElementsIndirectCommand)), cp, BufferUsageARB.DynamicDraw);
|
||
}
|
||
|
||
// ── Phase 6: bind global VAO once ───────────────────────────────────
|
||
_gl.BindVertexArray(anyVao);
|
||
|
||
if (string.Equals(Environment.GetEnvironmentVariable("ACDREAM_NO_CULL"), "1", StringComparison.Ordinal))
|
||
_gl.Disable(EnableCap.CullFace);
|
||
|
||
// ── Phase 7: opaque pass ─────────────────────────────────────────────
|
||
if (_opaqueDrawCount > 0)
|
||
{
|
||
_gl.Disable(EnableCap.Blend);
|
||
_gl.DepthMask(true);
|
||
// A.5 T20: enable A2C for ClipMap foliage — GPU derives sample mask
|
||
// from the alpha written by mesh_modern.frag so foliage edges are
|
||
// smooth under MSAA 4x. A no-op for fully-opaque (α=1) batches.
|
||
// A.5 T22.5: gated by AlphaToCoverage property so Low/Medium presets
|
||
// (no MSAA) skip the unnecessary GL state change.
|
||
if (AlphaToCoverage) _gl.Enable(EnableCap.SampleAlphaToCoverage);
|
||
_shader.SetInt("uRenderPass", 0);
|
||
// Phase Post-A.5 (ISSUE #52, 2026-05-10): opaque section of
|
||
// Batches[] starts at index 0. See uDrawIDOffset comment in
|
||
// mesh_modern.vert for why this is needed.
|
||
_shader.SetInt("uDrawIDOffset", 0);
|
||
_gl.BindBuffer(BufferTargetARB.DrawIndirectBuffer, _indirectBuffer);
|
||
if (diag && _gpuQueriesInitialized) _gl.BeginQuery(QueryTarget.TimeElapsed, _gpuQueryOpaque);
|
||
_gl.MultiDrawElementsIndirect(
|
||
PrimitiveType.Triangles,
|
||
DrawElementsType.UnsignedShort,
|
||
(void*)0,
|
||
(uint)_opaqueDrawCount,
|
||
(uint)DrawCommandStride);
|
||
if (diag && _gpuQueriesInitialized) _gl.EndQuery(QueryTarget.TimeElapsed);
|
||
if (AlphaToCoverage) _gl.Disable(EnableCap.SampleAlphaToCoverage);
|
||
}
|
||
|
||
// ── Phase 8: transparent pass ────────────────────────────────────────
|
||
if (_transparentDrawCount > 0)
|
||
{
|
||
_gl.Enable(EnableCap.Blend);
|
||
_gl.BlendFunc(BlendingFactor.SrcAlpha, BlendingFactor.OneMinusSrcAlpha);
|
||
_gl.DepthMask(false);
|
||
// Phase Post-A.5 (ISSUE #52, 2026-05-10): transparent section of
|
||
// Batches[] starts at index _opaqueDrawCount. Without this offset,
|
||
// each transparent draw reads BatchData[0..transparentCount) — the
|
||
// OPAQUE section — and the lifestone crystal's apparent texture
|
||
// flickers to whatever opaque batch sorted first that frame. See
|
||
// uDrawIDOffset comment in mesh_modern.vert.
|
||
_shader.SetInt("uDrawIDOffset", _opaqueDrawCount);
|
||
// Phase Post-A.5 (ISSUE #52, 2026-05-10): re-establish Phase 9.2's
|
||
// back-face cull setup. The legacy StaticMeshRenderer had this
|
||
// (commit 6f1971a, 2026-04-11) until the N.5 retirement amendment
|
||
// (commit dcae2b6, 2026-05-08) deleted that renderer; the new
|
||
// WbDrawDispatcher never inherited the cull-face state.
|
||
//
|
||
// Closed-shell translucent meshes — lifestone crystal, glow gems,
|
||
// any convex blended mesh — NEED back-face culling in the
|
||
// translucent pass. Without it, back faces composite OVER front
|
||
// faces in arbitrary iteration order, because DepthMask(false)
|
||
// means nothing records depth within the translucent set. The
|
||
// result is the user-visible "one face missing, see into the
|
||
// hollow interior" + frame-to-frame color flicker as rotation
|
||
// shifts the triangle order.
|
||
//
|
||
// Our fan triangulation emits pos-side polygons as (0, i, i+1) —
|
||
// CCW in standard OpenGL conventions — so GL_BACK + CCW-front is
|
||
// the correct state. Matches WorldBuilder's per-batch CullMode
|
||
// handling. Neg-side polygons (rare on translucent AC content)
|
||
// use reversed winding and get culled here, matching the opaque
|
||
// pass and the original Phase 9.2 fix's known limitation.
|
||
_gl.Enable(EnableCap.CullFace);
|
||
_gl.CullFace(TriangleFace.Back);
|
||
_gl.FrontFace(FrontFaceDirection.Ccw);
|
||
_shader.SetInt("uRenderPass", 1);
|
||
if (diag && _gpuQueriesInitialized) _gl.BeginQuery(QueryTarget.TimeElapsed, _gpuQueryTransparent);
|
||
_gl.MultiDrawElementsIndirect(
|
||
PrimitiveType.Triangles,
|
||
DrawElementsType.UnsignedShort,
|
||
(void*)_transparentByteOffset,
|
||
(uint)_transparentDrawCount,
|
||
(uint)DrawCommandStride);
|
||
if (diag && _gpuQueriesInitialized) _gl.EndQuery(QueryTarget.TimeElapsed);
|
||
_gl.DepthMask(true);
|
||
_gl.Disable(EnableCap.Blend);
|
||
}
|
||
|
||
_gl.Disable(EnableCap.CullFace);
|
||
_gl.BindVertexArray(0);
|
||
|
||
_cpuStopwatch.Stop();
|
||
|
||
if (diag)
|
||
{
|
||
long cpuUs = _cpuStopwatch.ElapsedTicks * 1_000_000L / System.Diagnostics.Stopwatch.Frequency;
|
||
_cpuSamples[_cpuSampleCursor] = cpuUs;
|
||
_cpuSampleCursor = (_cpuSampleCursor + 1) % _cpuSamples.Length;
|
||
|
||
// Read GPU samples non-blocking; the result for the previous frame's
|
||
// queries should be ready by now. If not, drop the sample (don't stall
|
||
// the CPU waiting for the GPU).
|
||
if (_gpuQueriesInitialized)
|
||
{
|
||
_gl.GetQueryObject(_gpuQueryOpaque, QueryObjectParameterName.ResultAvailable, out int avail);
|
||
if (avail != 0)
|
||
{
|
||
_gl.GetQueryObject(_gpuQueryOpaque, QueryObjectParameterName.Result, out ulong opaqueNs);
|
||
_gl.GetQueryObject(_gpuQueryTransparent, QueryObjectParameterName.Result, out ulong transNs);
|
||
long gpuUs = (long)((opaqueNs + transNs) / 1000UL);
|
||
_gpuSamples[_gpuSampleCursor] = gpuUs;
|
||
_gpuSampleCursor = (_gpuSampleCursor + 1) % _gpuSamples.Length;
|
||
}
|
||
}
|
||
|
||
_drawsIssued += _opaqueDrawCount + _transparentDrawCount;
|
||
_instancesIssued += totalInstances;
|
||
MaybeFlushDiag();
|
||
}
|
||
}
|
||
|
||
/// <summary>
/// Projects a private <see cref="InstanceGroup"/> onto the public
/// <see cref="IndirectGroupInput"/> record consumed by
/// <see cref="BuildIndirectArrays"/>. Pure field copy; nothing is mutated.
/// </summary>
/// <param name="g">Group whose draw parameters are copied.</param>
/// <returns>A value-type snapshot of the group's draw parameters.</returns>
private static IndirectGroupInput ToInput(InstanceGroup g)
{
    // Positional arguments follow the IndirectGroupInput declaration order.
    return new IndirectGroupInput(
        g.IndexCount,
        g.FirstIndex,
        g.BaseVertex,
        g.InstanceCount,
        g.FirstInstance,
        g.BindlessTextureHandle,
        g.TextureLayer,
        g.Translucency);
}
|
||
|
||
/// <summary>
/// Uploads <paramref name="byteCount"/> bytes from <paramref name="data"/>
/// into <paramref name="ssbo"/> and attaches that buffer to the indexed
/// shader-storage binding point <paramref name="binding"/>.
/// </summary>
/// <param name="ssbo">GL buffer name to fill.</param>
/// <param name="binding">Shader-storage binding index the buffer is bound to.</param>
/// <param name="data">Pointer to the source bytes (caller keeps it pinned).</param>
/// <param name="byteCount">Number of bytes to upload.</param>
private unsafe void UploadSsbo(uint ssbo, uint binding, void* data, int byteCount)
{
    const BufferTargetARB Target = BufferTargetARB.ShaderStorageBuffer;
    nuint sizeInBytes = (nuint)byteCount;

    // BufferData re-specifies the whole data store on every call.
    _gl.BindBuffer(Target, ssbo);
    _gl.BufferData(Target, sizeInBytes, data, BufferUsageARB.DynamicDraw);
    _gl.BindBufferBase(Target, binding, ssbo);
}
|
||
|
||
/// <summary>
/// Emits one <c>[WB-DIAG]</c> console line when more than five seconds of
/// <see cref="Environment.TickCount64"/> time have passed since the last
/// one, then zeroes the accumulated counters. Does nothing otherwise.
/// </summary>
private void MaybeFlushDiag()
{
    const long FlushIntervalMs = 5000;

    // A.5 T23: flag when entity dispatcher median exceeds 2.0ms budget
    // (Phase A.5 spec §2 acceptance criterion 6). Grep-friendly prefix.
    const long BudgetUs = 2000;

    long nowMs = Environment.TickCount64;
    if (nowMs - _lastLogTick <= FlushIntervalMs)
    {
        return;
    }

    long cpuMedian = MedianMicros(_cpuSamples);
    long cpuP95 = Percentile95Micros(_cpuSamples);
    long gpuMedian = MedianMicros(_gpuSamples);
    long gpuP95 = Percentile95Micros(_gpuSamples);

    string budgetFlag = cpuMedian > BudgetUs ? " BUDGET_OVER" : "";

    string counters =
        $"[WB-DIAG]{budgetFlag} entSeen={_entitiesSeen} entDrawn={_entitiesDrawn} meshMissing={_meshesMissing} drawsIssued={_drawsIssued} instances={_instancesIssued} groups={_groups.Count} ";
    string timings =
        $"cpu_us={cpuMedian}m/{cpuP95}p95 gpu_us={gpuMedian}m/{gpuP95}p95";
    Console.WriteLine(counters + timings);

    // Counters restart for the next reporting window.
    _instancesIssued = 0;
    _drawsIssued = 0;
    _meshesMissing = 0;
    _entitiesDrawn = 0;
    _entitiesSeen = 0;
    _lastLogTick = nowMs;

    // The sample buffers are deliberately NOT reset: they act as a moving
    // window over recent frames, and clearing them on every flush would
    // throw away recent history.
}
|
||
|
||
/// <summary>
/// Median of the strictly positive entries of <paramref name="samples"/>.
/// Entries that are zero (unused slots of the sample buffer) are ignored.
/// </summary>
/// <param name="samples">Sample buffer; it is copied, never modified.</param>
/// <returns>
/// The median positive sample (the upper of the two middle values when the
/// count is even), or 0 when there are no positive samples.
/// </returns>
private static long MedianMicros(long[] samples)
{
    var copy = (long[])samples.Clone();
    Array.Sort(copy);

    int nz = 0;
    foreach (var v in copy)
    {
        if (v > 0) nz++;
    }

    if (nz == 0) return 0;

    // After an ascending sort the positive samples occupy the last nz slots,
    // i.e. indices [Length - nz, Length - 1]. Their median sits at
    // (Length - nz) + nz / 2 == Length - (nz + 1) / 2.
    // The previous index, Length - nz / 2, ran one past the end of the
    // array when nz == 1 and selected the element above the median for
    // every other odd nz.
    return copy[copy.Length - (nz + 1) / 2];
}
|
||
|
||
/// <summary>
/// Approximate 95th percentile of the strictly positive entries of
/// <paramref name="samples"/>. Zero entries are ignored.
/// </summary>
/// <param name="samples">Sample buffer; it is copied, never modified.</param>
/// <returns>
/// The value found after stepping <c>floor(5% of the positive count)</c>
/// entries down from the largest sample, or 0 when no entry is positive.
/// </returns>
private static long Percentile95Micros(long[] samples)
{
    long[] sorted = new long[samples.Length];
    Array.Copy(samples, sorted, samples.Length);
    Array.Sort(sorted);

    int positiveCount = 0;
    for (int i = 0; i < sorted.Length; i++)
    {
        if (sorted[i] > 0)
        {
            positiveCount++;
        }
    }

    if (positiveCount == 0)
    {
        return 0;
    }

    // Positive samples sit at the tail of the ascending array; skip the top 5%.
    int skipFromTop = (int)(positiveCount * 0.05);
    int lastIndex = sorted.Length - 1;
    return sorted[lastIndex - skipFromTop];
}
|
||
|
||
// ── Tier 1 cache (#53) helpers extracted for testability ─────────────────
|
||
//
|
||
// Three pure-CPU static helpers carved out of Draw's per-entity loop so
|
||
// unit tests can exercise the populate/flush algorithm + cache-hit fast
|
||
// path without needing a real GL context. Production code (Draw) calls
|
||
// these helpers; the dispatcher integration tests in
|
||
// WbDrawDispatcherBucketingTests use them to drive the same algorithm
|
||
// through deterministic inputs.
|
||
|
||
/// <summary>
/// Replays a cached entity classification: for every batch stored in
/// <paramref name="entry"/>, computes <c>RestPose * entityWorld</c> and
/// hands the batch key plus that matrix to <paramref name="appendInstance"/>.
/// Taking a delegate instead of touching <see cref="InstanceGroup"/>
/// directly keeps this helper GL-free and unit-testable.
/// </summary>
/// <param name="entry">Cache entry whose batches are replayed.</param>
/// <param name="entityWorld">World transform of the entity for this frame.</param>
/// <param name="appendInstance">Sink receiving one (key, matrix) pair per batch.</param>
/// <remarks>
/// Operand order matters because matrix multiplication is not commutative:
/// the product MUST be <c>RestPose * entityWorld</c>. See
/// <see cref="ComposePartWorldMatrix"/> for the full part-world product.
/// </remarks>
internal static void ApplyCacheHit(
    EntityCacheEntry entry,
    Matrix4x4 entityWorld,
    Action<GroupKey, Matrix4x4> appendInstance)
{
    foreach (var batch in entry.Batches)
    {
        var groupKey = batch.Key;
        Matrix4x4 instanceWorld = batch.RestPose * entityWorld;
        appendInstance(groupKey, instanceWorld);
    }
}
|
||
|
||
/// <summary>
/// Entity-boundary check run once per tuple. When a populate is pending
/// (<paramref name="populateEntityId"/> has a value) and that id differs
/// from <paramref name="currentEntityId"/>, the batches accumulated for the
/// previous entity are written to <paramref name="cache"/> (only if there
/// are any) and <paramref name="populateScratch"/> is emptied.
/// </summary>
/// <returns>
/// The tracker values the caller should store back into its locals:
/// <c>(null, 0)</c> after a flush, otherwise the inputs unchanged.
/// </returns>
/// <remarks>
/// This is the structure introduced by the fix in commit 00fa8ae: the cache
/// is keyed by entity id, so populating per MeshRef would let a later
/// MeshRef overwrite earlier ones. Committing only at the entity boundary
/// keeps every MeshRef's batches. It relies on all tuples of one entity
/// arriving contiguously (_walkScratch is in entity order).
/// </remarks>
internal static (uint? PopulateEntityId, uint PopulateLandblockId)
    MaybeFlushOnEntityChange(
        uint? populateEntityId,
        uint populateLandblockId,
        uint currentEntityId,
        EntityClassificationCache cache,
        List<CachedBatch> populateScratch)
{
    // Nothing pending, or still inside the same entity: leave trackers alone.
    if (populateEntityId is not uint pendingId || pendingId == currentEntityId)
    {
        return (populateEntityId, populateLandblockId);
    }

    if (populateScratch.Count > 0)
    {
        CachedBatch[] snapshot = populateScratch.ToArray();
        cache.Populate(pendingId, populateLandblockId, snapshot);
    }

    populateScratch.Clear();
    return (null, 0u);
}
|
||
|
||
/// <summary>
/// Commits the batches accumulated for the last entity of the walk. That
/// entity has no following tuple to trigger
/// <see cref="MaybeFlushOnEntityChange"/>, so the caller invokes this once
/// after its loop. Does nothing when no populate is pending
/// (<paramref name="populateEntityId"/> is null) or when
/// <paramref name="populateScratch"/> is empty.
/// <para>
/// Unlike the per-tuple check, this does not hand back reset tracker
/// values: it is meant for end-of-loop use only, where the caller's tracker
/// locals are about to go out of scope.
/// </para>
/// </summary>
internal static void FinalFlushPopulate(
    uint? populateEntityId,
    uint populateLandblockId,
    EntityClassificationCache cache,
    List<CachedBatch> populateScratch)
{
    if (!populateEntityId.HasValue || populateScratch.Count <= 0)
    {
        return;
    }

    CachedBatch[] snapshot = populateScratch.ToArray();
    cache.Populate(populateEntityId.Value, populateLandblockId, snapshot);
    populateScratch.Clear();
}
|
||
|
||
/// <summary>
/// Adds one instance matrix to the group registered under
/// <paramref name="key"/> in <c>_groups</c>, creating the
/// <see cref="InstanceGroup"/> from the key's fields on first use.
/// Has the shape of the <c>appendInstance</c> delegate expected by
/// <see cref="ApplyCacheHit"/>.
/// </summary>
/// <param name="key">Grouping key identifying the draw batch.</param>
/// <param name="model">Per-instance world matrix to append.</param>
private void AppendInstanceToGroup(GroupKey key, Matrix4x4 model)
{
    if (_groups.TryGetValue(key, out var group))
    {
        group.Matrices.Add(model);
        return;
    }

    // First instance for this key: seed a new group from the key itself.
    var created = new InstanceGroup();
    created.Ibo = key.Ibo;
    created.FirstIndex = key.FirstIndex;
    created.BaseVertex = key.BaseVertex;
    created.IndexCount = key.IndexCount;
    created.BindlessTextureHandle = key.BindlessTextureHandle;
    created.TextureLayer = key.TextureLayer;
    created.Translucency = key.Translucency;

    _groups[key] = created;
    created.Matrices.Add(model);
}
|
||
|
||
/// <summary>
/// Walks every batch of <paramref name="renderData"/>, determines its
/// translucency and bindless texture handle, and appends
/// <paramref name="model"/> to the matching <see cref="InstanceGroup"/> in
/// <c>_groups</c> (creating the group when needed). Batches whose texture
/// resolves to handle 0 are skipped.
/// </summary>
/// <param name="renderData">Render data whose batches are classified.</param>
/// <param name="gfxObjId">Id used, with the batch index, to look up surface metadata.</param>
/// <param name="model">World matrix added as one instance per accepted batch.</param>
/// <param name="entity">Entity being drawn; forwarded to texture resolution.</param>
/// <param name="meshRef">Mesh reference being drawn; forwarded to texture resolution.</param>
/// <param name="palHash">Palette hash forwarded to texture resolution.</param>
/// <param name="metaTable">Preferred source of per-batch translucency.</param>
/// <param name="restPose">Matrix stored alongside each collected batch.</param>
/// <param name="collector">
/// Optional sink; when non-null, one <see cref="CachedBatch"/> is recorded
/// per accepted batch.
/// </param>
private void ClassifyBatches(
    ObjectRenderData renderData,
    ulong gfxObjId,
    Matrix4x4 model,
    WorldEntity entity,
    MeshRef meshRef,
    ulong palHash,
    AcSurfaceMetadataTable metaTable,
    Matrix4x4 restPose,
    List<CachedBatch>? collector = null)
{
    // TextureLayer is always 0 for per-instance composites; non-zero when
    // WB atlas is adopted in N.6+ and batches reference a shared atlas layer.
    const uint AtlasLayer = 0;

    for (int batchIdx = 0; batchIdx < renderData.Batches.Count; batchIdx++)
    {
        var batch = renderData.Batches[batchIdx];

        // Metadata table wins; otherwise fall back to the batch's own flags.
        TranslucencyKind kind;
        if (metaTable.TryLookup(gfxObjId, batchIdx, out var meta))
        {
            kind = meta.Translucency;
        }
        else if (batch.IsAdditive)
        {
            kind = TranslucencyKind.Additive;
        }
        else if (batch.IsTransparent)
        {
            kind = TranslucencyKind.AlphaBlend;
        }
        else
        {
            kind = TranslucencyKind.Opaque;
        }

        ulong handle = ResolveTexture(entity, meshRef, batch, palHash);
        if (handle == 0)
        {
            continue;
        }

        int baseVertex = (int)batch.BaseVertex;
        var key = new GroupKey(
            batch.IBO, batch.FirstIndex, baseVertex,
            batch.IndexCount, handle, AtlasLayer, kind);

        if (!_groups.TryGetValue(key, out var group))
        {
            group = new InstanceGroup
            {
                Ibo = batch.IBO,
                FirstIndex = batch.FirstIndex,
                BaseVertex = baseVertex,
                IndexCount = batch.IndexCount,
                BindlessTextureHandle = handle,
                TextureLayer = AtlasLayer,
                Translucency = kind,
            };
            _groups[key] = group;
        }

        group.Matrices.Add(model);

        if (collector is not null)
        {
            collector.Add(new CachedBatch(key, handle, restPose));
        }
    }
}
|
||
|
||
/// <summary>
/// Resolves the bindless texture handle for one batch, choosing the
/// texture-cache entry point according to the overrides in play:
/// palette override first, then an original-texture override from
/// <c>meshRef.SurfaceOverrides</c>, then the plain surface lookup.
/// </summary>
/// <returns>
/// The handle returned by the texture cache, or 0 when the batch's surface
/// id is 0 or 0xFFFFFFFF.
/// </returns>
private ulong ResolveTexture(WorldEntity entity, MeshRef meshRef, ObjectRenderBatch batch, ulong palHash)
{
    uint surfaceId = batch.Key.SurfaceId;
    if (surfaceId is 0 or 0xFFFFFFFF)
    {
        return 0;
    }

    // Per-surface replacement texture, if the mesh reference declares one.
    uint replacementTex = 0;
    bool hasReplacement = meshRef.SurfaceOverrides is { } overrides
        && overrides.TryGetValue(surfaceId, out replacementTex);

    if (entity.PaletteOverride is not null)
    {
        uint? replacementOrNull = hasReplacement ? replacementTex : (uint?)null;
        return _textures.GetOrUploadWithPaletteOverrideBindless(
            surfaceId, replacementOrNull, entity.PaletteOverride, palHash);
    }

    if (hasReplacement)
    {
        return _textures.GetOrUploadWithOrigTextureOverrideBindless(surfaceId, replacementTex);
    }

    return _textures.GetOrUploadBindless(surfaceId);
}
|
||
|
||
/// <summary>
/// Copies the 16 components of <paramref name="m"/> into
/// <paramref name="buf"/> starting at <paramref name="offset"/>, one matrix
/// row after another (M11..M14, M21..M24, M31..M34, M41..M44).
/// </summary>
/// <param name="buf">Destination array; needs 16 slots from <paramref name="offset"/>.</param>
/// <param name="offset">Index of the first float written.</param>
/// <param name="m">Matrix to serialize.</param>
private static void WriteMatrix(float[] buf, int offset, in Matrix4x4 m)
{
    int i = offset;

    // Row 1
    buf[i++] = m.M11;
    buf[i++] = m.M12;
    buf[i++] = m.M13;
    buf[i++] = m.M14;

    // Row 2
    buf[i++] = m.M21;
    buf[i++] = m.M22;
    buf[i++] = m.M23;
    buf[i++] = m.M24;

    // Row 3
    buf[i++] = m.M31;
    buf[i++] = m.M32;
    buf[i++] = m.M33;
    buf[i++] = m.M34;

    // Row 4 (M41..M43 hold the translation)
    buf[i++] = m.M41;
    buf[i++] = m.M42;
    buf[i++] = m.M43;
    buf[i] = m.M44;
}
|
||
|
||
/// <summary>
/// Deletes the GL objects held by this dispatcher: the instance SSBO, the
/// batch SSBO, the indirect buffer and, if they were created, the two GPU
/// timer queries. Calls after the first are no-ops.
/// </summary>
public void Dispose()
{
    if (_disposed)
    {
        return;
    }

    // Mark first so a repeated call returns above.
    _disposed = true;

    _gl.DeleteBuffer(_instanceSsbo);
    _gl.DeleteBuffer(_batchSsbo);
    _gl.DeleteBuffer(_indirectBuffer);

    if (!_gpuQueriesInitialized)
    {
        return;
    }

    _gl.DeleteQuery(_gpuQueryOpaque);
    _gl.DeleteQuery(_gpuQueryTransparent);
}
|
||
|
||
// ── Public types + helpers for BuildIndirectArrays (Task 9) ─────────────
|
||
//
|
||
// These are public so the pure-CPU unit tests in AcDream.Core.Tests can
|
||
// exercise BuildIndirectArrays without needing a GL context.
|
||
|
||
/// <summary>
/// Stride in bytes between consecutive <c>DrawElementsIndirectCommand</c>
/// entries in the indirect buffer: five 32-bit fields, 20 bytes. Tests and
/// callers should use this symbol instead of a literal <c>20</c> so the
/// value lives in exactly one place. It is not derived from the struct, so
/// it must be updated by hand if the command layout ever changes.
/// </summary>
public const int DrawCommandStride = 5 * sizeof(uint); // == 20, sizeof(DrawElementsIndirectCommand)
|
||
|
||
/// <summary>
/// Public, GL-free description of one draw group as consumed by
/// <see cref="BuildIndirectArrays"/>; exposed so tests can build inputs.
/// </summary>
/// <param name="IndexCount">Number of indices; written to the command's <c>Count</c>.</param>
/// <param name="FirstIndex">First index; written to the command's <c>FirstIndex</c>.</param>
/// <param name="BaseVertex">Base vertex; written to the command's <c>BaseVertex</c>.</param>
/// <param name="InstanceCount">Number of instances; written to the command's <c>InstanceCount</c>.</param>
/// <param name="FirstInstance">First instance slot; written to the command's <c>BaseInstance</c>.</param>
/// <param name="TextureHandle">64-bit texture handle copied into the batch data.</param>
/// <param name="TextureLayer">Texture layer copied into the batch data.</param>
/// <param name="Translucency">Decides whether the group lands in the opaque or the transparent section.</param>
public readonly record struct IndirectGroupInput(
    int IndexCount, uint FirstIndex, int BaseVertex,
    int InstanceCount, int FirstInstance,
    ulong TextureHandle, uint TextureLayer,
    TranslucencyKind Translucency);
|
||
|
||
/// <summary>
/// Public twin of the private per-group <see cref="BatchData"/> record that
/// is uploaded to the batch SSBO. It carries the same three fields so tests
/// can verify the layout and so <see cref="BuildIndirectArrays"/> can be
/// called without access to the private type.
/// </summary>
[StructLayout(LayoutKind.Sequential, Pack = 8)]
public struct BatchDataPublic
{
    /// <summary>64-bit texture handle for the group.</summary>
    public ulong TextureHandle;

    /// <summary>Texture layer for the group.</summary>
    public uint TextureLayer;

    /// <summary>Flag bits; <see cref="BuildIndirectArrays"/> always writes 0.</summary>
    public uint Flags;
}
|
||
|
||
/// <summary>
/// Section sizes produced by <see cref="BuildIndirectArrays"/>.
/// </summary>
/// <param name="OpaqueCount">Number of commands in the opaque section, which starts at index 0.</param>
/// <param name="TransparentCount">Number of commands in the transparent section, which follows the opaque one.</param>
/// <param name="TransparentByteOffset">Byte offset of the first transparent command (<c>OpaqueCount * DrawCommandStride</c>).</param>
public readonly record struct IndirectLayoutResult(
    int OpaqueCount, int TransparentCount, int TransparentByteOffset);
|
||
|
||
/// <summary>
/// Fills <paramref name="indirectScratch"/> and the parallel
/// <paramref name="batchScratch"/> with one entry per group, partitioned
/// into two contiguous sections: opaque groups at <c>[0, opaqueCount)</c>
/// and transparent groups right after them. Within each section the
/// relative order of <paramref name="groups"/> is kept, so the caller is
/// responsible for sorting beforehand. Pure CPU work, no GL state.
/// </summary>
/// <param name="groups">Groups to lay out.</param>
/// <param name="indirectScratch">Pre-sized destination for the draw commands.</param>
/// <param name="batchScratch">Pre-sized destination for the per-draw batch data.</param>
/// <returns>Section counts plus the byte offset of the transparent section.</returns>
/// <remarks>
/// Classification: Opaque + ClipMap → opaque pass (ClipMap uses discard, not
/// blending). Everything else (AlphaBlend, Additive, InvAlpha) → transparent pass.
/// </remarks>
public static IndirectLayoutResult BuildIndirectArrays(
    IReadOnlyList<IndirectGroupInput> groups,
    DrawElementsIndirectCommand[] indirectScratch,
    BatchDataPublic[] batchScratch)
{
    // Pass 1: size the two sections so the transparent cursor knows where to start.
    int opaqueTotal = 0;
    int transparentTotal = 0;
    foreach (var input in groups)
    {
        if (IsOpaque(input.Translucency))
        {
            opaqueTotal++;
        }
        else
        {
            transparentTotal++;
        }
    }

    // Pass 2: scatter each group into the next free slot of its section.
    int nextOpaque = 0;
    int nextTransparent = opaqueTotal;
    foreach (var input in groups)
    {
        int slot = IsOpaque(input.Translucency) ? nextOpaque++ : nextTransparent++;

        indirectScratch[slot] = new DrawElementsIndirectCommand
        {
            Count = (uint)input.IndexCount,
            InstanceCount = (uint)input.InstanceCount,
            FirstIndex = input.FirstIndex,
            BaseVertex = input.BaseVertex,
            BaseInstance = (uint)input.FirstInstance,
        };

        batchScratch[slot] = new BatchDataPublic
        {
            TextureHandle = input.TextureHandle,
            TextureLayer = input.TextureLayer,
            Flags = 0,
        };
    }

    int transparentByteOffset = opaqueTotal * DrawCommandStride;
    return new IndirectLayoutResult(opaqueTotal, transparentTotal, transparentByteOffset);
}
|
||
|
||
/// <summary>
/// Test-facing wrapper around the private <see cref="IsOpaque"/>. It pins
/// the N.5 Decision 2 translucency partition: Opaque + ClipMap go to the
/// opaque indirect section; AlphaBlend + Additive + InvAlpha go to the
/// transparent one.
/// </summary>
/// <param name="t">Translucency kind to classify.</param>
/// <returns>Whatever <see cref="IsOpaque"/> returns for <paramref name="t"/>.</returns>
public static bool IsOpaquePublic(TranslucencyKind t)
{
    return IsOpaque(t);
}
|
||
|
||
/// <summary>
/// True for the translucency kinds drawn in the opaque pass:
/// <c>Opaque</c> and <c>ClipMap</c>. Every other kind is treated as
/// transparent.
/// </summary>
private static bool IsOpaque(TranslucencyKind t)
{
    return t is TranslucencyKind.Opaque or TranslucencyKind.ClipMap;
}
|
||
|
||
// ────────────────────────────────────────────────────────────────────────
|
||
|
||
/// <summary>
/// Mutable per-group accumulator: the draw parameters shared by every
/// instance of one batch/texture/translucency combination, plus the list of
/// instance matrices collected for it.
/// </summary>
private sealed class InstanceGroup
{
    /// <summary>Index buffer id taken from the batch (or group key).</summary>
    public uint Ibo;

    /// <summary>First index of the batch.</summary>
    public uint FirstIndex;

    /// <summary>Base vertex of the batch.</summary>
    public int BaseVertex;

    /// <summary>Number of indices in the batch.</summary>
    public int IndexCount;

    /// <summary>64-bit bindless texture handle (was uint TextureHandle in N.4).</summary>
    public ulong BindlessTextureHandle;

    /// <summary>0 for per-instance composites; non-zero when WB atlas is adopted in N.6+.</summary>
    public uint TextureLayer;

    /// <summary>Translucency kind; decides opaque vs. transparent pass.</summary>
    public TranslucencyKind Translucency;

    /// <summary>
    /// Index of this group's first matrix in the contiguous per-frame
    /// instance array, counted in instances (not floats or bytes).
    /// </summary>
    public int FirstInstance;

    /// <summary>Number of instances laid out for this group.</summary>
    public int InstanceCount;

    /// <summary>Squared distance from the camera to the first instance; key for the opaque sort.</summary>
    public float SortDistance;

    /// <summary>World matrices of the instances collected for this group.</summary>
    public readonly List<Matrix4x4> Matrices = new List<Matrix4x4>();
}
|
||
}
|