leakhunt/tools/scan_rendersurface_refcounts.py
acbot 57b5e43d0e Initial commit — leak-hunt project complete
Five bugs identified and patched in retail Asheron's Call client:
- v3b: palette refcount over-increment (3-byte NOP at two sites)
- v5: RenderSurface PurgeResource no-op stub (vtable slot 2 thunk)
- v11: two dangling-pointer crash guards (NULL-check + reorder)
- v14: CEnvCell::Destroy ClipPlaneList leak (18-byte JMP to cleanup thunk)
- v22: unpacker stale-pointer SEH guard (whole-function __try/__except)

All five ship in leakfix.dll (117 KB, SHA d282f23c…) which is loaded
by acclient.exe at process start via PE import table patching by
tools/install_leakfix.py.

Controlled 15-client fleet soak: unpatched control died at 26h with
palette exhaustion; all 14 patched clients survived past that point
and reached ≥5-day uptime.

Residual ~15 MB/h growth traced to d3d9.dll's internal slab allocator
(260KB surface backing buffers retained after Release). See REPORT.md
§10 for the full investigation; conclusion is that it's unfixable from
outside d3d9.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-23 21:07:58 +02:00

123 lines
4.7 KiB
Python

"""
scan_rendersurface_refcounts.py <dump.dmp>
Walks every committed private RW region in the dump, looking for objects
that look like RenderSurface instances:
* Located inside any heap region (covered by minidump memory64).
* First DWORD is a vtable pointer into acclient.exe's .rdata range.
* Looks like a DBObj-derived object: m_pMaintainer (offset 0x20) is a
pointer that itself looks like a heap object.
For each candidate, reads m_numLinks at offset 0x24.
Why: the leak hypothesis fork is
cache-freelist hypothesis → m_numLinks == 1 for leaked surfaces
UI-held-ref hypothesis → m_numLinks > 1 for leaked surfaces
We can answer this empirically from a single dump.
Strategy:
1. Find the RenderSurface vtable in EoR by clustering. Look at objects
across the heap; whatever vtable value is the most popular among
objects of size ~0x120 is RenderSurface's vtable.
2. Histogram m_numLinks across those.
"""
import argparse
import os
import struct
import sys
from collections import Counter, defaultdict
from minidump.minidumpfile import MinidumpFile
def _enum_int(v):
if v is None: return 0
if hasattr(v, 'value'): return int(v.value)
return int(v)
def _scan_buffer(buf, acl_lo, acl_hi, step, vtable_hits, by_vtable_refcounts):
    """Tally DBObj-like object headers found in one region's bytes.

    A position ``off`` qualifies when:
      * the DWORD at ``off`` lies inside ``[acl_lo, acl_hi)`` (candidate
        vtable pointer into the acclient.exe image),
      * the DWORD at ``off + 0x24`` (m_numLinks) is in 1..10000, and
      * the DWORD at ``off + 0x20`` (m_pMaintainer) is NULL or lies in the
        32-bit user-mode range 0x00010000..0x7fffffff.

    Matches are recorded in place: ``vtable_hits[vtbl]`` is incremented and
    the m_numLinks value is appended to ``by_vtable_refcounts[vtbl]``.
    """
    unpack_dword = struct.Struct('<I').unpack_from
    unpack_fields = struct.Struct('<II').unpack_from
    # A header needs 0x28 readable bytes (vtable .. m_numLinks). The "+ 1"
    # keeps the final position whose header still fits inside the buffer;
    # a buffer shorter than 0x28 yields an empty range.
    for off in range(0, len(buf) - 0x28 + 1, step):
        vtbl = unpack_dword(buf, off)[0]
        if not (acl_lo <= vtbl < acl_hi):
            continue
        maintainer, num_links = unpack_fields(buf, off + 0x20)
        # m_numLinks should be a small positive int
        if not (1 <= num_links <= 10000):
            continue
        # m_pMaintainer should be a user-mode pointer (or null for non-cached objs)
        if maintainer != 0 and not (0x00010000 <= maintainer < 0x80000000):
            continue
        vtable_hits[vtbl] += 1
        by_vtable_refcounts[vtbl].append(num_links)


def main():
    """Scan a minidump for DBObj-like headers and histogram their m_numLinks.

    Command line: ``<dump> [--surface-size N] [--scan-step N] [--max-scan N]``.

    Locates acclient.exe in the dump's module list, walks every committed,
    private, read-write region listed in the dump's memory info, tallies
    candidate object headers per vtable value, and prints the 20 most common
    vtable values together with their most frequent m_numLinks values.

    Exits with status 1 when acclient.exe or the memory-info list is missing
    from the dump, and with status 2 (argparse usage error) on invalid
    ``--scan-step`` / ``--max-scan`` values.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("dump")
    # NOTE: --surface-size is accepted for CLI compatibility but is not used
    # as a filter anywhere in the scan below.
    ap.add_argument("--surface-size", type=lambda x: int(x, 0), default=0x120,
                    help="expected RenderSurface size (default 0x120)")
    ap.add_argument("--scan-step", type=lambda x: int(x, 0), default=8,
                    help="alignment step for object header search (default 8)")
    ap.add_argument("--max-scan", type=lambda x: int(x, 0), default=0x4000,
                    help="max bytes scanned from the start of each region; "
                         "0 scans whole regions (default 0x4000)")
    args = ap.parse_args()
    # Reject bad values up front: range() raises on a zero step and a
    # negative step would silently scan nothing.
    if args.scan_step <= 0:
        ap.error("--scan-step must be a positive integer")
    if args.max_scan < 0:
        ap.error("--max-scan must be >= 0")

    md = MinidumpFile.parse(args.dump)
    reader = md.get_reader().get_buffered_reader()

    # Find acclient.exe range
    modules = md.modules.modules if md.modules is not None else []
    acl = None
    for m in modules:
        # Module names are Windows paths. Split on both separators so the
        # match also works when the dump is analysed on a POSIX host, where
        # os.path.basename does not treat "\\" as a separator.
        leaf = m.name.replace("\\", "/").rsplit("/", 1)[-1]
        if leaf.lower() == "acclient.exe":
            acl = m
            break
    if acl is None:
        print("acclient.exe not in module list", file=sys.stderr)
        sys.exit(1)
    acl_lo = acl.baseaddress
    acl_hi = acl.baseaddress + acl.size
    print(f"acclient.exe: 0x{acl_lo:08x} - 0x{acl_hi:08x}")

    if md.memory_info is None:
        print("dump has no memory-info list; cannot enumerate regions",
              file=sys.stderr)
        sys.exit(1)

    # Iterate committed private RW regions, scan for object-headers
    region_count = 0
    unreadable = 0
    vtable_hits = Counter()                  # vtable -> count of objects with that first-DWORD
    by_vtable_refcounts = defaultdict(list)  # vtable -> list of m_numLinks values
    for r in md.memory_info.infos:
        st = _enum_int(r.State)
        ty = _enum_int(r.Type)
        pr = _enum_int(r.Protect) & 0xFF  # drop modifier bits above the low byte
        # 0x1000 = MEM_COMMIT, 0x20000 = MEM_PRIVATE,
        # 0x04 = PAGE_READWRITE, 0x40 = PAGE_EXECUTE_READWRITE
        if st != 0x1000 or ty != 0x20000 or pr not in (0x04, 0x40):
            continue
        # Cap scan to avoid huge regions (0 disables the cap). No object-size
        # constraint is imposed because the heap large-block path may pad
        # differently.
        if args.max_scan == 0:
            scan_size = r.RegionSize
        else:
            scan_size = min(r.RegionSize, args.max_scan)
        try:
            reader.move(r.BaseAddress)
            buf = reader.read(scan_size)
        except Exception:
            # Best effort: the region is listed but its bytes are not (fully)
            # present in the dump. Count it instead of hiding it.
            unreadable += 1
            continue
        if not buf:
            continue
        region_count += 1
        _scan_buffer(buf, acl_lo, acl_hi, args.scan_step,
                     vtable_hits, by_vtable_refcounts)

    if unreadable:
        print(f"note: {unreadable} matching regions could not be read from the dump",
              file=sys.stderr)
    print(f"scanned {region_count} regions, found {sum(vtable_hits.values())} candidate DBObj-like headers")
    print()
    print("top 20 vtable signatures (by candidate count):")
    print(f" {'vtbl_abs':>10} {'vtbl_rva':>10} {'count':>6} {'refcount distrib (mode -> count)':<40}")
    for vt, n in vtable_hits.most_common(20):
        rc_counter = Counter(by_vtable_refcounts[vt])
        rc_str = ", ".join(f"{k}->{v}" for k, v in rc_counter.most_common(6))
        print(f" 0x{vt:08x} 0x{vt-acl_lo:08x} {n:>6} {rc_str}")
# Script entry point: run the scan only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()