Initial commit — leak-hunt project complete

Five bugs identified and patched in retail Asheron's Call client:
- v3b: palette refcount over-increment (3-byte NOP at two sites)
- v5: RenderSurface PurgeResource no-op stub (vtable slot 2 thunk)
- v11: two dangling-pointer crash guards (NULL-check + reorder)
- v14: CEnvCell::Destroy ClipPlaneList leak (18-byte JMP to cleanup thunk)
- v22: unpacker stale-pointer SEH guard (whole-function __try/__except)

All five ship in leakfix.dll (117 KB, SHA d282f23c…) which is loaded
by acclient.exe at process start via PE import table patching by
tools/install_leakfix.py.

Controlled 15-client fleet soak: unpatched control died at 26h with
palette exhaustion; all 14 patched clients survived past that point
and reached ≥5-day uptime.

Residual ~15 MB/h growth traced to d3d9.dll's internal slab allocator
(260KB surface backing buffers retained after Release). See REPORT.md
§10 for the full investigation; conclusion is that it's unfixable from
outside d3d9.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
acbot 2026-05-23 21:05:17 +02:00
commit 57b5e43d0e
199 changed files with 1648333 additions and 0 deletions

View file

@ -0,0 +1,46 @@
{
"phases": [
{
"name": "idle",
"duration_min": 60,
"description": "Sit at lifestone. No input. Establishes baseline allocator noise.",
"actions": []
},
{
"name": "wander",
"duration_min": 60,
"description": "Walk a fixed route around Holtburg town. Targets streaming + landblock loads.",
"actions": [
{ "type": "walk_route", "waypoints": ["lifestone", "town-square", "marketplace", "south-gate", "lifestone"], "loop": true }
]
},
{
"name": "chat",
"duration_min": 60,
"description": "Spam /say and /tell. Targets chat-log buffers.",
"actions": [
{ "type": "send_chat", "channel": "say", "message_template": "test {counter}", "interval_sec": 2 }
]
},
{
"name": "target-cycle",
"duration_min": 60,
"description": "Tab through nearby targetables. No combat. Targets selection + tooltip allocation.",
"actions": [
{ "type": "press_key", "key": "Tab", "interval_sec": 3 }
]
},
{
"name": "ui-cycle",
"duration_min": 60,
"description": "Open/close inventory, character pane, spells pane. Targets UI-widget allocation.",
"actions": [
{ "type": "press_key", "key": "i", "interval_sec": 5 },
{ "type": "press_key", "key": "c", "interval_sec": 7 },
{ "type": "press_key", "key": "s", "interval_sec": 9 }
]
}
],
"snapshot_interval_min": 15,
"notes": "Phase 2 schedule. Run one phase per session, fresh from bench-verified snapshot. Compare growth rates across phases to localize the leak's subsystem."
}

52
templates/login.ahk Normal file
View file

@ -0,0 +1,52 @@
; AutoHotkey v2 — login skeleton for retail acclient.exe
;
; Drives the launcher login screen: waits for the AC window, types the test
; credentials, then keys through character select. Every wait below is a
; fixed Sleep — adjust them (or add ImageSearch / Click coordinates) after
; the first manual run, since UI layouts depend on resolution and skin.
;
; Usage: launch this after supervisor.ps1 starts acclient.exe
; Exit codes: 0 = full key sequence was sent, 1 = AC window never appeared.
#Requires AutoHotkey v2.0
#SingleInstance Force

; --- config ---
USERNAME := "testaccount"
PASSWORD := "testpassword"
CHAR_SLOT := 1          ; 1-based position in the character list
WAIT_TIMEOUT_S := 60    ; seconds to wait for the AC window to appear
; --- end config ---

WinTitle := "Asheron's Call"

; Wait for the AC window (WinWait returns 0 on timeout)
if not WinWait(WinTitle, , WAIT_TIMEOUT_S) {
    MsgBox "AC window not found within " WAIT_TIMEOUT_S "s — aborting"
    ExitApp 1
}
WinActivate WinTitle
Sleep 2000

; Type the credentials with SendText, not Send: Send treats ! + ^ # { } as
; modifier / key-name syntax, which would mangle any username or password
; containing those characters. SendText sends every character literally.
SendText USERNAME
Send "{Tab}"
SendText PASSWORD
Send "{Enter}"

; Wait for character select screen — adjust the wait for your skin
Sleep 8000

; Select character (slot 1 is top of list, so it needs zero Down presses)
Loop CHAR_SLOT - 1 {
    Send "{Down}"
    Sleep 200
}
Send "{Enter}"

; Wait for in-world load
Sleep 15000

; Reaching this point means the whole key sequence was sent; nothing here
; verifies the client is actually in-world.
; The supervisor doesn't need anything else from us; the controller DLL
; (Phase 3) drives in-game activity.
ExitApp 0

26
templates/snapshot.ps1 Normal file
View file

@ -0,0 +1,26 @@
# Take one UMDH stack-tagged heap snapshot of a running process.
#
# Requirements:
#   - gflags /i acclient.exe +ust   (one-time, registry-set)
#   - _NT_SYMBOL_PATH pointing at acclient.pdb directory
#   - umdh.exe on PATH (Windows Debugging Tools)
#
# Parameters:
#   -ProcessId  PID of the process to snapshot.
#   -Out        Path of the snapshot log to write. An existing file at this
#               path is deleted before umdh runs.
#
# Throws when no non-empty snapshot file exists at -Out after umdh returns.
param(
    [Parameter(Mandatory=$true)][int]$ProcessId,
    [Parameter(Mandatory=$true)][string]$Out
)
$ErrorActionPreference = "Stop"

if (-not $env:_NT_SYMBOL_PATH) {
    Write-Warning "_NT_SYMBOL_PATH not set — symbols may not resolve"
}

# Remove any stale file first. Otherwise a leftover snapshot from an earlier
# run would satisfy the existence check below even if umdh wrote nothing.
if (Test-Path -LiteralPath $Out) {
    Remove-Item -LiteralPath $Out -Force
}

# Each switch is passed as one quoted token so PowerShell hands umdh exactly
# "-p:<pid>" and "-f:<path>", including when the path contains spaces.
& umdh.exe "-p:$ProcessId" "-f:$Out"
$umdhExit = $LASTEXITCODE

# $ErrorActionPreference does not cover native-command exit codes, so
# surface a non-zero code explicitly. The output file checks below remain
# the hard success criterion.
if ($umdhExit -ne 0) {
    Write-Warning "umdh exited with code $umdhExit"
}

if (-not (Test-Path -LiteralPath $Out)) {
    throw "umdh produced no output at $Out"
}

$size = (Get-Item -LiteralPath $Out).Length
if ($size -eq 0) {
    throw "umdh produced an empty snapshot at $Out (exit code $umdhExit)"
}
Write-Host "snapshot: $Out ($size bytes)"

119
templates/supervisor.ps1 Normal file
View file

@ -0,0 +1,119 @@
# Supervisor harness for the AC client memory-leak hunt.
#
# What this does:
#   - Sets _NT_SYMBOL_PATH so umdh/cdb resolve symbols against acclient.pdb
#   - Verifies gflags +ust is enabled (required for stack-tagged allocations)
#   - Starts ACE (optionally) and the AC client
#   - Periodically calls snapshot.ps1 to capture UMDH snapshots
#   - Watches for process exit and logs the exit code (the procdump capture
#     is still a commented-out stub)
#   - Diffs the first and last snapshot with umdh -d
#
# Parameters:
#   -Phase   Name of the run; artifacts are written to $OutRoot\<Phase>.
#
# Skeleton — flesh out at Phase 1 time. Configurable up top.
param([Parameter(Mandatory=$true)][string]$Phase)

#region Config
$AcExe         = "C:\Turbine\Asheron's Call\acclient.exe"
$PdbDir        = "C:\Users\acbot\leakhunt\pdb"
$OutRoot       = "C:\Users\acbot\leakhunt\artifacts"
$LauncherPs1   = "C:\Users\acbot\leakhunt\bin\launch_acclient.ps1"  # gitignored, has creds
$UmdhExe       = "C:\Program Files (x86)\Windows Kits\10\Debuggers\x86\umdh.exe"
$GflagsExe     = "C:\Program Files (x86)\Windows Kits\10\Debuggers\x86\gflags.exe"
$SnapshotEvery = 1800   # seconds (30 min — Phase 1 default; bump down for Phase 4)
$MaxDuration   = 14400  # seconds (4h default — bump for Phase 4)
$AceCwd        = $null  # Coldeve real server in use; no local ACE
# Command string handed to "pwsh -c" when ACE is auto-started. It was
# referenced by the ACE start step but never defined; set it together with
# $AceCwd.
$AceCmd        = $null
#endregion

# Turn non-terminating cmdlet errors into terminating ones so a failed step
# aborts the run instead of continuing with bad state.
$ErrorActionPreference = "Stop"
# Print a progress line to the host, prefixed with the current wall-clock
# time as [HH:mm:ss], in cyan.
function Write-Step([string]$msg) {
    $stamp = Get-Date -Format HH:mm:ss
    $line  = "[$stamp] $msg"
    Write-Host $line -ForegroundColor Cyan
}
# All artifacts for this run go under <OutRoot>\<Phase>.
$phaseDir = Join-Path $OutRoot $Phase
New-Item -ItemType Directory -Path $phaseDir -Force | Out-Null
Write-Step "Output: $phaseDir"

# 1. Symbol path for umdh / cdb
$env:_NT_SYMBOL_PATH = $PdbDir
Write-Step "_NT_SYMBOL_PATH = $env:_NT_SYMBOL_PATH"

# snapshot.ps1 invokes a bare "umdh.exe". Put the configured debugger
# directory first on this process's PATH so snapshots and the final diff
# use the same umdh binary.
$env:PATH = (Split-Path -Parent $UmdhExe) + ";" + $env:PATH

# 2. Confirm gflags +ust is set on acclient.exe (via IFEO registry — no admin needed)
$ifeoFlag = (Get-ItemProperty "HKLM:\SOFTWARE\Microsoft\Windows NT\CurrentVersion\Image File Execution Options\acclient.exe" -ErrorAction SilentlyContinue).GlobalFlag
# 0x1000 is the bit tested for +ust; a missing key or value yields $null.
if (-not $ifeoFlag -or -not ($ifeoFlag -band 0x1000)) {
    throw "gflags +ust NOT set on acclient.exe (GlobalFlag=$ifeoFlag). Run elevated: gflags /i acclient.exe +ust"
}
Write-Step "gflags +ust verified (GlobalFlag=0x$([Convert]::ToString($ifeoFlag,16)))"

# 3. (Optional) start ACE
if ($null -ne $AceCwd) {
    # Fail with a clear message instead of letting Start-Process choke on a
    # null argument when the ACE command was never configured.
    if (-not $AceCmd) {
        throw 'AceCwd is set but AceCmd is not — define $AceCmd in the Config region.'
    }
    Write-Step "Starting ACE in $AceCwd ..."
    $aceProc = Start-Process -FilePath "pwsh" -ArgumentList "-c", $AceCmd `
        -WorkingDirectory $AceCwd -PassThru -WindowStyle Minimized
    Start-Sleep -Seconds 8
    if ($aceProc.HasExited) {
        throw "ACE exited during startup. Check $AceCwd."
    }
} else {
    Write-Step "ACE not auto-started — assume user/operator has it running"
}

# 4. Launch acclient via the credentialed launcher (auto-login via -a/-v/-h CLI args)
Write-Step "Launching acclient via $LauncherPs1 ..."
& $LauncherPs1
Start-Sleep -Seconds 5
# If several acclient processes exist, take the most recently started one.
$acProc = Get-Process -Name acclient -ErrorAction Stop | Sort-Object StartTime -Descending | Select-Object -First 1
$pid_ac = $acProc.Id
Write-Step "acclient pid = $pid_ac"

# 5. Wait for in-world plateau (working set typically settles past ~500 MB once cell data loads)
Write-Step "Waiting for in-world plateau (working set >= 500 MB) ..."
$plateauDeadline = (Get-Date).AddSeconds(180)
$plateauReached = $false
while ((Get-Date) -lt $plateauDeadline) {
    Start-Sleep -Seconds 5
    if ($acProc.HasExited) { throw "acclient exited during login. ExitCode=$($acProc.ExitCode)" }
    $ws = (Get-Process -Id $pid_ac).WorkingSet64
    if ($ws -gt 500MB) {
        Write-Step "Plateau detected: WS=$([math]::Round($ws/1MB,1)) MB"
        $plateauReached = $true
        break
    }
}
# The run still proceeds on timeout (as before), but no longer silently:
# snapshots taken before the client is in-world skew the growth numbers.
if (-not $plateauReached) {
    Write-Warning "In-world plateau not reached within 180s — continuing; early snapshots may reflect login rather than steady state"
}

# 6. Snapshot loop: sleep, check the client is alive, snapshot, repeat until
#    the client exits or $MaxDuration has elapsed.
$start = Get-Date
$snapIdx = 1
$snapshotScript = Join-Path $PSScriptRoot "snapshot.ps1"
while ($true) {
    Start-Sleep -Seconds $SnapshotEvery
    if ($acProc.HasExited) {
        Write-Step "acclient EXITED — code $($acProc.ExitCode)"
        # Capture dump if process still around (sometimes lingers briefly)
        # & procdump -ma $pid_ac "$phaseDir\crash.dmp" 2>&1 | Out-Null
        break
    }
    $snapPath = Join-Path $phaseDir ("snap_{0:D3}.txt" -f $snapIdx)
    Write-Step "snapshot $snapIdx -> $snapPath"
    & $snapshotScript -ProcessId $pid_ac -Out $snapPath
    $snapIdx++
    if (((Get-Date) - $start).TotalSeconds -gt $MaxDuration) {
        Write-Step "Max duration reached. Final snapshot done."
        break
    }
}

# 7. Final diff. $snapIdx is one past the last snapshot taken, so "-gt 2"
#    means at least two snapshots exist to compare.
if ($snapIdx -gt 2) {
    $first = Join-Path $phaseDir ("snap_001.txt")
    $last  = Join-Path $phaseDir ("snap_{0:D3}.txt" -f ($snapIdx - 1))
    $diff  = Join-Path $phaseDir "diff_first_to_last.txt"
    Write-Step "Diff: $first -> $last"
    & $UmdhExe -d $first $last "-f:$diff"
    # Native exit codes are not covered by $ErrorActionPreference.
    if ($LASTEXITCODE -ne 0) {
        Write-Warning "umdh -d exited with code $LASTEXITCODE — check $diff"
    }
}
Write-Step "Supervisor done."

48
templates/trace.cdb Normal file
View file

@ -0,0 +1,48 @@
$$ cdb scripting template — attach to acclient.exe, set non-blocking
$$ breakpoints on suspected allocator functions, count hits, auto-detach.
$$
$$ Usage:
$$   cdb.exe -pn acclient.exe -cf <this-file> -logo <output.log>
$$
$$ Or attach by PID:
$$   cdb.exe -p <pid> -cf <this-file> -logo <output.log>
$$
$$ Tips:
$$ - `gc` = "go from conditional breakpoint" — resumes the debuggee in the
$$   same mode (go / step / trace) that was in effect when the breakpoint hit
$$ - `qd` = "quit detached" — leaves the debuggee running, exits cdb
$$ - Counter $t0..$t19 are persistent across breakpoint hits
* - Don't put `;` inside breakpoint action strings without escaping —
*   cdb's command parser splits on `;` even inside actions.
* - A `$$` comment ends at the first `;`, so every comment line in this file
*   that contains a semicolon uses `*` instead, which runs to end of line.

$$ NOTE(review): the usage lines above also pass -logo — confirm which of
$$ the two log files is the one you want to keep.
.logopen /t leak-trace.log

$$ Symbol path — local PDB only, no symbol server.
$$ NOTE(review): supervisor.ps1 points at a different PDB directory —
$$ confirm this path matches your layout.
.sympath C:\leak-hunt\pdb
.symopt+ 0x40
.reload /f acclient.exe

$$ Verify the symbol we care about resolves (replace as needed)
$$ x acclient!CChatManager::AddLine

$$ ============================================================
$$ Counters
$$ ============================================================
$$ Comments sit on their own lines: `$$` is only a comment at the start of
$$ a command, so a trailing `$$ ...` would be parsed as part of the `r`
$$ expression.
$$ alloc-site hits
r $t0 = 0
$$ free-site hits
r $t1 = 0
$$ unmatched (leak candidate) hits — reserved, no breakpoint below updates it
r $t2 = 0

$$ ============================================================
$$ Breakpoint pattern: increment counter, log every Nth, auto-detach at M
$$ ============================================================
$$ Replace <ALLOC_FN> and <FREE_FN> with the suspected function names.
$$ Alloc site: log every 1000th hit, detach once 100000 hits are reached.
bp acclient!<ALLOC_FN> "r $t0 = @$t0 + 1; .if (@$t0 % 1000 == 0) { .printf \"alloc hits: %d\\n\", @$t0 }; .if (@$t0 >= 100000) { .printf \"AUTO-DETACH at %d\\n\", @$t0; qd } .else { gc }"
$$ Free site: log every 1000th hit, never detaches on its own.
bp acclient!<FREE_FN> "r $t1 = @$t1 + 1; .if (@$t1 % 1000 == 0) { .printf \"free hits: %d\\n\", @$t1 }; gc"

$$ Optional: dump `this` struct on first hit
* bp acclient!<ALLOC_FN> "r $t0 = @$t0 + 1; .if (@$t0 == 1) { dt acclient!<ClassName> @ecx }; gc"

g
.logclose