leakhunt/templates/supervisor.ps1
acbot 57b5e43d0e Initial commit — leak-hunt project complete
Five bugs identified and patched in retail Asheron's Call client:
- v3b: palette refcount over-increment (3-byte NOP at two sites)
- v5: RenderSurface PurgeResource no-op stub (vtable slot 2 thunk)
- v11: two dangling-pointer crash guards (NULL-check + reorder)
- v14: CEnvCell::Destroy ClipPlaneList leak (18-byte JMP to cleanup thunk)
- v22: unpacker stale-pointer SEH guard (whole-function __try/__except)

All five ship in leakfix.dll (117 KB, SHA d282f23c…) which is loaded
by acclient.exe at process start via PE import table patching by
tools/install_leakfix.py.

Controlled 15-client fleet soak: unpatched control died at 26h with
palette exhaustion; all 14 patched clients survived past that point
and reached ≥5-day uptime.

Residual ~15 MB/h growth traced to d3d9.dll's internal slab allocator
(260KB surface backing buffers retained after Release). See REPORT.md
§10 for the full investigation; conclusion is that it's unfixable from
outside d3d9.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-23 21:07:58 +02:00

119 lines
4.4 KiB
PowerShell

# Supervisor harness for the AC client memory-leak hunt.
#
# What this does:
# - Sets _NT_SYMBOL_PATH so umdh/cdb resolve symbols against acclient.pdb
# - Verifies gflags +ust is enabled (required for stack-tagged allocations)
# - Starts ACE (optionally) and the AC client
# - Periodically calls snapshot.ps1 to capture UMDH snapshots
# - Watches for process exit and logs the exit code; the procdump crash
#   capture is still a commented-out stub (see the snapshot loop)
#
# Skeleton — flesh out at Phase 1 time. Configurable up top.
param(
    # Phase label; used as the sub-directory of $OutRoot that receives this run's artifacts.
    [Parameter(Mandatory=$true)][string]$Phase
)
#region Config
# NOTE(review): $AcExe and $GflagsExe are not referenced by the visible part of
# this script; they are kept in case the launcher or later phases rely on them.
$AcExe = "C:\Turbine\Asheron's Call\acclient.exe"
$PdbDir = "C:\Users\acbot\leakhunt\pdb"
$OutRoot = "C:\Users\acbot\leakhunt\artifacts"
$LauncherPs1 = "C:\Users\acbot\leakhunt\bin\launch_acclient.ps1" # gitignored, has creds
$UmdhExe = "C:\Program Files (x86)\Windows Kits\10\Debuggers\x86\umdh.exe"
$GflagsExe = "C:\Program Files (x86)\Windows Kits\10\Debuggers\x86\gflags.exe"
$SnapshotEvery = 1800 # seconds (30 min — Phase 1 default; bump down for Phase 4)
$MaxDuration = 14400 # seconds (4h default — bump for Phase 4)
$AceCwd = $null # Coldeve real server in use; no local ACE
# Command string passed to `pwsh -c` when a local ACE is started. It was
# referenced by the ACE start step but never defined; it must be filled in
# whenever $AceCwd is set.
$AceCmd = $null
#endregion
# Turn non-terminating cmdlet errors into terminating ones so the supervisor
# stops at the first failed step instead of running on with bad state.
$ErrorActionPreference = "Stop"
# Print one progress line to the host in cyan, prefixed with the current
# wall-clock time as [HH:mm:ss].
#   $msg - text shown after the timestamp.
function Write-Step([string]$msg) {
    $stamp = Get-Date -Format HH:mm:ss
    Write-Host "[$stamp] $msg" -ForegroundColor Cyan
}
# Per-phase artifact directory: $OutRoot\$Phase. -Force keeps New-Item from
# failing when the directory is already there; its output object is discarded.
$phaseDir = Join-Path -Path $OutRoot -ChildPath $Phase
$null = New-Item -Path $phaseDir -ItemType Directory -Force
Write-Step "Output: $phaseDir"
# 1. Symbol path for umdh / cdb
# Set in this process's environment so child tools started later inherit it.
$env:_NT_SYMBOL_PATH = $PdbDir
Write-Step "_NT_SYMBOL_PATH = $env:_NT_SYMBOL_PATH"
# 2. Confirm gflags +ust is set on acclient.exe (via IFEO registry — no admin needed)
# Reads the GlobalFlag value from the image's IFEO key. A missing key or value
# yields $null, which is treated the same as "flag not set".
$ifeoKey = "HKLM:\SOFTWARE\Microsoft\Windows NT\CurrentVersion\Image File Execution Options\acclient.exe"
$ifeoProps = Get-ItemProperty -Path $ifeoKey -ErrorAction SilentlyContinue
$ifeoFlag = $ifeoProps.GlobalFlag
# 0x1000 is the bit this script takes to mean +ust.
$ustEnabled = $ifeoFlag -and ($ifeoFlag -band 0x1000)
if (-not $ustEnabled) {
    throw "gflags +ust NOT set on acclient.exe (GlobalFlag=$ifeoFlag). Run elevated: gflags /i acclient.exe +ust"
}
$ifeoFlagHex = [Convert]::ToString($ifeoFlag, 16)
Write-Step "gflags +ust verified (GlobalFlag=0x$ifeoFlagHex)"
# 3. (Optional) start ACE
# When $AceCwd is set, launch ACE through `pwsh -c $AceCmd` in that directory,
# wait 8 seconds, and abort if the process has already exited. When $AceCwd is
# $null, nothing is started and the operator is assumed to have a server up.
# $null goes on the left so the comparison stays a scalar test even if $AceCwd
# were ever a collection.
if ($null -ne $AceCwd) {
    # Without a command, Start-Process would receive a null argument-list
    # element and fail with an unhelpful binding error, so fail clearly here.
    if ([string]::IsNullOrWhiteSpace($AceCmd)) {
        throw "`$AceCwd is set ($AceCwd) but `$AceCmd is empty. Define `$AceCmd in the Config region."
    }
    Write-Step "Starting ACE in $AceCwd ..."
    $aceProc = Start-Process -FilePath "pwsh" -ArgumentList "-c", $AceCmd `
        -WorkingDirectory $AceCwd -PassThru -WindowStyle Minimized
    # Fixed grace period, then treat an early exit as a failed startup.
    Start-Sleep -Seconds 8
    if ($aceProc.HasExited) {
        throw "ACE exited during startup. Check $AceCwd."
    }
} else {
    Write-Step "ACE not auto-started — assume user/operator has it running"
}
# 4. Launch acclient via the credentialed launcher (auto-login via -a/-v/-h CLI args)
Write-Step "Launching acclient via $LauncherPs1 ..."
& $LauncherPs1
Start-Sleep -Seconds 5
# If several acclient processes exist, supervise the most recently started one.
$acProc = Get-Process -Name acclient -ErrorAction Stop | Sort-Object StartTime -Descending | Select-Object -First 1
# Open and cache a process handle while the client is still alive, so that
# HasExited/ExitCode can report the real exit status later. Best-effort: a
# failure here (e.g. access denied) must not stop the run.
# NOTE(review): relies on .NET caching the handle once .Handle is read — confirm
# on the PowerShell/.NET version in use.
try {
    $null = $acProc.Handle
} catch {
    Write-Warning "Could not open a handle to acclient; a later ExitCode may be unreliable: $($_.Exception.Message)"
}
$pid_ac = $acProc.Id
Write-Step "acclient pid = $pid_ac"
# 5. Wait for in-world plateau (working set typically settles past ~500 MB once cell data loads)
Write-Step "Waiting for in-world plateau (working set >= 500 MB) ..."
$plateauTimeoutSec = 180
$plateauBytes = 500MB
$plateauDeadline = (Get-Date).AddSeconds($plateauTimeoutSec)
$plateauReached = $false
$ws = 0
while ((Get-Date) -lt $plateauDeadline) {
    Start-Sleep -Seconds 5
    if ($acProc.HasExited) { throw "acclient exited during login. ExitCode=$($acProc.ExitCode)" }
    # The process can disappear between the HasExited check and this query.
    # Treat a miss as "no sample" so the next pass reports the exit properly.
    $live = Get-Process -Id $pid_ac -ErrorAction SilentlyContinue
    if ($null -eq $live) { continue }
    $ws = $live.WorkingSet64
    # -ge so the test agrees with the ">= 500 MB" wording logged above.
    if ($ws -ge $plateauBytes) {
        Write-Step "Plateau detected: WS=$([math]::Round($ws/1MB,1)) MB"
        $plateauReached = $true
        break
    }
}
if (-not $plateauReached) {
    # Previously the timeout was silent; say so, since the snapshots that
    # follow were not preceded by a confirmed plateau.
    Write-Warning "No plateau within $plateauTimeoutSec s (last WS=$([math]::Round($ws/1MB,1)) MB); continuing anyway."
}
# 6. Snapshot loop
# Every $SnapshotEvery seconds, run snapshot.ps1 (expected next to this script)
# against the client and write snap_NNN.txt into $phaseDir. The loop ends when
# the client exits or when more than $MaxDuration seconds have elapsed after a
# snapshot. $snapIdx is always the index of the NEXT snapshot, so the number of
# snapshots taken is $snapIdx - 1.
$start = Get-Date
$snapIdx = 1
$snapshotScript = Join-Path $PSScriptRoot "snapshot.ps1"
$pollSeconds = 5
while ($true) {
    # Wait out one interval in short slices rather than a single long sleep,
    # so a client exit is logged within seconds instead of up to a full
    # interval late. Snapshot cadence is unchanged.
    $nextSnapAt = (Get-Date).AddSeconds($SnapshotEvery)
    while (-not $acProc.HasExited) {
        $remaining = ($nextSnapAt - (Get-Date)).TotalSeconds
        if ($remaining -le 0) { break }
        Start-Sleep -Seconds ([int][math]::Ceiling([math]::Min($pollSeconds, $remaining)))
    }
    if ($acProc.HasExited) {
        Write-Step "acclient EXITED — code $($acProc.ExitCode)"
        # Capture dump if process still around (sometimes lingers briefly)
        # & procdump -ma $pid_ac "$phaseDir\crash.dmp" 2>&1 | Out-Null
        break
    }
    $snapPath = Join-Path $phaseDir ("snap_{0:D3}.txt" -f $snapIdx)
    Write-Step "snapshot $snapIdx -> $snapPath"
    & $snapshotScript -ProcessId $pid_ac -Out $snapPath
    $snapIdx++
    # The duration check comes after the snapshot, so the last snapshot may
    # land slightly past $MaxDuration.
    if (((Get-Date) - $start).TotalSeconds -gt $MaxDuration) {
        Write-Step "Max duration reached. Final snapshot done."
        break
    }
}
# 7. Final diff
# Compare the first and last snapshots with `umdh -d` and write the result to
# diff_first_to_last.txt in $phaseDir. At least two snapshots are needed.
$snapCount = $snapIdx - 1
if ($snapCount -ge 2) {
    $first = Join-Path $phaseDir "snap_001.txt"
    $last = Join-Path $phaseDir ("snap_{0:D3}.txt" -f $snapCount)
    $diff = Join-Path $phaseDir "diff_first_to_last.txt"
    Write-Step "Diff: $first -> $last"
    # Quote the -f: switch so it reaches umdh as one argument even when the
    # path contains spaces.
    & $UmdhExe -d $first $last "-f:$diff"
    # A native command's exit status does not raise a PowerShell error, so
    # check it explicitly. Warn only; the snapshots themselves are still valid.
    if ($LASTEXITCODE -ne 0) {
        Write-Warning "umdh exited with code $LASTEXITCODE; $diff may be missing or incomplete."
    }
} else {
    Write-Step "Only $snapCount snapshot(s) captured — skipping diff (need at least 2)"
}
Write-Step "Supervisor done."