leakhunt/tools/fleet_monitor.sh
acbot 57b5e43d0e Initial commit — leak-hunt project complete
Five bugs identified and patched in retail Asheron's Call client:
- v3b: palette refcount over-increment (3-byte NOP at two sites)
- v5: RenderSurface PurgeResource no-op stub (vtable slot 2 thunk)
- v11: two dangling-pointer crash guards (NULL-check + reorder)
- v14: CEnvCell::Destroy ClipPlaneList leak (18-byte JMP to cleanup thunk)
- v22: unpacker stale-pointer SEH guard (whole-function __try/__except)

All five ship in leakfix.dll (117 KB, SHA d282f23c…) which is loaded
by acclient.exe at process start via PE import table patching by
tools/install_leakfix.py.

Controlled 15-client fleet soak: unpatched control died at 26h with
palette exhaustion; all 14 patched clients survived past that point
and reached ≥5-day uptime.

Residual ~15 MB/h growth traced to d3d9.dll's internal slab allocator
(260KB surface backing buffers retained after Release). See REPORT.md
§10 for the full investigation; conclusion is that it's unfixable from
outside d3d9.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-23 21:07:58 +02:00

118 lines
4.5 KiB
Bash

#!/usr/bin/env bash
# Combined fleet monitor:
# - HB every 30 min
# - Snapshot every 1 h (appends to artifacts/snapshots/main.tsv)
# - Every 60 s: scan for acclient PIDs in-world (title has "Coldeve-"),
# apply v3b -> v5 -> v11 -> v12 -> v14 in cascade. Skip Jerry (control).
#
# Each patcher is idempotent — re-runs are no-ops when bytes already in
# patched state. To keep event log clean, only emits AUTO-* events for
# PIDs we haven't already seen as "done" for a given patch.
# Treat any reference to an unset variable as a fatal error.
set -u

# Python interpreter used for the snapshot tool and every patcher script.
PY="C:/Users/acbot/AppData/Local/Programs/Python/Python312/python.exe"

# Every tools/... and artifacts/... path below is relative to the project
# root, so refuse to run from anywhere else.
cd /c/Users/acbot/leakhunt || {
  echo "fleet_monitor: cannot cd to /c/Users/acbot/leakhunt" >&2
  exit 1
}

# Epoch seconds of the last heartbeat / snapshot. Starting at 0 makes both
# fire on the first pass through the main loop.
last_hb=0
last_snap=0

# Per-PID, per-patch tracking sets (sentinel files in /tmp)
SEEN_DIR="/tmp/fleet_mon_seen"
# Without the sentinel directory nothing can be marked as seen and every
# patch would be retried on every cycle, so treat failure as fatal.
mkdir -p "$SEEN_DIR" || {
  echo "fleet_monitor: cannot create $SEEN_DIR" >&2
  exit 1
}
# Record that a patch has been dealt with for a PID by creating the
# sentinel file "$SEEN_DIR/<pid>-<patch>".
# Arguments: $1 - process id, $2 - patch name (e.g. v3b)
# Returns:   exit status of touch
mark_seen() {
  local seen_pid=$1
  local seen_patch=$2
  touch "$SEEN_DIR/${seen_pid}-${seen_patch}"
}
# Succeed when the sentinel file "$SEEN_DIR/<pid>-<patch>" exists as a
# regular file, i.e. the patch was already marked for that PID.
# Arguments: $1 - process id, $2 - patch name (e.g. v3b)
# Returns:   0 if the sentinel exists, non-zero otherwise
is_seen() {
  local seen_pid=$1
  local seen_patch=$2
  [[ -f "$SEEN_DIR/${seen_pid}-${seen_patch}" ]]
}
# Main monitor loop; runs until the script is killed. Each pass:
#   1. emits a heartbeat line if 30 min have elapsed since the last one,
#   2. runs the snapshot tool if 1 h has elapsed since the last run,
#   3. scans acclient processes and applies at most one patch per PID,
# then sleeps 60 s.
while true; do
  now=$(date +%s)

  # ===== heartbeat: every 30 min =====
  if [ $((now - last_hb)) -ge 1800 ]; then
    last_hb=$now
    # One "<pid>=<working set>MB" token per acclient process, joined by spaces.
    hb=$(powershell.exe -NoProfile -Command \
      "Get-Process acclient -EA SilentlyContinue | ForEach-Object { \"\$(\$_.Id)=\$([int](\$_.WorkingSet64/1MB))MB\" } | Sort-Object" \
      2>/dev/null | tr -d '\r' | tr '\n' ' ')
    # Every token contains exactly one '=', so counting them counts processes.
    alive=$(echo "$hb" | tr ' ' '\n' | grep -c '=')
    echo "HB $(date -u +%Y-%m-%dT%H:%M:%S) $hb ALIVE=${alive}"
  fi

  # ===== snapshot: every 1 h =====
  if [ $((now - last_snap)) -ge 3600 ]; then
    last_snap=$now
    snap_tsv="artifacts/snapshots/main.tsv"
    snap_log="artifacts/snapshots/last_snap.log"
    # The redirect sits on the group so that a missing TSV (failed '<') is
    # silenced too; a bare "wc -l < file 2>/dev/null" still prints the error.
    rows_before=$({ wc -l < "$snap_tsv"; } 2>/dev/null || echo 0)
    snap_exit=""
    {
      echo "=== run $(date) ==="
      echo "pwd=$(pwd)"
      echo "PY=$PY"
      echo "py-version=$("$PY" --version 2>&1)"
      echo "argv0-test=$("$PY" -c "import sys; print(sys.argv)" 2>&1)"
      echo "--- invoking snapshot ---"
      "$PY" tools/snapshot_compare.py "$snap_tsv"
      # Capture the tool's status here: after the group closes, $? would be
      # the status of the trailing echo (always 0), hiding real failures.
      snap_exit=$?
      echo "--- exit=$snap_exit ---"
    } > "$snap_log" 2>&1
    group_rc=$?
    # If the log file could not be opened the group never ran and snap_exit
    # is still empty; report the redirection failure status instead.
    [ -n "$snap_exit" ] || snap_exit=$group_rc
    rows_after=$({ wc -l < "$snap_tsv"; } 2>/dev/null || echo 0)
    rows_added=$((rows_after - rows_before))
    # Success is judged by the TSV growing, not by the tool's exit status.
    if [ "$rows_added" -gt 0 ]; then
      echo "SNAPSHOT @$(date +%H:%M) appended ${rows_added} rows"
    else
      echo "SNAPSHOT-FAIL @$(date +%H:%M) exit=$snap_exit rows_added=$rows_added (log: $snap_log)"
    fi
  fi

  # ===== auto-patch cascade: every loop (60 s) =====
  # One "<pid>|<main window title>" line per acclient process.
  pid_titles=$(powershell.exe -NoProfile -Command \
    "Get-Process acclient -EA SilentlyContinue | ForEach-Object { \"\$(\$_.Id)|\$(\$_.MainWindowTitle)\" }" \
    2>/dev/null | tr -d '\r')

  # Drop sentinels belonging to PIDs that are no longer running, so that a
  # new client which later receives a recycled PID is patched instead of
  # being skipped, and so $SEEN_DIR does not grow without bound. Skipped
  # when the query returned nothing, because an empty result cannot be told
  # apart from a failed powershell call.
  if [ -n "$pid_titles" ]; then
    live_pids=" $(cut -d'|' -f1 <<< "$pid_titles" | tr '\n' ' ')"
    for sentinel in "$SEEN_DIR"/*-*; do
      [ -e "$sentinel" ] || continue   # glob matched nothing
      seen_name=${sentinel##*/}
      seen_pid=${seen_name%%-*}
      case "$live_pids" in
        *" ${seen_pid} "*) ;;              # still running: keep
        *) rm -f -- "$sentinel" ;;
      esac
    done
  fi

  while IFS='|' read -r pid title; do
    [ -z "$pid" ] && continue
    # Only patch in-world clients (skip splash screen)
    [[ "$title" == *Coldeve-* ]] || continue
    # Skip Jerry (control); match is case-insensitive.
    [[ "${title,,}" == *jerry* ]] && continue

    # Apply in cascade order — one patch per cycle so any AV from a
    # bad patch only takes down one phase.
    for patch in v3b v5 v11 v12 v14; do
      is_seen "$pid" "$patch" && continue
      case "$patch" in
        v3b) script="tools/patch_palette_v3b.py"; extra="" ;;
        v5)  script="tools/patch_purge_v5_test.py"; extra="" ;;
        v11) script="tools/patch_v11_test.py"; extra="" ;;
        v12) script="tools/patch_v12_test.py"; extra="" ;;
        v14) script="tools/patch_v14_cenvcell_clipplane.py"; extra="--apply" ;;
        # A name without an arm must not rerun the previous patch's script.
        *) continue ;;
      esac

      # stdin is pointed at /dev/null because this loop reads its PID list
      # from stdin; a child that read stdin would consume the remaining PIDs.
      # ${extra:+...} adds the extra argument only when it is non-empty.
      result=$("$PY" "$script" "$pid" ${extra:+"$extra"} < /dev/null 2>&1)
      last_line=$(tail -1 <<< "$result")

      # idempotent skip detection: nothing was done, so record it quietly
      # and move straight on to the next patch in this same cycle.
      if grep -q "already patched\|already has a CALL" <<< "$result"; then
        mark_seen "$pid" "$patch"
        continue
      fi

      # DLL-form detection: if v5 says slots already point elsewhere (not
      # the no-op stub), the DLL applied this; treat as success.
      if [ "$patch" = "v5" ] && grep -q "UNEXPECTED.*not the no-op stub" <<< "$result"; then
        mark_seen "$pid" "$patch"
        echo "AUTO-V5-DLL-APPLIED PID=$pid title=\"$title\" $(date +%H:%M:%S)"
        continue
      fi

      if grep -q "OK\|reverted; now\|patched; now" <<< "$result"; then
        mark_seen "$pid" "$patch"
        echo "AUTO-${patch^^} PID=$pid title=\"$title\" $(date +%H:%M:%S)"
      else
        # Mark FAIL as seen so we don't retry-spam every 60s.
        mark_seen "$pid" "$patch"
        echo "AUTO-${patch^^}-FAIL PID=$pid title=\"$title\" tail=\"$last_line\""
      fi

      # only do ONE patch action per PID per cycle (cascade staggered)
      break
    done
  done <<< "$pid_titles"

  sleep 60
done