# leakhunt/tools/owner_vtable_scan.py

"""
owner_vtable_scan.py <dump.dmp>

Goal: identify which class owns the leaked 256-512KB buffers.

Method:
  1. Enumerate leaked 256-512KB Private RW regions (the "leak set").
  2. Build a set of candidate "pointer-to-buffer" values:
       region_base + delta  for delta in
       {0, 8, 0x10, 0x18, 0x20, 0x28, 0x30, 0x40, 0x50, 0x60}
     (covers different heap-header sizes incl. +ust, +hpa).
  3. Scan ALL committed, writable, non-image memory for any DWORD (at
     4-byte steps from the region base) whose value is in that candidate
     set. Each hit is potentially a pointer field inside some object that
     starts at (hit_addr - field_offset).
  4. For each hit, look BACKWARDS (up to 0x200 bytes, within the same
     memory region) for a vtable: a DWORD in [0x00400000, 0x10000000]
     that points into image memory, typically rdata. The nearest such
     DWORD before the hit is taken as the owner object's vtable.
  5. Histogram by (owner_vtable, field_offset). The top entries reveal
     which class+field owns the leaked buffer.

Output: top vtable hits with their image-module attribution.
"""
import struct, sys
from collections import Counter, defaultdict
from minidump.minidumpfile import MinidumpFile


def _ei(v):
    if v is None: return 0
    if hasattr(v, 'value'): return int(v.value)
    return int(v)


# Region State / Type / Protect values compared against below
# (Windows MEMORY_BASIC_INFORMATION encoding).
_MEM_COMMIT = 0x1000
_MEM_PRIVATE = 0x20000
_MEM_IMAGE = 0x1000000
_RW_PROTECTS = (0x04, 0x40)  # PAGE_READWRITE, PAGE_EXECUTE_READWRITE

# Offsets from a leaked region's base at which a pointer to the buffer may
# point, depending on the size of the heap header in front of the buffer.
_DELTAS = (0, 8, 0x10, 0x18, 0x20, 0x28, 0x30, 0x40, 0x50, 0x60)

# How far before a hit to search for the owning object's vtable.
_LOOKBACK_BYTES = 0x200

# Only DWORDs inside this window are considered as vtable candidates.
# NOTE(review): modules mapped above 0x10000000 are excluded by this
# pre-filter -- confirm that is intended for the target process.
_VTABLE_MIN = 0x00400000
_VTABLE_MAX = 0x10000000

_DWORD = struct.Struct("<I")  # little-endian 32-bit word


def _find_owner_vtable(buf, off, is_image, lookback=_LOOKBACK_BYTES):
    """Find the nearest vtable-like DWORD preceding offset *off* in *buf*.

    Walks backwards in 4-byte steps from ``off - 4`` for at most *lookback*
    bytes (never before the start of *buf*). A DWORD qualifies when it lies
    in ``[_VTABLE_MIN, _VTABLE_MAX]``, is 4-byte aligned, and ``is_image``
    accepts it.

    Returns ``(vtable, field_off)`` where ``field_off = off - position of
    the vtable DWORD``, or ``None`` when nothing qualifies.
    """
    lowest = max(0, off - lookback)
    for back in range(off - 4, lowest - 1, -4):
        value = _DWORD.unpack_from(buf, back)[0]
        if value < _VTABLE_MIN or value > _VTABLE_MAX:
            continue
        # A vtable pointer is pointer-aligned; unaligned image addresses
        # (e.g. return addresses into code) are not vtable candidates.
        if value & 3:
            continue
        if is_image(value):
            return value, off - back
    return None


def main():
    """Scan the minidump named by ``sys.argv[1]`` and print likely buffer owners.

    Steps:
      1. Classify committed regions into image ranges, the "leak set"
         (private RW regions of 256-512KB) and the regions to scan
         (writable, non-image).
      2. Build the candidate pointer values ``base + delta`` for every
         leaked region.
      3. Scan each region for DWORDs equal to a candidate; for every hit
         look backwards for a vtable and histogram by
         ``(vtable, field_offset)``.
      4. Print the top vtables and the top ``(vtable, field_offset)`` pairs
         with module attribution.

    Exits with a usage message when no dump path is given, and with an
    error message when the dump carries no memory-info records.
    """
    if len(sys.argv) < 2:
        sys.exit("usage: owner_vtable_scan.py <dump.dmp>")

    md = MinidumpFile.parse(sys.argv[1])
    reader = md.get_reader().get_buffered_reader()

    # Module map -> attribute vtable addresses.
    # Presumably ``md.modules`` is None when the dump has no module list;
    # fall back to an empty map so attribution simply prints "?".
    mods = [(m.baseaddress, m.size, m.name)
            for m in (getattr(md.modules, "modules", None) or [])]

    def mod_of(addr):
        for mod_base, mod_size, mod_name in mods:
            if mod_base <= addr < mod_base + mod_size:
                return mod_name.split("\\")[-1]
        return None

    # Without region metadata nothing below can work; say so explicitly
    # instead of failing on a missing attribute or reporting "no hits".
    infos = getattr(md.memory_info, "infos", None) or []
    if not infos:
        sys.exit("dump has no memory-info records; cannot classify regions")

    # Single pass over the region list.
    image_ranges = []   # (lo, hi) of committed image regions
    leaked = []         # (base, size) of leaked 256-512KB private RW regions
    scan_regions = []   # (base, size) of committed writable non-image regions
    for r in infos:
        if _ei(r.State) != _MEM_COMMIT:
            continue
        region_type = _ei(r.Type)
        if region_type == _MEM_IMAGE:
            image_ranges.append((r.BaseAddress, r.BaseAddress + r.RegionSize))
            continue
        # Mask off modifier bits above the low byte before comparing.
        if (_ei(r.Protect) & 0xff) not in _RW_PROTECTS:
            continue
        scan_regions.append((r.BaseAddress, r.RegionSize))
        if region_type == _MEM_PRIVATE \
                and 256*1024 <= r.RegionSize < 512*1024:
            leaked.append((r.BaseAddress, r.RegionSize))
    image_ranges.sort()

    def is_image(addr):
        # Ranges are sorted, so stop as soon as they start above addr.
        for lo, hi in image_ranges:
            if lo <= addr < hi:
                return True
            if addr < lo:
                return False
        return False

    print(f"leaked 256-512KB private RW regions: {len(leaked)}")

    # Build candidate "pointer values" -> base of the leaked region.
    cand_to_region = {}
    for base, _sz in leaked:
        for d in _DELTAS:
            cand_to_region[base + d] = base
    print(f"candidate pointer values: {len(cand_to_region)} (across {len(_DELTAS)} deltas)")

    total_bytes = sum(s for _, s in scan_regions)
    print(f"scanning {len(scan_regions)} writable non-image regions ({total_bytes/(1024*1024):.1f} MB)")

    vtable_hits = Counter()         # (vtable, field_offset) -> count
    vtable_only_hits = Counter()    # vtable -> count
    field_offsets_per_vtable = defaultdict(Counter)
    examples = defaultdict(list)    # (vtable, field_offset) -> up to 3 samples
    raw_hits = 0
    no_vtable = 0
    unreadable = 0

    # Each hit is resolved while its region buffer is in hand, so no buffer
    # has to outlive the iteration that read it.
    for base, size in scan_regions:
        try:
            reader.move(base)
            buf = reader.read(size)
        except Exception:
            # Best effort: a region listed in the metadata may not be
            # backed by data in the dump. Count it and move on.
            unreadable += 1
            continue
        if not buf:
            continue
        usable = len(buf) - len(buf) % 4
        words = _DWORD.iter_unpack(memoryview(buf)[:usable])
        for word_idx, (value,) in enumerate(words):
            leaked_base = cand_to_region.get(value)
            if leaked_base is None:
                continue
            raw_hits += 1
            off = word_idx * 4
            owner = _find_owner_vtable(buf, off, is_image)
            if owner is None:
                no_vtable += 1
                continue
            vtable, field_off = owner
            key = (vtable, field_off)
            vtable_hits[key] += 1
            vtable_only_hits[vtable] += 1
            field_offsets_per_vtable[vtable][field_off] += 1
            if len(examples[key]) < 3:
                examples[key].append((base + off, leaked_base, value))

    if unreadable:
        print(f"skipped {unreadable} unreadable regions")
    print(f"raw pointer-into-leaked hits: {raw_hits}")

    if not raw_hits:
        print("no hits — leaked buffers are orphaned (no live pointers to them).")
        return

    print(f"hits with no preceding vtable in 0x{_LOOKBACK_BYTES:x} lookback: {no_vtable}")
    print(f"unique (vtable, field_off) pairs: {len(vtable_hits)}")
    print(f"unique vtables: {len(vtable_only_hits)}")
    print()

    print("=== Top vtables (regardless of field offset) ===")
    for vtbl, cnt in vtable_only_hits.most_common(25):
        owner = mod_of(vtbl) or "?"
        # Show the top field offsets seen for this vtable
        top_offs = field_offsets_per_vtable[vtbl].most_common(4)
        offs_str = " ".join(f"+0x{o:x}={c}" for o, c in top_offs)
        print(f"  0x{vtbl:08x}  count={cnt:<5}  ({owner})  offsets: {offs_str}")

    print()
    print("=== Top (vtable, field_offset) pairs ===")
    for (vtbl, off), cnt in vtable_hits.most_common(25):
        owner = mod_of(vtbl) or "?"
        ex = examples[(vtbl, off)][0]
        print(f"  0x{vtbl:08x}  +0x{off:03x}  count={cnt:<5}  ({owner})  e.g. hit@0x{ex[0]:08x} -> leaked@0x{ex[1]:08x} val=0x{ex[2]:08x}")


# Run the scan only when this file is executed as a script; importing the
# module must not trigger it. The dump path is taken from sys.argv by main().
if __name__ == "__main__":
    main()