"""Pure-Python MSF 7.00 PDB extractor for acclient.pdb. Reads the Microsoft Program Database, extracts public function symbols (S_PUB32, kind 0x110E) and named struct/class type records (LF_CLASS, LF_STRUCTURE) from the TPI stream, and writes two JSON sidecars to docs/research/named-retail/: symbols.json — every named public function with its image VA types.json — every named struct/class with size This is a foundation for the named-retail workflow. After running, every future session can grep the JSON for instant address↔name lookups. Run with: py tools/pdb-extract/pdb_extract.py refs/acclient.pdb References: https://llvm.org/docs/PDB/MsfFile.html — MSF container layout https://llvm.org/docs/PDB/PublicStream.html — symbol record format https://llvm.org/docs/PDB/TpiStream.html — type-info stream layout https://llvm.org/docs/PDB/DbiStream.html — DBI header + sections No external dependencies — uses stdlib `struct` + `json` only. Wire constants (PDB7 / MSF 7.00): """ import json import os import struct import sys from pathlib import Path # ── MSF / PDB7 constants ─────────────────────────────────────────────────── MSF_MAGIC = b"Microsoft C/C++ MSF 7.00\r\n\x1aDS\0\0\0" SUPERBLOCK_SIZE = 56 # u32 ×6 + magic; MSF 7.00 layout # Stream indices (fixed by spec) STREAM_PDB_INFO = 1 STREAM_TPI = 2 STREAM_DBI = 3 STREAM_IPI = 4 # Symbol record kinds (subset) S_PUB32 = 0x110E # Public symbol (32-bit) # Type record kinds (subset) LF_CLASS = 0x1504 LF_STRUCTURE = 0x1505 LF_UNION = 0x1506 LF_ENUM = 0x1507 # DBI machine type (image-base + section table source) SECTION_HEADERS_DEFAULT_BASE = 0x00400000 # acclient.exe image base # CV public-symbol flag bits PUBSYM_FLAG_CODE = 0x00000002 # ── MSF reader ───────────────────────────────────────────────────────────── class Msf: """In-memory wrapper over a PDB file. 
Loads the superblock + stream directory, exposes per-stream byte buffers reconstructed from page chains.""" def __init__(self, path): with open(path, "rb") as f: self._data = f.read() if not self._data.startswith(MSF_MAGIC): raise ValueError(f"not an MSF 7.00 file: {path}") # Superblock fields (after the 32-byte magic). (self.block_size, self.free_block_map, self.num_blocks, self.num_dir_bytes, _reserved, self.block_map_addr) = struct.unpack_from("<6I", self._data, 32) if self.block_size not in (512, 1024, 2048, 4096): raise ValueError(f"unexpected block size: {self.block_size}") # Stream directory: read the directory-block-map first (a list # of page indices that themselves spell out the directory's # page list). Then read the directory pages. dir_pages_needed = _ceil_div(self.num_dir_bytes, self.block_size) block_map_pages_needed = _ceil_div(dir_pages_needed * 4, self.block_size) # The block_map_addr is the page index of the FIRST page in the # block-map. The block-map's pages are stored sequentially. # Wait, actually it's a single page index pointing to one page # full of u32 page indices. If there are >block_size/4 pages in # the directory, this overflows; for typical small PDBs it # doesn't. acclient.pdb's 119 KB directory at 4 KB pages = 30 # pages -> 30 u32s = 120 bytes, fits in one page. OK. block_map = self._read_page(self.block_map_addr) dir_page_indices = struct.unpack_from( f"<{dir_pages_needed}I", block_map, 0) dir_data = bytearray() for pi in dir_page_indices: dir_data.extend(self._read_page(pi)) dir_data = bytes(dir_data[:self.num_dir_bytes]) # Directory layout: # u32 NumStreams # u32 StreamSizes[NumStreams] # for i in 0..NumStreams: u32 StreamPageIndices[ceil(size_i / block_size)] num_streams = struct.unpack_from(" 5 else 0xFFFF return sym_record_stream, section_hdr_stream # ── Public symbols extractor (S_PUB32 from sym-record stream) ────────────── def _demangle(mangled): """Best-effort demangle of MSVC C++ symbol names to Class::Method form. 
PDB public symbols use the MSVC ABI mangling. A full demangler would parse arg types + calling conventions; we only need the readable qualified name for grep workflows. Examples: "?EnchantAttribute@CACQualities@@IBEHKAAK@Z" -> "CACQualities::EnchantAttribute" "??0CEnchantmentRegistry@@QAE@XZ" -> "CEnchantmentRegistry::CEnchantmentRegistry" "??1Foo@@QAE@XZ" -> "Foo::~Foo" "_someThing" -> "_someThing" (C-style, kept as-is) "?GlobalFunc@@..." -> "GlobalFunc" (no class) """ if not mangled or not mangled.startswith("?"): return mangled # C-linkage symbol, return as-is # Strip the leading '?' (or '??' for ctors/dtors) if mangled.startswith("??0"): # constructor rest = mangled[3:] ctor_dtor = "ctor" elif mangled.startswith("??1"): # destructor rest = mangled[3:] ctor_dtor = "dtor" elif mangled.startswith("??_"): # vtable / vbtable / etc — leave as-is return mangled else: rest = mangled[1:] ctor_dtor = None # `Name@Class@Outer@@` -> split on '@@' to drop the signature suffix sep = rest.find("@@") if sep < 0: return mangled # not a recognised pattern qualified = rest[:sep] parts = qualified.split("@") parts = [p for p in parts if p] if not parts: return mangled if ctor_dtor: # parts[0] is the class name (the only part); ctor name = class name. cls = parts[0] outer = "::".join(reversed(parts[1:])) full = f"{outer}::{cls}" if outer else cls if ctor_dtor == "dtor": return f"{full}::~{cls}" return f"{full}::{cls}" # Method: parts[0] is the function name, parts[1:] are nested classes # (innermost first -> reverse for outer::inner::method order). method = parts[0] classes = list(reversed(parts[1:])) if classes: return "::".join(classes) + "::" + method return method def _extract_pub32(sym_bytes, section_bases): """Iterate S_PUB32 records, compute image VA, return list of dicts. sym_bytes: raw sym-record-stream bytes. section_bases: list of section base VAs (1-indexed via segment field). 
""" pos = 0 end = len(sym_bytes) out = [] while pos + 4 <= end: rec_len, kind = struct.unpack_from("= 1 and (seg - 1) < len(section_bases): va = section_bases[seg - 1] + offset out.append({ "address": f"0x{va:08X}", "name": _demangle(mangled), "mangled": mangled, "flags": flags, }) pos = rec_end # Records align to 4 bytes if pos % 4: pos += 4 - (pos % 4) return out # ── Section-header stream parser (40 bytes per IMAGE_SECTION_HEADER) ─────── def _parse_section_headers(sec_bytes, image_base=SECTION_HEADERS_DEFAULT_BASE): """Each entry is a 40-byte IMAGE_SECTION_HEADER. Returns list of section-VA bases (so symbol[seg-1] + offset = VA).""" bases = [] SECTION_SIZE = 40 offset = 0 while offset + SECTION_SIZE <= len(sec_bytes): # Layout: char Name[8], u32 VirtSize, u32 VirtAddress, ... virt_size, virt_addr = struct.unpack_from(" 0: # Padding / empty trailing entry — stop. break bases.append(image_base + virt_addr) offset += SECTION_SIZE return bases # ── TPI parser — minimal pass to extract LF_CLASS/STRUCTURE names + sizes ── def _extract_named_types(tpi_bytes): """Walk the TPI stream's type record array and yield named class / struct / union / enum records with their declared size. Skips forward-declared (incomplete) records.""" if len(tpi_bytes) < 56: return [] # TPI header (56 bytes) (version, header_size, ti_min, ti_max, gpi_size, gpi_substream_offset, hash_aux_idx, hash_key_size, num_hash_buckets, hash_value_off, hash_value_len, ti_off_off, ti_off_len, hash_adj_off, hash_adj_len) = struct.unpack_from( " len(tpi_bytes): end = len(tpi_bytes) out = [] while pos + 4 <= end: rec_len, kind = struct.unpack_from(" name_start: name = tpi_bytes[name_start:zero].decode("ascii", errors="replace") # Skip forward-decls: bit 7 of `props` (0x80). 
def _main():
    """Command-line entry point: extract symbols and types from one PDB.

    Expects exactly one argument, the path to the PDB file. Reads the DBI,
    section-header, symbol-record and TPI streams, then writes
    ``symbols.json`` and ``types.json`` under
    ``<repo>/docs/research/named-retail/`` (the directory is created if
    missing). Progress is printed to stdout; usage and error messages go to
    stderr.

    Exits with status 2 when the argument count is wrong or the file does not
    exist.
    """
    if len(sys.argv) != 2:
        print("usage: py pdb_extract.py <path-to-pdb>", file=sys.stderr)
        sys.exit(2)
    pdb_path = Path(sys.argv[1])
    if not pdb_path.exists():
        print(f"not found: {pdb_path}", file=sys.stderr)
        sys.exit(2)

    print(f"loading {pdb_path} ({pdb_path.stat().st_size / 1024 / 1024:.1f} MB)...")
    msf = Msf(str(pdb_path))
    print(f"  block size: {msf.block_size}, streams: {len(msf.streams)}")

    # 1. DBI -> find sym-record + section-header stream indices
    dbi_bytes = msf.stream(STREAM_DBI)
    sym_stream_idx, sec_hdr_stream_idx = _parse_dbi(dbi_bytes)
    print(f"  sym stream: {sym_stream_idx}, section-hdr stream: {sec_hdr_stream_idx}")

    # 2. Section headers -> segment bases
    sec_bytes = msf.stream(sec_hdr_stream_idx)
    section_bases = _parse_section_headers(sec_bytes)
    if section_bases:
        print(f"  sections: {len(section_bases)} (text base = 0x{section_bases[0]:08X})")
    else:
        # Without section bases no segment:offset pair can be turned into a
        # VA, but the type extraction below is still useful — warn and
        # carry on instead of dying on section_bases[0].
        print("  sections: 0", flush=True)
        print("warning: no section headers parsed; symbol addresses "
              "cannot be resolved", file=sys.stderr)

    # 3. Symbol records -> S_PUB32 entries with image VAs
    sym_bytes = msf.stream(sym_stream_idx)
    print(f"  sym record stream: {len(sym_bytes) / 1024:.1f} KB")
    symbols = _extract_pub32(sym_bytes, section_bases)
    print(f"  extracted {len(symbols)} public function symbols")

    # 4. TPI -> named types
    tpi_bytes = msf.stream(STREAM_TPI)
    print(f"  TPI stream: {len(tpi_bytes) / 1024:.1f} KB")
    types = _extract_named_types(tpi_bytes)
    # Dedup by name, keeping the first record seen for each name
    # (templates/forward-decl spam can produce duplicates).
    seen = set()
    unique_types = []
    for t in types:
        if t["name"] not in seen:
            seen.add(t["name"])
            unique_types.append(t)
    print(f"  extracted {len(unique_types)} unique named types ({len(types)} total records)")

    # 5. Write outputs
    repo = _resolve_repo_root()
    out_dir = repo / "docs" / "research" / "named-retail"
    out_dir.mkdir(parents=True, exist_ok=True)

    sym_out = out_dir / "symbols.json"
    type_out = out_dir / "types.json"

    with open(sym_out, "w", encoding="utf-8") as f:
        # Keep address + demangled name + raw mangled name (for callers
        # that need the C++ ABI form). Strip flags as not useful for grep.
        compact = [
            {"address": s["address"], "name": s["name"], "mangled": s["mangled"]}
            for s in symbols
        ]
        json.dump(compact, f, indent=2)

    with open(type_out, "w", encoding="utf-8") as f:
        json.dump(unique_types, f, indent=2)

    print(f"\nwrote {sym_out} ({sym_out.stat().st_size / 1024:.1f} KB)")
    print(f"wrote {type_out} ({type_out.stat().st_size / 1024:.1f} KB)")

    # Spot check: CEnchantmentRegistry::EnchantAttribute should be at
    # 0x594570 per discovery agent.
    target = "CEnchantmentRegistry::EnchantAttribute"
    for s in symbols:
        if s["name"] == target:
            print(f"\nspot check: {target} -> {s['address']} (expected 0x00594570)")
            break
    else:
        print(f"\nspot check: {target} NOT FOUND in symbols (PDB lookup mismatch?)")


# Run only when executed as a script, so importing this module (e.g. to reuse
# the parsers) does not trigger an extraction or a sys.exit().
if __name__ == "__main__":
    _main()