MosswartOverlord/go-services/compare/compare_live.py
Erik 1af47520c0 feat(go-services): tracker-go Phase 0/1 — /live + /trails read parity
Parallel Go reimplementation of the dereth-tracker read side, deployed
loopback-only (:8770) and reading the dereth TimescaleDB read-only. The live
Python stack is untouched (added via a compose override, not by editing the
tracked docker-compose.yml).

- Phase 0 scaffold: stdlib net/http server (Go 1.22+ method+path routing),
  /health + /api-version, multi-stage distroless Docker build, and
  go-services/docker-compose.go.yml override (loopback :8770).
- Phase 1: pgx v5 pool forced into read-only transactions, a 5s /live + /trails
  cache loop using the exact main.py:837 SQL, and Python-isoformat timestamps
  so output matches FastAPI's jsonable_encoder.
- compare/compare_live.py: parity harness vs the live Python service. Uses the
  server-stamped received_at to prove same-row full-field equality and to make
  the online-set diff boundary-aware.

Verified on live traffic (73 players): identical online set + 23-key schema,
identity/type parity for all, every same-row pair matches on every field, and
diff-row pairs differ only by the ~6s two-cache refresh skew.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-24 09:24:22 +02:00

223 lines
8.5 KiB
Python

#!/usr/bin/env python3
"""Compare the Go tracker's /live (and /trails) against the live Python service.

Run on the server (or anywhere with loopback access to both):

    python3 compare_live.py    # default loopback ports
    python3 compare_live.py --py http://127.0.0.1:8765 --go http://127.0.0.1:8770

Note: this script currently fetches and compares only /live; /trails is named
in the summary line above, but no /trails comparison is implemented here.

Parity strategy for a live firehose
-----------------------------------
The two services rebuild their /live cache independently every 5s, so an
actively-updating character can legitimately show a newer telemetry row in one
than the other. We separate "is this a real divergence?" from "is this just
cache timing?" using the server-stamped received_at:

  * SAME ROW (py.received_at == go.received_at): both rendered the *same*
    telemetry_events row -> every field MUST match (numbers within epsilon,
    timestamps compared as instants). This is the rigorous render-parity proof.
  * DIFFERENT ROW: a newer row arrived between the two refreshes -> we only
    require identity + key-set + type/null-pattern parity, and report the
    volatile-field skew (which should be small and recent).

Exit code 0 if no real parity violations, 1 otherwise.
"""
import argparse
import json
import sys
import urllib.request
from datetime import datetime, timezone
# Absolute tolerance used when two numeric field values are compared.
EPS = 1e-6

# Entity / join-key fields: for a player present in both outputs these must
# always agree.
IDENTITY = (
    "character_name",
    "char_tag",
    "session_id",
)

# Slowly-changing aggregates. A difference here on a same-row pair is only
# informational: a kill/rare recorded between the two refreshes can bump these
# even though the telemetry row itself is the same.
AGGREGATES = (
    "total_kills",
    "total_rares",
    "session_rares",
)

# Fields holding ISO-8601 timestamps; these are compared as instants rather
# than as text.
TIMESTAMP_FIELDS = (
    "timestamp",
    "received_at",
)
def fetch(base, path):
    """Fetch ``base + path`` and return the response body parsed as JSON.

    Trailing slashes on *base* are removed before *path* is appended, so
    *path* is expected to carry its own leading slash. The request uses an
    8-second timeout; errors raised by ``urlopen`` or the JSON decoder are
    not caught here.
    """
    url = base.rstrip("/") + path
    with urllib.request.urlopen(url, timeout=8) as response:
        payload = json.load(response)
    return payload
def jtype(v):
    """Return a coarse type label for a decoded JSON value.

    The label is ``"null"``, ``"bool"``, ``"num"`` (int or float alike) or
    ``"str"``; any other value is labelled with its Python type name.
    """
    if v is None:
        return "null"
    # Order matters: bool is a subclass of int, so it must be tested before
    # the numeric entry or True/False would be labelled "num".
    labelled_types = (
        ("bool", bool),
        ("num", (int, float)),
        ("str", str),
    )
    for label, types in labelled_types:
        if isinstance(v, types):
            return label
    return type(v).__name__
def parse_ts(s):
    """Parse an ISO-8601 timestamp string into a ``datetime``.

    ``None`` is passed through unchanged. Every ``"Z"`` in the string is
    rewritten to ``"+00:00"`` before the text is handed to
    ``datetime.fromisoformat``; whether the result is timezone-aware depends
    on the input carrying an offset.
    """
    if s is not None:
        iso_text = s.replace("Z", "+00:00")
        return datetime.fromisoformat(iso_text)
    return None
def values_equal(key, a, b):
    """Decide whether two values of field *key* are semantically equal.

    Rules, applied in order:
      * if either side is ``None``, fall back to identity / plain equality;
      * for a field listed in ``TIMESTAMP_FIELDS`` with two string values,
        compare the parsed datetimes;
      * two non-bool numbers are equal when they differ by at most ``EPS``;
      * anything else uses ``==``.
    """
    if a is None or b is None:
        return a is b or a == b

    both_text = isinstance(a, str) and isinstance(b, str)
    if key in TIMESTAMP_FIELDS and both_text:
        return parse_ts(a) == parse_ts(b)

    def is_number(x):
        # bool is an int subclass but must not be treated as a number here.
        return isinstance(x, (int, float)) and not isinstance(x, bool)

    if is_number(a) and is_number(b):
        return abs(float(a) - float(b)) <= EPS
    return a == b
def main():
    """Fetch /live from both services, print a parity report, return a status.

    The ``--py`` and ``--go`` command-line options give the base URLs of the
    Python and Go services. The report covers, in order: the union key set,
    the online (player-name) set, identity and type/null-pattern parity for
    every common player, full-field equality for same-row pairs, and the
    ``timestamp`` skew of diff-row pairs.

    Returns:
        0 when no parity violation was counted, 1 otherwise.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--py", default="http://127.0.0.1:8765")
    ap.add_argument("--go", default="http://127.0.0.1:8770")
    args = ap.parse_args()

    py = fetch(args.py, "/live")["players"]
    go = fetch(args.go, "/live")["players"]
    # Reference instant for row ages; taken once so every age in this report
    # is measured against the same clock reading.
    now = datetime.now(timezone.utc)

    pyi = {p["character_name"]: p for p in py}
    goi = {p["character_name"]: p for p in go}
    common = sorted(set(pyi) & set(goi))
    only_py = sorted(set(pyi) - set(goi))
    only_go = sorted(set(goi) - set(pyi))

    print("=" * 72)
    print("/live PARITY python(%s) vs go(%s)" % (args.py, args.go))
    print("=" * 72)
    print(f"python players : {len(py)}")
    print(f"go players : {len(go)}")
    print(f"common : {len(common)}")

    violations = 0

    # --- key-set parity (all players) ---
    py_keys = set().union(*(set(p) for p in py))
    go_keys = set().union(*(set(p) for p in go))
    # Sorted once so the per-key report lines come out in the same order on
    # every run (iterating the raw set depends on string-hash randomisation).
    ordered_keys = sorted(py_keys)
    if py_keys == go_keys:
        print(f"key set : IDENTICAL ({len(py_keys)} keys)")
    else:
        violations += 1
        print("key set : MISMATCH")
        print(" only in python:", sorted(py_keys - go_keys))
        print(" only in go :", sorted(go_keys - py_keys))

    # --- online-set parity (boundary-aware) ---
    def age(p):
        """Seconds between ``now`` and the player's row, or None if undated."""
        ts = parse_ts(p.get("received_at") or p.get("timestamp"))
        return (now - ts).total_seconds() if ts else None

    def near_boundary(a):
        """True when age *a* lies in the 22..38s window around the 30s cutoff."""
        return a is not None and 22 <= a <= 38

    def explain(names, idx, side):
        """Print one line per player that only *side* ('py' / 'go') reported."""
        for n in names:
            a = age(idx[n])
            if near_boundary(a):
                tag = "boundary-flap (age %.1fs)" % a
            else:
                tag = "age %s" % (None if a is None else round(a, 1))
            print(f" only_{side}: {n:<20} {tag}")

    print("\n-- online set --")
    if not only_py and not only_go:
        print("online set : IDENTICAL")
    else:
        # Players near the 30s boundary can flap between the two refreshes.
        if only_py:
            print(f"only in python : {len(only_py)}")
            explain(only_py, pyi, "py")
        if only_go:
            print(f"only in go : {len(only_go)}")
            explain(only_go, goi, "go")
        unexplained = [
            n for n in (only_py + only_go)
            if not near_boundary(age(pyi.get(n) or goi.get(n)))
        ]
        if unexplained:
            violations += 1
            print(" UNEXPLAINED set difference (not near 30s boundary):", unexplained)
        else:
            print(" (all set differences explained by the 30s online boundary)")

    # --- per-player field parity ---
    def same_received_at(p, g):
        """True when both rows carry the same, non-null received_at instant.

        Textual equality is accepted first. Two strings that differ only in
        spelling (e.g. ``Z`` vs ``+00:00``) are then compared as parsed
        instants, matching how timestamp fields are compared in the same-row
        check below. Without this, a formatting-only difference would push
        every pair into the diff-row bucket and leave the rigorous full-field
        check with nothing to verify.
        """
        ra, rb = p.get("received_at"), g.get("received_at")
        if ra is None or rb is None:
            return False
        if ra == rb:
            return True
        if not (isinstance(ra, str) and isinstance(rb, str)):
            return False
        try:
            return parse_ts(ra) == parse_ts(rb)
        except ValueError:
            # Unparseable text: keep the textual verdict (not the same row).
            return False

    same_row = []  # same received_at instant -> must fully match
    diff_row = []  # newer row arrived between refreshes
    for n in common:
        if same_received_at(pyi[n], goi[n]):
            same_row.append(n)
        else:
            diff_row.append(n)

    print("\n-- per-player parity --")
    print(f"same-row pairs (identical received_at, must fully match): {len(same_row)}")
    print(f"diff-row pairs (newer telemetry between refreshes) : {len(diff_row)}")

    # Identity + type/null-pattern parity across ALL common players.
    id_bad = type_bad = 0
    for n in common:
        a, b = pyi[n], goi[n]
        for k in IDENTITY:
            if a.get(k) != b.get(k):
                id_bad += 1
                print(f" IDENTITY mismatch {n}.{k}: py={a.get(k)!r} go={b.get(k)!r}")
        for k in ordered_keys:
            ta, tb = jtype(a.get(k)), jtype(b.get(k))
            if ta != tb:
                # null vs num/str is a real null-pattern divergence; num-vs-num
                # whole-float (0.0) vs int (0) is already unified under "num".
                type_bad += 1
                print(f" TYPE mismatch {n}.{k}: py={ta}({a.get(k)!r}) go={tb}({b.get(k)!r})")
    violations += id_bad + type_bad
    if not id_bad and not type_bad:
        print("identity+type : IDENTICAL for all common players")

    # Rigorous: same-row pairs must match on every field.
    sr_full_match = 0
    for n in same_row:
        a, b = pyi[n], goi[n]
        diffs = [
            (k, a.get(k), b.get(k))
            for k in ordered_keys
            if not values_equal(k, a.get(k), b.get(k))
        ]
        if not diffs:
            sr_full_match += 1
            continue
        # Aggregate-only diffs are timing-explainable even on a same row.
        non_agg = [d for d in diffs if d[0] not in AGGREGATES]
        if non_agg:
            violations += 1
            print(f" SAME-ROW FIELD divergence {n}: " +
                  ", ".join(f"{k}: py={pa!r} go={ga!r}" for k, pa, ga in non_agg))
        else:
            print(f" (same-row {n}: only aggregate fields differ — kill/rare between refreshes: "
                  + ", ".join(f"{k} py={pa} go={ga}" for k, pa, ga in diffs) + ")")
    print(f"same-row full-field matches: {sr_full_match}/{len(same_row)}")

    # Volatile-field skew on diff-row pairs (informational).
    if diff_row:
        ts_deltas = []
        for n in diff_row:
            da, db = parse_ts(pyi[n].get("timestamp")), parse_ts(goi[n].get("timestamp"))
            if da and db:
                ts_deltas.append(abs((da - db).total_seconds()))
        if ts_deltas:
            ts_deltas.sort()
            print(f"diff-row timestamp skew: min={ts_deltas[0]:.1f}s "
                  f"median={ts_deltas[len(ts_deltas)//2]:.1f}s max={ts_deltas[-1]:.1f}s "
                  "(bounded by the two 5s refresh cycles)")

    print("\n" + "=" * 72)
    if violations == 0:
        print("RESULT: PARITY OK — no structural or same-row divergences.")
    else:
        print(f"RESULT: {violations} PARITY VIOLATION(S) — see above.")
    print("=" * 72)
    return 1 if violations else 0
# Script entry point: run the comparison and hand main()'s return value to the
# shell as the process exit status.
if __name__ == "__main__":
    exit_code = main()
    sys.exit(exit_code)