MosswartOverlord/go-services/tracker-go/main.go
Erik 27757636e4 feat(go-services): tracker WS servers (/ws/position + /ws/live) + robust shadow
Completes the Go tracker as a cutover-ready drop-in:
- wslive.go: browser broadcast hub with per-client subscribe filters (nil=all),
  request_dungeon_map replies, and command routing; auth = internal-trust or
  session cookie. The ingestor broadcasts every handled event to it.
- wsposition.go: plugin ingest server with X-Plugin-Secret/SHARED_SECRET auth
  (constant-time, fails closed, legacy fallback), register -> plugin_conns, and
  dispatch into the shared Ingestor. plugin registry for backend->plugin commands.
- main.go: statusRecorder.Unwrap() so coder/websocket can hijack through the
  logging middleware (WS handshakes failed without it); /ws/ bypasses HTTP auth.

Shadow consumer robustness (the harness was being evicted under the full
firehose): decouple socket read from processing — the read loop only copies raw
frames to a queue; a worker unmarshals + dispatches. JSON parsing in the read
loop was slowing it enough that Python's broadcast send errored and evicted us
(Read then blocked forever). Added a 25s read-deadline watchdog to self-heal.

Validated live: shadow /live online = 73 = production; telemetry sustained ~12/s,
0 drops, no eviction; and the shadow's /ws/live re-broadcast stream is IDENTICAL
to production's (TOTAL 2150=2150, every event type exact).

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-24 11:15:05 +02:00

286 lines
9.9 KiB
Go

// Command tracker-go is a Go reimplementation of the MosswartOverlord
// "dereth-tracker" backend, deployed in parallel with the live Python service
// for side-by-side comparison (strangler-fig migration).
//
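// Phase 1: read-side parity. By default (READ_ONLY unset, or set to anything
// other than "false") it connects READ-ONLY to the existing dereth TimescaleDB
// and reimplements the HTTP read API, starting with the /live and /trails
// caches (the 5s _refresh_cache_loop). In this mode it never touches anything
// the Python service writes.
//
// Ingest/shadow mode (READ_ONLY=false) instead owns its own database: the
// schema is created on first run and, when SHADOW_INGEST_WS is set, the Python
// /ws/live stream is replayed into the Go handlers.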
//
// Routes are declared WITHOUT the nginx-stripped "/go/" prefix, mirroring the
// Python service's "no /api/ prefix" convention. nginx's `location /go/` strips
// the prefix before proxying to this service on 127.0.0.1:8770.
package main
import (
"context"
"encoding/json"
"errors"
"log/slog"
"net/http"
"net/http/httputil"
"os"
"os/signal"
"syscall"
"time"
"github.com/jackc/pgx/v5/pgxpool"
)
// buildVersion is injected at build time via -ldflags "-X main.buildVersion=...".
// Mirrors the Python service's APP_VERSION / "/api-version" stamp.
// It keeps the value "dev" when nothing is injected, and is reported by the
// startup log line, GET /health and GET /api-version.
var buildVersion = "dev"
// Server bundles the dependencies shared by the HTTP and WebSocket handlers.
// It is built once in main and handed to every handler as the method receiver.
type Server struct {
	// Database pool and the read-side caches. pool stays nil when
	// DATABASE_URL is unset (see main and handleHealth).
	pool   *pgxpool.Pool
	cache  *liveCache
	totals *totalsCache

	// Reverse proxy to the inventory service, and the static asset directory.
	invProxy  *httputil.ReverseProxy
	staticDir string

	// Secrets copied from config: the session-cookie signing key, the plugin
	// /ws/position shared secret, and its rotation fallback.
	secretKey          string
	sharedSecret       string
	sharedSecretLegacy string

	// ingestor is non-nil only in ingest/shadow mode.
	ingestor *Ingestor

	// hub is the browser /ws/live fan-out.
	hub *Hub

	plugins *pluginRegistry
	log     *slog.Logger
}
// main is the process entry point.
//
// Invoked as `tracker-go combat-merge` it runs the combat-merge CLI and
// returns. Otherwise it installs a JSON logger, loads the configuration, builds
// the Server, optionally connects to the database and starts the background
// loops, then serves HTTP until SIGINT/SIGTERM and drains for up to 10 seconds.
//
// NOTE(review): the os.Exit(1) paths below terminate the process without
// running deferred calls (stop, pool Close).
func main() {
	// `tracker-go combat-merge` reads a JSON array of cumulative session
	// snapshots from stdin and prints the folded lifetime: a deterministic
	// hook for cross-language parity testing against the Python combat
	// functions.
	if len(os.Args) > 1 && os.Args[1] == "combat-merge" {
		runCombatMergeCLI()
		return
	}

	handlerOpts := &slog.HandlerOptions{Level: slog.LevelInfo}
	logger := slog.New(slog.NewJSONHandler(os.Stdout, handlerOpts))
	slog.SetDefault(logger)

	cfg := loadConfig()
	logger.Info("starting tracker-go", "version", buildVersion, "addr", cfg.Addr)

	// ctx is cancelled on SIGINT/SIGTERM; every background loop hangs off it.
	ctx, stop := signal.NotifyContext(context.Background(), syscall.SIGINT, syscall.SIGTERM)
	defer stop()

	srv := &Server{
		cache:              newLiveCache(),
		totals:             newTotalsCache(),
		staticDir:          cfg.StaticDir,
		secretKey:          cfg.SecretKey,
		sharedSecret:       cfg.SharedSecret,
		sharedSecretLegacy: cfg.SharedSecretLegacy,
		hub:                newHub(),
		plugins:            newPluginRegistry(logger),
		log:                logger,
	}

	if cfg.SecretKey == "" {
		// Fail closed like the Python service: without a key no external
		// cookie can verify, so only internal-trust (loopback/compose)
		// requests pass.
		logger.Warn("SECRET_KEY unset — external (nginx-proxied) requests will all be rejected")
	}

	// The inventory-service reverse proxy does not depend on the DB.
	if err := srv.initInvProxy(cfg.InventoryURL); err != nil {
		logger.Error("inventory proxy init failed", "err", err, "target", cfg.InventoryURL)
		os.Exit(1)
	}

	// Connect to the dereth DB. Without DATABASE_URL the process still serves
	// health/version (Phase-0 mode) so the container stays observable.
	if cfg.DatabaseURL != "" {
		dialCtx, dialCancel := context.WithTimeout(ctx, 15*time.Second)
		dbPool, err := newPool(dialCtx, cfg.DatabaseURL, cfg.ReadOnly)
		dialCancel()
		if err != nil {
			logger.Error("db pool init failed", "err", err)
			os.Exit(1)
		}
		defer dbPool.Close()
		srv.pool = dbPool

		// Ingest/shadow mode owns its own DB: create the schema on first run.
		if !cfg.ReadOnly {
			schemaCtx, schemaCancel := context.WithTimeout(ctx, 60*time.Second)
			initSchema(schemaCtx, dbPool, logger)
			schemaCancel()
		}

		// Shadow ingest: replay the Python /ws/live firehose into our handlers.
		if cfg.IngestWS != "" {
			if cfg.ReadOnly {
				logger.Error("SHADOW_INGEST_WS set but READ_ONLY=true; refusing to ingest into the production DB")
				os.Exit(1)
			}
			srv.ingestor = newIngestor(dbPool, logger, srv.hub.broadcast)
			go srv.runShadowConsumer(ctx, cfg.IngestWS)
			logger.Info("shadow ingest enabled", "source", cfg.IngestWS)
		}

		go srv.runCacheLoop(ctx)
		go srv.runTotalsLoop(ctx)
		logger.Info("db connected; cache loops started",
			"read_only", cfg.ReadOnly,
			"live_interval", cacheInterval.String(),
			"totals_interval", totalsInterval.String())
	} else {
		logger.Warn("DATABASE_URL unset — running without DB; DB-backed endpoints will be empty")
	}

	router := http.NewServeMux()
	srv.registerRoutes(router)

	httpSrv := &http.Server{
		Addr:              cfg.Addr,
		Handler:           withRequestLogging(srv.authMiddleware(router)),
		ReadHeaderTimeout: 10 * time.Second,
	}

	// Serve in the background so this goroutine can wait for the signal.
	go func() {
		err := httpSrv.ListenAndServe()
		if err == nil || errors.Is(err, http.ErrServerClosed) {
			return
		}
		logger.Error("http server failed", "err", err)
		os.Exit(1)
	}()
	logger.Info("listening", "addr", cfg.Addr)

	<-ctx.Done()
	logger.Info("shutdown signal received, draining")

	// The drain deadline uses a fresh context because ctx is already cancelled.
	drainCtx, drainCancel := context.WithTimeout(context.Background(), 10*time.Second)
	defer drainCancel()
	if err := httpSrv.Shutdown(drainCtx); err != nil {
		logger.Error("graceful shutdown failed", "err", err)
	}
	logger.Info("stopped")
}
// config is the runtime configuration read from environment variables. The
// variable names match the Python service's wherever the two overlap.
type config struct {
	// Addr is the listen address, e.g. ":8770".
	Addr string

	// DatabaseURL is the dereth TimescaleDB DSN.
	DatabaseURL string

	// ReadOnly selects the mode: true = read-side parity (force read-only
	// txns); false = ingest/shadow (owns its DB).
	ReadOnly bool

	// InventoryURL is the inventory-service base URL.
	InventoryURL string

	// StaticDir is the directory for static assets / openissues.json.
	StaticDir string

	// SecretKey is the session-cookie signing key (must match the Python
	// service).
	SecretKey string

	// SharedSecret authenticates plugins on /ws/position.
	SharedSecret string

	// SharedSecretLegacy is the plugin auth rotation fallback.
	SharedSecretLegacy string

	// IngestWS is optional: a /ws/live URL to shadow-ingest from (Python
	// tracker).
	IngestWS string
}
// loadConfig assembles the runtime configuration from the environment.
//
// PORT, READ_ONLY, INVENTORY_SERVICE_URL and STATIC_DIR fall back to defaults
// when unset or empty; the remaining variables are taken as-is and may be
// empty. ReadOnly is true unless READ_ONLY is exactly "false".
func loadConfig() config {
	var cfg config

	cfg.Addr = ":" + envOr("PORT", "8770")
	cfg.DatabaseURL = os.Getenv("DATABASE_URL")

	// Anything other than the literal "false" keeps the service read-only.
	readOnlyFlag := envOr("READ_ONLY", "true")
	cfg.ReadOnly = readOnlyFlag != "false"

	cfg.InventoryURL = envOr("INVENTORY_SERVICE_URL", "http://inventory-service:8000")
	cfg.StaticDir = envOr("STATIC_DIR", "static")
	cfg.SecretKey = os.Getenv("SECRET_KEY")
	cfg.SharedSecret = os.Getenv("SHARED_SECRET")
	cfg.SharedSecretLegacy = os.Getenv("SHARED_SECRET_LEGACY")
	cfg.IngestWS = os.Getenv("SHADOW_INGEST_WS")

	return cfg
}
// envOr returns the value of the environment variable key, or def when the
// variable is unset or set to the empty string.
func envOr(key, def string) string {
	value := os.Getenv(key)
	if value == "" {
		return def
	}
	return value
}
// registerRoutes attaches every handler of this service to mux. Patterns are
// registered in the order listed, followed by the inventory-service reverse
// proxy routes.
func (s *Server) registerRoutes(mux *http.ServeMux) {
	type route struct {
		pattern string
		handle  func(http.ResponseWriter, *http.Request)
	}

	routes := []route{
		{"GET /health", s.handleHealth},
		// Mirrors Python's GET /api-version (hyphenated so nginx never strips it).
		{"GET /api-version", s.handleVersion},

		// Phase 1 read-side: the 5s caches.
		{"GET /live", s.handleLive},
		{"GET /live/", s.handleLive},
		{"GET /trails", s.handleTrails},
		{"GET /trails/", s.handleTrails},

		// Totals (5-minute caches).
		{"GET /total-rares", s.handleTotalRares},
		{"GET /total-rares/", s.handleTotalRares},
		{"GET /total-kills", s.handleTotalKills},
		{"GET /total-kills/", s.handleTotalKills},

		// Per-character & aggregate DB reads.
		{"GET /stats/{character_name}", s.handleStats},
		{"GET /portals", s.handlePortals},
		{"GET /spawns/heatmap", s.handleSpawnHeatmap},
		{"GET /server-health", s.handleServerHealth},
		{"GET /character-stats/{name}", s.handleCharacterStats},
		{"GET /combat-stats", s.handleCombatStatsAll},
		{"GET /combat-stats/{character_name}", s.handleCombatStatsOne},
		{"GET /inventories", s.handleInventories},
		{"GET /inventory/{character_name}/search", s.handleInventorySearch},

		// Ingest-only state (empty/default in Phase 1).
		{"GET /quest-status", s.handleQuestStatus},
		{"GET /vital-sharing/peers", s.handleVitalSharingPeers},
		{"GET /equipment-cantrip-state/{name}", s.handleEquipmentCantrip},
		{"GET /issues", s.handleIssues},
		{"GET /me", s.handleMe},

		// WebSocket servers (cutover-ready): browser fan-out + plugin ingest.
		{"GET /ws/live", s.handleWSLive},
		{"GET /ws/position", s.handleWSPosition},
	}
	for _, rt := range routes {
		mux.HandleFunc(rt.pattern, rt.handle)
	}

	// Inventory-service reverse proxies.
	s.registerProxyRoutes(mux)
}
// handleHealth answers GET /health with a 200 JSON document holding the
// service name, the build version and whether a DB pool was configured. The
// "db" flag only reflects that the pool is non-nil; no query is issued here.
func (s *Server) handleHealth(w http.ResponseWriter, r *http.Request) {
	dbConfigured := s.pool != nil
	payload := map[string]any{
		"status":  "ok",
		"service": "tracker-go",
		"version": buildVersion,
		"db":      dbConfigured,
	}
	writeJSON(w, http.StatusOK, payload)
}
// handleVersion answers GET /api-version with the build version as a 200 JSON
// object of the form {"version": ...}.
func (s *Server) handleVersion(w http.ResponseWriter, r *http.Request) {
	payload := map[string]any{"version": buildVersion}
	writeJSON(w, http.StatusOK, payload)
}
// writeJSON serializes v as JSON and writes it to w with the given status code
// and a Content-Type of application/json. The body ends with a newline, the
// same bytes json.Encoder.Encode would produce.
//
// v is marshalled BEFORE the status line is committed. If encoding fails (for
// example a NaN/Inf float or an unsupported type) the client receives a 500
// with a small JSON error body, rather than the requested success status
// followed by an empty body. Encode and write failures are both logged.
func writeJSON(w http.ResponseWriter, status int, v any) {
	body, err := json.Marshal(v)
	if err != nil {
		slog.Error("json encode failed", "err", err)
		status = http.StatusInternalServerError
		body = []byte(`{"error":"internal server error"}`)
	}

	w.Header().Set("Content-Type", "application/json")
	w.WriteHeader(status)
	// A write error here usually means the client went away; the status is
	// already on the wire, so logging is all that is left to do.
	if _, err := w.Write(append(body, '\n')); err != nil {
		slog.Error("json write failed", "err", err)
	}
}
// withRequestLogging wraps next in a minimal access log: after the wrapped
// handler returns, one "http" record is emitted with the request method, path,
// recorded response status and elapsed time in milliseconds. The recorded
// status starts at 200 for handlers that never call WriteHeader.
func withRequestLogging(next http.Handler) http.Handler {
	logged := func(w http.ResponseWriter, r *http.Request) {
		began := time.Now()
		rec := &statusRecorder{ResponseWriter: w, status: http.StatusOK}

		next.ServeHTTP(rec, r)

		elapsedMS := time.Since(began).Milliseconds()
		slog.Info("http",
			"method", r.Method,
			"path", r.URL.Path,
			"status", rec.status,
			"dur_ms", elapsedMS,
		)
	}
	return http.HandlerFunc(logged)
}
// statusRecorder wraps an http.ResponseWriter and remembers the status code
// that was actually sent, so the access log can report it.
type statusRecorder struct {
	http.ResponseWriter

	// status is the recorded response status. The constructor seeds it with
	// the implicit default (200).
	status int

	// wroteHeader is set once a final (non-informational) status has been
	// committed, either explicitly or implicitly by the first Write.
	wroteHeader bool
}

// WriteHeader records code and forwards it to the wrapped writer.
//
// Only the first final status is recorded: net/http ignores superfluous
// WriteHeader calls, so a later code never reaches the client and must not
// overwrite the logged one. Informational 1xx codes do not latch, except 101
// Switching Protocols, which net/http treats as final.
func (s *statusRecorder) WriteHeader(code int) {
	informational := code >= 100 && code <= 199 && code != http.StatusSwitchingProtocols
	if !s.wroteHeader && !informational {
		s.status = code
		s.wroteHeader = true
	}
	s.ResponseWriter.WriteHeader(code)
}

// Write forwards p to the wrapped writer. A Write with no preceding final
// WriteHeader implicitly sends 200, so that status is latched here and a later
// (ignored) WriteHeader cannot change what gets logged.
func (s *statusRecorder) Write(p []byte) (int, error) {
	if !s.wroteHeader {
		s.status = http.StatusOK
		s.wroteHeader = true
	}
	return s.ResponseWriter.Write(p)
}

// Unwrap lets http.ResponseController (used by coder/websocket to hijack the
// connection for /ws upgrades) reach the underlying ResponseWriter through this
// logging wrapper. Without it, WebSocket handshakes fail.
func (s *statusRecorder) Unwrap() http.ResponseWriter {
	return s.ResponseWriter
}