From ca12f4807b595aa28561f5f1abcacfc5da0201e3 Mon Sep 17 00:00:00 2001
From: erik
Date: Fri, 20 Jun 2025 07:17:01 +0000
Subject: [PATCH] added server tracking plus server metrics

---
 db_async.py       |  35 +++++
 main.py           | 342 +++++++++++++++++++++++++++++++++++++++++++++-
 static/index.html |  22 +-
 static/script.js  |  82 +++++++++++
 static/style.css  |  92 +++++++++++++
 5 files changed, 567 insertions(+), 6 deletions(-)

diff --git a/db_async.py b/db_async.py
index 20001b8d..d9bb1de8 100644
--- a/db_async.py
+++ b/db_async.py
@@ -126,6 +126,41 @@ character_inventories = Table(
     UniqueConstraint("character_name", "item_id", name="uq_char_item"),
 )
 
+# Server health monitoring tables
+server_health_checks = Table(
+    # Time-series data for server health checks
+    "server_health_checks",
+    metadata,
+    Column("id", Integer, primary_key=True),
+    Column("server_name", String, nullable=False, index=True),
+    Column("server_address", String, nullable=False),
+    Column("timestamp", DateTime(timezone=True), nullable=False, default=sqlalchemy.func.now()),
+    Column("status", String(10), nullable=False),  # 'up' or 'down'
+    Column("latency_ms", Float, nullable=True),
+    Column("player_count", Integer, nullable=True),
+)
+
+server_status = Table(
+    # Current server status and uptime tracking
+    "server_status",
+    metadata,
+    Column("server_name", String, primary_key=True),
+    Column("current_status", String(10), nullable=False),
+    Column("last_seen_up", DateTime(timezone=True), nullable=True),
+    Column("last_restart", DateTime(timezone=True), nullable=True),
+    Column("total_uptime_seconds", BigInteger, default=0),
+    Column("last_check", DateTime(timezone=True), nullable=True),
+    Column("last_latency_ms", Float, nullable=True),
+    Column("last_player_count", Integer, nullable=True),
+)
+
+# Index for efficient server health check queries
+Index(
+    'ix_server_health_checks_name_ts',
+    server_health_checks.c.server_name,
+    server_health_checks.c.timestamp.desc()
+)
+
 async def init_db_async():
     """Initialize PostgreSQL/TimescaleDB schema and hypertable.
 
diff --git a/main.py b/main.py
index 81b220cc..0c2a9e6b 100644
--- a/main.py
+++ b/main.py
@@ -13,6 +13,9 @@ import sys
 import time
 from typing import Dict, List, Any
 from pathlib import Path
+import asyncio
+import socket
+import struct
 
 from fastapi import FastAPI, Header, HTTPException, Query, WebSocket, WebSocketDisconnect, Request
 from fastapi.responses import JSONResponse, Response
@@ -34,6 +37,8 @@ from db_async import (
     spawn_events,
     rare_events,
     character_inventories,
+    server_health_checks,
+    server_status,
     init_db_async
 )
 import asyncio
@@ -83,6 +88,282 @@ _total_query_time = 0.0
 _recent_telemetry_messages = []
 _max_recent_messages = 50
 
+# Server health monitoring
+_server_health_task = None
+_server_status_cache = {
+    "status": "unknown",
+    "latency_ms": None,
+    "player_count": None,
+    "last_check": None,
+    "uptime_seconds": 0,
+    "last_restart": None
+}
+
+# AC Hash32 checksum algorithm (based on ThwargLauncher)
+def calculate_hash32(data: bytes) -> int:
+    """Calculate AC Hash32 checksum as used in ThwargLauncher."""
+    length = len(data)
+    checksum = (length << 16) & 0xFFFFFFFF
+
+    # Process 4-byte chunks
+    for i in range(0, length - 3, 4):
+        chunk = struct.unpack('<I', data[i:i+4])[0]
+        checksum = (checksum + chunk) & 0xFFFFFFFF
+
+    # Process any remaining bytes
+    shift = 3
+    for i in range((length // 4) * 4, length):
+        checksum = (checksum + (data[i] << (8 * shift))) & 0xFFFFFFFF
+        shift -= 1
+
+    return checksum
+
+# Login packet sent during health checks (same as ThwargLauncher)
+AC_LOGIN_PACKET = ...
+
+async def check_server_health(address: str, port: int, timeout: float = 5.0) -> tuple[bool, float, int]:
+    """Check AC server health via UDP packet.
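+    Sends the pre-built login packet over UDP and treats a well-formed reply
+    received before the timeout as the server being up.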
+
+    Returns: (is_up, latency_ms, player_count)
+    """
+    logger.debug(f"🔍 Starting health check for {address}:{port}")
+    start_time = time.time()
+    sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
+    sock.setblocking(False)
+
+    try:
+        # Send login packet (same as ThwargLauncher)
+        await asyncio.get_event_loop().sock_sendto(sock, AC_LOGIN_PACKET, (address, port))
+
+        # Wait for response with timeout
+        try:
+            data, addr = await asyncio.wait_for(
+                asyncio.get_event_loop().sock_recvfrom(sock, 1024),
+                timeout=timeout
+            )
+
+            latency_ms = (time.time() - start_time) * 1000
+            logger.debug(f"📥 Received response from {addr}: {len(data)} bytes, latency: {latency_ms:.1f}ms")
+
+            # Check if valid response (support both TimeSynch 0x800000 and ConnectRequest 0x40000)
+            if len(data) >= 24:
+                flags = struct.unpack('<I', data[4:8])[0]
+                if flags & 0x800000 or flags & 0x40000:
+                    return True, latency_ms, 0
+
+            return False, latency_ms, 0
+
+        except asyncio.TimeoutError:
+            logger.debug(f"Health check timed out for {address}:{port} after {timeout}s")
+            return False, 0.0, 0
+
+    except Exception as e:
+        logger.debug(f"Health check failed for {address}:{port}: {e}")
+        return False, 0.0, 0
+    finally:
+        sock.close()
+
+async def get_player_count_from_treestats(server_name: str) -> int:
+    """Get player count from TreeStats.net API (same as ThwargLauncher)."""
+    try:
+        async with httpx.AsyncClient() as client:
+            response = await client.get("http://treestats.net/player_counts-latest.json", timeout=10)
+            if response.status_code == 200:
+                data = response.json()
+                for server_data in data:
+                    if server_data.get("server") == server_name:
+                        return server_data.get("count", 0)
+        return 0
+    except Exception as e:
+        logger.debug(f"Failed to get player count from TreeStats.net: {e}")
+        return 0
+
+async def monitor_server_health():
+    """Background task to monitor server health every 30 seconds."""
+    server_name = "Coldeve"
+    server_address = "play.coldeve.ac"
+    server_port = 9000
+    check_interval = 30  # seconds
+    player_count_interval = 300  # 5 minutes (like ThwargLauncher's 10 minutes, but more frequent)
+    last_player_count_check = 0
+    current_player_count = None
+
+    # Initialize server status in database
+    try:
+        existing = await database.fetch_one(
+            "SELECT * FROM server_status WHERE server_name = :name",
+            {"name": server_name}
+        )
+        if not existing:
+            await database.execute(
+                server_status.insert().values(
+                    server_name=server_name,
+                    current_status="unknown",
+                    total_uptime_seconds=0
+                )
+            )
+    except Exception as e:
+        logger.error(f"Failed to initialize server status: {e}")
+
+    while True:
+        try:
+            logger.debug(f"🏥 Running scheduled health check for {server_name} ({server_address}:{server_port})")
+            # Check server health via UDP (for status and latency)
+            is_up, latency_ms, _ = await check_server_health(server_address, server_port)
+            status = "up" if is_up else "down"
+            now = datetime.now(timezone.utc)
+
+            # Get player count from TreeStats.net API (like ThwargLauncher)
+            current_time = time.time()
+            if current_time - last_player_count_check >= player_count_interval or current_player_count is None:
+                new_player_count = await get_player_count_from_treestats(server_name)
+                if new_player_count > 0:  # Only update if we got a valid count
+                    current_player_count = new_player_count
+                last_player_count_check = current_time
+                logger.info(f"🏥 Updated player count from TreeStats.net: {current_player_count}")
+
+            logger.debug(f"🏥 Health check result: {status}, latency: {latency_ms}, players: {current_player_count}")
+
+            # Record health check
+            await database.execute(
+                server_health_checks.insert().values(
+                    server_name=server_name,
+                    server_address=f"{server_address}:{server_port}",
+                    timestamp=now,
+                    status=status,
+                    latency_ms=latency_ms,
+                    player_count=current_player_count
+                )
+            )
+
+            # Get previous status
+            prev_status = await database.fetch_one(
+                "SELECT * FROM server_status WHERE server_name = :name",
+                {"name": server_name}
+            )
+
+            # Calculate uptime and detect restarts
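+            # A restart is inferred when the previously stored status was 'down' and
+            # this check now sees the server up; uptime is measured from that restart
+            # timestamp rather than accumulated across checks.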
+            last_restart = prev_status["last_restart"] if prev_status else None
+
+            if prev_status and prev_status["current_status"] == "down" and status == "up":
+                # Server came back up - this is a restart
+                last_restart = now
+                logger.info(f"Server {server_name} came back online")
+                # Broadcast to all browser clients
+                await _broadcast_to_browser_clients({
+                    "type": "server_status",
+                    "server": server_name,
+                    "status": "up",
+                    "message": "Server is back online"
+                })
+
+            # Calculate uptime from last restart time (not accumulated)
+            if last_restart and status == "up":
+                uptime_seconds = int((now - last_restart).total_seconds())
+            else:
+                uptime_seconds = 0
+
+            # Update server status (always include current_player_count if we have it)
+            await database.execute(
+                """
+                INSERT INTO server_status (server_name, current_status, last_seen_up, last_restart,
+                                           total_uptime_seconds, last_check, last_latency_ms, last_player_count)
+                VALUES (:name, :status, :last_seen, :restart, :uptime, :check, :latency, :players)
+                ON CONFLICT (server_name) DO UPDATE SET
+                    current_status = :status,
+                    last_seen_up = CASE WHEN :status = 'up' THEN :last_seen ELSE server_status.last_seen_up END,
+                    last_restart = :restart,
+                    total_uptime_seconds = :uptime,
+                    last_check = :check,
+                    last_latency_ms = :latency,
+                    last_player_count = CASE WHEN :players IS NOT NULL THEN :players ELSE server_status.last_player_count END
+                """,
+                {
+                    "name": server_name,
+                    "status": status,
+                    "last_seen": now if status == "up" else None,
+                    "restart": last_restart,
+                    "uptime": uptime_seconds,
+                    "check": now,
+                    "latency": latency_ms,
+                    "players": current_player_count
+                }
+            )
+
+            # Update cache
+            global _server_status_cache
+            _server_status_cache = {
+                "status": status,
+                "latency_ms": latency_ms,
+                "player_count": current_player_count,
+                "last_check": now.isoformat(),
+                "uptime_seconds": uptime_seconds,
+                "last_restart": last_restart.isoformat() if last_restart else None
+            }
+
+            logger.debug(f"Server health check: {status}, latency={latency_ms}ms, players={current_player_count}")
+
+        except Exception as e:
+            logger.error(f"Server health monitoring error: {e}", exc_info=True)
+
+        await asyncio.sleep(check_interval)
+
 def _track_player_changes(new_players: list) -> None:
     """Track player changes for debugging flapping issues."""
     from datetime import datetime, timezone
@@ -559,10 +840,11 @@ async def on_startup():
     else:
         raise RuntimeError(f"Could not connect to database after {max_attempts} attempts")
     # Start background cache refresh (live & trails)
-    global _cache_task, _rares_cache_task
+    global _cache_task, _rares_cache_task, _server_health_task
     _cache_task = asyncio.create_task(_refresh_cache_loop())
     _rares_cache_task = asyncio.create_task(_refresh_total_rares_cache())
-    logger.info("Background cache refresh tasks started")
+    _server_health_task = asyncio.create_task(monitor_server_health())
+    logger.info("Background cache refresh and server monitoring tasks started")
 @app.on_event("shutdown")
 async def on_shutdown():
     """Event handler triggered when application is shutting down.
@@ -570,7 +852,7 @@ async def on_shutdown():
     Ensures the database connection is closed cleanly.
""" # Stop cache refresh tasks - global _cache_task, _rares_cache_task + global _cache_task, _rares_cache_task, _server_health_task if _cache_task: logger.info("Stopping background cache refresh task") _cache_task.cancel() @@ -586,6 +868,14 @@ async def on_shutdown(): await _rares_cache_task except asyncio.CancelledError: pass + + if _server_health_task: + logger.info("Stopping server health monitoring task") + _server_health_task.cancel() + try: + await _server_health_task + except asyncio.CancelledError: + pass logger.info("Disconnecting from database") await database.disconnect() @@ -695,6 +985,52 @@ async def get_recent_activity(): logger.error(f"Failed to get recent activity data: {e}", exc_info=True) raise HTTPException(status_code=500, detail="Internal server error") +@app.get("/server-health") +async def get_server_health(): + """Return current server health status.""" + try: + # Get latest status from database if cache is stale + if not _server_status_cache.get("last_check") or \ + (datetime.now(timezone.utc) - datetime.fromisoformat(_server_status_cache["last_check"].replace('Z', '+00:00')) > timedelta(minutes=2)): + + row = await database.fetch_one( + "SELECT * FROM server_status WHERE server_name = :name", + {"name": "Coldeve"} + ) + + if row: + _server_status_cache.update({ + "status": row["current_status"], + "latency_ms": row["last_latency_ms"], + "player_count": row["last_player_count"], + "last_check": row["last_check"].isoformat() if row["last_check"] else None, + "uptime_seconds": row["total_uptime_seconds"], + "last_restart": row["last_restart"].isoformat() if row["last_restart"] else None + }) + + # Format uptime + uptime_seconds = _server_status_cache.get("uptime_seconds", 0) + days = uptime_seconds // 86400 + hours = (uptime_seconds % 86400) // 3600 + minutes = (uptime_seconds % 3600) // 60 + + uptime_str = f"{days}d {hours}h {minutes}m" if days > 0 else f"{hours}h {minutes}m" + + return { + "server_name": "Coldeve", + "status": _server_status_cache.get("status", "unknown"), + "latency_ms": _server_status_cache.get("latency_ms"), + "player_count": _server_status_cache.get("player_count"), + "uptime": uptime_str, + "uptime_seconds": uptime_seconds, + "last_restart": _server_status_cache.get("last_restart"), + "last_check": _server_status_cache.get("last_check") + } + + except Exception as e: + logger.error(f"Failed to get server health data: {e}", exc_info=True) + raise HTTPException(status_code=500, detail="Internal server error") + @app.get("/live", response_model=dict) @app.get("/live/", response_model=dict) async def get_live_players(): diff --git a/static/index.html b/static/index.html index 841162a6..047bcb85 100644 --- a/static/index.html +++ b/static/index.html @@ -14,10 +14,23 @@