added server tracking plus server metrics

2025-06-20 07:17:01 +00:00 · 2025-06-20 07:17:01 +00:00 · ca12f4807b
commit ca12f4807b
parent 80a0a16bab
5 changed files with 567 additions and 6 deletions
--- a/main.py
+++ b/main.py
@ -13,6 +13,9 @@ import sys
 import time
 from typing import Dict, List, Any
 from pathlib import Path
+import asyncio
+import socket
+import struct

 from fastapi import FastAPI, Header, HTTPException, Query, WebSocket, WebSocketDisconnect, Request
 from fastapi.responses import JSONResponse, Response
@ -34,6 +37,8 @@ from db_async import (
    spawn_events,
    rare_events,
    character_inventories,
+    server_health_checks,
+    server_status,
    init_db_async
 )
 import asyncio
@ -83,6 +88,282 @@ _total_query_time = 0.0
 _recent_telemetry_messages = []
 _max_recent_messages = 50

+# Server health monitoring
+_server_health_task = None
+_server_status_cache = {
+    "status": "unknown",
+    "latency_ms": None,
+    "player_count": None,
+    "last_check": None,
+    "uptime_seconds": 0,
+    "last_restart": None
+}
+
+# AC Hash32 checksum algorithm (based on ThwargLauncher)
+def calculate_hash32(data: bytes) -> int:
+    """Calculate AC Hash32 checksum as used in ThwargLauncher."""
+    length = len(data)
+    checksum = (length << 16) & 0xFFFFFFFF
+    
+    # Process 4-byte chunks
+    for i in range(0, length - 3, 4):
+        chunk = struct.unpack('<I', data[i:i+4])[0]
+        checksum = (checksum + chunk) & 0xFFFFFFFF
+    
+    # Handle remaining bytes
+    remaining_start = (length // 4) * 4
+    shift = 24
+    for i in range(remaining_start, length):
+        byte_val = data[i] << shift
+        checksum = (checksum + byte_val) & 0xFFFFFFFF
+        shift -= 8
+    
+    return checksum
+
+# Create AC EchoRequest packet for server health check (based on ThwargLauncher)
+def create_echo_request_packet():
+    """Create an AC EchoRequest packet for server health checking."""
+    # AC packet header: sequence(4) + flags(4) + checksum(4) + id(2) + time(2) + size(2) + table(2) = 20 bytes + padding
+    packet = bytearray(32)  # 32 bytes total (0x20)
+    
+    # Sequence (4 bytes) - can be 0
+    struct.pack_into('<I', packet, 0, 0)
+    
+    # Flags (4 bytes) - EchoRequest = 0x02000000
+    struct.pack_into('<I', packet, 4, 0x02000000)
+    
+    # Temporary checksum (4 bytes) - required for proper checksum calculation
+    struct.pack_into('<I', packet, 8, 0x0BADD70D)
+    
+    # ID (2 bytes) - can be 0
+    struct.pack_into('<H', packet, 12, 0)
+    
+    # Time (2 bytes) - can be 0
+    struct.pack_into('<H', packet, 14, 0)
+    
+    # Size (2 bytes) - header size = 32 (0x20)
+    struct.pack_into('<H', packet, 16, 32)
+    
+    # Table (2 bytes) - can be 0
+    struct.pack_into('<H', packet, 18, 0)
+    
+    # Calculate proper AC Hash32 checksum
+    # First, set checksum field to 0
+    struct.pack_into('<I', packet, 8, 0)
+    
+    # Calculate checksum using Hash32 algorithm
+    checksum = calculate_hash32(bytes(packet))
+    struct.pack_into('<I', packet, 8, checksum)
+    
+    return bytes(packet)
+
+AC_ECHO_PACKET = create_echo_request_packet()
+
+# AC login packet for server health check (same as ThwargLauncher MakeLoginPacket)
+AC_LOGIN_PACKET = bytes([
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x93, 0x00, 0xd0, 0x05,
+    0x00, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00, 0x04, 0x00, 0x31, 0x38,
+    0x30, 0x32, 0x00, 0x00, 0x34, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x3e, 0xb8, 0xa8, 0x58, 0x1c, 0x00, 0x61, 0x63,
+    0x73, 0x65, 0x72, 0x76, 0x65, 0x72, 0x74, 0x72, 0x61, 0x63, 0x6b, 0x65,
+    0x72, 0x3a, 0x6a, 0x6a, 0x39, 0x68, 0x32, 0x36, 0x68, 0x63, 0x73, 0x67,
+    0x67, 0x63, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+])
+
+async def check_server_health(address: str, port: int, timeout: float = 3.0) -> tuple[bool, float, int]:
+    """Check AC server health via UDP packet.
+    
+    Returns: (is_up, latency_ms, player_count)
+    """
+    logger.debug(f"🔍 Starting health check for {address}:{port}")
+    start_time = time.time()
+    sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
+    sock.setblocking(False)
+    
+    try:
+        # Send login packet (same as ThwargLauncher)
+        await asyncio.get_event_loop().sock_sendto(sock, AC_LOGIN_PACKET, (address, port))
+        
+        # Wait for response with timeout
+        try:
+            data, addr = await asyncio.wait_for(
+                asyncio.get_event_loop().sock_recvfrom(sock, 1024),
+                timeout=timeout
+            )
+            
+            latency_ms = (time.time() - start_time) * 1000
+            logger.debug(f"📥 Received response from {addr}: {len(data)} bytes, latency: {latency_ms:.1f}ms")
+            
+            # Check if valid response (support both TimeSynch 0x800000 and ConnectRequest 0x40000)
+            if len(data) >= 24:
+                flags = struct.unpack('<I', data[4:8])[0]
+                
+                # Accept both TimeSynch (0x800000) and ConnectRequest (0x40000) as valid responses
+                if (flags & 0x800000) or (flags & 0x40000):
+                    # UDP health check is for server status and latency only
+                    # Player count comes from TreeStats.net API (like ThwargLauncher)
+                    logger.debug(f"✅ Valid server response: latency: {latency_ms:.1f}ms")
+                    return True, latency_ms, None
+            
+            # Any response indicates server is up, even if not the expected format
+            logger.info(f"✅ Server response (non-standard format): latency: {latency_ms:.1f}ms")
+            return True, latency_ms, None
+            
+        except asyncio.TimeoutError:
+            logger.debug(f"⏰ TIMEOUT: No response from {address}:{port} after {timeout}s - server down")
+            return False, None, None
+            
+    except Exception as e:
+        logger.error(f"Server health check error: {e}")
+        return False, None, None
+    finally:
+        sock.close()
+
+async def get_player_count_from_treestats(server_name: str) -> int:
+    """Get player count from TreeStats.net API (same as ThwargLauncher)."""
+    try:
+        async with httpx.AsyncClient() as client:
+            response = await client.get("http://treestats.net/player_counts-latest.json", timeout=10)
+            if response.status_code == 200:
+                data = response.json()
+                for server_data in data:
+                    if server_data.get("server") == server_name:
+                        return server_data.get("count", 0)
+        return 0
+    except Exception as e:
+        logger.debug(f"Failed to get player count from TreeStats.net: {e}")
+        return 0
+
+async def monitor_server_health():
+    """Background task to monitor server health every 30 seconds."""
+    server_name = "Coldeve"
+    server_address = "play.coldeve.ac"
+    server_port = 9000
+    check_interval = 30  # seconds
+    player_count_interval = 300  # 5 minutes (like ThwargLauncher's 10 minutes, but more frequent)
+    last_player_count_check = 0
+    current_player_count = None
+    
+    # Initialize server status in database
+    try:
+        existing = await database.fetch_one(
+            "SELECT * FROM server_status WHERE server_name = :name",
+            {"name": server_name}
+        )
+        if not existing:
+            await database.execute(
+                server_status.insert().values(
+                    server_name=server_name,
+                    current_status="unknown",
+                    total_uptime_seconds=0
+                )
+            )
+    except Exception as e:
+        logger.error(f"Failed to initialize server status: {e}")
+    
+    while True:
+        try:
+            logger.debug(f"🏥 Running scheduled health check for {server_name} ({server_address}:{server_port})")
+            # Check server health via UDP (for status and latency)
+            is_up, latency_ms, _ = await check_server_health(server_address, server_port)
+            status = "up" if is_up else "down"
+            now = datetime.now(timezone.utc)
+            
+            # Get player count from TreeStats.net API (like ThwargLauncher)
+            current_time = time.time()
+            if current_time - last_player_count_check >= player_count_interval or current_player_count is None:
+                new_player_count = await get_player_count_from_treestats(server_name)
+                if new_player_count > 0:  # Only update if we got a valid count
+                    current_player_count = new_player_count
+                    last_player_count_check = current_time
+                    logger.info(f"🏥 Updated player count from TreeStats.net: {current_player_count}")
+            
+            logger.debug(f"🏥 Health check result: {status}, latency: {latency_ms}, players: {current_player_count}")
+            
+            # Record health check
+            await database.execute(
+                server_health_checks.insert().values(
+                    server_name=server_name,
+                    server_address=f"{server_address}:{server_port}",
+                    timestamp=now,
+                    status=status,
+                    latency_ms=latency_ms,
+                    player_count=current_player_count
+                )
+            )
+            
+            # Get previous status
+            prev_status = await database.fetch_one(
+                "SELECT * FROM server_status WHERE server_name = :name",
+                {"name": server_name}
+            )
+            
+            # Calculate uptime and detect restarts
+            last_restart = prev_status["last_restart"] if prev_status else None
+            
+            if prev_status and prev_status["current_status"] == "down" and status == "up":
+                # Server came back up - this is a restart
+                last_restart = now
+                logger.info(f"Server {server_name} came back online")
+                # Broadcast to all browser clients
+                await _broadcast_to_browser_clients({
+                    "type": "server_status",
+                    "server": server_name,
+                    "status": "up",
+                    "message": "Server is back online"
+                })
+            
+            # Calculate uptime from last restart time (not accumulated)
+            if last_restart and status == "up":
+                uptime_seconds = int((now - last_restart).total_seconds())
+            else:
+                uptime_seconds = 0
+            
+            # Update server status (always include current_player_count if we have it)
+            await database.execute(
+                """
+                INSERT INTO server_status (server_name, current_status, last_seen_up, last_restart, 
+                                         total_uptime_seconds, last_check, last_latency_ms, last_player_count)
+                VALUES (:name, :status, :last_seen, :restart, :uptime, :check, :latency, :players)
+                ON CONFLICT (server_name) DO UPDATE SET
+                    current_status = :status,
+                    last_seen_up = CASE WHEN :status = 'up' THEN :last_seen ELSE server_status.last_seen_up END,
+                    last_restart = :restart,
+                    total_uptime_seconds = :uptime,
+                    last_check = :check,
+                    last_latency_ms = :latency,
+                    last_player_count = CASE WHEN :players IS NOT NULL THEN :players ELSE server_status.last_player_count END
+                """,
+                {
+                    "name": server_name,
+                    "status": status,
+                    "last_seen": now if status == "up" else None,
+                    "restart": last_restart,
+                    "uptime": uptime_seconds,
+                    "check": now,
+                    "latency": latency_ms,
+                    "players": current_player_count
+                }
+            )
+            
+            # Update cache
+            global _server_status_cache
+            _server_status_cache = {
+                "status": status,
+                "latency_ms": latency_ms,
+                "player_count": current_player_count,
+                "last_check": now.isoformat(),
+                "uptime_seconds": uptime_seconds,
+                "last_restart": last_restart.isoformat() if last_restart else None
+            }
+            
+            logger.debug(f"Server health check: {status}, latency={latency_ms}ms, players={current_player_count}")
+            
+        except Exception as e:
+            logger.error(f"Server health monitoring error: {e}", exc_info=True)
+        
+        await asyncio.sleep(check_interval)
+
 def _track_player_changes(new_players: list) -> None:
    """Track player changes for debugging flapping issues."""
    from datetime import datetime, timezone
@ -559,10 +840,11 @@ async def on_startup():
    else:
        raise RuntimeError(f"Could not connect to database after {max_attempts} attempts")
    # Start background cache refresh (live & trails)
-    global _cache_task, _rares_cache_task
+    global _cache_task, _rares_cache_task, _server_health_task
    _cache_task = asyncio.create_task(_refresh_cache_loop())
    _rares_cache_task = asyncio.create_task(_refresh_total_rares_cache())
-    logger.info("Background cache refresh tasks started")
+    _server_health_task = asyncio.create_task(monitor_server_health())
+    logger.info("Background cache refresh and server monitoring tasks started")
@app.on_event("shutdown")
 async def on_shutdown():
    """Event handler triggered when application is shutting down.
@ -570,7 +852,7 @@ async def on_shutdown():
    Ensures the database connection is closed cleanly.
    """
    # Stop cache refresh tasks
-    global _cache_task, _rares_cache_task
+    global _cache_task, _rares_cache_task, _server_health_task
    if _cache_task:
        logger.info("Stopping background cache refresh task")
        _cache_task.cancel()
@ -586,6 +868,14 @@ async def on_shutdown():
            await _rares_cache_task
        except asyncio.CancelledError:
            pass
+    
+    if _server_health_task:
+        logger.info("Stopping server health monitoring task")
+        _server_health_task.cancel()
+        try:
+            await _server_health_task
+        except asyncio.CancelledError:
+            pass
    logger.info("Disconnecting from database")
    await database.disconnect()

@ -695,6 +985,52 @@ async def get_recent_activity():
        logger.error(f"Failed to get recent activity data: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail="Internal server error")

+@app.get("/server-health")
+async def get_server_health():
+    """Return current server health status."""
+    try:
+        # Get latest status from database if cache is stale
+        if not _server_status_cache.get("last_check") or \
+           (datetime.now(timezone.utc) - datetime.fromisoformat(_server_status_cache["last_check"].replace('Z', '+00:00')) > timedelta(minutes=2)):
+            
+            row = await database.fetch_one(
+                "SELECT * FROM server_status WHERE server_name = :name",
+                {"name": "Coldeve"}
+            )
+            
+            if row:
+                _server_status_cache.update({
+                    "status": row["current_status"],
+                    "latency_ms": row["last_latency_ms"],
+                    "player_count": row["last_player_count"],
+                    "last_check": row["last_check"].isoformat() if row["last_check"] else None,
+                    "uptime_seconds": row["total_uptime_seconds"],
+                    "last_restart": row["last_restart"].isoformat() if row["last_restart"] else None
+                })
+        
+        # Format uptime
+        uptime_seconds = _server_status_cache.get("uptime_seconds", 0)
+        days = uptime_seconds // 86400
+        hours = (uptime_seconds % 86400) // 3600
+        minutes = (uptime_seconds % 3600) // 60
+        
+        uptime_str = f"{days}d {hours}h {minutes}m" if days > 0 else f"{hours}h {minutes}m"
+        
+        return {
+            "server_name": "Coldeve",
+            "status": _server_status_cache.get("status", "unknown"),
+            "latency_ms": _server_status_cache.get("latency_ms"),
+            "player_count": _server_status_cache.get("player_count"),
+            "uptime": uptime_str,
+            "uptime_seconds": uptime_seconds,
+            "last_restart": _server_status_cache.get("last_restart"),
+            "last_check": _server_status_cache.get("last_check")
+        }
+        
+    except Exception as e:
+        logger.error(f"Failed to get server health data: {e}", exc_info=True)
+        raise HTTPException(status_code=500, detail="Internal server error")
+
@app.get("/live", response_model=dict)
@app.get("/live/", response_model=dict)
 async def get_live_players():