added server tracking plus server metrics

This commit is contained in:
erik 2025-06-20 07:17:01 +00:00
parent 80a0a16bab
commit ca12f4807b
5 changed files with 567 additions and 6 deletions

342
main.py
View file

@ -13,6 +13,9 @@ import sys
import time
from typing import Dict, List, Any
from pathlib import Path
import asyncio
import socket
import struct
from fastapi import FastAPI, Header, HTTPException, Query, WebSocket, WebSocketDisconnect, Request
from fastapi.responses import JSONResponse, Response
@ -34,6 +37,8 @@ from db_async import (
spawn_events,
rare_events,
character_inventories,
server_health_checks,
server_status,
init_db_async
)
import asyncio
@ -83,6 +88,282 @@ _total_query_time = 0.0
_recent_telemetry_messages = []
_max_recent_messages = 50
# Server health monitoring
_server_health_task = None
_server_status_cache = {
"status": "unknown",
"latency_ms": None,
"player_count": None,
"last_check": None,
"uptime_seconds": 0,
"last_restart": None
}
# AC Hash32 checksum algorithm (based on ThwargLauncher)
def calculate_hash32(data: bytes) -> int:
"""Calculate AC Hash32 checksum as used in ThwargLauncher."""
length = len(data)
checksum = (length << 16) & 0xFFFFFFFF
# Process 4-byte chunks
for i in range(0, length - 3, 4):
chunk = struct.unpack('<I', data[i:i+4])[0]
checksum = (checksum + chunk) & 0xFFFFFFFF
# Handle remaining bytes
remaining_start = (length // 4) * 4
shift = 24
for i in range(remaining_start, length):
byte_val = data[i] << shift
checksum = (checksum + byte_val) & 0xFFFFFFFF
shift -= 8
return checksum
# Create AC EchoRequest packet for server health check (based on ThwargLauncher)
def create_echo_request_packet():
"""Create an AC EchoRequest packet for server health checking."""
# AC packet header: sequence(4) + flags(4) + checksum(4) + id(2) + time(2) + size(2) + table(2) = 20 bytes + padding
packet = bytearray(32) # 32 bytes total (0x20)
# Sequence (4 bytes) - can be 0
struct.pack_into('<I', packet, 0, 0)
# Flags (4 bytes) - EchoRequest = 0x02000000
struct.pack_into('<I', packet, 4, 0x02000000)
# Temporary checksum (4 bytes) - required for proper checksum calculation
struct.pack_into('<I', packet, 8, 0x0BADD70D)
# ID (2 bytes) - can be 0
struct.pack_into('<H', packet, 12, 0)
# Time (2 bytes) - can be 0
struct.pack_into('<H', packet, 14, 0)
# Size (2 bytes) - header size = 32 (0x20)
struct.pack_into('<H', packet, 16, 32)
# Table (2 bytes) - can be 0
struct.pack_into('<H', packet, 18, 0)
# Calculate proper AC Hash32 checksum
# First, set checksum field to 0
struct.pack_into('<I', packet, 8, 0)
# Calculate checksum using Hash32 algorithm
checksum = calculate_hash32(bytes(packet))
struct.pack_into('<I', packet, 8, checksum)
return bytes(packet)
AC_ECHO_PACKET = create_echo_request_packet()
# AC login packet for server health check (same as ThwargLauncher MakeLoginPacket)
AC_LOGIN_PACKET = bytes([
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x93, 0x00, 0xd0, 0x05,
0x00, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00, 0x04, 0x00, 0x31, 0x38,
0x30, 0x32, 0x00, 0x00, 0x34, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x3e, 0xb8, 0xa8, 0x58, 0x1c, 0x00, 0x61, 0x63,
0x73, 0x65, 0x72, 0x76, 0x65, 0x72, 0x74, 0x72, 0x61, 0x63, 0x6b, 0x65,
0x72, 0x3a, 0x6a, 0x6a, 0x39, 0x68, 0x32, 0x36, 0x68, 0x63, 0x73, 0x67,
0x67, 0x63, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
])
async def check_server_health(address: str, port: int, timeout: float = 3.0) -> tuple[bool, float, int]:
"""Check AC server health via UDP packet.
Returns: (is_up, latency_ms, player_count)
"""
logger.debug(f"🔍 Starting health check for {address}:{port}")
start_time = time.time()
sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
sock.setblocking(False)
try:
# Send login packet (same as ThwargLauncher)
await asyncio.get_event_loop().sock_sendto(sock, AC_LOGIN_PACKET, (address, port))
# Wait for response with timeout
try:
data, addr = await asyncio.wait_for(
asyncio.get_event_loop().sock_recvfrom(sock, 1024),
timeout=timeout
)
latency_ms = (time.time() - start_time) * 1000
logger.debug(f"📥 Received response from {addr}: {len(data)} bytes, latency: {latency_ms:.1f}ms")
# Check if valid response (support both TimeSynch 0x800000 and ConnectRequest 0x40000)
if len(data) >= 24:
flags = struct.unpack('<I', data[4:8])[0]
# Accept both TimeSynch (0x800000) and ConnectRequest (0x40000) as valid responses
if (flags & 0x800000) or (flags & 0x40000):
# UDP health check is for server status and latency only
# Player count comes from TreeStats.net API (like ThwargLauncher)
logger.debug(f"✅ Valid server response: latency: {latency_ms:.1f}ms")
return True, latency_ms, None
# Any response indicates server is up, even if not the expected format
logger.info(f"✅ Server response (non-standard format): latency: {latency_ms:.1f}ms")
return True, latency_ms, None
except asyncio.TimeoutError:
logger.debug(f"⏰ TIMEOUT: No response from {address}:{port} after {timeout}s - server down")
return False, None, None
except Exception as e:
logger.error(f"Server health check error: {e}")
return False, None, None
finally:
sock.close()
async def get_player_count_from_treestats(server_name: str) -> int:
"""Get player count from TreeStats.net API (same as ThwargLauncher)."""
try:
async with httpx.AsyncClient() as client:
response = await client.get("http://treestats.net/player_counts-latest.json", timeout=10)
if response.status_code == 200:
data = response.json()
for server_data in data:
if server_data.get("server") == server_name:
return server_data.get("count", 0)
return 0
except Exception as e:
logger.debug(f"Failed to get player count from TreeStats.net: {e}")
return 0
async def monitor_server_health():
"""Background task to monitor server health every 30 seconds."""
server_name = "Coldeve"
server_address = "play.coldeve.ac"
server_port = 9000
check_interval = 30 # seconds
player_count_interval = 300 # 5 minutes (like ThwargLauncher's 10 minutes, but more frequent)
last_player_count_check = 0
current_player_count = None
# Initialize server status in database
try:
existing = await database.fetch_one(
"SELECT * FROM server_status WHERE server_name = :name",
{"name": server_name}
)
if not existing:
await database.execute(
server_status.insert().values(
server_name=server_name,
current_status="unknown",
total_uptime_seconds=0
)
)
except Exception as e:
logger.error(f"Failed to initialize server status: {e}")
while True:
try:
logger.debug(f"🏥 Running scheduled health check for {server_name} ({server_address}:{server_port})")
# Check server health via UDP (for status and latency)
is_up, latency_ms, _ = await check_server_health(server_address, server_port)
status = "up" if is_up else "down"
now = datetime.now(timezone.utc)
# Get player count from TreeStats.net API (like ThwargLauncher)
current_time = time.time()
if current_time - last_player_count_check >= player_count_interval or current_player_count is None:
new_player_count = await get_player_count_from_treestats(server_name)
if new_player_count > 0: # Only update if we got a valid count
current_player_count = new_player_count
last_player_count_check = current_time
logger.info(f"🏥 Updated player count from TreeStats.net: {current_player_count}")
logger.debug(f"🏥 Health check result: {status}, latency: {latency_ms}, players: {current_player_count}")
# Record health check
await database.execute(
server_health_checks.insert().values(
server_name=server_name,
server_address=f"{server_address}:{server_port}",
timestamp=now,
status=status,
latency_ms=latency_ms,
player_count=current_player_count
)
)
# Get previous status
prev_status = await database.fetch_one(
"SELECT * FROM server_status WHERE server_name = :name",
{"name": server_name}
)
# Calculate uptime and detect restarts
last_restart = prev_status["last_restart"] if prev_status else None
if prev_status and prev_status["current_status"] == "down" and status == "up":
# Server came back up - this is a restart
last_restart = now
logger.info(f"Server {server_name} came back online")
# Broadcast to all browser clients
await _broadcast_to_browser_clients({
"type": "server_status",
"server": server_name,
"status": "up",
"message": "Server is back online"
})
# Calculate uptime from last restart time (not accumulated)
if last_restart and status == "up":
uptime_seconds = int((now - last_restart).total_seconds())
else:
uptime_seconds = 0
# Update server status (always include current_player_count if we have it)
await database.execute(
"""
INSERT INTO server_status (server_name, current_status, last_seen_up, last_restart,
total_uptime_seconds, last_check, last_latency_ms, last_player_count)
VALUES (:name, :status, :last_seen, :restart, :uptime, :check, :latency, :players)
ON CONFLICT (server_name) DO UPDATE SET
current_status = :status,
last_seen_up = CASE WHEN :status = 'up' THEN :last_seen ELSE server_status.last_seen_up END,
last_restart = :restart,
total_uptime_seconds = :uptime,
last_check = :check,
last_latency_ms = :latency,
last_player_count = CASE WHEN :players IS NOT NULL THEN :players ELSE server_status.last_player_count END
""",
{
"name": server_name,
"status": status,
"last_seen": now if status == "up" else None,
"restart": last_restart,
"uptime": uptime_seconds,
"check": now,
"latency": latency_ms,
"players": current_player_count
}
)
# Update cache
global _server_status_cache
_server_status_cache = {
"status": status,
"latency_ms": latency_ms,
"player_count": current_player_count,
"last_check": now.isoformat(),
"uptime_seconds": uptime_seconds,
"last_restart": last_restart.isoformat() if last_restart else None
}
logger.debug(f"Server health check: {status}, latency={latency_ms}ms, players={current_player_count}")
except Exception as e:
logger.error(f"Server health monitoring error: {e}", exc_info=True)
await asyncio.sleep(check_interval)
def _track_player_changes(new_players: list) -> None:
"""Track player changes for debugging flapping issues."""
from datetime import datetime, timezone
@ -559,10 +840,11 @@ async def on_startup():
else:
raise RuntimeError(f"Could not connect to database after {max_attempts} attempts")
# Start background cache refresh (live & trails)
global _cache_task, _rares_cache_task
global _cache_task, _rares_cache_task, _server_health_task
_cache_task = asyncio.create_task(_refresh_cache_loop())
_rares_cache_task = asyncio.create_task(_refresh_total_rares_cache())
logger.info("Background cache refresh tasks started")
_server_health_task = asyncio.create_task(monitor_server_health())
logger.info("Background cache refresh and server monitoring tasks started")
@app.on_event("shutdown")
async def on_shutdown():
"""Event handler triggered when application is shutting down.
@ -570,7 +852,7 @@ async def on_shutdown():
Ensures the database connection is closed cleanly.
"""
# Stop cache refresh tasks
global _cache_task, _rares_cache_task
global _cache_task, _rares_cache_task, _server_health_task
if _cache_task:
logger.info("Stopping background cache refresh task")
_cache_task.cancel()
@ -586,6 +868,14 @@ async def on_shutdown():
await _rares_cache_task
except asyncio.CancelledError:
pass
if _server_health_task:
logger.info("Stopping server health monitoring task")
_server_health_task.cancel()
try:
await _server_health_task
except asyncio.CancelledError:
pass
logger.info("Disconnecting from database")
await database.disconnect()
@ -695,6 +985,52 @@ async def get_recent_activity():
logger.error(f"Failed to get recent activity data: {e}", exc_info=True)
raise HTTPException(status_code=500, detail="Internal server error")
@app.get("/server-health")
async def get_server_health():
"""Return current server health status."""
try:
# Get latest status from database if cache is stale
if not _server_status_cache.get("last_check") or \
(datetime.now(timezone.utc) - datetime.fromisoformat(_server_status_cache["last_check"].replace('Z', '+00:00')) > timedelta(minutes=2)):
row = await database.fetch_one(
"SELECT * FROM server_status WHERE server_name = :name",
{"name": "Coldeve"}
)
if row:
_server_status_cache.update({
"status": row["current_status"],
"latency_ms": row["last_latency_ms"],
"player_count": row["last_player_count"],
"last_check": row["last_check"].isoformat() if row["last_check"] else None,
"uptime_seconds": row["total_uptime_seconds"],
"last_restart": row["last_restart"].isoformat() if row["last_restart"] else None
})
# Format uptime
uptime_seconds = _server_status_cache.get("uptime_seconds", 0)
days = uptime_seconds // 86400
hours = (uptime_seconds % 86400) // 3600
minutes = (uptime_seconds % 3600) // 60
uptime_str = f"{days}d {hours}h {minutes}m" if days > 0 else f"{hours}h {minutes}m"
return {
"server_name": "Coldeve",
"status": _server_status_cache.get("status", "unknown"),
"latency_ms": _server_status_cache.get("latency_ms"),
"player_count": _server_status_cache.get("player_count"),
"uptime": uptime_str,
"uptime_seconds": uptime_seconds,
"last_restart": _server_status_cache.get("last_restart"),
"last_check": _server_status_cache.get("last_check")
}
except Exception as e:
logger.error(f"Failed to get server health data: {e}", exc_info=True)
raise HTTPException(status_code=500, detail="Internal server error")
@app.get("/live", response_model=dict)
@app.get("/live/", response_model=dict)
async def get_live_players():