added server tracking plus server metrics
commit ca12f4807b (parent 80a0a16bab)
5 changed files with 567 additions and 6 deletions

main.py (342 lines changed)

@@ -13,6 +13,9 @@ import sys
 import time
 from typing import Dict, List, Any
 from pathlib import Path
+import asyncio
+import socket
+import struct
 
 from fastapi import FastAPI, Header, HTTPException, Query, WebSocket, WebSocketDisconnect, Request
 from fastapi.responses import JSONResponse, Response
@@ -34,6 +37,8 @@ from db_async import (
     spawn_events,
     rare_events,
     character_inventories,
+    server_health_checks,
+    server_status,
     init_db_async
 )
 import asyncio
@@ -83,6 +88,282 @@ _total_query_time = 0.0
 _recent_telemetry_messages = []
 _max_recent_messages = 50
 
+# Server health monitoring
+_server_health_task = None
+_server_status_cache = {
+    "status": "unknown",
+    "latency_ms": None,
+    "player_count": None,
+    "last_check": None,
+    "uptime_seconds": 0,
+    "last_restart": None
+}
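+# _server_status_cache is refreshed by monitor_server_health() below and served
+# by the /server-health endpoint, so most requests avoid a database round trip.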
+
+# AC Hash32 checksum algorithm (based on ThwargLauncher)
+def calculate_hash32(data: bytes) -> int:
+    """Calculate the AC Hash32 checksum as used in ThwargLauncher."""
+    length = len(data)
+    checksum = (length << 16) & 0xFFFFFFFF
+
+    # Process 4-byte chunks as little-endian unsigned ints
+    for i in range(0, length - 3, 4):
+        chunk = struct.unpack('<I', data[i:i+4])[0]
+        checksum = (checksum + chunk) & 0xFFFFFFFF
+
+    # Fold any remaining bytes in, most significant position first
+    remaining_start = (length // 4) * 4
+    shift = 24
+    for i in range(remaining_start, length):
+        byte_val = data[i] << shift
+        checksum = (checksum + byte_val) & 0xFFFFFFFF
+        shift -= 8
+
+    return checksum
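+
+# Sanity-check example: for data = b"\x01\x00\x00\x00", the seed is
+# (4 << 16) = 0x00040000 and the single little-endian dword adds 1, so
+# calculate_hash32(b"\x01\x00\x00\x00") == 0x00040001.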
+
+# Create an AC EchoRequest packet for server health checks (based on ThwargLauncher)
+def create_echo_request_packet():
+    """Create an AC EchoRequest packet for server health checking."""
+    # AC packet header: sequence(4) + flags(4) + checksum(4) + id(2) + time(2) + size(2) + table(2) = 20 bytes + padding
+    packet = bytearray(32)  # 32 bytes total (0x20)
+
+    # Sequence (4 bytes) - can be 0
+    struct.pack_into('<I', packet, 0, 0)
+
+    # Flags (4 bytes) - EchoRequest = 0x02000000
+    struct.pack_into('<I', packet, 4, 0x02000000)
+
+    # Checksum placeholder (4 bytes) - overwritten below before the real Hash32 is computed
+    struct.pack_into('<I', packet, 8, 0x0BADD70D)
+
+    # ID (2 bytes) - can be 0
+    struct.pack_into('<H', packet, 12, 0)
+
+    # Time (2 bytes) - can be 0
+    struct.pack_into('<H', packet, 14, 0)
+
+    # Size (2 bytes) - header size = 32 (0x20)
+    struct.pack_into('<H', packet, 16, 32)
+
+    # Table (2 bytes) - can be 0
+    struct.pack_into('<H', packet, 18, 0)
+
+    # Compute the real AC Hash32 checksum: zero the checksum field first,
+    # hash the whole packet, then patch the result in
+    struct.pack_into('<I', packet, 8, 0)
+    checksum = calculate_hash32(bytes(packet))
+    struct.pack_into('<I', packet, 8, checksum)
+
+    return bytes(packet)
+
+AC_ECHO_PACKET = create_echo_request_packet()
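+
+# Note: check_server_health() below sends AC_LOGIN_PACKET; AC_ECHO_PACKET is
+# precomputed here but not referenced elsewhere in this diff (kept for reference).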
+
+# AC login packet for server health checks (same as ThwargLauncher's MakeLoginPacket)
+AC_LOGIN_PACKET = bytes([
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x93, 0x00, 0xd0, 0x05,
+    0x00, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00, 0x04, 0x00, 0x31, 0x38,
+    0x30, 0x32, 0x00, 0x00, 0x34, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x3e, 0xb8, 0xa8, 0x58, 0x1c, 0x00, 0x61, 0x63,
+    0x73, 0x65, 0x72, 0x76, 0x65, 0x72, 0x74, 0x72, 0x61, 0x63, 0x6b, 0x65,
+    0x72, 0x3a, 0x6a, 0x6a, 0x39, 0x68, 0x32, 0x36, 0x68, 0x63, 0x73, 0x67,
+    0x67, 0x63, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+])
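+# Decoded payload notes: bytes 0x31 0x38 0x30 0x32 spell the client version
+# "1802", and 0x1c 0x00 is the length (28) prefix for the ASCII account string
+# "acservertracker:jj9h26hcsggc" that follows.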
+
+async def check_server_health(address: str, port: int, timeout: float = 3.0) -> tuple[bool, float | None, int | None]:
+    """Check AC server health via a UDP packet.
+
+    Returns: (is_up, latency_ms, player_count). latency_ms is None when the
+    server does not respond; player_count is always None here, since the UDP
+    check covers status and latency only (counts come from TreeStats.net).
+    """
+    logger.debug(f"🔍 Starting health check for {address}:{port}")
+    start_time = time.time()
+    sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
+    sock.setblocking(False)
+
+    try:
+        # Send login packet (same as ThwargLauncher)
+        await asyncio.get_event_loop().sock_sendto(sock, AC_LOGIN_PACKET, (address, port))
+
+        # Wait for a response, with timeout
+        try:
+            data, addr = await asyncio.wait_for(
+                asyncio.get_event_loop().sock_recvfrom(sock, 1024),
+                timeout=timeout
+            )
+
+            latency_ms = (time.time() - start_time) * 1000
+            logger.debug(f"📥 Received response from {addr}: {len(data)} bytes, latency: {latency_ms:.1f}ms")
+
+            # Check for a valid response: accept both TimeSynch (0x800000) and
+            # ConnectRequest (0x40000) flags, like ThwargLauncher
+            if len(data) >= 24:
+                flags = struct.unpack('<I', data[4:8])[0]
+                if (flags & 0x800000) or (flags & 0x40000):
+                    logger.debug(f"✅ Valid server response: latency: {latency_ms:.1f}ms")
+                    return True, latency_ms, None
+
+            # Any response indicates the server is up, even if not in the expected format
+            logger.info(f"✅ Server response (non-standard format): latency: {latency_ms:.1f}ms")
+            return True, latency_ms, None
+
+        except asyncio.TimeoutError:
+            logger.debug(f"⏰ TIMEOUT: No response from {address}:{port} after {timeout}s - server down")
+            return False, None, None
+
+    except Exception as e:
+        logger.error(f"Server health check error: {e}")
+        return False, None, None
+    finally:
+        sock.close()
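+
+# Example call (mirrors how monitor_server_health() uses it below):
+#     is_up, latency_ms, _ = await check_server_health("play.coldeve.ac", 9000)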
+
+async def get_player_count_from_treestats(server_name: str) -> int:
+    """Get the player count from the TreeStats.net API (same source as ThwargLauncher)."""
+    try:
+        async with httpx.AsyncClient() as client:
+            response = await client.get("http://treestats.net/player_counts-latest.json", timeout=10)
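+            # TreeStats returns a JSON list of per-server entries, e.g.
+            # [{"server": "Coldeve", "count": 123}, ...] (shape inferred from
+            # the parsing below).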
+            if response.status_code == 200:
+                data = response.json()
+                for server_data in data:
+                    if server_data.get("server") == server_name:
+                        return server_data.get("count", 0)
+        return 0
+    except Exception as e:
+        logger.debug(f"Failed to get player count from TreeStats.net: {e}")
+        return 0
+
+async def monitor_server_health():
+    """Background task that monitors server health every 30 seconds."""
+    server_name = "Coldeve"
+    server_address = "play.coldeve.ac"
+    server_port = 9000
+    check_interval = 30  # seconds
+    player_count_interval = 300  # 5 minutes (ThwargLauncher polls every 10; we poll more often)
+    last_player_count_check = 0
+    current_player_count = None
+
+    # Initialize the server status row in the database
+    try:
+        existing = await database.fetch_one(
+            "SELECT * FROM server_status WHERE server_name = :name",
+            {"name": server_name}
+        )
+        if not existing:
+            await database.execute(
+                server_status.insert().values(
+                    server_name=server_name,
+                    current_status="unknown",
+                    total_uptime_seconds=0
+                )
+            )
+    except Exception as e:
+        logger.error(f"Failed to initialize server status: {e}")
+
+    while True:
+        try:
+            logger.debug(f"🏥 Running scheduled health check for {server_name} ({server_address}:{server_port})")
+            # Check server health via UDP (status and latency)
+            is_up, latency_ms, _ = await check_server_health(server_address, server_port)
+            status = "up" if is_up else "down"
+            now = datetime.now(timezone.utc)
+
+            # Refresh the player count from the TreeStats.net API (like ThwargLauncher)
+            current_time = time.time()
+            if current_time - last_player_count_check >= player_count_interval or current_player_count is None:
+                new_player_count = await get_player_count_from_treestats(server_name)
+                if new_player_count > 0:  # Only update if we got a valid count
+                    current_player_count = new_player_count
+                last_player_count_check = current_time
+                logger.info(f"🏥 Updated player count from TreeStats.net: {current_player_count}")
+
+            logger.debug(f"🏥 Health check result: {status}, latency: {latency_ms}, players: {current_player_count}")
+
+            # Record the health check
+            await database.execute(
+                server_health_checks.insert().values(
+                    server_name=server_name,
+                    server_address=f"{server_address}:{server_port}",
+                    timestamp=now,
+                    status=status,
+                    latency_ms=latency_ms,
+                    player_count=current_player_count
+                )
+            )
+
+            # Get the previous status
+            prev_status = await database.fetch_one(
+                "SELECT * FROM server_status WHERE server_name = :name",
+                {"name": server_name}
+            )
+
+            # Calculate uptime and detect restarts
+            last_restart = prev_status["last_restart"] if prev_status else None
+
+            if prev_status and prev_status["current_status"] == "down" and status == "up":
+                # The server came back up - treat this as a restart
+                last_restart = now
+                logger.info(f"Server {server_name} came back online")
+                # Broadcast to all browser clients
+                await _broadcast_to_browser_clients({
+                    "type": "server_status",
+                    "server": server_name,
+                    "status": "up",
+                    "message": "Server is back online"
+                })
+
+            # Calculate uptime from the last restart time (not accumulated)
+            if last_restart and status == "up":
+                uptime_seconds = int((now - last_restart).total_seconds())
+            else:
+                uptime_seconds = 0
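+            # Note: until a down -> up transition has been observed, last_restart
+            # stays NULL, so the reported uptime is 0 rather than time-since-start.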
+
+            # Update server status (always include current_player_count if we have it)
+            await database.execute(
+                """
+                INSERT INTO server_status (server_name, current_status, last_seen_up, last_restart,
+                                           total_uptime_seconds, last_check, last_latency_ms, last_player_count)
+                VALUES (:name, :status, :last_seen, :restart, :uptime, :check, :latency, :players)
+                ON CONFLICT (server_name) DO UPDATE SET
+                    current_status = :status,
+                    last_seen_up = CASE WHEN :status = 'up' THEN :last_seen ELSE server_status.last_seen_up END,
+                    last_restart = :restart,
+                    total_uptime_seconds = :uptime,
+                    last_check = :check,
+                    last_latency_ms = :latency,
+                    last_player_count = CASE WHEN :players IS NOT NULL THEN :players ELSE server_status.last_player_count END
+                """,
+                {
+                    "name": server_name,
+                    "status": status,
+                    "last_seen": now if status == "up" else None,
+                    "restart": last_restart,
+                    "uptime": uptime_seconds,
+                    "check": now,
+                    "latency": latency_ms,
+                    "players": current_player_count
+                }
+            )
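+            # The ON CONFLICT clause assumes a UNIQUE or primary-key constraint
+            # on server_status.server_name.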
+
+            # Update the cache
+            global _server_status_cache
+            _server_status_cache = {
+                "status": status,
+                "latency_ms": latency_ms,
+                "player_count": current_player_count,
+                "last_check": now.isoformat(),
+                "uptime_seconds": uptime_seconds,
+                "last_restart": last_restart.isoformat() if last_restart else None
+            }
+
+            logger.debug(f"Server health check: {status}, latency={latency_ms}ms, players={current_player_count}")
+
+        except Exception as e:
+            logger.error(f"Server health monitoring error: {e}", exc_info=True)
+
+        await asyncio.sleep(check_interval)
+
 def _track_player_changes(new_players: list) -> None:
     """Track player changes for debugging flapping issues."""
     from datetime import datetime, timezone

@@ -559,10 +840,11 @@ async def on_startup():
     else:
         raise RuntimeError(f"Could not connect to database after {max_attempts} attempts")
     # Start background cache refresh (live & trails)
-    global _cache_task, _rares_cache_task
+    global _cache_task, _rares_cache_task, _server_health_task
     _cache_task = asyncio.create_task(_refresh_cache_loop())
     _rares_cache_task = asyncio.create_task(_refresh_total_rares_cache())
-    logger.info("Background cache refresh tasks started")
+    _server_health_task = asyncio.create_task(monitor_server_health())
+    logger.info("Background cache refresh and server monitoring tasks started")
 
 @app.on_event("shutdown")
 async def on_shutdown():
     """Event handler triggered when application is shutting down.
@@ -570,7 +852,7 @@ async def on_shutdown():
     Ensures the database connection is closed cleanly.
     """
     # Stop cache refresh tasks
-    global _cache_task, _rares_cache_task
+    global _cache_task, _rares_cache_task, _server_health_task
     if _cache_task:
         logger.info("Stopping background cache refresh task")
         _cache_task.cancel()
@@ -586,6 +868,14 @@ async def on_shutdown():
             await _rares_cache_task
         except asyncio.CancelledError:
             pass
+
+    if _server_health_task:
+        logger.info("Stopping server health monitoring task")
+        _server_health_task.cancel()
+        try:
+            await _server_health_task
+        except asyncio.CancelledError:
+            pass
     logger.info("Disconnecting from database")
     await database.disconnect()
 
@@ -695,6 +985,52 @@ async def get_recent_activity():
         logger.error(f"Failed to get recent activity data: {e}", exc_info=True)
         raise HTTPException(status_code=500, detail="Internal server error")
 
+@app.get("/server-health")
+async def get_server_health():
+    """Return the current server health status."""
+    try:
+        # Fall back to the latest status from the database if the cache is stale
+        if not _server_status_cache.get("last_check") or \
+           (datetime.now(timezone.utc) - datetime.fromisoformat(_server_status_cache["last_check"].replace('Z', '+00:00')) > timedelta(minutes=2)):
+
+            row = await database.fetch_one(
+                "SELECT * FROM server_status WHERE server_name = :name",
+                {"name": "Coldeve"}
+            )
+
+            if row:
+                _server_status_cache.update({
+                    "status": row["current_status"],
+                    "latency_ms": row["last_latency_ms"],
+                    "player_count": row["last_player_count"],
+                    "last_check": row["last_check"].isoformat() if row["last_check"] else None,
+                    "uptime_seconds": row["total_uptime_seconds"],
+                    "last_restart": row["last_restart"].isoformat() if row["last_restart"] else None
+                })
+
+        # Format uptime
+        uptime_seconds = _server_status_cache.get("uptime_seconds", 0)
+        days = uptime_seconds // 86400
+        hours = (uptime_seconds % 86400) // 3600
+        minutes = (uptime_seconds % 3600) // 60
+
+        uptime_str = f"{days}d {hours}h {minutes}m" if days > 0 else f"{hours}h {minutes}m"
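+        # e.g. uptime_seconds = 90061 -> "1d 1h 1m"; under one day the "Xd" part is dropped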
+
+        return {
+            "server_name": "Coldeve",
+            "status": _server_status_cache.get("status", "unknown"),
+            "latency_ms": _server_status_cache.get("latency_ms"),
+            "player_count": _server_status_cache.get("player_count"),
+            "uptime": uptime_str,
+            "uptime_seconds": uptime_seconds,
+            "last_restart": _server_status_cache.get("last_restart"),
+            "last_check": _server_status_cache.get("last_check")
+        }
+
+    except Exception as e:
+        logger.error(f"Failed to get server health data: {e}", exc_info=True)
+        raise HTTPException(status_code=500, detail="Internal server error")
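+
+# Illustrative /server-health response shape (values are examples, not real data):
+#   {"server_name": "Coldeve", "status": "up", "latency_ms": 42.3, "player_count": 150,
+#    "uptime": "2d 4h 13m", "uptime_seconds": 187980, "last_restart": "...", "last_check": "..."}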
+
 @app.get("/live", response_model=dict)
 @app.get("/live/", response_model=dict)
 async def get_live_players():