Add server tracking and server metrics

erik 2025-06-20 07:17:01 +00:00
parent 80a0a16bab
commit ca12f4807b
5 changed files with 567 additions and 6 deletions


@@ -126,6 +126,41 @@ character_inventories = Table(
UniqueConstraint("character_name", "item_id", name="uq_char_item"),
)
# Server health monitoring tables
server_health_checks = Table(
# Time-series data for server health checks
"server_health_checks",
metadata,
Column("id", Integer, primary_key=True),
Column("server_name", String, nullable=False, index=True),
Column("server_address", String, nullable=False),
Column("timestamp", DateTime(timezone=True), nullable=False, default=sqlalchemy.func.now()),
Column("status", String(10), nullable=False), # 'up' or 'down'
Column("latency_ms", Float, nullable=True),
Column("player_count", Integer, nullable=True),
)
server_status = Table(
# Current server status and uptime tracking
"server_status",
metadata,
Column("server_name", String, primary_key=True),
Column("current_status", String(10), nullable=False),
Column("last_seen_up", DateTime(timezone=True), nullable=True),
Column("last_restart", DateTime(timezone=True), nullable=True),
Column("total_uptime_seconds", BigInteger, default=0),
Column("last_check", DateTime(timezone=True), nullable=True),
Column("last_latency_ms", Float, nullable=True),
Column("last_player_count", Integer, nullable=True),
)
# Index for efficient server health check queries
Index(
'ix_server_health_checks_name_ts',
server_health_checks.c.server_name,
server_health_checks.c.timestamp.desc()
)
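As a hedged illustration (not part of this commit) of the query pattern the composite index above is meant to serve, the most recent checks for one server could be fetched from inside any async handler using the `database` connection object used elsewhere in this commit; the LIMIT and server name are only examples:
recent_checks = await database.fetch_all(
    "SELECT timestamp, status, latency_ms, player_count "
    "FROM server_health_checks "
    "WHERE server_name = :name "
    "ORDER BY timestamp DESC LIMIT 100",  # newest-first scan over ix_server_health_checks_name_ts
    {"name": "Coldeve"}
)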
async def init_db_async():
"""Initialize PostgreSQL/TimescaleDB schema and hypertable.

main.py

@@ -13,6 +13,9 @@ import sys
import time
from typing import Dict, List, Any
from pathlib import Path
import asyncio
import socket
import struct
from fastapi import FastAPI, Header, HTTPException, Query, WebSocket, WebSocketDisconnect, Request
from fastapi.responses import JSONResponse, Response
@@ -34,6 +37,8 @@ from db_async import (
spawn_events,
rare_events,
character_inventories,
server_health_checks,
server_status,
init_db_async
)
import asyncio
@@ -83,6 +88,282 @@ _total_query_time = 0.0
_recent_telemetry_messages = []
_max_recent_messages = 50
# Server health monitoring
_server_health_task = None
_server_status_cache = {
"status": "unknown",
"latency_ms": None,
"player_count": None,
"last_check": None,
"uptime_seconds": 0,
"last_restart": None
}
# AC Hash32 checksum algorithm (based on ThwargLauncher)
def calculate_hash32(data: bytes) -> int:
"""Calculate AC Hash32 checksum as used in ThwargLauncher."""
length = len(data)
checksum = (length << 16) & 0xFFFFFFFF
# Process 4-byte chunks
for i in range(0, length - 3, 4):
chunk = struct.unpack('<I', data[i:i+4])[0]
checksum = (checksum + chunk) & 0xFFFFFFFF
# Handle remaining bytes
remaining_start = (length // 4) * 4
shift = 24
for i in range(remaining_start, length):
byte_val = data[i] << shift
checksum = (checksum + byte_val) & 0xFFFFFFFF
shift -= 8
return checksum
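A quick sanity check derived purely from the code above (not from any AC protocol documentation): for an all-zero 32-byte buffer every chunk contributes nothing, so only the length term survives:
assert calculate_hash32(bytes(32)) == (32 << 16)  # 0x00200000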
# Create AC EchoRequest packet for server health check (based on ThwargLauncher)
def create_echo_request_packet():
"""Create an AC EchoRequest packet for server health checking."""
# AC packet header: sequence(4) + flags(4) + checksum(4) + id(2) + time(2) + size(2) + table(2) = 20 bytes + padding
packet = bytearray(32) # 32 bytes total (0x20)
# Sequence (4 bytes) - can be 0
struct.pack_into('<I', packet, 0, 0)
# Flags (4 bytes) - EchoRequest = 0x02000000
struct.pack_into('<I', packet, 4, 0x02000000)
# Temporary checksum (4 bytes) - required for proper checksum calculation
struct.pack_into('<I', packet, 8, 0x0BADD70D)
# ID (2 bytes) - can be 0
struct.pack_into('<H', packet, 12, 0)
# Time (2 bytes) - can be 0
struct.pack_into('<H', packet, 14, 0)
# Size (2 bytes) - header size = 32 (0x20)
struct.pack_into('<H', packet, 16, 32)
# Table (2 bytes) - can be 0
struct.pack_into('<H', packet, 18, 0)
# Calculate proper AC Hash32 checksum
# First, set checksum field to 0
struct.pack_into('<I', packet, 8, 0)
# Calculate checksum using Hash32 algorithm
checksum = calculate_hash32(bytes(packet))
struct.pack_into('<I', packet, 8, checksum)
return bytes(packet)
AC_ECHO_PACKET = create_echo_request_packet()
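Working the same algorithm by hand over the header built above (flags 0x02000000 at offset 4, size 0x20 at offset 16, everything else zero once the checksum field is cleared) gives 0x00200000 + 0x02000000 + 0x20, so the stored checksum should come out to 0x02200020; shown only as a derived sanity check:
assert struct.unpack('<I', AC_ECHO_PACKET[8:12])[0] == 0x02200020  # length<<16 + flags + size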
# AC login packet for server health check (same as ThwargLauncher MakeLoginPacket)
AC_LOGIN_PACKET = bytes([
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x93, 0x00, 0xd0, 0x05,
0x00, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00, 0x04, 0x00, 0x31, 0x38,
0x30, 0x32, 0x00, 0x00, 0x34, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x3e, 0xb8, 0xa8, 0x58, 0x1c, 0x00, 0x61, 0x63,
0x73, 0x65, 0x72, 0x76, 0x65, 0x72, 0x74, 0x72, 0x61, 0x63, 0x6b, 0x65,
0x72, 0x3a, 0x6a, 0x6a, 0x39, 0x68, 0x32, 0x36, 0x68, 0x63, 0x73, 0x67,
0x67, 0x63, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
])
async def check_server_health(address: str, port: int, timeout: float = 3.0) -> tuple[bool, float | None, int | None]:
"""Check AC server health via UDP packet.
Returns: (is_up, latency_ms, player_count); latency_ms is None when the server is down,
and player_count is always None here since counts come from the TreeStats.net API.
"""
logger.debug(f"🔍 Starting health check for {address}:{port}")
start_time = time.time()
sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
sock.setblocking(False)
try:
# Send login packet (same as ThwargLauncher)
await asyncio.get_event_loop().sock_sendto(sock, AC_LOGIN_PACKET, (address, port))
# Wait for response with timeout
try:
data, addr = await asyncio.wait_for(
asyncio.get_event_loop().sock_recvfrom(sock, 1024),
timeout=timeout
)
latency_ms = (time.time() - start_time) * 1000
logger.debug(f"📥 Received response from {addr}: {len(data)} bytes, latency: {latency_ms:.1f}ms")
# Check if valid response (support both TimeSynch 0x800000 and ConnectRequest 0x40000)
if len(data) >= 24:
flags = struct.unpack('<I', data[4:8])[0]
# Accept both TimeSynch (0x800000) and ConnectRequest (0x40000) as valid responses
if (flags & 0x800000) or (flags & 0x40000):
# UDP health check is for server status and latency only
# Player count comes from TreeStats.net API (like ThwargLauncher)
logger.debug(f"✅ Valid server response: latency: {latency_ms:.1f}ms")
return True, latency_ms, None
# Any response indicates server is up, even if not the expected format
logger.info(f"✅ Server response (non-standard format): latency: {latency_ms:.1f}ms")
return True, latency_ms, None
except asyncio.TimeoutError:
logger.debug(f"⏰ TIMEOUT: No response from {address}:{port} after {timeout}s - server down")
return False, None, None
except Exception as e:
logger.error(f"Server health check error: {e}")
return False, None, None
finally:
sock.close()
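For reference, a minimal usage sketch that mirrors the call monitor_server_health makes below; the address and port are the Coldeve defaults from that function, not new configuration:
is_up, latency_ms, _ = await check_server_health("play.coldeve.ac", 9000, timeout=3.0)
# e.g. (True, 42.3, None) when the server answers, (False, None, None) on timeout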
async def get_player_count_from_treestats(server_name: str) -> int:
"""Get player count from TreeStats.net API (same as ThwargLauncher)."""
try:
async with httpx.AsyncClient() as client:
response = await client.get("http://treestats.net/player_counts-latest.json", timeout=10)
if response.status_code == 200:
data = response.json()
for server_data in data:
if server_data.get("server") == server_name:
return server_data.get("count", 0)
return 0
except Exception as e:
logger.debug(f"Failed to get player count from TreeStats.net: {e}")
return 0
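The parser above assumes the TreeStats payload is a JSON array of per-server objects and reads only the "server" and "count" keys; a minimal sketch of that assumed shape (values illustrative):
sample_payload = [
    {"server": "Coldeve", "count": 123},     # matched: the function would return 123
    {"server": "OtherServer", "count": 45},  # ignored for server_name == "Coldeve"
]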
async def monitor_server_health():
"""Background task to monitor server health every 30 seconds."""
server_name = "Coldeve"
server_address = "play.coldeve.ac"
server_port = 9000
check_interval = 30 # seconds
player_count_interval = 300 # 5 minutes (like ThwargLauncher's 10 minutes, but more frequent)
last_player_count_check = 0
current_player_count = None
# Initialize server status in database
try:
existing = await database.fetch_one(
"SELECT * FROM server_status WHERE server_name = :name",
{"name": server_name}
)
if not existing:
await database.execute(
server_status.insert().values(
server_name=server_name,
current_status="unknown",
total_uptime_seconds=0
)
)
except Exception as e:
logger.error(f"Failed to initialize server status: {e}")
while True:
try:
logger.debug(f"🏥 Running scheduled health check for {server_name} ({server_address}:{server_port})")
# Check server health via UDP (for status and latency)
is_up, latency_ms, _ = await check_server_health(server_address, server_port)
status = "up" if is_up else "down"
now = datetime.now(timezone.utc)
# Get player count from TreeStats.net API (like ThwargLauncher)
current_time = time.time()
if current_time - last_player_count_check >= player_count_interval or current_player_count is None:
new_player_count = await get_player_count_from_treestats(server_name)
if new_player_count > 0: # Only update if we got a valid count
current_player_count = new_player_count
last_player_count_check = current_time
logger.info(f"🏥 Updated player count from TreeStats.net: {current_player_count}")
logger.debug(f"🏥 Health check result: {status}, latency: {latency_ms}, players: {current_player_count}")
# Record health check
await database.execute(
server_health_checks.insert().values(
server_name=server_name,
server_address=f"{server_address}:{server_port}",
timestamp=now,
status=status,
latency_ms=latency_ms,
player_count=current_player_count
)
)
# Get previous status
prev_status = await database.fetch_one(
"SELECT * FROM server_status WHERE server_name = :name",
{"name": server_name}
)
# Calculate uptime and detect restarts
last_restart = prev_status["last_restart"] if prev_status else None
if prev_status and prev_status["current_status"] == "down" and status == "up":
# Server came back up - this is a restart
last_restart = now
logger.info(f"Server {server_name} came back online")
# Broadcast to all browser clients
await _broadcast_to_browser_clients({
"type": "server_status",
"server": server_name,
"status": "up",
"message": "Server is back online"
})
# Calculate uptime from last restart time (not accumulated)
if last_restart and status == "up":
uptime_seconds = int((now - last_restart).total_seconds())
else:
uptime_seconds = 0
# Update server status (always include current_player_count if we have it)
await database.execute(
"""
INSERT INTO server_status (server_name, current_status, last_seen_up, last_restart,
total_uptime_seconds, last_check, last_latency_ms, last_player_count)
VALUES (:name, :status, :last_seen, :restart, :uptime, :check, :latency, :players)
ON CONFLICT (server_name) DO UPDATE SET
current_status = :status,
last_seen_up = CASE WHEN :status = 'up' THEN :last_seen ELSE server_status.last_seen_up END,
last_restart = :restart,
total_uptime_seconds = :uptime,
last_check = :check,
last_latency_ms = :latency,
last_player_count = CASE WHEN :players IS NOT NULL THEN :players ELSE server_status.last_player_count END
""",
{
"name": server_name,
"status": status,
"last_seen": now if status == "up" else None,
"restart": last_restart,
"uptime": uptime_seconds,
"check": now,
"latency": latency_ms,
"players": current_player_count
}
)
# Update cache
global _server_status_cache
_server_status_cache = {
"status": status,
"latency_ms": latency_ms,
"player_count": current_player_count,
"last_check": now.isoformat(),
"uptime_seconds": uptime_seconds,
"last_restart": last_restart.isoformat() if last_restart else None
}
logger.debug(f"Server health check: {status}, latency={latency_ms}ms, players={current_player_count}")
except Exception as e:
logger.error(f"Server health monitoring error: {e}", exc_info=True)
await asyncio.sleep(check_interval)
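A small worked example of the uptime rule above (timestamps illustrative): once a down-to-up transition stamps last_restart, uptime is simply the time elapsed since that stamp rather than an accumulated counter:
from datetime import datetime, timedelta, timezone
last_restart = datetime(2025, 6, 20, 6, 0, tzinfo=timezone.utc)  # hypothetical restart time
now = last_restart + timedelta(hours=1, minutes=30)
uptime_seconds = int((now - last_restart).total_seconds())       # 5400, i.e. "1h 30m"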
def _track_player_changes(new_players: list) -> None:
"""Track player changes for debugging flapping issues."""
from datetime import datetime, timezone
@@ -559,10 +840,11 @@ async def on_startup():
else:
raise RuntimeError(f"Could not connect to database after {max_attempts} attempts")
# Start background cache refresh (live & trails)
global _cache_task, _rares_cache_task
global _cache_task, _rares_cache_task, _server_health_task
_cache_task = asyncio.create_task(_refresh_cache_loop())
_rares_cache_task = asyncio.create_task(_refresh_total_rares_cache())
logger.info("Background cache refresh tasks started")
_server_health_task = asyncio.create_task(monitor_server_health())
logger.info("Background cache refresh and server monitoring tasks started")
@app.on_event("shutdown")
async def on_shutdown():
"""Event handler triggered when application is shutting down.
@@ -570,7 +852,7 @@ async def on_shutdown():
Ensures the database connection is closed cleanly.
"""
# Stop cache refresh tasks
global _cache_task, _rares_cache_task
global _cache_task, _rares_cache_task, _server_health_task
if _cache_task:
logger.info("Stopping background cache refresh task")
_cache_task.cancel()
@@ -586,6 +868,14 @@
await _rares_cache_task
except asyncio.CancelledError:
pass
if _server_health_task:
logger.info("Stopping server health monitoring task")
_server_health_task.cancel()
try:
await _server_health_task
except asyncio.CancelledError:
pass
logger.info("Disconnecting from database")
await database.disconnect()
@@ -695,6 +985,52 @@ async def get_recent_activity():
logger.error(f"Failed to get recent activity data: {e}", exc_info=True)
raise HTTPException(status_code=500, detail="Internal server error")
@app.get("/server-health")
async def get_server_health():
"""Return current server health status."""
try:
# Get latest status from database if cache is stale
if not _server_status_cache.get("last_check") or \
(datetime.now(timezone.utc) - datetime.fromisoformat(_server_status_cache["last_check"].replace('Z', '+00:00')) > timedelta(minutes=2)):
row = await database.fetch_one(
"SELECT * FROM server_status WHERE server_name = :name",
{"name": "Coldeve"}
)
if row:
_server_status_cache.update({
"status": row["current_status"],
"latency_ms": row["last_latency_ms"],
"player_count": row["last_player_count"],
"last_check": row["last_check"].isoformat() if row["last_check"] else None,
"uptime_seconds": row["total_uptime_seconds"],
"last_restart": row["last_restart"].isoformat() if row["last_restart"] else None
})
# Format uptime
uptime_seconds = _server_status_cache.get("uptime_seconds", 0)
days = uptime_seconds // 86400
hours = (uptime_seconds % 86400) // 3600
minutes = (uptime_seconds % 3600) // 60
uptime_str = f"{days}d {hours}h {minutes}m" if days > 0 else f"{hours}h {minutes}m"
return {
"server_name": "Coldeve",
"status": _server_status_cache.get("status", "unknown"),
"latency_ms": _server_status_cache.get("latency_ms"),
"player_count": _server_status_cache.get("player_count"),
"uptime": uptime_str,
"uptime_seconds": uptime_seconds,
"last_restart": _server_status_cache.get("last_restart"),
"last_check": _server_status_cache.get("last_check")
}
except Exception as e:
logger.error(f"Failed to get server health data: {e}", exc_info=True)
raise HTTPException(status_code=500, detail="Internal server error")
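A hedged example of what this endpoint returns, handy when wiring up the sidebar widget below; the base URL and every field value are illustrative, only the key names come from the handler above:
import httpx
resp = httpx.get("http://localhost:8000/server-health")  # host/port are an assumption
print(resp.json())
# {"server_name": "Coldeve", "status": "up", "latency_ms": 42.3, "player_count": 123,
#  "uptime": "1d 4h 30m", "uptime_seconds": 102600,
#  "last_restart": "2025-06-19T02:47:01+00:00", "last_check": "2025-06-20T07:17:01+00:00"}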
@app.get("/live", response_model=dict)
@app.get("/live/", response_model=dict)
async def get_live_players():


@@ -14,10 +14,23 @@
<!-- Sidebar for active players list and filters -->
<aside id="sidebar">
<!-- Container for sort and filter controls -->
<div id="sortButtons" class="sort-buttons"></div>
<h2 id="activePlayersHeader">Active Mosswart Enjoyers</h2>
<!-- Server Status -->
<div id="serverStatus" class="server-status-container">
<h3>Coldeve Server Status</h3>
<div class="status-indicator">
<span class="status-dot" id="statusDot"></span>
<span id="statusText">Checking...</span>
</div>
<div class="status-details">
<div>Players: <span id="playerCount">-</span></div>
<div>Latency: <span id="latencyMs">-</span> ms</div>
<div>Uptime: <span id="uptime">-</span></div>
<div>Last Restart: <span id="lastRestart">-</span></div>
</div>
</div>
<!-- Total rares counter -->
<div id="totalRaresCounter" class="total-rares-counter">
🔥 Total Rares: <span id="totalRaresCount">Loading...</span>
@@ -60,6 +73,9 @@
</a>
</div>
<!-- Container for sort and filter controls -->
<div id="sortButtons" class="sort-buttons"></div>
<!-- Text input to filter active players by name -->
<input type="text" id="playerFilter" class="player-filter" placeholder="Filter players..." />


@@ -1067,13 +1067,87 @@ function updateTotalRaresDisplay(data) {
}
}
async function pollServerHealth() {
try {
const response = await fetch(`${API_BASE}/server-health`);
const data = await response.json();
updateServerStatusDisplay(data);
} catch (e) {
console.error('Server health fetch failed:', e);
updateServerStatusDisplay({ status: 'error' });
}
}
function updateServerStatusDisplay(data) {
const statusDot = document.getElementById('statusDot');
const statusText = document.getElementById('statusText');
const playerCount = document.getElementById('playerCount');
const latencyMs = document.getElementById('latencyMs');
const uptime = document.getElementById('uptime');
const lastRestart = document.getElementById('lastRestart');
if (!statusDot || !statusText) return;
// Update status indicator
const status = data.status || 'unknown';
statusDot.className = `status-dot status-${status}`;
statusText.textContent = status.charAt(0).toUpperCase() + status.slice(1);
// Update player count
if (playerCount) {
playerCount.textContent = data.player_count !== null && data.player_count !== undefined ? data.player_count : '-';
}
// Update latency
if (latencyMs) {
latencyMs.textContent = data.latency_ms ? Math.round(data.latency_ms) : '-';
}
// Update uptime
if (uptime) {
uptime.textContent = data.uptime || '-';
}
// Update last restart with Stockholm timezone (24h format, no year)
if (lastRestart) {
if (data.last_restart) {
const restartDate = new Date(data.last_restart);
const formattedDate = restartDate.toLocaleString('sv-SE', {
timeZone: 'Europe/Stockholm',
month: '2-digit',
day: '2-digit',
hour: '2-digit',
minute: '2-digit',
hour12: false
});
lastRestart.textContent = formattedDate;
} else {
lastRestart.textContent = 'Unknown';
}
}
}
function handleServerStatusUpdate(msg) {
// Handle real-time server status updates via WebSocket
if (msg.status === 'up' && msg.message) {
// Show notification for server coming back online
console.log(`Server Status: ${msg.message}`);
}
// Trigger an immediate server health poll to refresh the display
pollServerHealth();
}
function startPolling() {
if (pollID !== null) return;
pollLive();
pollTotalRares(); // Initial fetch
pollServerHealth(); // Initial server health check
pollID = setInterval(pollLive, POLL_MS);
// Poll total rares every 5 minutes (300,000 ms)
setInterval(pollTotalRares, 300000);
// Poll server health every 30 seconds (30,000 ms)
setInterval(pollServerHealth, 30000);
}
img.onload = () => {
@@ -1091,6 +1165,12 @@ img.onload = () => {
initHeatMap();
};
// Ensure server health polling starts regardless of image loading
document.addEventListener('DOMContentLoaded', () => {
// Start server health polling immediately on DOM ready
pollServerHealth();
});
/* ---------- rendering sorted list & dots ------------------------ */
/**
* Filter and sort the currentPlayers, then render them.
@@ -1293,6 +1373,8 @@ function initWebSocket() {
updateVitalsDisplay(msg);
} else if (msg.type === 'rare') {
triggerEpicRareNotification(msg.character_name, msg.name);
} else if (msg.type === 'server_status') {
handleServerStatusUpdate(msg);
}
});
socket.addEventListener('close', () => setTimeout(initWebSocket, 2000));


@@ -203,6 +203,98 @@ body {
to { text-shadow: 0 0 18px rgba(255, 255, 255, 0.9), 0 0 25px rgba(136, 102, 255, 0.5); }
}
/* Server Status Styling */
.server-status-container {
margin: 0 0 16px 0;
padding: 12px;
background: linear-gradient(135deg, #2a4a2a, #1a3a1a);
border: 2px solid #44aa44;
border-radius: 8px;
box-shadow: 0 3px 8px rgba(0, 0, 0, 0.4);
}
.server-status-container h3 {
margin: 0 0 10px 0;
font-size: 1.1rem;
color: #aaffaa;
text-align: center;
font-weight: 600;
}
.status-indicator {
display: flex;
align-items: center;
justify-content: center;
margin-bottom: 8px;
font-weight: 600;
font-size: 1rem;
}
.status-dot {
width: 12px;
height: 12px;
border-radius: 50%;
margin-right: 8px;
box-shadow: 0 0 6px rgba(0, 0, 0, 0.3);
}
.status-dot.status-up {
background-color: #44ff44;
box-shadow: 0 0 8px rgba(68, 255, 68, 0.6);
animation: status-pulse-up 2s ease-in-out infinite;
}
.status-dot.status-down {
background-color: #ff4444;
box-shadow: 0 0 8px rgba(255, 68, 68, 0.6);
animation: status-pulse-down 2s ease-in-out infinite;
}
.status-dot.status-unknown,
.status-dot.status-error {
background-color: #ffaa44;
box-shadow: 0 0 8px rgba(255, 170, 68, 0.6);
}
@keyframes status-pulse-up {
0%, 100% {
box-shadow: 0 0 8px rgba(68, 255, 68, 0.6);
}
50% {
box-shadow: 0 0 16px rgba(68, 255, 68, 0.9);
}
}
@keyframes status-pulse-down {
0%, 100% {
box-shadow: 0 0 8px rgba(255, 68, 68, 0.6);
}
50% {
box-shadow: 0 0 16px rgba(255, 68, 68, 0.9);
}
}
.status-details {
font-size: 0.85rem;
color: #ccc;
line-height: 1.6;
display: grid;
grid-template-columns: 1fr 1fr;
gap: 8px 16px;
}
.status-details div {
display: flex;
align-items: center;
white-space: nowrap;
}
.status-details span {
color: #fff;
font-weight: 500;
margin-left: 6px;
}
.total-kills-counter {
margin: 0 0 12px 0;
padding: 8px 12px;