From 9d4c724b7fcaf4437883ee1c6d3d562396625eff Mon Sep 17 00:00:00 2001 From: Erik Date: Sat, 25 Apr 2026 21:25:40 +0200 Subject: [PATCH] =?UTF-8?q?feat(agent):=20security=20hardening=20=E2=80=94?= =?UTF-8?q?=20systemd=20lockdown,=20rate=20limit,=20audit=20log?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit systemd unit now applies defense-in-depth: - ProtectSystem=strict + ProtectHome=read-only (rest of FS sealed) - ReadWritePaths only for ~/.claude (session JSONLs) and venv + audit log - InaccessiblePaths blocks /etc/shadow, /etc/ssh, /root, ~/.ssh, shell history - NoNewPrivileges + dropped capabilities (no setuid escalation, no caps) - PrivateTmp, PrivateDevices, ProtectKernel*, MemoryDenyWriteExecute - SystemCallFilter @system-service ~@privileged ~@debug ~@mount etc. - RestrictAddressFamilies blocks raw/packet sockets Application layer: - Per-user rate limit 60/hour (configurable via AGENT_RATE_MAX) - Per-user concurrency cap of 1 in-flight (no parallel claude burns) - JSONL audit log of every /agent/ask to /var/log/overlord-agent/audit.jsonl Logs username, message preview, result preview, timing, errors. Plus secrets migration: EnvironmentFile now also reads /etc/overlord/agent.env (root:erik 0640), so secrets no longer have to live under /home/erik/MosswartOverlord/.env. The old path is still read during the transition; because systemd applies EnvironmentFile= entries in order, a variable set in both files takes its value from the old .env until that file is removed. 
--- agent/overlord-agent.service | 79 +++++++++++++++++- agent/service.py | 152 ++++++++++++++++++++++++++++++++--- 2 files changed, 219 insertions(+), 12 deletions(-) diff --git a/agent/overlord-agent.service b/agent/overlord-agent.service index 5a026a81..2e05529c 100644 --- a/agent/overlord-agent.service +++ b/agent/overlord-agent.service @@ -11,19 +11,92 @@ Group=erik # - claude -p sessions land at ~/.claude/projects/-home-erik-MosswartOverlord/ # - .mcp.json is auto-loaded WorkingDirectory=/home/erik/MosswartOverlord +# Secrets moved OUT of /home/erik/ to /etc/overlord/agent.env; note that +# ProtectHome=read-only blocks writes under /home, not reads. The file is +# root-owned, mode 0640, group=erik. +EnvironmentFile=-/etc/overlord/agent.env +# Backwards-compat: old location is still read (last, so it wins on duplicates). EnvironmentFile=-/home/erik/MosswartOverlord/.env # Run inside the venv populated by install.sh. ExecStart=/home/erik/MosswartOverlord/agent/.venv/bin/python -m agent.service Restart=on-failure RestartSec=3 -# Don't tie up the disk with stdout — let journald handle it. StandardOutput=journal StandardError=journal -# Resource hints — the service is light, but cap so a runaway can't -# starve the host. +# ─── Resource caps ───────────────────────────────────────────────── MemoryLimit=512M CPUQuota=200% +TasksMax=128 + +# ─── Filesystem hardening ────────────────────────────────────────── +# The whole file system hierarchy becomes read-only except /dev, /proc and +# /sys; writes need ReadWritePaths=. Subprocesses inherit these protections. +ProtectSystem=strict +ProtectHome=read-only +# Allow writing only to the explicit paths claude / our service need. +# - ~/.claude — session JSONL files +# - .venv pycache — minor pip cache writes +ReadWritePaths=/home/erik/.claude +ReadWritePaths=/home/erik/MosswartOverlord/agent/.venv +ReadWritePaths=/var/log/overlord-agent +# Keep $HOME visible to the venv python so it can find pip cache etc. 
+# (read-only via ProtectHome=read-only — this writable carve-out is +# narrowly the .claude session dir above.) +LogsDirectory=overlord-agent +LogsDirectoryMode=0755 +PrivateTmp=true +PrivateDevices=true +ProtectClock=true +ProtectKernelTunables=true +ProtectKernelModules=true +ProtectKernelLogs=true +ProtectControlGroups=true +ProtectHostname=true +ProtectProc=invisible +ProcSubset=pid + +# Hide sensitive host paths even if something in the python or claude +# subprocess tree tries to read them. +InaccessiblePaths=/etc/shadow +InaccessiblePaths=/etc/gshadow +InaccessiblePaths=/etc/ssh +InaccessiblePaths=/root +InaccessiblePaths=-/home/erik/.ssh +InaccessiblePaths=-/home/erik/.bash_history +InaccessiblePaths=-/home/erik/.zsh_history + +# ─── Privilege & capability hardening ────────────────────────────── +NoNewPrivileges=true +CapabilityBoundingSet= +AmbientCapabilities= +LockPersonality=true +RestrictRealtime=true +RestrictSUIDSGID=true +RemoveIPC=true +MemoryDenyWriteExecute=true +RestrictNamespaces=true + +# ─── Network family restriction ──────────────────────────────────── +# Block raw/packet sockets so even a kernel-LPE-class bug can't sniff +# traffic or forge packets. We don't IPAddressAllow-restrict because +# Anthropic's Cloudflare IPs shift and the whitelist would break claude. +# If you need true egress filtering, run nftables scoped to this +# service's cgroup — that's reliable in a way IPAddressAllow isn't. 
+RestrictAddressFamilies=AF_UNIX AF_INET AF_INET6 + +# ─── Syscall filter ──────────────────────────────────────────────── +SystemCallArchitectures=native +SystemCallFilter=@system-service +SystemCallFilter=~@privileged +SystemCallFilter=~@resources +SystemCallFilter=~@debug +SystemCallFilter=~@mount +SystemCallFilter=~@cpu-emulation +SystemCallFilter=~@obsolete +SystemCallFilter=~@reboot +SystemCallFilter=~@swap +SystemCallFilter=~@raw-io [Install] WantedBy=multi-user.target diff --git a/agent/service.py b/agent/service.py index d3fbb1fb..d2b7d6a0 100644 --- a/agent/service.py +++ b/agent/service.py @@ -15,10 +15,13 @@ cookie that dereth-tracker issues. from __future__ import annotations +import asyncio import json import logging +import os import time import uuid +from collections import deque from pathlib import Path from typing import Any @@ -35,6 +38,66 @@ logging.basicConfig( ) logger = logging.getLogger("agent") +# Audit log — every /agent/ask request gets a JSONL line here, separate +# from journald so the operator can grep without root. Set to /dev/null +# to disable. Rotated externally (logrotate) if it gets big. +AUDIT_LOG_PATH = Path(os.getenv("AGENT_AUDIT_LOG", "/var/log/overlord-agent/audit.jsonl")) +audit_logger = logging.getLogger("agent.audit") +try: + AUDIT_LOG_PATH.parent.mkdir(parents=True, exist_ok=True) + _h = logging.FileHandler(AUDIT_LOG_PATH) + _h.setFormatter(logging.Formatter("%(message)s")) + audit_logger.addHandler(_h) + audit_logger.propagate = False + audit_logger.setLevel(logging.INFO) +except OSError as e: + logger.warning("audit log path %s not writable (%s); logging only via journal", AUDIT_LOG_PATH, e) + +# Rate limit: per-user count over a rolling window. Defaults are generous +# for a single human at a keyboard but block automated abuse. 
+RATE_LIMIT_WINDOW_S = int(os.getenv("AGENT_RATE_WINDOW_S", "3600")) +RATE_LIMIT_MAX = int(os.getenv("AGENT_RATE_MAX", "60")) +# Per-user concurrent request cap (no fanning out 50 calls in parallel). +CONCURRENCY_LIMIT_PER_USER = int(os.getenv("AGENT_CONCURRENCY_PER_USER", "1")) + +# Rolling timestamps of recent /agent/ask calls per user. +_rate_state: dict[str, deque[float]] = {} +# Per-user semaphores so a single user can't run multiple concurrent claude +# subprocesses (each is expensive). +_user_semaphores: dict[str, asyncio.Semaphore] = {} + + +def _check_rate_limit(username: str) -> tuple[bool, int]: + """Return (allowed, retry_after_seconds).""" + now = time.monotonic() + window = _rate_state.setdefault(username, deque()) + cutoff = now - RATE_LIMIT_WINDOW_S + while window and window[0] < cutoff: + window.popleft() + if len(window) >= RATE_LIMIT_MAX: + retry_after = int(window[0] + RATE_LIMIT_WINDOW_S - now) + 1 + return False, retry_after + window.append(now) + return True, 0 + + +def _user_semaphore(username: str) -> asyncio.Semaphore: + sem = _user_semaphores.get(username) + if sem is None: + sem = asyncio.Semaphore(CONCURRENCY_LIMIT_PER_USER) + _user_semaphores[username] = sem + return sem + + +def _audit(event: dict[str, Any]) -> None: + """Emit one JSONL line to the audit log.""" + event["timestamp"] = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()) + try: + audit_logger.info(json.dumps(event, ensure_ascii=False)) + except Exception: # noqa: BLE001 + pass + + app = FastAPI(title="Overlord Agent", version="0.1.0") @@ -101,25 +164,96 @@ async def new_session(_user: dict = Depends(auth.require_user)) -> NewSessionRes async def agent_ask( req: AskRequest, user: dict = Depends(auth.require_user) ) -> AskResponse: - """Forward a message to claude -p resuming the given session.""" - started = time.monotonic() - try: - result = await ask_claude(req.message, req.session_id) - except ClaudeError as e: - logger.warning( - "claude failed user=%s session=%s 
err=%s", user["username"], req.session_id, e + """Forward a message to claude -p resuming the given session. + + Enforces: + * Per-user rate limit (60 requests/hour by default). + * Per-user concurrency cap (1 in-flight at a time by default). + * Audit log of every request (JSONL). + """ + username = user["username"] + + # Rate limit BEFORE acquiring the user semaphore — cheaper to reject. + allowed, retry_after = _check_rate_limit(username) + if not allowed: + _audit( + { + "event": "rate_limited", + "user": username, + "session_id": req.session_id, + "retry_after_s": retry_after, + } ) - raise HTTPException(status_code=502, detail=str(e)) + raise HTTPException( + status_code=429, + detail=f"Rate limit exceeded; retry in {retry_after}s", + headers={"Retry-After": str(retry_after)}, + ) + + sem = _user_semaphore(username) + if sem.locked(): + _audit( + { + "event": "concurrency_blocked", + "user": username, + "session_id": req.session_id, + } + ) + raise HTTPException( + status_code=429, detail="A previous question is still being processed" + ) + + started = time.monotonic() + async with sem: + _audit( + { + "event": "ask_start", + "user": username, + "session_id": req.session_id, + "message": req.message[:500], + "message_len": len(req.message), + } + ) + try: + result = await ask_claude(req.message, req.session_id) + except ClaudeError as e: + elapsed_ms = int((time.monotonic() - started) * 1000) + logger.warning( + "claude failed user=%s session=%s err=%s", username, req.session_id, e + ) + _audit( + { + "event": "ask_error", + "user": username, + "session_id": req.session_id, + "error": str(e)[:500], + "elapsed_ms": elapsed_ms, + } + ) + raise HTTPException(status_code=502, detail=str(e)) elapsed_ms = int((time.monotonic() - started) * 1000) logger.info( "ask user=%s session=%s turns=%d duration_ms=%d (subprocess=%dms)", - user["username"], + username, result.session_id, result.num_turns, elapsed_ms, result.duration_ms, ) + _audit( + { + "event": "ask_ok", + 
"user": username, + "session_id": result.session_id, + "result_preview": (result.result or "")[:300], + "result_len": len(result.result or ""), + "turns": result.num_turns, + "elapsed_ms": elapsed_ms, + "subprocess_ms": result.duration_ms, + "is_error": result.is_error, + } + ) return AskResponse( result=result.result,