feat(agent): security hardening — systemd lockdown, rate limit, audit log

systemd unit now applies defense-in-depth:
- ProtectSystem=strict + ProtectHome=read-only (rest of FS sealed)
- ReadWritePaths only for ~/.claude (session JSONLs) and venv + audit log
- InaccessiblePaths blocks /etc/shadow, /etc/ssh, /root, ~/.ssh, shell history
- NoNewPrivileges + dropped capabilities (no setuid escalation, no caps)
- PrivateTmp, PrivateDevices, ProtectKernel*, MemoryDenyWriteExecute
- SystemCallFilter @system-service ~@privileged ~@debug ~@mount etc.
- RestrictAddressFamilies blocks raw/packet sockets

Application layer:
- Per-user rate limit 60/hour (configurable via AGENT_RATE_MAX)
- Per-user concurrency cap of 1 in-flight (no parallel claude burns)
- JSONL audit log of every /agent/ask to /var/log/overlord-agent/audit.jsonl
  Logs username, message preview, result preview, timing, errors.

Plus secrets migration: EnvironmentFile now prefers /etc/overlord/agent.env (root:erik 0640) over /home/erik/MosswartOverlord/.env, so even the read-only /home doesn't expose them. Falls back to old path during transition.
2026-04-25 21:25:40 +02:00 · 2026-04-25 21:25:40 +02:00 · 9d4c724b7f
commit 9d4c724b7f
parent 4ae18536be
2 changed files with 219 additions and 12 deletions
--- a/agent/overlord-agent.service
+++ b/agent/overlord-agent.service
@ -11,19 +11,92 @@ Group=erik
 #   - claude -p sessions land at ~/.claude/projects/-home-erik-MosswartOverlord/
 #   - .mcp.json is auto-loaded
 WorkingDirectory=/home/erik/MosswartOverlord
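 # Secrets moved OUT of /home/erik/ to /etc/overlord/agent.env.
 # ProtectHome=read-only only blocks writes under /home (reads still
 # work), so keeping the secrets outside /home is what stops them being
 # exposed there. The file is root-owned, mode 0640, group=erik.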
 EnvironmentFile=-/etc/overlord/agent.env
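 # Backwards-compat: also read the old location during transition.
 # NOTE: systemd reads EnvironmentFile= entries in order and a later
 # setting overrides an earlier one, so any variable still present in the
 # old .env below wins over the value in /etc/overlord/agent.env.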
 EnvironmentFile=-/home/erik/MosswartOverlord/.env
 # Run inside the venv populated by install.sh.
 ExecStart=/home/erik/MosswartOverlord/agent/.venv/bin/python -m agent.service
 Restart=on-failure
 RestartSec=3
 # Don't tie up the disk with stdout — let journald handle it.
 StandardOutput=journal
 StandardError=journal
-# Resource hints — the service is light, but cap so a runaway can't
+# ─── Resource caps ─────────────────────────────────────────────────
 # Cap resources so a runaway can't starve the host.
 MemoryLimit=512M
 CPUQuota=200%
 TasksMax=128
 # ─── Filesystem hardening ──────────────────────────────────────────
 # ProtectSystem=strict mounts the entire file system hierarchy read-only
 # (except /dev, /proc and /sys); anything that must stay writable has to
 # be listed in ReadWritePaths=. Subprocesses inherit these protections.
 ProtectSystem=strict
 ProtectHome=read-only
 # Allow writing only to the explicit paths claude / our service need.
 # - ~/.claude — session JSONL files
 # - .venv pycache — minor pip cache writes
 ReadWritePaths=/home/erik/.claude
 ReadWritePaths=/home/erik/MosswartOverlord/agent/.venv
 ReadWritePaths=/var/log/overlord-agent
 # $HOME stays visible to the venv python (read-only via
 # ProtectHome=read-only) so it can find pip cache etc.; the only
 # writable locations are the ReadWritePaths= carve-outs above.
 LogsDirectory=overlord-agent
 LogsDirectoryMode=0755
 PrivateTmp=true
 PrivateDevices=true
 ProtectClock=true
 ProtectKernelTunables=true
 ProtectKernelModules=true
 ProtectKernelLogs=true
 ProtectControlGroups=true
 ProtectHostname=true
 ProtectProc=invisible
 ProcSubset=pid
 # Hide sensitive host paths even if something in the python or claude
 # subprocess tree tries to read them.
 InaccessiblePaths=/etc/shadow
 InaccessiblePaths=/etc/gshadow
 InaccessiblePaths=/etc/ssh
 InaccessiblePaths=/root
 InaccessiblePaths=-/home/erik/.ssh
 InaccessiblePaths=-/home/erik/.bash_history
 InaccessiblePaths=-/home/erik/.zsh_history
 # ─── Privilege & capability hardening ──────────────────────────────
 NoNewPrivileges=true
 CapabilityBoundingSet=
 AmbientCapabilities=
 LockPersonality=true
 RestrictRealtime=true
 RestrictSUIDSGID=true
 RemoveIPC=true
 MemoryDenyWriteExecute=true
 RestrictNamespaces=true
 # ─── Network family restriction ────────────────────────────────────
 # Block raw/packet sockets so even a kernel-LPE-class bug can't sniff
 # traffic or forge packets. We don't IPAddressAllow-restrict because
 # Anthropic's Cloudflare IPs shift and the whitelist would break claude.
 # If you need true egress filtering, run nftables scoped to this
 # service's cgroup — that's reliable in a way IPAddressAllow isn't.
 RestrictAddressFamilies=AF_UNIX AF_INET AF_INET6
 # ─── Syscall filter ────────────────────────────────────────────────
 SystemCallArchitectures=native
 SystemCallFilter=@system-service
 SystemCallFilter=~@privileged
 SystemCallFilter=~@resources
 SystemCallFilter=~@debug
 SystemCallFilter=~@mount
 SystemCallFilter=~@cpu-emulation
 SystemCallFilter=~@obsolete
 SystemCallFilter=~@reboot
 SystemCallFilter=~@swap
 SystemCallFilter=~@raw-io
 [Install]
 WantedBy=multi-user.target
--- a/agent/service.py
+++ b/agent/service.py
@ -15,10 +15,13 @@ cookie that dereth-tracker issues.
 from __future__ import annotations
 import asyncio
 import json
 import logging
 import os
 import time
 import uuid
 from collections import deque
 from pathlib import Path
 from typing import Any
@ -35,6 +38,66 @@ logging.basicConfig(
 )
 logger = logging.getLogger("agent")
 # Audit log — every /agent/ask request gets a JSONL line here, separate
 # from journald so the operator can grep without root. Set
 # AGENT_AUDIT_LOG=/dev/null to disable. Rotated externally (logrotate)
 # if it gets big.
 AUDIT_LOG_PATH = Path(os.getenv("AGENT_AUDIT_LOG", "/var/log/overlord-agent/audit.jsonl"))
 audit_logger = logging.getLogger("agent.audit")
 try:
    # Create the log directory if it is missing, then attach a file handler
    # that writes each record's message verbatim (one JSON object per line).
    AUDIT_LOG_PATH.parent.mkdir(parents=True, exist_ok=True)
    _h = logging.FileHandler(AUDIT_LOG_PATH)
    _h.setFormatter(logging.Formatter("%(message)s"))
    audit_logger.addHandler(_h)
    # Only reached when the file handler was attached: keep audit records
    # out of the parent loggers so they are not duplicated there.
    audit_logger.propagate = False
    audit_logger.setLevel(logging.INFO)
 except OSError as e:
    # No handler was attached and propagation is left at its default, so
    # audit records fall through to the parent loggers instead of the file.
    logger.warning("audit log path %s not writable (%s); logging only via journal", AUDIT_LOG_PATH, e)
 # Rate limit: per-user count over a rolling window. Defaults are generous
 # for a single human at a keyboard but block automated abuse.
 # Both values are read once at import time from the environment.
 RATE_LIMIT_WINDOW_S = int(os.getenv("AGENT_RATE_WINDOW_S", "3600"))
 RATE_LIMIT_MAX = int(os.getenv("AGENT_RATE_MAX", "60"))
 # Per-user concurrent request cap (no fanning out 50 calls in parallel).
 CONCURRENCY_LIMIT_PER_USER = int(os.getenv("AGENT_CONCURRENCY_PER_USER", "1"))
 # Rolling timestamps of recent /agent/ask calls per user, keyed by
 # username. Held in process memory only, so the counts reset on restart.
 _rate_state: dict[str, deque[float]] = {}
 # Per-user semaphores so a single user can't run multiple concurrent claude
 # subprocesses (each is expensive). Created lazily, one per username.
 _user_semaphores: dict[str, asyncio.Semaphore] = {}
 def _check_rate_limit(username: str) -> tuple[bool, int]:
    """Record a request for *username* and report whether it is allowed.

    Timestamps older than ``RATE_LIMIT_WINDOW_S`` seconds are dropped from
    the user's window first. If the window still holds ``RATE_LIMIT_MAX``
    entries the request is rejected; otherwise the current time is
    appended, so an allowed call consumes one slot.

    Returns:
        ``(allowed, retry_after_seconds)``. ``retry_after_seconds`` is 0
        when the request is allowed; otherwise it is the number of seconds
        until the oldest recorded request leaves the window, rounded up to
        at least 1.
    """
    if RATE_LIMIT_MAX <= 0:
        # A non-positive limit admits nobody. Bail out before touching the
        # window: it would be empty, and reading window[0] below would
        # raise IndexError instead of producing a clean rejection.
        return False, max(1, RATE_LIMIT_WINDOW_S)
    # Monotonic clock: immune to wall-clock adjustments.
    now = time.monotonic()
    window = _rate_state.setdefault(username, deque())
    cutoff = now - RATE_LIMIT_WINDOW_S
    # Timestamps are appended in increasing order, so expired entries are
    # always at the left end.
    while window and window[0] < cutoff:
        window.popleft()
    if len(window) >= RATE_LIMIT_MAX:
        # The oldest entry is the next one to expire; +1 rounds the
        # truncated value up so the caller never retries too early.
        retry_after = int(window[0] + RATE_LIMIT_WINDOW_S - now) + 1
        return False, retry_after
    window.append(now)
    return True, 0
 def _user_semaphore(username: str) -> asyncio.Semaphore:
    """Return the semaphore for *username*, creating it on first use.

    A new semaphore is sized by ``CONCURRENCY_LIMIT_PER_USER`` and stored
    in ``_user_semaphores`` so later calls for the same user share it.
    """
    existing = _user_semaphores.get(username)
    if existing is not None:
        return existing
    # First request seen for this user: create and remember the semaphore.
    created = asyncio.Semaphore(CONCURRENCY_LIMIT_PER_USER)
    _user_semaphores[username] = created
    return created
 def _audit(event: dict[str, Any]) -> None:
    """Emit one JSONL line describing *event* to the audit log.

    The record is a copy of *event* with a UTC ``timestamp`` field
    (``YYYY-MM-DDTHH:MM:SSZ``) added or overwritten; the caller's dict is
    left untouched. Auditing is best-effort: a failure to serialize or
    write the record is reported on the main service logger rather than
    raised, so it does not fail the request being audited.
    """
    stamp = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())
    # Build a new dict instead of mutating the caller's argument.
    record = {**event, "timestamp": stamp}
    try:
        audit_logger.info(json.dumps(record, ensure_ascii=False))
    except Exception:  # noqa: BLE001
        # Deliberately broad: auditing must not break request handling.
        # Log only the event name, since the payload may hold user text.
        logger.warning(
            "audit log write failed for event=%s",
            record.get("event"),
            exc_info=True,
        )
 app = FastAPI(title="Overlord Agent", version="0.1.0")
@ -101,25 +164,96 @@ async def new_session(_user: dict = Depends(auth.require_user)) -> NewSessionRes
 async def agent_ask(
    req: AskRequest, user: dict = Depends(auth.require_user)
 ) -> AskResponse:
-    """Forward a message to claude -p resuming the given session."""
+    """Forward a message to claude -p resuming the given session.
-    started = time.monotonic()
+
-    try:
+    Enforces:
-        result = await ask_claude(req.message, req.session_id)
+      * Per-user rate limit (60 requests/hour by default).
-    except ClaudeError as e:
+      * Per-user concurrency cap (1 in-flight at a time by default).
-        logger.warning(
+      * Audit log of every request (JSONL).
-            "claude failed user=%s session=%s err=%s", user["username"], req.session_id, e
+    """
    username = user["username"]
    # Rate limit BEFORE acquiring the user semaphore — cheaper to reject.
    allowed, retry_after = _check_rate_limit(username)
    if not allowed:
        _audit(
            {
                "event": "rate_limited",
                "user": username,
                "session_id": req.session_id,
                "retry_after_s": retry_after,
            }
        )
-        raise HTTPException(status_code=502, detail=str(e))
+        raise HTTPException(
            status_code=429,
            detail=f"Rate limit exceeded; retry in {retry_after}s",
            headers={"Retry-After": str(retry_after)},
        )
    sem = _user_semaphore(username)
    if sem.locked():
        _audit(
            {
                "event": "concurrency_blocked",
                "user": username,
                "session_id": req.session_id,
            }
        )
        raise HTTPException(
            status_code=429, detail="A previous question is still being processed"
        )
    started = time.monotonic()
    async with sem:
        _audit(
            {
                "event": "ask_start",
                "user": username,
                "session_id": req.session_id,
                "message": req.message[:500],
                "message_len": len(req.message),
            }
        )
        try:
            result = await ask_claude(req.message, req.session_id)
        except ClaudeError as e:
            elapsed_ms = int((time.monotonic() - started) * 1000)
            logger.warning(
                "claude failed user=%s session=%s err=%s", username, req.session_id, e
            )
            _audit(
                {
                    "event": "ask_error",
                    "user": username,
                    "session_id": req.session_id,
                    "error": str(e)[:500],
                    "elapsed_ms": elapsed_ms,
                }
            )
            raise HTTPException(status_code=502, detail=str(e))
    elapsed_ms = int((time.monotonic() - started) * 1000)
    logger.info(
        "ask user=%s session=%s turns=%d duration_ms=%d (subprocess=%dms)",
-        user["username"],
+        username,
        result.session_id,
        result.num_turns,
        elapsed_ms,
        result.duration_ms,
    )
    _audit(
        {
            "event": "ask_ok",
            "user": username,
            "session_id": result.session_id,
            "result_preview": (result.result or "")[:300],
            "result_len": len(result.result or ""),
            "turns": result.num_turns,
            "elapsed_ms": elapsed_ms,
            "subprocess_ms": result.duration_ms,
            "is_error": result.is_error,
        }
    )
    return AskResponse(
        result=result.result,