feat(agent): security hardening — systemd lockdown, rate limit, audit log

systemd unit now applies defense-in-depth:
- ProtectSystem=strict + ProtectHome=read-only (rest of FS sealed)
- ReadWritePaths only for ~/.claude (session JSONLs) and venv + audit log
- InaccessiblePaths blocks /etc/shadow, /etc/ssh, /root, ~/.ssh, shell history
- NoNewPrivileges + dropped capabilities (no setuid escalation, no caps)
- PrivateTmp, PrivateDevices, ProtectKernel*, MemoryDenyWriteExecute
- SystemCallFilter @system-service ~@privileged ~@debug ~@mount etc.
- RestrictAddressFamilies blocks raw/packet sockets

Application layer:
- Per-user rate limit 60/hour (configurable via AGENT_RATE_MAX)
- Per-user concurrency cap of 1 in-flight (no parallel claude burns)
- JSONL audit log of every /agent/ask to /var/log/overlord-agent/audit.jsonl
  Logs username, message preview, result preview, timing, errors.

Plus secrets migration: EnvironmentFile now prefers /etc/overlord/agent.env
(root:erik 0640) over /home/erik/MosswartOverlord/.env, so even the
read-only /home doesn't expose them. Falls back to old path during
transition.
This commit is contained in:
Erik 2026-04-25 21:25:40 +02:00
parent 4ae18536be
commit 9d4c724b7f
2 changed files with 219 additions and 12 deletions

View file

@ -11,19 +11,92 @@ Group=erik
# - claude -p sessions land at ~/.claude/projects/-home-erik-MosswartOverlord/
# - .mcp.json is auto-loaded
WorkingDirectory=/home/erik/MosswartOverlord
# Secrets moved OUT of /home/erik/ to /etc/overlord/agent.env so
# ProtectHome=read-only blocks their read entirely. The file is
# root-owned, mode 0640, group=erik.
EnvironmentFile=-/etc/overlord/agent.env
# Backwards-compat: also try the old location during transition.
EnvironmentFile=-/home/erik/MosswartOverlord/.env
# Run inside the venv populated by install.sh.
ExecStart=/home/erik/MosswartOverlord/agent/.venv/bin/python -m agent.service
Restart=on-failure
RestartSec=3
# Don't tie up the disk with stdout — let journald handle it.
StandardOutput=journal
StandardError=journal
# Resource hints — the service is light, but cap so a runaway can't
# starve the host.
# ─── Resource caps ─────────────────────────────────────────────────
MemoryLimit=512M
CPUQuota=200%
TasksMax=128
# ─── Filesystem hardening ──────────────────────────────────────────
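# ProtectSystem=strict mounts the entire file system hierarchy read-only
# for this service, except the API file systems /dev, /proc and /sys.
# Writable locations must be re-opened explicitly with ReadWritePaths=
# below. Subprocesses inherit these protections.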
ProtectSystem=strict
ProtectHome=read-only
# Allow writing only to the explicit paths claude / our service need.
# - ~/.claude — session JSONL files
# - .venv — pycache and minor pip cache writes
# - /var/log/overlord-agent — JSONL audit log
ReadWritePaths=/home/erik/.claude
ReadWritePaths=/home/erik/MosswartOverlord/agent/.venv
ReadWritePaths=/var/log/overlord-agent
# Keep $HOME visible to the venv python so it can find pip cache etc.
# (It stays read-only via ProtectHome=read-only; the only writable
# carve-outs under /home are the .claude session dir and .venv above.)
LogsDirectory=overlord-agent
LogsDirectoryMode=0755
PrivateTmp=true
PrivateDevices=true
ProtectClock=true
ProtectKernelTunables=true
ProtectKernelModules=true
ProtectKernelLogs=true
ProtectControlGroups=true
ProtectHostname=true
ProtectProc=invisible
ProcSubset=pid
# Hide sensitive host paths even if something in the python or claude
# subprocess tree tries to read them.
InaccessiblePaths=/etc/shadow
InaccessiblePaths=/etc/gshadow
InaccessiblePaths=/etc/ssh
InaccessiblePaths=/root
InaccessiblePaths=-/home/erik/.ssh
InaccessiblePaths=-/home/erik/.bash_history
InaccessiblePaths=-/home/erik/.zsh_history
# ─── Privilege & capability hardening ──────────────────────────────
NoNewPrivileges=true
CapabilityBoundingSet=
AmbientCapabilities=
LockPersonality=true
RestrictRealtime=true
RestrictSUIDSGID=true
RemoveIPC=true
MemoryDenyWriteExecute=true
RestrictNamespaces=true
# ─── Network family restriction ────────────────────────────────────
# Block raw/packet sockets so even a kernel-LPE-class bug can't sniff
# traffic or forge packets. We don't IPAddressAllow-restrict because
# Anthropic's Cloudflare IPs shift and the whitelist would break claude.
# If you need true egress filtering, run nftables scoped to this
# service's cgroup — that's reliable in a way IPAddressAllow isn't.
RestrictAddressFamilies=AF_UNIX AF_INET AF_INET6
# ─── Syscall filter ────────────────────────────────────────────────
SystemCallArchitectures=native
SystemCallFilter=@system-service
SystemCallFilter=~@privileged
SystemCallFilter=~@resources
SystemCallFilter=~@debug
SystemCallFilter=~@mount
SystemCallFilter=~@cpu-emulation
SystemCallFilter=~@obsolete
SystemCallFilter=~@reboot
SystemCallFilter=~@swap
SystemCallFilter=~@raw-io
[Install]
WantedBy=multi-user.target

View file

@ -15,10 +15,13 @@ cookie that dereth-tracker issues.
from __future__ import annotations
import asyncio
import json
import logging
import os
import time
import uuid
from collections import deque
from pathlib import Path
from typing import Any
@ -35,6 +38,66 @@ logging.basicConfig(
)
logger = logging.getLogger("agent")

# Audit log — every /agent/ask request gets one JSONL line here, kept
# separate from journald so the operator can grep it without root. Point
# AGENT_AUDIT_LOG at /dev/null to disable. Rotation is external (logrotate).
AUDIT_LOG_PATH = Path(os.getenv("AGENT_AUDIT_LOG", "/var/log/overlord-agent/audit.jsonl"))
audit_logger = logging.getLogger("agent.audit")
# Set the level up front so audit events are still emitted at INFO when the
# file handler below cannot be created and records fall through to the
# parent logger's handlers instead.
audit_logger.setLevel(logging.INFO)
try:
    AUDIT_LOG_PATH.parent.mkdir(parents=True, exist_ok=True)
    # Explicit UTF-8: audit lines are serialised with ensure_ascii=False, so
    # a locale-dependent default encoding could fail on non-ASCII messages.
    _h = logging.FileHandler(AUDIT_LOG_PATH, encoding="utf-8")
    _h.setFormatter(logging.Formatter("%(message)s"))
    audit_logger.addHandler(_h)
    # Only stop propagation once the dedicated file handler is attached.
    audit_logger.propagate = False
except OSError as e:
    logger.warning("audit log path %s not writable (%s); logging only via journal", AUDIT_LOG_PATH, e)
# Rolling-window rate limit, counted per user. The defaults leave plenty of
# headroom for one person typing questions while still stopping scripted
# abuse.
RATE_LIMIT_WINDOW_S = int(os.environ.get("AGENT_RATE_WINDOW_S", "3600"))
RATE_LIMIT_MAX = int(os.environ.get("AGENT_RATE_MAX", "60"))

# How many requests one user may have in flight at once (prevents fanning
# out dozens of parallel calls).
CONCURRENCY_LIMIT_PER_USER = int(os.environ.get("AGENT_CONCURRENCY_PER_USER", "1"))

# username -> timestamps of that user's recent /agent/ask calls.
_rate_state: dict[str, deque[float]] = dict()

# username -> semaphore bounding that user's concurrent claude subprocesses
# (each one is expensive).
_user_semaphores: dict[str, asyncio.Semaphore] = dict()
def _check_rate_limit(username: str) -> tuple[bool, int]:
    """Record a request for *username* and report whether it is allowed.

    Keeps a per-user deque of ``time.monotonic()`` timestamps in
    ``_rate_state``; entries older than ``RATE_LIMIT_WINDOW_S`` seconds are
    dropped before counting. An allowed request is appended to the window,
    a rejected one is not.

    Returns:
        ``(True, 0)`` when the request fits within ``RATE_LIMIT_MAX``,
        otherwise ``(False, retry_after_seconds)`` where the second item is
        the whole number of seconds until the oldest counted request leaves
        the window.
    """
    now = time.monotonic()
    window = _rate_state.setdefault(username, deque())

    # Expire timestamps that have fallen out of the rolling window.
    cutoff = now - RATE_LIMIT_WINDOW_S
    while window and window[0] < cutoff:
        window.popleft()

    if len(window) >= RATE_LIMIT_MAX:
        if not window:
            # Only reachable when RATE_LIMIT_MAX <= 0: nothing is ever
            # allowed and there is no timestamp to wait on, so report one
            # full window instead of raising IndexError on window[0].
            return False, max(RATE_LIMIT_WINDOW_S, 1)
        # +1 rounds the truncated remainder up so the caller never retries
        # before the oldest entry has actually expired.
        retry_after = int(window[0] + RATE_LIMIT_WINDOW_S - now) + 1
        return False, retry_after

    window.append(now)
    return True, 0
def _user_semaphore(username: str) -> asyncio.Semaphore:
    """Return the semaphore for *username*, creating it on first use.

    New semaphores are sized by ``CONCURRENCY_LIMIT_PER_USER`` and cached in
    ``_user_semaphores`` so later calls for the same user share one object.
    """
    if username not in _user_semaphores:
        _user_semaphores[username] = asyncio.Semaphore(CONCURRENCY_LIMIT_PER_USER)
    return _user_semaphores[username]
def _audit(event: dict[str, Any]) -> None:
    """Emit one JSONL line to the audit log.

    A UTC ``timestamp`` field is added to a copy of *event* (the caller's
    dict is left untouched) and the result is written through
    ``audit_logger`` as a single JSON document. Auditing is best-effort: a
    failure is reported on the service logger and never propagates to the
    request being handled.
    """
    stamp = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())
    # Copy instead of mutating the argument; key order (and therefore the
    # emitted JSON) is the same as assigning event["timestamp"] in place.
    record = {**event, "timestamp": stamp}
    try:
        audit_logger.info(json.dumps(record, ensure_ascii=False))
    except Exception:  # noqa: BLE001 — auditing must never fail a request
        # Do not swallow silently: a broken audit trail should be visible
        # in the service log even though the request carries on.
        logger.exception("audit log write failed for event=%s", event.get("event"))
app = FastAPI(title="Overlord Agent", version="0.1.0")
@ -101,25 +164,96 @@ async def new_session(_user: dict = Depends(auth.require_user)) -> NewSessionRes
async def agent_ask(
req: AskRequest, user: dict = Depends(auth.require_user)
) -> AskResponse:
"""Forward a message to claude -p resuming the given session."""
started = time.monotonic()
try:
result = await ask_claude(req.message, req.session_id)
except ClaudeError as e:
logger.warning(
"claude failed user=%s session=%s err=%s", user["username"], req.session_id, e
"""Forward a message to claude -p resuming the given session.
Enforces:
* Per-user rate limit (60 requests/hour by default).
* Per-user concurrency cap (1 in-flight at a time by default).
* Audit log of every request (JSONL).
"""
username = user["username"]
# Rate limit BEFORE acquiring the user semaphore — cheaper to reject.
allowed, retry_after = _check_rate_limit(username)
if not allowed:
_audit(
{
"event": "rate_limited",
"user": username,
"session_id": req.session_id,
"retry_after_s": retry_after,
}
)
raise HTTPException(status_code=502, detail=str(e))
raise HTTPException(
status_code=429,
detail=f"Rate limit exceeded; retry in {retry_after}s",
headers={"Retry-After": str(retry_after)},
)
sem = _user_semaphore(username)
if sem.locked():
_audit(
{
"event": "concurrency_blocked",
"user": username,
"session_id": req.session_id,
}
)
raise HTTPException(
status_code=429, detail="A previous question is still being processed"
)
started = time.monotonic()
async with sem:
_audit(
{
"event": "ask_start",
"user": username,
"session_id": req.session_id,
"message": req.message[:500],
"message_len": len(req.message),
}
)
try:
result = await ask_claude(req.message, req.session_id)
except ClaudeError as e:
elapsed_ms = int((time.monotonic() - started) * 1000)
logger.warning(
"claude failed user=%s session=%s err=%s", username, req.session_id, e
)
_audit(
{
"event": "ask_error",
"user": username,
"session_id": req.session_id,
"error": str(e)[:500],
"elapsed_ms": elapsed_ms,
}
)
raise HTTPException(status_code=502, detail=str(e))
elapsed_ms = int((time.monotonic() - started) * 1000)
logger.info(
"ask user=%s session=%s turns=%d duration_ms=%d (subprocess=%dms)",
user["username"],
username,
result.session_id,
result.num_turns,
elapsed_ms,
result.duration_ms,
)
_audit(
{
"event": "ask_ok",
"user": username,
"session_id": result.session_id,
"result_preview": (result.result or "")[:300],
"result_len": len(result.result or ""),
"turns": result.num_turns,
"elapsed_ms": elapsed_ms,
"subprocess_ms": result.duration_ms,
"is_error": result.is_error,
}
)
return AskResponse(
result=result.result,