feat(agent): security hardening — systemd lockdown, rate limit, audit log

systemd unit now applies defense-in-depth:
- ProtectSystem=strict + ProtectHome=read-only (rest of FS sealed)
- ReadWritePaths only for ~/.claude (session JSONLs) and venv + audit log
- InaccessiblePaths blocks /etc/shadow, /etc/ssh, /root, ~/.ssh, shell history
- NoNewPrivileges + dropped capabilities (no setuid escalation, no caps)
- PrivateTmp, PrivateDevices, ProtectKernel*, MemoryDenyWriteExecute
- SystemCallFilter @system-service ~@privileged ~@debug ~@mount etc.
- RestrictAddressFamilies blocks raw/packet sockets

Application layer:
- Per-user rate limit 60/hour (configurable via AGENT_RATE_MAX)
- Per-user concurrency cap of 1 in-flight (no parallel claude burns)
- JSONL audit log of every /agent/ask to /var/log/overlord-agent/audit.jsonl
  Logs username, message preview, result preview, timing, errors.

Plus secrets migration: EnvironmentFile now prefers /etc/overlord/agent.env
(root:erik 0640) over /home/erik/MosswartOverlord/.env, so even the
read-only /home doesn't expose them. Falls back to old path during
transition.
This commit is contained in:
Erik 2026-04-25 21:25:40 +02:00
parent 4ae18536be
commit 9d4c724b7f
2 changed files with 219 additions and 12 deletions

View file

@ -11,19 +11,92 @@ Group=erik
# - claude -p sessions land at ~/.claude/projects/-home-erik-MosswartOverlord/ # - claude -p sessions land at ~/.claude/projects/-home-erik-MosswartOverlord/
# - .mcp.json is auto-loaded # - .mcp.json is auto-loaded
WorkingDirectory=/home/erik/MosswartOverlord WorkingDirectory=/home/erik/MosswartOverlord
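# Secrets moved OUT of /home/erik/ to /etc/overlord/agent.env. Note that
# ProtectHome=read-only only makes /home read-only: files there remain
# readable by the service, so that setting alone never hid the old .env.
# The new file is root-owned, mode 0640, group=erik.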
EnvironmentFile=-/etc/overlord/agent.env
# Backwards-compat: also try the old location during transition.
EnvironmentFile=-/home/erik/MosswartOverlord/.env EnvironmentFile=-/home/erik/MosswartOverlord/.env
# Run inside the venv populated by install.sh. # Run inside the venv populated by install.sh.
ExecStart=/home/erik/MosswartOverlord/agent/.venv/bin/python -m agent.service ExecStart=/home/erik/MosswartOverlord/agent/.venv/bin/python -m agent.service
Restart=on-failure Restart=on-failure
RestartSec=3 RestartSec=3
# Don't tie up the disk with stdout — let journald handle it.
StandardOutput=journal StandardOutput=journal
StandardError=journal StandardError=journal
# Resource hints — the service is light, but cap so a runaway can't # ─── Resource caps ─────────────────────────────────────────────────
# starve the host.
MemoryLimit=512M MemoryLimit=512M
CPUQuota=200% CPUQuota=200%
TasksMax=128
# ─── Filesystem hardening ──────────────────────────────────────────
# ProtectSystem=strict mounts the entire file system hierarchy read-only
# for this service, except the API file systems /dev, /proc and /sys.
# Only the ReadWritePaths= listed below stay writable. Subprocesses
# inherit these protections.
ProtectSystem=strict
ProtectHome=read-only
# Allow writing only to the explicit paths claude / our service need.
# - ~/.claude — session JSONL files
# - .venv pycache — minor pip cache writes
ReadWritePaths=/home/erik/.claude
ReadWritePaths=/home/erik/MosswartOverlord/agent/.venv
ReadWritePaths=/var/log/overlord-agent
# Keep $HOME visible to the venv python so it can find pip cache etc.
# (read-only via ProtectHome=read-only — this writable carve-out is
# narrowly the .claude session dir above.)
LogsDirectory=overlord-agent
LogsDirectoryMode=0755
PrivateTmp=true
PrivateDevices=true
ProtectClock=true
ProtectKernelTunables=true
ProtectKernelModules=true
ProtectKernelLogs=true
ProtectControlGroups=true
ProtectHostname=true
ProtectProc=invisible
ProcSubset=pid
# Hide sensitive host paths even if something in the python or claude
# subprocess tree tries to read them.
InaccessiblePaths=/etc/shadow
InaccessiblePaths=/etc/gshadow
InaccessiblePaths=/etc/ssh
InaccessiblePaths=/root
InaccessiblePaths=-/home/erik/.ssh
InaccessiblePaths=-/home/erik/.bash_history
InaccessiblePaths=-/home/erik/.zsh_history
# ─── Privilege & capability hardening ──────────────────────────────
NoNewPrivileges=true
CapabilityBoundingSet=
AmbientCapabilities=
LockPersonality=true
RestrictRealtime=true
RestrictSUIDSGID=true
RemoveIPC=true
MemoryDenyWriteExecute=true
RestrictNamespaces=true
# ─── Network family restriction ────────────────────────────────────
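# Allow only AF_UNIX, AF_INET and AF_INET6 sockets. Every other family
# (AF_PACKET, AF_NETLINK, ...) is refused, so the process tree can't open
# packet sockets to sniff traffic or forge frames. Raw IP sockets inside
# AF_INET/AF_INET6 are not covered by this setting; they require
# CAP_NET_RAW, which the empty CapabilityBoundingSet= above removes.
# We don't IPAddressAllow-restrict because Anthropic's Cloudflare IPs
# shift and the allow-list would break claude. If you need true egress
# filtering, run nftables scoped to this service's cgroup — that's
# reliable in a way IPAddressAllow isn't.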
RestrictAddressFamilies=AF_UNIX AF_INET AF_INET6
# ─── Syscall filter ────────────────────────────────────────────────
SystemCallArchitectures=native
SystemCallFilter=@system-service
SystemCallFilter=~@privileged
SystemCallFilter=~@resources
SystemCallFilter=~@debug
SystemCallFilter=~@mount
SystemCallFilter=~@cpu-emulation
SystemCallFilter=~@obsolete
SystemCallFilter=~@reboot
SystemCallFilter=~@swap
SystemCallFilter=~@raw-io
[Install] [Install]
WantedBy=multi-user.target WantedBy=multi-user.target

View file

@ -15,10 +15,13 @@ cookie that dereth-tracker issues.
from __future__ import annotations from __future__ import annotations
import asyncio
import json import json
import logging import logging
import os
import time import time
import uuid import uuid
from collections import deque
from pathlib import Path from pathlib import Path
from typing import Any from typing import Any
@ -35,6 +38,66 @@ logging.basicConfig(
) )
logger = logging.getLogger("agent") logger = logging.getLogger("agent")
# Audit log — every /agent/ask request gets a JSONL line here, separate
# from journald so the operator can grep without root. Set to /dev/null
# to disable. Rotated externally (logrotate) if it gets big.
# NOTE(review): a plain FileHandler keeps its file descriptor open, so a
# rename-based rotation leaves it writing to the rotated file. Either
# configure logrotate with copytruncate or switch this handler to
# logging.handlers.WatchedFileHandler.
AUDIT_LOG_PATH = Path(os.getenv("AGENT_AUDIT_LOG", "/var/log/overlord-agent/audit.jsonl"))
audit_logger = logging.getLogger("agent.audit")
try:
    AUDIT_LOG_PATH.parent.mkdir(parents=True, exist_ok=True)
    # Audit records are serialised with ensure_ascii=False, so pin the file
    # encoding rather than inheriting whatever locale the service runs under.
    _h = logging.FileHandler(AUDIT_LOG_PATH, encoding="utf-8")
    # One raw JSON document per line; no level/name prefix.
    _h.setFormatter(logging.Formatter("%(message)s"))
    audit_logger.addHandler(_h)
    # Keep audit lines out of the root logger once the file handler works.
    audit_logger.propagate = False
    audit_logger.setLevel(logging.INFO)
except OSError as e:
    # Without a file handler the audit logger keeps propagating to its
    # parent loggers, so records are not silently lost.
    logger.warning("audit log path %s not writable (%s); logging only via journal", AUDIT_LOG_PATH, e)
# Rolling-window rate limit, tracked per user. The defaults leave plenty of
# room for one person typing questions while still stopping scripted abuse.
# Both values are read from the environment once, at import time.
RATE_LIMIT_WINDOW_S = int(os.environ.get("AGENT_RATE_WINDOW_S", "3600"))
RATE_LIMIT_MAX = int(os.environ.get("AGENT_RATE_MAX", "60"))

# How many requests a single user may have in flight at once (prevents
# fanning out dozens of parallel calls).
CONCURRENCY_LIMIT_PER_USER = int(os.environ.get("AGENT_CONCURRENCY_PER_USER", "1"))

# username -> timestamps of that user's recent /agent/ask calls.
_rate_state: dict[str, deque[float]] = {}

# username -> semaphore bounding that user's concurrent claude subprocesses
# (each one is expensive).
_user_semaphores: dict[str, asyncio.Semaphore] = {}
def _check_rate_limit(username: str) -> tuple[bool, int]:
    """Record one request for *username* and report whether it is allowed.

    Timestamps are taken from ``time.monotonic()`` and stored per user in
    ``_rate_state``. Entries older than ``RATE_LIMIT_WINDOW_S`` seconds are
    pruned first; the request is admitted, and recorded, only while fewer
    than ``RATE_LIMIT_MAX`` entries remain. Rejected requests are not
    recorded.

    Args:
        username: Key identifying the caller whose budget is checked.

    Returns:
        ``(True, 0)`` when the request is admitted, otherwise
        ``(False, retry_after_seconds)`` where the second item is the
        number of whole seconds until the oldest recorded request leaves
        the window (always at least 1).
    """
    now = time.monotonic()
    window = _rate_state.setdefault(username, deque())
    cutoff = now - RATE_LIMIT_WINDOW_S
    # Timestamps are appended in order, so expired ones sit at the left end.
    while window and window[0] < cutoff:
        window.popleft()

    if len(window) < RATE_LIMIT_MAX:
        window.append(now)
        return True, 0

    if not window:
        # Only reachable when RATE_LIMIT_MAX <= 0: nothing is ever admitted,
        # so there is no oldest entry to wait for. Report a full window
        # instead of raising IndexError on window[0].
        return False, max(RATE_LIMIT_WINDOW_S, 1)

    # +1 rounds the truncated remainder up so the caller never retries early.
    retry_after = int(window[0] + RATE_LIMIT_WINDOW_S - now) + 1
    return False, retry_after
def _user_semaphore(username: str) -> asyncio.Semaphore:
    """Return the semaphore that bounds *username*'s in-flight requests.

    The semaphore is created on first use with
    ``CONCURRENCY_LIMIT_PER_USER`` slots and cached in ``_user_semaphores``,
    so later calls for the same user receive the same object.
    """
    existing = _user_semaphores.get(username)
    if existing is not None:
        return existing
    created = asyncio.Semaphore(CONCURRENCY_LIMIT_PER_USER)
    _user_semaphores[username] = created
    return created
def _audit(event: dict[str, Any]) -> None:
    """Emit one JSONL line describing *event* to the audit log.

    A UTC ``timestamp`` field (``YYYY-MM-DDTHH:MM:SSZ``) is added to the
    serialised record, replacing any ``timestamp`` already present. The
    caller's dict is not modified.

    Auditing is best-effort: a failure to serialise or write the record is
    reported on the service logger and never propagates to the request.

    Args:
        event: JSON-serialisable fields describing what happened.
    """
    record = {
        **event,
        "timestamp": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
    }
    try:
        audit_logger.info(json.dumps(record, ensure_ascii=False))
    except Exception:  # noqa: BLE001
        # Never fail the request because of auditing, but make the dropped
        # record visible instead of discarding it silently.
        logger.warning(
            "audit record dropped event=%s", record.get("event"), exc_info=True
        )
app = FastAPI(title="Overlord Agent", version="0.1.0") app = FastAPI(title="Overlord Agent", version="0.1.0")
@ -101,25 +164,96 @@ async def new_session(_user: dict = Depends(auth.require_user)) -> NewSessionRes
async def agent_ask( async def agent_ask(
req: AskRequest, user: dict = Depends(auth.require_user) req: AskRequest, user: dict = Depends(auth.require_user)
) -> AskResponse: ) -> AskResponse:
"""Forward a message to claude -p resuming the given session.""" """Forward a message to claude -p resuming the given session.
started = time.monotonic()
try: Enforces:
result = await ask_claude(req.message, req.session_id) * Per-user rate limit (60 requests/hour by default).
except ClaudeError as e: * Per-user concurrency cap (1 in-flight at a time by default).
logger.warning( * Audit log of every request (JSONL).
"claude failed user=%s session=%s err=%s", user["username"], req.session_id, e """
username = user["username"]
# Rate limit BEFORE acquiring the user semaphore — cheaper to reject.
allowed, retry_after = _check_rate_limit(username)
if not allowed:
_audit(
{
"event": "rate_limited",
"user": username,
"session_id": req.session_id,
"retry_after_s": retry_after,
}
) )
raise HTTPException(status_code=502, detail=str(e)) raise HTTPException(
status_code=429,
detail=f"Rate limit exceeded; retry in {retry_after}s",
headers={"Retry-After": str(retry_after)},
)
sem = _user_semaphore(username)
if sem.locked():
_audit(
{
"event": "concurrency_blocked",
"user": username,
"session_id": req.session_id,
}
)
raise HTTPException(
status_code=429, detail="A previous question is still being processed"
)
started = time.monotonic()
async with sem:
_audit(
{
"event": "ask_start",
"user": username,
"session_id": req.session_id,
"message": req.message[:500],
"message_len": len(req.message),
}
)
try:
result = await ask_claude(req.message, req.session_id)
except ClaudeError as e:
elapsed_ms = int((time.monotonic() - started) * 1000)
logger.warning(
"claude failed user=%s session=%s err=%s", username, req.session_id, e
)
_audit(
{
"event": "ask_error",
"user": username,
"session_id": req.session_id,
"error": str(e)[:500],
"elapsed_ms": elapsed_ms,
}
)
raise HTTPException(status_code=502, detail=str(e))
elapsed_ms = int((time.monotonic() - started) * 1000) elapsed_ms = int((time.monotonic() - started) * 1000)
logger.info( logger.info(
"ask user=%s session=%s turns=%d duration_ms=%d (subprocess=%dms)", "ask user=%s session=%s turns=%d duration_ms=%d (subprocess=%dms)",
user["username"], username,
result.session_id, result.session_id,
result.num_turns, result.num_turns,
elapsed_ms, elapsed_ms,
result.duration_ms, result.duration_ms,
) )
_audit(
{
"event": "ask_ok",
"user": username,
"session_id": result.session_id,
"result_preview": (result.result or "")[:300],
"result_len": len(result.result or ""),
"turns": result.num_turns,
"elapsed_ms": elapsed_ms,
"subprocess_ms": result.duration_ms,
"is_error": result.is_error,
}
)
return AskResponse( return AskResponse(
result=result.result, result=result.result,