feat(agent): security hardening — systemd lockdown, rate limit, audit log
systemd unit now applies defense-in-depth: - ProtectSystem=strict + ProtectHome=read-only (rest of FS sealed) - ReadWritePaths only for ~/.claude (session JSONLs) and venv + audit log - InaccessiblePaths blocks /etc/shadow, /etc/ssh, /root, ~/.ssh, shell history - NoNewPrivileges + dropped capabilities (no setuid escalation, no caps) - PrivateTmp, PrivateDevices, ProtectKernel*, MemoryDenyWriteExecute - SystemCallFilter @system-service ~@privileged ~@debug ~@mount etc. - RestrictAddressFamilies blocks raw/packet sockets Application layer: - Per-user rate limit 60/hour (configurable via AGENT_RATE_MAX) - Per-user concurrency cap of 1 in-flight (no parallel claude burns) - JSONL audit log of every /agent/ask to /var/log/overlord-agent/audit.jsonl Logs username, message preview, result preview, timing, errors. Plus secrets migration: EnvironmentFile now prefers /etc/overlord/agent.env (root:erik 0640) over /home/erik/MosswartOverlord/.env, so even the read-only /home doesn't expose them. Falls back to old path during transition.
This commit is contained in:
parent
4ae18536be
commit
9d4c724b7f
2 changed files with 219 additions and 12 deletions
|
|
@ -11,19 +11,92 @@ Group=erik
|
|||
# - claude -p sessions land at ~/.claude/projects/-home-erik-MosswartOverlord/
|
||||
# - .mcp.json is auto-loaded
|
||||
WorkingDirectory=/home/erik/MosswartOverlord
|
||||
# Secrets moved OUT of /home/erik/ to /etc/overlord/agent.env, because
|
||||
# ProtectHome=read-only only blocks writes: /home stays readable to the
|
||||
# service. The file is root-owned, mode 0640, group=erik.
|
||||
EnvironmentFile=-/etc/overlord/agent.env
|
||||
# Backwards-compat: also try the old location during transition.
|
||||
EnvironmentFile=-/home/erik/MosswartOverlord/.env
|
||||
# Run inside the venv populated by install.sh.
|
||||
ExecStart=/home/erik/MosswartOverlord/agent/.venv/bin/python -m agent.service
|
||||
Restart=on-failure
|
||||
RestartSec=3
|
||||
# Don't tie up the disk with stdout — let journald handle it.
|
||||
StandardOutput=journal
|
||||
StandardError=journal
|
||||
|
||||
# Resource hints — the service is light, but cap so a runaway can't
|
||||
# starve the host.
|
||||
# ─── Resource caps ─────────────────────────────────────────────────
|
||||
MemoryLimit=512M
|
||||
CPUQuota=200%
|
||||
TasksMax=128
|
||||
|
||||
# ─── Filesystem hardening ──────────────────────────────────────────
|
||||
# ProtectSystem=strict mounts the entire file system hierarchy read-only
|
||||
# (except /dev, /proc and /sys); only the ReadWritePaths= entries below stay writable. Subprocesses inherit these protections.
|
||||
ProtectSystem=strict
|
||||
ProtectHome=read-only
|
||||
# Allow writing only to the explicit paths claude / our service need.
|
||||
# - ~/.claude — session JSONL files
|
||||
# - .venv pycache — minor pip cache writes
|
||||
ReadWritePaths=/home/erik/.claude
|
||||
ReadWritePaths=/home/erik/MosswartOverlord/agent/.venv
|
||||
ReadWritePaths=/var/log/overlord-agent
|
||||
# Keep $HOME visible to the venv python so it can find pip cache etc.
|
||||
# (read-only via ProtectHome=read-only — the only writable carve-outs
|
||||
# under /home are the .claude session dir and the .venv listed above.)
|
||||
LogsDirectory=overlord-agent
|
||||
LogsDirectoryMode=0755
|
||||
PrivateTmp=true
|
||||
PrivateDevices=true
|
||||
ProtectClock=true
|
||||
ProtectKernelTunables=true
|
||||
ProtectKernelModules=true
|
||||
ProtectKernelLogs=true
|
||||
ProtectControlGroups=true
|
||||
ProtectHostname=true
|
||||
ProtectProc=invisible
|
||||
ProcSubset=pid
|
||||
|
||||
# Hide sensitive host paths even if something in the python or claude
|
||||
# subprocess tree tries to read them.
|
||||
InaccessiblePaths=/etc/shadow
|
||||
InaccessiblePaths=/etc/gshadow
|
||||
InaccessiblePaths=/etc/ssh
|
||||
InaccessiblePaths=/root
|
||||
InaccessiblePaths=-/home/erik/.ssh
|
||||
InaccessiblePaths=-/home/erik/.bash_history
|
||||
InaccessiblePaths=-/home/erik/.zsh_history
|
||||
|
||||
# ─── Privilege & capability hardening ──────────────────────────────
|
||||
NoNewPrivileges=true
|
||||
CapabilityBoundingSet=
|
||||
AmbientCapabilities=
|
||||
LockPersonality=true
|
||||
RestrictRealtime=true
|
||||
RestrictSUIDSGID=true
|
||||
RemoveIPC=true
|
||||
MemoryDenyWriteExecute=true
|
||||
RestrictNamespaces=true
|
||||
|
||||
# ─── Network family restriction ────────────────────────────────────
|
||||
# Block raw/packet sockets so even a kernel-LPE-class bug can't sniff
|
||||
# traffic or forge packets. We don't IPAddressAllow-restrict because
|
||||
# Anthropic's Cloudflare IPs shift and the whitelist would break claude.
|
||||
# If you need true egress filtering, run nftables scoped to this
|
||||
# service's cgroup — that's reliable in a way IPAddressAllow isn't.
|
||||
RestrictAddressFamilies=AF_UNIX AF_INET AF_INET6
|
||||
|
||||
# ─── Syscall filter ────────────────────────────────────────────────
|
||||
SystemCallArchitectures=native
|
||||
SystemCallFilter=@system-service
|
||||
SystemCallFilter=~@privileged
|
||||
SystemCallFilter=~@resources
|
||||
SystemCallFilter=~@debug
|
||||
SystemCallFilter=~@mount
|
||||
SystemCallFilter=~@cpu-emulation
|
||||
SystemCallFilter=~@obsolete
|
||||
SystemCallFilter=~@reboot
|
||||
SystemCallFilter=~@swap
|
||||
SystemCallFilter=~@raw-io
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
||||
|
|
|
|||
152
agent/service.py
152
agent/service.py
|
|
@ -15,10 +15,13 @@ cookie that dereth-tracker issues.
|
|||
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import time
|
||||
import uuid
|
||||
from collections import deque
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
|
|
@ -35,6 +38,66 @@ logging.basicConfig(
|
|||
)
|
||||
logger = logging.getLogger("agent")
|
||||
|
||||
# Audit log — every /agent/ask request gets one JSONL line here, separate
# from journald so the operator can grep without root. Point
# AGENT_AUDIT_LOG at /dev/null to disable. Rotated externally (logrotate)
# if it gets big.
# NOTE(review): a plain FileHandler keeps its file descriptor open, so
# rotation presumably needs copytruncate (or a WatchedFileHandler) —
# confirm against the logrotate config.
AUDIT_LOG_PATH = Path(os.getenv("AGENT_AUDIT_LOG", "/var/log/overlord-agent/audit.jsonl"))
audit_logger = logging.getLogger("agent.audit")
try:
    AUDIT_LOG_PATH.parent.mkdir(parents=True, exist_ok=True)
    # Explicit UTF-8: audit lines are serialized with ensure_ascii=False,
    # so relying on the locale's default encoding could make the handler
    # fail (and drop the line) on non-ASCII text.
    _h = logging.FileHandler(AUDIT_LOG_PATH, encoding="utf-8")
    # Emit the bare message so each line in the file is exactly one JSON
    # document.
    _h.setFormatter(logging.Formatter("%(message)s"))
    audit_logger.addHandler(_h)
    # Keep audit lines out of the parent loggers once the file works.
    audit_logger.propagate = False
    audit_logger.setLevel(logging.INFO)
except OSError as e:
    # Directory or file not writable: no file handler is attached and
    # propagation is left at its default, so the service keeps running.
    logger.warning("audit log path %s not writable (%s); logging only via journal", AUDIT_LOG_PATH, e)
|
||||
|
||||
# Per-user rate limiting: at most RATE_LIMIT_MAX requests inside any
# rolling RATE_LIMIT_WINDOW_S-second window. The defaults are generous
# for one human at a keyboard yet still stop automated abuse.
RATE_LIMIT_WINDOW_S = int(os.environ.get("AGENT_RATE_WINDOW_S", "3600"))
RATE_LIMIT_MAX = int(os.environ.get("AGENT_RATE_MAX", "60"))
# How many requests one user may have in flight at once (prevents
# fanning out dozens of parallel calls).
CONCURRENCY_LIMIT_PER_USER = int(os.environ.get("AGENT_CONCURRENCY_PER_USER", "1"))

# username -> timestamps of that user's recent /agent/ask calls.
_rate_state: dict[str, deque[float]] = {}
# username -> semaphore bounding that user's concurrent claude
# subprocesses (each one is expensive).
_user_semaphores: dict[str, asyncio.Semaphore] = {}
|
||||
|
||||
|
||||
def _check_rate_limit(username: str) -> tuple[bool, int]:
    """Record one request for *username* and report whether it is allowed.

    A per-user deque of ``time.monotonic()`` timestamps is kept in
    ``_rate_state``. Timestamps older than ``RATE_LIMIT_WINDOW_S`` seconds
    are discarded first. If ``RATE_LIMIT_MAX`` or more remain, the request
    is rejected and NOT recorded; otherwise the current time is appended.

    Returns:
        ``(True, 0)`` when the request is allowed, otherwise
        ``(False, retry_after_seconds)``, where the delay is the time until
        the oldest recorded request leaves the window (always >= 1).
    """
    now = time.monotonic()
    window = _rate_state.setdefault(username, deque())
    cutoff = now - RATE_LIMIT_WINDOW_S
    # Timestamps are appended in increasing order, so expired entries are
    # always at the left end.
    while window and window[0] < cutoff:
        window.popleft()
    if len(window) >= RATE_LIMIT_MAX:
        if not window:
            # Only reachable when RATE_LIMIT_MAX <= 0 (limit configured to
            # block everything). There is no oldest entry to wait for, so
            # report a full window instead of raising IndexError.
            return False, max(1, RATE_LIMIT_WINDOW_S)
        # Truncate-then-add-one so the caller never retries a hair early.
        retry_after = int(window[0] + RATE_LIMIT_WINDOW_S - now) + 1
        return False, retry_after
    window.append(now)
    return True, 0
|
||||
|
||||
|
||||
def _user_semaphore(username: str) -> asyncio.Semaphore:
    """Return the semaphore for *username*, creating it on first use.

    New semaphores start with ``CONCURRENCY_LIMIT_PER_USER`` permits and
    are cached in ``_user_semaphores`` so later calls for the same user get
    the same object.
    """
    try:
        return _user_semaphores[username]
    except KeyError:
        created = asyncio.Semaphore(CONCURRENCY_LIMIT_PER_USER)
        _user_semaphores[username] = created
        return created
|
||||
|
||||
|
||||
def _audit(event: dict[str, Any]) -> None:
    """Emit one JSONL line to the audit log.

    A UTC ``timestamp`` field (``YYYY-MM-DDTHH:MM:SSZ``) is added to a copy
    of *event*, so the caller's dict is left untouched. The copy is
    serialized with ``ensure_ascii=False`` and written through
    ``audit_logger``.

    Auditing is best-effort and never raises: a failure (for example a
    value that is not JSON-serializable) is reported on the service logger
    instead of being silently discarded.
    """
    record = {
        **event,
        "timestamp": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
    }
    try:
        audit_logger.info(json.dumps(record, ensure_ascii=False))
    except Exception:  # noqa: BLE001
        # Must not break the request path, but a lost audit line should
        # at least be visible to the operator.
        logger.warning(
            "audit log write failed for event=%r", record.get("event"), exc_info=True
        )
|
||||
|
||||
|
||||
app = FastAPI(title="Overlord Agent", version="0.1.0")
|
||||
|
||||
|
||||
|
|
@ -101,25 +164,96 @@ async def new_session(_user: dict = Depends(auth.require_user)) -> NewSessionRes
|
|||
async def agent_ask(
|
||||
req: AskRequest, user: dict = Depends(auth.require_user)
|
||||
) -> AskResponse:
|
||||
"""Forward a message to claude -p resuming the given session."""
|
||||
started = time.monotonic()
|
||||
try:
|
||||
result = await ask_claude(req.message, req.session_id)
|
||||
except ClaudeError as e:
|
||||
logger.warning(
|
||||
"claude failed user=%s session=%s err=%s", user["username"], req.session_id, e
|
||||
"""Forward a message to claude -p resuming the given session.
|
||||
|
||||
Enforces:
|
||||
* Per-user rate limit (60 requests/hour by default).
|
||||
* Per-user concurrency cap (1 in-flight at a time by default).
|
||||
* Audit log of every request (JSONL).
|
||||
"""
|
||||
username = user["username"]
|
||||
|
||||
# Rate limit BEFORE acquiring the user semaphore — cheaper to reject.
|
||||
allowed, retry_after = _check_rate_limit(username)
|
||||
if not allowed:
|
||||
_audit(
|
||||
{
|
||||
"event": "rate_limited",
|
||||
"user": username,
|
||||
"session_id": req.session_id,
|
||||
"retry_after_s": retry_after,
|
||||
}
|
||||
)
|
||||
raise HTTPException(status_code=502, detail=str(e))
|
||||
raise HTTPException(
|
||||
status_code=429,
|
||||
detail=f"Rate limit exceeded; retry in {retry_after}s",
|
||||
headers={"Retry-After": str(retry_after)},
|
||||
)
|
||||
|
||||
sem = _user_semaphore(username)
|
||||
if sem.locked():
|
||||
_audit(
|
||||
{
|
||||
"event": "concurrency_blocked",
|
||||
"user": username,
|
||||
"session_id": req.session_id,
|
||||
}
|
||||
)
|
||||
raise HTTPException(
|
||||
status_code=429, detail="A previous question is still being processed"
|
||||
)
|
||||
|
||||
started = time.monotonic()
|
||||
async with sem:
|
||||
_audit(
|
||||
{
|
||||
"event": "ask_start",
|
||||
"user": username,
|
||||
"session_id": req.session_id,
|
||||
"message": req.message[:500],
|
||||
"message_len": len(req.message),
|
||||
}
|
||||
)
|
||||
try:
|
||||
result = await ask_claude(req.message, req.session_id)
|
||||
except ClaudeError as e:
|
||||
elapsed_ms = int((time.monotonic() - started) * 1000)
|
||||
logger.warning(
|
||||
"claude failed user=%s session=%s err=%s", username, req.session_id, e
|
||||
)
|
||||
_audit(
|
||||
{
|
||||
"event": "ask_error",
|
||||
"user": username,
|
||||
"session_id": req.session_id,
|
||||
"error": str(e)[:500],
|
||||
"elapsed_ms": elapsed_ms,
|
||||
}
|
||||
)
|
||||
raise HTTPException(status_code=502, detail=str(e))
|
||||
|
||||
elapsed_ms = int((time.monotonic() - started) * 1000)
|
||||
logger.info(
|
||||
"ask user=%s session=%s turns=%d duration_ms=%d (subprocess=%dms)",
|
||||
user["username"],
|
||||
username,
|
||||
result.session_id,
|
||||
result.num_turns,
|
||||
elapsed_ms,
|
||||
result.duration_ms,
|
||||
)
|
||||
_audit(
|
||||
{
|
||||
"event": "ask_ok",
|
||||
"user": username,
|
||||
"session_id": result.session_id,
|
||||
"result_preview": (result.result or "")[:300],
|
||||
"result_len": len(result.result or ""),
|
||||
"turns": result.num_turns,
|
||||
"elapsed_ms": elapsed_ms,
|
||||
"subprocess_ms": result.duration_ms,
|
||||
"is_error": result.is_error,
|
||||
}
|
||||
)
|
||||
|
||||
return AskResponse(
|
||||
result=result.result,
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue