feat(agent): security hardening — systemd lockdown, rate limit, audit log
systemd unit now applies defense-in-depth: - ProtectSystem=strict + ProtectHome=read-only (rest of FS sealed) - ReadWritePaths only for ~/.claude (session JSONLs) and venv + audit log - InaccessiblePaths blocks /etc/shadow, /etc/ssh, /root, ~/.ssh, shell history - NoNewPrivileges + dropped capabilities (no setuid escalation, no caps) - PrivateTmp, PrivateDevices, ProtectKernel*, MemoryDenyWriteExecute - SystemCallFilter @system-service ~@privileged ~@debug ~@mount etc. - RestrictAddressFamilies blocks raw/packet sockets Application layer: - Per-user rate limit 60/hour (configurable via AGENT_RATE_MAX) - Per-user concurrency cap of 1 in-flight (no parallel claude burns) - JSONL audit log of every /agent/ask to /var/log/overlord-agent/audit.jsonl Logs username, message preview, result preview, timing, errors. Plus secrets migration: EnvironmentFile now prefers /etc/overlord/agent.env (root:erik 0640) over /home/erik/MosswartOverlord/.env, so even the read-only /home doesn't expose them. Falls back to old path during transition.
This commit is contained in:
parent
4ae18536be
commit
9d4c724b7f
2 changed files with 219 additions and 12 deletions
|
|
@ -11,19 +11,92 @@ Group=erik
|
||||||
# - claude -p sessions land at ~/.claude/projects/-home-erik-MosswartOverlord/
|
# - claude -p sessions land at ~/.claude/projects/-home-erik-MosswartOverlord/
|
||||||
# - .mcp.json is auto-loaded
|
# - .mcp.json is auto-loaded
|
||||||
WorkingDirectory=/home/erik/MosswartOverlord
|
WorkingDirectory=/home/erik/MosswartOverlord
|
||||||
|
# Secrets moved OUT of /home/erik/ to /etc/overlord/agent.env so
|
||||||
|
# ProtectHome=read-only blocks their read entirely. The file is
|
||||||
|
# root-owned, mode 0640, group=erik.
|
||||||
|
EnvironmentFile=-/etc/overlord/agent.env
|
||||||
|
# Backwards-compat: also try the old location during transition.
|
||||||
EnvironmentFile=-/home/erik/MosswartOverlord/.env
|
EnvironmentFile=-/home/erik/MosswartOverlord/.env
|
||||||
# Run inside the venv populated by install.sh.
|
# Run inside the venv populated by install.sh.
|
||||||
ExecStart=/home/erik/MosswartOverlord/agent/.venv/bin/python -m agent.service
|
ExecStart=/home/erik/MosswartOverlord/agent/.venv/bin/python -m agent.service
|
||||||
Restart=on-failure
|
Restart=on-failure
|
||||||
RestartSec=3
|
RestartSec=3
|
||||||
# Don't tie up the disk with stdout — let journald handle it.
|
|
||||||
StandardOutput=journal
|
StandardOutput=journal
|
||||||
StandardError=journal
|
StandardError=journal
|
||||||
|
|
||||||
# Resource hints — the service is light, but cap so a runaway can't
|
# ─── Resource caps ─────────────────────────────────────────────────
|
||||||
# starve the host.
|
|
||||||
MemoryLimit=512M
|
MemoryLimit=512M
|
||||||
CPUQuota=200%
|
CPUQuota=200%
|
||||||
|
TasksMax=128
|
||||||
|
|
||||||
|
# ─── Filesystem hardening ──────────────────────────────────────────
|
||||||
|
# ProtectSystem=strict mounts the entire file system hierarchy read-only
|
||||||
|
# (except /dev, /proc and /sys). Subprocesses inherit these protections.
|
||||||
|
ProtectSystem=strict
|
||||||
|
ProtectHome=read-only
|
||||||
|
# Allow writing only to the explicit paths claude / our service need.
|
||||||
|
# - ~/.claude — session JSONL files
|
||||||
|
# - .venv pycache — minor pip cache writes
|
||||||
|
ReadWritePaths=/home/erik/.claude
|
||||||
|
ReadWritePaths=/home/erik/MosswartOverlord/agent/.venv
|
||||||
|
ReadWritePaths=/var/log/overlord-agent
|
||||||
|
# Keep $HOME visible to the venv python so it can find pip cache etc.
|
||||||
|
# (read-only via ProtectHome=read-only — the only writable carve-outs are
|
||||||
|
# the .claude session dir and the agent .venv above.)
|
||||||
|
LogsDirectory=overlord-agent
|
||||||
|
LogsDirectoryMode=0755
|
||||||
|
PrivateTmp=true
|
||||||
|
PrivateDevices=true
|
||||||
|
ProtectClock=true
|
||||||
|
ProtectKernelTunables=true
|
||||||
|
ProtectKernelModules=true
|
||||||
|
ProtectKernelLogs=true
|
||||||
|
ProtectControlGroups=true
|
||||||
|
ProtectHostname=true
|
||||||
|
ProtectProc=invisible
|
||||||
|
ProcSubset=pid
|
||||||
|
|
||||||
|
# Hide sensitive host paths even if something in the python or claude
|
||||||
|
# subprocess tree tries to read them.
|
||||||
|
InaccessiblePaths=/etc/shadow
|
||||||
|
InaccessiblePaths=/etc/gshadow
|
||||||
|
InaccessiblePaths=/etc/ssh
|
||||||
|
InaccessiblePaths=/root
|
||||||
|
InaccessiblePaths=-/home/erik/.ssh
|
||||||
|
InaccessiblePaths=-/home/erik/.bash_history
|
||||||
|
InaccessiblePaths=-/home/erik/.zsh_history
|
||||||
|
|
||||||
|
# ─── Privilege & capability hardening ──────────────────────────────
|
||||||
|
NoNewPrivileges=true
|
||||||
|
CapabilityBoundingSet=
|
||||||
|
AmbientCapabilities=
|
||||||
|
LockPersonality=true
|
||||||
|
RestrictRealtime=true
|
||||||
|
RestrictSUIDSGID=true
|
||||||
|
RemoveIPC=true
|
||||||
|
MemoryDenyWriteExecute=true
|
||||||
|
RestrictNamespaces=true
|
||||||
|
|
||||||
|
# ─── Network family restriction ────────────────────────────────────
|
||||||
|
# Block raw/packet sockets so even a kernel-LPE-class bug can't sniff
|
||||||
|
# traffic or forge packets. We don't IPAddressAllow-restrict because
|
||||||
|
# Anthropic's Cloudflare IPs shift and the whitelist would break claude.
|
||||||
|
# If you need true egress filtering, run nftables scoped to this
|
||||||
|
# service's cgroup — that's reliable in a way IPAddressAllow isn't.
|
||||||
|
RestrictAddressFamilies=AF_UNIX AF_INET AF_INET6
|
||||||
|
|
||||||
|
# ─── Syscall filter ────────────────────────────────────────────────
|
||||||
|
SystemCallArchitectures=native
|
||||||
|
SystemCallFilter=@system-service
|
||||||
|
SystemCallFilter=~@privileged
|
||||||
|
SystemCallFilter=~@resources
|
||||||
|
SystemCallFilter=~@debug
|
||||||
|
SystemCallFilter=~@mount
|
||||||
|
SystemCallFilter=~@cpu-emulation
|
||||||
|
SystemCallFilter=~@obsolete
|
||||||
|
SystemCallFilter=~@reboot
|
||||||
|
SystemCallFilter=~@swap
|
||||||
|
SystemCallFilter=~@raw-io
|
||||||
|
|
||||||
[Install]
|
[Install]
|
||||||
WantedBy=multi-user.target
|
WantedBy=multi-user.target
|
||||||
|
|
|
||||||
152
agent/service.py
152
agent/service.py
|
|
@ -15,10 +15,13 @@ cookie that dereth-tracker issues.
|
||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import asyncio
|
||||||
import json
|
import json
|
||||||
import logging
|
import logging
|
||||||
|
import os
|
||||||
import time
|
import time
|
||||||
import uuid
|
import uuid
|
||||||
|
from collections import deque
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Any
|
from typing import Any
|
||||||
|
|
||||||
|
|
@ -35,6 +38,66 @@ logging.basicConfig(
|
||||||
)
|
)
|
||||||
logger = logging.getLogger("agent")
|
logger = logging.getLogger("agent")
|
||||||
|
|
||||||
|
# Audit log — every /agent/ask request gets a JSONL line here, separate
# from journald so the operator can grep without root. Set to /dev/null
# to disable. Rotated externally (logrotate) if it gets big.
import logging.handlers  # local to this setup block; provides WatchedFileHandler

AUDIT_LOG_PATH = Path(os.getenv("AGENT_AUDIT_LOG", "/var/log/overlord-agent/audit.jsonl"))
audit_logger = logging.getLogger("agent.audit")
try:
    AUDIT_LOG_PATH.parent.mkdir(parents=True, exist_ok=True)
    # WatchedFileHandler reopens the file when logrotate moves or replaces
    # it; a plain FileHandler would keep appending to the rotated-away file.
    # UTF-8 is pinned because audit lines are serialized with
    # ensure_ascii=False and may contain arbitrary user text.
    _h = logging.handlers.WatchedFileHandler(AUDIT_LOG_PATH, encoding="utf-8")
    _h.setFormatter(logging.Formatter("%(message)s"))
    audit_logger.addHandler(_h)
    # Keep audit lines out of the root logger's handlers once the file
    # handler is in place.
    audit_logger.propagate = False
    audit_logger.setLevel(logging.INFO)
except OSError as e:
    # No file handler was attached, so propagate stays at its default.
    logger.warning("audit log path %s not writable (%s); logging only via journal", AUDIT_LOG_PATH, e)
|
||||||
|
|
||||||
|
# Rate limit: per-user count over a rolling window. Defaults are generous
|
||||||
|
# for a single human at a keyboard but block automated abuse.
|
||||||
|
RATE_LIMIT_WINDOW_S = int(os.getenv("AGENT_RATE_WINDOW_S", "3600"))
|
||||||
|
RATE_LIMIT_MAX = int(os.getenv("AGENT_RATE_MAX", "60"))
|
||||||
|
# Per-user concurrent request cap (no fanning out 50 calls in parallel).
|
||||||
|
CONCURRENCY_LIMIT_PER_USER = int(os.getenv("AGENT_CONCURRENCY_PER_USER", "1"))
|
||||||
|
|
||||||
|
# Rolling timestamps of recent /agent/ask calls per user.
|
||||||
|
_rate_state: dict[str, deque[float]] = {}
|
||||||
|
# Per-user semaphores so a single user can't run multiple concurrent claude
|
||||||
|
# subprocesses (each is expensive).
|
||||||
|
_user_semaphores: dict[str, asyncio.Semaphore] = {}
|
||||||
|
|
||||||
|
|
||||||
|
def _check_rate_limit(username: str) -> tuple[bool, int]:
    """Record a request for *username* and report whether it is allowed.

    Keeps a per-user deque of ``time.monotonic()`` stamps in ``_rate_state``,
    discards stamps older than ``RATE_LIMIT_WINDOW_S`` and admits the call
    only while fewer than ``RATE_LIMIT_MAX`` stamps remain. Rejected calls
    are not recorded.

    Returns:
        ``(True, 0)`` when the call is allowed, otherwise
        ``(False, retry_after_seconds)``.
    """
    now = time.monotonic()
    window = _rate_state.setdefault(username, deque())
    cutoff = now - RATE_LIMIT_WINDOW_S
    # Stamps are appended in order, so expired ones are always at the left.
    while window and window[0] < cutoff:
        window.popleft()
    if len(window) >= RATE_LIMIT_MAX:
        if not window:
            # Only reachable when RATE_LIMIT_MAX <= 0 (endpoint effectively
            # disabled): there is no oldest stamp to wait for, and indexing
            # window[0] would raise IndexError. Report a full window.
            return False, max(RATE_LIMIT_WINDOW_S, 1)
        # Seconds until the oldest stamp leaves the window, rounded up.
        retry_after = int(window[0] + RATE_LIMIT_WINDOW_S - now) + 1
        return False, retry_after
    window.append(now)
    return True, 0
|
||||||
|
|
||||||
|
|
||||||
|
def _user_semaphore(username: str) -> asyncio.Semaphore:
    """Return the semaphore for *username*, creating it on first use.

    New semaphores are sized by ``CONCURRENCY_LIMIT_PER_USER`` and cached in
    ``_user_semaphores`` so later calls for the same user share one object.
    """
    try:
        return _user_semaphores[username]
    except KeyError:
        created = asyncio.Semaphore(CONCURRENCY_LIMIT_PER_USER)
        _user_semaphores[username] = created
        return created
|
||||||
|
|
||||||
|
|
||||||
|
def _audit(event: dict[str, Any]) -> None:
    """Emit one JSONL line to the audit log.

    A UTC ``timestamp`` field is added to a copy of *event*, so the caller's
    dict is left untouched. Auditing is best-effort: a failure to serialize
    or hand off the line is reported on the main logger and never raised.
    """
    record = {**event, "timestamp": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())}
    try:
        audit_logger.info(json.dumps(record, ensure_ascii=False))
    except Exception:  # noqa: BLE001 — auditing must never break the request
        # Surface the drop instead of swallowing it silently.
        logger.warning("audit event %r dropped", record.get("event"), exc_info=True)
|
||||||
|
|
||||||
|
|
||||||
app = FastAPI(title="Overlord Agent", version="0.1.0")
|
app = FastAPI(title="Overlord Agent", version="0.1.0")
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -101,25 +164,96 @@ async def new_session(_user: dict = Depends(auth.require_user)) -> NewSessionRes
|
||||||
async def agent_ask(
|
async def agent_ask(
|
||||||
req: AskRequest, user: dict = Depends(auth.require_user)
|
req: AskRequest, user: dict = Depends(auth.require_user)
|
||||||
) -> AskResponse:
|
) -> AskResponse:
|
||||||
"""Forward a message to claude -p resuming the given session."""
|
"""Forward a message to claude -p resuming the given session.
|
||||||
started = time.monotonic()
|
|
||||||
try:
|
Enforces:
|
||||||
result = await ask_claude(req.message, req.session_id)
|
* Per-user rate limit (60 requests/hour by default).
|
||||||
except ClaudeError as e:
|
* Per-user concurrency cap (1 in-flight at a time by default).
|
||||||
logger.warning(
|
* Audit log of every request (JSONL).
|
||||||
"claude failed user=%s session=%s err=%s", user["username"], req.session_id, e
|
"""
|
||||||
|
username = user["username"]
|
||||||
|
|
||||||
|
# Rate limit BEFORE acquiring the user semaphore — cheaper to reject.
|
||||||
|
allowed, retry_after = _check_rate_limit(username)
|
||||||
|
if not allowed:
|
||||||
|
_audit(
|
||||||
|
{
|
||||||
|
"event": "rate_limited",
|
||||||
|
"user": username,
|
||||||
|
"session_id": req.session_id,
|
||||||
|
"retry_after_s": retry_after,
|
||||||
|
}
|
||||||
)
|
)
|
||||||
raise HTTPException(status_code=502, detail=str(e))
|
raise HTTPException(
|
||||||
|
status_code=429,
|
||||||
|
detail=f"Rate limit exceeded; retry in {retry_after}s",
|
||||||
|
headers={"Retry-After": str(retry_after)},
|
||||||
|
)
|
||||||
|
|
||||||
|
sem = _user_semaphore(username)
|
||||||
|
if sem.locked():
|
||||||
|
_audit(
|
||||||
|
{
|
||||||
|
"event": "concurrency_blocked",
|
||||||
|
"user": username,
|
||||||
|
"session_id": req.session_id,
|
||||||
|
}
|
||||||
|
)
|
||||||
|
raise HTTPException(
|
||||||
|
status_code=429, detail="A previous question is still being processed"
|
||||||
|
)
|
||||||
|
|
||||||
|
started = time.monotonic()
|
||||||
|
async with sem:
|
||||||
|
_audit(
|
||||||
|
{
|
||||||
|
"event": "ask_start",
|
||||||
|
"user": username,
|
||||||
|
"session_id": req.session_id,
|
||||||
|
"message": req.message[:500],
|
||||||
|
"message_len": len(req.message),
|
||||||
|
}
|
||||||
|
)
|
||||||
|
try:
|
||||||
|
result = await ask_claude(req.message, req.session_id)
|
||||||
|
except ClaudeError as e:
|
||||||
|
elapsed_ms = int((time.monotonic() - started) * 1000)
|
||||||
|
logger.warning(
|
||||||
|
"claude failed user=%s session=%s err=%s", username, req.session_id, e
|
||||||
|
)
|
||||||
|
_audit(
|
||||||
|
{
|
||||||
|
"event": "ask_error",
|
||||||
|
"user": username,
|
||||||
|
"session_id": req.session_id,
|
||||||
|
"error": str(e)[:500],
|
||||||
|
"elapsed_ms": elapsed_ms,
|
||||||
|
}
|
||||||
|
)
|
||||||
|
raise HTTPException(status_code=502, detail=str(e))
|
||||||
|
|
||||||
elapsed_ms = int((time.monotonic() - started) * 1000)
|
elapsed_ms = int((time.monotonic() - started) * 1000)
|
||||||
logger.info(
|
logger.info(
|
||||||
"ask user=%s session=%s turns=%d duration_ms=%d (subprocess=%dms)",
|
"ask user=%s session=%s turns=%d duration_ms=%d (subprocess=%dms)",
|
||||||
user["username"],
|
username,
|
||||||
result.session_id,
|
result.session_id,
|
||||||
result.num_turns,
|
result.num_turns,
|
||||||
elapsed_ms,
|
elapsed_ms,
|
||||||
result.duration_ms,
|
result.duration_ms,
|
||||||
)
|
)
|
||||||
|
_audit(
|
||||||
|
{
|
||||||
|
"event": "ask_ok",
|
||||||
|
"user": username,
|
||||||
|
"session_id": result.session_id,
|
||||||
|
"result_preview": (result.result or "")[:300],
|
||||||
|
"result_len": len(result.result or ""),
|
||||||
|
"turns": result.num_turns,
|
||||||
|
"elapsed_ms": elapsed_ms,
|
||||||
|
"subprocess_ms": result.duration_ms,
|
||||||
|
"is_error": result.is_error,
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
return AskResponse(
|
return AskResponse(
|
||||||
result=result.result,
|
result=result.result,
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue