feat: enhance service manager with manifest validation, logging rotation, and command existence checks

This commit is contained in:
2026-05-20 14:51:09 -06:00
parent 1121433db8
commit 9f8d3b975f
4 changed files with 159 additions and 5 deletions

View File

@@ -9,6 +9,10 @@
"command": ["python3", "scripts/mcp/aiw-context-mcp/server.py"],
"groups": ["mcp", "context"],
"restart": "on-failure",
"doctor": {
"required_commands": ["python3"],
"required_paths": ["scripts/mcp/aiw-context-mcp/server.py"]
},
"health": {
"type": "http",
"url": "http://127.0.0.1:8765/health"
@@ -21,6 +25,10 @@
"command": ["scripts/mattermost-proxy/run-mirror.sh"],
"groups": ["communication", "mattermost", "capture"],
"restart": "on-failure",
"doctor": {
"required_commands": ["mitmdump"],
"optional_paths": ["scripts/mattermost-proxy/.env"]
},
"health": {
"type": "tcp",
"host": "127.0.0.1",
@@ -34,7 +42,11 @@
"command": ["scripts/mattermost-proxy/launch-mattermost.sh"],
"groups": ["communication", "mattermost"],
"depends_on": ["mattermost-proxy"],
"restart": "never"
"restart": "never",
"doctor": {
"required_paths": ["/Applications/Mattermost.app"],
"optional_paths": ["scripts/mattermost-proxy/.env"]
}
},
"photo-inbox": {
"enabled": true,
@@ -43,6 +55,11 @@
"command": ["scripts/iphone-photo-inbox/run.sh"],
"groups": ["inbox", "photos", "capture"],
"restart": "on-failure",
"doctor": {
"required_commands": ["python3"],
"optional_commands": ["swiftc"],
"optional_paths": ["scripts/iphone-photo-inbox/.env"]
},
"health": {
"type": "http",
"url": "http://127.0.0.1:8787/health"

View File

@@ -30,6 +30,17 @@ python3 scripts/aiw/services.py start --profile fidelity --group inbox
The service manager unifies startup and status. It does not move capture behavior into the MCP.
## Robustness features
- Manifest validation before lifecycle actions.
- Dependency-aware startup through `depends_on`.
- Managed PID/state files under `.aiw/runtime/`.
- Per-service logs under `.aiw/runtime/logs/`.
- Simple log rotation before service start.
- TCP/HTTP health checks.
- Doctor checks for required commands and paths declared in the profile manifest.
- Protection against starting duplicate services when a matching health check is already passing externally.
## Tests
```bash

View File

@@ -28,6 +28,9 @@ RUNTIME_DIR = ROOT / ".aiw" / "runtime"
PID_DIR = RUNTIME_DIR / "pids"
LOG_DIR = RUNTIME_DIR / "logs"
STATE_DIR = RUNTIME_DIR / "state"
DEFAULT_LOG_MAX_BYTES = 5 * 1024 * 1024
DEFAULT_LOG_BACKUPS = 3
DEFAULT_STOP_TIMEOUT_SECONDS = 5.0
@dataclass(frozen=True)
@@ -52,6 +55,30 @@ def load_manifest(profile: str) -> dict[str, Any]:
return json.loads(path.read_text(encoding="utf-8"))
def validate_manifest(manifest: dict[str, Any]) -> list[str]:
errors: list[str] = []
services = manifest.get("services")
if not isinstance(services, dict):
return ["manifest must contain a services object"]
for name, config in services.items():
if not isinstance(config, dict):
errors.append(f"{name}: service config must be an object")
continue
command = config.get("command")
if config.get("enabled", True) and (not isinstance(command, list) or not command):
errors.append(f"{name}: enabled services require a non-empty command list")
kind = config.get("kind", "process")
if kind not in {"process", "app-launcher", "mcp"}:
errors.append(f"{name}: unsupported kind {kind!r}")
restart = config.get("restart", "never")
if restart not in {"never", "on-failure", "always"}:
errors.append(f"{name}: unsupported restart policy {restart!r}")
for dependency in config.get("depends_on") or []:
if dependency not in services:
errors.append(f"{name}: depends on unknown service {dependency!r}")
return errors
def service_items(manifest: dict[str, Any], include_disabled: bool = False) -> list[ServiceRef]:
services = manifest.get("services") or {}
refs: list[ServiceRef] = []
@@ -94,6 +121,34 @@ def log_path(profile: str, service: str) -> Path:
return LOG_DIR / profile / f"{service}.log"
def resolve_workspace_path(raw: str) -> Path:
path = Path(raw).expanduser()
return path if path.is_absolute() else ROOT / path
def command_exists(command: str) -> bool:
if not command:
return False
path = Path(command)
if path.is_absolute() or "/" in command:
resolved = resolve_workspace_path(command)
return resolved.exists() and os.access(resolved, os.X_OK)
return shutil_which(command) is not None
def rotate_log_if_needed(path: Path, max_bytes: int = DEFAULT_LOG_MAX_BYTES, backups: int = DEFAULT_LOG_BACKUPS) -> None:
if max_bytes <= 0 or backups <= 0 or not path.exists() or path.stat().st_size < max_bytes:
return
oldest = path.with_suffix(path.suffix + f".{backups}")
oldest.unlink(missing_ok=True)
for index in range(backups - 1, 0, -1):
src = path.with_suffix(path.suffix + f".{index}")
dst = path.with_suffix(path.suffix + f".{index + 1}")
if src.exists():
src.replace(dst)
path.replace(path.with_suffix(path.suffix + ".1"))
def read_pid(profile: str, service: str) -> int | None:
path = pid_path(profile, service)
if not path.is_file():
@@ -189,6 +244,8 @@ def start_service(profile: str, ref: ServiceRef, manifest: dict[str, Any], start
command = ref.config.get("command") or []
if not command:
raise SystemExit(f"{ref.name} has no command")
if not command_exists(str(command[0])):
raise SystemExit(f"{ref.name} command is not executable or not found: {command[0]}")
if kind != "app-launcher":
pid = read_pid(profile, ref.name)
@@ -206,6 +263,7 @@ def start_service(profile: str, ref: ServiceRef, manifest: dict[str, Any], start
path = log_path(profile, ref.name)
path.parent.mkdir(parents=True, exist_ok=True)
rotate_log_if_needed(path, int(ref.config.get("log_max_bytes", DEFAULT_LOG_MAX_BYTES)), int(ref.config.get("log_backups", DEFAULT_LOG_BACKUPS)))
with path.open("ab") as log_file:
log_file.write(f"\n--- start {time.strftime('%Y-%m-%d %H:%M:%S')} ---\n".encode("utf-8"))
if kind == "app-launcher":
@@ -245,7 +303,7 @@ def stop_service(profile: str, ref: ServiceRef) -> None:
pass
except PermissionError:
os.kill(pid, signal.SIGTERM)
deadline = time.time() + 5
deadline = time.time() + float(ref.config.get("stop_timeout_seconds", DEFAULT_STOP_TIMEOUT_SECONDS))
while time.time() < deadline and is_running(pid):
time.sleep(0.2)
if is_running(pid):
@@ -302,16 +360,36 @@ def run_doctor(profile: str, manifest: dict[str, Any]) -> None:
print(f"manifest: {manifest_path(profile)}")
ensure_runtime()
print(f"runtime: {RUNTIME_DIR}")
errors = validate_manifest(manifest)
if errors:
print("manifest: invalid")
for error in errors:
print(f" ! {error}")
else:
print("manifest: ok")
for ref in service_items(manifest, include_disabled=True):
enabled = ref.config.get("enabled", True)
command = ref.config.get("command") or []
first = command[0] if command else ""
command_path = ROOT / first if first and not os.path.isabs(first) else Path(first) if first else None
command_ok = bool(command_path and (command_path.exists() or shutil_which(first)))
ok, detail = health_ok(ref.config)
command_ok = command_exists(first) if first else False
enabled_text = "enabled" if enabled else "disabled"
if not enabled:
print(f"- {ref.name}: {enabled_text}; command={'ok' if command_ok else 'missing'}; health skipped")
continue
ok, detail = health_ok(ref.config)
health_text = detail if ok is not None else "no health check"
print(f"- {ref.name}: {enabled_text}; command={'ok' if command_ok else 'missing'}; {health_text}")
doctor = ref.config.get("doctor") or {}
for command_name in doctor.get("required_commands") or []:
print(f" required command {command_name}: {'ok' if command_exists(command_name) else 'missing'}")
for command_name in doctor.get("optional_commands") or []:
print(f" optional command {command_name}: {'ok' if command_exists(command_name) else 'missing'}")
for raw_path in doctor.get("required_paths") or []:
path = resolve_workspace_path(str(raw_path))
print(f" required path {raw_path}: {'ok' if path.exists() else 'missing'}")
for raw_path in doctor.get("optional_paths") or []:
path = resolve_workspace_path(str(raw_path))
print(f" optional path {raw_path}: {'ok' if path.exists() else 'missing'}")
def shutil_which(command: str) -> str | None:
@@ -339,6 +417,10 @@ def main() -> None:
run_doctor(args.profile, manifest)
return
errors = validate_manifest(manifest)
if errors:
raise SystemExit("invalid services manifest:\n" + "\n".join(f"- {error}" for error in errors))
include_disabled = args.action == "status"
refs = select_services(manifest, args.services, args.group or None, include_disabled=include_disabled)

View File

@@ -75,6 +75,50 @@ class ServiceManagerTests(unittest.TestCase):
self.assertTrue(ok)
self.assertIn(f"127.0.0.1:{port}", detail)
def test_validate_manifest_reports_bad_dependencies(self) -> None:
manifest = {
"services": {
"bad": {
"enabled": True,
"kind": "process",
"command": ["python3", "-c", "pass"],
"depends_on": ["missing"],
}
}
}
errors = services.validate_manifest(manifest)
self.assertTrue(any("depends on unknown service" in error for error in errors))
def test_validate_manifest_reports_missing_command_for_enabled_service(self) -> None:
manifest = {"services": {"bad": {"enabled": True, "kind": "process"}}}
errors = services.validate_manifest(manifest)
self.assertTrue(any("non-empty command" in error for error in errors))
def test_command_exists_supports_relative_executable_paths(self) -> None:
with tempfile.TemporaryDirectory() as tmp:
root = Path(tmp)
script = root / "bin" / "tool.sh"
script.parent.mkdir(parents=True)
script.write_text("#!/usr/bin/env bash\n", encoding="utf-8")
script.chmod(0o755)
with patch.object(services, "ROOT", root):
self.assertTrue(services.command_exists("bin/tool.sh"))
def test_rotate_log_if_needed_keeps_backups(self) -> None:
with tempfile.TemporaryDirectory() as tmp:
log = Path(tmp) / "service.log"
log.write_text("old", encoding="utf-8")
services.rotate_log_if_needed(log, max_bytes=1, backups=2)
self.assertFalse(log.exists())
self.assertEqual(log.with_suffix(".log.1").read_text(encoding="utf-8"), "old")
def test_read_pid_ignores_invalid_pid_file(self) -> None:
with tempfile.TemporaryDirectory() as tmp:
pid_dir = Path(tmp) / "pids"