From 9f8d3b975fc39250709099e6e3b0dd2457cdd589 Mon Sep 17 00:00:00 2001 From: "david.delagneau" Date: Wed, 20 May 2026 14:51:09 -0600 Subject: [PATCH] feat: enhance service manager with manifest validation, logging rotation, and command existence checks --- profiles/fidelity/services.json | 19 ++++++- scripts/aiw/README.md | 11 ++++ scripts/aiw/services.py | 90 +++++++++++++++++++++++++++++++-- scripts/aiw/test_services.py | 44 ++++++++++++++++ 4 files changed, 159 insertions(+), 5 deletions(-) diff --git a/profiles/fidelity/services.json b/profiles/fidelity/services.json index 1788bc0..bc61f38 100644 --- a/profiles/fidelity/services.json +++ b/profiles/fidelity/services.json @@ -9,6 +9,10 @@ "command": ["python3", "scripts/mcp/aiw-context-mcp/server.py"], "groups": ["mcp", "context"], "restart": "on-failure", + "doctor": { + "required_commands": ["python3"], + "required_paths": ["scripts/mcp/aiw-context-mcp/server.py"] + }, "health": { "type": "http", "url": "http://127.0.0.1:8765/health" @@ -21,6 +25,10 @@ "command": ["scripts/mattermost-proxy/run-mirror.sh"], "groups": ["communication", "mattermost", "capture"], "restart": "on-failure", + "doctor": { + "required_commands": ["mitmdump"], + "optional_paths": ["scripts/mattermost-proxy/.env"] + }, "health": { "type": "tcp", "host": "127.0.0.1", @@ -34,7 +42,11 @@ "command": ["scripts/mattermost-proxy/launch-mattermost.sh"], "groups": ["communication", "mattermost"], "depends_on": ["mattermost-proxy"], - "restart": "never" + "restart": "never", + "doctor": { + "required_paths": ["/Applications/Mattermost.app"], + "optional_paths": ["scripts/mattermost-proxy/.env"] + } }, "photo-inbox": { "enabled": true, @@ -43,6 +55,11 @@ "command": ["scripts/iphone-photo-inbox/run.sh"], "groups": ["inbox", "photos", "capture"], "restart": "on-failure", + "doctor": { + "required_commands": ["python3"], + "optional_commands": ["swiftc"], + "optional_paths": ["scripts/iphone-photo-inbox/.env"] + }, "health": { "type": "http", "url": "http://127.0.0.1:8787/health" diff --git a/scripts/aiw/README.md b/scripts/aiw/README.md index fcddf69..18284a4 100644 --- a/scripts/aiw/README.md +++ b/scripts/aiw/README.md @@ -30,6 +30,17 @@ python3 scripts/aiw/services.py start --profile fidelity --group inbox The service manager unifies startup and status. It does not move capture behavior into the MCP. +## Robustness features + +- Manifest validation before lifecycle actions. +- Dependency-aware startup through `depends_on`. +- Managed PID/state files under `.aiw/runtime/`. +- Per-service logs under `.aiw/runtime/logs/`. +- Simple log rotation before service start. +- TCP/HTTP health checks. +- Doctor checks for required commands and paths declared in the profile manifest. +- Protection against starting duplicate services when a matching health check is already passing externally. + ## Tests ```bash diff --git a/scripts/aiw/services.py b/scripts/aiw/services.py index a8227c9..e25b420 100644 --- a/scripts/aiw/services.py +++ b/scripts/aiw/services.py @@ -28,6 +28,9 @@ RUNTIME_DIR = ROOT / ".aiw" / "runtime" PID_DIR = RUNTIME_DIR / "pids" LOG_DIR = RUNTIME_DIR / "logs" STATE_DIR = RUNTIME_DIR / "state" +DEFAULT_LOG_MAX_BYTES = 5 * 1024 * 1024 +DEFAULT_LOG_BACKUPS = 3 +DEFAULT_STOP_TIMEOUT_SECONDS = 5.0 @dataclass(frozen=True) @@ -52,6 +55,30 @@ def load_manifest(profile: str) -> dict[str, Any]: return json.loads(path.read_text(encoding="utf-8")) +def validate_manifest(manifest: dict[str, Any]) -> list[str]: + errors: list[str] = [] + services = manifest.get("services") + if not isinstance(services, dict): + return ["manifest must contain a services object"] + for name, config in services.items(): + if not isinstance(config, dict): + errors.append(f"{name}: service config must be an object") + continue + command = config.get("command") + if config.get("enabled", True) and (not isinstance(command, list) or not command): + errors.append(f"{name}: enabled services require a non-empty command list") + kind = config.get("kind", "process") + if kind not in {"process", "app-launcher", "mcp"}: + errors.append(f"{name}: unsupported kind {kind!r}") + restart = config.get("restart", "never") + if restart not in {"never", "on-failure", "always"}: + errors.append(f"{name}: unsupported restart policy {restart!r}") + for dependency in config.get("depends_on") or []: + if dependency not in services: + errors.append(f"{name}: depends on unknown service {dependency!r}") + return errors + + def service_items(manifest: dict[str, Any], include_disabled: bool = False) -> list[ServiceRef]: services = manifest.get("services") or {} refs: list[ServiceRef] = [] @@ -94,6 +121,34 @@ def log_path(profile: str, service: str) -> Path: return LOG_DIR / profile / f"{service}.log" +def resolve_workspace_path(raw: str) -> Path: + path = Path(raw).expanduser() + return path if path.is_absolute() else ROOT / path + + +def command_exists(command: str) -> bool: + if not command: + return False + path = Path(command) + if path.is_absolute() or "/" in command: + resolved = resolve_workspace_path(command) + return resolved.exists() and os.access(resolved, os.X_OK) + return shutil_which(command) is not None + + +def rotate_log_if_needed(path: Path, max_bytes: int = DEFAULT_LOG_MAX_BYTES, backups: int = DEFAULT_LOG_BACKUPS) -> None: + if max_bytes <= 0 or backups <= 0 or not path.exists() or path.stat().st_size < max_bytes: + return + oldest = path.with_suffix(path.suffix + f".{backups}") + oldest.unlink(missing_ok=True) + for index in range(backups - 1, 0, -1): + src = path.with_suffix(path.suffix + f".{index}") + dst = path.with_suffix(path.suffix + f".{index + 1}") + if src.exists(): + src.replace(dst) + path.replace(path.with_suffix(path.suffix + ".1")) + + def read_pid(profile: str, service: str) -> int | None: path = pid_path(profile, service) if not path.is_file(): @@ -189,6 +244,8 @@ def start_service(profile: str, ref: ServiceRef, manifest: dict[str, Any], start command = ref.config.get("command") or [] if not command: raise SystemExit(f"{ref.name} has no command") + if not command_exists(str(command[0])): + raise SystemExit(f"{ref.name} command is not executable or not found: {command[0]}") if kind != "app-launcher": pid = read_pid(profile, ref.name) @@ -206,6 +263,7 @@ def start_service(profile: str, ref: ServiceRef, manifest: dict[str, Any], start path = log_path(profile, ref.name) path.parent.mkdir(parents=True, exist_ok=True) + rotate_log_if_needed(path, int(ref.config.get("log_max_bytes", DEFAULT_LOG_MAX_BYTES)), int(ref.config.get("log_backups", DEFAULT_LOG_BACKUPS))) with path.open("ab") as log_file: log_file.write(f"\n--- start {time.strftime('%Y-%m-%d %H:%M:%S')} ---\n".encode("utf-8")) if kind == "app-launcher": @@ -245,7 +303,7 @@ def stop_service(profile: str, ref: ServiceRef) -> None: pass except PermissionError: os.kill(pid, signal.SIGTERM) - deadline = time.time() + 5 + deadline = time.time() + float(ref.config.get("stop_timeout_seconds", DEFAULT_STOP_TIMEOUT_SECONDS)) while time.time() < deadline and is_running(pid): time.sleep(0.2) if is_running(pid): @@ -302,16 +360,36 @@ def run_doctor(profile: str, manifest: dict[str, Any]) -> None: print(f"manifest: {manifest_path(profile)}") ensure_runtime() print(f"runtime: {RUNTIME_DIR}") + errors = validate_manifest(manifest) + if errors: + print("manifest: invalid") + for error in errors: + print(f" ! {error}") + else: + print("manifest: ok") for ref in service_items(manifest, include_disabled=True): enabled = ref.config.get("enabled", True) command = ref.config.get("command") or [] first = command[0] if command else "" - command_path = ROOT / first if first and not os.path.isabs(first) else Path(first) if first else None - command_ok = bool(command_path and (command_path.exists() or shutil_which(first))) - ok, detail = health_ok(ref.config) + command_ok = command_exists(first) if first else False enabled_text = "enabled" if enabled else "disabled" + if not enabled: + print(f"- {ref.name}: {enabled_text}; command={'ok' if command_ok else 'missing'}; health skipped") + continue + ok, detail = health_ok(ref.config) health_text = detail if ok is not None else "no health check" print(f"- {ref.name}: {enabled_text}; command={'ok' if command_ok else 'missing'}; {health_text}") + doctor = ref.config.get("doctor") or {} + for command_name in doctor.get("required_commands") or []: + print(f" required command {command_name}: {'ok' if command_exists(command_name) else 'missing'}") + for command_name in doctor.get("optional_commands") or []: + print(f" optional command {command_name}: {'ok' if command_exists(command_name) else 'missing'}") + for raw_path in doctor.get("required_paths") or []: + path = resolve_workspace_path(str(raw_path)) + print(f" required path {raw_path}: {'ok' if path.exists() else 'missing'}") + for raw_path in doctor.get("optional_paths") or []: + path = resolve_workspace_path(str(raw_path)) + print(f" optional path {raw_path}: {'ok' if path.exists() else 'missing'}") def shutil_which(command: str) -> str | None: @@ -339,6 +417,10 @@ def main() -> None: run_doctor(args.profile, manifest) return + errors = validate_manifest(manifest) + if errors: + raise SystemExit("invalid services manifest:\n" + "\n".join(f"- {error}" for error in errors)) + include_disabled = args.action == "status" refs = select_services(manifest, args.services, args.group or None, include_disabled=include_disabled) diff --git a/scripts/aiw/test_services.py b/scripts/aiw/test_services.py index 70eec67..f22ec8b 100644 --- a/scripts/aiw/test_services.py +++ b/scripts/aiw/test_services.py @@ -75,6 +75,50 @@ class ServiceManagerTests(unittest.TestCase): self.assertTrue(ok) self.assertIn(f"127.0.0.1:{port}", detail) + def test_validate_manifest_reports_bad_dependencies(self) -> None: + manifest = { + "services": { + "bad": { + "enabled": True, + "kind": "process", + "command": ["python3", "-c", "pass"], + "depends_on": ["missing"], + } + } + } + + errors = services.validate_manifest(manifest) + + self.assertTrue(any("depends on unknown service" in error for error in errors)) + + def test_validate_manifest_reports_missing_command_for_enabled_service(self) -> None: + manifest = {"services": {"bad": {"enabled": True, "kind": "process"}}} + + errors = services.validate_manifest(manifest) + + self.assertTrue(any("non-empty command" in error for error in errors)) + + def test_command_exists_supports_relative_executable_paths(self) -> None: + with tempfile.TemporaryDirectory() as tmp: + root = Path(tmp) + script = root / "bin" / "tool.sh" + script.parent.mkdir(parents=True) + script.write_text("#!/usr/bin/env bash\n", encoding="utf-8") + script.chmod(0o755) + + with patch.object(services, "ROOT", root): + self.assertTrue(services.command_exists("bin/tool.sh")) + + def test_rotate_log_if_needed_keeps_backups(self) -> None: + with tempfile.TemporaryDirectory() as tmp: + log = Path(tmp) / "service.log" + log.write_text("old", encoding="utf-8") + + services.rotate_log_if_needed(log, max_bytes=1, backups=2) + + self.assertFalse(log.exists()) + self.assertEqual(log.with_suffix(".log.1").read_text(encoding="utf-8"), "old") + def test_read_pid_ignores_invalid_pid_file(self) -> None: with tempfile.TemporaryDirectory() as tmp: pid_dir = Path(tmp) / "pids"