feat: enhance service manager with manifest validation, logging rotation, and command existence checks

This commit is contained in:
2026-05-20 14:51:09 -06:00
parent 1121433db8
commit 9f8d3b975f
4 changed files with 159 additions and 5 deletions

View File

@@ -9,6 +9,10 @@
"command": ["python3", "scripts/mcp/aiw-context-mcp/server.py"], "command": ["python3", "scripts/mcp/aiw-context-mcp/server.py"],
"groups": ["mcp", "context"], "groups": ["mcp", "context"],
"restart": "on-failure", "restart": "on-failure",
"doctor": {
"required_commands": ["python3"],
"required_paths": ["scripts/mcp/aiw-context-mcp/server.py"]
},
"health": { "health": {
"type": "http", "type": "http",
"url": "http://127.0.0.1:8765/health" "url": "http://127.0.0.1:8765/health"
@@ -21,6 +25,10 @@
"command": ["scripts/mattermost-proxy/run-mirror.sh"], "command": ["scripts/mattermost-proxy/run-mirror.sh"],
"groups": ["communication", "mattermost", "capture"], "groups": ["communication", "mattermost", "capture"],
"restart": "on-failure", "restart": "on-failure",
"doctor": {
"required_commands": ["mitmdump"],
"optional_paths": ["scripts/mattermost-proxy/.env"]
},
"health": { "health": {
"type": "tcp", "type": "tcp",
"host": "127.0.0.1", "host": "127.0.0.1",
@@ -34,7 +42,11 @@
"command": ["scripts/mattermost-proxy/launch-mattermost.sh"], "command": ["scripts/mattermost-proxy/launch-mattermost.sh"],
"groups": ["communication", "mattermost"], "groups": ["communication", "mattermost"],
"depends_on": ["mattermost-proxy"], "depends_on": ["mattermost-proxy"],
"restart": "never" "restart": "never",
"doctor": {
"required_paths": ["/Applications/Mattermost.app"],
"optional_paths": ["scripts/mattermost-proxy/.env"]
}
}, },
"photo-inbox": { "photo-inbox": {
"enabled": true, "enabled": true,
@@ -43,6 +55,11 @@
"command": ["scripts/iphone-photo-inbox/run.sh"], "command": ["scripts/iphone-photo-inbox/run.sh"],
"groups": ["inbox", "photos", "capture"], "groups": ["inbox", "photos", "capture"],
"restart": "on-failure", "restart": "on-failure",
"doctor": {
"required_commands": ["python3"],
"optional_commands": ["swiftc"],
"optional_paths": ["scripts/iphone-photo-inbox/.env"]
},
"health": { "health": {
"type": "http", "type": "http",
"url": "http://127.0.0.1:8787/health" "url": "http://127.0.0.1:8787/health"

View File

@@ -30,6 +30,17 @@ python3 scripts/aiw/services.py start --profile fidelity --group inbox
The service manager unifies startup and status. It does not move capture behavior into the MCP. The service manager unifies startup and status. It does not move capture behavior into the MCP.
## Robustness features
- Manifest validation before lifecycle actions.
- Dependency-aware startup through `depends_on`.
- Managed PID/state files under `.aiw/runtime/`.
- Per-service logs under `.aiw/runtime/logs/`.
- Simple log rotation before service start.
- TCP/HTTP health checks.
- Doctor checks for required commands and paths declared in the profile manifest.
- Protection against starting duplicate services when a matching health check is already passing externally.
## Tests ## Tests
```bash ```bash

View File

@@ -28,6 +28,9 @@ RUNTIME_DIR = ROOT / ".aiw" / "runtime"
PID_DIR = RUNTIME_DIR / "pids" PID_DIR = RUNTIME_DIR / "pids"
LOG_DIR = RUNTIME_DIR / "logs" LOG_DIR = RUNTIME_DIR / "logs"
STATE_DIR = RUNTIME_DIR / "state" STATE_DIR = RUNTIME_DIR / "state"
DEFAULT_LOG_MAX_BYTES = 5 * 1024 * 1024
DEFAULT_LOG_BACKUPS = 3
DEFAULT_STOP_TIMEOUT_SECONDS = 5.0
@dataclass(frozen=True) @dataclass(frozen=True)
@@ -52,6 +55,30 @@ def load_manifest(profile: str) -> dict[str, Any]:
return json.loads(path.read_text(encoding="utf-8")) return json.loads(path.read_text(encoding="utf-8"))
def validate_manifest(manifest: dict[str, Any]) -> list[str]:
errors: list[str] = []
services = manifest.get("services")
if not isinstance(services, dict):
return ["manifest must contain a services object"]
for name, config in services.items():
if not isinstance(config, dict):
errors.append(f"{name}: service config must be an object")
continue
command = config.get("command")
if config.get("enabled", True) and (not isinstance(command, list) or not command):
errors.append(f"{name}: enabled services require a non-empty command list")
kind = config.get("kind", "process")
if kind not in {"process", "app-launcher", "mcp"}:
errors.append(f"{name}: unsupported kind {kind!r}")
restart = config.get("restart", "never")
if restart not in {"never", "on-failure", "always"}:
errors.append(f"{name}: unsupported restart policy {restart!r}")
for dependency in config.get("depends_on") or []:
if dependency not in services:
errors.append(f"{name}: depends on unknown service {dependency!r}")
return errors
def service_items(manifest: dict[str, Any], include_disabled: bool = False) -> list[ServiceRef]: def service_items(manifest: dict[str, Any], include_disabled: bool = False) -> list[ServiceRef]:
services = manifest.get("services") or {} services = manifest.get("services") or {}
refs: list[ServiceRef] = [] refs: list[ServiceRef] = []
@@ -94,6 +121,34 @@ def log_path(profile: str, service: str) -> Path:
return LOG_DIR / profile / f"{service}.log" return LOG_DIR / profile / f"{service}.log"
def resolve_workspace_path(raw: str) -> Path:
path = Path(raw).expanduser()
return path if path.is_absolute() else ROOT / path
def command_exists(command: str) -> bool:
if not command:
return False
path = Path(command)
if path.is_absolute() or "/" in command:
resolved = resolve_workspace_path(command)
return resolved.exists() and os.access(resolved, os.X_OK)
return shutil_which(command) is not None
def rotate_log_if_needed(path: Path, max_bytes: int = DEFAULT_LOG_MAX_BYTES, backups: int = DEFAULT_LOG_BACKUPS) -> None:
if max_bytes <= 0 or backups <= 0 or not path.exists() or path.stat().st_size < max_bytes:
return
oldest = path.with_suffix(path.suffix + f".{backups}")
oldest.unlink(missing_ok=True)
for index in range(backups - 1, 0, -1):
src = path.with_suffix(path.suffix + f".{index}")
dst = path.with_suffix(path.suffix + f".{index + 1}")
if src.exists():
src.replace(dst)
path.replace(path.with_suffix(path.suffix + ".1"))
def read_pid(profile: str, service: str) -> int | None: def read_pid(profile: str, service: str) -> int | None:
path = pid_path(profile, service) path = pid_path(profile, service)
if not path.is_file(): if not path.is_file():
@@ -189,6 +244,8 @@ def start_service(profile: str, ref: ServiceRef, manifest: dict[str, Any], start
command = ref.config.get("command") or [] command = ref.config.get("command") or []
if not command: if not command:
raise SystemExit(f"{ref.name} has no command") raise SystemExit(f"{ref.name} has no command")
if not command_exists(str(command[0])):
raise SystemExit(f"{ref.name} command is not executable or not found: {command[0]}")
if kind != "app-launcher": if kind != "app-launcher":
pid = read_pid(profile, ref.name) pid = read_pid(profile, ref.name)
@@ -206,6 +263,7 @@ def start_service(profile: str, ref: ServiceRef, manifest: dict[str, Any], start
path = log_path(profile, ref.name) path = log_path(profile, ref.name)
path.parent.mkdir(parents=True, exist_ok=True) path.parent.mkdir(parents=True, exist_ok=True)
rotate_log_if_needed(path, int(ref.config.get("log_max_bytes", DEFAULT_LOG_MAX_BYTES)), int(ref.config.get("log_backups", DEFAULT_LOG_BACKUPS)))
with path.open("ab") as log_file: with path.open("ab") as log_file:
log_file.write(f"\n--- start {time.strftime('%Y-%m-%d %H:%M:%S')} ---\n".encode("utf-8")) log_file.write(f"\n--- start {time.strftime('%Y-%m-%d %H:%M:%S')} ---\n".encode("utf-8"))
if kind == "app-launcher": if kind == "app-launcher":
@@ -245,7 +303,7 @@ def stop_service(profile: str, ref: ServiceRef) -> None:
pass pass
except PermissionError: except PermissionError:
os.kill(pid, signal.SIGTERM) os.kill(pid, signal.SIGTERM)
deadline = time.time() + 5 deadline = time.time() + float(ref.config.get("stop_timeout_seconds", DEFAULT_STOP_TIMEOUT_SECONDS))
while time.time() < deadline and is_running(pid): while time.time() < deadline and is_running(pid):
time.sleep(0.2) time.sleep(0.2)
if is_running(pid): if is_running(pid):
@@ -302,16 +360,36 @@ def run_doctor(profile: str, manifest: dict[str, Any]) -> None:
print(f"manifest: {manifest_path(profile)}") print(f"manifest: {manifest_path(profile)}")
ensure_runtime() ensure_runtime()
print(f"runtime: {RUNTIME_DIR}") print(f"runtime: {RUNTIME_DIR}")
errors = validate_manifest(manifest)
if errors:
print("manifest: invalid")
for error in errors:
print(f" ! {error}")
else:
print("manifest: ok")
for ref in service_items(manifest, include_disabled=True): for ref in service_items(manifest, include_disabled=True):
enabled = ref.config.get("enabled", True) enabled = ref.config.get("enabled", True)
command = ref.config.get("command") or [] command = ref.config.get("command") or []
first = command[0] if command else "" first = command[0] if command else ""
command_path = ROOT / first if first and not os.path.isabs(first) else Path(first) if first else None command_ok = command_exists(first) if first else False
command_ok = bool(command_path and (command_path.exists() or shutil_which(first)))
ok, detail = health_ok(ref.config)
enabled_text = "enabled" if enabled else "disabled" enabled_text = "enabled" if enabled else "disabled"
if not enabled:
print(f"- {ref.name}: {enabled_text}; command={'ok' if command_ok else 'missing'}; health skipped")
continue
ok, detail = health_ok(ref.config)
health_text = detail if ok is not None else "no health check" health_text = detail if ok is not None else "no health check"
print(f"- {ref.name}: {enabled_text}; command={'ok' if command_ok else 'missing'}; {health_text}") print(f"- {ref.name}: {enabled_text}; command={'ok' if command_ok else 'missing'}; {health_text}")
doctor = ref.config.get("doctor") or {}
for command_name in doctor.get("required_commands") or []:
print(f" required command {command_name}: {'ok' if command_exists(command_name) else 'missing'}")
for command_name in doctor.get("optional_commands") or []:
print(f" optional command {command_name}: {'ok' if command_exists(command_name) else 'missing'}")
for raw_path in doctor.get("required_paths") or []:
path = resolve_workspace_path(str(raw_path))
print(f" required path {raw_path}: {'ok' if path.exists() else 'missing'}")
for raw_path in doctor.get("optional_paths") or []:
path = resolve_workspace_path(str(raw_path))
print(f" optional path {raw_path}: {'ok' if path.exists() else 'missing'}")
def shutil_which(command: str) -> str | None: def shutil_which(command: str) -> str | None:
@@ -339,6 +417,10 @@ def main() -> None:
run_doctor(args.profile, manifest) run_doctor(args.profile, manifest)
return return
errors = validate_manifest(manifest)
if errors:
raise SystemExit("invalid services manifest:\n" + "\n".join(f"- {error}" for error in errors))
include_disabled = args.action == "status" include_disabled = args.action == "status"
refs = select_services(manifest, args.services, args.group or None, include_disabled=include_disabled) refs = select_services(manifest, args.services, args.group or None, include_disabled=include_disabled)

View File

@@ -75,6 +75,50 @@ class ServiceManagerTests(unittest.TestCase):
self.assertTrue(ok) self.assertTrue(ok)
self.assertIn(f"127.0.0.1:{port}", detail) self.assertIn(f"127.0.0.1:{port}", detail)
def test_validate_manifest_reports_bad_dependencies(self) -> None:
manifest = {
"services": {
"bad": {
"enabled": True,
"kind": "process",
"command": ["python3", "-c", "pass"],
"depends_on": ["missing"],
}
}
}
errors = services.validate_manifest(manifest)
self.assertTrue(any("depends on unknown service" in error for error in errors))
def test_validate_manifest_reports_missing_command_for_enabled_service(self) -> None:
manifest = {"services": {"bad": {"enabled": True, "kind": "process"}}}
errors = services.validate_manifest(manifest)
self.assertTrue(any("non-empty command" in error for error in errors))
def test_command_exists_supports_relative_executable_paths(self) -> None:
with tempfile.TemporaryDirectory() as tmp:
root = Path(tmp)
script = root / "bin" / "tool.sh"
script.parent.mkdir(parents=True)
script.write_text("#!/usr/bin/env bash\n", encoding="utf-8")
script.chmod(0o755)
with patch.object(services, "ROOT", root):
self.assertTrue(services.command_exists("bin/tool.sh"))
def test_rotate_log_if_needed_keeps_backups(self) -> None:
with tempfile.TemporaryDirectory() as tmp:
log = Path(tmp) / "service.log"
log.write_text("old", encoding="utf-8")
services.rotate_log_if_needed(log, max_bytes=1, backups=2)
self.assertFalse(log.exists())
self.assertEqual(log.with_suffix(".log.1").read_text(encoding="utf-8"), "old")
def test_read_pid_ignores_invalid_pid_file(self) -> None: def test_read_pid_ignores_invalid_pid_file(self) -> None:
with tempfile.TemporaryDirectory() as tmp: with tempfile.TemporaryDirectory() as tmp:
pid_dir = Path(tmp) / "pids" pid_dir = Path(tmp) / "pids"