feat: enhance service manager with manifest validation, logging rotation, and command existence checks
This commit is contained in:
@@ -9,6 +9,10 @@
|
|||||||
"command": ["python3", "scripts/mcp/aiw-context-mcp/server.py"],
|
"command": ["python3", "scripts/mcp/aiw-context-mcp/server.py"],
|
||||||
"groups": ["mcp", "context"],
|
"groups": ["mcp", "context"],
|
||||||
"restart": "on-failure",
|
"restart": "on-failure",
|
||||||
|
"doctor": {
|
||||||
|
"required_commands": ["python3"],
|
||||||
|
"required_paths": ["scripts/mcp/aiw-context-mcp/server.py"]
|
||||||
|
},
|
||||||
"health": {
|
"health": {
|
||||||
"type": "http",
|
"type": "http",
|
||||||
"url": "http://127.0.0.1:8765/health"
|
"url": "http://127.0.0.1:8765/health"
|
||||||
@@ -21,6 +25,10 @@
|
|||||||
"command": ["scripts/mattermost-proxy/run-mirror.sh"],
|
"command": ["scripts/mattermost-proxy/run-mirror.sh"],
|
||||||
"groups": ["communication", "mattermost", "capture"],
|
"groups": ["communication", "mattermost", "capture"],
|
||||||
"restart": "on-failure",
|
"restart": "on-failure",
|
||||||
|
"doctor": {
|
||||||
|
"required_commands": ["mitmdump"],
|
||||||
|
"optional_paths": ["scripts/mattermost-proxy/.env"]
|
||||||
|
},
|
||||||
"health": {
|
"health": {
|
||||||
"type": "tcp",
|
"type": "tcp",
|
||||||
"host": "127.0.0.1",
|
"host": "127.0.0.1",
|
||||||
@@ -34,7 +42,11 @@
|
|||||||
"command": ["scripts/mattermost-proxy/launch-mattermost.sh"],
|
"command": ["scripts/mattermost-proxy/launch-mattermost.sh"],
|
||||||
"groups": ["communication", "mattermost"],
|
"groups": ["communication", "mattermost"],
|
||||||
"depends_on": ["mattermost-proxy"],
|
"depends_on": ["mattermost-proxy"],
|
||||||
"restart": "never"
|
"restart": "never",
|
||||||
|
"doctor": {
|
||||||
|
"required_paths": ["/Applications/Mattermost.app"],
|
||||||
|
"optional_paths": ["scripts/mattermost-proxy/.env"]
|
||||||
|
}
|
||||||
},
|
},
|
||||||
"photo-inbox": {
|
"photo-inbox": {
|
||||||
"enabled": true,
|
"enabled": true,
|
||||||
@@ -43,6 +55,11 @@
|
|||||||
"command": ["scripts/iphone-photo-inbox/run.sh"],
|
"command": ["scripts/iphone-photo-inbox/run.sh"],
|
||||||
"groups": ["inbox", "photos", "capture"],
|
"groups": ["inbox", "photos", "capture"],
|
||||||
"restart": "on-failure",
|
"restart": "on-failure",
|
||||||
|
"doctor": {
|
||||||
|
"required_commands": ["python3"],
|
||||||
|
"optional_commands": ["swiftc"],
|
||||||
|
"optional_paths": ["scripts/iphone-photo-inbox/.env"]
|
||||||
|
},
|
||||||
"health": {
|
"health": {
|
||||||
"type": "http",
|
"type": "http",
|
||||||
"url": "http://127.0.0.1:8787/health"
|
"url": "http://127.0.0.1:8787/health"
|
||||||
|
|||||||
@@ -30,6 +30,17 @@ python3 scripts/aiw/services.py start --profile fidelity --group inbox
|
|||||||
|
|
||||||
The service manager unifies startup and status. It does not move capture behavior into the MCP.
|
The service manager unifies startup and status. It does not move capture behavior into the MCP.
|
||||||
|
|
||||||
|
## Robustness features
|
||||||
|
|
||||||
|
- Manifest validation before lifecycle actions.
|
||||||
|
- Dependency-aware startup through `depends_on`.
|
||||||
|
- Managed PID/state files under `.aiw/runtime/`.
|
||||||
|
- Per-service logs under `.aiw/runtime/logs/`.
|
||||||
|
- Simple log rotation before service start.
|
||||||
|
- TCP/HTTP health checks.
|
||||||
|
- Doctor checks for required commands and paths declared in the profile manifest.
|
||||||
|
- Protection against starting duplicate services when a matching health check is already passing externally.
|
||||||
|
|
||||||
## Tests
|
## Tests
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
|
|||||||
@@ -28,6 +28,9 @@ RUNTIME_DIR = ROOT / ".aiw" / "runtime"
|
|||||||
PID_DIR = RUNTIME_DIR / "pids"
|
PID_DIR = RUNTIME_DIR / "pids"
|
||||||
LOG_DIR = RUNTIME_DIR / "logs"
|
LOG_DIR = RUNTIME_DIR / "logs"
|
||||||
STATE_DIR = RUNTIME_DIR / "state"
|
STATE_DIR = RUNTIME_DIR / "state"
|
||||||
|
DEFAULT_LOG_MAX_BYTES = 5 * 1024 * 1024
|
||||||
|
DEFAULT_LOG_BACKUPS = 3
|
||||||
|
DEFAULT_STOP_TIMEOUT_SECONDS = 5.0
|
||||||
|
|
||||||
|
|
||||||
@dataclass(frozen=True)
|
@dataclass(frozen=True)
|
||||||
@@ -52,6 +55,30 @@ def load_manifest(profile: str) -> dict[str, Any]:
|
|||||||
return json.loads(path.read_text(encoding="utf-8"))
|
return json.loads(path.read_text(encoding="utf-8"))
|
||||||
|
|
||||||
|
|
||||||
|
def validate_manifest(manifest: dict[str, Any]) -> list[str]:
|
||||||
|
errors: list[str] = []
|
||||||
|
services = manifest.get("services")
|
||||||
|
if not isinstance(services, dict):
|
||||||
|
return ["manifest must contain a services object"]
|
||||||
|
for name, config in services.items():
|
||||||
|
if not isinstance(config, dict):
|
||||||
|
errors.append(f"{name}: service config must be an object")
|
||||||
|
continue
|
||||||
|
command = config.get("command")
|
||||||
|
if config.get("enabled", True) and (not isinstance(command, list) or not command):
|
||||||
|
errors.append(f"{name}: enabled services require a non-empty command list")
|
||||||
|
kind = config.get("kind", "process")
|
||||||
|
if kind not in {"process", "app-launcher", "mcp"}:
|
||||||
|
errors.append(f"{name}: unsupported kind {kind!r}")
|
||||||
|
restart = config.get("restart", "never")
|
||||||
|
if restart not in {"never", "on-failure", "always"}:
|
||||||
|
errors.append(f"{name}: unsupported restart policy {restart!r}")
|
||||||
|
for dependency in config.get("depends_on") or []:
|
||||||
|
if dependency not in services:
|
||||||
|
errors.append(f"{name}: depends on unknown service {dependency!r}")
|
||||||
|
return errors
|
||||||
|
|
||||||
|
|
||||||
def service_items(manifest: dict[str, Any], include_disabled: bool = False) -> list[ServiceRef]:
|
def service_items(manifest: dict[str, Any], include_disabled: bool = False) -> list[ServiceRef]:
|
||||||
services = manifest.get("services") or {}
|
services = manifest.get("services") or {}
|
||||||
refs: list[ServiceRef] = []
|
refs: list[ServiceRef] = []
|
||||||
@@ -94,6 +121,34 @@ def log_path(profile: str, service: str) -> Path:
|
|||||||
return LOG_DIR / profile / f"{service}.log"
|
return LOG_DIR / profile / f"{service}.log"
|
||||||
|
|
||||||
|
|
||||||
|
def resolve_workspace_path(raw: str) -> Path:
|
||||||
|
path = Path(raw).expanduser()
|
||||||
|
return path if path.is_absolute() else ROOT / path
|
||||||
|
|
||||||
|
|
||||||
|
def command_exists(command: str) -> bool:
|
||||||
|
if not command:
|
||||||
|
return False
|
||||||
|
path = Path(command)
|
||||||
|
if path.is_absolute() or "/" in command:
|
||||||
|
resolved = resolve_workspace_path(command)
|
||||||
|
return resolved.exists() and os.access(resolved, os.X_OK)
|
||||||
|
return shutil_which(command) is not None
|
||||||
|
|
||||||
|
|
||||||
|
def rotate_log_if_needed(path: Path, max_bytes: int = DEFAULT_LOG_MAX_BYTES, backups: int = DEFAULT_LOG_BACKUPS) -> None:
|
||||||
|
if max_bytes <= 0 or backups <= 0 or not path.exists() or path.stat().st_size < max_bytes:
|
||||||
|
return
|
||||||
|
oldest = path.with_suffix(path.suffix + f".{backups}")
|
||||||
|
oldest.unlink(missing_ok=True)
|
||||||
|
for index in range(backups - 1, 0, -1):
|
||||||
|
src = path.with_suffix(path.suffix + f".{index}")
|
||||||
|
dst = path.with_suffix(path.suffix + f".{index + 1}")
|
||||||
|
if src.exists():
|
||||||
|
src.replace(dst)
|
||||||
|
path.replace(path.with_suffix(path.suffix + ".1"))
|
||||||
|
|
||||||
|
|
||||||
def read_pid(profile: str, service: str) -> int | None:
|
def read_pid(profile: str, service: str) -> int | None:
|
||||||
path = pid_path(profile, service)
|
path = pid_path(profile, service)
|
||||||
if not path.is_file():
|
if not path.is_file():
|
||||||
@@ -189,6 +244,8 @@ def start_service(profile: str, ref: ServiceRef, manifest: dict[str, Any], start
|
|||||||
command = ref.config.get("command") or []
|
command = ref.config.get("command") or []
|
||||||
if not command:
|
if not command:
|
||||||
raise SystemExit(f"{ref.name} has no command")
|
raise SystemExit(f"{ref.name} has no command")
|
||||||
|
if not command_exists(str(command[0])):
|
||||||
|
raise SystemExit(f"{ref.name} command is not executable or not found: {command[0]}")
|
||||||
|
|
||||||
if kind != "app-launcher":
|
if kind != "app-launcher":
|
||||||
pid = read_pid(profile, ref.name)
|
pid = read_pid(profile, ref.name)
|
||||||
@@ -206,6 +263,7 @@ def start_service(profile: str, ref: ServiceRef, manifest: dict[str, Any], start
|
|||||||
|
|
||||||
path = log_path(profile, ref.name)
|
path = log_path(profile, ref.name)
|
||||||
path.parent.mkdir(parents=True, exist_ok=True)
|
path.parent.mkdir(parents=True, exist_ok=True)
|
||||||
|
rotate_log_if_needed(path, int(ref.config.get("log_max_bytes", DEFAULT_LOG_MAX_BYTES)), int(ref.config.get("log_backups", DEFAULT_LOG_BACKUPS)))
|
||||||
with path.open("ab") as log_file:
|
with path.open("ab") as log_file:
|
||||||
log_file.write(f"\n--- start {time.strftime('%Y-%m-%d %H:%M:%S')} ---\n".encode("utf-8"))
|
log_file.write(f"\n--- start {time.strftime('%Y-%m-%d %H:%M:%S')} ---\n".encode("utf-8"))
|
||||||
if kind == "app-launcher":
|
if kind == "app-launcher":
|
||||||
@@ -245,7 +303,7 @@ def stop_service(profile: str, ref: ServiceRef) -> None:
|
|||||||
pass
|
pass
|
||||||
except PermissionError:
|
except PermissionError:
|
||||||
os.kill(pid, signal.SIGTERM)
|
os.kill(pid, signal.SIGTERM)
|
||||||
deadline = time.time() + 5
|
deadline = time.time() + float(ref.config.get("stop_timeout_seconds", DEFAULT_STOP_TIMEOUT_SECONDS))
|
||||||
while time.time() < deadline and is_running(pid):
|
while time.time() < deadline and is_running(pid):
|
||||||
time.sleep(0.2)
|
time.sleep(0.2)
|
||||||
if is_running(pid):
|
if is_running(pid):
|
||||||
@@ -302,16 +360,36 @@ def run_doctor(profile: str, manifest: dict[str, Any]) -> None:
|
|||||||
print(f"manifest: {manifest_path(profile)}")
|
print(f"manifest: {manifest_path(profile)}")
|
||||||
ensure_runtime()
|
ensure_runtime()
|
||||||
print(f"runtime: {RUNTIME_DIR}")
|
print(f"runtime: {RUNTIME_DIR}")
|
||||||
|
errors = validate_manifest(manifest)
|
||||||
|
if errors:
|
||||||
|
print("manifest: invalid")
|
||||||
|
for error in errors:
|
||||||
|
print(f" ! {error}")
|
||||||
|
else:
|
||||||
|
print("manifest: ok")
|
||||||
for ref in service_items(manifest, include_disabled=True):
|
for ref in service_items(manifest, include_disabled=True):
|
||||||
enabled = ref.config.get("enabled", True)
|
enabled = ref.config.get("enabled", True)
|
||||||
command = ref.config.get("command") or []
|
command = ref.config.get("command") or []
|
||||||
first = command[0] if command else ""
|
first = command[0] if command else ""
|
||||||
command_path = ROOT / first if first and not os.path.isabs(first) else Path(first) if first else None
|
command_ok = command_exists(first) if first else False
|
||||||
command_ok = bool(command_path and (command_path.exists() or shutil_which(first)))
|
|
||||||
ok, detail = health_ok(ref.config)
|
|
||||||
enabled_text = "enabled" if enabled else "disabled"
|
enabled_text = "enabled" if enabled else "disabled"
|
||||||
|
if not enabled:
|
||||||
|
print(f"- {ref.name}: {enabled_text}; command={'ok' if command_ok else 'missing'}; health skipped")
|
||||||
|
continue
|
||||||
|
ok, detail = health_ok(ref.config)
|
||||||
health_text = detail if ok is not None else "no health check"
|
health_text = detail if ok is not None else "no health check"
|
||||||
print(f"- {ref.name}: {enabled_text}; command={'ok' if command_ok else 'missing'}; {health_text}")
|
print(f"- {ref.name}: {enabled_text}; command={'ok' if command_ok else 'missing'}; {health_text}")
|
||||||
|
doctor = ref.config.get("doctor") or {}
|
||||||
|
for command_name in doctor.get("required_commands") or []:
|
||||||
|
print(f" required command {command_name}: {'ok' if command_exists(command_name) else 'missing'}")
|
||||||
|
for command_name in doctor.get("optional_commands") or []:
|
||||||
|
print(f" optional command {command_name}: {'ok' if command_exists(command_name) else 'missing'}")
|
||||||
|
for raw_path in doctor.get("required_paths") or []:
|
||||||
|
path = resolve_workspace_path(str(raw_path))
|
||||||
|
print(f" required path {raw_path}: {'ok' if path.exists() else 'missing'}")
|
||||||
|
for raw_path in doctor.get("optional_paths") or []:
|
||||||
|
path = resolve_workspace_path(str(raw_path))
|
||||||
|
print(f" optional path {raw_path}: {'ok' if path.exists() else 'missing'}")
|
||||||
|
|
||||||
|
|
||||||
def shutil_which(command: str) -> str | None:
|
def shutil_which(command: str) -> str | None:
|
||||||
@@ -339,6 +417,10 @@ def main() -> None:
|
|||||||
run_doctor(args.profile, manifest)
|
run_doctor(args.profile, manifest)
|
||||||
return
|
return
|
||||||
|
|
||||||
|
errors = validate_manifest(manifest)
|
||||||
|
if errors:
|
||||||
|
raise SystemExit("invalid services manifest:\n" + "\n".join(f"- {error}" for error in errors))
|
||||||
|
|
||||||
include_disabled = args.action == "status"
|
include_disabled = args.action == "status"
|
||||||
refs = select_services(manifest, args.services, args.group or None, include_disabled=include_disabled)
|
refs = select_services(manifest, args.services, args.group or None, include_disabled=include_disabled)
|
||||||
|
|
||||||
|
|||||||
@@ -75,6 +75,50 @@ class ServiceManagerTests(unittest.TestCase):
|
|||||||
self.assertTrue(ok)
|
self.assertTrue(ok)
|
||||||
self.assertIn(f"127.0.0.1:{port}", detail)
|
self.assertIn(f"127.0.0.1:{port}", detail)
|
||||||
|
|
||||||
|
def test_validate_manifest_reports_bad_dependencies(self) -> None:
|
||||||
|
manifest = {
|
||||||
|
"services": {
|
||||||
|
"bad": {
|
||||||
|
"enabled": True,
|
||||||
|
"kind": "process",
|
||||||
|
"command": ["python3", "-c", "pass"],
|
||||||
|
"depends_on": ["missing"],
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
errors = services.validate_manifest(manifest)
|
||||||
|
|
||||||
|
self.assertTrue(any("depends on unknown service" in error for error in errors))
|
||||||
|
|
||||||
|
def test_validate_manifest_reports_missing_command_for_enabled_service(self) -> None:
|
||||||
|
manifest = {"services": {"bad": {"enabled": True, "kind": "process"}}}
|
||||||
|
|
||||||
|
errors = services.validate_manifest(manifest)
|
||||||
|
|
||||||
|
self.assertTrue(any("non-empty command" in error for error in errors))
|
||||||
|
|
||||||
|
def test_command_exists_supports_relative_executable_paths(self) -> None:
|
||||||
|
with tempfile.TemporaryDirectory() as tmp:
|
||||||
|
root = Path(tmp)
|
||||||
|
script = root / "bin" / "tool.sh"
|
||||||
|
script.parent.mkdir(parents=True)
|
||||||
|
script.write_text("#!/usr/bin/env bash\n", encoding="utf-8")
|
||||||
|
script.chmod(0o755)
|
||||||
|
|
||||||
|
with patch.object(services, "ROOT", root):
|
||||||
|
self.assertTrue(services.command_exists("bin/tool.sh"))
|
||||||
|
|
||||||
|
def test_rotate_log_if_needed_keeps_backups(self) -> None:
|
||||||
|
with tempfile.TemporaryDirectory() as tmp:
|
||||||
|
log = Path(tmp) / "service.log"
|
||||||
|
log.write_text("old", encoding="utf-8")
|
||||||
|
|
||||||
|
services.rotate_log_if_needed(log, max_bytes=1, backups=2)
|
||||||
|
|
||||||
|
self.assertFalse(log.exists())
|
||||||
|
self.assertEqual(log.with_suffix(".log.1").read_text(encoding="utf-8"), "old")
|
||||||
|
|
||||||
def test_read_pid_ignores_invalid_pid_file(self) -> None:
|
def test_read_pid_ignores_invalid_pid_file(self) -> None:
|
||||||
with tempfile.TemporaryDirectory() as tmp:
|
with tempfile.TemporaryDirectory() as tmp:
|
||||||
pid_dir = Path(tmp) / "pids"
|
pid_dir = Path(tmp) / "pids"
|
||||||
|
|||||||
Reference in New Issue
Block a user