feat: implement AI Workspace service manager with lifecycle control for local services

This commit is contained in:
2026-05-20 14:43:52 -06:00
parent eb11bb9442
commit 1121433db8
8 changed files with 627 additions and 0 deletions

37
scripts/aiw/README.md Normal file
View File

@@ -0,0 +1,37 @@
# AI Workspace Service Manager
The service manager is the local lifecycle layer for AI Workspace services.
It reads `profiles/<profile>/services.json`, starts/stops enabled services, records logs under `.aiw/runtime/logs/<profile>/`, and keeps PID and state files under `.aiw/runtime/pids/<profile>/` and `.aiw/runtime/state/<profile>/`.
## Common commands
```bash
python3 scripts/aiw/services.py status --profile fidelity
python3 scripts/aiw/services.py doctor --profile fidelity
python3 scripts/aiw/services.py start --profile fidelity
python3 scripts/aiw/services.py stop --profile fidelity
python3 scripts/aiw/services.py logs mattermost-proxy --profile fidelity
```
Start a subset by group:
```bash
python3 scripts/aiw/services.py start --profile fidelity --group communication
python3 scripts/aiw/services.py start --profile fidelity --group inbox
```
## Current Fidelity services
- `mattermost-proxy`: runs the local Mattermost proxy mirror.
- `mattermost-desktop`: launches Mattermost Desktop through the proxy.
- `photo-inbox`: runs the local HTTP photo receiver.
- `aiw-context-mcp`: reserved placeholder for the future read-only context MCP server.
The service manager unifies startup and status. It does not move capture behavior into the MCP.
## Tests
```bash
python3 scripts/aiw/test_services.py
```

370
scripts/aiw/services.py Normal file
View File

@@ -0,0 +1,370 @@
#!/usr/bin/env python3
"""AI Workspace local service manager.
This is the profile-aware lifecycle layer for local capture/query services. It is
intentionally small and dependency-free so it can run before the future desktop
UI or MCP server exists.
"""
from __future__ import annotations
import argparse
import json
import os
import signal
import socket
import subprocess
import sys
import time
import urllib.error
import urllib.request
from dataclasses import dataclass
from pathlib import Path
from typing import Any
# Workspace root: the directory two levels above this file's own directory.
ROOT = Path(__file__).resolve().parents[2]
# Everything the manager writes lives under <root>/.aiw/runtime, split by purpose.
RUNTIME_DIR = ROOT.joinpath(".aiw", "runtime")
PID_DIR = RUNTIME_DIR.joinpath("pids")
LOG_DIR = RUNTIME_DIR.joinpath("logs")
STATE_DIR = RUNTIME_DIR.joinpath("state")
@dataclass(frozen=True)
class ServiceRef:
    """Immutable pairing of a service name with its manifest configuration."""

    # Key of the entry under "services" in the manifest.
    name: str
    # The entry's settings exactly as loaded from the manifest.
    config: dict[str, Any]
def ensure_runtime() -> None:
    """Create the PID, log, and state directories (and parents) if missing."""
    for directory in (PID_DIR, LOG_DIR, STATE_DIR):
        directory.mkdir(parents=True, exist_ok=True)
def manifest_path(profile: str) -> Path:
    """Return the path of ``services.json`` for *profile* under the workspace root."""
    return ROOT.joinpath("profiles", profile, "services.json")
def load_manifest(profile: str) -> dict[str, Any]:
    """Read and parse the services manifest for *profile*.

    Args:
        profile: Profile directory name under ``profiles/``.

    Returns:
        The parsed manifest object.

    Raises:
        SystemExit: If the manifest file does not exist, cannot be decoded as
            UTF-8 JSON, or its top-level value is not a JSON object.
    """
    path = manifest_path(profile)
    if not path.is_file():
        raise SystemExit(f"services manifest not found: {path}")
    try:
        manifest = json.loads(path.read_text(encoding="utf-8"))
    except (UnicodeDecodeError, json.JSONDecodeError) as error:
        # A malformed manifest is a user-facing configuration error; report it
        # the same way as a missing file instead of dumping a traceback.
        raise SystemExit(f"services manifest is not valid JSON: {path}: {error}") from error
    if not isinstance(manifest, dict):
        # Every caller uses manifest.get(...), so anything but an object is unusable.
        raise SystemExit(f"services manifest must be a JSON object: {path}")
    return manifest
def service_items(manifest: dict[str, Any], include_disabled: bool = False) -> list[ServiceRef]:
    """List the manifest's services as ``ServiceRef`` objects, in manifest order.

    Entries whose ``"enabled"`` value is falsy are dropped unless
    *include_disabled* is set; a missing ``"enabled"`` key counts as enabled.
    """
    entries = manifest.get("services") or {}
    return [
        ServiceRef(name, config)
        for name, config in entries.items()
        if include_disabled or config.get("enabled", True)
    ]
def select_services(manifest: dict[str, Any], names: list[str], group: str | None, include_disabled: bool = False) -> list[ServiceRef]:
    """Pick the services an action should operate on.

    Explicit *names* win: they are returned in the order given and *group* is
    ignored. Otherwise *group* filters on each service's ``"groups"`` list, and
    with neither filter every candidate is returned.

    Raises:
        SystemExit: If any requested name is not among the candidates.
    """
    candidates = service_items(manifest, include_disabled=include_disabled)
    if names:
        lookup = {ref.name: ref for ref in candidates}
        unknown = [name for name in names if name not in lookup]
        if unknown:
            raise SystemExit("unknown or disabled service(s): " + ", ".join(unknown))
        return [lookup[name] for name in names]
    if not group:
        return candidates
    return [ref for ref in candidates if group in (ref.config.get("groups") or [])]
def pid_path(profile: str, service: str) -> Path:
    """Return the PID file location for *service* within *profile*."""
    return PID_DIR.joinpath(profile, f"{service}.pid")
def state_path(profile: str, service: str) -> Path:
    """Return the JSON state file location for *service* within *profile*."""
    return STATE_DIR.joinpath(profile, f"{service}.json")
def log_path(profile: str, service: str) -> Path:
    """Return the log file location for *service* within *profile*."""
    return LOG_DIR.joinpath(profile, f"{service}.log")
def read_pid(profile: str, service: str) -> int | None:
    """Return the PID recorded for *service*, or ``None`` if absent or unparsable."""
    pid_file = pid_path(profile, service)
    if pid_file.is_file():
        try:
            recorded = pid_file.read_text(encoding="utf-8")
            return int(recorded.strip())
        except ValueError:
            # Covers both non-numeric content and undecodable bytes.
            return None
    return None
def is_running(pid: int | None) -> bool:
    """Report whether *pid* names a live, non-zombie process.

    Missing, zero, and negative PIDs are never running. A process we are not
    permitted to signal is treated as running. When ``ps`` cannot be executed
    the signal probe alone decides.
    """
    if pid is None or pid <= 0:
        return False
    try:
        # Signal 0 delivers nothing; it only checks existence and permission.
        os.kill(pid, 0)
    except ProcessLookupError:
        return False
    except PermissionError:
        return True
    try:
        probe = subprocess.run(["ps", "-o", "stat=", "-p", str(pid)], check=False, capture_output=True, text=True)
    except OSError:
        # ps itself could not be run; fall back on the successful signal probe.
        return True
    if probe.returncode != 0:
        return False
    # A process state beginning with "Z" is a zombie: exited but not yet reaped.
    return not probe.stdout.strip().startswith("Z")
def write_state(profile: str, service: str, state: dict[str, Any]) -> None:
    """Replace the service's state file with *state* as sorted, indented JSON."""
    target = state_path(profile, service)
    target.parent.mkdir(parents=True, exist_ok=True)
    payload = json.dumps(state, ensure_ascii=False, indent=2, sort_keys=True)
    target.write_text(payload + "\n", encoding="utf-8")
def read_state(profile: str, service: str) -> dict[str, Any]:
    """Return the service's saved state, or ``{}`` when missing or not valid JSON."""
    source = state_path(profile, service)
    if source.is_file():
        try:
            return json.loads(source.read_text(encoding="utf-8"))
        except json.JSONDecodeError:
            pass
    return {}
def health_ok(config: dict[str, Any], timeout: float = 1.0) -> tuple[bool | None, str]:
    """Run a service's configured health check once.

    Args:
        config: Service configuration. Its optional ``"health"`` mapping picks
            the probe through ``"type"``: ``"tcp"`` (``host``/``port``) or
            ``"http"`` (``url``).
        timeout: Seconds allowed for the connection attempt or HTTP request.

    Returns:
        ``(status, detail)``. *status* is ``True`` when the probe passed,
        ``False`` when it failed (including when the check itself is
        misconfigured), and ``None`` when no check is configured or the type is
        not recognised. *detail* is a human-readable summary.
    """
    import http.client  # local import: only needed to name HTTPException below

    health = config.get("health") or {}
    kind = health.get("type")
    if not kind:
        return None, "no health check"
    if kind == "tcp":
        host = str(health.get("host") or "127.0.0.1")
        raw_port = health.get("port") or 0
        try:
            port = int(raw_port)
        except (TypeError, ValueError):
            # A non-numeric port is a manifest mistake, not a reason to crash.
            return False, f"tcp {host}:{raw_port} failed: invalid port"
        try:
            with socket.create_connection((host, port), timeout=timeout):
                return True, f"tcp {host}:{port} ok"
        except (OSError, OverflowError) as error:
            # OverflowError is what the socket layer raises for a port outside 0-65535.
            return False, f"tcp {host}:{port} failed: {error}"
    if kind == "http":
        url = str(health.get("url") or "")
        try:
            with urllib.request.urlopen(url, timeout=timeout) as response:
                ok = 200 <= int(response.status) < 400
                return ok, f"http {url} status {response.status}"
        except (urllib.error.URLError, TimeoutError, OSError, ValueError, http.client.HTTPException) as error:
            # ValueError: empty or scheme-less URL. HTTPException: the peer
            # answered with something that is not valid HTTP (for example a
            # service that is still starting). Both mean "not healthy".
            return False, f"http {url} failed: {error}"
    return None, f"unknown health type: {kind}"
def wait_for_health(config: dict[str, Any], seconds: float = 8.0) -> tuple[bool | None, str]:
    """Poll the health check until it stops failing or *seconds* elapse.

    The check is always probed at least once. Polling ends as soon as a probe
    returns ``True`` or ``None`` (nothing to wait for); otherwise the last
    failing result is returned once the time budget is spent.

    Args:
        config: Service configuration passed through to ``health_ok``.
        seconds: Maximum time to keep polling.

    Returns:
        The ``(status, detail)`` tuple of the final probe.
    """
    # Monotonic clock: the deadline must not move when the wall clock is adjusted.
    deadline = time.monotonic() + seconds
    while True:
        last = health_ok(config)
        if last[0] is not False:
            return last
        remaining = deadline - time.monotonic()
        if remaining <= 0:
            return last
        time.sleep(min(0.4, remaining))
def start_service(
    profile: str,
    ref: ServiceRef,
    manifest: dict[str, Any],
    started: set[str],
    _chain: tuple[str, ...] = (),
) -> None:
    """Start *ref*, first starting every service named in its ``depends_on``.

    A managed process (any kind other than ``"app-launcher"``) is left alone
    when its recorded PID is alive or its health check already passes.
    Otherwise the command is spawned in its own session with output appended
    to the service log, the PID is recorded, and the health check is polled.
    An ``"app-launcher"`` command is run to completion and its exit code is
    saved to the state file.

    Args:
        profile: Profile whose runtime files are used.
        ref: Service to start.
        manifest: Full manifest, used to resolve dependencies.
        started: Names already handled in this invocation; updated in place.
        _chain: Internal. Names currently being started further up the
            recursion, used to detect dependency cycles.

    Raises:
        SystemExit: If a dependency is missing or disabled, the dependencies
            form a cycle, the service has no command, or the command cannot
            be executed.
    """
    if ref.name in started:
        return
    if ref.name in _chain:
        # Without this guard a depends_on loop recurses until RecursionError.
        loop = " -> ".join((*_chain[_chain.index(ref.name):], ref.name))
        raise SystemExit(f"dependency cycle detected: {loop}")
    chain = (*_chain, ref.name)

    all_services = manifest.get("services") or {}
    for dependency in ref.config.get("depends_on") or []:
        dep_config = all_services.get(dependency)
        if not dep_config or not dep_config.get("enabled", True):
            raise SystemExit(f"{ref.name} depends on missing/disabled service: {dependency}")
        start_service(profile, ServiceRef(dependency, dep_config), manifest, started, chain)

    kind = ref.config.get("kind", "process")
    command = ref.config.get("command") or []
    if not command:
        raise SystemExit(f"{ref.name} has no command")

    # NOTE(review): both "already running" short-circuits are applied to managed
    # processes only; launchers always run. Confirm this matches the intended
    # nesting of the original code.
    if kind != "app-launcher":
        pid = read_pid(profile, ref.name)
        if is_running(pid):
            ok, detail = health_ok(ref.config)
            status = "running" if ok is not False else "running unhealthy"
            print(f"{ref.name}: {status} ({detail})")
            started.add(ref.name)
            return
        ok, detail = health_ok(ref.config)
        if ok is True:
            print(f"{ref.name}: externally running ({detail}); not starting duplicate")
            started.add(ref.name)
            return

    path = log_path(profile, ref.name)
    path.parent.mkdir(parents=True, exist_ok=True)
    with path.open("ab") as log_file:
        log_file.write(f"\n--- start {time.strftime('%Y-%m-%d %H:%M:%S')} ---\n".encode("utf-8"))
        # The child writes straight to the file descriptor, so push the marker
        # out of our buffer first or it could land after the child's output.
        log_file.flush()
        if kind == "app-launcher":
            try:
                result = subprocess.run(command, cwd=ROOT, stdout=log_file, stderr=subprocess.STDOUT, check=False)
            except OSError as error:
                raise SystemExit(f"{ref.name}: failed to start: {error}") from error
            write_state(profile, ref.name, {"last_launch_exit": result.returncode, "launched_at": time.time()})
            print(f"{ref.name}: launched (exit {result.returncode})")
        else:
            try:
                # start_new_session gives the child its own process group.
                process = subprocess.Popen(command, cwd=ROOT, stdout=log_file, stderr=subprocess.STDOUT, start_new_session=True)
            except OSError as error:
                raise SystemExit(f"{ref.name}: failed to start: {error}") from error
            pid_file = pid_path(profile, ref.name)
            pid_file.parent.mkdir(parents=True, exist_ok=True)
            pid_file.write_text(str(process.pid) + "\n", encoding="utf-8")
            ok, detail = wait_for_health(ref.config)
            state = "started" if ok is not False else "started but health check failed"
            write_state(profile, ref.name, {"pid": process.pid, "started_at": time.time(), "health": detail})
            print(f"{ref.name}: {state} pid={process.pid} ({detail})")
    started.add(ref.name)
def _send_signal(pid: int, signum: int) -> None:
    """Signal the process group *pid*, falling back to the single process *pid*.

    A target that has already exited is not an error.
    """
    try:
        os.killpg(pid, signum)
        return
    except (ProcessLookupError, PermissionError):
        # No such group, or not allowed to signal it: try the process itself.
        pass
    try:
        os.kill(pid, signum)
    except ProcessLookupError:
        pass  # exited in the meantime, which is the outcome we wanted


def stop_service(profile: str, ref: ServiceRef) -> None:
    """Stop the managed process recorded for *ref*, if there is one.

    Launcher services have nothing to stop. When no recorded process is alive,
    the function reports whether the health check still passes (something
    outside the manager is serving) or removes the stale PID file. Otherwise
    it sends SIGTERM, waits up to five seconds, and escalates to SIGKILL if
    the process is still alive; the PID file is then removed and the state
    file is replaced with a ``stopped_at`` timestamp.

    Args:
        profile: Profile whose runtime files are used.
        ref: Service to stop.
    """
    kind = ref.config.get("kind", "process")
    if kind == "app-launcher":
        print(f"{ref.name}: launcher service has no managed process")
        return

    pid = read_pid(profile, ref.name)
    if pid is None or not is_running(pid):
        ok, detail = health_ok(ref.config)
        if ok is True:
            print(f"{ref.name}: externally running ({detail}); no managed pid to stop")
            return
        print(f"{ref.name}: not running")
        pid_path(profile, ref.name).unlink(missing_ok=True)
        return

    _send_signal(pid, signal.SIGTERM)
    # Monotonic clock so a wall-clock adjustment cannot stretch or cut the grace period.
    deadline = time.monotonic() + 5
    while time.monotonic() < deadline and is_running(pid):
        time.sleep(0.2)
    if is_running(pid):
        _send_signal(pid, signal.SIGKILL)
        print(f"{ref.name}: killed pid={pid}")
    else:
        print(f"{ref.name}: stopped")
    pid_path(profile, ref.name).unlink(missing_ok=True)
    write_state(profile, ref.name, {"stopped_at": time.time()})
def status_service(profile: str, ref: ServiceRef) -> None:
    """Print a one-line status for *ref*.

    Disabled services and launchers get their own fixed formats. A managed
    process is labelled from the combination of PID liveness and health check.
    """
    name = ref.name
    settings = ref.config
    if not settings.get("enabled", True):
        print(f"{name}: disabled")
        return

    if settings.get("kind", "process") == "app-launcher":
        launched = read_state(profile, name).get("launched_at")
        if launched:
            suffix = f"last launched {time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(launched))}"
        else:
            suffix = "not launched by manager"
        print(f"{name}: launcher ({suffix})")
        return

    pid = read_pid(profile, name)
    running = is_running(pid)
    ok, detail = health_ok(settings)
    if running:
        # Alive: only an explicit health failure downgrades the label.
        label = "unhealthy" if ok is False else "running"
    else:
        # Not ours, but a passing check means something else is serving.
        label = "externally running" if ok is True else "stopped"
    print(f"{name}: {label} pid={pid or '-'} ({detail})")
def tail_log(profile: str, service: str, lines: int) -> None:
    """Print the last *lines* lines of the service's log file.

    Args:
        profile: Profile whose log directory is read.
        service: Service name.
        lines: Number of trailing lines to print; zero or less prints nothing.
    """
    path = log_path(profile, service)
    if not path.is_file():
        print(f"no log file: {path}")
        return
    if lines <= 0:
        # content[-0:] is the whole list, so "--lines 0" used to dump the
        # entire file; a non-positive count now means "no lines".
        return
    content = path.read_text(encoding="utf-8", errors="replace").splitlines()
    for line in content[-lines:]:
        print(line)
def run_doctor(profile: str, manifest: dict[str, Any]) -> None:
    """Print a diagnostic report for every service, disabled ones included.

    For each service the report shows whether it is enabled, whether the first
    element of its command exists (as an absolute path, relative to the
    workspace root, or on ``PATH``), and the detail of its health probe.

    Args:
        profile: Profile being inspected.
        manifest: Parsed manifest for that profile.
    """
    print(f"AI Workspace doctor profile={profile}")
    print(f"workspace: {ROOT}")
    print(f"manifest: {manifest_path(profile)}")
    ensure_runtime()
    print(f"runtime: {RUNTIME_DIR}")
    for ref in service_items(manifest, include_disabled=True):
        enabled_text = "enabled" if ref.config.get("enabled", True) else "disabled"
        command = ref.config.get("command") or []
        first = command[0] if command else ""
        if first:
            candidate = Path(first) if os.path.isabs(first) else ROOT / first
            command_ok = bool(candidate.exists() or shutil_which(first))
        else:
            command_ok = False
        # Print the probe detail unchanged: it already reads "no health check"
        # when none is configured, and an "unknown health type" message is a
        # manifest problem the doctor should surface rather than hide.
        _status, health_text = health_ok(ref.config)
        print(f"- {ref.name}: {enabled_text}; command={'ok' if command_ok else 'missing'}; {health_text}")
def shutil_which(command: str) -> str | None:
    """Find *command* in the directories listed in ``PATH``.

    Args:
        command: Bare command name to look up.

    Returns:
        The path of the first matching regular file that is executable, or
        ``None`` when there is no match or *command* is empty.
    """
    if not command:
        # Path(directory) / "" is the directory itself, which must not match.
        return None
    for directory in os.environ.get("PATH", "").split(os.pathsep):
        candidate = Path(directory) / command
        # is_file(), not exists(): a directory with the command's name also
        # passes the X_OK test but cannot be executed.
        if candidate.is_file() and os.access(candidate, os.X_OK):
            return str(candidate)
    return None
def main() -> None:
    """Parse the command line and dispatch to the requested action.

    ``doctor`` runs on the whole manifest. Every other action first resolves
    the target services from the positional names and ``--group``; only
    ``status`` also considers disabled services.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("action", choices=["start", "stop", "restart", "status", "logs", "doctor"])
    parser.add_argument("services", nargs="*", help="Optional service names for start/stop/restart/status/logs.")
    parser.add_argument("--profile", default=os.getenv("AIW_PROJECT_PROFILE", "fidelity"))
    parser.add_argument("--group", default="", help="Start/stop/status services in a group, e.g. communication or inbox.")
    parser.add_argument("--lines", type=int, default=80, help="Number of log lines for logs action.")
    args = parser.parse_args()

    ensure_runtime()
    manifest = load_manifest(args.profile)
    action = args.action
    profile = args.profile

    if action == "doctor":
        run_doctor(profile, manifest)
        return

    refs = select_services(manifest, args.services, args.group or None, include_disabled=(action == "status"))

    if action in ("stop", "restart"):
        # Stop in reverse manifest order before anything is started again.
        for ref in reversed(refs):
            stop_service(profile, ref)
    if action in ("start", "restart"):
        started: set[str] = set()
        for ref in refs:
            start_service(profile, ref, manifest, started)
        return
    if action == "status":
        for ref in refs:
            status_service(profile, ref)
        return
    if action == "logs":
        if not args.services:
            raise SystemExit("logs requires at least one service name")
        for service in args.services:
            print(f"==> {service} <==")
            tail_log(profile, service, args.lines)
# Run the CLI only when this file is executed directly, not when it is imported.
if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,123 @@
#!/usr/bin/env python3
from __future__ import annotations
import importlib.util
import io
import socket
import sys
import tempfile
import unittest
import warnings
from pathlib import Path
from contextlib import redirect_stdout
from unittest.mock import patch
# Load the sibling services.py by file path rather than by import name.
SERVICES_PATH = Path(__file__).with_name("services.py")
SPEC = importlib.util.spec_from_file_location("aiw_services", SERVICES_PATH)
services = importlib.util.module_from_spec(SPEC)
assert SPEC.loader is not None
# Register the module under its spec name before executing it.
# NOTE(review): presumably needed so code that looks the module up in
# sys.modules while it is being executed can find it - confirm before reordering.
sys.modules[SPEC.name] = services
SPEC.loader.exec_module(services)
def sample_manifest() -> dict:
    """Build a fresh manifest with two enabled services and one disabled one.

    ``beta`` depends on ``alpha``; every entry is a ``process`` running the
    same 60-second sleep command.
    """

    def entry(enabled: bool, groups: list, **extra) -> dict:
        # Build new lists on every call so entries never share mutable values.
        spec = {
            "enabled": enabled,
            "kind": "process",
            "command": ["python3", "-c", "import time; time.sleep(60)"],
            "groups": groups,
        }
        spec.update(extra)
        return spec

    return {
        "services": {
            "alpha": entry(True, ["core"]),
            "beta": entry(True, ["capture"], depends_on=["alpha"]),
            "disabled": entry(False, ["core"]),
        }
    }
class ServiceManagerTests(unittest.TestCase):
    """Tests for service selection, health probing, PID parsing, and process lifecycle."""

    def test_select_services_excludes_disabled_by_default(self) -> None:
        selected = services.select_services(sample_manifest(), names=[], group=None)
        self.assertEqual([item.name for item in selected], ["alpha", "beta"])

    def test_select_services_can_include_disabled_for_status(self) -> None:
        selected = services.select_services(sample_manifest(), names=[], group=None, include_disabled=True)
        self.assertEqual([item.name for item in selected], ["alpha", "beta", "disabled"])

    def test_select_services_filters_by_group(self) -> None:
        selected = services.select_services(sample_manifest(), names=[], group="capture")
        self.assertEqual([item.name for item in selected], ["beta"])

    def test_health_ok_tcp_reports_open_port(self) -> None:
        # Bind to port 0 so the OS picks a free port for the probe to hit.
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as server:
            server.bind(("127.0.0.1", 0))
            server.listen(1)
            port = server.getsockname()[1]
            ok, detail = services.health_ok({"health": {"type": "tcp", "host": "127.0.0.1", "port": port}})
        self.assertTrue(ok)
        self.assertIn(f"127.0.0.1:{port}", detail)

    def test_read_pid_ignores_invalid_pid_file(self) -> None:
        with tempfile.TemporaryDirectory() as tmp:
            pid_dir = Path(tmp) / "pids"
            target = pid_dir / "fidelity"
            target.mkdir(parents=True)
            (target / "alpha.pid").write_text("not-a-pid\n", encoding="utf-8")
            with patch.object(services, "PID_DIR", pid_dir):
                self.assertIsNone(services.read_pid("fidelity", "alpha"))

    def test_start_and_stop_managed_process(self) -> None:
        manifest = {
            "services": {
                "sleeper": {
                    "enabled": True,
                    "kind": "process",
                    "command": [sys.executable, "-c", "import time; time.sleep(60)"],
                    "restart": "never",
                }
            }
        }
        ref = services.ServiceRef("sleeper", manifest["services"]["sleeper"])
        with tempfile.TemporaryDirectory() as tmp:
            root = Path(tmp)
            # Redirect all runtime files into the temporary directory.
            with patch.object(services, "PID_DIR", root / "pids"), \
                    patch.object(services, "LOG_DIR", root / "logs"), \
                    patch.object(services, "STATE_DIR", root / "state"):
                started: set[str] = set()
                services.ensure_runtime()
                with warnings.catch_warnings():
                    warnings.simplefilter("ignore", ResourceWarning)
                    try:
                        with redirect_stdout(io.StringIO()):
                            services.start_service("test", ref, manifest, started)
                        pid = services.read_pid("test", "sleeper")
                        self.assertTrue(services.is_running(pid))
                        self.assertIn("sleeper", started)
                    finally:
                        # Always stop the sleeper: a failed assertion above must
                        # not leave a 60-second child process behind.
                        with redirect_stdout(io.StringIO()):
                            services.stop_service("test", ref)
                    self.assertFalse(services.is_running(pid))
# Run the test suite when this file is executed directly.
if __name__ == "__main__":
    unittest.main()