feat: implement local indexer for project-knowledge and add memory hybrid search functionality

2026-05-21 09:13:07 -06:00
parent fc2abda588
commit e0069fd8c6
8 changed files with 575 additions and 0 deletions
--- a/scripts/mcp/aiw-context-mcp/README.md
+++ b/scripts/mcp/aiw-context-mcp/README.md
@@ -42,10 +42,19 @@ python3 scripts/mcp/aiw-context-mcp/server.py --transport stdio
 - `communication_thread_context`
 - `project_current_context`
 - `project_search_memory`
+- `memory_hybrid_search`
 - `photos_latest`

 All tools are read-only. Mattermost tools read `ai/inbox/mattermost-mirror/`; photo tools list local Photo Inbox files without embedding image data; project tools read canonical Markdown under `project-knowledge/`.

+`memory_hybrid_search` reads the derived local index built by:
+
+```bash
+python3 scripts/aiw/indexer.py build --profile fidelity
+```
+
+If the index is missing or contains no usable rows, it falls back to bounded live Markdown search over `project-knowledge/`. The index is not canonical memory; `project-knowledge/` remains the source of truth.
+
 Mattermost latest/date/standup tools filter to the active profile's context channels by default. For Fidelity, that list lives in `profiles/fidelity/context-sources.json`. Pass explicit `channels` to override the profile list, or `include_all_channels: true` when broad unfiltered mirror evidence is intentionally needed.

 ## Resources
--- a/scripts/mcp/aiw-context-mcp/server.py
+++ b/scripts/mcp/aiw-context-mcp/server.py
@@ -9,8 +9,10 @@ owned by the AI Workspace Service Manager.
 from __future__ import annotations

 import argparse
+import hashlib
 import json
 import os
+import re
 import sys
 import urllib.parse
 from datetime import date, datetime, timedelta
@@ -25,6 +27,7 @@ PROTOCOL_VERSION = "2025-06-18"
 SERVER_NAME = "aiw-context-mcp"
 SERVER_VERSION = "0.1.0"
 LOCAL_ENV = ROOT / "scripts" / "mattermost-proxy" / ".env"
+INDEX_ROOT = ROOT / ".aiw" / "indexes"


 def load_local_env(path: Path = LOCAL_ENV) -> None:
@@ -283,6 +286,100 @@ def project_search_memory(args: dict[str, Any]) -> dict[str, Any]:
    return tool_result({"profile": profile, "canonical": True, "query": query, "matches": matches})


+def index_path(profile: str) -> Path:
+    """Return the JSONL index file location for ``profile`` under ``INDEX_ROOT``."""
+    profile_dir = INDEX_ROOT / profile
+    return profile_dir / "project-knowledge.jsonl"
+
+
+def index_manifest_path(profile: str) -> Path:
+    """Return the manifest file location for ``profile`` under ``INDEX_ROOT``."""
+    profile_dir = INDEX_ROOT / profile
+    return profile_dir / "manifest.json"
+
+
+def search_tokens(text: str) -> set[str]:
+    """Extract the distinct lowercase search terms found in ``text``.
+
+    A term starts with an ASCII letter or digit and continues with letters,
+    digits, underscores, or hyphens. The pattern itself requires at least two
+    characters, so single-character fragments never appear in the result.
+    """
+    lowered = text.lower()
+    return set(re.findall(r"[a-z0-9][a-z0-9_-]{1,}", lowered))
+
+
+def read_project_index(profile: str) -> list[dict[str, Any]]:
+    """Load the derived project-knowledge index rows for ``profile``.
+
+    The index is read as JSON Lines: one JSON object per line. Reading is
+    best-effort: blank lines, lines that are not valid JSON, and JSON values
+    that are not objects are skipped rather than raised.
+
+    Returns:
+        The parsed rows in file order, or an empty list when the index file
+        does not exist.
+    """
+    path = index_path(profile)
+    if not path.is_file():
+        return []
+    rows: list[dict[str, Any]] = []
+    # Iterate the file object instead of calling ``str.splitlines()``:
+    # splitlines also breaks on characters such as U+2028 and U+0085, which may
+    # appear unescaped inside a JSON string and would cut a valid row in two.
+    with path.open(encoding="utf-8", errors="replace") as handle:
+        for line in handle:
+            if not line.strip():
+                continue
+            try:
+                row = json.loads(line)
+            except json.JSONDecodeError:
+                continue
+            # Rows are consumed through ``row.get``; drop scalars, lists, and nulls.
+            if isinstance(row, dict):
+                rows.append(row)
+    return rows
+
+
+def indexed_snippet(query: str, text: str, width: int = 520) -> str:
+    """Return a whitespace-normalized excerpt of ``text`` around the query.
+
+    The excerpt is anchored on the first case-insensitive occurrence of the
+    whole query. When the whole query is absent, the earliest occurrence of
+    any individual query term is used, and when no term occurs the excerpt
+    starts at the beginning of ``text``. The window begins roughly half of
+    ``width`` before the anchor and spans at most ``width`` characters before
+    whitespace runs are collapsed to single spaces.
+    """
+    haystack = text.lower()
+    anchor = haystack.find(query.lower()) if query else -1
+    if anchor < 0:
+        offsets = [haystack.find(term) for term in search_tokens(query)]
+        hits = [offset for offset in offsets if offset >= 0]
+        anchor = min(hits, default=0)
+    begin = max(0, anchor - width // 2)
+    window = text[begin:begin + width]
+    return re.sub(r"\s+", " ", window).strip()
+
+
+def score_index_row(query: str, query_tokens: set[str], row: dict[str, Any]) -> float:
+    """Compute a lexical relevance score for one index row.
+
+    The row's path, heading, and text are joined into one lowercase haystack.
+    Each occurrence of the full query adds 5.0 and each query term that also
+    appears in the haystack adds 1.25. A query found in the heading adds 1.5
+    and a query found in the path adds 1.0. Rows with neither a full-query
+    occurrence nor a shared term score 0.0.
+    """
+    needle = query.lower()
+    body = str(row.get("text") or "")
+    haystack = f"{row.get('path', '')} {row.get('heading', '')} {body}".lower()
+    exact_hits = haystack.count(needle)
+    shared_terms = len(query_tokens & search_tokens(haystack))
+    if not exact_hits and not shared_terms:
+        return 0.0
+    score = exact_hits * 5.0 + shared_terms * 1.25
+    if needle in str(row.get("heading") or "").lower():
+        score += 1.5
+    if needle in str(row.get("path") or "").lower():
+        score += 1.0
+    return score
+
+
+def read_index_manifest(profile: str) -> dict[str, Any]:
+    """Load the index manifest for ``profile`` on a best-effort basis.
+
+    Returns:
+        The manifest object, or an empty dict when the manifest file does not
+        exist, cannot be decoded as UTF-8, is not valid JSON, or holds a JSON
+        value that is not an object.
+    """
+    path = index_manifest_path(profile)
+    if not path.is_file():
+        return {}
+    try:
+        manifest = json.loads(path.read_text(encoding="utf-8"))
+    except ValueError:
+        # ValueError is the common base of json.JSONDecodeError and
+        # UnicodeDecodeError, so a badly encoded manifest is treated the same
+        # way as malformed JSON instead of failing the whole search.
+        return {}
+    # Honour the declared return type even if the file holds a list or scalar.
+    return manifest if isinstance(manifest, dict) else {}
+
+
+def memory_hybrid_search(args: dict[str, Any]) -> dict[str, Any]:
+    """Search the derived project-knowledge index, falling back to live search.
+
+    Args:
+        args: Tool arguments. ``profile`` defaults to ``"fidelity"``, ``query``
+            is required, and ``limit`` is passed through ``clamp_limit`` with a
+            default of 10 and a maximum of 50.
+
+    Returns:
+        A tool error when the query is empty. When the index yields no rows,
+        the ``project_search_memory`` payload tagged with
+        ``source="live-project-knowledge-fallback"`` and
+        ``index_available=False``. Otherwise a non-canonical result holding the
+        index manifest and the top-scoring rows with their snippets.
+    """
+    profile = str(args.get("profile") or "fidelity")
+    query = str(args.get("query") or "").strip()
+    if not query:
+        return tool_error("query is required")
+    limit = clamp_limit(args.get("limit"), default=10, maximum=50)
+    rows = read_project_index(profile)
+    if not rows:
+        # No usable index rows: reuse the live Markdown search and label its payload.
+        fallback = project_search_memory({"profile": profile, "query": query, "limit": limit})["structuredContent"]
+        fallback["source"] = "live-project-knowledge-fallback"
+        fallback["index_available"] = False
+        return tool_result(fallback)
+    query_tokens = search_tokens(query)
+    scored = []
+    for row in rows:
+        score = score_index_row(query, query_tokens, row)
+        if score > 0:
+            scored.append((score, row))
+    # Highest score first; ties are broken by path, then chunk id. The tie-break
+    # fields are coerced to text so a row with a null or non-string path or
+    # chunk id cannot raise TypeError when compared against a string.
+    scored.sort(key=lambda item: (-item[0], str(item[1].get("path") or ""), str(item[1].get("chunk_id") or "")))
+    matches = []
+    for score, row in scored[:limit]:
+        text = str(row.get("text") or "")
+        matches.append({
+            "score": round(score, 3),
+            "path": row.get("path"),
+            "heading": row.get("heading"),
+            # Rows without a chunk id get one derived from their text.
+            "chunk_id": row.get("chunk_id") or hashlib.sha256(text.encode("utf-8")).hexdigest()[:16],
+            "snippet": indexed_snippet(query, text),
+            "mtime": row.get("mtime"),
+            "sha256": row.get("sha256"),
+        })
+    return tool_result({
+        "profile": profile,
+        "canonical": False,
+        "source": "derived-project-knowledge-index",
+        "index_available": True,
+        "manifest": read_index_manifest(profile),
+        "query": query,
+        "matches": matches,
+    })
+
+
 def photos_latest(args: dict[str, Any]) -> dict[str, Any]:
    profile = str(args.get("profile") or "fidelity")
    limit = clamp_limit(args.get("limit"), default=20, maximum=100)
@@ -373,6 +470,7 @@ TOOLS: dict[str, dict[str, Any]] = {
    "communication_thread_context": {"handler": communication_thread_context, "description": "Read Mattermost mirror evidence for a thread id.", "properties": {"profile": {"type": "string"}, "thread_id": {"type": "string"}, "limit": {"type": "integer"}}},
    "project_current_context": {"handler": project_current_context, "description": "Read canonical current-work and work-items context.", "properties": {"profile": {"type": "string"}}},
    "project_search_memory": {"handler": project_search_memory, "description": "Search canonical project-knowledge Markdown files.", "properties": {"profile": {"type": "string"}, "query": {"type": "string"}, "limit": {"type": "integer"}}},
+    "memory_hybrid_search": {"handler": memory_hybrid_search, "description": "Search the derived local project-knowledge index with lexical scoring and source citations; falls back to live Markdown search if no index exists.", "properties": {"profile": {"type": "string"}, "query": {"type": "string"}, "limit": {"type": "integer"}}},
    "photos_latest": {"handler": photos_latest, "description": "List recent local Photo Inbox files without embedding image data.", "properties": {"profile": {"type": "string"}, "limit": {"type": "integer"}}},
 }

--- a/scripts/mcp/aiw-context-mcp/test_server.py
+++ b/scripts/mcp/aiw-context-mcp/test_server.py
@@ -32,6 +32,7 @@ class ContextMCPTests(unittest.TestCase):

        names = {tool["name"] for tool in response["result"]["tools"]}
        self.assertIn("project_search_memory", names)
+        self.assertIn("memory_hybrid_search", names)
        self.assertIn("communication_latest", names)

    def test_initialize_response_declares_resources(self) -> None:
@@ -158,6 +159,43 @@ class ContextMCPTests(unittest.TestCase):
        self.assertEqual(len(result["matches"]), 1)
        self.assertIn("03-context/project.md", result["matches"][0]["path"])

+    def test_memory_hybrid_search_uses_index_when_available(self) -> None:
+        """An existing index is searched directly and reported as the source."""
+        row = {
+            "chunk_id": "abc",
+            "path": "project-knowledge/03-context/project.md",
+            "heading": "XFlow",
+            "text": "Dismissal lifecycle sequencing for XFlow",
+            "mtime": 1.0,
+            "sha256": "hash",
+        }
+        with tempfile.TemporaryDirectory() as tmp:
+            root = Path(tmp)
+            index_root = root / ".aiw" / "indexes"
+            profile_dir = index_root / "fidelity"
+            profile_dir.mkdir(parents=True)
+            (profile_dir / "project-knowledge.jsonl").write_text(json.dumps(row) + "\n", encoding="utf-8")
+            (profile_dir / "manifest.json").write_text(json.dumps({"chunk_count": 1}), encoding="utf-8")
+
+            # Point the server at the temporary workspace for the duration of the call.
+            with patch.object(server, "ROOT", root), patch.object(server, "INDEX_ROOT", index_root):
+                response = server.memory_hybrid_search({"profile": "fidelity", "query": "dismissal lifecycle"})
+
+        result = response["structuredContent"]
+        self.assertTrue(result["index_available"])
+        self.assertEqual(result["source"], "derived-project-knowledge-index")
+        self.assertEqual(result["matches"][0]["chunk_id"], "abc")
+
+    def test_memory_hybrid_search_falls_back_without_index(self) -> None:
+        """Without an index file the tool answers from live Markdown search."""
+        with tempfile.TemporaryDirectory() as tmp:
+            root = Path(tmp)
+            context_dir = root / "project-knowledge" / "03-context"
+            context_dir.mkdir(parents=True)
+            (context_dir / "project.md").write_text("Important XFlow context", encoding="utf-8")
+            index_root = root / ".aiw" / "indexes"
+
+            # The index directory is never created, so the lookup finds no rows.
+            with patch.object(server, "ROOT", root), patch.object(server, "INDEX_ROOT", index_root):
+                response = server.memory_hybrid_search({"profile": "fidelity", "query": "XFlow"})
+
+        result = response["structuredContent"]
+        self.assertFalse(result["index_available"])
+        self.assertEqual(result["source"], "live-project-knowledge-fallback")
+        self.assertEqual(len(result["matches"]), 1)
+
    def test_previous_workday_skips_weekend(self) -> None:
        monday = date(2026, 5, 18)