feat: implement local indexer for project-knowledge and add memory hybrid search functionality
This commit is contained in:
1
.gitignore
vendored
1
.gitignore
vendored
@@ -44,3 +44,4 @@ project-knowledge/.obsidian/cache/
|
|||||||
|
|
||||||
# AI Workspace local service runtime
|
# AI Workspace local service runtime
|
||||||
.aiw/runtime/
|
.aiw/runtime/
|
||||||
|
.aiw/indexes/
|
||||||
|
|||||||
102
core/services/local-rag-index.md
Normal file
102
core/services/local-rag-index.md
Normal file
@@ -0,0 +1,102 @@
|
|||||||
|
---
|
||||||
|
type: service-design
|
||||||
|
status: active
|
||||||
|
updated: 2026-05-21
|
||||||
|
tags:
|
||||||
|
- ai-workspace
|
||||||
|
- rag
|
||||||
|
- index
|
||||||
|
---
|
||||||
|
|
||||||
|
# Local RAG Index
|
||||||
|
|
||||||
|
## Goal
|
||||||
|
|
||||||
|
Add retrieval over canonical workspace memory without replacing the human-readable `project-knowledge/` vault.
|
||||||
|
|
||||||
|
The local index is derived and disposable. If the index disagrees with Markdown, the Markdown wins.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Current Implementation
|
||||||
|
|
||||||
|
The first implementation is dependency-free and lexical:
|
||||||
|
|
||||||
|
```text
|
||||||
|
scripts/aiw/indexer.py
|
||||||
|
```
|
||||||
|
|
||||||
|
It reads:
|
||||||
|
|
||||||
|
```text
|
||||||
|
project-knowledge/**/*.md
|
||||||
|
```
|
||||||
|
|
||||||
|
and writes:
|
||||||
|
|
||||||
|
```text
|
||||||
|
.aiw/indexes/<profile>/project-knowledge.jsonl
|
||||||
|
.aiw/indexes/<profile>/manifest.json
|
||||||
|
```
|
||||||
|
|
||||||
|
It skips:
|
||||||
|
|
||||||
|
```text
|
||||||
|
project-knowledge/09-templates/
|
||||||
|
```
|
||||||
|
|
||||||
|
so Obsidian templates do not appear as real memory.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Commands
|
||||||
|
|
||||||
|
Build the index:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python3 scripts/aiw/indexer.py build --profile fidelity
|
||||||
|
```
|
||||||
|
|
||||||
|
Check index status:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python3 scripts/aiw/indexer.py status --profile fidelity
|
||||||
|
```
|
||||||
|
|
||||||
|
Search the index:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python3 scripts/aiw/indexer.py search "dismissal lifecycle" --profile fidelity
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## MCP Exposure
|
||||||
|
|
||||||
|
`aiw-context-mcp` exposes:
|
||||||
|
|
||||||
|
```text
|
||||||
|
memory_hybrid_search
|
||||||
|
```
|
||||||
|
|
||||||
|
Current behavior:
|
||||||
|
|
||||||
|
- searches the derived local index when it exists
|
||||||
|
- returns cited paths, headings, snippets, scores, hashes, and mtimes
|
||||||
|
- falls back to live Markdown search when no index exists or the index contains no readable rows
|
||||||
|
- remains read-only
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Future Upgrade Path
|
||||||
|
|
||||||
|
This layer can later add:
|
||||||
|
|
||||||
|
- full-text ranking
|
||||||
|
- embeddings
|
||||||
|
- Qdrant or Chroma as a local vector store
|
||||||
|
- hybrid lexical + semantic search
|
||||||
|
- reranking
|
||||||
|
- Mattermost evidence indexing with strict source filters
|
||||||
|
|
||||||
|
Do not make the vector store canonical. It should remain rebuildable from Markdown and selected evidence.
|
||||||
@@ -32,6 +32,18 @@ python3 scripts/aiw/services.py start --profile fidelity --group inbox
|
|||||||
|
|
||||||
The service manager unifies startup and status. It does not move capture behavior into the MCP.
|
The service manager unifies startup and status. It does not move capture behavior into the MCP.
|
||||||
|
|
||||||
|
## Local project-knowledge index
|
||||||
|
|
||||||
|
The workspace includes a dependency-free local indexer for canonical Markdown memory. The index is derived from `project-knowledge/` and written under `.aiw/indexes/<profile>/`; it is safe to delete and rebuild.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python3 scripts/aiw/indexer.py build --profile fidelity
|
||||||
|
python3 scripts/aiw/indexer.py status --profile fidelity
|
||||||
|
python3 scripts/aiw/indexer.py search "dismissal lifecycle" --profile fidelity
|
||||||
|
```
|
||||||
|
|
||||||
|
`aiw-context-mcp` exposes the same derived search through the read-only `memory_hybrid_search` tool and falls back to live Markdown search if the index has not been built yet.
|
||||||
|
|
||||||
## Robustness features
|
## Robustness features
|
||||||
|
|
||||||
- Manifest validation before lifecycle actions.
|
- Manifest validation before lifecycle actions.
|
||||||
@@ -47,4 +59,5 @@ The service manager unifies startup and status. It does not move capture behavio
|
|||||||
|
|
||||||
```bash
|
```bash
|
||||||
python3 scripts/aiw/test_services.py
|
python3 scripts/aiw/test_services.py
|
||||||
|
python3 scripts/aiw/test_indexer.py
|
||||||
```
|
```
|
||||||
|
|||||||
258
scripts/aiw/indexer.py
Normal file
258
scripts/aiw/indexer.py
Normal file
@@ -0,0 +1,258 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""Dependency-free local indexer for AI Workspace canonical Markdown memory.
|
||||||
|
|
||||||
|
This is intentionally a small lexical/hybrid-ready index. It keeps
|
||||||
|
`project-knowledge/` as the source of truth and writes a derived, disposable
|
||||||
|
JSONL index under `.aiw/indexes/<profile>/`.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import hashlib
|
||||||
|
import json
|
||||||
|
import re
|
||||||
|
import time
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
|
||||||
|
# Repository root: this file lives at scripts/aiw/indexer.py, so the root is two directories up.
ROOT = Path(__file__).resolve().parents[2]
# Parent directory of all derived indexes; each profile gets its own subdirectory.
INDEX_ROOT = ROOT / ".aiw" / "indexes"
# Profile used by the CLI when --profile is not given.
DEFAULT_PROFILE = "fidelity"
# Default maximum chunk length, in characters, used by chunk_text.
MAX_CHARS = 1800
# Default number of characters shared between consecutive chunks in chunk_text.
OVERLAP_CHARS = 180
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
class Chunk:
    """One indexed slice of a Markdown file; serialized as a single JSONL row."""

    chunk_id: str  # first 16 hex chars of a SHA-256 over path, section/chunk position, and text
    path: str  # source file path relative to ROOT
    heading: str  # heading of the section the chunk came from ("" before the first heading)
    text: str  # chunk body
    mtime: float  # st_mtime of the source file when the index was built
    sha256: str  # SHA-256 hex digest covering the whole source file
|
||||||
|
|
||||||
|
|
||||||
|
def project_knowledge_dir(profile: str) -> Path:
    """Return the Markdown vault directory to index for ``profile``.

    A profile-specific vault at ``profiles/<profile>/project-knowledge`` wins
    when it exists; otherwise the shared ``project-knowledge`` directory under
    the repository root is returned (whether or not it exists).
    """
    override = ROOT.joinpath("profiles", profile, "project-knowledge")
    return override if override.exists() else ROOT / "project-knowledge"
|
||||||
|
|
||||||
|
|
||||||
|
def index_dir(profile: str) -> Path:
    """Return the directory that holds the derived index files for ``profile``."""
    return INDEX_ROOT.joinpath(profile)
|
||||||
|
|
||||||
|
|
||||||
|
def index_path(profile: str) -> Path:
    """Return the path of the JSONL chunk index for ``profile``."""
    return index_dir(profile).joinpath("project-knowledge.jsonl")
|
||||||
|
|
||||||
|
|
||||||
|
def manifest_path(profile: str) -> Path:
    """Return the path of the index manifest JSON file for ``profile``."""
    return index_dir(profile).joinpath("manifest.json")
|
||||||
|
|
||||||
|
|
||||||
|
def normalize_space(text: str) -> str:
    """Collapse every run of whitespace to one space and trim both ends."""
    # split() with no separator drops leading/trailing whitespace and splits on
    # whitespace runs, so re-joining with a single space does the collapsing.
    return " ".join(text.split())
|
||||||
|
|
||||||
|
|
||||||
|
def tokens(text: str) -> set[str]:
    """Return the distinct lowercase search terms found in ``text``.

    A term starts with an ASCII letter or digit and continues with at least one
    more letter, digit, underscore, or hyphen, so single characters never match.
    """
    return set(re.findall(r"[a-z0-9][a-z0-9_-]{1,}", text.lower()))
|
||||||
|
|
||||||
|
|
||||||
|
def iter_markdown_files(base: Path) -> list[Path]:
    """Return every indexable ``*.md`` file under ``base`` in sorted order.

    Files inside the top-level ``09-templates`` directory are skipped so that
    note templates are not indexed as real memory. Directories whose name
    happens to end in ``.md`` are skipped as well, because callers read every
    returned path as a file.
    """
    files: list[Path] = []
    for path in sorted(base.rglob("*.md")):
        if not path.is_file():
            continue
        # Compare path components rather than a "/"-joined string so the
        # template filter also works where the OS separator is a backslash.
        parts = path.relative_to(base).parts
        if len(parts) > 1 and parts[0] == "09-templates":
            continue
        files.append(path)
    return files
|
||||||
|
|
||||||
|
|
||||||
|
def heading_for_line(line: str, current: str) -> str:
    """Return the heading text carried by ``line``, or ``current`` if it has none.

    Only ATX-style Markdown headings count: one to six ``#`` characters followed
    by whitespace and a title. Lines such as ``#tag`` or ``#!/bin/sh`` start
    with ``#`` but are not headings, and a bare ``#`` has no title, so all of
    those leave the current heading unchanged.
    """
    match = re.match(r"#{1,6}(?:[ \t]+(.*))?$", line.strip())
    if match is None:
        return current
    return (match.group(1) or "").strip() or current
|
||||||
|
|
||||||
|
|
||||||
|
def split_sections(text: str) -> list[tuple[str, str]]:
    """Split Markdown ``text`` into ``(heading, body)`` sections.

    A new section starts at each heading line whose title differs from the
    current heading; the heading line itself is kept as the first line of the
    section body. Text before the first heading belongs to a section with an
    empty heading. Sections whose body is blank are dropped.

    Lines inside fenced code blocks (``` or ~~~) are never treated as
    headings, so shell or Python comments in examples do not create sections.
    """
    fence_open = re.compile(r"(`{3,})[^`]*$|(~{3,})")
    sections: list[tuple[str, list[str]]] = [("", [])]
    current_heading = ""
    fence = ""  # the opening fence marker while inside a fenced code block
    for line in text.splitlines():
        stripped = line.strip()
        if fence:
            sections[-1][1].append(line)
            # A closing fence is made only of the fence character and is at
            # least as long as the opening marker.
            if stripped.startswith(fence) and not stripped.strip(fence[0]):
                fence = ""
            continue
        opener = fence_open.match(stripped)
        if opener:
            fence = opener.group(1) or opener.group(2)
            sections[-1][1].append(line)
            continue
        new_heading = heading_for_line(line, current_heading)
        if new_heading != current_heading and stripped.startswith("#"):
            current_heading = new_heading
            sections.append((current_heading, [line]))
        else:
            sections[-1][1].append(line)
    joined = ((heading, "\n".join(lines).strip()) for heading, lines in sections)
    return [(heading, body) for heading, body in joined if body]
|
||||||
|
|
||||||
|
|
||||||
|
def chunk_text(section_text: str, max_chars: int = MAX_CHARS, overlap_chars: int = OVERLAP_CHARS) -> list[str]:
    """Split ``section_text`` into overlapping chunks of at most ``max_chars``.

    Text that already fits is returned as a single chunk, and blank text yields
    no chunks. Longer text is cut at the last paragraph break or sentence end
    in the second half of each window when one exists, otherwise at the window
    edge. Each following chunk starts ``overlap_chars`` before the previous
    cut.

    Raises:
        ValueError: if ``max_chars`` is not positive and there is text to split.
    """
    text = section_text.strip()
    if not text:
        return []
    if max_chars <= 0:
        raise ValueError("max_chars must be positive")
    if len(text) <= max_chars:
        return [text]
    overlap = max(0, overlap_chars)
    chunks: list[str] = []
    start = 0
    while start < len(text):
        end = min(len(text), start + max_chars)
        if end < len(text):
            boundary = max(text.rfind("\n\n", start, end), text.rfind(". ", start, end))
            # Only accept a natural boundary that keeps the chunk reasonably full.
            if boundary > start + max_chars // 2:
                end = boundary + 1
        chunk = text[start:end].strip()
        if chunk:
            chunks.append(chunk)
        if end >= len(text):
            break
        # Always move forward by at least one character; an overlap as large as
        # the chunk would otherwise restart at the same offset forever.
        start = max(start + 1, end - overlap)
    return chunks
|
||||||
|
|
||||||
|
|
||||||
|
def build_chunks(profile: str) -> list[Chunk]:
    """Read every indexable Markdown file for ``profile`` and return its chunks.

    Each file is split into heading sections and each section into chunks.
    Every chunk records the file path relative to ROOT (with ``/`` separators),
    the file's mtime, and the SHA-256 of the file's bytes as stored on disk, so
    the digest can be compared with an independent hash of the file.
    """
    base = project_knowledge_dir(profile)
    chunks: list[Chunk] = []
    for path in iter_markdown_files(base):
        # Read once as bytes: hashing decoded text would hash newline-translated,
        # re-encoded content instead of what is actually on disk.
        data = path.read_bytes()
        raw = data.decode("utf-8", errors="replace")
        rel = path.relative_to(ROOT).as_posix()
        digest = hashlib.sha256(data).hexdigest()
        mtime = path.stat().st_mtime
        for section_index, (heading, section) in enumerate(split_sections(raw)):
            for chunk_index, chunk in enumerate(chunk_text(section)):
                # The id covers position as well as text so identical text in
                # two places still gets distinct ids.
                chunk_digest = hashlib.sha256(f"{rel}\n{section_index}\n{chunk_index}\n{chunk}".encode("utf-8")).hexdigest()[:16]
                chunks.append(Chunk(chunk_id=chunk_digest, path=rel, heading=heading, text=chunk, mtime=mtime, sha256=digest))
    return chunks
|
||||||
|
|
||||||
|
|
||||||
|
def _write_lines_atomically(path: Path, lines: Any) -> None:
    """Write ``lines`` to a sibling temp file, then rename it over ``path``."""
    tmp = path.with_name(f"{path.name}.tmp")
    with tmp.open("w", encoding="utf-8") as handle:
        handle.writelines(lines)
    tmp.replace(path)


def write_index(profile: str) -> dict[str, Any]:
    """Rebuild the JSONL index and manifest for ``profile``; return the manifest.

    Both files are written to a temporary name and then renamed into place, so
    a process reading the index during a rebuild sees either the old file or
    the new one, never a partially written one.
    """
    out_dir = index_dir(profile)
    out_dir.mkdir(parents=True, exist_ok=True)
    chunks = build_chunks(profile)
    target = index_path(profile)
    _write_lines_atomically(
        target,
        (json.dumps(vars(chunk), ensure_ascii=False, sort_keys=True) + "\n" for chunk in chunks),
    )
    files = sorted({chunk.path for chunk in chunks})
    manifest = {
        "profile": profile,
        "source": str(project_knowledge_dir(profile).relative_to(ROOT)),
        "canonical": False,
        "derived_from": "project-knowledge",
        "index_type": "lexical-markdown-chunks",
        "created_at": datetime.now(timezone.utc).isoformat(),
        # Counts only files that produced at least one chunk.
        "file_count": len(files),
        "chunk_count": len(chunks),
        "index_path": str(target.relative_to(ROOT)),
    }
    _write_lines_atomically(
        manifest_path(profile),
        [json.dumps(manifest, ensure_ascii=False, indent=2, sort_keys=True) + "\n"],
    )
    return manifest
|
||||||
|
|
||||||
|
|
||||||
|
def read_index(profile: str) -> list[dict[str, Any]]:
    """Load the JSONL index rows for ``profile``; return ``[]`` if there is none.

    Blank lines, lines that are not valid JSON, and JSON values that are not
    objects are skipped, so a damaged index degrades to fewer rows instead of
    failing the search.
    """
    path = index_path(profile)
    if not path.is_file():
        return []
    rows: list[dict[str, Any]] = []
    # Iterate the file rather than calling str.splitlines(): the index is
    # written with ensure_ascii=False, so chunk text may contain raw U+2028,
    # U+2029 or U+0085, which splitlines() treats as line breaks and would cut
    # a JSON row in half.
    with path.open(encoding="utf-8", errors="replace") as handle:
        for line in handle:
            if not line.strip():
                continue
            try:
                row = json.loads(line)
            except json.JSONDecodeError:
                continue
            if isinstance(row, dict):
                rows.append(row)
    return rows
|
||||||
|
|
||||||
|
|
||||||
|
def score_chunk(query: str, query_tokens: set[str], chunk: dict[str, Any]) -> float:
    """Return a lexical relevance score for ``chunk``; ``0.0`` means no match.

    Each occurrence of the whole query in the path, heading, or text is worth
    5.0 and each query token present is worth 1.25. The whole query appearing
    in the heading adds 1.5, and in the path adds 1.0.
    """
    needle = query.lower()
    text = str(chunk.get("text") or "")
    haystack = f"{chunk.get('path', '')} {chunk.get('heading', '')} {text}".lower()
    phrase_hits = haystack.count(needle)
    shared = len(query_tokens & tokens(haystack))
    if not phrase_hits and not shared:
        return 0.0
    score = phrase_hits * 5.0 + shared * 1.25
    score += 1.5 if needle in str(chunk.get("heading") or "").lower() else 0.0
    score += 1.0 if needle in str(chunk.get("path") or "").lower() else 0.0
    return score
|
||||||
|
|
||||||
|
|
||||||
|
def snippet_for(query: str, text: str, width: int = 520) -> str:
    """Return a whitespace-normalized excerpt of ``text`` around the query.

    The window is centered on the first occurrence of the whole query, or
    failing that on the earliest occurrence of any query token, or failing
    that it starts at the beginning of the text. It is at most ``width``
    characters wide before whitespace is collapsed.
    """
    lowered = text.lower()
    position = lowered.find(query.lower()) if query else -1
    if position < 0:
        hits = [offset for offset in map(lowered.find, tokens(query)) if offset >= 0]
        position = min(hits, default=0)
    window_start = max(0, position - width // 2)
    window_end = min(len(text), window_start + width)
    return normalize_space(text[window_start:window_end])
|
||||||
|
|
||||||
|
|
||||||
|
def search_index(profile: str, query: str, limit: int = 10) -> dict[str, Any]:
    """Search the derived index for ``profile`` and return ranked matches.

    Rows are ordered by descending score, then path, then chunk id. The result
    carries the manifest of the index that was searched; a missing, unreadable,
    or non-object manifest is reported as ``{}`` so that a damaged manifest
    does not block searching a usable index.

    Raises:
        SystemExit: if ``query`` is empty after stripping.
    """
    query = query.strip()
    if not query:
        raise SystemExit("query is required")
    query_tokens = tokens(query)
    scored: list[tuple[float, dict[str, Any]]] = []
    for row in read_index(profile):
        score = score_chunk(query, query_tokens, row)
        if score > 0:
            scored.append((score, row))
    # Coerce tie-break keys to str: a row with a null path or chunk id must not
    # make the sort raise when it is compared with a string.
    scored.sort(key=lambda item: (-item[0], str(item[1].get("path") or ""), str(item[1].get("chunk_id") or "")))
    matches = [
        {
            "score": round(score, 3),
            "path": row.get("path"),
            "heading": row.get("heading"),
            "chunk_id": row.get("chunk_id"),
            "snippet": snippet_for(query, str(row.get("text") or "")),
            "mtime": row.get("mtime"),
            "sha256": row.get("sha256"),
        }
        # A negative limit would otherwise slice from the end of the list.
        for score, row in scored[: max(0, limit)]
    ]
    manifest: dict[str, Any] = {}
    manifest_file = manifest_path(profile)
    if manifest_file.is_file():
        try:
            loaded = json.loads(manifest_file.read_text(encoding="utf-8"))
        except (OSError, ValueError):
            # ValueError covers both invalid JSON and undecodable bytes.
            loaded = None
        if isinstance(loaded, dict):
            manifest = loaded
    return {"profile": profile, "query": query, "canonical": False, "source": "derived-index", "manifest": manifest, "matches": matches}
|
||||||
|
|
||||||
|
|
||||||
|
def status(profile: str) -> dict[str, Any]:
    """Report whether ``profile`` has an index, plus its manifest details.

    Without a manifest the result is ``{"profile", "indexed": False,
    "index_path"}``. With a readable manifest, the manifest is returned with
    ``indexed``, ``index_bytes`` and ``age_seconds`` added. An unreadable or
    non-object manifest is reported as unindexed with an ``error`` message
    instead of raising, and an unparsable ``created_at`` yields
    ``age_seconds: None``.
    """
    manifest_file = manifest_path(profile)
    path = index_path(profile)
    unindexed = {"profile": profile, "indexed": False, "index_path": str(path.relative_to(ROOT))}
    if not manifest_file.is_file():
        return unindexed
    try:
        manifest = json.loads(manifest_file.read_text(encoding="utf-8"))
    except (OSError, ValueError) as exc:
        # ValueError covers both invalid JSON and undecodable bytes.
        return {**unindexed, "error": f"unreadable manifest: {exc}"}
    if not isinstance(manifest, dict):
        return {**unindexed, "error": "unreadable manifest: expected a JSON object"}
    manifest["indexed"] = path.is_file()
    manifest["index_bytes"] = path.stat().st_size if manifest["indexed"] else 0
    age_seconds = None
    created_at = manifest.get("created_at")
    if created_at:
        try:
            age_seconds = int(time.time() - datetime.fromisoformat(str(created_at)).timestamp())
        except ValueError:
            age_seconds = None
    manifest["age_seconds"] = age_seconds
    return manifest
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> None:
    """Parse the command line, run build/status/search, and print JSON."""
    parser = argparse.ArgumentParser(description=__doc__)
    subparsers = parser.add_subparsers(dest="command", required=True)
    build_cmd = subparsers.add_parser("build")
    status_cmd = subparsers.add_parser("status")
    search_cmd = subparsers.add_parser("search")
    search_cmd.add_argument("query")
    for sub in (build_cmd, status_cmd, search_cmd):
        sub.add_argument("--profile", default=DEFAULT_PROFILE)
    search_cmd.add_argument("--limit", type=int, default=10)

    args = parser.parse_args()
    if args.command == "search":
        # Keep the requested limit within 1..50.
        bounded = max(1, min(args.limit, 50))
        payload = search_index(args.profile, args.query, limit=bounded)
    elif args.command == "build":
        payload = write_index(args.profile)
    else:
        payload = status(args.profile)
    print(json.dumps(payload, ensure_ascii=False, indent=2, sort_keys=True))
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
56
scripts/aiw/test_indexer.py
Normal file
56
scripts/aiw/test_indexer.py
Normal file
@@ -0,0 +1,56 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import importlib.util
|
||||||
|
import json
|
||||||
|
import sys
|
||||||
|
import tempfile
|
||||||
|
import unittest
|
||||||
|
from pathlib import Path
|
||||||
|
from unittest.mock import patch
|
||||||
|
|
||||||
|
|
||||||
|
# Load indexer.py by file path, from the same directory as this test file,
# under the module name "aiw_indexer".
INDEXER_PATH = Path(__file__).with_name("indexer.py")
SPEC = importlib.util.spec_from_file_location("aiw_indexer", INDEXER_PATH)
indexer = importlib.util.module_from_spec(SPEC)
assert SPEC.loader is not None
# Register the module before executing it.
# NOTE(review): presumably needed so code run at import time (e.g. the
# dataclass decorator) can look the module up in sys.modules; confirm.
sys.modules[SPEC.name] = indexer
SPEC.loader.exec_module(indexer)
|
||||||
|
|
||||||
|
|
||||||
|
class IndexerTests(unittest.TestCase):
    """Tests for the local project-knowledge indexer, run against temp directories."""

    def test_build_skips_templates_and_searches_canonical_files(self) -> None:
        """Templates are excluded from the index; real notes are searchable."""
        with tempfile.TemporaryDirectory() as tmp:
            root = Path(tmp)
            real = root / "project-knowledge" / "03-context" / "project.md"
            template = root / "project-knowledge" / "09-templates" / "daily.md"
            real.parent.mkdir(parents=True)
            template.parent.mkdir(parents=True)
            real.write_text("# XFlow\nDismissal lifecycle context", encoding="utf-8")
            template.write_text("# XFlow\nTemplate-only text", encoding="utf-8")

            with patch.object(indexer, "ROOT", root), patch.object(indexer, "INDEX_ROOT", root / ".aiw" / "indexes"):
                manifest = indexer.write_index("fidelity")
                result = indexer.search_index("fidelity", "dismissal lifecycle", limit=5)

        self.assertEqual(manifest["file_count"], 1)
        self.assertEqual(len(result["matches"]), 1)
        self.assertIn("03-context/project.md", result["matches"][0]["path"])

    def test_status_reports_unindexed_profile(self) -> None:
        """Status for a profile with no manifest reports indexed=False."""
        with tempfile.TemporaryDirectory() as tmp:
            root = Path(tmp)
            with patch.object(indexer, "ROOT", root), patch.object(indexer, "INDEX_ROOT", root / ".aiw" / "indexes"):
                result = indexer.status("fidelity")

        self.assertFalse(result["indexed"])
        self.assertIn(".aiw/indexes/fidelity/project-knowledge.jsonl", result["index_path"])

    def test_cli_search_payload_is_json_serializable(self) -> None:
        """The payload returned by search_index survives a JSON round trip."""
        with tempfile.TemporaryDirectory() as tmp:
            root = Path(tmp)
            note = root / "project-knowledge" / "01-current" / "current-work.md"
            note.parent.mkdir(parents=True)
            note.write_text("# Current work\nDismissal lifecycle rollout", encoding="utf-8")

            with patch.object(indexer, "ROOT", root), patch.object(indexer, "INDEX_ROOT", root / ".aiw" / "indexes"):
                indexer.write_index("fidelity")
                payload = indexer.search_index("fidelity", "dismissal lifecycle", limit=5)

        # Serialize the real search payload, not a hand-written dict, so the
        # test fails if search_index ever returns a non-JSON value.
        decoded = json.loads(json.dumps(payload))
        self.assertEqual(len(decoded["matches"]), 1)
        self.assertEqual(decoded["matches"][0]["path"], payload["matches"][0]["path"])
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
unittest.main()
|
||||||
@@ -42,10 +42,19 @@ python3 scripts/mcp/aiw-context-mcp/server.py --transport stdio
|
|||||||
- `communication_thread_context`
|
- `communication_thread_context`
|
||||||
- `project_current_context`
|
- `project_current_context`
|
||||||
- `project_search_memory`
|
- `project_search_memory`
|
||||||
|
- `memory_hybrid_search`
|
||||||
- `photos_latest`
|
- `photos_latest`
|
||||||
|
|
||||||
All tools are read-only. Mattermost tools read `ai/inbox/mattermost-mirror/`; photo tools list local Photo Inbox files without embedding image data; project tools read canonical Markdown under `project-knowledge/`.
|
All tools are read-only. Mattermost tools read `ai/inbox/mattermost-mirror/`; photo tools list local Photo Inbox files without embedding image data; project tools read canonical Markdown under `project-knowledge/`.
|
||||||
|
|
||||||
|
`memory_hybrid_search` reads the derived local index built by:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python3 scripts/aiw/indexer.py build --profile fidelity
|
||||||
|
```
|
||||||
|
|
||||||
|
If the index is missing or contains no readable rows, it falls back to bounded live Markdown search over `project-knowledge/`. The index is not canonical memory; `project-knowledge/` remains the source of truth.
|
||||||
|
|
||||||
Mattermost latest/date/standup tools filter to the active profile's context channels by default. For Fidelity, that list lives in `profiles/fidelity/context-sources.json`. Pass explicit `channels` to override the profile list, or `include_all_channels: true` when broad unfiltered mirror evidence is intentionally needed.
|
Mattermost latest/date/standup tools filter to the active profile's context channels by default. For Fidelity, that list lives in `profiles/fidelity/context-sources.json`. Pass explicit `channels` to override the profile list, or `include_all_channels: true` when broad unfiltered mirror evidence is intentionally needed.
|
||||||
|
|
||||||
## Resources
|
## Resources
|
||||||
|
|||||||
@@ -9,8 +9,10 @@ owned by the AI Workspace Service Manager.
|
|||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
import argparse
|
import argparse
|
||||||
|
import hashlib
|
||||||
import json
|
import json
|
||||||
import os
|
import os
|
||||||
|
import re
|
||||||
import sys
|
import sys
|
||||||
import urllib.parse
|
import urllib.parse
|
||||||
from datetime import date, datetime, timedelta
|
from datetime import date, datetime, timedelta
|
||||||
@@ -25,6 +27,7 @@ PROTOCOL_VERSION = "2025-06-18"
|
|||||||
SERVER_NAME = "aiw-context-mcp"
|
SERVER_NAME = "aiw-context-mcp"
|
||||||
SERVER_VERSION = "0.1.0"
|
SERVER_VERSION = "0.1.0"
|
||||||
LOCAL_ENV = ROOT / "scripts" / "mattermost-proxy" / ".env"
|
LOCAL_ENV = ROOT / "scripts" / "mattermost-proxy" / ".env"
|
||||||
|
INDEX_ROOT = ROOT / ".aiw" / "indexes"
|
||||||
|
|
||||||
|
|
||||||
def load_local_env(path: Path = LOCAL_ENV) -> None:
|
def load_local_env(path: Path = LOCAL_ENV) -> None:
|
||||||
@@ -283,6 +286,100 @@ def project_search_memory(args: dict[str, Any]) -> dict[str, Any]:
|
|||||||
return tool_result({"profile": profile, "canonical": True, "query": query, "matches": matches})
|
return tool_result({"profile": profile, "canonical": True, "query": query, "matches": matches})
|
||||||
|
|
||||||
|
|
||||||
|
def index_path(profile: str) -> Path:
    """Return the path of the derived JSONL chunk index for ``profile``."""
    return INDEX_ROOT.joinpath(profile, "project-knowledge.jsonl")
|
||||||
|
|
||||||
|
|
||||||
|
def index_manifest_path(profile: str) -> Path:
    """Return the path of the derived index manifest for ``profile``."""
    return INDEX_ROOT.joinpath(profile, "manifest.json")
|
||||||
|
|
||||||
|
|
||||||
|
def search_tokens(text: str) -> set[str]:
    """Return the distinct lowercase search terms found in ``text``.

    A term starts with an ASCII letter or digit and continues with at least one
    more letter, digit, underscore, or hyphen, so single characters never match.
    """
    return set(re.findall(r"[a-z0-9][a-z0-9_-]{1,}", text.lower()))
|
||||||
|
|
||||||
|
|
||||||
|
def read_project_index(profile: str) -> list[dict[str, Any]]:
    """Load the derived JSONL index rows for ``profile``; ``[]`` if there is none.

    Blank lines, lines that are not valid JSON, and JSON values that are not
    objects are skipped, so a damaged index degrades to fewer rows instead of
    failing the tool call.
    """
    path = index_path(profile)
    if not path.is_file():
        return []
    rows: list[dict[str, Any]] = []
    # Iterate the file rather than calling str.splitlines(): chunk text may
    # contain raw U+2028, U+2029 or U+0085, which splitlines() treats as line
    # breaks and would cut a JSON row in half.
    with path.open(encoding="utf-8", errors="replace") as handle:
        for line in handle:
            if not line.strip():
                continue
            try:
                row = json.loads(line)
            except json.JSONDecodeError:
                continue
            if isinstance(row, dict):
                rows.append(row)
    return rows
|
||||||
|
|
||||||
|
|
||||||
|
def indexed_snippet(query: str, text: str, width: int = 520) -> str:
    """Return a whitespace-normalized excerpt of ``text`` around the query.

    The window is centered on the first occurrence of the whole query, or
    failing that on the earliest occurrence of any query token, or failing
    that it starts at the beginning of the text.
    """
    lowered = text.lower()
    position = lowered.find(query.lower()) if query else -1
    if position < 0:
        hits = [offset for offset in map(lowered.find, search_tokens(query)) if offset >= 0]
        position = min(hits, default=0)
    window_start = max(0, position - width // 2)
    window_end = min(len(text), window_start + width)
    excerpt = text[window_start:window_end]
    return re.sub(r"\s+", " ", excerpt).strip()
|
||||||
|
|
||||||
|
|
||||||
|
def score_index_row(query: str, query_tokens: set[str], row: dict[str, Any]) -> float:
    """Return a lexical relevance score for an index ``row``; ``0.0`` means no match.

    Each occurrence of the whole query in the path, heading, or text is worth
    5.0 and each query token present is worth 1.25. The whole query appearing
    in the heading adds 1.5, and in the path adds 1.0.
    """
    needle = query.lower()
    text = str(row.get("text") or "")
    haystack = f"{row.get('path', '')} {row.get('heading', '')} {text}".lower()
    phrase_hits = haystack.count(needle)
    shared = len(query_tokens & search_tokens(haystack))
    if not phrase_hits and not shared:
        return 0.0
    score = phrase_hits * 5.0 + shared * 1.25
    score += 1.5 if needle in str(row.get("heading") or "").lower() else 0.0
    score += 1.0 if needle in str(row.get("path") or "").lower() else 0.0
    return score
|
||||||
|
|
||||||
|
|
||||||
|
def read_index_manifest(profile: str) -> dict[str, Any]:
    """Return the index manifest for ``profile``, or ``{}`` if it is unusable.

    A missing file, an I/O error, undecodable bytes, invalid JSON, and a JSON
    value that is not an object all yield ``{}``; the manifest is informational
    and must not fail a search.
    """
    path = index_manifest_path(profile)
    if not path.is_file():
        return {}
    try:
        manifest = json.loads(path.read_text(encoding="utf-8"))
    except (OSError, ValueError):
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        return {}
    return manifest if isinstance(manifest, dict) else {}
|
||||||
|
|
||||||
|
|
||||||
|
def memory_hybrid_search(args: dict[str, Any]) -> dict[str, Any]:
    """Search the derived project-knowledge index, falling back to live search.

    Reads ``profile`` (default ``"fidelity"``), ``query`` (required) and
    ``limit`` (default 10, maximum 50) from ``args``. When the index yields no
    rows, the result of ``project_search_memory`` is returned instead, tagged
    with ``source="live-project-knowledge-fallback"`` and
    ``index_available=False``. Otherwise rows are ranked by score, then path,
    then chunk id, and returned with the index manifest.
    """
    profile = str(args.get("profile") or "fidelity")
    query = str(args.get("query") or "").strip()
    if not query:
        return tool_error("query is required")
    limit = clamp_limit(args.get("limit"), default=10, maximum=50)

    rows = read_project_index(profile)
    if not rows:
        live = project_search_memory({"profile": profile, "query": query, "limit": limit})["structuredContent"]
        live["source"] = "live-project-knowledge-fallback"
        live["index_available"] = False
        return tool_result(live)

    wanted = search_tokens(query)
    ranked = [(score_index_row(query, wanted, row), row) for row in rows]
    ranked = [pair for pair in ranked if pair[0] > 0]
    ranked.sort(key=lambda pair: (-pair[0], pair[1].get("path", ""), pair[1].get("chunk_id", "")))

    matches = []
    for score, row in ranked[:limit]:
        text = str(row.get("text") or "")
        # Rows without a stored id get one derived from their text.
        chunk_id = row.get("chunk_id") or hashlib.sha256(text.encode("utf-8")).hexdigest()[:16]
        matches.append({
            "score": round(score, 3),
            "path": row.get("path"),
            "heading": row.get("heading"),
            "chunk_id": chunk_id,
            "snippet": indexed_snippet(query, text),
            "mtime": row.get("mtime"),
            "sha256": row.get("sha256"),
        })

    payload = {
        "profile": profile,
        "canonical": False,
        "source": "derived-project-knowledge-index",
        "index_available": True,
        "manifest": read_index_manifest(profile),
        "query": query,
        "matches": matches,
    }
    return tool_result(payload)
|
||||||
|
|
||||||
|
|
||||||
def photos_latest(args: dict[str, Any]) -> dict[str, Any]:
|
def photos_latest(args: dict[str, Any]) -> dict[str, Any]:
|
||||||
profile = str(args.get("profile") or "fidelity")
|
profile = str(args.get("profile") or "fidelity")
|
||||||
limit = clamp_limit(args.get("limit"), default=20, maximum=100)
|
limit = clamp_limit(args.get("limit"), default=20, maximum=100)
|
||||||
@@ -373,6 +470,7 @@ TOOLS: dict[str, dict[str, Any]] = {
|
|||||||
"communication_thread_context": {"handler": communication_thread_context, "description": "Read Mattermost mirror evidence for a thread id.", "properties": {"profile": {"type": "string"}, "thread_id": {"type": "string"}, "limit": {"type": "integer"}}},
|
"communication_thread_context": {"handler": communication_thread_context, "description": "Read Mattermost mirror evidence for a thread id.", "properties": {"profile": {"type": "string"}, "thread_id": {"type": "string"}, "limit": {"type": "integer"}}},
|
||||||
"project_current_context": {"handler": project_current_context, "description": "Read canonical current-work and work-items context.", "properties": {"profile": {"type": "string"}}},
|
"project_current_context": {"handler": project_current_context, "description": "Read canonical current-work and work-items context.", "properties": {"profile": {"type": "string"}}},
|
||||||
"project_search_memory": {"handler": project_search_memory, "description": "Search canonical project-knowledge Markdown files.", "properties": {"profile": {"type": "string"}, "query": {"type": "string"}, "limit": {"type": "integer"}}},
|
"project_search_memory": {"handler": project_search_memory, "description": "Search canonical project-knowledge Markdown files.", "properties": {"profile": {"type": "string"}, "query": {"type": "string"}, "limit": {"type": "integer"}}},
|
||||||
|
"memory_hybrid_search": {"handler": memory_hybrid_search, "description": "Search the derived local project-knowledge index with lexical scoring and source citations; falls back to live Markdown search if no index exists.", "properties": {"profile": {"type": "string"}, "query": {"type": "string"}, "limit": {"type": "integer"}}},
|
||||||
"photos_latest": {"handler": photos_latest, "description": "List recent local Photo Inbox files without embedding image data.", "properties": {"profile": {"type": "string"}, "limit": {"type": "integer"}}},
|
"photos_latest": {"handler": photos_latest, "description": "List recent local Photo Inbox files without embedding image data.", "properties": {"profile": {"type": "string"}, "limit": {"type": "integer"}}},
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -32,6 +32,7 @@ class ContextMCPTests(unittest.TestCase):
|
|||||||
|
|
||||||
names = {tool["name"] for tool in response["result"]["tools"]}
|
names = {tool["name"] for tool in response["result"]["tools"]}
|
||||||
self.assertIn("project_search_memory", names)
|
self.assertIn("project_search_memory", names)
|
||||||
|
self.assertIn("memory_hybrid_search", names)
|
||||||
self.assertIn("communication_latest", names)
|
self.assertIn("communication_latest", names)
|
||||||
|
|
||||||
def test_initialize_response_declares_resources(self) -> None:
|
def test_initialize_response_declares_resources(self) -> None:
|
||||||
@@ -158,6 +159,43 @@ class ContextMCPTests(unittest.TestCase):
|
|||||||
self.assertEqual(len(result["matches"]), 1)
|
self.assertEqual(len(result["matches"]), 1)
|
||||||
self.assertIn("03-context/project.md", result["matches"][0]["path"])
|
self.assertIn("03-context/project.md", result["matches"][0]["path"])
|
||||||
|
|
||||||
|
def test_memory_hybrid_search_uses_index_when_available(self) -> None:
    """With an index on disk, results come from the derived index."""
    row = {
        "chunk_id": "abc",
        "path": "project-knowledge/03-context/project.md",
        "heading": "XFlow",
        "text": "Dismissal lifecycle sequencing for XFlow",
        "mtime": 1.0,
        "sha256": "hash",
    }
    with tempfile.TemporaryDirectory() as tmp:
        root = Path(tmp)
        index_root = root / ".aiw" / "indexes"
        profile_dir = index_root / "fidelity"
        profile_dir.mkdir(parents=True)
        (profile_dir / "project-knowledge.jsonl").write_text(json.dumps(row) + "\n", encoding="utf-8")
        (profile_dir / "manifest.json").write_text(json.dumps({"chunk_count": 1}), encoding="utf-8")

        with patch.object(server, "ROOT", root), patch.object(server, "INDEX_ROOT", index_root):
            response = server.memory_hybrid_search({"profile": "fidelity", "query": "dismissal lifecycle"})
            result = response["structuredContent"]

    self.assertTrue(result["index_available"])
    self.assertEqual(result["source"], "derived-project-knowledge-index")
    self.assertEqual(result["matches"][0]["chunk_id"], "abc")
|
||||||
|
|
||||||
|
def test_memory_hybrid_search_falls_back_without_index(self) -> None:
    """Without an index on disk, results come from live Markdown search."""
    with tempfile.TemporaryDirectory() as tmp:
        root = Path(tmp)
        note = root / "project-knowledge" / "03-context" / "project.md"
        note.parent.mkdir(parents=True)
        note.write_text("Important XFlow context", encoding="utf-8")

        with patch.object(server, "ROOT", root), patch.object(server, "INDEX_ROOT", root / ".aiw" / "indexes"):
            response = server.memory_hybrid_search({"profile": "fidelity", "query": "XFlow"})
            result = response["structuredContent"]

    self.assertFalse(result["index_available"])
    self.assertEqual(result["source"], "live-project-knowledge-fallback")
    self.assertEqual(len(result["matches"]), 1)
|
||||||
|
|
||||||
def test_previous_workday_skips_weekend(self) -> None:
|
def test_previous_workday_skips_weekend(self) -> None:
|
||||||
monday = date(2026, 5, 18)
|
monday = date(2026, 5, 18)
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user