feat: implement local indexer for project-knowledge and add memory hybrid search functionality
This commit is contained in:
@@ -9,8 +9,10 @@ owned by the AI Workspace Service Manager.
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import hashlib
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
import urllib.parse
|
||||
from datetime import date, datetime, timedelta
|
||||
@@ -25,6 +27,7 @@ PROTOCOL_VERSION = "2025-06-18"
|
||||
SERVER_NAME = "aiw-context-mcp"
|
||||
SERVER_VERSION = "0.1.0"
|
||||
LOCAL_ENV = ROOT / "scripts" / "mattermost-proxy" / ".env"
|
||||
INDEX_ROOT = ROOT / ".aiw" / "indexes"
|
||||
|
||||
|
||||
def load_local_env(path: Path = LOCAL_ENV) -> None:
|
||||
@@ -283,6 +286,100 @@ def project_search_memory(args: dict[str, Any]) -> dict[str, Any]:
|
||||
return tool_result({"profile": profile, "canonical": True, "query": query, "matches": matches})
|
||||
|
||||
|
||||
def index_path(profile: str) -> Path:
    """Return the path of the project-knowledge JSONL index for *profile*.

    The result is ``INDEX_ROOT/<profile>/project-knowledge.jsonl``. This only
    builds the path; it does not check that the file exists.
    """
    return INDEX_ROOT.joinpath(profile, "project-knowledge.jsonl")
|
||||
|
||||
|
||||
def index_manifest_path(profile: str) -> Path:
    """Return the path of the index manifest file for *profile*.

    The result is ``INDEX_ROOT/<profile>/manifest.json``. This only builds the
    path; it does not check that the file exists.
    """
    return INDEX_ROOT.joinpath(profile, "manifest.json")
|
||||
|
||||
|
||||
def search_tokens(text: str) -> set[str]:
    """Return the set of distinct lowercase search tokens found in *text*.

    A token starts with an ASCII letter or digit and continues with at least
    one more letter, digit, underscore or hyphen, so every token is two or
    more characters long. Matching is done on the lowercased text.
    """
    lowered = text.lower()
    return {match.group(0) for match in re.finditer(r"[a-z0-9][a-z0-9_-]{1,}", lowered)}
|
||||
|
||||
|
||||
def read_project_index(profile: str) -> list[dict[str, Any]]:
    """Load every usable row of the JSONL project-knowledge index for *profile*.

    Returns an empty list when the index file does not exist. Blank lines,
    lines that are not valid JSON, and JSON values that are not objects are
    skipped, so callers can rely on every returned row supporting ``.get``.
    Undecodable bytes are replaced rather than raising.
    """
    path = index_path(profile)
    if not path.is_file():
        return []
    rows: list[dict[str, Any]] = []
    # Iterate the file object instead of ``read_text().splitlines()``:
    # ``str.splitlines`` also breaks on U+2028/U+2029/U+0085, which may appear
    # unescaped inside JSON strings and would cut a valid record in half.
    # File iteration only splits on real newlines, and avoids holding the
    # whole file in memory at once.
    with path.open(encoding="utf-8", errors="replace") as handle:
        for line in handle:
            if not line.strip():
                continue
            try:
                row = json.loads(line)
            except json.JSONDecodeError:
                # Best-effort reader: a corrupt line must not hide the rest.
                continue
            if isinstance(row, dict):
                rows.append(row)
    return rows
|
||||
|
||||
|
||||
def indexed_snippet(query: str, text: str, width: int = 520) -> str:
    """Return a whitespace-normalised excerpt of *text* around the query.

    The excerpt is anchored on the first case-insensitive occurrence of the
    whole *query*. If that is absent, it is anchored on the earliest
    occurrence of any query token, and failing that on the start of the text.
    The window begins ``width // 2`` characters before the anchor (clamped to
    0) and spans at most *width* characters before runs of whitespace are
    collapsed to single spaces.
    """
    haystack = text.lower()
    anchor = haystack.find(query.lower()) if query else -1
    if anchor < 0:
        # No literal hit: fall back to the earliest individual token hit.
        hits = [pos for pos in map(haystack.find, search_tokens(query)) if pos >= 0]
        anchor = min(hits) if hits else 0
    begin = max(0, anchor - width // 2)
    window = text[begin:min(len(text), begin + width)]
    return re.sub(r"\s+", " ", window).strip()
|
||||
|
||||
|
||||
def score_index_row(query: str, query_tokens: set[str], row: dict[str, Any]) -> float:
    """Compute a lexical relevance score for one index row.

    The score combines:
      * 5.0 per case-insensitive occurrence of the whole *query* in the row's
        path, heading and text;
      * 1.25 per query token that also appears among the row's tokens;
      * a 1.5 bonus when the query occurs in the heading;
      * a 1.0 bonus when the query occurs in the path.

    Returns 0.0 when there is neither a literal hit nor a token overlap.
    """
    needle = query.lower()
    # Normalise missing/null fields to "" before building the haystack;
    # formatting a None directly would inject the literal word "None" and
    # make the query "none" match every row lacking a path or heading.
    path = str(row.get("path") or "")
    heading = str(row.get("heading") or "")
    text = str(row.get("text") or "")
    haystack = f"{path} {heading} {text}".lower()
    # str.count("") returns len + 1, so an empty query must not be counted.
    exact = haystack.count(needle) if needle else 0
    # Skip tokenising the haystack when there is nothing to intersect with.
    overlap = len(query_tokens & search_tokens(haystack)) if query_tokens else 0
    if exact == 0 and overlap == 0:
        return 0.0
    heading_bonus = 1.5 if needle and needle in heading.lower() else 0.0
    path_bonus = 1.0 if needle and needle in path.lower() else 0.0
    return exact * 5.0 + overlap * 1.25 + heading_bonus + path_bonus
|
||||
|
||||
|
||||
def read_index_manifest(profile: str) -> dict[str, Any]:
    """Load the index manifest for *profile* as a dictionary.

    Returns an empty dict when the manifest file is missing, is not valid
    UTF-8, is not valid JSON, or does not contain a JSON object at the top
    level.
    """
    path = index_manifest_path(profile)
    if not path.is_file():
        return {}
    try:
        manifest = json.loads(path.read_text(encoding="utf-8"))
    except (json.JSONDecodeError, UnicodeDecodeError):
        # The manifest is derived metadata; a corrupt file should degrade to
        # "no manifest" rather than fail the search that embeds it.
        return {}
    # Honour the declared return type even if the file holds a list/scalar.
    return manifest if isinstance(manifest, dict) else {}
|
||||
|
||||
|
||||
def memory_hybrid_search(args: dict[str, Any]) -> dict[str, Any]:
    """Search the derived project-knowledge index, falling back to live search.

    Reads ``profile`` (default ``"fidelity"``), ``query`` (required) and
    ``limit`` (passed through ``clamp_limit`` with default 10, maximum 50)
    from *args*.

    When the profile has no index rows, the request is delegated to
    ``project_search_memory`` and its payload is tagged with
    ``source="live-project-knowledge-fallback"`` and ``index_available=False``.
    Otherwise every row is scored with ``score_index_row``. Rows with a
    positive score are ordered by descending score, then path, then chunk id,
    and the top *limit* are returned with a snippet and citation fields.

    Returns a tool result, or a tool error when ``query`` is empty.
    """
    profile = str(args.get("profile") or "fidelity")
    query = str(args.get("query") or "").strip()
    if not query:
        return tool_error("query is required")
    limit = clamp_limit(args.get("limit"), default=10, maximum=50)

    rows = read_project_index(profile)
    if not rows:
        live = project_search_memory({"profile": profile, "query": query, "limit": limit})
        fallback = live.get("structuredContent")
        if not isinstance(fallback, dict):
            # NOTE(review): presumably an error result without a structured
            # payload; hand it back unchanged instead of raising KeyError.
            return live
        fallback["source"] = "live-project-knowledge-fallback"
        fallback["index_available"] = False
        return tool_result(fallback)

    query_tokens = search_tokens(query)
    scored = []
    for row in rows:
        # Index lines are arbitrary JSON; ignore anything that is not an object.
        if not isinstance(row, dict):
            continue
        score = score_index_row(query, query_tokens, row)
        if score > 0:
            scored.append((score, row))

    # Coerce tie-break fields to str: a JSON null (None) or numeric value
    # would otherwise raise TypeError when compared against a string.
    scored.sort(
        key=lambda item: (
            -item[0],
            str(item[1].get("path") or ""),
            str(item[1].get("chunk_id") or ""),
        )
    )

    matches = []
    for score, row in scored[:limit]:
        text = str(row.get("text") or "")
        matches.append({
            "score": round(score, 3),
            "path": row.get("path"),
            "heading": row.get("heading"),
            # Rows without an id get a stable one derived from their text.
            "chunk_id": row.get("chunk_id") or hashlib.sha256(text.encode("utf-8")).hexdigest()[:16],
            "snippet": indexed_snippet(query, text),
            "mtime": row.get("mtime"),
            "sha256": row.get("sha256"),
        })

    return tool_result({
        "profile": profile,
        "canonical": False,
        "source": "derived-project-knowledge-index",
        "index_available": True,
        "manifest": read_index_manifest(profile),
        "query": query,
        "matches": matches,
    })
|
||||
|
||||
|
||||
def photos_latest(args: dict[str, Any]) -> dict[str, Any]:
|
||||
profile = str(args.get("profile") or "fidelity")
|
||||
limit = clamp_limit(args.get("limit"), default=20, maximum=100)
|
||||
@@ -373,6 +470,7 @@ TOOLS: dict[str, dict[str, Any]] = {
|
||||
"communication_thread_context": {"handler": communication_thread_context, "description": "Read Mattermost mirror evidence for a thread id.", "properties": {"profile": {"type": "string"}, "thread_id": {"type": "string"}, "limit": {"type": "integer"}}},
|
||||
"project_current_context": {"handler": project_current_context, "description": "Read canonical current-work and work-items context.", "properties": {"profile": {"type": "string"}}},
|
||||
"project_search_memory": {"handler": project_search_memory, "description": "Search canonical project-knowledge Markdown files.", "properties": {"profile": {"type": "string"}, "query": {"type": "string"}, "limit": {"type": "integer"}}},
|
||||
"memory_hybrid_search": {"handler": memory_hybrid_search, "description": "Search the derived local project-knowledge index with lexical scoring and source citations; falls back to live Markdown search if no index exists.", "properties": {"profile": {"type": "string"}, "query": {"type": "string"}, "limit": {"type": "integer"}}},
|
||||
"photos_latest": {"handler": photos_latest, "description": "List recent local Photo Inbox files without embedding image data.", "properties": {"profile": {"type": "string"}, "limit": {"type": "integer"}}},
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user