From 1d003fa242d2f0e890d961129ea3f27a409ad524 Mon Sep 17 00:00:00 2001
From: "david.delagneau" <david@all-win.software>
Date: Thu, 9 Apr 2026 16:04:08 -0600
Subject: [PATCH] feat: Enhance Slack import functionality to prioritize
 historical context and improve message selection criteria

---
 .opencode/commands/slack-import.md   |   4 +
 scripts/slack/README.md              |   3 +
 scripts/slack/import_slack_export.py | 144 ++++++++++++++++++++++++---
 3 files changed, 138 insertions(+), 13 deletions(-)

diff --git a/.opencode/commands/slack-import.md b/.opencode/commands/slack-import.md
index 1ca9cc0..e10d119 100644
--- a/.opencode/commands/slack-import.md
+++ b/.opencode/commands/slack-import.md
@@ -13,6 +13,8 @@ Inputs:
 - otherwise, if `archives/slack/export/` exists, use it as the default import source
 - if no channels are specified, auto-detect channels whose folder names start with `fidelity`
 - if no message limit is specified, auto-tune message selection based on archive size
+- if no date range is specified, do an initial full-history sweep across the detected `fidelity*` channels
+- preserve broad coverage across years and channels while still prioritizing high-signal messages
 
 First, run the importer:
 
@@ -41,6 +43,7 @@ Instructions:
 
 - treat the Slack archive as historical evidence
 - assume this may be a large multi-year export
+- assume the first import should preserve evidence from the beginning of the project, not just recent history
 - promote durable project-relevant context automatically when confidence is high
 - prefer promoting:
   - repeated Jira IDs and titles still relevant to current understanding
@@ -48,6 +51,7 @@ Instructions:
   - recurring architecture or debugging patterns
   - past approvals or decisions that still matter
 - prioritize high-signal messages such as Jira references, approvals, scope changes, root-cause notes, points, and persistent technical constraints
+- favor messages that help reconstruct project history across multiple years, not just the newest ones
 - avoid promoting outdated daily status unless it changes current understanding
 - update existing memory when the archive clarifies or corrects it
 - if historical facts are ambiguous or likely outdated, summarize them as archived context instead of promoting them
diff --git a/scripts/slack/README.md b/scripts/slack/README.md
index c4e1515..1524e43 100644
--- a/scripts/slack/README.md
+++ b/scripts/slack/README.md
@@ -47,7 +47,10 @@ For very large multi-year exports, the importer is designed to be selective by d
 
 - auto-detects channels whose names start with `fidelity` when no channels are specified
 - auto-tunes the message limit based on archive size
+- when no date filters are provided, performs an initial full-history sweep across the detected `fidelity*` channels
+- preserves coverage across channels and years, not only recent history
 - prefers recent messages plus older high-signal messages
+- preserves strong Jira-linked messages even when they are old
 - prioritizes Jira IDs, approvals, scope changes, root-cause notes, points, and durable technical patterns
 
 Override behavior if needed:
diff --git a/scripts/slack/import_slack_export.py b/scripts/slack/import_slack_export.py
index 137f138..24159c1 100644
--- a/scripts/slack/import_slack_export.py
+++ b/scripts/slack/import_slack_export.py
@@ -14,6 +14,7 @@ JIRA_RE = re.compile(r"\b[A-Z][A-Z0-9]+-\d+\b")
 HIGH_SIGNAL_PATTERNS = [
     re.compile(pattern, re.IGNORECASE)
     for pattern in [
+        r"\bdecision\b",
         r"\broot cause\b",
         r"\bapproved?\b",
         r"\bpoints?\b",
@@ -29,6 +30,20 @@ HIGH_SIGNAL_PATTERNS = [
         r"\bfeature flag\b",
         r"\btitle\b",
         r"\bscope\b",
+        r"\bowner(?:ship)?\b",
+        r"\brollout\b",
+        r"\bmigration\b",
+        r"\bdependency\b",
+        r"\bcontract\b",
+        r"\blifecycle\b",
+        r"\bswiftui\b",
+        r"\bbug\b",
+        r"\bissue\b",
+        r"\bincident\b",
+        r"\bfix(?:ed)?\b",
+        r"\bvalidation\b",
+        r"\bdob\b",
+        r"\bteenidentitycheck\b",
         r"\bdone\b",
         r"\bin progress\b",
         r"\bblocked?\b",
@@ -219,7 +234,9 @@ def auto_max_messages(total_messages: int) -> int:
         return 2500
     if total_messages <= 25000:
         return 4000
-    return 6000
+    if total_messages <= 80000:
+        return 6500
+    return 8000
 
 
 def score_message(item: Dict[str, Any], recent_cutoff: Optional[datetime]) -> int:
@@ -257,15 +274,65 @@ def score_message(item: Dict[str, Any], recent_cutoff: Optional[datetime]) -> in
     return score
 
 
+def message_key(item: Dict[str, Any]) -> str:
+    """Return the key used to de-duplicate *item* during selection."""
+    explicit_id = item.get("message_id")
+    if explicit_id:
+        return explicit_id
+
+    # No usable id: fall back to the fields that identify a message.
+    fallback_fields = ("channel", "timestamp", "username", "message")
+    return "|".join(item.get(name, "") for name in fallback_fields)
+
+
+def message_year(item: Dict[str, Any]) -> str:
+    """Return the ISO timestamp's year as text, or 'unknown' if unparsable."""
+    try:
+        return str(datetime.fromisoformat(item.get("timestamp", "")).year)
+    except (TypeError, ValueError):  # TypeError: non-string value such as None
+        return "unknown"
+
+
+def add_unique_items(
+    selected: List[Dict[str, Any]],
+    seen: set[str],
+    candidates: Sequence[Dict[str, Any]],
+    limit: int,
+) -> None:
+    """Append not-yet-seen candidates to `selected`, in order, in place.
+
+    `limit` caps the total length of `selected`; `seen` gains each added key.
+    """
+    for candidate in candidates if limit > 0 else ():
+        if len(selected) >= limit:
+            break
+        identity = message_key(candidate)
+        if identity not in seen:
+            seen.add(identity)
+            selected.append(candidate)
+
+
 def select_messages(messages: List[Dict[str, Any]], max_messages: int, recent_days: int) -> List[Dict[str, Any]]:
     if len(messages) <= max_messages:
         return messages
 
     recent_cutoff = datetime.now().astimezone() - timedelta(days=recent_days)
+    scored_messages: List[Dict[str, Any]] = []
+    recent_messages: List[Dict[str, Any]] = []
+    channel_year_buckets: Dict[tuple[str, str], List[Dict[str, Any]]] = {}
+    jira_buckets: Dict[str, List[Dict[str, Any]]] = {}
 
-    recent_messages = []
-    older_messages = []
     for item in messages:
+        item["_score"] = score_message(item, recent_cutoff)
+        scored_messages.append(item)
+        if item["_score"] > 0:
+            bucket_key = (item.get("channel", ""), message_year(item))
+            channel_year_buckets.setdefault(bucket_key, []).append(item)
+
+        jira_ids = JIRA_RE.findall(item.get("message", ""))
+        for jira_id in jira_ids:
+            jira_buckets.setdefault(jira_id, []).append(item)
+
         try:
             message_dt = datetime.fromisoformat(item.get("timestamp", ""))
         except ValueError:
@@ -273,21 +340,61 @@ def select_messages(messages: List[Dict[str, Any]], max_messages: int, recent_da
 
         if message_dt and message_dt >= recent_cutoff:
             recent_messages.append(item)
-        else:
-            older_messages.append(item)
 
-    recent_budget = min(len(recent_messages), max(max_messages // 2, 1000))
-    selected_recent = recent_messages[-recent_budget:]
-    remaining_budget = max_messages - len(selected_recent)
+    selected: List[Dict[str, Any]] = []
+    seen: set[str] = set()
 
-    scored_older = sorted(
-        older_messages,
-        key=lambda item: (score_message(item, recent_cutoff), item.get("timestamp", "")),
+    recent_budget = min(len(recent_messages), max(max_messages // 5, 300))
+    add_unique_items(selected, seen, recent_messages[-recent_budget:], recent_budget)
+
+    channel_year_keys = sorted(channel_year_buckets.keys(), key=lambda key: (key[1], key[0]))
+    remaining_after_recent = max_messages - len(selected)
+    coverage_budget = min(
+        remaining_after_recent,
+        max(remaining_after_recent // 3, min(len(channel_year_keys) * 2, remaining_after_recent)),
+    )
+    if channel_year_keys and coverage_budget > 0:
+        per_bucket = max(1, min(4, coverage_budget // len(channel_year_keys) or 1))
+        for bucket_key in channel_year_keys:
+            bucket_items = sorted(
+                channel_year_buckets[bucket_key],
+                key=lambda item: (item.get("_score", 0), item.get("timestamp", "")),
+                reverse=True,
+            )
+            add_unique_items(selected, seen, bucket_items, min(max_messages, len(selected) + per_bucket))
+
+    remaining_after_coverage = max_messages - len(selected)
+    jira_ranked = sorted(
+        jira_buckets.items(),
+        key=lambda pair: (
+            max(item.get("_score", 0) for item in pair[1]),
+            len(pair[1]),
+            pair[0],
+        ),
         reverse=True,
     )
-    selected_older = scored_older[:remaining_budget]
+    jira_budget = min(remaining_after_coverage, max(remaining_after_coverage // 4, 150))
+    if jira_budget > 0:
+        for _, bucket_items in jira_ranked:
+            ranked_items = sorted(
+                bucket_items,
+                key=lambda item: (item.get("_score", 0), item.get("timestamp", "")),
+                reverse=True,
+            )
+            add_unique_items(selected, seen, ranked_items, min(max_messages, len(selected) + 2))
+            if len(selected) >= recent_budget + coverage_budget + jira_budget:
+                break
+
+    ranked_messages = sorted(
+        scored_messages,
+        key=lambda item: (item.get("_score", 0), item.get("timestamp", "")),
+        reverse=True,
+    )
+    add_unique_items(selected, seen, ranked_messages, max_messages)
+
+    for item in selected:
+        item.pop("_score", None)
 
-    selected = selected_recent + selected_older
     selected.sort(key=lambda item: item.get("timestamp", ""))
     return selected
 
@@ -333,15 +440,21 @@ def build_summary(messages: List[Dict[str, Any]], channels: Sequence[str]) -> st
     channel_counter = Counter(item["channel"] for item in messages)
     user_counter = Counter(item["username"] for item in messages)
     jira_counter = Counter()
+    year_counter = Counter()
 
     for item in messages:
         jira_counter.update(JIRA_RE.findall(item.get("message", "")))
+        year_counter.update([message_year(item)])
+
+    first_timestamp = messages[0]["timestamp"] if messages else "n/a"
+    last_timestamp = messages[-1]["timestamp"] if messages else "n/a"
 
     lines = [
         "# Slack Import Summary",
         "",
         f"- Messages imported: {len(messages)}",
         f"- Channels imported: {', '.join(channels) if channels else 'all detected channels'}",
+        f"- Time span covered: {first_timestamp} -> {last_timestamp}",
         "",
         "## Top Channels",
     ]
@@ -360,11 +473,16 @@ def build_summary(messages: List[Dict[str, Any]], channels: Sequence[str]) -> st
     else:
         lines.append("- None detected")
 
+    lines.extend(["", "## Historical Coverage"])
+    for year, count in year_counter.most_common():
+        lines.append(f"- {year}: {count}")
+
     lines.extend(
         [
             "",
             "## Guidance",
             "- Treat this archive as historical context, not current truth.",
+            "- The importer preserves recent context and older high-signal evidence across channels and years.",
             "- Prefer promoting durable patterns, repeated approvals, role mappings, Jira references, and architectural context.",
             "- Avoid promoting outdated status unless it still affects current understanding.",
         ]