From 1d003fa242d2f0e890d961129ea3f27a409ad524 Mon Sep 17 00:00:00 2001 From: "david.delagneau" Date: Thu, 9 Apr 2026 16:04:08 -0600 Subject: [PATCH] feat: Enhance Slack import functionality to prioritize historical context and improve message selection criteria --- .opencode/commands/slack-import.md | 4 + scripts/slack/README.md | 3 + scripts/slack/import_slack_export.py | 144 ++++++++++++++++++++++++--- 3 files changed, 138 insertions(+), 13 deletions(-) diff --git a/.opencode/commands/slack-import.md b/.opencode/commands/slack-import.md index 1ca9cc0..e10d119 100644 --- a/.opencode/commands/slack-import.md +++ b/.opencode/commands/slack-import.md @@ -13,6 +13,8 @@ Inputs: - otherwise, if `archives/slack/export/` exists, use it as the default import source - if no channels are specified, auto-detect channels whose folder names start with `fidelity` - if no message limit is specified, auto-tune message selection based on archive size +- if no date range is specified, do an initial full-history sweep across the detected `fidelity*` channels +- preserve broad coverage across years and channels while still prioritizing high-signal messages First, run the importer: @@ -41,6 +43,7 @@ Instructions: - treat the Slack archive as historical evidence - assume this may be a large multi-year export +- assume the first import should preserve evidence from the beginning of the project, not just recent history - promote durable project-relevant context automatically when confidence is high - prefer promoting: - repeated Jira IDs and titles still relevant to current understanding @@ -48,6 +51,7 @@ Instructions: - recurring architecture or debugging patterns - past approvals or decisions that still matter - prioritize high-signal messages such as Jira references, approvals, scope changes, root-cause notes, points, and persistent technical constraints +- favor messages that help reconstruct project history across multiple years, not just the newest ones - avoid promoting outdated daily status unless it changes current understanding - update existing memory when the archive clarifies or corrects it - if historical facts are ambiguous or likely outdated, summarize them as archived context instead of promoting them diff --git a/scripts/slack/README.md b/scripts/slack/README.md index c4e1515..1524e43 100644 --- a/scripts/slack/README.md +++ b/scripts/slack/README.md @@ -47,7 +47,10 @@ For very large multi-year exports, the importer is designed to be selective by d - auto-detects channels whose names start with `fidelity` when no channels are specified - auto-tunes the message limit based on archive size +- when no date filters are provided, performs an initial full-history sweep across the detected `fidelity*` channels +- preserves coverage across channels and years, not only recent history - prefers recent messages plus older high-signal messages +- preserves strong Jira-linked messages even when they are old - prioritizes Jira IDs, approvals, scope changes, root-cause notes, points, and durable technical patterns Override behavior if needed: diff --git a/scripts/slack/import_slack_export.py b/scripts/slack/import_slack_export.py index 137f138..24159c1 100644 --- a/scripts/slack/import_slack_export.py +++ b/scripts/slack/import_slack_export.py @@ -14,6 +14,7 @@ JIRA_RE = re.compile(r"\b[A-Z][A-Z0-9]+-\d+\b") HIGH_SIGNAL_PATTERNS = [ re.compile(pattern, re.IGNORECASE) for pattern in [ + r"\bdecision\b", r"\broot cause\b", r"\bapproved?\b", r"\bpoints?\b", @@ -29,6 +30,20 @@ HIGH_SIGNAL_PATTERNS = [ r"\bfeature flag\b", r"\btitle\b", r"\bscope\b", + r"\bowner(?:ship)?\b", + r"\brollout\b", + r"\bmigration\b", + r"\bdependency\b", + r"\bcontract\b", + r"\blifecycle\b", + r"\bswiftui\b", + r"\bbug\b", + r"\bissue\b", + r"\bincident\b", + r"\bfix(?:ed)?\b", + r"\bvalidation\b", + r"\bdob\b", + r"\bteenidentitycheck\b", r"\bdone\b", r"\bin progress\b", r"\bblocked?\b", @@ -219,7 +234,9 @@ def auto_max_messages(total_messages: int) -> int: return 2500 if total_messages <= 25000: return 4000 - return 6000 + if total_messages <= 80000: + return 6500 + return 8000 def score_message(item: Dict[str, Any], recent_cutoff: Optional[datetime]) -> int: @@ -257,15 +274,65 @@ def score_message(item: Dict[str, Any], recent_cutoff: Optional[datetime]) -> in return score +def message_key(item: Dict[str, Any]) -> str: + return item.get("message_id") or "|".join( + [ + item.get("channel", ""), + item.get("timestamp", ""), + item.get("username", ""), + item.get("message", ""), + ] + ) + + +def message_year(item: Dict[str, Any]) -> str: + timestamp = item.get("timestamp", "") + try: + return str(datetime.fromisoformat(timestamp).year) + except ValueError: + return "unknown" + + +def add_unique_items( + selected: List[Dict[str, Any]], + seen: set[str], + candidates: Sequence[Dict[str, Any]], + limit: int, +) -> None: + if limit <= 0: + return + + for item in candidates: + if len(selected) >= limit: + return + key = message_key(item) + if key in seen: + continue + seen.add(key) + selected.append(item) + + def select_messages(messages: List[Dict[str, Any]], max_messages: int, recent_days: int) -> List[Dict[str, Any]]: if len(messages) <= max_messages: return messages recent_cutoff = datetime.now().astimezone() - timedelta(days=recent_days) + scored_messages: List[Dict[str, Any]] = [] + recent_messages: List[Dict[str, Any]] = [] + channel_year_buckets: Dict[tuple[str, str], List[Dict[str, Any]]] = {} + jira_buckets: Dict[str, List[Dict[str, Any]]] = {} - recent_messages = [] - older_messages = [] for item in messages: + item["_score"] = score_message(item, recent_cutoff) + scored_messages.append(item) + if item["_score"] > 0: + bucket_key = (item.get("channel", ""), message_year(item)) + channel_year_buckets.setdefault(bucket_key, []).append(item) + + jira_ids = JIRA_RE.findall(item.get("message", "")) + for jira_id in jira_ids: + jira_buckets.setdefault(jira_id, []).append(item) + try: message_dt = datetime.fromisoformat(item.get("timestamp", "")) except ValueError: @@ -273,21 +340,61 @@ def select_messages(messages: List[Dict[str, Any]], max_messages: int, recent_da if message_dt and message_dt >= recent_cutoff: recent_messages.append(item) - else: - older_messages.append(item) - recent_budget = min(len(recent_messages), max(max_messages // 2, 1000)) - selected_recent = recent_messages[-recent_budget:] - remaining_budget = max_messages - len(selected_recent) + selected: List[Dict[str, Any]] = [] + seen: set[str] = set() - scored_older = sorted( - older_messages, - key=lambda item: (score_message(item, recent_cutoff), item.get("timestamp", "")), + recent_budget = min(len(recent_messages), max(max_messages // 5, 300)) + add_unique_items(selected, seen, recent_messages[-recent_budget:], recent_budget) + + channel_year_keys = sorted(channel_year_buckets.keys(), key=lambda key: (key[1], key[0])) + remaining_after_recent = max_messages - len(selected) + coverage_budget = min( + remaining_after_recent, + max(remaining_after_recent // 3, min(len(channel_year_keys) * 2, remaining_after_recent)), + ) + if channel_year_keys and coverage_budget > 0: + per_bucket = max(1, min(4, coverage_budget // len(channel_year_keys) or 1)) + for bucket_key in channel_year_keys: + bucket_items = sorted( + channel_year_buckets[bucket_key], + key=lambda item: (item.get("_score", 0), item.get("timestamp", "")), + reverse=True, + ) + add_unique_items(selected, seen, bucket_items, min(max_messages, len(selected) + per_bucket)) + + remaining_after_coverage = max_messages - len(selected) + jira_ranked = sorted( + jira_buckets.items(), + key=lambda pair: ( + max(item.get("_score", 0) for item in pair[1]), + len(pair[1]), + pair[0], + ), reverse=True, ) - selected_older = scored_older[:remaining_budget] + jira_budget = min(remaining_after_coverage, max(remaining_after_coverage // 4, 150)) + if jira_budget > 0: + for _, bucket_items in jira_ranked: + ranked_items = sorted( + bucket_items, + key=lambda item: (item.get("_score", 0), item.get("timestamp", "")), + reverse=True, + ) + add_unique_items(selected, seen, ranked_items, min(max_messages, len(selected) + 2)) + if len(selected) >= recent_budget + coverage_budget + jira_budget: + break + + ranked_messages = sorted( + scored_messages, + key=lambda item: (item.get("_score", 0), item.get("timestamp", "")), + reverse=True, + ) + add_unique_items(selected, seen, ranked_messages, max_messages) + + for item in selected: + item.pop("_score", None) - selected = selected_recent + selected_older selected.sort(key=lambda item: item.get("timestamp", "")) return selected @@ -333,15 +440,21 @@ def build_summary(messages: List[Dict[str, Any]], channels: Sequence[str]) -> st channel_counter = Counter(item["channel"] for item in messages) user_counter = Counter(item["username"] for item in messages) jira_counter = Counter() + year_counter = Counter() for item in messages: jira_counter.update(JIRA_RE.findall(item.get("message", ""))) + year_counter.update([message_year(item)]) + + first_timestamp = messages[0]["timestamp"] if messages else "n/a" + last_timestamp = messages[-1]["timestamp"] if messages else "n/a" lines = [ "# Slack Import Summary", "", f"- Messages imported: {len(messages)}", f"- Channels imported: {', '.join(channels) if channels else 'all detected channels'}", + f"- Time span covered: {first_timestamp} -> {last_timestamp}", "", "## Top Channels", ] @@ -360,11 +473,16 @@ def build_summary(messages: List[Dict[str, Any]], channels: Sequence[str]) -> st else: lines.append("- None detected") + lines.extend(["", "## Historical Coverage"]) + for year, count in year_counter.most_common(): + lines.append(f"- {year}: {count}") + lines.extend( [ "", "## Guidance", "- Treat this archive as historical context, not current truth.", + "- The importer preserves recent context and older high-signal evidence across channels and years.", "- Prefer promoting durable patterns, repeated approvals, role mappings, Jira references, and architectural context.", "- Avoid promoting outdated status unless it still affects current understanding.", ]