feat: Enhance Slack import functionality to prioritize historical context and improve message selection criteria

2026-04-09 16:04:08 -06:00
parent dab9dabd92
commit 1d003fa242
3 changed files with 138 additions and 13 deletions
--- a/.opencode/commands/slack-import.md
+++ b/.opencode/commands/slack-import.md
@@ -13,6 +13,8 @@ Inputs:
 - otherwise, if `archives/slack/export/` exists, use it as the default import source
 - if no channels are specified, auto-detect channels whose folder names start with `fidelity`
 - if no message limit is specified, auto-tune message selection based on archive size
 - if no date range is specified, do an initial full-history sweep across the detected `fidelity*` channels
 - preserve broad coverage across years and channels while still prioritizing high-signal messages
 First, run the importer:
@@ -41,6 +43,7 @@ Instructions:
 - treat the Slack archive as historical evidence
 - assume this may be a large multi-year export
 - assume the first import should preserve evidence from the beginning of the project, not just recent history
 - promote durable project-relevant context automatically when confidence is high
 - prefer promoting:
  - repeated Jira IDs and titles still relevant to current understanding
@@ -48,6 +51,7 @@ Instructions:
  - recurring architecture or debugging patterns
  - past approvals or decisions that still matter
 - prioritize high-signal messages such as Jira references, approvals, scope changes, root-cause notes, points, and persistent technical constraints
 - favor messages that help reconstruct project history across multiple years, not just the newest ones
 - avoid promoting outdated daily status unless it changes current understanding
 - update existing memory when the archive clarifies or corrects it
 - if historical facts are ambiguous or likely outdated, summarize them as archived context instead of promoting them
--- a/scripts/slack/README.md
+++ b/scripts/slack/README.md
@@ -47,7 +47,10 @@ For very large multi-year exports, the importer is designed to be selective by d
 - auto-detects channels whose names start with `fidelity` when no channels are specified
 - auto-tunes the message limit based on archive size
 - when no date filters are provided, performs an initial full-history sweep across the detected `fidelity*` channels
 - preserves coverage across channels and years, not only recent history
 - prefers recent messages plus older high-signal messages
 - preserves strong Jira-linked messages even when they are old
 - prioritizes Jira IDs, approvals, scope changes, root-cause notes, points, and durable technical patterns
 Override behavior if needed:
--- a/scripts/slack/import_slack_export.py
+++ b/scripts/slack/import_slack_export.py
@@ -14,6 +14,7 @@ JIRA_RE = re.compile(r"\b[A-Z][A-Z0-9]+-\d+\b")
 HIGH_SIGNAL_PATTERNS = [
    re.compile(pattern, re.IGNORECASE)
    for pattern in [
        r"\bdecision\b",
        r"\broot cause\b",
        r"\bapproved?\b",
        r"\bpoints?\b",
@@ -29,6 +30,20 @@ HIGH_SIGNAL_PATTERNS = [
        r"\bfeature flag\b",
        r"\btitle\b",
        r"\bscope\b",
        r"\bowner(?:ship)?\b",
        r"\brollout\b",
        r"\bmigration\b",
        r"\bdependency\b",
        r"\bcontract\b",
        r"\blifecycle\b",
        r"\bswiftui\b",
        r"\bbug\b",
        r"\bissue\b",
        r"\bincident\b",
        r"\bfix(?:ed)?\b",
        r"\bvalidation\b",
        r"\bdob\b",
        r"\bteenidentitycheck\b",
        r"\bdone\b",
        r"\bin progress\b",
        r"\bblocked?\b",
@@ -219,7 +234,9 @@ def auto_max_messages(total_messages: int) -> int:
        return 2500
    if total_messages <= 25000:
        return 4000
-    return 6000
+    if total_messages <= 80000:
        return 6500
    return 8000
 def score_message(item: Dict[str, Any], recent_cutoff: Optional[datetime]) -> int:
@@ -257,15 +274,65 @@ def score_message(item: Dict[str, Any], recent_cutoff: Optional[datetime]) -> in
    return score
 def message_key(item: Dict[str, Any]) -> str:
    """Return the string used to recognise duplicate messages.

    A truthy ``message_id`` is returned unchanged. Otherwise the key is the
    ``channel``, ``timestamp``, ``username`` and ``message`` values joined
    with ``|``, with an empty string standing in for any missing field.
    """
    explicit_id = item.get("message_id")
    if explicit_id:
        return explicit_id
    fallback_fields = ("channel", "timestamp", "username", "message")
    return "|".join(item.get(field, "") for field in fallback_fields)
 def message_year(item: Dict[str, Any]) -> str:
    """Return the year of the message's ISO-format timestamp as a string.

    Returns ``"unknown"`` when the ``timestamp`` field is missing, cannot be
    parsed by ``datetime.fromisoformat``, or is not a string at all.
    """
    timestamp = item.get("timestamp", "")
    try:
        return str(datetime.fromisoformat(timestamp).year)
    except (TypeError, ValueError):
        # fromisoformat raises TypeError (not ValueError) for non-string
        # input such as an explicit None, so both are treated as "unknown".
        return "unknown"
 def add_unique_items(
    selected: List[Dict[str, Any]],
    seen: set[str],
    candidates: Sequence[Dict[str, Any]],
    limit: int,
 ) -> None:
    """Append not-yet-seen candidates to ``selected`` until it holds ``limit`` items.

    Both ``selected`` and ``seen`` are mutated in place. Candidates are taken
    in the order given, and each one is identified by ``message_key``. Nothing
    happens when ``limit`` is zero or negative.
    """
    if limit <= 0:
        return
    for candidate in candidates:
        # Check capacity before computing the key so a full list stops early.
        if len(selected) >= limit:
            break
        identity = message_key(candidate)
        if identity not in seen:
            seen.add(identity)
            selected.append(candidate)
 def select_messages(messages: List[Dict[str, Any]], max_messages: int, recent_days: int) -> List[Dict[str, Any]]:
    if len(messages) <= max_messages:
        return messages
    recent_cutoff = datetime.now().astimezone() - timedelta(days=recent_days)
    scored_messages: List[Dict[str, Any]] = []
    recent_messages: List[Dict[str, Any]] = []
    channel_year_buckets: Dict[tuple[str, str], List[Dict[str, Any]]] = {}
    jira_buckets: Dict[str, List[Dict[str, Any]]] = {}
    recent_messages = []
    older_messages = []
    for item in messages:
        item["_score"] = score_message(item, recent_cutoff)
        scored_messages.append(item)
        if item["_score"] > 0:
            bucket_key = (item.get("channel", ""), message_year(item))
            channel_year_buckets.setdefault(bucket_key, []).append(item)
        jira_ids = JIRA_RE.findall(item.get("message", ""))
        for jira_id in jira_ids:
            jira_buckets.setdefault(jira_id, []).append(item)
        try:
            message_dt = datetime.fromisoformat(item.get("timestamp", ""))
        except ValueError:
@@ -273,21 +340,61 @@ def select_messages(messages: List[Dict[str, Any]], max_messages: int, recent_da
        if message_dt and message_dt >= recent_cutoff:
            recent_messages.append(item)
        else:
            older_messages.append(item)
-    recent_budget = min(len(recent_messages), max(max_messages // 2, 1000))
+    selected: List[Dict[str, Any]] = []
-    selected_recent = recent_messages[-recent_budget:]
+    seen: set[str] = set()
    remaining_budget = max_messages - len(selected_recent)
-    scored_older = sorted(
+    recent_budget = min(len(recent_messages), max(max_messages // 5, 300))
-        older_messages,
+    add_unique_items(selected, seen, recent_messages[-recent_budget:], recent_budget)
-        key=lambda item: (score_message(item, recent_cutoff), item.get("timestamp", "")),
+
    channel_year_keys = sorted(channel_year_buckets.keys(), key=lambda key: (key[1], key[0]))
    remaining_after_recent = max_messages - len(selected)
    coverage_budget = min(
        remaining_after_recent,
        max(remaining_after_recent // 3, min(len(channel_year_keys) * 2, remaining_after_recent)),
    )
    if channel_year_keys and coverage_budget > 0:
        per_bucket = max(1, min(4, coverage_budget // len(channel_year_keys) or 1))
        for bucket_key in channel_year_keys:
            bucket_items = sorted(
                channel_year_buckets[bucket_key],
                key=lambda item: (item.get("_score", 0), item.get("timestamp", "")),
                reverse=True,
            )
            add_unique_items(selected, seen, bucket_items, min(max_messages, len(selected) + per_bucket))
    remaining_after_coverage = max_messages - len(selected)
    jira_ranked = sorted(
        jira_buckets.items(),
        key=lambda pair: (
            max(item.get("_score", 0) for item in pair[1]),
            len(pair[1]),
            pair[0],
        ),
        reverse=True,
    )
-    selected_older = scored_older[:remaining_budget]
+    jira_budget = min(remaining_after_coverage, max(remaining_after_coverage // 4, 150))
    if jira_budget > 0:
        for _, bucket_items in jira_ranked:
            ranked_items = sorted(
                bucket_items,
                key=lambda item: (item.get("_score", 0), item.get("timestamp", "")),
                reverse=True,
            )
            add_unique_items(selected, seen, ranked_items, min(max_messages, len(selected) + 2))
            if len(selected) >= recent_budget + coverage_budget + jira_budget:
                break
    ranked_messages = sorted(
        scored_messages,
        key=lambda item: (item.get("_score", 0), item.get("timestamp", "")),
        reverse=True,
    )
    add_unique_items(selected, seen, ranked_messages, max_messages)
    for item in selected:
        item.pop("_score", None)
    selected = selected_recent + selected_older
    selected.sort(key=lambda item: item.get("timestamp", ""))
    return selected
@@ -333,15 +440,21 @@ def build_summary(messages: List[Dict[str, Any]], channels: Sequence[str]) -> st
    channel_counter = Counter(item["channel"] for item in messages)
    user_counter = Counter(item["username"] for item in messages)
    jira_counter = Counter()
    year_counter = Counter()
    for item in messages:
        jira_counter.update(JIRA_RE.findall(item.get("message", "")))
        year_counter.update([message_year(item)])
    first_timestamp = messages[0]["timestamp"] if messages else "n/a"
    last_timestamp = messages[-1]["timestamp"] if messages else "n/a"
    lines = [
        "# Slack Import Summary",
        "",
        f"- Messages imported: {len(messages)}",
        f"- Channels imported: {', '.join(channels) if channels else 'all detected channels'}",
        f"- Time span covered: {first_timestamp} -> {last_timestamp}",
        "",
        "## Top Channels",
    ]
@@ -360,11 +473,16 @@ def build_summary(messages: List[Dict[str, Any]], channels: Sequence[str]) -> st
    else:
        lines.append("- None detected")
    lines.extend(["", "## Historical Coverage"])
    for year, count in year_counter.most_common():
        lines.append(f"- {year}: {count}")
    lines.extend(
        [
            "",
            "## Guidance",
            "- Treat this archive as historical context, not current truth.",
            "- The importer preserves recent context and older high-signal evidence across channels and years.",
            "- Prefer promoting durable patterns, repeated approvals, role mappings, Jira references, and architectural context.",
            "- Avoid promoting outdated status unless it still affects current understanding.",
        ]