feat: Enhance Slack import functionality to prioritize historical context and improve message selection criteria
This commit is contained in:
@@ -14,6 +14,7 @@ JIRA_RE = re.compile(r"\b[A-Z][A-Z0-9]+-\d+\b")
|
||||
HIGH_SIGNAL_PATTERNS = [
|
||||
re.compile(pattern, re.IGNORECASE)
|
||||
for pattern in [
|
||||
r"\bdecision\b",
|
||||
r"\broot cause\b",
|
||||
r"\bapproved?\b",
|
||||
r"\bpoints?\b",
|
||||
@@ -29,6 +30,20 @@ HIGH_SIGNAL_PATTERNS = [
|
||||
r"\bfeature flag\b",
|
||||
r"\btitle\b",
|
||||
r"\bscope\b",
|
||||
r"\bowner(?:ship)?\b",
|
||||
r"\brollout\b",
|
||||
r"\bmigration\b",
|
||||
r"\bdependency\b",
|
||||
r"\bcontract\b",
|
||||
r"\blifecycle\b",
|
||||
r"\bswiftui\b",
|
||||
r"\bbug\b",
|
||||
r"\bissue\b",
|
||||
r"\bincident\b",
|
||||
r"\bfix(?:ed)?\b",
|
||||
r"\bvalidation\b",
|
||||
r"\bdob\b",
|
||||
r"\bteenidentitycheck\b",
|
||||
r"\bdone\b",
|
||||
r"\bin progress\b",
|
||||
r"\bblocked?\b",
|
||||
@@ -219,7 +234,9 @@ def auto_max_messages(total_messages: int) -> int:
|
||||
return 2500
|
||||
if total_messages <= 25000:
|
||||
return 4000
|
||||
return 6000
|
||||
if total_messages <= 80000:
|
||||
return 6500
|
||||
return 8000
|
||||
|
||||
|
||||
def score_message(item: Dict[str, Any], recent_cutoff: Optional[datetime]) -> int:
|
||||
@@ -257,15 +274,65 @@ def score_message(item: Dict[str, Any], recent_cutoff: Optional[datetime]) -> in
|
||||
return score
|
||||
|
||||
|
||||
def message_key(item: Dict[str, Any]) -> str:
    """Return a string key used to de-duplicate messages.

    A truthy ``message_id`` is used when present. Otherwise the key is the
    channel, timestamp, username and message text joined with ``|``.

    Every component is coerced to ``str`` so that records carrying ``None``
    or non-string values (for example a numeric id, or a null username) still
    yield a usable key instead of raising ``TypeError`` in ``str.join``.
    """
    message_id = item.get("message_id")
    if message_id:
        return str(message_id)

    parts = []
    for field in ("channel", "timestamp", "username", "message"):
        value = item.get(field, "")
        # A key that is present but null must behave like a missing key.
        parts.append("" if value is None else str(value))
    return "|".join(parts)
|
||||
|
||||
|
||||
def message_year(item: Dict[str, Any]) -> str:
    """Return the calendar year of the message's ``timestamp`` as a string.

    The timestamp is parsed with ``datetime.fromisoformat``. If it is
    missing, not a string, or not a valid ISO-8601 value, the literal
    ``"unknown"`` is returned so callers can still bucket the message.
    """
    timestamp = item.get("timestamp", "")
    try:
        return str(datetime.fromisoformat(timestamp).year)
    except (TypeError, ValueError):
        # fromisoformat raises TypeError for non-string input (e.g. None)
        # and ValueError for malformed strings; both mean "no usable year".
        return "unknown"
|
||||
|
||||
|
||||
def add_unique_items(
    selected: List[Dict[str, Any]],
    seen: set[str],
    candidates: Sequence[Dict[str, Any]],
    limit: int,
) -> None:
    """Append not-yet-seen candidates to ``selected`` in place.

    ``limit`` caps the total length of ``selected``, not the number of items
    added by this call. Candidates are visited in order; each one whose
    ``message_key`` is not already in ``seen`` is recorded in ``seen`` and
    appended. Nothing happens when ``limit`` is zero or negative.
    """
    if limit <= 0:
        return

    for candidate in candidates:
        # Stop as soon as the overall cap is reached.
        if len(selected) >= limit:
            break
        fingerprint = message_key(candidate)
        if fingerprint not in seen:
            seen.add(fingerprint)
            selected.append(candidate)
|
||||
|
||||
|
||||
def select_messages(messages: List[Dict[str, Any]], max_messages: int, recent_days: int) -> List[Dict[str, Any]]:
|
||||
if len(messages) <= max_messages:
|
||||
return messages
|
||||
|
||||
recent_cutoff = datetime.now().astimezone() - timedelta(days=recent_days)
|
||||
scored_messages: List[Dict[str, Any]] = []
|
||||
recent_messages: List[Dict[str, Any]] = []
|
||||
channel_year_buckets: Dict[tuple[str, str], List[Dict[str, Any]]] = {}
|
||||
jira_buckets: Dict[str, List[Dict[str, Any]]] = {}
|
||||
|
||||
recent_messages = []
|
||||
older_messages = []
|
||||
for item in messages:
|
||||
item["_score"] = score_message(item, recent_cutoff)
|
||||
scored_messages.append(item)
|
||||
if item["_score"] > 0:
|
||||
bucket_key = (item.get("channel", ""), message_year(item))
|
||||
channel_year_buckets.setdefault(bucket_key, []).append(item)
|
||||
|
||||
jira_ids = JIRA_RE.findall(item.get("message", ""))
|
||||
for jira_id in jira_ids:
|
||||
jira_buckets.setdefault(jira_id, []).append(item)
|
||||
|
||||
try:
|
||||
message_dt = datetime.fromisoformat(item.get("timestamp", ""))
|
||||
except ValueError:
|
||||
@@ -273,21 +340,61 @@ def select_messages(messages: List[Dict[str, Any]], max_messages: int, recent_da
|
||||
|
||||
if message_dt and message_dt >= recent_cutoff:
|
||||
recent_messages.append(item)
|
||||
else:
|
||||
older_messages.append(item)
|
||||
|
||||
recent_budget = min(len(recent_messages), max(max_messages // 2, 1000))
|
||||
selected_recent = recent_messages[-recent_budget:]
|
||||
remaining_budget = max_messages - len(selected_recent)
|
||||
selected: List[Dict[str, Any]] = []
|
||||
seen: set[str] = set()
|
||||
|
||||
scored_older = sorted(
|
||||
older_messages,
|
||||
key=lambda item: (score_message(item, recent_cutoff), item.get("timestamp", "")),
|
||||
recent_budget = min(len(recent_messages), max(max_messages // 5, 300))
|
||||
add_unique_items(selected, seen, recent_messages[-recent_budget:], recent_budget)
|
||||
|
||||
channel_year_keys = sorted(channel_year_buckets.keys(), key=lambda key: (key[1], key[0]))
|
||||
remaining_after_recent = max_messages - len(selected)
|
||||
coverage_budget = min(
|
||||
remaining_after_recent,
|
||||
max(remaining_after_recent // 3, min(len(channel_year_keys) * 2, remaining_after_recent)),
|
||||
)
|
||||
if channel_year_keys and coverage_budget > 0:
|
||||
per_bucket = max(1, min(4, coverage_budget // len(channel_year_keys) or 1))
|
||||
for bucket_key in channel_year_keys:
|
||||
bucket_items = sorted(
|
||||
channel_year_buckets[bucket_key],
|
||||
key=lambda item: (item.get("_score", 0), item.get("timestamp", "")),
|
||||
reverse=True,
|
||||
)
|
||||
add_unique_items(selected, seen, bucket_items, min(max_messages, len(selected) + per_bucket))
|
||||
|
||||
remaining_after_coverage = max_messages - len(selected)
|
||||
jira_ranked = sorted(
|
||||
jira_buckets.items(),
|
||||
key=lambda pair: (
|
||||
max(item.get("_score", 0) for item in pair[1]),
|
||||
len(pair[1]),
|
||||
pair[0],
|
||||
),
|
||||
reverse=True,
|
||||
)
|
||||
selected_older = scored_older[:remaining_budget]
|
||||
jira_budget = min(remaining_after_coverage, max(remaining_after_coverage // 4, 150))
|
||||
if jira_budget > 0:
|
||||
for _, bucket_items in jira_ranked:
|
||||
ranked_items = sorted(
|
||||
bucket_items,
|
||||
key=lambda item: (item.get("_score", 0), item.get("timestamp", "")),
|
||||
reverse=True,
|
||||
)
|
||||
add_unique_items(selected, seen, ranked_items, min(max_messages, len(selected) + 2))
|
||||
if len(selected) >= recent_budget + coverage_budget + jira_budget:
|
||||
break
|
||||
|
||||
ranked_messages = sorted(
|
||||
scored_messages,
|
||||
key=lambda item: (item.get("_score", 0), item.get("timestamp", "")),
|
||||
reverse=True,
|
||||
)
|
||||
add_unique_items(selected, seen, ranked_messages, max_messages)
|
||||
|
||||
for item in selected:
|
||||
item.pop("_score", None)
|
||||
|
||||
selected = selected_recent + selected_older
|
||||
selected.sort(key=lambda item: item.get("timestamp", ""))
|
||||
return selected
|
||||
|
||||
@@ -333,15 +440,21 @@ def build_summary(messages: List[Dict[str, Any]], channels: Sequence[str]) -> st
|
||||
channel_counter = Counter(item["channel"] for item in messages)
|
||||
user_counter = Counter(item["username"] for item in messages)
|
||||
jira_counter = Counter()
|
||||
year_counter = Counter()
|
||||
|
||||
for item in messages:
|
||||
jira_counter.update(JIRA_RE.findall(item.get("message", "")))
|
||||
year_counter.update([message_year(item)])
|
||||
|
||||
first_timestamp = messages[0]["timestamp"] if messages else "n/a"
|
||||
last_timestamp = messages[-1]["timestamp"] if messages else "n/a"
|
||||
|
||||
lines = [
|
||||
"# Slack Import Summary",
|
||||
"",
|
||||
f"- Messages imported: {len(messages)}",
|
||||
f"- Channels imported: {', '.join(channels) if channels else 'all detected channels'}",
|
||||
f"- Time span covered: {first_timestamp} -> {last_timestamp}",
|
||||
"",
|
||||
"## Top Channels",
|
||||
]
|
||||
@@ -360,11 +473,16 @@ def build_summary(messages: List[Dict[str, Any]], channels: Sequence[str]) -> st
|
||||
else:
|
||||
lines.append("- None detected")
|
||||
|
||||
lines.extend(["", "## Historical Coverage"])
|
||||
for year, count in year_counter.most_common():
|
||||
lines.append(f"- {year}: {count}")
|
||||
|
||||
lines.extend(
|
||||
[
|
||||
"",
|
||||
"## Guidance",
|
||||
"- Treat this archive as historical context, not current truth.",
|
||||
"- The importer preserves recent context and older high-signal evidence across channels and years.",
|
||||
"- Prefer promoting durable patterns, repeated approvals, role mappings, Jira references, and architectural context.",
|
||||
"- Avoid promoting outdated status unless it still affects current understanding.",
|
||||
]
|
||||
|
||||
Reference in New Issue
Block a user