feat: Enhance Slack import functionality to prioritize historical context and improve message selection criteria
This commit is contained in:
@@ -13,6 +13,8 @@ Inputs:
|
|||||||
- otherwise, if `archives/slack/export/` exists, use it as the default import source
|
- otherwise, if `archives/slack/export/` exists, use it as the default import source
|
||||||
- if no channels are specified, auto-detect channels whose folder names start with `fidelity`
|
- if no channels are specified, auto-detect channels whose folder names start with `fidelity`
|
||||||
- if no message limit is specified, auto-tune message selection based on archive size
|
- if no message limit is specified, auto-tune message selection based on archive size
|
||||||
|
- if no date range is specified, do an initial full-history sweep across the detected `fidelity*` channels
|
||||||
|
- preserve broad coverage across years and channels while still prioritizing high-signal messages
|
||||||
|
|
||||||
First, run the importer:
|
First, run the importer:
|
||||||
|
|
||||||
@@ -41,6 +43,7 @@ Instructions:
|
|||||||
|
|
||||||
- treat the Slack archive as historical evidence
|
- treat the Slack archive as historical evidence
|
||||||
- assume this may be a large multi-year export
|
- assume this may be a large multi-year export
|
||||||
|
- assume the first import should preserve evidence from the beginning of the project, not just recent history
|
||||||
- promote durable project-relevant context automatically when confidence is high
|
- promote durable project-relevant context automatically when confidence is high
|
||||||
- prefer promoting:
|
- prefer promoting:
|
||||||
- repeated Jira IDs and titles still relevant to current understanding
|
- repeated Jira IDs and titles still relevant to current understanding
|
||||||
@@ -48,6 +51,7 @@ Instructions:
|
|||||||
- recurring architecture or debugging patterns
|
- recurring architecture or debugging patterns
|
||||||
- past approvals or decisions that still matter
|
- past approvals or decisions that still matter
|
||||||
- prioritize high-signal messages such as Jira references, approvals, scope changes, root-cause notes, points, and persistent technical constraints
|
- prioritize high-signal messages such as Jira references, approvals, scope changes, root-cause notes, points, and persistent technical constraints
|
||||||
|
- favor messages that help reconstruct project history across multiple years, not just the newest ones
|
||||||
- avoid promoting outdated daily status unless it changes current understanding
|
- avoid promoting outdated daily status unless it changes current understanding
|
||||||
- update existing memory when the archive clarifies or corrects it
|
- update existing memory when the archive clarifies or corrects it
|
||||||
- if historical facts are ambiguous or likely outdated, summarize them as archived context instead of promoting them
|
- if historical facts are ambiguous or likely outdated, summarize them as archived context instead of promoting them
|
||||||
|
|||||||
@@ -47,7 +47,10 @@ For very large multi-year exports, the importer is designed to be selective by d
|
|||||||
|
|
||||||
- auto-detects channels whose names start with `fidelity` when no channels are specified
|
- auto-detects channels whose names start with `fidelity` when no channels are specified
|
||||||
- auto-tunes the message limit based on archive size
|
- auto-tunes the message limit based on archive size
|
||||||
|
- when no date filters are provided, performs an initial full-history sweep across the detected `fidelity*` channels
|
||||||
|
- preserves coverage across channels and years, not only recent history
|
||||||
- prefers recent messages plus older high-signal messages
|
- prefers recent messages plus older high-signal messages
|
||||||
|
- preserves strong Jira-linked messages even when they are old
|
||||||
- prioritizes Jira IDs, approvals, scope changes, root-cause notes, points, and durable technical patterns
|
- prioritizes Jira IDs, approvals, scope changes, root-cause notes, points, and durable technical patterns
|
||||||
|
|
||||||
Override behavior if needed:
|
Override behavior if needed:
|
||||||
|
|||||||
@@ -14,6 +14,7 @@ JIRA_RE = re.compile(r"\b[A-Z][A-Z0-9]+-\d+\b")
|
|||||||
HIGH_SIGNAL_PATTERNS = [
|
HIGH_SIGNAL_PATTERNS = [
|
||||||
re.compile(pattern, re.IGNORECASE)
|
re.compile(pattern, re.IGNORECASE)
|
||||||
for pattern in [
|
for pattern in [
|
||||||
|
r"\bdecision\b",
|
||||||
r"\broot cause\b",
|
r"\broot cause\b",
|
||||||
r"\bapproved?\b",
|
r"\bapproved?\b",
|
||||||
r"\bpoints?\b",
|
r"\bpoints?\b",
|
||||||
@@ -29,6 +30,20 @@ HIGH_SIGNAL_PATTERNS = [
|
|||||||
r"\bfeature flag\b",
|
r"\bfeature flag\b",
|
||||||
r"\btitle\b",
|
r"\btitle\b",
|
||||||
r"\bscope\b",
|
r"\bscope\b",
|
||||||
|
r"\bowner(?:ship)?\b",
|
||||||
|
r"\brollout\b",
|
||||||
|
r"\bmigration\b",
|
||||||
|
r"\bdependency\b",
|
||||||
|
r"\bcontract\b",
|
||||||
|
r"\blifecycle\b",
|
||||||
|
r"\bswiftui\b",
|
||||||
|
r"\bbug\b",
|
||||||
|
r"\bissue\b",
|
||||||
|
r"\bincident\b",
|
||||||
|
r"\bfix(?:ed)?\b",
|
||||||
|
r"\bvalidation\b",
|
||||||
|
r"\bdob\b",
|
||||||
|
r"\bteenidentitycheck\b",
|
||||||
r"\bdone\b",
|
r"\bdone\b",
|
||||||
r"\bin progress\b",
|
r"\bin progress\b",
|
||||||
r"\bblocked?\b",
|
r"\bblocked?\b",
|
||||||
@@ -219,7 +234,9 @@ def auto_max_messages(total_messages: int) -> int:
|
|||||||
return 2500
|
return 2500
|
||||||
if total_messages <= 25000:
|
if total_messages <= 25000:
|
||||||
return 4000
|
return 4000
|
||||||
return 6000
|
if total_messages <= 80000:
|
||||||
|
return 6500
|
||||||
|
return 8000
|
||||||
|
|
||||||
|
|
||||||
def score_message(item: Dict[str, Any], recent_cutoff: Optional[datetime]) -> int:
|
def score_message(item: Dict[str, Any], recent_cutoff: Optional[datetime]) -> int:
|
||||||
@@ -257,15 +274,65 @@ def score_message(item: Dict[str, Any], recent_cutoff: Optional[datetime]) -> in
|
|||||||
return score
|
return score
|
||||||
|
|
||||||
|
|
||||||
|
def message_key(item: Dict[str, Any]) -> str:
    """Return the de-duplication key for a message dict.

    A truthy ``message_id`` is returned unchanged. Otherwise the key is the
    ``channel``, ``timestamp``, ``username`` and ``message`` fields joined
    with ``"|"``.

    Args:
        item: Message dict; every field is optional.

    Returns:
        The ``message_id`` value, or the ``"|"``-joined composite key.
    """
    message_id = item.get("message_id")
    if message_id:
        return message_id

    # A field may be present but None (or a non-string such as a numeric
    # timestamp); str.join would raise TypeError on it, so coerce each part
    # to text and treat None as empty.
    parts = []
    for field in ("channel", "timestamp", "username", "message"):
        value = item.get(field)
        parts.append("" if value is None else str(value))
    return "|".join(parts)
|
||||||
|
|
||||||
|
|
||||||
|
def message_year(item: Dict[str, Any]) -> str:
    """Return the calendar year of a message's ``timestamp`` as a string.

    Args:
        item: Message dict whose ``timestamp`` is expected to be an
            ISO-8601 string.

    Returns:
        The four-digit year (for example ``"2021"``), or ``"unknown"`` when
        the timestamp is missing, empty, not a string, or not parseable.
    """
    timestamp = item.get("timestamp", "")
    try:
        return str(datetime.fromisoformat(timestamp).year)
    except (TypeError, ValueError):
        # ValueError: malformed or empty string.
        # TypeError: the key is present but holds None or a non-string.
        return "unknown"
|
||||||
|
|
||||||
|
|
||||||
|
def add_unique_items(
    selected: List[Dict[str, Any]],
    seen: set[str],
    candidates: Sequence[Dict[str, Any]],
    limit: int,
) -> None:
    """Append not-yet-seen candidates to ``selected``, in order.

    Both ``selected`` and ``seen`` are mutated in place. ``limit`` caps the
    total length of ``selected``, not the number of items added by this
    call; nothing is added when ``limit`` is zero or negative.

    Args:
        selected: Accumulated output list, extended in place.
        seen: Keys (from ``message_key``) of messages already selected.
        candidates: Messages to consider, in priority order.
        limit: Maximum length ``selected`` may reach.
    """
    if limit <= 0:
        return

    for candidate in candidates:
        if len(selected) >= limit:
            break
        candidate_key = message_key(candidate)
        if candidate_key not in seen:
            seen.add(candidate_key)
            selected.append(candidate)
|
||||||
|
|
||||||
|
|
||||||
def select_messages(messages: List[Dict[str, Any]], max_messages: int, recent_days: int) -> List[Dict[str, Any]]:
|
def select_messages(messages: List[Dict[str, Any]], max_messages: int, recent_days: int) -> List[Dict[str, Any]]:
|
||||||
if len(messages) <= max_messages:
|
if len(messages) <= max_messages:
|
||||||
return messages
|
return messages
|
||||||
|
|
||||||
recent_cutoff = datetime.now().astimezone() - timedelta(days=recent_days)
|
recent_cutoff = datetime.now().astimezone() - timedelta(days=recent_days)
|
||||||
|
scored_messages: List[Dict[str, Any]] = []
|
||||||
|
recent_messages: List[Dict[str, Any]] = []
|
||||||
|
channel_year_buckets: Dict[tuple[str, str], List[Dict[str, Any]]] = {}
|
||||||
|
jira_buckets: Dict[str, List[Dict[str, Any]]] = {}
|
||||||
|
|
||||||
recent_messages = []
|
|
||||||
older_messages = []
|
|
||||||
for item in messages:
|
for item in messages:
|
||||||
|
item["_score"] = score_message(item, recent_cutoff)
|
||||||
|
scored_messages.append(item)
|
||||||
|
if item["_score"] > 0:
|
||||||
|
bucket_key = (item.get("channel", ""), message_year(item))
|
||||||
|
channel_year_buckets.setdefault(bucket_key, []).append(item)
|
||||||
|
|
||||||
|
jira_ids = JIRA_RE.findall(item.get("message", ""))
|
||||||
|
for jira_id in jira_ids:
|
||||||
|
jira_buckets.setdefault(jira_id, []).append(item)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
message_dt = datetime.fromisoformat(item.get("timestamp", ""))
|
message_dt = datetime.fromisoformat(item.get("timestamp", ""))
|
||||||
except ValueError:
|
except ValueError:
|
||||||
@@ -273,21 +340,61 @@ def select_messages(messages: List[Dict[str, Any]], max_messages: int, recent_da
|
|||||||
|
|
||||||
if message_dt and message_dt >= recent_cutoff:
|
if message_dt and message_dt >= recent_cutoff:
|
||||||
recent_messages.append(item)
|
recent_messages.append(item)
|
||||||
else:
|
|
||||||
older_messages.append(item)
|
|
||||||
|
|
||||||
recent_budget = min(len(recent_messages), max(max_messages // 2, 1000))
|
selected: List[Dict[str, Any]] = []
|
||||||
selected_recent = recent_messages[-recent_budget:]
|
seen: set[str] = set()
|
||||||
remaining_budget = max_messages - len(selected_recent)
|
|
||||||
|
|
||||||
scored_older = sorted(
|
recent_budget = min(len(recent_messages), max(max_messages // 5, 300))
|
||||||
older_messages,
|
add_unique_items(selected, seen, recent_messages[-recent_budget:], recent_budget)
|
||||||
key=lambda item: (score_message(item, recent_cutoff), item.get("timestamp", "")),
|
|
||||||
|
channel_year_keys = sorted(channel_year_buckets.keys(), key=lambda key: (key[1], key[0]))
|
||||||
|
remaining_after_recent = max_messages - len(selected)
|
||||||
|
coverage_budget = min(
|
||||||
|
remaining_after_recent,
|
||||||
|
max(remaining_after_recent // 3, min(len(channel_year_keys) * 2, remaining_after_recent)),
|
||||||
|
)
|
||||||
|
if channel_year_keys and coverage_budget > 0:
|
||||||
|
per_bucket = max(1, min(4, coverage_budget // len(channel_year_keys) or 1))
|
||||||
|
for bucket_key in channel_year_keys:
|
||||||
|
bucket_items = sorted(
|
||||||
|
channel_year_buckets[bucket_key],
|
||||||
|
key=lambda item: (item.get("_score", 0), item.get("timestamp", "")),
|
||||||
|
reverse=True,
|
||||||
|
)
|
||||||
|
add_unique_items(selected, seen, bucket_items, min(max_messages, len(selected) + per_bucket))
|
||||||
|
|
||||||
|
remaining_after_coverage = max_messages - len(selected)
|
||||||
|
jira_ranked = sorted(
|
||||||
|
jira_buckets.items(),
|
||||||
|
key=lambda pair: (
|
||||||
|
max(item.get("_score", 0) for item in pair[1]),
|
||||||
|
len(pair[1]),
|
||||||
|
pair[0],
|
||||||
|
),
|
||||||
reverse=True,
|
reverse=True,
|
||||||
)
|
)
|
||||||
selected_older = scored_older[:remaining_budget]
|
jira_budget = min(remaining_after_coverage, max(remaining_after_coverage // 4, 150))
|
||||||
|
if jira_budget > 0:
|
||||||
|
for _, bucket_items in jira_ranked:
|
||||||
|
ranked_items = sorted(
|
||||||
|
bucket_items,
|
||||||
|
key=lambda item: (item.get("_score", 0), item.get("timestamp", "")),
|
||||||
|
reverse=True,
|
||||||
|
)
|
||||||
|
add_unique_items(selected, seen, ranked_items, min(max_messages, len(selected) + 2))
|
||||||
|
if len(selected) >= recent_budget + coverage_budget + jira_budget:
|
||||||
|
break
|
||||||
|
|
||||||
|
ranked_messages = sorted(
|
||||||
|
scored_messages,
|
||||||
|
key=lambda item: (item.get("_score", 0), item.get("timestamp", "")),
|
||||||
|
reverse=True,
|
||||||
|
)
|
||||||
|
add_unique_items(selected, seen, ranked_messages, max_messages)
|
||||||
|
|
||||||
|
for item in selected:
|
||||||
|
item.pop("_score", None)
|
||||||
|
|
||||||
selected = selected_recent + selected_older
|
|
||||||
selected.sort(key=lambda item: item.get("timestamp", ""))
|
selected.sort(key=lambda item: item.get("timestamp", ""))
|
||||||
return selected
|
return selected
|
||||||
|
|
||||||
@@ -333,15 +440,21 @@ def build_summary(messages: List[Dict[str, Any]], channels: Sequence[str]) -> st
|
|||||||
channel_counter = Counter(item["channel"] for item in messages)
|
channel_counter = Counter(item["channel"] for item in messages)
|
||||||
user_counter = Counter(item["username"] for item in messages)
|
user_counter = Counter(item["username"] for item in messages)
|
||||||
jira_counter = Counter()
|
jira_counter = Counter()
|
||||||
|
year_counter = Counter()
|
||||||
|
|
||||||
for item in messages:
|
for item in messages:
|
||||||
jira_counter.update(JIRA_RE.findall(item.get("message", "")))
|
jira_counter.update(JIRA_RE.findall(item.get("message", "")))
|
||||||
|
year_counter.update([message_year(item)])
|
||||||
|
|
||||||
|
first_timestamp = messages[0]["timestamp"] if messages else "n/a"
|
||||||
|
last_timestamp = messages[-1]["timestamp"] if messages else "n/a"
|
||||||
|
|
||||||
lines = [
|
lines = [
|
||||||
"# Slack Import Summary",
|
"# Slack Import Summary",
|
||||||
"",
|
"",
|
||||||
f"- Messages imported: {len(messages)}",
|
f"- Messages imported: {len(messages)}",
|
||||||
f"- Channels imported: {', '.join(channels) if channels else 'all detected channels'}",
|
f"- Channels imported: {', '.join(channels) if channels else 'all detected channels'}",
|
||||||
|
f"- Time span covered: {first_timestamp} -> {last_timestamp}",
|
||||||
"",
|
"",
|
||||||
"## Top Channels",
|
"## Top Channels",
|
||||||
]
|
]
|
||||||
@@ -360,11 +473,16 @@ def build_summary(messages: List[Dict[str, Any]], channels: Sequence[str]) -> st
|
|||||||
else:
|
else:
|
||||||
lines.append("- None detected")
|
lines.append("- None detected")
|
||||||
|
|
||||||
|
lines.extend(["", "## Historical Coverage"])
|
||||||
|
for year, count in year_counter.most_common():
|
||||||
|
lines.append(f"- {year}: {count}")
|
||||||
|
|
||||||
lines.extend(
|
lines.extend(
|
||||||
[
|
[
|
||||||
"",
|
"",
|
||||||
"## Guidance",
|
"## Guidance",
|
||||||
"- Treat this archive as historical context, not current truth.",
|
"- Treat this archive as historical context, not current truth.",
|
||||||
|
"- The importer preserves recent context and older high-signal evidence across channels and years.",
|
||||||
"- Prefer promoting durable patterns, repeated approvals, role mappings, Jira references, and architectural context.",
|
"- Prefer promoting durable patterns, repeated approvals, role mappings, Jira references, and architectural context.",
|
||||||
"- Avoid promoting outdated status unless it still affects current understanding.",
|
"- Avoid promoting outdated status unless it still affects current understanding.",
|
||||||
]
|
]
|
||||||
|
|||||||
Reference in New Issue
Block a user