feat: enhance Mattermost proxy with improved post ID deduplication and file tracking

This commit is contained in:
2026-05-19 16:03:16 -06:00
parent 9dd731f758
commit b886c61afd
4 changed files with 30 additions and 35 deletions

View File

@@ -54,6 +54,7 @@ class MattermostMirror:
self.latest_md_path = self.out_dir / "latest.md"
self.seen_post_ids: set[str] = set()
self.seen_by_file: dict[Path, set[str]] = {}
self.users: dict[str, str] = {}
self.channels: dict[str, str] = {}
self.state: dict[str, Any] = {"channels": {}, "users": {}, "updated_at": None}
@@ -81,11 +82,22 @@ class MattermostMirror:
self.state = {"channels": {}, "users": {}, "updated_at": None}
def _load_recent_seen_ids(self) -> None:
# Bound startup work: latest.jsonl contains the dedupe window. Daily files also
# protect same-day restarts below.
# Bound startup work: only latest.jsonl (the hot dedupe window) and today's
# daily file are read eagerly. Other daily files are loaded lazily when
# older/backfilled messages are encountered.
for path in [self.latest_jsonl_path, self._daily_messages_path(datetime.now(timezone.utc))]:
if not path.exists():
continue
try:
ids = self._load_seen_ids_for_file(path)
self.seen_post_ids.update(ids)
except Exception:
continue
def _load_seen_ids_for_file(self, path: Path) -> set[str]:
if path in self.seen_by_file:
return self.seen_by_file[path]
ids: set[str] = set()
if path.exists():
try:
with path.open("r", encoding="utf-8") as handle:
for line in handle:
@@ -94,9 +106,11 @@ class MattermostMirror:
obj = json.loads(line)
post_id = obj.get("post_id")
if post_id:
self.seen_post_ids.add(post_id)
ids.add(post_id)
except Exception:
continue
ids = set()
self.seen_by_file[path] = ids
return ids
def _atomic_write_text(self, path: Path, text: str) -> None:
path.parent.mkdir(parents=True, exist_ok=True)
@@ -231,12 +245,15 @@ class MattermostMirror:
def _write_message(self, msg: dict[str, Any]) -> None:
post_id = msg["post_id"]
if post_id in self.seen_post_ids:
return
self.seen_post_ids.add(post_id)
created_dt = self._dt_from_ms(msg.get("created_at_ms"))
self._append_jsonl(self._daily_messages_path(created_dt), msg)
daily_path = self._daily_messages_path(created_dt)
daily_seen = self._load_seen_ids_for_file(daily_path)
if post_id in self.seen_post_ids or post_id in daily_seen:
return
self.seen_post_ids.add(post_id)
daily_seen.add(post_id)
self._append_jsonl(daily_path, msg)
self._update_state(msg)
self._update_latest(msg)
self._update_index(created_dt, msg)