feat: enhance Mattermost proxy with improved post ID deduplication and file tracking
This commit is contained in:
@@ -54,6 +54,7 @@ class MattermostMirror:
|
||||
self.latest_md_path = self.out_dir / "latest.md"
|
||||
|
||||
self.seen_post_ids: set[str] = set()
|
||||
self.seen_by_file: dict[Path, set[str]] = {}
|
||||
self.users: dict[str, str] = {}
|
||||
self.channels: dict[str, str] = {}
|
||||
self.state: dict[str, Any] = {"channels": {}, "users": {}, "updated_at": None}
|
||||
@@ -81,11 +82,22 @@ class MattermostMirror:
|
||||
self.state = {"channels": {}, "users": {}, "updated_at": None}
|
||||
|
||||
def _load_recent_seen_ids(self) -> None:
|
||||
# Bound startup work: latest.jsonl contains the dedupe window. Daily files also
|
||||
# protect same-day restarts below.
|
||||
# Bound startup work: latest.jsonl contains the hot dedupe window. Daily
|
||||
# files other than today's (read eagerly by the loop below) are loaded lazily when older/backfilled messages are encountered.
|
||||
for path in [self.latest_jsonl_path, self._daily_messages_path(datetime.now(timezone.utc))]:
|
||||
if not path.exists():
|
||||
continue
|
||||
try:
|
||||
ids = self._load_seen_ids_for_file(path)
|
||||
self.seen_post_ids.update(ids)
|
||||
except Exception:
|
||||
continue
|
||||
|
||||
def _load_seen_ids_for_file(self, path: Path) -> set[str]:
|
||||
if path in self.seen_by_file:
|
||||
return self.seen_by_file[path]
|
||||
ids: set[str] = set()
|
||||
if path.exists():
|
||||
try:
|
||||
with path.open("r", encoding="utf-8") as handle:
|
||||
for line in handle:
|
||||
@@ -94,9 +106,11 @@ class MattermostMirror:
|
||||
obj = json.loads(line)
|
||||
post_id = obj.get("post_id")
|
||||
if post_id:
|
||||
self.seen_post_ids.add(post_id)
|
||||
ids.add(post_id)
|
||||
except Exception:
|
||||
continue
|
||||
ids = set()
|
||||
self.seen_by_file[path] = ids
|
||||
return ids
|
||||
|
||||
def _atomic_write_text(self, path: Path, text: str) -> None:
|
||||
path.parent.mkdir(parents=True, exist_ok=True)
|
||||
@@ -231,12 +245,15 @@ class MattermostMirror:
|
||||
|
||||
def _write_message(self, msg: dict[str, Any]) -> None:
|
||||
post_id = msg["post_id"]
|
||||
if post_id in self.seen_post_ids:
|
||||
return
|
||||
self.seen_post_ids.add(post_id)
|
||||
|
||||
created_dt = self._dt_from_ms(msg.get("created_at_ms"))
|
||||
self._append_jsonl(self._daily_messages_path(created_dt), msg)
|
||||
daily_path = self._daily_messages_path(created_dt)
|
||||
daily_seen = self._load_seen_ids_for_file(daily_path)
|
||||
if post_id in self.seen_post_ids or post_id in daily_seen:
|
||||
return
|
||||
|
||||
self.seen_post_ids.add(post_id)
|
||||
daily_seen.add(post_id)
|
||||
self._append_jsonl(daily_path, msg)
|
||||
self._update_state(msg)
|
||||
self._update_latest(msg)
|
||||
self._update_index(created_dt, msg)
|
||||
|
||||
Reference in New Issue
Block a user