Skip to content

Commit be1ac7f

Browse files
add public archive mode
1 parent 43d0453 commit be1ac7f

File tree

5 files changed

+381
-2
lines changed

5 files changed

+381
-2
lines changed

Dockerfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ RUN apk add --no-cache gcc make python3-dev musl-dev libffi-dev openssl-dev && \
88
pip install --no-cache-dir -r requirements.txt && \
99
apk del gcc make python3-dev musl-dev libffi-dev openssl-dev
1010

11-
RUN apk add --no-cache gpg
11+
RUN apk add --no-cache gpg git
1212

1313
COPY . .
1414

git_archive.py

Lines changed: 343 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,343 @@
1+
"""
2+
Git Archive Module for Discord Backup Bot
3+
4+
Writes Discord messages to GitHub repositories in a format compatible with
5+
discord-backup-restorer. Runs in parallel with the existing encrypted S3 backup.
6+
"""
7+
8+
import base64
9+
import hashlib
10+
import json
11+
import os
12+
from datetime import datetime
13+
from pathlib import Path
14+
from typing import Optional
15+
16+
from git import Repo
17+
from git.exc import GitCommandError
18+
19+
20+
class GitArchiveConfig:
    """Load and validate per-server git archive configuration."""

    def __init__(self, config_path: str):
        # Load eagerly so a bad path or malformed JSON fails at startup.
        self.config_path = config_path
        self.config = self._load_config()

    def _load_config(self) -> dict:
        """Read and parse the JSON configuration file."""
        try:
            with open(self.config_path, 'r', encoding='utf-8') as handle:
                return json.load(handle)
        except FileNotFoundError:
            raise FileNotFoundError(
                f"Git archive config not found: {self.config_path}"
            ) from None

    @property
    def default_branch(self) -> str:
        # Branch used when a server entry does not name one.
        return self.config.get('default_branch', 'main')

    @property
    def commit_batch_size(self) -> int:
        # How many messages to accumulate before a batch commit.
        return self.config.get('commit_batch_size', 50)

    def get_server_config(self, server_id: str) -> Optional[dict]:
        """Return the config dict for *server_id*, or None if absent."""
        return self.config.get('servers', {}).get(str(server_id))

    def is_server_enabled(self, server_id: str) -> bool:
        """Whether git archiving is switched on for a server."""
        entry = self.get_server_config(server_id)
        return False if entry is None else entry.get('enabled', False)
54+
55+
56+
class GitArchiveManager:
    """Manages git-based archiving of Discord messages.

    Usage pattern:
      1. ``queue_message()`` buffers messages per channel.
      2. ``flush_and_commit()`` converts the buffered messages into daily
         JSON files (one per calendar day, per channel) inside the server's
         git clone, then commits and pushes the result.

    Repos are cloned lazily and cached for the lifetime of the manager.
    Project-type annotations are string forward references so this module
    can be imported for inspection without GitPython installed.
    """

    def __init__(self, config: "GitArchiveConfig", clone_path: str, github_token: str):
        # clone_path: directory holding one git clone per server.
        # github_token: used for authenticated HTTPS access to GitHub.
        self.config = config
        self.clone_path = clone_path
        self.github_token = github_token
        # server_id -> open GitPython Repo handle (lazy cache)
        self.repos: "dict[str, Repo]" = {}
        # "{server_id}_{channel_id}" -> list of queued message items
        self.message_queues: dict[str, list] = {}

    def _get_repo_path(self, server_id: str) -> str:
        """Return the local filesystem path for a server's repo clone."""
        return os.path.join(self.clone_path, str(server_id))

    def _get_authenticated_url(self, repo_url: str) -> str:
        """Embed the GitHub token into an HTTPS GitHub URL.

        Non-GitHub or non-HTTPS URLs pass through unchanged.
        """
        if repo_url.startswith('https://github.com/'):
            return repo_url.replace(
                'https://github.com/',
                f'https://{self.github_token}@github.com/'
            )
        return repo_url

    def ensure_repo_cloned(self, server_id: str) -> "Optional[Repo]":
        """Ensure the repository for a server is cloned and up to date.

        Returns the cached/opened/cloned Repo, or None when the server is
        disabled for archiving or has no repo_url configured.
        """
        server_config = self.config.get_server_config(server_id)
        if server_config is None or not server_config.get('enabled', False):
            return None

        # Serve from cache before touching the filesystem or network.
        if str(server_id) in self.repos:
            return self.repos[str(server_id)]

        repo_url = server_config.get('repo_url')
        if not repo_url:
            # Fix: a missing repo_url on an enabled server previously crashed
            # with AttributeError inside _get_authenticated_url (None.startswith).
            print(f'\t[Git Archive] Warning: no repo_url configured for server {server_id}')
            return None

        repo_path = self._get_repo_path(server_id)
        branch = server_config.get('branch', self.config.default_branch)
        # NOTE(security): cloning with the token embedded in the URL persists
        # the token in .git/config on disk. Acceptable inside the bot's
        # container, but worth revisiting (e.g. a git credential helper).
        auth_url = self._get_authenticated_url(repo_url)

        if os.path.exists(repo_path):
            # Repo exists: open and pull. Pull is best effort — a failure
            # still lets us commit locally and push later.
            print(f'\t[Git Archive] Opening existing repo at {repo_path}')
            repo = Repo(repo_path)
            try:
                repo.remotes.origin.pull()
            except GitCommandError as e:
                print(f'\t[Git Archive] Warning: Could not pull: {e}')
        else:
            # Fresh clone of the configured branch.
            print(f'\t[Git Archive] Cloning {repo_url} to {repo_path}')
            os.makedirs(repo_path, exist_ok=True)
            repo = Repo.clone_from(auth_url, repo_path, branch=branch)

        self.repos[str(server_id)] = repo
        return repo

    def should_archive_channel(self, channel) -> bool:
        """Determine if a channel should be archived based on config.

        A channel is archived when its server is enabled, its name is not in
        excluded_channels, and (when allowed_categories is non-empty) its
        category name is in allowed_categories.
        """
        server_id = str(channel.guild.id)
        server_config = self.config.get_server_config(server_id)

        if server_config is None or not server_config.get('enabled', False):
            return False

        # Channel-level blocklist, matched by name.
        excluded_channels = server_config.get('excluded_channels', [])
        if channel.name in excluded_channels:
            return False

        # Optional category allowlist; an empty list means "all categories".
        allowed_categories = server_config.get('allowed_categories', [])
        if allowed_categories:
            category_name = channel.category.name if channel.category else None
            if category_name not in allowed_categories:
                return False

        return True

    def queue_message(self, backup_msg: dict, channel) -> None:
        """Queue a message for later batch commit (see flush_and_commit).

        NOTE(review): the key uses backup_msg['server']['id'] here but
        channel.guild.id in flush_and_commit — assumed identical; confirm
        against the caller.
        """
        channel_key = f"{backup_msg['server']['id']}_{channel.id}"
        self.message_queues.setdefault(channel_key, []).append({
            'backup_msg': backup_msg,
            'channel': channel
        })

    def _to_export_format(self, backup_msg: dict, attachments_dir: Path,
                          include_attachments: bool) -> dict:
        """Convert internal backup format to DiscordExportMessage format.

        Attachment payloads are written under *attachments_dir* as
        content-addressed ``<sha256>.bin`` files; the returned message
        references each attachment by that hash.
        """
        export_attachments = []

        if include_attachments:
            for attach in backup_msg.get('attachments', []):
                content_b64 = attach.get('content', '')
                if content_b64:
                    content_bytes = base64.b64decode(content_b64)
                    # Content-address the blob so identical files are stored once.
                    content_hash = hashlib.sha256(content_bytes).hexdigest()

                    attachments_dir.mkdir(parents=True, exist_ok=True)
                    attachment_path = attachments_dir / f"{content_hash}.bin"
                    if not attachment_path.exists():
                        with open(attachment_path, 'wb') as f:
                            f.write(content_bytes)

                    export_attachments.append({
                        'type': attach.get('type', ''),
                        'origin_name': attach.get('origin_name', ''),
                        'content': content_hash
                    })
                # NOTE(review): attachments with no inline content are omitted
                # entirely (not even metadata) — confirm this matches the
                # restorer's expectations.

        return {
            'author': backup_msg['author']['name'],
            'category': backup_msg.get('category', ''),
            'parent': backup_msg.get('parent', ''),
            'content': backup_msg.get('content', ''),
            'created_at': backup_msg.get('created_at', ''),
            'attachments': export_attachments
        }

    def _get_date_from_iso(self, iso_string: str) -> str:
        """Extract the date (YYYY-MM-DD) from an ISO-8601 timestamp.

        Falls back to today's (naive local) date when the timestamp is
        missing or unparseable.
        """
        try:
            dt = datetime.fromisoformat(iso_string.replace('Z', '+00:00'))
            return dt.strftime('%Y-%m-%d')
        except (ValueError, AttributeError):
            return datetime.now().strftime('%Y-%m-%d')

    def _write_daily_json(self, json_path: Path, new_messages: list) -> int:
        """Merge *new_messages* into the daily JSON file at *json_path*.

        Messages are deduplicated by their 'created_at' timestamp and the
        merged file is kept chronologically sorted.  Returns the number of
        messages actually added.
        """
        existing_messages = []

        if json_path.exists():
            try:
                with open(json_path, 'r', encoding='utf-8') as f:
                    existing_messages = json.load(f)
            except (json.JSONDecodeError, IOError):
                # Treat a corrupt/unreadable file as empty; it gets rewritten.
                existing_messages = []

        # Existing timestamps drive deduplication.
        existing_timestamps = {msg.get('created_at') for msg in existing_messages}

        new_count = 0
        for msg in new_messages:
            if msg.get('created_at') not in existing_timestamps:
                existing_messages.append(msg)
                existing_timestamps.add(msg.get('created_at'))
                new_count += 1

        # Keep the file chronologically sorted for stable diffs.
        existing_messages.sort(key=lambda m: m.get('created_at', ''))

        json_path.parent.mkdir(parents=True, exist_ok=True)
        with open(json_path, 'w', encoding='utf-8') as f:
            json.dump(existing_messages, f, indent=2, ensure_ascii=False)

        return new_count

    async def flush_and_commit(self, channel) -> None:
        """Flush queued messages for *channel* into daily JSON files, then
        commit and push.

        Fix: on early failure (missing config or unavailable repository) the
        batch is left in the queue so a later flush can retry; previously the
        queue was popped up front and those messages were silently dropped.
        """
        channel_key = f"{channel.guild.id}_{channel.id}"
        queued = self.message_queues.get(channel_key, [])

        if not queued:
            self.message_queues.pop(channel_key, None)
            return

        server_id = str(channel.guild.id)
        server_config = self.config.get_server_config(server_id)
        if server_config is None:
            return  # keep the batch queued; config may be reloaded later

        include_attachments = server_config.get('include_attachments', True)

        repo = self.ensure_repo_cloned(server_id)
        if repo is None:
            return  # repo unavailable; keep the batch queued for retry

        # The batch is ours now; remove it from the queue.
        self.message_queues.pop(channel_key, None)

        repo_path = Path(self._get_repo_path(server_id))
        channel_name = channel.name

        # Sanitize channel name for filesystem use.
        safe_channel_name = "".join(
            c if c.isalnum() or c in '-_' else '_' for c in channel_name
        )

        channel_dir = repo_path / safe_channel_name
        attachments_dir = channel_dir / 'attachments'

        # Group messages by calendar date: one JSON file per day.
        messages_by_date: dict[str, list] = {}
        for item in queued:
            backup_msg = item['backup_msg']
            date_str = self._get_date_from_iso(backup_msg.get('created_at', ''))
            export_msg = self._to_export_format(
                backup_msg, attachments_dir, include_attachments
            )
            messages_by_date.setdefault(date_str, []).append(export_msg)

        # Write daily JSON files, tracking which dates actually changed.
        total_new = 0
        date_range = []
        for date_str, messages in messages_by_date.items():
            json_path = channel_dir / f"{date_str}.json"
            new_count = self._write_daily_json(json_path, messages)
            total_new += new_count
            if new_count > 0:
                date_range.append(date_str)

        if total_new == 0:
            print(f'\t[Git Archive] No new messages to commit for #{channel_name}')
            return

        # Stage everything (new/changed JSON plus attachment blobs).
        repo.git.add(A=True)

        # Skip committing when staging produced no actual diff.
        if not repo.is_dirty(index=True):
            print(f'\t[Git Archive] No changes to commit for #{channel_name}')
            return

        # Commit message summarizes the affected date range.
        date_range.sort()
        if len(date_range) == 1:
            date_info = date_range[0]
        else:
            date_info = f"{date_range[0]} to {date_range[-1]}"

        commit_message = f"Archive {total_new} messages from #{channel_name} ({date_info})"
        print(f'\t[Git Archive] {commit_message}')

        repo.index.commit(commit_message)

        # Push is best effort: a failed push leaves the commit local and the
        # next successful push will carry it.
        try:
            repo.remotes.origin.push()
            print(f'\t[Git Archive] Pushed changes for #{channel_name}')
        except GitCommandError as e:
            print(f'\t[Git Archive] Warning: Could not push: {e}')
307+
308+
309+
def is_git_archive_enabled() -> bool:
    """Return True when git archiving is globally switched on.

    The feature is enabled by setting the GIT_ARCHIVE_ENABLED environment
    variable to exactly '1'.
    """
    flag = os.environ.get('GIT_ARCHIVE_ENABLED')
    return flag == '1'
312+
313+
314+
def init_git_archive() -> "Optional[GitArchiveManager]":
    """Initialize the git archive manager from environment variables.

    Requires GIT_ARCHIVE_ENABLED=1 plus GIT_ARCHIVE_CONFIG_PATH,
    GIT_ARCHIVE_CLONE_PATH and GITHUB_TOKEN.  Returns None (after printing
    a warning) when archiving is disabled, a variable is missing, or the
    config file is absent or invalid JSON.
    """
    if not is_git_archive_enabled():
        return None

    # Collect the required environment variables; all must be present.
    # (Consolidates three copy-pasted check blocks into one loop.)
    env = {}
    for var in ('GIT_ARCHIVE_CONFIG_PATH', 'GIT_ARCHIVE_CLONE_PATH', 'GITHUB_TOKEN'):
        value = os.getenv(var)
        if value is None:
            print(f'[Git Archive] Warning: GIT_ARCHIVE_ENABLED=1 but {var} not set')
            return None
        env[var] = value

    try:
        config = GitArchiveConfig(env['GIT_ARCHIVE_CONFIG_PATH'])
        manager = GitArchiveManager(config, env['GIT_ARCHIVE_CLONE_PATH'],
                                    env['GITHUB_TOKEN'])
    except FileNotFoundError as e:
        print(f'[Git Archive] Warning: {e}')
        return None
    except json.JSONDecodeError as e:
        print(f'[Git Archive] Warning: Invalid config JSON: {e}')
        return None

    # Fix: report success only after the manager is actually constructed
    # (previously printed before construction).
    print('[Git Archive] Initialized successfully')
    return manager

git_archive_config.json.example

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
{
2+
"default_branch": "main",
3+
"commit_batch_size": 50,
4+
"servers": {
5+
"1194636226041479298": {
6+
"enabled": true,
7+
"repo_url": "https://github.com/org/server-archive.git",
8+
"branch": "main",
9+
"allowed_categories": ["public", "governance"],
10+
"excluded_channels": ["mod-chat"],
11+
"include_attachments": true
12+
}
13+
}
14+
}

0 commit comments

Comments
 (0)