|
| 1 | +import asyncio |
| 2 | +import builtins |
1 | 3 | import hashlib |
| 4 | +import re |
| 5 | +import shutil |
2 | 6 | import time |
| 7 | +import urllib.parse |
| 8 | +import urllib.request |
3 | 9 | import xml.etree.ElementTree as ET |
4 | 10 | from dataclasses import dataclass, field |
5 | 11 | from datetime import datetime |
|
34 | 40 | # NOTE: Pagination limit set to the upper end of the recommended range |
35 | 41 | # https://api.slack.com/apis/pagination#facts |
36 | 42 | PAGINATION_LIMIT = 200 |
| 43 | +PRIVATE_FILE_DOWNLOAD_TIMEOUT_SECONDS = 60 |
| 44 | +SLACK_PRIVATE_FILE_HOST = "files.slack.com" |
37 | 45 |
|
38 | 46 | CONNECTOR_TYPE = "slack" |
39 | 47 |
|
40 | 48 |
|
| 49 | +def _safe_slack_filename(filename: str) -> str: |
| 50 | + sanitized = re.sub(r"[/\\]+", "_", filename).strip() |
| 51 | + return sanitized or "slack-file" |
| 52 | + |
| 53 | + |
| 54 | +def _validate_private_download_url(download_url: str) -> str: |
| 55 | + parsed_url = urllib.parse.urlparse(download_url) |
| 56 | + hostname = parsed_url.hostname.lower() if parsed_url.hostname else None |
| 57 | + |
| 58 | + if parsed_url.scheme != "https" or hostname != SLACK_PRIVATE_FILE_HOST: |
| 59 | + raise ValueError("Slack file download URL must be an HTTPS files.slack.com URL.") |
| 60 | + |
| 61 | + if parsed_url.username or parsed_url.password: |
| 62 | + raise ValueError("Slack file download URL must not include credentials.") |
| 63 | + |
| 64 | + try: |
| 65 | + port = parsed_url.port |
| 66 | + except builtins.ValueError as exc: |
| 67 | + raise ValueError("Slack file download URL has an invalid port.") from exc |
| 68 | + |
| 69 | + if port not in (None, 443): |
| 70 | + raise ValueError("Slack file download URL must use the default HTTPS port.") |
| 71 | + |
| 72 | + return download_url |
| 73 | + |
| 74 | + |
| 75 | +class _NoRedirectHandler(urllib.request.HTTPRedirectHandler): |
| 76 | + def redirect_request(self, req, fp, code, msg, headers, newurl): # noqa: D102, ANN001 |
| 77 | + raise ValueError( |
| 78 | + "Slack file download redirected; refusing to forward bearer authorization." |
| 79 | + ) |
| 80 | + |
| 81 | + |
class SlackAccessConfig(AccessConfig):
    """Credential fields for the Slack connector."""

    # Required token used for Slack API calls.
    token: str = Field(
        description=(
            "Bot token used to access Slack API, "
            "must have channels:history scope for the bot user."
        )
    )
    # Optional; defaults to None when no refresh token is supplied.
    refresh_token: Optional[str] = Field(
        default=None,
        description="Slack OAuth refresh token.",
    )
46 | 88 |
|
47 | 89 |
|
48 | 90 | class SlackConnectionConfig(ConnectionConfig): |
@@ -109,6 +151,8 @@ def run(self, **kwargs: Any) -> Generator[FileData, None, None]: |
109 | 151 | messages = conversation_history.get("messages", []) |
110 | 152 | if messages: |
111 | 153 | yield self._messages_to_file_data(messages, channel) |
| 154 | + for file_data in self._message_files_to_file_data(messages, channel): |
| 155 | + yield file_data |
112 | 156 |
|
113 | 157 | def _messages_to_file_data( |
114 | 158 | self, |
@@ -142,6 +186,44 @@ def _messages_to_file_data( |
142 | 186 | display_name=source_identifiers.fullpath, |
143 | 187 | ) |
144 | 188 |
|
def _message_files_to_file_data(
    self,
    messages: list[dict],
    channel: str,
) -> Generator[FileData, None, None]:
    """Yield one ``FileData`` per file attached to the given messages.

    Args:
        messages: Message dicts; each may carry a ``"ts"`` value and a
            ``"files"`` list of file dicts.
        channel: Channel value recorded in each record locator and mixed
            into each identifier.

    Yields:
        A ``FileData`` whose identifier is the SHA-256 hex digest of
        ``"{channel}-{message_ts}-{file_id}"`` and whose record locator has
        ``"type": "file"``. Attachments without an ``"id"``, and attachments
        of messages without a ``"ts"``, are skipped.
    """
    for message in messages:
        message_ts = message.get("ts")
        attachments = message.get("files") or []
        for slack_file in attachments:
            file_id = slack_file.get("id")
            if not (file_id and message_ts):
                continue

            # Prefer the file's name, then its title, then fall back to the id.
            label = slack_file.get("name") or slack_file.get("title") or file_id
            filename = _safe_slack_filename(f"{file_id}-{label}")

            digest_input = f"{channel}-{message_ts}-{file_id}".encode("utf-8")
            identifier = hashlib.sha256(digest_input).hexdigest()

            created = slack_file.get("created")
            source_identifiers = SourceIdentifiers(filename=filename, fullpath=filename)
            locator = {
                "type": "file",
                "channel": channel,
                "message_ts": message_ts,
                "file_id": file_id,
            }
            metadata = FileDataSourceMetadata(
                date_created=str(created) if created else None,
                date_modified=message_ts,
                date_processed=str(time.time()),
                record_locator=locator,
            )
            yield FileData(
                identifier=identifier,
                connector_type=CONNECTOR_TYPE,
                source_identifiers=source_identifiers,
                metadata=metadata,
                display_name=source_identifiers.fullpath,
            )
| 226 | + |
145 | 227 | @SourceConnectionError.wrap |
146 | 228 | def precheck(self) -> None: |
147 | 229 | client = self.connection_config.get_client() |
@@ -172,7 +254,13 @@ async def run_async(self, file_data: FileData, **kwargs) -> DownloadResponse: |
172 | 254 | ) |
173 | 255 | raise ValueError("Generated invalid download path.") |
174 | 256 |
|
175 | | - await self._download_conversation(file_data, download_path) |
| 257 | + if ( |
| 258 | + file_data.metadata.record_locator |
| 259 | + and file_data.metadata.record_locator.get("type") == "file" |
| 260 | + ): |
| 261 | + await self._download_file(file_data, download_path) |
| 262 | + else: |
| 263 | + await self._download_conversation(file_data, download_path) |
176 | 264 | return self.generate_download_response(file_data, download_path) |
177 | 265 |
|
178 | 266 | def is_async(self): |
@@ -224,6 +312,42 @@ async def _download_conversation(self, file_data: FileData, download_path: Path) |
224 | 312 | download_path.parent.mkdir(exist_ok=True, parents=True) |
225 | 313 | conversation_xml.write(download_path, encoding="utf-8", xml_declaration=True) |
226 | 314 |
|
async def _download_file(self, file_data: FileData, download_path: Path) -> None:
    """Download the Slack file referenced by ``file_data`` to ``download_path``.

    The file id from the record locator is looked up through ``files_info``
    on the async client. The download URL is taken from the returned file's
    ``url_private_download``, falling back to the same key on the record
    locator. The URL is validated, then fetched in a worker thread with the
    configured token sent as a bearer ``Authorization`` header.

    Args:
        file_data: Record whose ``metadata.record_locator`` holds ``file_id``.
        download_path: Destination path; its parent directory is created.

    Raises:
        ValueError: If the record locator is missing or has no ``file_id``,
            the ``files_info`` response reports ``ok`` as falsy, no download
            URL is available, or URL validation rejects it.
    """
    record_locator = file_data.metadata.record_locator
    locator_usable = record_locator is not None and "file_id" in record_locator
    if not locator_usable:
        logger.error(f"Invalid file record locator in metadata: {record_locator}.")
        raise ValueError("Invalid file record locator.")

    slack_client = self.connection_config.get_async_client()
    info_response = await slack_client.files_info(file=record_locator["file_id"])
    # A response without an "ok" key is treated as successful.
    if not info_response.get("ok", True):
        raise ValueError(f"Slack files.info failed: {info_response.get('error')}")

    file_details = info_response.get("file", {})
    download_url = file_details.get("url_private_download")
    if not download_url:
        download_url = record_locator.get("url_private_download")
    if not download_url:
        raise ValueError("Slack file is missing url_private_download.")
    download_url = _validate_private_download_url(download_url)

    bearer_token = self.connection_config.access_config.get_secret_value().token
    auth_headers = {"Authorization": f"Bearer {bearer_token}"}
    request = urllib.request.Request(download_url, headers=auth_headers)

    download_path.parent.mkdir(exist_ok=True, parents=True)
    # The blocking transfer runs off the event loop.
    await asyncio.to_thread(self._download_private_file, request, download_path)
| 341 | + |
@staticmethod
def _download_private_file(request: urllib.request.Request, download_path: Path) -> None:
    """Stream the response for ``request`` into ``download_path``.

    The request is opened through an opener built with ``_NoRedirectHandler``
    and a timeout of ``PRIVATE_FILE_DOWNLOAD_TIMEOUT_SECONDS``. The body is
    first written to a sibling ``<name>.part`` file and only renamed onto
    ``download_path`` once the copy has finished, so a timeout or dropped
    connection mid-transfer never leaves a truncated file at the final path.

    Args:
        request: Prepared request, including any headers to send.
        download_path: Final destination; its parent directory must exist.

    Raises:
        Whatever the opener, the file write, or the rename raises; the
        partial file is removed before the exception propagates.
    """
    opener = urllib.request.build_opener(_NoRedirectHandler)
    partial_path = download_path.with_name(f"{download_path.name}.part")
    try:
        with opener.open(
            request,
            timeout=PRIVATE_FILE_DOWNLOAD_TIMEOUT_SECONDS,
        ) as response, partial_path.open("wb") as output_file:
            shutil.copyfileobj(response, output_file)
        # Reached only after a complete copy; replaces any existing file.
        partial_path.replace(download_path)
    finally:
        # No-op after a successful rename; removes leftovers on failure.
        partial_path.unlink(missing_ok=True)
| 350 | + |
227 | 351 | def _conversation_to_xml(self, conversation: list[list[dict]]) -> ET.ElementTree: |
228 | 352 | root = ET.Element("messages") |
229 | 353 |
|
|
0 commit comments