Skip to content

Commit 380509a

Browse files
mateuszkuprowskiMateusz Kuprowskicursoragent
authored
Enabled slack connector to ingest files and support OAuth (#707)
Co-authored-by: Mateusz Kuprowski <mateusz@unstructured.io> Co-authored-by: Cursor <cursoragent@cursor.com>
1 parent 264c3cd commit 380509a

4 files changed

Lines changed: 302 additions & 2 deletions

File tree

CHANGELOG.md

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,13 @@
1+
## [1.6.4]
2+
3+
### Enhancements
4+
5+
- **feat(slack): support file attachments and OAuth refresh tokens.** Slack indexing now emits file attachment records, downloading uses Slack private file URLs with bearer authentication, and `SlackAccessConfig` accepts `refresh_token` so platform plugin schemas can expose OAuth token rotation settings.
6+
7+
### Fixes
8+
9+
- **fix(slack): guard private file downloads.** Validate Slack private download URLs before sending bearer credentials, refuse redirects that could forward bearer credentials, stream private file downloads to disk, and use a bounded timeout for private file reads.
10+
111
## [1.6.3]
212

313
### Enhancements

test/unit/connectors/test_slack.py

Lines changed: 166 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,166 @@
1+
from unittest.mock import Mock
2+
3+
import pytest
4+
5+
from unstructured_ingest.data_types.file_data import (
6+
FileData,
7+
FileDataSourceMetadata,
8+
SourceIdentifiers,
9+
)
10+
from unstructured_ingest.error import ValueError as IngestValueError
11+
from unstructured_ingest.processes.connectors.slack import (
12+
PRIVATE_FILE_DOWNLOAD_TIMEOUT_SECONDS,
13+
SlackAccessConfig,
14+
SlackDownloader,
15+
SlackIndexer,
16+
SlackIndexerConfig,
17+
_NoRedirectHandler,
18+
)
19+
20+
21+
def test_slack_access_config_accepts_refresh_token():
22+
config = SlackAccessConfig(token="xoxb-slack-token", refresh_token="xoxe-slack-refresh-token")
23+
24+
assert config.token == "xoxb-slack-token"
25+
assert config.refresh_token == "xoxe-slack-refresh-token"
26+
27+
28+
def test_slack_indexer_emits_file_data_for_message_files():
29+
client = Mock()
30+
client.conversations_history.return_value = [
31+
{
32+
"messages": [
33+
{
34+
"ts": "1710000000.000100",
35+
"text": "Here is the report",
36+
"files": [
37+
{
38+
"id": "F123",
39+
"name": "report.pdf",
40+
"url_private_download": "https://files.slack.com/report.pdf",
41+
}
42+
],
43+
}
44+
]
45+
}
46+
]
47+
connection_config = Mock()
48+
connection_config.get_client.return_value = client
49+
indexer = SlackIndexer(
50+
index_config=SlackIndexerConfig(channels=["C123"]),
51+
connection_config=connection_config,
52+
)
53+
54+
file_data = list(indexer.run())
55+
56+
assert len(file_data) == 2
57+
assert file_data[0].source_identifiers.filename.endswith(".xml")
58+
slack_file = file_data[1]
59+
assert slack_file.source_identifiers.filename == "F123-report.pdf"
60+
assert slack_file.metadata.record_locator == {
61+
"type": "file",
62+
"channel": "C123",
63+
"message_ts": "1710000000.000100",
64+
"file_id": "F123",
65+
}
66+
67+
68+
@pytest.mark.asyncio
69+
async def test_slack_downloader_downloads_file_attachment(tmp_path, mocker):
70+
async_client = Mock()
71+
async_client.files_info = mocker.AsyncMock(
72+
return_value={
73+
"ok": True,
74+
"file": {
75+
"id": "F123",
76+
"name": "report.pdf",
77+
"url_private_download": "https://files.slack.com/report.pdf",
78+
},
79+
}
80+
)
81+
connection_config = Mock()
82+
connection_config.get_async_client.return_value = async_client
83+
connection_config.access_config.get_secret_value.return_value.token = "xoxb-slack-token"
84+
downloader = SlackDownloader(connection_config=connection_config)
85+
file_data = FileData(
86+
identifier="F123",
87+
connector_type="slack",
88+
source_identifiers=SourceIdentifiers(
89+
filename="F123-report.pdf",
90+
fullpath="F123-report.pdf",
91+
),
92+
metadata=FileDataSourceMetadata(
93+
record_locator={
94+
"type": "file",
95+
"channel": "C123",
96+
"message_ts": "1710000000.000100",
97+
"file_id": "F123",
98+
}
99+
),
100+
)
101+
response = Mock()
102+
response.__enter__ = Mock(return_value=response)
103+
response.__exit__ = Mock(return_value=None)
104+
response.read.side_effect = [b"pdf ", b"bytes", b""]
105+
opener = Mock()
106+
opener.open.return_value = response
107+
build_opener = mocker.patch("urllib.request.build_opener", return_value=opener)
108+
109+
await downloader._download_file(file_data, tmp_path / "F123-report.pdf")
110+
111+
assert (tmp_path / "F123-report.pdf").read_bytes() == b"pdf bytes"
112+
build_opener.assert_called_once_with(_NoRedirectHandler)
113+
opener.open.assert_called_once()
114+
request = opener.open.call_args.args[0]
115+
assert request.full_url == "https://files.slack.com/report.pdf"
116+
assert request.headers["Authorization"] == "Bearer xoxb-slack-token"
117+
assert opener.open.call_args.kwargs["timeout"] == PRIVATE_FILE_DOWNLOAD_TIMEOUT_SECONDS
118+
assert response.read.call_count > 1
119+
120+
121+
def test_slack_private_file_download_rejects_redirects():
122+
handler = _NoRedirectHandler()
123+
124+
with pytest.raises(IngestValueError, match="redirected"):
125+
handler.redirect_request(None, None, 302, "Found", {}, "https://example.com/report.pdf")
126+
127+
128+
@pytest.mark.asyncio
129+
async def test_slack_downloader_rejects_non_slack_private_download_url(tmp_path, mocker):
130+
async_client = Mock()
131+
async_client.files_info = mocker.AsyncMock(
132+
return_value={
133+
"ok": True,
134+
"file": {
135+
"id": "F123",
136+
"name": "report.pdf",
137+
"url_private_download": "https://example.com/report.pdf",
138+
},
139+
}
140+
)
141+
connection_config = Mock()
142+
connection_config.get_async_client.return_value = async_client
143+
connection_config.access_config.get_secret_value.return_value.token = "xoxb-slack-token"
144+
downloader = SlackDownloader(connection_config=connection_config)
145+
file_data = FileData(
146+
identifier="F123",
147+
connector_type="slack",
148+
source_identifiers=SourceIdentifiers(
149+
filename="F123-report.pdf",
150+
fullpath="F123-report.pdf",
151+
),
152+
metadata=FileDataSourceMetadata(
153+
record_locator={
154+
"type": "file",
155+
"channel": "C123",
156+
"message_ts": "1710000000.000100",
157+
"file_id": "F123",
158+
}
159+
),
160+
)
161+
build_opener = mocker.patch("urllib.request.build_opener")
162+
163+
with pytest.raises(IngestValueError, match="files.slack.com"):
164+
await downloader._download_file(file_data, tmp_path / "F123-report.pdf")
165+
166+
build_opener.assert_not_called()

unstructured_ingest/__version__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "1.6.3" # pragma: no cover
1+
__version__ = "1.6.4" # pragma: no cover

unstructured_ingest/processes/connectors/slack.py

Lines changed: 125 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,11 @@
1+
import asyncio
2+
import builtins
13
import hashlib
4+
import re
5+
import shutil
26
import time
7+
import urllib.parse
8+
import urllib.request
39
import xml.etree.ElementTree as ET
410
from dataclasses import dataclass, field
511
from datetime import datetime
@@ -34,15 +40,51 @@
3440
# NOTE: Pagination limit set to the upper end of the recommended range
3541
# https://api.slack.com/apis/pagination#facts
3642
PAGINATION_LIMIT = 200
43+
PRIVATE_FILE_DOWNLOAD_TIMEOUT_SECONDS = 60
44+
SLACK_PRIVATE_FILE_HOST = "files.slack.com"
3745

3846
CONNECTOR_TYPE = "slack"
3947

4048

49+
def _safe_slack_filename(filename: str) -> str:
50+
sanitized = re.sub(r"[/\\]+", "_", filename).strip()
51+
return sanitized or "slack-file"
52+
53+
54+
def _validate_private_download_url(download_url: str) -> str:
55+
parsed_url = urllib.parse.urlparse(download_url)
56+
hostname = parsed_url.hostname.lower() if parsed_url.hostname else None
57+
58+
if parsed_url.scheme != "https" or hostname != SLACK_PRIVATE_FILE_HOST:
59+
raise ValueError("Slack file download URL must be an HTTPS files.slack.com URL.")
60+
61+
if parsed_url.username or parsed_url.password:
62+
raise ValueError("Slack file download URL must not include credentials.")
63+
64+
try:
65+
port = parsed_url.port
66+
except builtins.ValueError as exc:
67+
raise ValueError("Slack file download URL has an invalid port.") from exc
68+
69+
if port not in (None, 443):
70+
raise ValueError("Slack file download URL must use the default HTTPS port.")
71+
72+
return download_url
73+
74+
75+
class _NoRedirectHandler(urllib.request.HTTPRedirectHandler):
76+
def redirect_request(self, req, fp, code, msg, headers, newurl): # noqa: D102, ANN001
77+
raise ValueError(
78+
"Slack file download redirected; refusing to forward bearer authorization."
79+
)
80+
81+
4182
class SlackAccessConfig(AccessConfig):
4283
token: str = Field(
4384
description="Bot token used to access Slack API, must have channels:history scope for the"
4485
" bot user."
4586
)
87+
refresh_token: Optional[str] = Field(default=None, description="Slack OAuth refresh token.")
4688

4789

4890
class SlackConnectionConfig(ConnectionConfig):
@@ -109,6 +151,8 @@ def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
109151
messages = conversation_history.get("messages", [])
110152
if messages:
111153
yield self._messages_to_file_data(messages, channel)
154+
for file_data in self._message_files_to_file_data(messages, channel):
155+
yield file_data
112156

113157
def _messages_to_file_data(
114158
self,
@@ -142,6 +186,44 @@ def _messages_to_file_data(
142186
display_name=source_identifiers.fullpath,
143187
)
144188

189+
def _message_files_to_file_data(
190+
self,
191+
messages: list[dict],
192+
channel: str,
193+
) -> Generator[FileData, None, None]:
194+
for message in messages:
195+
message_ts = message.get("ts")
196+
for slack_file in message.get("files", []) or []:
197+
file_id = slack_file.get("id")
198+
if not file_id or not message_ts:
199+
continue
200+
201+
filename = _safe_slack_filename(
202+
f"{file_id}-{slack_file.get('name') or slack_file.get('title') or file_id}"
203+
)
204+
identifier_base = f"{channel}-{message_ts}-{file_id}"
205+
identifier = hashlib.sha256(identifier_base.encode("utf-8")).hexdigest()
206+
source_identifiers = SourceIdentifiers(filename=filename, fullpath=filename)
207+
yield FileData(
208+
identifier=identifier,
209+
connector_type=CONNECTOR_TYPE,
210+
source_identifiers=source_identifiers,
211+
metadata=FileDataSourceMetadata(
212+
date_created=(
213+
str(slack_file.get("created")) if slack_file.get("created") else None
214+
),
215+
date_modified=message_ts,
216+
date_processed=str(time.time()),
217+
record_locator={
218+
"type": "file",
219+
"channel": channel,
220+
"message_ts": message_ts,
221+
"file_id": file_id,
222+
},
223+
),
224+
display_name=source_identifiers.fullpath,
225+
)
226+
145227
@SourceConnectionError.wrap
146228
def precheck(self) -> None:
147229
client = self.connection_config.get_client()
@@ -172,7 +254,13 @@ async def run_async(self, file_data: FileData, **kwargs) -> DownloadResponse:
172254
)
173255
raise ValueError("Generated invalid download path.")
174256

175-
await self._download_conversation(file_data, download_path)
257+
if (
258+
file_data.metadata.record_locator
259+
and file_data.metadata.record_locator.get("type") == "file"
260+
):
261+
await self._download_file(file_data, download_path)
262+
else:
263+
await self._download_conversation(file_data, download_path)
176264
return self.generate_download_response(file_data, download_path)
177265

178266
def is_async(self):
@@ -224,6 +312,42 @@ async def _download_conversation(self, file_data: FileData, download_path: Path)
224312
download_path.parent.mkdir(exist_ok=True, parents=True)
225313
conversation_xml.write(download_path, encoding="utf-8", xml_declaration=True)
226314

315+
async def _download_file(self, file_data: FileData, download_path: Path) -> None:
316+
record_locator = file_data.metadata.record_locator
317+
if record_locator is None or "file_id" not in record_locator:
318+
logger.error(f"Invalid file record locator in metadata: {record_locator}.")
319+
raise ValueError("Invalid file record locator.")
320+
321+
client = self.connection_config.get_async_client()
322+
file_info = await client.files_info(file=record_locator["file_id"])
323+
if not file_info.get("ok", True):
324+
raise ValueError(f"Slack files.info failed: {file_info.get('error')}")
325+
326+
slack_file = file_info.get("file", {})
327+
download_url = slack_file.get("url_private_download") or record_locator.get(
328+
"url_private_download"
329+
)
330+
if not download_url:
331+
raise ValueError("Slack file is missing url_private_download.")
332+
download_url = _validate_private_download_url(download_url)
333+
334+
token = self.connection_config.access_config.get_secret_value().token
335+
request = urllib.request.Request(
336+
download_url,
337+
headers={"Authorization": f"Bearer {token}"},
338+
)
339+
download_path.parent.mkdir(exist_ok=True, parents=True)
340+
await asyncio.to_thread(self._download_private_file, request, download_path)
341+
342+
@staticmethod
343+
def _download_private_file(request: urllib.request.Request, download_path: Path) -> None:
344+
opener = urllib.request.build_opener(_NoRedirectHandler)
345+
with opener.open(
346+
request,
347+
timeout=PRIVATE_FILE_DOWNLOAD_TIMEOUT_SECONDS,
348+
) as response, download_path.open("wb") as output_file:
349+
shutil.copyfileobj(response, output_file)
350+
227351
def _conversation_to_xml(self, conversation: list[list[dict]]) -> ET.ElementTree:
228352
root = ET.Element("messages")
229353

0 commit comments

Comments
 (0)