From 0241e50878461e0b7250d08ae3dd5c3707027625 Mon Sep 17 00:00:00 2001 From: danielle-unstructured-io Date: Fri, 1 May 2026 14:42:47 -0600 Subject: [PATCH 01/16] feat(box): add ACL permissions metadata to Box connector Fetches collaborations from Box API (direct + inherited via path_collection ancestor walk) and normalizes into permissions_data on FileDataSourceMetadata, consistent with Confluence and Google Drive connectors. Co-Authored-By: Claude Sonnet 4.6 --- .../box_second_tier/directory_structure.json | 5 + .../8b0303ba-7c77-5e47-b0ff-790b8fc9881f.json | 61 ++++ .../box_top_folder/directory_structure.json | 5 + .../11333818-b47e-5991-b32f-701975b2caca.json | 61 ++++ test/integration/connectors/test_box.py | 83 ++++++ test/unit/connectors/test_box.py | 276 ++++++++++++++++++ .../processes/connectors/fsspec/box.py | 120 +++++++- 7 files changed, 609 insertions(+), 2 deletions(-) create mode 100644 test/integration/connectors/expected_results/box_second_tier/directory_structure.json create mode 100644 test/integration/connectors/expected_results/box_second_tier/file_data/8b0303ba-7c77-5e47-b0ff-790b8fc9881f.json create mode 100644 test/integration/connectors/expected_results/box_top_folder/directory_structure.json create mode 100644 test/integration/connectors/expected_results/box_top_folder/file_data/11333818-b47e-5991-b32f-701975b2caca.json create mode 100644 test/integration/connectors/test_box.py create mode 100644 test/unit/connectors/test_box.py diff --git a/test/integration/connectors/expected_results/box_second_tier/directory_structure.json b/test/integration/connectors/expected_results/box_second_tier/directory_structure.json new file mode 100644 index 000000000..3181bd646 --- /dev/null +++ b/test/integration/connectors/expected_results/box_second_tier/directory_structure.json @@ -0,0 +1,5 @@ +{ + "directory_structure": [ + "unstructured_uvopv4ry/catalog.pdf" + ] +} \ No newline at end of file diff --git a/test/integration/connectors/expected_results/box_second_tier/file_data/8b0303ba-7c77-5e47-b0ff-790b8fc9881f.json b/test/integration/connectors/expected_results/box_second_tier/file_data/8b0303ba-7c77-5e47-b0ff-790b8fc9881f.json new file mode 100644 index 000000000..73849f056 --- /dev/null +++ b/test/integration/connectors/expected_results/box_second_tier/file_data/8b0303ba-7c77-5e47-b0ff-790b8fc9881f.json @@ -0,0 +1,61 @@ +{ + "identifier": "8b0303ba-7c77-5e47-b0ff-790b8fc9881f", + "connector_type": "box", + "source_identifiers": { + "filename": "catalog.pdf", + "fullpath": "/TestACLs-topfolder/TestACLs-secondtier/catalog.pdf", + "rel_path": "catalog.pdf" + }, + "metadata": { + "url": "box:///TestACLs-topfolder/TestACLs-secondtier/catalog.pdf", + "version": "2216144540657", + "record_locator": { + "protocol": "box", + "remote_file_path": "box://TestACLs-topfolder/TestACLs-secondtier", + "file_id": "2216144540657" + }, + "date_created": "1777662782.0", + "date_modified": "1777662782.0", + "date_processed": "1777665707.7073228", + "permissions_data": [ + { + "read": { + "users": [ + "50881967280", + "50882409531" + ], + "groups": [] + } + }, + { + "update": { + "users": [ + "50881967280" + ], + "groups": [] + } + }, + { + "delete": { + "users": [ + "50881967280" + ], + "groups": [] + } + } + ], + "filesize_bytes": 296006 + }, + "additional_metadata": { + "name": "/TestACLs-topfolder/TestACLs-secondtier/catalog.pdf", + "size": 296006, + "type": "file", + "id": "2216144540657", + "modified_at": "2026-05-01T12:13:02-07:00", + "created_at": "2026-05-01T12:13:02-07:00", + "original_file_path": "/TestACLs-topfolder/TestACLs-secondtier/catalog.pdf" + }, + "reprocess": false, + "local_download_path": "/private/var/folders/gf/qwh2bdg93kb9gzxd_xhb49wc0000gn/T/tmpekwnxs4a/unstructured_uvopv4ry/catalog.pdf", + "display_name": "/TestACLs-topfolder/TestACLs-secondtier/catalog.pdf" +} \ No newline at end of file diff --git a/test/integration/connectors/expected_results/box_top_folder/directory_structure.json b/test/integration/connectors/expected_results/box_top_folder/directory_structure.json new file mode 100644 index 000000000..c785e5c96 --- /dev/null +++ b/test/integration/connectors/expected_results/box_top_folder/directory_structure.json @@ -0,0 +1,5 @@ +{ + "directory_structure": [ + "unstructured_aqpewcxk/Billing issue - Example 1.pdf" + ] +} \ No newline at end of file diff --git a/test/integration/connectors/expected_results/box_top_folder/file_data/11333818-b47e-5991-b32f-701975b2caca.json b/test/integration/connectors/expected_results/box_top_folder/file_data/11333818-b47e-5991-b32f-701975b2caca.json new file mode 100644 index 000000000..7629ea710 --- /dev/null +++ b/test/integration/connectors/expected_results/box_top_folder/file_data/11333818-b47e-5991-b32f-701975b2caca.json @@ -0,0 +1,61 @@ +{ + "identifier": "11333818-b47e-5991-b32f-701975b2caca", + "connector_type": "box", + "source_identifiers": { + "filename": "Billing issue - Example 1.pdf", + "fullpath": "/TestACLs-topfolder/Billing issue - Example 1.pdf", + "rel_path": "Billing issue - Example 1.pdf" + }, + "metadata": { + "url": "box:///TestACLs-topfolder/Billing issue - Example 1.pdf", + "version": "2216145342898", + "record_locator": { + "protocol": "box", + "remote_file_path": "box://TestACLs-topfolder", + "file_id": "2216145342898" + }, + "date_created": "1777662769.0", + "date_modified": "1777662769.0", + "date_processed": "1777665696.530676", + "permissions_data": [ + { + "read": { + "users": [ + "50881967280", + "50882409531" + ], + "groups": [] + } + }, + { + "update": { + "users": [ + "50881967280" + ], + "groups": [] + } + }, + { + "delete": { + "users": [ + "50881967280" + ], + "groups": [] + } + } + ], + "filesize_bytes": 142776 + }, + "additional_metadata": { + "name": "/TestACLs-topfolder/Billing issue - Example 1.pdf", + "size": 142776, + "type": "file", + "id": "2216145342898", + "modified_at": "2026-05-01T12:12:49-07:00", + "created_at": "2026-05-01T12:12:49-07:00", + "original_file_path": "/TestACLs-topfolder/Billing issue - Example 1.pdf" + }, + "reprocess": false, + "local_download_path": "/private/var/folders/gf/qwh2bdg93kb9gzxd_xhb49wc0000gn/T/tmpqw6nq7zk/unstructured_aqpewcxk/Billing issue - Example 1.pdf", + "display_name": "/TestACLs-topfolder/Billing issue - Example 1.pdf" +} \ No newline at end of file diff --git a/test/integration/connectors/test_box.py b/test/integration/connectors/test_box.py new file mode 100644 index 000000000..fd681d7dd --- /dev/null +++ b/test/integration/connectors/test_box.py @@ -0,0 +1,83 @@ +import os + +import pytest + +from test.integration.connectors.utils.constants import BLOB_STORAGE_TAG, SOURCE_TAG +from test.integration.connectors.utils.validation.source import ( + SourceValidationConfigs, + source_connector_validation, +) +from test.integration.utils import requires_env +from unstructured_ingest.processes.connectors.fsspec.box import ( + CONNECTOR_TYPE, + BoxAccessConfig, + BoxConnectionConfig, + BoxDownloader, + BoxDownloaderConfig, + BoxIndexer, + BoxIndexerConfig, +) + + +def make_box_components(remote_url: str, download_dir): + app_config = os.environ["BOX_APP_CONFIG"] + connection_config = BoxConnectionConfig( + access_config=BoxAccessConfig(box_app_config=app_config) + ) + index_config = BoxIndexerConfig(remote_url=remote_url) + download_config = BoxDownloaderConfig(download_dir=download_dir) + indexer = BoxIndexer(connection_config=connection_config, index_config=index_config) + downloader = BoxDownloader(connection_config=connection_config, download_config=download_config) + return indexer, downloader + + +@pytest.mark.asyncio +@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, BLOB_STORAGE_TAG) +@requires_env("BOX_APP_CONFIG") +async def test_box_top_folder(temp_dir): + """ + Integration test for Box source connector against the top-level ACL test folder. + Validates that permissions_data is populated from direct folder collaborations. + """ + indexer, downloader = make_box_components( + remote_url="box://TestACLs-topfolder", + download_dir=temp_dir, + ) + await source_connector_validation( + indexer=indexer, + downloader=downloader, + configs=SourceValidationConfigs( + test_id="box_top_folder", + validate_downloaded_files=False, + validate_file_data=True, + exclude_fields_extend=[ + "metadata.date_processed", + ], + ), + ) + + +@pytest.mark.asyncio +@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, BLOB_STORAGE_TAG) +@requires_env("BOX_APP_CONFIG") +async def test_box_second_tier(temp_dir): + """ + Integration test for Box source connector against the nested ACL test folder. + Validates that permissions_data reflects inherited permissions from the parent folder. + """ + indexer, downloader = make_box_components( + remote_url="box://TestACLs-topfolder/TestACLs-secondtier", + download_dir=temp_dir, + ) + await source_connector_validation( + indexer=indexer, + downloader=downloader, + configs=SourceValidationConfigs( + test_id="box_second_tier", + validate_downloaded_files=False, + validate_file_data=True, + exclude_fields_extend=[ + "metadata.date_processed", + ], + ), + ) diff --git a/test/unit/connectors/test_box.py b/test/unit/connectors/test_box.py new file mode 100644 index 000000000..c2f5c7673 --- /dev/null +++ b/test/unit/connectors/test_box.py @@ -0,0 +1,276 @@ +from collections import OrderedDict +from unittest.mock import MagicMock, patch + +import pytest + +from unstructured_ingest.processes.connectors.fsspec.box import ( + BOX_ROLE_MAPPING, + BoxDownloader, + BoxDownloaderConfig, +) + + +def make_downloader(max_permissions: int = 500) -> BoxDownloader: + connection_config = MagicMock() + download_config = BoxDownloaderConfig(max_num_metadata_permissions=max_permissions) + downloader = BoxDownloader.__new__(BoxDownloader) + downloader.connection_config = connection_config + downloader.download_config = download_config + downloader._folder_collab_cache = OrderedDict() + downloader._folder_collab_cache_max_size = 5 + return downloader + + +def make_collab( + entity_type: str, + entity_id: str, + role: str, + status: str = "accepted", + group_type: str = "managed_group", +) -> dict: + accessible_by = {"type": entity_type, "id": entity_id} + if entity_type == "group": + accessible_by["group_type"] = group_type + return {"accessible_by": accessible_by, "role": role, "status": status} + + +# --------------------------------------------------------------------------- +# BOX_ROLE_MAPPING +# --------------------------------------------------------------------------- + + +class TestBoxRoleMapping: + def test_all_expected_roles_present(self): + expected = {"owner", "co-owner", "editor", "viewer uploader", "previewer uploader", + "viewer", "previewer"} + assert set(BOX_ROLE_MAPPING.keys()) == expected + + def test_uploader_excluded(self): + assert "uploader" not in BOX_ROLE_MAPPING + + def test_owner_gets_all_operations(self): + assert set(BOX_ROLE_MAPPING["owner"]) == {"read", "update", "delete"} + + def test_co_owner_gets_all_operations(self): + assert set(BOX_ROLE_MAPPING["co-owner"]) == {"read", "update", "delete"} + + def test_editor_gets_read_and_update(self): + assert set(BOX_ROLE_MAPPING["editor"]) == {"read", "update"} + + def test_read_only_roles(self): + for role in ("viewer", "previewer", "viewer uploader", "previewer uploader"): + assert BOX_ROLE_MAPPING[role] == ["read"], f"{role} should map to read only" + + +# --------------------------------------------------------------------------- +# _normalize_collaborations +# --------------------------------------------------------------------------- + + +class TestNormalizeCollaborations: + def _empty_normalized(self): + return { + "read": {"users": set(), "groups": set()}, + "update": {"users": set(), "groups": set()}, + "delete": {"users": set(), "groups": set()}, + } + + def test_user_owner_gets_all_operations(self): + downloader = make_downloader() + normalized = self._empty_normalized() + total = [0] + downloader._normalize_collaborations( + [make_collab("user", "u1", "owner")], normalized, total + ) + assert "u1" in normalized["read"]["users"] + assert "u1" in normalized["update"]["users"] + assert "u1" in normalized["delete"]["users"] + assert total[0] == 1 + + def test_group_viewer_gets_read_only(self): + downloader = make_downloader() + normalized = self._empty_normalized() + total = [0] + downloader._normalize_collaborations( + [make_collab("group", "g1", "viewer")], normalized, total + ) + assert "g1" in normalized["read"]["groups"] + assert "g1" not in normalized["update"]["groups"] + assert "g1" not in normalized["delete"]["groups"] + + def test_pending_collab_skipped(self): + downloader = make_downloader() + normalized = self._empty_normalized() + total = [0] + downloader._normalize_collaborations( + [make_collab("user", "u1", "editor", status="pending")], normalized, total + ) + assert total[0] == 0 + assert not normalized["read"]["users"] + + def test_all_users_group_skipped(self): + downloader = make_downloader() + normalized = self._empty_normalized() + total = [0] + downloader._normalize_collaborations( + [make_collab("group", "g1", "editor", group_type="all_users_group")], + normalized, + total, + ) + assert total[0] == 0 + assert not normalized["read"]["groups"] + + def test_unknown_role_produces_no_operations(self): + downloader = make_downloader() + normalized = self._empty_normalized() + total = [0] + downloader._normalize_collaborations( + [make_collab("user", "u1", "some_future_role")], normalized, total + ) + assert total[0] == 0 + + def test_uploader_role_produces_no_operations(self): + downloader = make_downloader() + normalized = self._empty_normalized() + total = [0] + downloader._normalize_collaborations( + [make_collab("user", "u1", "uploader")], normalized, total + ) + assert total[0] == 0 + assert not normalized["read"]["users"] + + def test_max_permissions_cap_respected(self): + downloader = make_downloader(max_permissions=2) + normalized = self._empty_normalized() + total = [0] + collabs = [make_collab("user", f"u{i}", "viewer") for i in range(5)] + downloader._normalize_collaborations(collabs, normalized, total) + assert total[0] == 2 + assert len(normalized["read"]["users"]) == 2 + + def test_missing_accessible_by_skipped(self): + downloader = make_downloader() + normalized = self._empty_normalized() + total = [0] + downloader._normalize_collaborations( + [{"accessible_by": None, "role": "editor", "status": "accepted"}], + normalized, + total, + ) + assert total[0] == 0 + + +# --------------------------------------------------------------------------- +# get_permissions_for_file +# --------------------------------------------------------------------------- + + +def _make_collab_obj(entity_type, entity_id, role, status="accepted", group_type="managed_group"): + raw = make_collab(entity_type, entity_id, role, status, group_type) + obj = MagicMock() + obj.response_object = raw + return obj + + +class TestGetPermissionsForFile: + def _make_client(self, path_entries, folder_collabs_by_id, file_collabs): + client = MagicMock() + + file_obj = MagicMock() + file_obj.response_object = { + "path_collection": {"entries": path_entries}, + "has_collaborations": True, + } + client.file.return_value.get.return_value = file_obj + client.file.return_value.get_collaborations.return_value = file_collabs + + def folder_side_effect(folder_id): + folder_mock = MagicMock() + folder_mock.get_collaborations.return_value = folder_collabs_by_id.get(folder_id, []) + return folder_mock + + client.folder.side_effect = folder_side_effect + return client + + def test_basic_file_collaboration(self): + downloader = make_downloader() + client = self._make_client( + path_entries=[], + folder_collabs_by_id={}, + file_collabs=[_make_collab_obj("user", "u1", "editor")], + ) + result = downloader.get_permissions_for_file(client, "file123") + read_entry = next(d for d in result if "read" in d) + update_entry = next(d for d in result if "update" in d) + assert "u1" in read_entry["read"]["users"] + assert "u1" in update_entry["update"]["users"] + + def test_inherited_folder_collaboration(self): + downloader = make_downloader() + folder_collab = MagicMock() + folder_collab.response_object = make_collab("user", "u2", "viewer") + client = self._make_client( + path_entries=[{"id": "folder99"}], + folder_collabs_by_id={"folder99": [folder_collab]}, + file_collabs=[], + ) + result = downloader.get_permissions_for_file(client, "file123") + read_entry = next(d for d in result if "read" in d) + assert "u2" in read_entry["read"]["users"] + + def test_root_folder_skipped(self): + downloader = make_downloader() + client = self._make_client( + path_entries=[{"id": "0"}], + folder_collabs_by_id={"0": [MagicMock()]}, + file_collabs=[], + ) + # folder "0" should not be fetched + result = downloader.get_permissions_for_file(client, "file123") + client.folder.assert_not_called() + + def test_output_ids_are_sorted(self): + downloader = make_downloader() + client = self._make_client( + path_entries=[], + folder_collabs_by_id={}, + file_collabs=[ + _make_collab_obj("user", "z_user", "viewer"), + _make_collab_obj("user", "a_user", "viewer"), + ], + ) + result = downloader.get_permissions_for_file(client, "file123") + read_entry = next(d for d in result if "read" in d) + assert read_entry["read"]["users"] == sorted(read_entry["read"]["users"]) + + def test_output_has_all_three_operations(self): + downloader = make_downloader() + client = self._make_client(path_entries=[], folder_collabs_by_id={}, file_collabs=[]) + result = downloader.get_permissions_for_file(client, "file123") + keys = [list(d.keys())[0] for d in result] + assert set(keys) == {"read", "update", "delete"} + + def test_folder_collab_cache_used(self): + downloader = make_downloader() + folder_collab = MagicMock() + folder_collab.response_object = make_collab("user", "u1", "viewer") + client = self._make_client( + path_entries=[{"id": "f1"}], + folder_collabs_by_id={"f1": [folder_collab]}, + file_collabs=[], + ) + downloader.get_permissions_for_file(client, "file1") + downloader.get_permissions_for_file(client, "file2") + # folder f1 should only be fetched once despite two file calls + assert client.folder.call_count == 1 + + def test_api_error_on_path_collection_returns_empty(self): + downloader = make_downloader() + client = MagicMock() + client.file.return_value.get.side_effect = Exception("API error") + client.file.return_value.get_collaborations.return_value = [] + result = downloader.get_permissions_for_file(client, "file123") + # should not raise; all lists should be empty + for entry in result: + for val in list(entry.values())[0].values(): + assert val == [] diff --git a/unstructured_ingest/processes/connectors/fsspec/box.py b/unstructured_ingest/processes/connectors/fsspec/box.py index a36fa3acc..37c33482f 100644 --- a/unstructured_ingest/processes/connectors/fsspec/box.py +++ b/unstructured_ingest/processes/connectors/fsspec/box.py @@ -1,5 +1,6 @@ from __future__ import annotations +from collections import OrderedDict from contextlib import contextmanager from dataclasses import dataclass, field from time import time @@ -9,8 +10,9 @@ from pydantic import Field, Secret from pydantic.functional_validators import BeforeValidator -from unstructured_ingest.data_types.file_data import FileDataSourceMetadata +from unstructured_ingest.data_types.file_data import FileData, FileDataSourceMetadata from unstructured_ingest.error import ProviderError, UserAuthError, UserError +from unstructured_ingest.interfaces import DownloadResponse from unstructured_ingest.logger import logger from unstructured_ingest.processes.connector_registry import ( DestinationRegistryEntry, @@ -33,6 +35,17 @@ ) from unstructured_ingest.utils.dep_check import requires_dependencies +BOX_ROLE_MAPPING: dict[str, list[str]] = { + "owner": ["read", "update", "delete"], + "co-owner": ["read", "update", "delete"], + "editor": ["read", "update"], + "viewer uploader": ["read"], + "previewer uploader": ["read"], + "viewer": ["read"], + "previewer": ["read"], + # "uploader" excluded — write-only, cannot view content +} + if TYPE_CHECKING: from boxfs import BoxFileSystem @@ -95,6 +108,15 @@ def wrap_error(self, e: Exception) -> Exception: logger.error(f"unhandled exception from box ({type(e)}): {e}", exc_info=True) return e + @requires_dependencies(["boxsdk"], extras="box") + def get_box_client(self): + from boxsdk import Client, JWTAuth + + ac = self.access_config.get_secret_value() + oauth = JWTAuth.from_settings_dictionary(ac.box_app_config) + oauth.authenticate_instance() + return Client(oauth) + @requires_dependencies(["boxfs"], extras="box") @contextmanager def get_client(self, protocol: str) -> Generator["BoxFileSystem", None, None]: @@ -137,7 +159,9 @@ def get_metadata(self, file_info: dict) -> FileDataSourceMetadata: class BoxDownloaderConfig(FsspecDownloaderConfig): - pass + max_num_metadata_permissions: int = Field( + 500, description="Approximate maximum number of permissions included in metadata" + ) @dataclass @@ -146,6 +170,98 @@ class BoxDownloader(FsspecDownloader): connection_config: BoxConnectionConfig connector_type: str = CONNECTOR_TYPE download_config: Optional[BoxDownloaderConfig] = field(default_factory=BoxDownloaderConfig) + _folder_collab_cache: dict = field(default_factory=OrderedDict) + _folder_collab_cache_max_size: int = 5 + + def _get_collaborations_for_folder(self, client, folder_id: str) -> list[dict]: + if folder_id in self._folder_collab_cache: + self._folder_collab_cache.move_to_end(folder_id) + logger.debug(f"Retrieved cached collaborations for folder {folder_id}") + return self._folder_collab_cache[folder_id] + + collabs = [] + try: + for collab in client.folder(folder_id).get_collaborations(): + collabs.append(collab.response_object) + except Exception as e: + logger.debug(f"Could not retrieve collaborations for folder {folder_id}: {e}") + + if len(self._folder_collab_cache) >= self._folder_collab_cache_max_size: + self._folder_collab_cache.popitem(last=False) + self._folder_collab_cache[folder_id] = collabs + return collabs + + def _normalize_collaborations(self, collabs: list[dict], normalized: dict, total: list) -> None: + max_perms = self.download_config.max_num_metadata_permissions + for collab in collabs: + if total[0] >= max_perms: + break + if collab.get("status") != "accepted": + continue + accessible_by = collab.get("accessible_by") or {} + entity_type = accessible_by.get("type") + if entity_type not in ("user", "group"): + continue + if entity_type == "group" and accessible_by.get("group_type") == "all_users_group": + continue + entity_id = accessible_by.get("id") + if not entity_id: + continue + operations = BOX_ROLE_MAPPING.get(collab.get("role", ""), []) + if not operations: + continue + type_key = entity_type + "s" + for op in operations: + normalized[op][type_key].add(entity_id) + total[0] += 1 + + def get_permissions_for_file(self, client, file_id: str) -> list[dict]: + normalized = { + "read": {"users": set(), "groups": set()}, + "update": {"users": set(), "groups": set()}, + "delete": {"users": set(), "groups": set()}, + } + total = [0] # mutable counter passed into helper + + try: + file_obj = client.file(file_id).get(fields=["path_collection", "has_collaborations"]) + path_entries = ( + file_obj.response_object.get("path_collection", {}).get("entries", []) + ) + for folder_entry in path_entries: + folder_id = folder_entry.get("id") + # skip root "All Files" folder — it never has meaningful collaborations + if folder_id and folder_id != "0": + folder_collabs = self._get_collaborations_for_folder(client, folder_id) + self._normalize_collaborations(folder_collabs, normalized, total) + except Exception as e: + logger.debug(f"Could not retrieve path_collection for file {file_id}: {e}") + + try: + file_collabs = [c.response_object for c in client.file(file_id).get_collaborations()] + self._normalize_collaborations(file_collabs, normalized, total) + except Exception as e: + logger.debug(f"Could not retrieve collaborations for file {file_id}: {e}") + + for role_dict in normalized.values(): + for key in role_dict: + role_dict[key] = sorted(role_dict[key]) + + logger.debug(f"normalized permissions generated for file {file_id}: {normalized}") + return [{k: v} for k, v in normalized.items()] + + def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse: + response = super().run(file_data=file_data, **kwargs) + file_id = (file_data.metadata.record_locator or {}).get("file_id") + if file_id: + try: + client = self.connection_config.get_box_client() + file_data.metadata.permissions_data = self.get_permissions_for_file( + client, file_id + ) + except Exception as e: + logger.warning(f"Could not retrieve permissions for file {file_id}: {e}") + return response class BoxUploaderConfig(FsspecUploaderConfig): From 99299bf116be5d6297a7a1591dbe6fb7ba2bcfd1 Mon Sep 17 00:00:00 2001 From: danielle-unstructured-io Date: Mon, 4 May 2026 15:31:16 -0600 Subject: [PATCH 02/16] refactor: move permissions fetching to indexer, extract module-level helpers --- docs/README.md | 4 +- docs/connector_development.md | 2 +- test/unit/connectors/test_box.py | 75 +++--- .../processes/connectors/fsspec/box.py | 216 +++++++++++------- 4 files changed, 169 insertions(+), 128 deletions(-) diff --git a/docs/README.md b/docs/README.md index 864586aaa..fe2fa380b 100644 --- a/docs/README.md +++ b/docs/README.md @@ -101,7 +101,7 @@ In checklist form, the above steps are summarized as: The ingest flow is similar to an ETL pipeline that gets defined at runtime based on user input: -![unstructured ingest diagram](assets/pipeline.png) +![unstructured ingest diagram](pipeline.png) @@ -117,7 +117,7 @@ The ingest flow is similar to an ETL pipeline that gets defined at runtime based ### Sequence Diagram -![unstructured ingest sequence diagram](assets/sequence.png) +![unstructured ingest sequence diagram](sequence.png) ### Parallel Execution diff --git a/docs/connector_development.md b/docs/connector_development.md index 7d9f18ce0..05414025e 100644 --- a/docs/connector_development.md +++ b/docs/connector_development.md @@ -349,7 +349,7 @@ If you have any questions post in the public Slack channel `ask-for-help-open-so Yellow (without the Uncompressing) represents the steps in a source connector. Orange represents a destination connector. -![unstructured_ingest diagram](assets/pipeline.png) +![unstructured_ingest diagram](pipeline.png) diff --git a/test/unit/connectors/test_box.py b/test/unit/connectors/test_box.py index c2f5c7673..a3034f2ad 100644 --- a/test/unit/connectors/test_box.py +++ b/test/unit/connectors/test_box.py @@ -1,5 +1,5 @@ from collections import OrderedDict -from unittest.mock import MagicMock, patch +from unittest.mock import MagicMock import pytest @@ -7,6 +7,8 @@ BOX_ROLE_MAPPING, BoxDownloader, BoxDownloaderConfig, + _get_permissions_for_file, + _normalize_collaborations, ) @@ -17,7 +19,6 @@ def make_downloader(max_permissions: int = 500) -> BoxDownloader: downloader.connection_config = connection_config downloader.download_config = download_config downloader._folder_collab_cache = OrderedDict() - downloader._folder_collab_cache_max_size = 5 return downloader @@ -76,11 +77,10 @@ def _empty_normalized(self): } def test_user_owner_gets_all_operations(self): - downloader = make_downloader() normalized = self._empty_normalized() total = [0] - downloader._normalize_collaborations( - [make_collab("user", "u1", "owner")], normalized, total + _normalize_collaborations( + [make_collab("user", "u1", "owner")], normalized, total, max_perms=500 ) assert "u1" in normalized["read"]["users"] assert "u1" in normalized["update"]["users"] @@ -88,80 +88,75 @@ def test_user_owner_gets_all_operations(self): assert total[0] == 1 def test_group_viewer_gets_read_only(self): - downloader = make_downloader() normalized = self._empty_normalized() total = [0] - downloader._normalize_collaborations( - [make_collab("group", "g1", "viewer")], normalized, total + _normalize_collaborations( + [make_collab("group", "g1", "viewer")], normalized, total, max_perms=500 ) assert "g1" in normalized["read"]["groups"] assert "g1" not in normalized["update"]["groups"] assert "g1" not in normalized["delete"]["groups"] def test_pending_collab_skipped(self): - downloader = make_downloader() normalized = self._empty_normalized() total = [0] - downloader._normalize_collaborations( - [make_collab("user", "u1", "editor", status="pending")], normalized, total + _normalize_collaborations( + [make_collab("user", "u1", "editor", status="pending")], normalized, total, max_perms=500 ) assert total[0] == 0 assert not normalized["read"]["users"] def test_all_users_group_skipped(self): - downloader = make_downloader() normalized = self._empty_normalized() total = [0] - downloader._normalize_collaborations( + _normalize_collaborations( [make_collab("group", "g1", "editor", group_type="all_users_group")], normalized, total, + max_perms=500, ) assert total[0] == 0 assert not normalized["read"]["groups"] def test_unknown_role_produces_no_operations(self): - downloader = make_downloader() normalized = self._empty_normalized() total = [0] - downloader._normalize_collaborations( - [make_collab("user", "u1", "some_future_role")], normalized, total + _normalize_collaborations( + [make_collab("user", "u1", "some_future_role")], normalized, total, max_perms=500 ) assert total[0] == 0 def test_uploader_role_produces_no_operations(self): - downloader = make_downloader() normalized = self._empty_normalized() total = [0] - downloader._normalize_collaborations( - [make_collab("user", "u1", "uploader")], normalized, total + _normalize_collaborations( + [make_collab("user", "u1", "uploader")], normalized, total, max_perms=500 ) assert total[0] == 0 assert not normalized["read"]["users"] def test_max_permissions_cap_respected(self): - downloader = make_downloader(max_permissions=2) normalized = self._empty_normalized() total = [0] collabs = [make_collab("user", f"u{i}", "viewer") for i in range(5)] - downloader._normalize_collaborations(collabs, normalized, total) + _normalize_collaborations(collabs, normalized, total, max_perms=2) assert total[0] == 2 assert len(normalized["read"]["users"]) == 2 def test_missing_accessible_by_skipped(self): - downloader = make_downloader() normalized = self._empty_normalized() total = [0] - downloader._normalize_collaborations( + _normalize_collaborations( [{"accessible_by": None, "role": "editor", "status": "accepted"}], normalized, total, + max_perms=500, ) assert total[0] == 0 # --------------------------------------------------------------------------- -# get_permissions_for_file +# _get_permissions_for_file # --------------------------------------------------------------------------- @@ -193,20 +188,20 @@ def folder_side_effect(folder_id): return client def test_basic_file_collaboration(self): - downloader = make_downloader() + cache = OrderedDict() client = self._make_client( path_entries=[], folder_collabs_by_id={}, file_collabs=[_make_collab_obj("user", "u1", "editor")], ) - result = downloader.get_permissions_for_file(client, "file123") + result = _get_permissions_for_file(client, "file123", cache) read_entry = next(d for d in result if "read" in d) update_entry = next(d for d in result if "update" in d) assert "u1" in read_entry["read"]["users"] assert "u1" in update_entry["update"]["users"] def test_inherited_folder_collaboration(self): - downloader = make_downloader() + cache = OrderedDict() folder_collab = MagicMock() folder_collab.response_object = make_collab("user", "u2", "viewer") client = self._make_client( @@ -214,23 +209,22 @@ def test_inherited_folder_collaboration(self): folder_collabs_by_id={"folder99": [folder_collab]}, file_collabs=[], ) - result = downloader.get_permissions_for_file(client, "file123") + result = _get_permissions_for_file(client, "file123", cache) read_entry = next(d for d in result if "read" in d) assert "u2" in read_entry["read"]["users"] def test_root_folder_skipped(self): - downloader = make_downloader() + cache = OrderedDict() client = self._make_client( path_entries=[{"id": "0"}], folder_collabs_by_id={"0": [MagicMock()]}, file_collabs=[], ) - # folder "0" should not be fetched - result = downloader.get_permissions_for_file(client, "file123") + _get_permissions_for_file(client, "file123", cache) client.folder.assert_not_called() def test_output_ids_are_sorted(self): - downloader = make_downloader() + cache = OrderedDict() client = self._make_client( path_entries=[], folder_collabs_by_id={}, @@ -239,19 +233,19 @@ def test_output_ids_are_sorted(self): _make_collab_obj("user", "a_user", "viewer"), ], ) - result = downloader.get_permissions_for_file(client, "file123") + result = _get_permissions_for_file(client, "file123", cache) read_entry = next(d for d in result if "read" in d) assert read_entry["read"]["users"] == sorted(read_entry["read"]["users"]) def test_output_has_all_three_operations(self): - downloader = make_downloader() + cache = OrderedDict() client = self._make_client(path_entries=[], folder_collabs_by_id={}, file_collabs=[]) - result = downloader.get_permissions_for_file(client, "file123") + result = _get_permissions_for_file(client, "file123", cache) keys = [list(d.keys())[0] for d in result] assert set(keys) == {"read", "update", "delete"} def test_folder_collab_cache_used(self): - downloader = make_downloader() + cache = OrderedDict() folder_collab = MagicMock() folder_collab.response_object = make_collab("user", "u1", "viewer") client = self._make_client( @@ -259,18 +253,17 @@ def test_folder_collab_cache_used(self): folder_collabs_by_id={"f1": [folder_collab]}, file_collabs=[], ) - downloader.get_permissions_for_file(client, "file1") - downloader.get_permissions_for_file(client, "file2") + _get_permissions_for_file(client, "file1", cache) + _get_permissions_for_file(client, "file2", cache) # folder f1 should only be fetched once despite two file calls assert client.folder.call_count == 1 def test_api_error_on_path_collection_returns_empty(self): - downloader = make_downloader() + cache = OrderedDict() client = MagicMock() client.file.return_value.get.side_effect = Exception("API error") client.file.return_value.get_collaborations.return_value = [] - result = downloader.get_permissions_for_file(client, "file123") - # should not raise; all lists should be empty + result = _get_permissions_for_file(client, "file123", cache) for entry in result: for val in list(entry.values())[0].values(): assert val == [] diff --git a/unstructured_ingest/processes/connectors/fsspec/box.py b/unstructured_ingest/processes/connectors/fsspec/box.py index 37c33482f..5323eb382 100644 --- a/unstructured_ingest/processes/connectors/fsspec/box.py +++ b/unstructured_ingest/processes/connectors/fsspec/box.py @@ -52,6 +52,88 @@ CONNECTOR_TYPE = "box" +def _normalize_collaborations( + collabs: list[dict], normalized: dict, total: list, max_perms: int +) -> None: + for collab in collabs: + if total[0] >= max_perms: + break + if collab.get("status") != "accepted": + continue + accessible_by = collab.get("accessible_by") or {} + entity_type = accessible_by.get("type") + if entity_type not in ("user", "group"): + continue + if entity_type == "group" and accessible_by.get("group_type") == "all_users_group": + continue + entity_id = accessible_by.get("id") + if not entity_id: + continue + operations = BOX_ROLE_MAPPING.get(collab.get("role", ""), []) + if not operations: + continue + type_key = entity_type + "s" + for op in operations: + normalized[op][type_key].add(entity_id) + total[0] += 1 + + +def _get_collaborations_for_folder(client, folder_id: str, cache: OrderedDict) -> list[dict]: + if folder_id in cache: + cache.move_to_end(folder_id) + logger.debug(f"Retrieved cached collaborations for folder {folder_id}") + return cache[folder_id] + + collabs = [] + try: + for collab in client.folder(folder_id).get_collaborations(): + collabs.append(collab.response_object) + except Exception as e: + logger.debug(f"Could not retrieve collaborations for folder {folder_id}: {e}") + + if len(cache) >= 5: + cache.popitem(last=False) + cache[folder_id] = collabs + return collabs + + +def _get_permissions_for_file( + client, file_id: str, cache: OrderedDict, max_perms: int = 500 +) -> list[dict]: + normalized = { + "read": {"users": set(), "groups": set()}, + "update": {"users": set(), "groups": set()}, + "delete": {"users": set(), "groups": set()}, + } + total = [0] + + try: + file_obj = client.file(file_id).get(fields=["path_collection", "has_collaborations"]) + path_entries = ( + file_obj.response_object.get("path_collection", {}).get("entries", []) + ) + for folder_entry in path_entries: + folder_id = folder_entry.get("id") + if folder_id and folder_id != "0": + folder_collabs = _get_collaborations_for_folder(client, folder_id, cache) + _normalize_collaborations(folder_collabs, normalized, total, max_perms) + except Exception as e: + logger.debug(f"Could not retrieve path_collection for file {file_id}: {e}") + + try: + file_collabs = [c.response_object for c in client.file(file_id).get_collaborations()] + _normalize_collaborations(file_collabs, normalized, total, max_perms) + except Exception as e: + logger.debug(f"Could not retrieve collaborations for file {file_id}: {e}") + + for role_dict in normalized.values(): + for key in role_dict: + role_dict[key] = sorted(role_dict[key]) + + logger.debug(f"normalized permissions generated for file {file_id}: {normalized}") + return [{k: v} for k, v in normalized.items()] + + class BoxIndexerConfig(FsspecIndexerConfig): pass @@ -157,6 +239,26 @@ def get_metadata(self, file_info: dict) -> FileDataSourceMetadata: filesize_bytes=file_size, ) + def run(self, **kwargs: Any) -> Generator[FileData, None, None]: + client = None + cache: OrderedDict = OrderedDict() + try: + client = self.connection_config.get_box_client() + except Exception as e: + logger.warning(f"Could not initialize Box client for permissions fetching: {e}") + + for file_data in super().run(**kwargs): + if client: + file_id = (file_data.metadata.record_locator or {}).get("file_id") + if file_id: + try: + file_data.metadata.permissions_data = _get_permissions_for_file( + client, file_id, cache + ) + except Exception as e: + logger.warning(f"Could not retrieve permissions for file {file_id}: {e}") + yield file_data + class BoxDownloaderConfig(FsspecDownloaderConfig): max_num_metadata_permissions: int = Field( @@ -170,97 +272,43 @@ class BoxDownloader(FsspecDownloader): connection_config: BoxConnectionConfig connector_type: str = CONNECTOR_TYPE download_config: Optional[BoxDownloaderConfig] = field(default_factory=BoxDownloaderConfig) - _folder_collab_cache: dict = field(default_factory=OrderedDict) - _folder_collab_cache_max_size: int = 5 def _get_collaborations_for_folder(self, client, folder_id: str) -> list[dict]: - if folder_id in self._folder_collab_cache: - self._folder_collab_cache.move_to_end(folder_id) - logger.debug(f"Retrieved cached collaborations for folder {folder_id}") - return self._folder_collab_cache[folder_id] - - collabs = [] - try: - for collab in client.folder(folder_id).get_collaborations(): - collabs.append(collab.response_object) - except Exception as e: - logger.debug(f"Could not retrieve collaborations for folder {folder_id}: {e}") - - if len(self._folder_collab_cache) >= self._folder_collab_cache_max_size: - self._folder_collab_cache.popitem(last=False) - self._folder_collab_cache[folder_id] = collabs - return collabs - - def _normalize_collaborations(self, collabs: list[dict], normalized: dict, total: list) -> None: - max_perms = self.download_config.max_num_metadata_permissions - for collab in collabs: - if total[0] >= max_perms: - break - if collab.get("status") != "accepted": - continue - accessible_by = collab.get("accessible_by") or {} - entity_type = accessible_by.get("type") - if entity_type not in ("user", "group"): - continue - if entity_type == "group" and accessible_by.get("group_type") == "all_users_group": - continue - entity_id = accessible_by.get("id") - if not entity_id: - continue - operations = BOX_ROLE_MAPPING.get(collab.get("role", ""), []) - if not operations: - continue - type_key = entity_type + "s" - for op in operations: - normalized[op][type_key].add(entity_id) - total[0] += 1 + if not hasattr(self, "_folder_collab_cache"): + self._folder_collab_cache = OrderedDict() + return _get_collaborations_for_folder(client, folder_id, self._folder_collab_cache) + + def _normalize_collaborations( + self, collabs: list[dict], normalized: dict, total: list + ) -> None: + _normalize_collaborations( + collabs, normalized, total, self.download_config.max_num_metadata_permissions + ) def get_permissions_for_file(self, client, file_id: str) -> list[dict]: - normalized = { - "read": {"users": set(), "groups": set()}, - "update": {"users": set(), "groups": set()}, - "delete": {"users": set(), "groups": set()}, - } - total = [0] # mutable counter passed into helper - - try: - file_obj = client.file(file_id).get(fields=["path_collection", "has_collaborations"]) - path_entries = ( - file_obj.response_object.get("path_collection", {}).get("entries", []) - ) - for folder_entry in path_entries: - folder_id = folder_entry.get("id") - # skip root "All Files" folder — it never has meaningful collaborations - if folder_id and folder_id != "0": - folder_collabs = self._get_collaborations_for_folder(client, folder_id) - self._normalize_collaborations(folder_collabs, normalized, total) - except Exception as e: - logger.debug(f"Could not retrieve path_collection for file {file_id}: {e}") - - try: - file_collabs = [c.response_object for c in client.file(file_id).get_collaborations()] - self._normalize_collaborations(file_collabs, normalized, total) - except Exception as e: - logger.debug(f"Could not retrieve collaborations for file {file_id}: {e}") - - for role_dict in normalized.values(): - for key in role_dict: - role_dict[key] = sorted(role_dict[key]) - - logger.debug(f"normalized permissions generated for file {file_id}: {normalized}") - return [{k: v} for k, v in normalized.items()] + if not hasattr(self, "_folder_collab_cache"): + self._folder_collab_cache = OrderedDict() + return _get_permissions_for_file( + client, + file_id, + self._folder_collab_cache, + self.download_config.max_num_metadata_permissions, + ) def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse: response = super().run(file_data=file_data, **kwargs) - file_id = (file_data.metadata.record_locator or {}).get("file_id") - if file_id: - try: - client = self.connection_config.get_box_client() - file_data.metadata.permissions_data = self.get_permissions_for_file( - client, file_id - ) - except Exception as e: - logger.warning(f"Could not retrieve permissions for file {file_id}: {e}") + # permissions_data is set during indexing in BoxIndexer.run(); this fallback handles + # standalone downloader usage (e.g., CLI, integration tests without the SND plugin layer) + if file_data.metadata.permissions_data is None: + file_id = (file_data.metadata.record_locator or {}).get("file_id") + if file_id: + try: + client = self.connection_config.get_box_client() + file_data.metadata.permissions_data = self.get_permissions_for_file( + client, file_id + ) + except Exception as e: + logger.warning(f"Could not retrieve permissions for file {file_id}: {e}") return response From 7fc3a81bccfd9d58c493a670762d274aafa1e4b3 Mon Sep 17 00:00:00 2001 From: danielle-unstructured-io Date: Mon, 11 May 2026 10:32:26 -0600 Subject: [PATCH 03/16] fix(box): move max_num_metadata_permissions to indexer config The cap lived on BoxDownloaderConfig but permissions are now fetched during indexing, so the user's configured value was ignored in normal pipelines (the downloader fallback only fires for standalone use). Moved the field to BoxIndexerConfig where the work happens. Co-Authored-By: Claude Opus 4.7 (1M context) --- test/unit/connectors/test_box.py | 12 ------ .../processes/connectors/fsspec/box.py | 39 +++++-------------- 2 files changed, 10 insertions(+), 41 deletions(-) diff --git a/test/unit/connectors/test_box.py b/test/unit/connectors/test_box.py index a3034f2ad..00528710f 100644 --- a/test/unit/connectors/test_box.py +++ b/test/unit/connectors/test_box.py @@ -5,23 +5,11 @@ from unstructured_ingest.processes.connectors.fsspec.box import ( BOX_ROLE_MAPPING, - BoxDownloader, - BoxDownloaderConfig, _get_permissions_for_file, _normalize_collaborations, ) -def make_downloader(max_permissions: int = 500) -> BoxDownloader: - connection_config = MagicMock() - download_config = BoxDownloaderConfig(max_num_metadata_permissions=max_permissions) - downloader = BoxDownloader.__new__(BoxDownloader) - downloader.connection_config = connection_config - downloader.download_config = download_config - downloader._folder_collab_cache = OrderedDict() - return downloader - - def make_collab( entity_type: str, entity_id: str, diff --git a/unstructured_ingest/processes/connectors/fsspec/box.py b/unstructured_ingest/processes/connectors/fsspec/box.py index 5323eb382..4fd05b865 100644 --- a/unstructured_ingest/processes/connectors/fsspec/box.py +++ b/unstructured_ingest/processes/connectors/fsspec/box.py @@ -135,7 +135,9 @@ def _get_permissions_for_file( class BoxIndexerConfig(FsspecIndexerConfig): - pass + max_num_metadata_permissions: int = Field( + 500, description="Approximate maximum number of permissions included in metadata" + ) class BoxAccessConfig(FsspecAccessConfig): @@ -253,7 +255,10 @@ def run(self, **kwargs: Any) -> Generator[FileData, None, None]: if file_id: try: file_data.metadata.permissions_data = _get_permissions_for_file( - client, file_id, cache + client, + file_id, + cache, + self.index_config.max_num_metadata_permissions, ) except Exception as e: logger.warning(f"Could not retrieve permissions for file {file_id}: {e}") @@ -261,9 +266,7 @@ def run(self, **kwargs: Any) -> Generator[FileData, None, None]: class BoxDownloaderConfig(FsspecDownloaderConfig): - max_num_metadata_permissions: int = Field( - 500, description="Approximate maximum number of permissions included in metadata" - ) + pass @dataclass @@ -273,28 +276,6 @@ class BoxDownloader(FsspecDownloader): connector_type: str = CONNECTOR_TYPE download_config: Optional[BoxDownloaderConfig] = field(default_factory=BoxDownloaderConfig) - def _get_collaborations_for_folder(self, client, folder_id: str) -> list[dict]: - if not hasattr(self, "_folder_collab_cache"): - self._folder_collab_cache = OrderedDict() - return _get_collaborations_for_folder(client, folder_id, self._folder_collab_cache) - - def _normalize_collaborations( - self, collabs: list[dict], normalized: dict, total: list - ) -> None: - _normalize_collaborations( - collabs, normalized, total, self.download_config.max_num_metadata_permissions - ) - - def get_permissions_for_file(self, client, file_id: str) -> list[dict]: - if not hasattr(self, "_folder_collab_cache"): - self._folder_collab_cache = OrderedDict() - return _get_permissions_for_file( - client, - file_id, - self._folder_collab_cache, - self.download_config.max_num_metadata_permissions, - ) - def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse: response = super().run(file_data=file_data, **kwargs) # permissions_data is set during indexing in BoxIndexer.run(); this fallback handles @@ -304,8 +285,8 @@ def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse: if file_id: try: client = self.connection_config.get_box_client() - file_data.metadata.permissions_data = self.get_permissions_for_file( - client, file_id + file_data.metadata.permissions_data = _get_permissions_for_file( + client, file_id, OrderedDict() ) except Exception as e: logger.warning(f"Could not retrieve permissions for file {file_id}: {e}") From 6b9683e75cb4e2d0436023ea452107b3ba35b52f Mon Sep 17 00:00:00 2001 From: danielle-unstructured-io Date: Mon, 11 May 2026 12:42:07 -0600 Subject: [PATCH 04/16] fix(box): skip is_access_only collaborations to prevent ACL overgrant Access-only collabs are Box's mechanism for hidden, scoped grants (e.g. shared-link backing). They do not cascade like normal collabs, so picking them up during ancestor-folder walks added users to sibling files' permissions_data, overgranting downstream ACLs. Skip everywhere (folder walk and direct file collabs) for consistency with Box's own access-list semantics, which deliberately hide these grants. Explicitly request is_access_only via fields= on both collab calls so the guard fires reliably. Co-Authored-By: Claude Opus 4.7 (1M context) --- test/unit/connectors/test_box.py | 10 ++++++++++ .../processes/connectors/fsspec/box.py | 15 +++++++++++++-- 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/test/unit/connectors/test_box.py b/test/unit/connectors/test_box.py index 00528710f..ad3512017 100644 --- a/test/unit/connectors/test_box.py +++ b/test/unit/connectors/test_box.py @@ -106,6 +106,16 @@ def test_all_users_group_skipped(self): assert total[0] == 0 assert not normalized["read"]["groups"] + def test_access_only_collab_skipped(self): + normalized = self._empty_normalized() + total = [0] + collab = make_collab("user", "u1", "editor") + collab["is_access_only"] = True + _normalize_collaborations([collab], normalized, total, max_perms=500) + assert total[0] == 0 + assert not normalized["read"]["users"] + assert not normalized["update"]["users"] + def test_unknown_role_produces_no_operations(self): normalized = self._empty_normalized() total = [0] diff --git a/unstructured_ingest/processes/connectors/fsspec/box.py b/unstructured_ingest/processes/connectors/fsspec/box.py index 4fd05b865..cc77660f1 100644 --- a/unstructured_ingest/processes/connectors/fsspec/box.py +++ b/unstructured_ingest/processes/connectors/fsspec/box.py @@ -60,6 +60,10 @@ def _normalize_collaborations( break if collab.get("status") != "accepted": continue + # Access-only collabs are hidden, scoped grants (e.g. shared-link backing) and do + # not cascade like normal collabs — including them would overgrant siblings. + if collab.get("is_access_only"): + continue accessible_by = collab.get("accessible_by") or {} entity_type = accessible_by.get("type") if entity_type not in ("user", "group"): @@ -86,7 +90,9 @@ def _get_collaborations_for_folder(client, folder_id: str, cache: OrderedDict) - collabs = [] try: - for collab in client.folder(folder_id).get_collaborations(): + for collab in client.folder(folder_id).get_collaborations( + fields=["accessible_by", "role", "status", "is_access_only"] + ): collabs.append(collab.response_object) except Exception as e: logger.debug(f"Could not retrieve collaborations for folder {folder_id}: {e}") @@ -121,7 +127,12 @@ def _get_permissions_for_file( logger.debug(f"Could not retrieve path_collection for file {file_id}: {e}") try: - file_collabs = [c.response_object for c in client.file(file_id).get_collaborations()] + file_collabs = [ + c.response_object + for c in client.file(file_id).get_collaborations( + fields=["accessible_by", "role", "status", "is_access_only"] + ) + ] _normalize_collaborations(file_collabs, normalized, total, max_perms) except Exception as e: logger.debug(f"Could not retrieve collaborations for file {file_id}: {e}") From c99b15095a8530326a27f91bed4ed04a23e100cd Mon Sep 17 00:00:00 2001 From: danielle-unstructured-io Date: Mon, 11 May 2026 14:44:08 -0600 Subject: [PATCH 05/16] fix(box): grant editor role delete permission MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Box Editors can delete files and folders (view, download, upload, edit, delete, copy, move, rename, share is the full editor capability set). BOX_ROLE_MAPPING omitted delete, so editor users and groups were missing from permissions_data["delete"] — an under-grant on delete access for valid Box collaborators. Co-Authored-By: Claude Opus 4.7 (1M context) --- test/unit/connectors/test_box.py | 4 ++-- unstructured_ingest/processes/connectors/fsspec/box.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/test/unit/connectors/test_box.py b/test/unit/connectors/test_box.py index ad3512017..348a982a2 100644 --- a/test/unit/connectors/test_box.py +++ b/test/unit/connectors/test_box.py @@ -43,8 +43,8 @@ def test_owner_gets_all_operations(self): def test_co_owner_gets_all_operations(self): assert set(BOX_ROLE_MAPPING["co-owner"]) == {"read", "update", "delete"} - def test_editor_gets_read_and_update(self): - assert set(BOX_ROLE_MAPPING["editor"]) == {"read", "update"} + def test_editor_gets_all_operations(self): + assert set(BOX_ROLE_MAPPING["editor"]) == {"read", "update", "delete"} def test_read_only_roles(self): for role in ("viewer", "previewer", "viewer uploader", "previewer uploader"): diff --git a/unstructured_ingest/processes/connectors/fsspec/box.py b/unstructured_ingest/processes/connectors/fsspec/box.py index cc77660f1..322676700 100644 --- a/unstructured_ingest/processes/connectors/fsspec/box.py +++ b/unstructured_ingest/processes/connectors/fsspec/box.py @@ -38,7 +38,7 @@ BOX_ROLE_MAPPING: dict[str, list[str]] = { "owner": ["read", "update", "delete"], "co-owner": ["read", "update", "delete"], - "editor": ["read", "update"], + "editor": ["read", "update", "delete"], "viewer uploader": ["read"], "previewer uploader": ["read"], "viewer": ["read"], From b6059ab3468a1fc88b505d6d6b1e32935acfd248 Mon Sep 17 00:00:00 2001 From: danielle-unstructured-io Date: Mon, 11 May 2026 14:56:06 -0600 Subject: [PATCH 06/16] fix(test): strip randomized tempdir prefix from fixture paths FsspecDownloader writes each file into a fresh tempfile.mkdtemp ("unstructured_") subdir to avoid path collisions. The captured directory_structure.json fixtures included that random suffix, so directory-structure validation would fail on every run even when downloads succeeded. Strip the unstructured_/ segment in get_files() so fixtures capture the logical structure, and regenerate the Box fixtures with clean paths. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../box_second_tier/directory_structure.json | 4 ++-- .../box_top_folder/directory_structure.json | 4 ++-- .../integration/connectors/utils/validation/source.py | 11 ++++++++++- 3 files changed, 14 insertions(+), 5 deletions(-) diff --git a/test/integration/connectors/expected_results/box_second_tier/directory_structure.json b/test/integration/connectors/expected_results/box_second_tier/directory_structure.json index 3181bd646..13d9fd565 100644 --- a/test/integration/connectors/expected_results/box_second_tier/directory_structure.json +++ b/test/integration/connectors/expected_results/box_second_tier/directory_structure.json @@ -1,5 +1,5 @@ { "directory_structure": [ - "unstructured_uvopv4ry/catalog.pdf" + "catalog.pdf" ] -} \ No newline at end of file +} diff --git a/test/integration/connectors/expected_results/box_top_folder/directory_structure.json b/test/integration/connectors/expected_results/box_top_folder/directory_structure.json index c785e5c96..457aadcbf 100644 --- a/test/integration/connectors/expected_results/box_top_folder/directory_structure.json +++ b/test/integration/connectors/expected_results/box_top_folder/directory_structure.json @@ -1,5 +1,5 @@ { "directory_structure": [ - "unstructured_aqpewcxk/Billing issue - Example 1.pdf" + "Billing issue - Example 1.pdf" ] -} \ No newline at end of file +} diff --git a/test/integration/connectors/utils/validation/source.py b/test/integration/connectors/utils/validation/source.py index 4c3131c03..9bc97ba7f 100644 --- a/test/integration/connectors/utils/validation/source.py +++ b/test/integration/connectors/utils/validation/source.py @@ -1,5 +1,6 @@ import json import os +import re import shutil from pathlib import Path from typing import Callable, Optional @@ -86,9 +87,17 @@ def omit_ignored_fields(self, data: dict) -> dict: return copied_data +# FsspecDownloader writes each file into a fresh tempfile.mkdtemp("unstructured_") subdir +# to avoid path collisions. Strip that segment so fixtures capture the logical structure +# rather than a randomized suffix that changes every run. +_FSSPEC_TEMP_DIR_PATTERN = re.compile(r"^unstructured_[a-zA-Z0-9_-]+/") + + def get_files(dir_path: Path) -> list[str]: return [ - str(f).replace(str(dir_path), "").lstrip("/") for f in dir_path.rglob("*") if f.is_file() + _FSSPEC_TEMP_DIR_PATTERN.sub("", str(f).replace(str(dir_path), "").lstrip("/")) + for f in dir_path.rglob("*") + if f.is_file() ] From bcd862232ab23c44868785a109b75bdefaaa3aea Mon Sep 17 00:00:00 2001 From: danielle-unstructured-io Date: Mon, 11 May 2026 14:58:58 -0600 Subject: [PATCH 07/16] chore: bump version to 1.6.0 and add changelog entry Minor bump for the Box ACL permissions feature, matching the precedent set by the SharePoint ACL pass-through (1.4.29 -> 1.5.0). Co-Authored-By: Claude Opus 4.7 (1M context) --- CHANGELOG.md | 10 ++++++++++ unstructured_ingest/__version__.py | 2 +- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 099de6e7d..90d6038d1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,13 @@ +## [1.6.0] + +### Enhancements + +- **feat(box): pass through ACL permission metadata.** Extract Box collaboration data and normalize to the standard read/update/delete schema. Permissions are fetched during indexing with an LRU-cached ancestor folder walk to handle inherited collaborations. Access-only collabs (`is_access_only=true`) are skipped to avoid overgranting; group IDs are stored directly without member expansion (consistent with Confluence). Cap on permissions captured per file is configurable via `BoxIndexerConfig.max_num_metadata_permissions` (default 500). + +### Fixes + +- **fix(test): strip randomized tempdir prefix from FsspecDownloader fixture paths.** `get_files()` now drops the leading `unstructured_/` segment so `directory_structure.json` captures the logical structure rather than the per-run random suffix injected by `tempfile.mkdtemp`. + ## [1.4.29] ### Chores diff --git a/unstructured_ingest/__version__.py b/unstructured_ingest/__version__.py index 82a536187..e10d078bf 100644 --- a/unstructured_ingest/__version__.py +++ b/unstructured_ingest/__version__.py @@ -1 +1 @@ -__version__ = "1.4.29" # pragma: no cover +__version__ = "1.6.0" # pragma: no cover From 6b45a6031214f9f24d04a06e2a7ccfa54835a7de Mon Sep 17 00:00:00 2001 From: Austin Walker Date: Mon, 11 May 2026 17:10:43 -0400 Subject: [PATCH 08/16] chore: bump version to 1.6.0 instead of 1.5.3 ACL reading is a substantial enough feature to warrant a minor bump. Co-Authored-By: Claude Opus 4.7 (1M context) --- CHANGELOG.md | 2 +- unstructured_ingest/__version__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 28a73b6e4..6f006e555 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## [1.5.3] +## [1.6.0] ### Enhancements diff --git a/unstructured_ingest/__version__.py b/unstructured_ingest/__version__.py index e375e78df..e10d078bf 100644 --- a/unstructured_ingest/__version__.py +++ b/unstructured_ingest/__version__.py @@ -1 +1 @@ -__version__ = "1.5.3" # pragma: no cover +__version__ = "1.6.0" # pragma: no cover From b29c6a565163c72d8cf19c18b7e43686c5a433f4 Mon Sep 17 00:00:00 2001 From: Austin Walker Date: Mon, 11 May 2026 17:32:25 -0400 Subject: [PATCH 09/16] fix(box): don't cache folder collabs on API failure A transient 503 or timeout against `client.folder(id).get_collaborations` was being swallowed and an empty list cached for the rest of the indexer run, silently zeroing out permissions for every descendant file sharing that ancestor. Return without writing the cache on exception so the next file retries, and bump the log to warning so the failure is visible. Co-Authored-By: Claude Opus 4.7 (1M context) --- test/unit/connectors/test_box.py | 28 +++++++++++++++++++ .../processes/connectors/fsspec/box.py | 6 +++- 2 files changed, 33 insertions(+), 1 deletion(-) diff --git a/test/unit/connectors/test_box.py b/test/unit/connectors/test_box.py index 348a982a2..7ff1fe340 100644 --- a/test/unit/connectors/test_box.py +++ b/test/unit/connectors/test_box.py @@ -256,6 +256,34 @@ def test_folder_collab_cache_used(self): # folder f1 should only be fetched once despite two file calls assert client.folder.call_count == 1 + def test_folder_collab_transient_failure_not_cached(self): + cache = OrderedDict() + folder_collab = MagicMock() + folder_collab.response_object = make_collab("user", "u1", "viewer") + + folder_mock = MagicMock() + folder_mock.get_collaborations.side_effect = [ + Exception("transient 503"), + [folder_collab], + ] + + client = MagicMock() + file_obj = MagicMock() + file_obj.response_object = {"path_collection": {"entries": [{"id": "f1"}]}} + client.file.return_value.get.return_value = file_obj + client.file.return_value.get_collaborations.return_value = [] + client.folder.return_value = folder_mock + + first = _get_permissions_for_file(client, "file1", cache) + first_read = next(d for d in first if "read" in d) + assert "u1" not in first_read["read"]["users"] + + # Second call must retry, not serve a cached empty list from the transient failure. + second = _get_permissions_for_file(client, "file2", cache) + second_read = next(d for d in second if "read" in d) + assert "u1" in second_read["read"]["users"] + assert folder_mock.get_collaborations.call_count == 2 + def test_api_error_on_path_collection_returns_empty(self): cache = OrderedDict() client = MagicMock() diff --git a/unstructured_ingest/processes/connectors/fsspec/box.py b/unstructured_ingest/processes/connectors/fsspec/box.py index 322676700..53a627c4b 100644 --- a/unstructured_ingest/processes/connectors/fsspec/box.py +++ b/unstructured_ingest/processes/connectors/fsspec/box.py @@ -95,7 +95,11 @@ def _get_collaborations_for_folder(client, folder_id: str, cache: OrderedDict) - ): collabs.append(collab.response_object) except Exception as e: - logger.debug(f"Could not retrieve collaborations for folder {folder_id}: {e}") + # Don't cache on failure — a transient 503/timeout would otherwise silently + # zero out permissions for every descendant file sharing this ancestor for + # the rest of the indexer run. + logger.warning(f"Could not retrieve collaborations for folder {folder_id}: {e}") + return collabs if len(cache) >= 5: cache.popitem(last=False) From af78a0e6093cc52b1b7890047bda2d2678ff9f5c Mon Sep 17 00:00:00 2001 From: Austin Walker Date: Mon, 11 May 2026 17:37:53 -0400 Subject: [PATCH 10/16] fix(test): walk on-disk tree in check_raw_file_contents get_files() strips the unstructured_/ tempdir prefix for fixture comparison, but the actual downloaded files still live under that subdir. Reusing the stripped paths for filesystem lookups would FileNotFoundError for any future fsspec connector test with validate_downloaded_files=True. Walk the real tree, then strip the prefix only when resolving the expected fixture path. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../connectors/utils/validation/source.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/test/integration/connectors/utils/validation/source.py b/test/integration/connectors/utils/validation/source.py index 9bc97ba7f..e6556f7d1 100644 --- a/test/integration/connectors/utils/validation/source.py +++ b/test/integration/connectors/utils/validation/source.py @@ -138,12 +138,17 @@ def check_raw_file_contents( current_output_dir: Path, configs: SourceValidationConfigs, ): - current_files = get_files(dir_path=current_output_dir) found_diff = False files = [] - for current_file in current_files: - current_file_path = current_output_dir / current_file - expected_file_path = expected_output_dir / current_file + for current_file_path in current_output_dir.rglob("*"): + if not current_file_path.is_file(): + continue + relative = str(current_file_path.relative_to(current_output_dir)) + # Strip the unstructured_/ tempdir segment when locating the + # corresponding fixture; the on-disk file still lives under the random + # subdir so don't strip it from current_file_path. + expected_relative = _FSSPEC_TEMP_DIR_PATTERN.sub("", relative) + expected_file_path = expected_output_dir / expected_relative if configs.detect_diff(expected_file_path, current_file_path): found_diff = True files.append(str(expected_file_path)) From 45bdbabf30bf874d443b53a13d8beb11ef273084 Mon Sep 17 00:00:00 2001 From: Austin Walker Date: Mon, 11 May 2026 17:40:33 -0400 Subject: [PATCH 11/16] fix(box): drop previewer roles from BOX_ROLE_MAPPING Box previewers can render content in the web UI but cannot download files, so mapping them to ["read"] would overstate access in downstream ACL consumers that interpret read as download capability. Mirrors the existing exclusion of "uploader" (write-only, can't view). Co-Authored-By: Claude Opus 4.7 (1M context) --- test/unit/connectors/test_box.py | 10 +++++++--- unstructured_ingest/processes/connectors/fsspec/box.py | 7 ++++--- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/test/unit/connectors/test_box.py b/test/unit/connectors/test_box.py index 7ff1fe340..fc2d748bd 100644 --- a/test/unit/connectors/test_box.py +++ b/test/unit/connectors/test_box.py @@ -30,13 +30,17 @@ def make_collab( class TestBoxRoleMapping: def test_all_expected_roles_present(self): - expected = {"owner", "co-owner", "editor", "viewer uploader", "previewer uploader", - "viewer", "previewer"} + expected = {"owner", "co-owner", "editor", "viewer uploader", "viewer"} assert set(BOX_ROLE_MAPPING.keys()) == expected def test_uploader_excluded(self): assert "uploader" not in BOX_ROLE_MAPPING + def test_previewer_roles_excluded(self): + # Previewers can render but not download, so they don't get "read". + assert "previewer" not in BOX_ROLE_MAPPING + assert "previewer uploader" not in BOX_ROLE_MAPPING + def test_owner_gets_all_operations(self): assert set(BOX_ROLE_MAPPING["owner"]) == {"read", "update", "delete"} @@ -47,7 +51,7 @@ def test_editor_gets_all_operations(self): assert set(BOX_ROLE_MAPPING["editor"]) == {"read", "update", "delete"} def test_read_only_roles(self): - for role in ("viewer", "previewer", "viewer uploader", "previewer uploader"): + for role in ("viewer", "viewer uploader"): assert BOX_ROLE_MAPPING[role] == ["read"], f"{role} should map to read only" diff --git a/unstructured_ingest/processes/connectors/fsspec/box.py b/unstructured_ingest/processes/connectors/fsspec/box.py index 53a627c4b..f43868c51 100644 --- a/unstructured_ingest/processes/connectors/fsspec/box.py +++ b/unstructured_ingest/processes/connectors/fsspec/box.py @@ -40,10 +40,11 @@ "co-owner": ["read", "update", "delete"], "editor": ["read", "update", "delete"], "viewer uploader": ["read"], - "previewer uploader": ["read"], "viewer": ["read"], - "previewer": ["read"], - # "uploader" excluded — write-only, cannot view content + # "previewer" and "previewer uploader" excluded — Box previewers can render + # content in the web UI but cannot download, so granting "read" would + # overstate access to downstream ACL consumers expecting download capability. + # "uploader" excluded — write-only, cannot view content. } if TYPE_CHECKING: From 8416d479c39e1cab86d276921f8008e5ea3cefd4 Mon Sep 17 00:00:00 2001 From: Austin Walker Date: Mon, 11 May 2026 17:42:34 -0400 Subject: [PATCH 12/16] make tidy --- test/unit/connectors/test_box.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/test/unit/connectors/test_box.py b/test/unit/connectors/test_box.py index fc2d748bd..fbf3a67ef 100644 --- a/test/unit/connectors/test_box.py +++ b/test/unit/connectors/test_box.py @@ -1,8 +1,6 @@ from collections import OrderedDict from unittest.mock import MagicMock -import pytest - from unstructured_ingest.processes.connectors.fsspec.box import ( BOX_ROLE_MAPPING, _get_permissions_for_file, @@ -93,7 +91,10 @@ def test_pending_collab_skipped(self): normalized = self._empty_normalized() total = [0] _normalize_collaborations( - [make_collab("user", "u1", "editor", status="pending")], normalized, total, max_perms=500 + [make_collab("user", "u1", "editor", status="pending")], + normalized, + total, + max_perms=500, ) assert total[0] == 0 assert not normalized["read"]["users"] From ff0861ad877cb531fdb3951a94c064a666fed0bb Mon Sep 17 00:00:00 2001 From: Austin Walker Date: Tue, 12 May 2026 09:28:06 -0400 Subject: [PATCH 13/16] perf(box): skip file-collab fetch when has_collaborations is False; reuse client and cache in downloader Two follow-ups in the Box ACL path: 1. _get_permissions_for_file already fetches has_collaborations alongside path_collection but ignored the value. Skip the per-file get_collaborations() round-trip when Box tells us there are no direct collaborations on the file. Fall back to fetching when the file.get() itself fails so we don't lose data on transient errors. 2. BoxDownloader.run()'s standalone-mode fallback was re-running JWT auth and constructing a new Client for every file. Memoize the client and the ancestor-folder cache on the downloader instance so a multi-file run does one auth and reuses the cache across files (matching how BoxIndexer.run() already initializes its client once). Co-Authored-By: Claude Opus 4.7 (1M context) --- test/unit/connectors/test_box.py | 14 +++++++ .../processes/connectors/fsspec/box.py | 39 +++++++++++-------- 2 files changed, 37 insertions(+), 16 deletions(-) diff --git a/test/unit/connectors/test_box.py b/test/unit/connectors/test_box.py index fbf3a67ef..841c46e58 100644 --- a/test/unit/connectors/test_box.py +++ b/test/unit/connectors/test_box.py @@ -261,6 +261,20 @@ def test_folder_collab_cache_used(self): # folder f1 should only be fetched once despite two file calls assert client.folder.call_count == 1 + def test_has_collaborations_false_skips_file_collab_fetch(self): + cache = OrderedDict() + client = MagicMock() + file_obj = MagicMock() + file_obj.response_object = { + "path_collection": {"entries": []}, + "has_collaborations": False, + } + client.file.return_value.get.return_value = file_obj + # Folder walk is empty so no folder calls happen; the only thing under test + # is whether the per-file get_collaborations() round-trip is skipped. + _get_permissions_for_file(client, "file123", cache) + client.file.return_value.get_collaborations.assert_not_called() + def test_folder_collab_transient_failure_not_cached(self): cache = OrderedDict() folder_collab = MagicMock() diff --git a/unstructured_ingest/processes/connectors/fsspec/box.py b/unstructured_ingest/processes/connectors/fsspec/box.py index f43868c51..11a25f5f1 100644 --- a/unstructured_ingest/processes/connectors/fsspec/box.py +++ b/unstructured_ingest/processes/connectors/fsspec/box.py @@ -118,11 +118,13 @@ def _get_permissions_for_file( } total = [0] + # Default True so a failed file.get() doesn't suppress the direct-collab fetch. + has_direct_collabs = True try: file_obj = client.file(file_id).get(fields=["path_collection", "has_collaborations"]) - path_entries = ( - file_obj.response_object.get("path_collection", {}).get("entries", []) - ) + response_obj = file_obj.response_object + path_entries = response_obj.get("path_collection", {}).get("entries", []) + has_direct_collabs = bool(response_obj.get("has_collaborations", True)) for folder_entry in path_entries: folder_id = folder_entry.get("id") if folder_id and folder_id != "0": @@ -131,16 +133,17 @@ def _get_permissions_for_file( except Exception as e: logger.debug(f"Could not retrieve path_collection for file {file_id}: {e}") - try: - file_collabs = [ - c.response_object - for c in client.file(file_id).get_collaborations( - fields=["accessible_by", "role", "status", "is_access_only"] - ) - ] - _normalize_collaborations(file_collabs, normalized, total, max_perms) - except Exception as e: - logger.debug(f"Could not retrieve collaborations for file {file_id}: {e}") + if has_direct_collabs: + try: + file_collabs = [ + c.response_object + for c in client.file(file_id).get_collaborations( + fields=["accessible_by", "role", "status", "is_access_only"] + ) + ] + _normalize_collaborations(file_collabs, normalized, total, max_perms) + except Exception as e: + logger.debug(f"Could not retrieve collaborations for file {file_id}: {e}") for role_dict in normalized.values(): for key in role_dict: @@ -291,18 +294,22 @@ class BoxDownloader(FsspecDownloader): connection_config: BoxConnectionConfig connector_type: str = CONNECTOR_TYPE download_config: Optional[BoxDownloaderConfig] = field(default_factory=BoxDownloaderConfig) + _box_client: Any = field(default=None, init=False, repr=False) + _collab_cache: OrderedDict = field(default_factory=OrderedDict, init=False, repr=False) def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse: response = super().run(file_data=file_data, **kwargs) # permissions_data is set during indexing in BoxIndexer.run(); this fallback handles - # standalone downloader usage (e.g., CLI, integration tests without the SND plugin layer) + # standalone downloader usage (e.g., CLI, integration tests without the SND plugin layer). + # Memoize the JWT-auth'd client and ancestor cache across files so we don't re-auth per file. if file_data.metadata.permissions_data is None: file_id = (file_data.metadata.record_locator or {}).get("file_id") if file_id: try: - client = self.connection_config.get_box_client() + if self._box_client is None: + self._box_client = self.connection_config.get_box_client() file_data.metadata.permissions_data = _get_permissions_for_file( - client, file_id, OrderedDict() + self._box_client, file_id, self._collab_cache ) except Exception as e: logger.warning(f"Could not retrieve permissions for file {file_id}: {e}") From 312f7c53adadb227a8ed623fec6d6a42a95916d2 Mon Sep 17 00:00:00 2001 From: Austin Walker Date: Tue, 12 May 2026 10:54:09 -0400 Subject: [PATCH 14/16] make tidy --- unstructured_ingest/processes/connectors/fsspec/box.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unstructured_ingest/processes/connectors/fsspec/box.py b/unstructured_ingest/processes/connectors/fsspec/box.py index 11a25f5f1..33b445dea 100644 --- a/unstructured_ingest/processes/connectors/fsspec/box.py +++ b/unstructured_ingest/processes/connectors/fsspec/box.py @@ -301,7 +301,7 @@ def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse: response = super().run(file_data=file_data, **kwargs) # permissions_data is set during indexing in BoxIndexer.run(); this fallback handles # standalone downloader usage (e.g., CLI, integration tests without the SND plugin layer). - # Memoize the JWT-auth'd client and ancestor cache across files so we don't re-auth per file. + # Memoize client and cache across files so we don't re-run JWT auth per file. if file_data.metadata.permissions_data is None: file_id = (file_data.metadata.record_locator or {}).get("file_id") if file_id: From e4e521c96e83252735792b1a0c91053166bc8b34 Mon Sep 17 00:00:00 2001 From: Austin Walker Date: Wed, 13 May 2026 11:48:24 -0400 Subject: [PATCH 15/16] fix(PLU-347): address box ACL review feedback (extras, configurable caches, path_collection dedup) - Add boxsdk to the box extras so `pip install unstructured-ingest[box]` actually installs what `@requires_dependencies` advertises; without it the ACL feature silently degraded. - Expose `max_num_metadata_permissions` and `permissions_cache_max_size` on both `BoxIndexerConfig` and `BoxDownloaderConfig`; the downloader fallback path now respects user-configured caps instead of hardcoded defaults. - Replace the hardcoded 5-entry ancestor-folder cache with a configurable LRU (default 128) so wide folder trees stop thrashing across sibling files. - Cache `path_collection` ancestor IDs by parent path so only the first file in a given parent pays the `file.get()` round-trip; siblings reuse the cached chain. `has_collaborations` short-circuit is preserved on the first-file path. Co-Authored-By: Claude Opus 4.7 (1M context) --- CHANGELOG.md | 2 +- pyproject.toml | 2 +- test/unit/connectors/test_box.py | 69 ++++++++++++ .../processes/connectors/fsspec/box.py | 105 ++++++++++++++---- uv.lock | 2 + 5 files changed, 159 insertions(+), 21 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6f006e555..ff1a3ef7d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,7 +2,7 @@ ### Enhancements -- **feat(box): pass through ACL permission metadata.** Extract Box collaboration data and normalize to the standard read/update/delete schema. Permissions are fetched during indexing with an LRU-cached ancestor folder walk to handle inherited collaborations. Access-only collabs (`is_access_only=true`) are skipped to avoid overgranting; group IDs are stored directly without member expansion (consistent with Confluence). Cap on permissions captured per file is configurable via `BoxIndexerConfig.max_num_metadata_permissions` (default 500). +- **feat(box): pass through ACL permission metadata.** Extract Box collaboration data and normalize to the standard read/update/delete schema. Permissions are fetched during indexing with an LRU-cached ancestor folder walk to handle inherited collaborations, plus a per-parent-folder `path_collection` cache so only the first file in a given parent pays the `file.get()` round-trip. Access-only collabs (`is_access_only=true`) are skipped to avoid overgranting; group IDs are stored directly without member expansion (consistent with Confluence). `boxsdk` is now installed via the `box` extra. Both the permissions cap and ancestor-cache size are configurable on `BoxIndexerConfig` (`max_num_metadata_permissions`, `permissions_cache_max_size`) and `BoxDownloaderConfig` for the standalone fallback path. ### Fixes diff --git a/pyproject.toml b/pyproject.toml index c090f8c3b..2c230f63f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -36,7 +36,7 @@ astradb = ["astrapy>2.0.0"] azure-ai-search = ["azure-search-documents"] azure = ["adlfs", "fsspec"] biomed = ["beautifulsoup4", "requests"] -box = ["boxfs", "fsspec"] +box = ["boxfs", "boxsdk", "fsspec"] chroma = ["chromadb"] clarifai = ["clarifai"] confluence = ["atlassian-python-api", "requests"] diff --git a/test/unit/connectors/test_box.py b/test/unit/connectors/test_box.py index 841c46e58..0d6c0af4c 100644 --- a/test/unit/connectors/test_box.py +++ b/test/unit/connectors/test_box.py @@ -312,3 +312,72 @@ def test_api_error_on_path_collection_returns_empty(self): for entry in result: for val in list(entry.values())[0].values(): assert val == [] + + def test_parent_chain_cache_skips_file_get(self): + # When two files share a parent path, only the first should trigger file.get(). + cache = OrderedDict() + parent_chain_cache: dict[str, list[str]] = {} + folder_collab = MagicMock() + folder_collab.response_object = make_collab("user", "u1", "viewer") + client = self._make_client( + path_entries=[{"id": "f1"}], + folder_collabs_by_id={"f1": [folder_collab]}, + file_collabs=[], + ) + + first = _get_permissions_for_file( + client, + "file1", + cache, + parent_chain_cache=parent_chain_cache, + parent_path="/parent", + ) + second = _get_permissions_for_file( + client, + "file2", + cache, + parent_chain_cache=parent_chain_cache, + parent_path="/parent", + ) + + first_read = next(d for d in first if "read" in d) + second_read = next(d for d in second if "read" in d) + assert "u1" in first_read["read"]["users"] + assert "u1" in second_read["read"]["users"] + # file.get() runs once (for file1); file2 reuses the cached ancestor chain. + assert client.file.return_value.get.call_count == 1 + assert parent_chain_cache["/parent"] == ["f1"] + + def test_parent_chain_cache_not_populated_on_api_error(self): + # If file.get() fails for the first file, we shouldn't cache an empty ancestor list + # for that parent — subsequent files should retry. + cache = OrderedDict() + parent_chain_cache: dict[str, list[str]] = {} + client = MagicMock() + client.file.return_value.get.side_effect = Exception("transient") + client.file.return_value.get_collaborations.return_value = [] + + _get_permissions_for_file( + client, + "file1", + cache, + parent_chain_cache=parent_chain_cache, + parent_path="/parent", + ) + assert "/parent" not in parent_chain_cache + + def test_folder_cache_size_respected(self): + # When folder_cache_max_size=1, only one ancestor entry should remain after walking + # two distinct ancestors. + cache = OrderedDict() + folder_collab_a = MagicMock() + folder_collab_a.response_object = make_collab("user", "ua", "viewer") + folder_collab_b = MagicMock() + folder_collab_b.response_object = make_collab("user", "ub", "viewer") + client = self._make_client( + path_entries=[{"id": "fa"}, {"id": "fb"}], + folder_collabs_by_id={"fa": [folder_collab_a], "fb": [folder_collab_b]}, + file_collabs=[], + ) + _get_permissions_for_file(client, "file1", cache, folder_cache_max_size=1) + assert len(cache) == 1 diff --git a/unstructured_ingest/processes/connectors/fsspec/box.py b/unstructured_ingest/processes/connectors/fsspec/box.py index 33b445dea..0bfeaf383 100644 --- a/unstructured_ingest/processes/connectors/fsspec/box.py +++ b/unstructured_ingest/processes/connectors/fsspec/box.py @@ -3,6 +3,7 @@ from collections import OrderedDict from contextlib import contextmanager from dataclasses import dataclass, field +from pathlib import Path from time import time from typing import TYPE_CHECKING, Annotated, Any, Generator, Optional @@ -83,7 +84,9 @@ def _normalize_collaborations( total[0] += 1 -def _get_collaborations_for_folder(client, folder_id: str, cache: OrderedDict) -> list[dict]: +def _get_collaborations_for_folder( + client, folder_id: str, cache: OrderedDict, max_size: int = 128 +) -> list[dict]: if folder_id in cache: cache.move_to_end(folder_id) logger.debug(f"Retrieved cached collaborations for folder {folder_id}") @@ -102,14 +105,20 @@ def _get_collaborations_for_folder(client, folder_id: str, cache: OrderedDict) - logger.warning(f"Could not retrieve collaborations for folder {folder_id}: {e}") return collabs - if len(cache) >= 5: + if len(cache) >= max_size: cache.popitem(last=False) cache[folder_id] = collabs return collabs def _get_permissions_for_file( - client, file_id: str, cache: OrderedDict, max_perms: int = 500 + client, + file_id: str, + cache: OrderedDict, + max_perms: int = 500, + folder_cache_max_size: int = 128, + parent_chain_cache: Optional[dict[str, list[str]]] = None, + parent_path: Optional[str] = None, ) -> list[dict]: normalized = { "read": {"users": set(), "groups": set()}, @@ -118,20 +127,41 @@ def _get_permissions_for_file( } total = [0] + # Files sharing a parent folder share the same path_collection. Caching by parent path + # lets us skip the per-file file.get() round-trip for every file after the first in a + # given folder — material savings against Box's 1k-calls/minute Enterprise rate limit. # Default True so a failed file.get() doesn't suppress the direct-collab fetch. has_direct_collabs = True - try: - file_obj = client.file(file_id).get(fields=["path_collection", "has_collaborations"]) - response_obj = file_obj.response_object - path_entries = response_obj.get("path_collection", {}).get("entries", []) - has_direct_collabs = bool(response_obj.get("has_collaborations", True)) - for folder_entry in path_entries: - folder_id = folder_entry.get("id") - if folder_id and folder_id != "0": - folder_collabs = _get_collaborations_for_folder(client, folder_id, cache) - _normalize_collaborations(folder_collabs, normalized, total, max_perms) - except Exception as e: - logger.debug(f"Could not retrieve path_collection for file {file_id}: {e}") + ancestor_folder_ids: Optional[list[str]] = None + if ( + parent_chain_cache is not None + and parent_path is not None + and parent_path in parent_chain_cache + ): + ancestor_folder_ids = parent_chain_cache[parent_path] + + if ancestor_folder_ids is None: + try: + file_obj = client.file(file_id).get(fields=["path_collection", "has_collaborations"]) + response_obj = file_obj.response_object + path_entries = response_obj.get("path_collection", {}).get("entries", []) + has_direct_collabs = bool(response_obj.get("has_collaborations", True)) + ancestor_folder_ids = [ + entry["id"] + for entry in path_entries + if entry.get("id") and entry["id"] != "0" + ] + if parent_chain_cache is not None and parent_path is not None: + parent_chain_cache[parent_path] = ancestor_folder_ids + except Exception as e: + logger.debug(f"Could not retrieve path_collection for file {file_id}: {e}") + ancestor_folder_ids = [] + + for folder_id in ancestor_folder_ids: + folder_collabs = _get_collaborations_for_folder( + client, folder_id, cache, folder_cache_max_size + ) + _normalize_collaborations(folder_collabs, normalized, total, max_perms) if has_direct_collabs: try: @@ -157,6 +187,14 @@ class BoxIndexerConfig(FsspecIndexerConfig): max_num_metadata_permissions: int = Field( 500, description="Approximate maximum number of permissions included in metadata" ) + permissions_cache_max_size: int = Field( + 128, + description=( + "Max entries in the ancestor-folder collaborations LRU cache. Each entry holds the " + "raw collaboration list for one Box folder. Bump higher for wide folder trees to " + "avoid evicting entries needed by sibling files in the same branch." + ), + ) class BoxAccessConfig(FsspecAccessConfig): @@ -263,6 +301,7 @@ def get_metadata(self, file_info: dict) -> FileDataSourceMetadata: def run(self, **kwargs: Any) -> Generator[FileData, None, None]: client = None cache: OrderedDict = OrderedDict() + parent_chain_cache: dict[str, list[str]] = {} try: client = self.connection_config.get_box_client() except Exception as e: @@ -272,12 +311,20 @@ def run(self, **kwargs: Any) -> Generator[FileData, None, None]: if client: file_id = (file_data.metadata.record_locator or {}).get("file_id") if file_id: + parent_path = ( + str(Path(file_data.source_identifiers.fullpath).parent) + if file_data.source_identifiers and file_data.source_identifiers.fullpath + else None + ) try: file_data.metadata.permissions_data = _get_permissions_for_file( client, file_id, cache, - self.index_config.max_num_metadata_permissions, + max_perms=self.index_config.max_num_metadata_permissions, + folder_cache_max_size=self.index_config.permissions_cache_max_size, + parent_chain_cache=parent_chain_cache, + parent_path=parent_path, ) except Exception as e: logger.warning(f"Could not retrieve permissions for file {file_id}: {e}") @@ -285,7 +332,12 @@ def run(self, **kwargs: Any) -> Generator[FileData, None, None]: class BoxDownloaderConfig(FsspecDownloaderConfig): - pass + max_num_metadata_permissions: int = Field( + 500, description="Approximate maximum number of permissions included in metadata" + ) + permissions_cache_max_size: int = Field( + 128, description="Max entries in the ancestor-folder collaborations LRU cache" + ) @dataclass @@ -296,20 +348,35 @@ class BoxDownloader(FsspecDownloader): download_config: Optional[BoxDownloaderConfig] = field(default_factory=BoxDownloaderConfig) _box_client: Any = field(default=None, init=False, repr=False) _collab_cache: OrderedDict = field(default_factory=OrderedDict, init=False, repr=False) + _parent_chain_cache: dict[str, list[str]] = field( + default_factory=dict, init=False, repr=False + ) def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse: response = super().run(file_data=file_data, **kwargs) # permissions_data is set during indexing in BoxIndexer.run(); this fallback handles # standalone downloader usage (e.g., CLI, integration tests without the SND plugin layer). - # Memoize client and cache across files so we don't re-run JWT auth per file. + # Memoize client and caches across files so we don't re-run JWT auth or refetch the + # path_collection per file. if file_data.metadata.permissions_data is None: file_id = (file_data.metadata.record_locator or {}).get("file_id") if file_id: + parent_path = ( + str(Path(file_data.source_identifiers.fullpath).parent) + if file_data.source_identifiers and file_data.source_identifiers.fullpath + else None + ) try: if self._box_client is None: self._box_client = self.connection_config.get_box_client() file_data.metadata.permissions_data = _get_permissions_for_file( - self._box_client, file_id, self._collab_cache + self._box_client, + file_id, + self._collab_cache, + max_perms=self.download_config.max_num_metadata_permissions, + folder_cache_max_size=self.download_config.permissions_cache_max_size, + parent_chain_cache=self._parent_chain_cache, + parent_path=parent_path, ) except Exception as e: logger.warning(f"Could not retrieve permissions for file {file_id}: {e}") diff --git a/uv.lock b/uv.lock index 318135d94..a5368e9dc 100644 --- a/uv.lock +++ b/uv.lock @@ -7008,6 +7008,7 @@ biomed = [ ] box = [ { name = "boxfs" }, + { name = "boxsdk" }, { name = "fsspec" }, ] chroma = [ @@ -7317,6 +7318,7 @@ requires-dist = [ { name = "boto3", marker = "extra == 'opensearch'", specifier = ">=1.26.0" }, { name = "botocore", marker = "extra == 'opensearch'", specifier = ">=1.29.0" }, { name = "boxfs", marker = "extra == 'box'" }, + { name = "boxsdk", marker = "extra == 'box'" }, { name = "certifi", specifier = ">=2026.1.4" }, { name = "chromadb", marker = "extra == 'chroma'" }, { name = "clarifai", marker = "extra == 'clarifai'" }, From 4a2f93c654030a7cef94e2c839d8a62066edc5e4 Mon Sep 17 00:00:00 2001 From: Austin Walker Date: Wed, 13 May 2026 11:53:07 -0400 Subject: [PATCH 16/16] fix(PLU-347): align permissions_cache_max_size help text across box configs CLI consolidates options with the same name across indexer/downloader configs and rejects them if any attribute (including help text) differs. The longer indexer-side description tripped this in test_ingest_help; collapse both to the shorter shared form. Co-Authored-By: Claude Opus 4.7 (1M context) --- unstructured_ingest/processes/connectors/fsspec/box.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/unstructured_ingest/processes/connectors/fsspec/box.py b/unstructured_ingest/processes/connectors/fsspec/box.py index 0bfeaf383..77b12b315 100644 --- a/unstructured_ingest/processes/connectors/fsspec/box.py +++ b/unstructured_ingest/processes/connectors/fsspec/box.py @@ -188,12 +188,7 @@ class BoxIndexerConfig(FsspecIndexerConfig): 500, description="Approximate maximum number of permissions included in metadata" ) permissions_cache_max_size: int = Field( - 128, - description=( - "Max entries in the ancestor-folder collaborations LRU cache. Each entry holds the " - "raw collaboration list for one Box folder. Bump higher for wide folder trees to " - "avoid evicting entries needed by sibling files in the same branch." - ), + 128, description="Max entries in the ancestor-folder collaborations LRU cache" )