Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
17 commits
Select commit Hold shift + click to select a range
0241e50
feat(box): add ACL permissions metadata to Box connector
danielle-unstructured-io May 1, 2026
99299bf
refactor: move permissions fetching to indexer, extract module-level …
danielle-unstructured-io May 4, 2026
7fc3a81
fix(box): move max_num_metadata_permissions to indexer config
danielle-unstructured-io May 11, 2026
6b9683e
fix(box): skip is_access_only collaborations to prevent ACL overgrant
danielle-unstructured-io May 11, 2026
c99b150
fix(box): grant editor role delete permission
danielle-unstructured-io May 11, 2026
b6059ab
fix(test): strip randomized tempdir prefix from fixture paths
danielle-unstructured-io May 11, 2026
bcd8622
chore: bump version to 1.6.0 and add changelog entry
danielle-unstructured-io May 11, 2026
580bb7a
Merge remote-tracking branch 'origin/main' into feature/box-acl-permi…
awalker4 May 11, 2026
6b45a60
chore: bump version to 1.6.0 instead of 1.5.3
awalker4 May 11, 2026
b29c6a5
fix(box): don't cache folder collabs on API failure
awalker4 May 11, 2026
af78a0e
fix(test): walk on-disk tree in check_raw_file_contents
awalker4 May 11, 2026
45bdbab
fix(box): drop previewer roles from BOX_ROLE_MAPPING
awalker4 May 11, 2026
8416d47
make tidy
awalker4 May 11, 2026
ff0861a
perf(box): skip file-collab fetch when has_collaborations is False; r…
awalker4 May 12, 2026
312f7c5
make tidy
awalker4 May 12, 2026
e4e521c
fix(PLU-347): address box ACL review feedback (extras, configurable c…
awalker4 May 13, 2026
4a2f93c
fix(PLU-347): align permissions_cache_max_size help text across box c…
awalker4 May 13, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,13 @@
## [1.6.0]

### Enhancements

- **feat(box): pass through ACL permission metadata.** Extract Box collaboration data and normalize it to the standard read/update/delete schema. Permissions are fetched during indexing with an LRU-cached ancestor folder walk to handle inherited collaborations, plus a per-parent-folder `path_collection` cache so only the first file in a given parent pays the `file.get()` round-trip. Access-only collaborations (`is_access_only=true`) are skipped to avoid overgranting; group IDs are stored directly without member expansion (consistent with Confluence). `boxsdk` is now installed via the `box` extra. Both the permissions cap and the ancestor-cache size are configurable on `BoxIndexerConfig` (`max_num_metadata_permissions`, `permissions_cache_max_size`) and, for the standalone fallback path, on `BoxDownloaderConfig`.

### Fixes

- **fix(test): strip randomized tempdir prefix from FsspecDownloader fixture paths.** `get_files()` now drops the leading `unstructured_<random>/` segment so `directory_structure.json` captures the logical structure rather than the per-run random suffix injected by `tempfile.mkdtemp`.

## [1.5.2]

### Enhancements
Expand Down
4 changes: 2 additions & 2 deletions docs/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ In checklist form, the above steps are summarized as:

The ingest flow is similar to an ETL pipeline that gets defined at runtime based on user input:

![unstructured ingest diagram](assets/pipeline.png)
![unstructured ingest diagram](pipeline.png)



Expand All @@ -117,7 +117,7 @@ The ingest flow is similar to an ETL pipeline that gets defined at runtime based


### Sequence Diagram
![unstructured ingest sequence diagram](assets/sequence.png)
![unstructured ingest sequence diagram](sequence.png)


### Parallel Execution
Expand Down
2 changes: 1 addition & 1 deletion docs/connector_development.md
Original file line number Diff line number Diff line change
Expand Up @@ -349,7 +349,7 @@ If you have any questions post in the public Slack channel `ask-for-help-open-so

Yellow (without the Uncompressing) represents the steps in a source connector. Orange represents a destination connector.

![unstructured_ingest diagram](assets/pipeline.png)
![unstructured_ingest diagram](pipeline.png)



2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ astradb = ["astrapy>2.0.0"]
azure-ai-search = ["azure-search-documents"]
azure = ["adlfs", "fsspec"]
biomed = ["beautifulsoup4", "requests"]
box = ["boxfs", "fsspec"]
box = ["boxfs", "boxsdk", "fsspec"]
chroma = ["chromadb"]
clarifai = ["clarifai"]
confluence = ["atlassian-python-api", "requests"]
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
{
"directory_structure": [
"catalog.pdf"
]
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
{
"identifier": "8b0303ba-7c77-5e47-b0ff-790b8fc9881f",
"connector_type": "box",
"source_identifiers": {
"filename": "catalog.pdf",
"fullpath": "/TestACLs-topfolder/TestACLs-secondtier/catalog.pdf",
"rel_path": "catalog.pdf"
},
"metadata": {
"url": "box:///TestACLs-topfolder/TestACLs-secondtier/catalog.pdf",
"version": "2216144540657",
"record_locator": {
"protocol": "box",
"remote_file_path": "box://TestACLs-topfolder/TestACLs-secondtier",
"file_id": "2216144540657"
},
"date_created": "1777662782.0",
"date_modified": "1777662782.0",
"date_processed": "1777665707.7073228",
"permissions_data": [
{
"read": {
"users": [
"50881967280",
"50882409531"
],
"groups": []
}
},
{
"update": {
"users": [
"50881967280"
],
"groups": []
}
},
{
"delete": {
"users": [
"50881967280"
],
"groups": []
}
}
],
"filesize_bytes": 296006
},
"additional_metadata": {
"name": "/TestACLs-topfolder/TestACLs-secondtier/catalog.pdf",
"size": 296006,
"type": "file",
"id": "2216144540657",
"modified_at": "2026-05-01T12:13:02-07:00",
"created_at": "2026-05-01T12:13:02-07:00",
"original_file_path": "/TestACLs-topfolder/TestACLs-secondtier/catalog.pdf"
},
"reprocess": false,
"local_download_path": "/private/var/folders/gf/qwh2bdg93kb9gzxd_xhb49wc0000gn/T/tmpekwnxs4a/unstructured_uvopv4ry/catalog.pdf",
"display_name": "/TestACLs-topfolder/TestACLs-secondtier/catalog.pdf"
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
{
"directory_structure": [
"Billing issue - Example 1.pdf"
]
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
{
"identifier": "11333818-b47e-5991-b32f-701975b2caca",
"connector_type": "box",
"source_identifiers": {
"filename": "Billing issue - Example 1.pdf",
"fullpath": "/TestACLs-topfolder/Billing issue - Example 1.pdf",
"rel_path": "Billing issue - Example 1.pdf"
},
"metadata": {
"url": "box:///TestACLs-topfolder/Billing issue - Example 1.pdf",
"version": "2216145342898",
"record_locator": {
"protocol": "box",
"remote_file_path": "box://TestACLs-topfolder",
"file_id": "2216145342898"
},
"date_created": "1777662769.0",
"date_modified": "1777662769.0",
"date_processed": "1777665696.530676",
"permissions_data": [
{
"read": {
"users": [
"50881967280",
"50882409531"
],
"groups": []
}
},
{
"update": {
"users": [
"50881967280"
],
"groups": []
}
},
{
"delete": {
"users": [
"50881967280"
],
"groups": []
}
}
],
"filesize_bytes": 142776
},
"additional_metadata": {
"name": "/TestACLs-topfolder/Billing issue - Example 1.pdf",
"size": 142776,
"type": "file",
"id": "2216145342898",
"modified_at": "2026-05-01T12:12:49-07:00",
"created_at": "2026-05-01T12:12:49-07:00",
"original_file_path": "/TestACLs-topfolder/Billing issue - Example 1.pdf"
},
"reprocess": false,
"local_download_path": "/private/var/folders/gf/qwh2bdg93kb9gzxd_xhb49wc0000gn/T/tmpqw6nq7zk/unstructured_aqpewcxk/Billing issue - Example 1.pdf",
"display_name": "/TestACLs-topfolder/Billing issue - Example 1.pdf"
}
83 changes: 83 additions & 0 deletions test/integration/connectors/test_box.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
import os

import pytest

from test.integration.connectors.utils.constants import BLOB_STORAGE_TAG, SOURCE_TAG
from test.integration.connectors.utils.validation.source import (
SourceValidationConfigs,
source_connector_validation,
)
from test.integration.utils import requires_env
from unstructured_ingest.processes.connectors.fsspec.box import (
CONNECTOR_TYPE,
BoxAccessConfig,
BoxConnectionConfig,
BoxDownloader,
BoxDownloaderConfig,
BoxIndexer,
BoxIndexerConfig,
)


def make_box_components(remote_url: str, download_dir):
    """Build the Box indexer and downloader used by the integration tests.

    The Box app config is read from the ``BOX_APP_CONFIG`` environment
    variable; a missing variable raises ``KeyError``.

    Args:
        remote_url: Remote URL handed to ``BoxIndexerConfig``.
        download_dir: Directory handed to ``BoxDownloaderConfig``.

    Returns:
        An ``(indexer, downloader)`` tuple; both share one connection config.
    """
    box_connection = BoxConnectionConfig(
        access_config=BoxAccessConfig(box_app_config=os.environ["BOX_APP_CONFIG"])
    )
    indexer_settings = BoxIndexerConfig(remote_url=remote_url)
    downloader_settings = BoxDownloaderConfig(download_dir=download_dir)
    return (
        BoxIndexer(connection_config=box_connection, index_config=indexer_settings),
        BoxDownloader(connection_config=box_connection, download_config=downloader_settings),
    )


@pytest.mark.asyncio
@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, BLOB_STORAGE_TAG)
@requires_env("BOX_APP_CONFIG")
async def test_box_top_folder(temp_dir):
    """
    Exercise the Box source connector on the top-level ACL test folder.

    Checks that permissions_data is populated from collaborations set directly
    on the folder. ``metadata.date_processed`` is listed in
    ``exclude_fields_extend`` because it changes on every run.
    """
    box_indexer, box_downloader = make_box_components(
        remote_url="box://TestACLs-topfolder",
        download_dir=temp_dir,
    )
    validation_configs = SourceValidationConfigs(
        test_id="box_top_folder",
        validate_downloaded_files=False,
        validate_file_data=True,
        exclude_fields_extend=["metadata.date_processed"],
    )
    await source_connector_validation(
        indexer=box_indexer,
        downloader=box_downloader,
        configs=validation_configs,
    )


@pytest.mark.asyncio
@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, BLOB_STORAGE_TAG)
@requires_env("BOX_APP_CONFIG")
async def test_box_second_tier(temp_dir):
    """
    Exercise the Box source connector on the nested ACL test folder.

    Checks that permissions_data reflects permissions inherited from the
    parent folder. ``metadata.date_processed`` is listed in
    ``exclude_fields_extend`` because it changes on every run.
    """
    box_indexer, box_downloader = make_box_components(
        remote_url="box://TestACLs-topfolder/TestACLs-secondtier",
        download_dir=temp_dir,
    )
    validation_configs = SourceValidationConfigs(
        test_id="box_second_tier",
        validate_downloaded_files=False,
        validate_file_data=True,
        exclude_fields_extend=["metadata.date_processed"],
    )
    await source_connector_validation(
        indexer=box_indexer,
        downloader=box_downloader,
        configs=validation_configs,
    )
24 changes: 19 additions & 5 deletions test/integration/connectors/utils/validation/source.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import json
import os
import re
import shutil
from pathlib import Path
from typing import Callable, Optional
Expand Down Expand Up @@ -86,9 +87,17 @@ def omit_ignored_fields(self, data: dict) -> dict:
return copied_data


# FsspecDownloader writes each file into a fresh tempfile.mkdtemp directory named
# unstructured_<random> to avoid path collisions. Strip that leading segment so fixtures
# capture the logical structure rather than a randomized suffix that changes every run.
_FSSPEC_TEMP_DIR_PATTERN = re.compile(r"^unstructured_[a-zA-Z0-9_-]+/")


def get_files(dir_path: Path) -> list[str]:
    """List every regular file under ``dir_path`` as a logical relative path.

    Args:
        dir_path: Root directory to walk recursively.

    Returns:
        One ``/``-separated path per file, relative to ``dir_path``, with a
        leading ``unstructured_<random>/`` segment removed when present.
        Entries follow ``Path.rglob`` order; directories are not listed.
    """
    return [
        # relative_to() removes only the leading root. The earlier
        # str.replace() removed every occurrence of the root string, which
        # mangled paths that repeat it further down. as_posix() keeps "/"
        # separators so the anchored pattern matches on any platform.
        _FSSPEC_TEMP_DIR_PATTERN.sub("", f.relative_to(dir_path).as_posix())
        for f in dir_path.rglob("*")
        if f.is_file()
    ]


Expand Down Expand Up @@ -129,12 +138,17 @@ def check_raw_file_contents(
current_output_dir: Path,
configs: SourceValidationConfigs,
):
current_files = get_files(dir_path=current_output_dir)
found_diff = False
files = []
for current_file in current_files:
current_file_path = current_output_dir / current_file
expected_file_path = expected_output_dir / current_file
for current_file_path in current_output_dir.rglob("*"):
if not current_file_path.is_file():
continue
relative = str(current_file_path.relative_to(current_output_dir))
# Strip the unstructured_<random>/ tempdir segment when locating the
# corresponding fixture; the on-disk file still lives under the random
# subdir so don't strip it from current_file_path.
expected_relative = _FSSPEC_TEMP_DIR_PATTERN.sub("", relative)
expected_file_path = expected_output_dir / expected_relative
if configs.detect_diff(expected_file_path, current_file_path):
found_diff = True
files.append(str(expected_file_path))
Expand Down
Loading
Loading