Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,9 @@
## [1.5.2]

### Enhancements

- **feat(microsoft): add delegated `oauth_token` to SharePoint, OneDrive, and Outlook AccessConfigs.** Accepts a user access token directly, bypassing MSAL when present. `client_id` / `client_cred` become optional. Mirrors the Google Drive `oauth_token` pattern; refresh is not handled here.

## [1.5.1]

### Fixes
Expand Down
83 changes: 83 additions & 0 deletions test/unit/connectors/test_onedrive.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
import pytest
from pydantic import Secret

from unstructured_ingest.error import ValueError
from unstructured_ingest.processes.connectors.onedrive import (
OnedriveAccessConfig,
OnedriveConnectionConfig,
)


class TestOnedriveAccessConfig:
"""Tests for OnedriveAccessConfig authentication validation."""

def test_client_cred_only(self):
"""Client credential alone should be valid (app-only authentication)."""
config = OnedriveAccessConfig(client_cred="secret-value")
assert config.client_cred == "secret-value"
assert config.oauth_token is None

def test_client_cred_and_password(self):
"""client_cred + password is the password-grant flow and should be valid."""
config = OnedriveAccessConfig(client_cred="secret-value", password="user-password")
assert config.client_cred == "secret-value"
assert config.password == "user-password"
assert config.oauth_token is None

def test_oauth_token_only(self):
"""OAuth token alone should be valid (delegated authentication)."""
config = OnedriveAccessConfig(oauth_token="ey.access.token")
assert config.oauth_token == "ey.access.token"
assert config.client_cred is None

def test_no_auth_raises_error(self):
"""No authentication provided should raise ValueError."""
with pytest.raises(ValueError, match="must be set"):
OnedriveAccessConfig()

def test_oauth_and_client_cred_raises_error(self):
"""Both oauth_token and client_cred provided should raise ValueError."""
with pytest.raises(ValueError, match="cannot use both"):
OnedriveAccessConfig(
client_cred="secret-value",
oauth_token="ey.access.token",
)

def test_oauth_and_password_raises_error(self):
"""oauth_token combined with password should raise ValueError."""
with pytest.raises(ValueError, match="cannot use both"):
OnedriveAccessConfig(
password="user-password",
oauth_token="ey.access.token",
)

def test_empty_oauth_token_treated_as_missing(self):
"""An empty-string oauth_token (e.g. unset env var) should not satisfy the auth requirement.

Validator and runtime both use truthiness; this test pins that consistency.
"""
with pytest.raises(ValueError, match="must be set"):
OnedriveAccessConfig(oauth_token="")


class TestOnedriveConnectionConfig:
"""Tests for OnedriveConnectionConfig cross-field auth validation."""

def test_client_cred_without_client_id_raises(self):
"""client_cred-based auth requires client_id; rejecting at config time
avoids cryptic AADSTS / MSAL errors at runtime."""
with pytest.raises(ValueError, match="client_id is required"):
OnedriveConnectionConfig(
user_pname="alice@contoso.com",
tenant="tenant-id",
access_config=Secret(OnedriveAccessConfig(client_cred="secret-value")),
)

def test_oauth_token_without_client_id_succeeds(self):
"""oauth_token auth doesn't need client_id; this is the delegated path."""
config = OnedriveConnectionConfig(
user_pname="alice@contoso.com",
tenant="tenant-id",
access_config=Secret(OnedriveAccessConfig(oauth_token="ey.access.token")),
)
assert config.client_id is None
65 changes: 65 additions & 0 deletions test/unit/connectors/test_outlook.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
import pytest
from pydantic import Secret

from unstructured_ingest.error import ValueError
from unstructured_ingest.processes.connectors.outlook import (
OutlookAccessConfig,
OutlookConnectionConfig,
)


class TestOutlookAccessConfig:
"""Tests for OutlookAccessConfig authentication validation."""

def test_client_cred_only(self):
"""Client credential alone should be valid (app-only authentication)."""
config = OutlookAccessConfig(client_cred="secret-value")
# `client_credential` is the field name; `client_cred` is the alias.
assert config.client_credential == "secret-value"
assert config.oauth_token is None

def test_oauth_token_only(self):
"""OAuth token alone should be valid (delegated authentication)."""
config = OutlookAccessConfig(oauth_token="ey.access.token")
assert config.oauth_token == "ey.access.token"
assert config.client_credential is None

def test_no_auth_raises_error(self):
"""No authentication provided should raise ValueError."""
with pytest.raises(ValueError, match="must be set"):
OutlookAccessConfig()

def test_oauth_and_client_cred_raises_error(self):
"""Both oauth_token and client_cred provided should raise ValueError."""
with pytest.raises(ValueError, match="cannot use both"):
OutlookAccessConfig(
client_cred="secret-value",
oauth_token="ey.access.token",
)

def test_empty_oauth_token_treated_as_missing(self):
"""An empty-string oauth_token (e.g. unset env var) should not satisfy the auth requirement.

Validator and runtime both use truthiness; this test pins that consistency.
"""
with pytest.raises(ValueError, match="must be set"):
OutlookAccessConfig(oauth_token="")


class TestOutlookConnectionConfig:
"""Tests for OutlookConnectionConfig cross-field auth validation."""

def test_client_cred_without_client_id_raises(self):
"""client_cred-based auth requires client_id; rejecting at config time
avoids cryptic AADSTS / MSAL errors at runtime."""
with pytest.raises(ValueError, match="client_id is required"):
OutlookConnectionConfig(
access_config=Secret(OutlookAccessConfig(client_cred="secret-value")),
)

def test_oauth_token_without_client_id_succeeds(self):
"""oauth_token auth doesn't need client_id; this is the delegated path."""
config = OutlookConnectionConfig(
access_config=Secret(OutlookAccessConfig(oauth_token="ey.access.token")),
)
assert config.client_id is None
78 changes: 77 additions & 1 deletion test/unit/connectors/test_sharepoint.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
from unittest.mock import Mock

import pytest
from pydantic import Secret

from unstructured_ingest.data_types.file_data import FileData, SourceIdentifiers
from unstructured_ingest.error import SourceConnectionError
from unstructured_ingest.error import SourceConnectionError, ValueError
from unstructured_ingest.processes.connectors.sharepoint import (
MICROSOFT_ROLE_MAPPING,
SharepointAccessConfig,
SharepointConnectionConfig,
SharepointDownloader,
SharepointDownloaderConfig,
Expand All @@ -14,6 +16,80 @@
)


class TestSharepointAccessConfig:
"""Tests for SharepointAccessConfig authentication validation."""

def test_client_cred_only(self):
"""Client credential alone should be valid (app-only authentication)."""
config = SharepointAccessConfig(client_cred="secret-value")
assert config.client_cred == "secret-value"
assert config.oauth_token is None

def test_oauth_token_only(self):
"""OAuth token alone should be valid (delegated authentication)."""
config = SharepointAccessConfig(oauth_token="ey.access.token")
assert config.oauth_token == "ey.access.token"
assert config.client_cred is None

def test_no_auth_raises_error(self):
"""No authentication provided should raise ValueError."""
with pytest.raises(ValueError, match="must be set"):
SharepointAccessConfig()

def test_oauth_and_client_cred_raises_error(self):
"""Both oauth_token and client_cred provided should raise ValueError."""
with pytest.raises(ValueError, match="cannot use both"):
SharepointAccessConfig(
client_cred="secret-value",
oauth_token="ey.access.token",
)

def test_oauth_and_password_raises_error(self):
"""oauth_token combined with password should raise ValueError."""
with pytest.raises(ValueError, match="cannot use both"):
SharepointAccessConfig(
password="user-password",
oauth_token="ey.access.token",
)

def test_empty_oauth_token_treated_as_missing(self):
"""An empty-string oauth_token (e.g. unset env var) should not satisfy the auth requirement.

Validator and runtime both use truthiness; this test pins that consistency.
"""
with pytest.raises(ValueError, match="must be set"):
SharepointAccessConfig(oauth_token="")


class TestSharepointConnectionConfig:
"""Tests for SharepointConnectionConfig cross-field auth validation.

SharepointConnectionConfig inherits the validator from OnedriveConnectionConfig;
these tests verify the inheritance carries the cross-field constraint through.
"""

def test_client_cred_without_client_id_raises(self):
"""client_cred-based auth requires client_id; rejecting at config time
avoids cryptic AADSTS / MSAL errors at runtime."""
with pytest.raises(ValueError, match="client_id is required"):
SharepointConnectionConfig(
site="https://contoso.sharepoint.com/sites/acme",
user_pname="alice@contoso.com",
tenant="tenant-id",
access_config=Secret(SharepointAccessConfig(client_cred="secret-value")),
)

def test_oauth_token_without_client_id_succeeds(self):
"""oauth_token auth doesn't need client_id; this is the delegated path."""
config = SharepointConnectionConfig(
site="https://contoso.sharepoint.com/sites/acme",
user_pname="alice@contoso.com",
tenant="tenant-id",
access_config=Secret(SharepointAccessConfig(oauth_token="ey.access.token")),
)
assert config.client_id is None


@pytest.fixture
def mock_client():
return Mock()
Expand Down
2 changes: 1 addition & 1 deletion unstructured_ingest/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "1.5.1" # pragma: no cover
__version__ = "1.5.2" # pragma: no cover
63 changes: 59 additions & 4 deletions unstructured_ingest/processes/connectors/onedrive.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from typing import TYPE_CHECKING, Any, AsyncIterator, Optional

from dateutil import parser
from pydantic import Field, Secret
from pydantic import Field, Secret, model_validator

from unstructured_ingest.data_types.file_data import (
FileData,
Expand Down Expand Up @@ -54,12 +54,39 @@


class OnedriveAccessConfig(AccessConfig):
client_cred: str = Field(description="Microsoft App client secret")
client_cred: Optional[str] = Field(default=None, description="Microsoft App client secret")
password: Optional[str] = Field(description="Service account password", default=None)
oauth_token: Optional[str] = Field(
default=None,
description=(
"OAuth 2.0 access token for delegated user authentication. "
"Tokens typically expire after ~1 hour; this connector does not "
"refresh tokens."
),
)

def model_post_init(self, __context: Any) -> None:
# Use truthiness so empty strings (e.g. from unset env vars) are treated
# consistently with the runtime auth-mode check in get_token below.
has_client_cred = bool(self.client_cred)
has_oauth_token = bool(self.oauth_token)
has_password = bool(self.password)

if not has_client_cred and not has_oauth_token:
raise ValueError("either client_cred or oauth_token must be set")

if has_oauth_token and (has_client_cred or has_password):
raise ValueError("cannot use both oauth_token and client_cred/password authentication")
Comment thread
cursor[bot] marked this conversation as resolved.


class OnedriveConnectionConfig(ConnectionConfig):
client_id: str = Field(description="Microsoft app client ID")
client_id: Optional[str] = Field(
default=None,
description=(
"Microsoft app client ID. Required for app-only and password-grant authentication;"
" not required when using oauth_token."
),
)
Comment thread
cursor[bot] marked this conversation as resolved.
user_pname: str = Field(
description="User principal name or service account, usually your Azure AD email."
)
Expand All @@ -74,6 +101,25 @@ class OnedriveConnectionConfig(ConnectionConfig):
)
access_config: Secret[OnedriveAccessConfig]

@model_validator(mode="after")
def _require_client_id_without_oauth(self) -> "OnedriveConnectionConfig":
# client_id lives on ConnectionConfig (above) and oauth_token on AccessConfig,
# so this cross-field rule can't live in either model_post_init alone.
if not self.access_config.get_secret_value().oauth_token and not self.client_id:
raise ValueError("client_id is required when oauth_token is not set")
return self

def _log_oauth_advisory(self) -> None:
"""Emit a one-shot advisory at precheck time when delegated OAuth is in use.

Lives on ConnectionConfig so Indexer/Uploader/Downloader prechecks share
one source of truth instead of each duplicating the message. Called from
precheck (once per step instance) rather than from get_token (called per
Graph request) to avoid log spam during normal indexing.
"""
if self.access_config.get_secret_value().oauth_token:
logger.warning("Using OAuth token authentication. Tokens expire after ~1 hour.")

def get_drive(self) -> "Drive":
client = self.get_client()
drive = client.users[self.user_pname].drive
Expand All @@ -84,7 +130,14 @@ def get_token(self):
from msal import ConfidentialClientApplication
from requests import post

if self.access_config.get_secret_value().password:
access_config = self.access_config.get_secret_value()

if access_config.oauth_token:
# Delegated user authentication: hand the access token through directly.
# Tokens typically expire after ~1 hour; refresh is not handled here.
return {"access_token": access_config.oauth_token, "token_type": "Bearer"}
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

oauth_token path unnecessarily requires msal dependency

Low Severity

The oauth_token short-circuit in get_token and _acquire_token is placed after the from msal import ... (and from requests import ...) statements. Both the @requires_dependencies decorator and these local imports execute before the if access_config.oauth_token: check, meaning the oauth_token path — which never uses MSAL — still requires msal (and requests) to be installed and importable. This contradicts the stated intent of "no MSAL round-trip" for the delegated token path.

Additional Locations (1)
Fix in Cursor Fix in Web

Reviewed by Cursor Bugbot for commit 7fb41d4. Configure here.


if access_config.password:
url = f"https://login.microsoftonline.com/{self.tenant}/oauth2/v2.0/token"
headers = {"Content-Type": "application/x-www-form-urlencoded"}
data = {
Expand Down Expand Up @@ -160,6 +213,7 @@ class OnedriveIndexer(Indexer):
connector_type: str = CONNECTOR_TYPE

def precheck(self) -> None:
self.connection_config._log_oauth_advisory()
try:
token_resp: dict = self.connection_config.get_token()
if error := token_resp.get("error"):
Expand Down Expand Up @@ -358,6 +412,7 @@ class OnedriveUploader(Uploader):
def precheck(self) -> None:
from office365.runtime.client_request_exception import ClientRequestException

self.connection_config._log_oauth_advisory()
try:
token_resp: dict = self.connection_config.get_token()
if error := token_resp.get("error"):
Expand Down
Loading
Loading