Skip to content

Commit 3ab29e6

Browse files
ds-filipknefelFilip Knefelclaude
authored
feat(microsoft): add delegated oauth_token authentication (#706)
Mirror the Google Drive oauth_token pattern across the three Microsoft Graph connectors (SharePoint, OneDrive, Outlook). --------- Co-authored-by: Filip Knefel <filip@unstructured.io> Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 333f56b commit 3ab29e6

8 files changed

Lines changed: 355 additions & 13 deletions

File tree

CHANGELOG.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,9 @@
1+
## [1.5.2]
2+
3+
### Enhancements
4+
5+
- **feat(microsoft): add delegated `oauth_token` to SharePoint, OneDrive, and Outlook AccessConfigs.** Accepts a user access token directly, bypassing MSAL when present. `client_id` / `client_cred` are now optional when `oauth_token` is supplied and remain required otherwise. Mirrors the Google Drive `oauth_token` pattern; token refresh is not handled here, so callers must supply a token that is still valid.
6+
17
## [1.5.1]
28

39
### Fixes
Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
1+
import pytest
2+
from pydantic import Secret
3+
4+
from unstructured_ingest.error import ValueError
5+
from unstructured_ingest.processes.connectors.onedrive import (
6+
OnedriveAccessConfig,
7+
OnedriveConnectionConfig,
8+
)
9+
10+
11+
class TestOnedriveAccessConfig:
    """Checks the authentication-mode validation performed by OnedriveAccessConfig."""

    def test_client_cred_only(self):
        """App-only authentication: a client credential by itself is accepted."""
        access = OnedriveAccessConfig(client_cred="secret-value")
        assert access.oauth_token is None
        assert access.client_cred == "secret-value"

    def test_client_cred_and_password(self):
        """Password-grant flow: client_cred combined with password is accepted."""
        access = OnedriveAccessConfig(client_cred="secret-value", password="user-password")
        assert (access.client_cred, access.password) == ("secret-value", "user-password")
        assert access.oauth_token is None

    def test_oauth_token_only(self):
        """Delegated authentication: an OAuth token by itself is accepted."""
        access = OnedriveAccessConfig(oauth_token="ey.access.token")
        assert access.client_cred is None
        assert access.oauth_token == "ey.access.token"

    def test_no_auth_raises_error(self):
        """Constructing the config with no credentials at all raises ValueError."""
        with pytest.raises(ValueError, match="must be set"):
            OnedriveAccessConfig()

    def test_oauth_and_client_cred_raises_error(self):
        """Supplying oauth_token together with client_cred raises ValueError."""
        conflicting = {"client_cred": "secret-value", "oauth_token": "ey.access.token"}
        with pytest.raises(ValueError, match="cannot use both"):
            OnedriveAccessConfig(**conflicting)

    def test_oauth_and_password_raises_error(self):
        """Supplying oauth_token together with password raises ValueError."""
        conflicting = {"password": "user-password", "oauth_token": "ey.access.token"}
        with pytest.raises(ValueError, match="cannot use both"):
            OnedriveAccessConfig(**conflicting)

    def test_empty_oauth_token_treated_as_missing(self):
        """An empty-string oauth_token (e.g. from an unset env var) counts as no token.

        The config must fail exactly as it does when no credential is given,
        which pins the truthiness-based handling of the auth fields.
        """
        with pytest.raises(ValueError, match="must be set"):
            OnedriveAccessConfig(oauth_token="")
61+
62+
63+
class TestOnedriveConnectionConfig:
    """Checks the cross-field auth rule enforced by OnedriveConnectionConfig."""

    def test_client_cred_without_client_id_raises(self):
        """client_cred auth without a client_id is rejected when the config is built.

        Failing at config time avoids cryptic AADSTS / MSAL errors at runtime.
        """
        with pytest.raises(ValueError, match="client_id is required"):
            OnedriveConnectionConfig(
                access_config=Secret(OnedriveAccessConfig(client_cred="secret-value")),
                tenant="tenant-id",
                user_pname="alice@contoso.com",
            )

    def test_oauth_token_without_client_id_succeeds(self):
        """The delegated oauth_token path is accepted without any client_id."""
        delegated_access = Secret(OnedriveAccessConfig(oauth_token="ey.access.token"))
        connection = OnedriveConnectionConfig(
            user_pname="alice@contoso.com",
            tenant="tenant-id",
            access_config=delegated_access,
        )
        assert connection.client_id is None
Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
import pytest
2+
from pydantic import Secret
3+
4+
from unstructured_ingest.error import ValueError
5+
from unstructured_ingest.processes.connectors.outlook import (
6+
OutlookAccessConfig,
7+
OutlookConnectionConfig,
8+
)
9+
10+
11+
class TestOutlookAccessConfig:
    """Checks the authentication-mode validation performed by OutlookAccessConfig."""

    def test_client_cred_only(self):
        """App-only authentication: a client credential by itself is accepted."""
        access = OutlookAccessConfig(client_cred="secret-value")
        assert access.oauth_token is None
        # The value is passed under the `client_cred` alias but is read back
        # through the `client_credential` field name.
        assert access.client_credential == "secret-value"

    def test_oauth_token_only(self):
        """Delegated authentication: an OAuth token by itself is accepted."""
        access = OutlookAccessConfig(oauth_token="ey.access.token")
        assert access.client_credential is None
        assert access.oauth_token == "ey.access.token"

    def test_no_auth_raises_error(self):
        """Constructing the config with no credentials at all raises ValueError."""
        with pytest.raises(ValueError, match="must be set"):
            OutlookAccessConfig()

    def test_oauth_and_client_cred_raises_error(self):
        """Supplying oauth_token together with client_cred raises ValueError."""
        conflicting = {"client_cred": "secret-value", "oauth_token": "ey.access.token"}
        with pytest.raises(ValueError, match="cannot use both"):
            OutlookAccessConfig(**conflicting)

    def test_empty_oauth_token_treated_as_missing(self):
        """An empty-string oauth_token (e.g. from an unset env var) counts as no token.

        The config must fail exactly as it does when no credential is given,
        which pins the truthiness-based handling of the auth fields.
        """
        with pytest.raises(ValueError, match="must be set"):
            OutlookAccessConfig(oauth_token="")
47+
48+
49+
class TestOutlookConnectionConfig:
    """Checks the cross-field auth rule enforced by OutlookConnectionConfig."""

    def test_client_cred_without_client_id_raises(self):
        """client_cred auth without a client_id is rejected when the config is built.

        Failing at config time avoids cryptic AADSTS / MSAL errors at runtime.
        """
        with pytest.raises(ValueError, match="client_id is required"):
            OutlookConnectionConfig(
                access_config=Secret(OutlookAccessConfig(client_cred="secret-value")),
            )

    def test_oauth_token_without_client_id_succeeds(self):
        """The delegated oauth_token path is accepted without any client_id."""
        delegated_access = Secret(OutlookAccessConfig(oauth_token="ey.access.token"))
        connection = OutlookConnectionConfig(access_config=delegated_access)
        assert connection.client_id is None

test/unit/connectors/test_sharepoint.py

Lines changed: 77 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,13 @@
11
from unittest.mock import Mock
22

33
import pytest
4+
from pydantic import Secret
45

56
from unstructured_ingest.data_types.file_data import FileData, SourceIdentifiers
6-
from unstructured_ingest.error import SourceConnectionError
7+
from unstructured_ingest.error import SourceConnectionError, ValueError
78
from unstructured_ingest.processes.connectors.sharepoint import (
89
MICROSOFT_ROLE_MAPPING,
10+
SharepointAccessConfig,
911
SharepointConnectionConfig,
1012
SharepointDownloader,
1113
SharepointDownloaderConfig,
@@ -14,6 +16,80 @@
1416
)
1517

1618

19+
class TestSharepointAccessConfig:
    """Checks the authentication-mode validation performed by SharepointAccessConfig."""

    def test_client_cred_only(self):
        """App-only authentication: a client credential by itself is accepted."""
        access = SharepointAccessConfig(client_cred="secret-value")
        assert access.oauth_token is None
        assert access.client_cred == "secret-value"

    def test_oauth_token_only(self):
        """Delegated authentication: an OAuth token by itself is accepted."""
        access = SharepointAccessConfig(oauth_token="ey.access.token")
        assert access.client_cred is None
        assert access.oauth_token == "ey.access.token"

    def test_no_auth_raises_error(self):
        """Constructing the config with no credentials at all raises ValueError."""
        with pytest.raises(ValueError, match="must be set"):
            SharepointAccessConfig()

    def test_oauth_and_client_cred_raises_error(self):
        """Supplying oauth_token together with client_cred raises ValueError."""
        conflicting = {"client_cred": "secret-value", "oauth_token": "ey.access.token"}
        with pytest.raises(ValueError, match="cannot use both"):
            SharepointAccessConfig(**conflicting)

    def test_oauth_and_password_raises_error(self):
        """Supplying oauth_token together with password raises ValueError."""
        conflicting = {"password": "user-password", "oauth_token": "ey.access.token"}
        with pytest.raises(ValueError, match="cannot use both"):
            SharepointAccessConfig(**conflicting)

    def test_empty_oauth_token_treated_as_missing(self):
        """An empty-string oauth_token (e.g. from an unset env var) counts as no token.

        The config must fail exactly as it does when no credential is given,
        which pins the truthiness-based handling of the auth fields.
        """
        with pytest.raises(ValueError, match="must be set"):
            SharepointAccessConfig(oauth_token="")
62+
63+
64+
class TestSharepointConnectionConfig:
    """Checks the cross-field auth rule on SharepointConnectionConfig.

    SharepointConnectionConfig inherits the validator from OnedriveConnectionConfig;
    these tests confirm the cross-field constraint survives that inheritance.
    """

    def test_client_cred_without_client_id_raises(self):
        """client_cred auth without a client_id is rejected when the config is built.

        Failing at config time avoids cryptic AADSTS / MSAL errors at runtime.
        """
        with pytest.raises(ValueError, match="client_id is required"):
            SharepointConnectionConfig(
                access_config=Secret(SharepointAccessConfig(client_cred="secret-value")),
                tenant="tenant-id",
                user_pname="alice@contoso.com",
                site="https://contoso.sharepoint.com/sites/acme",
            )

    def test_oauth_token_without_client_id_succeeds(self):
        """The delegated oauth_token path is accepted without any client_id."""
        delegated_access = Secret(SharepointAccessConfig(oauth_token="ey.access.token"))
        connection = SharepointConnectionConfig(
            site="https://contoso.sharepoint.com/sites/acme",
            user_pname="alice@contoso.com",
            tenant="tenant-id",
            access_config=delegated_access,
        )
        assert connection.client_id is None
91+
92+
1793
@pytest.fixture
def mock_client():
    """Provide a fresh ``Mock`` instance for tests that request ``mock_client``."""
    client_double = Mock()
    return client_double

unstructured_ingest/__version__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "1.5.1" # pragma: no cover
1+
__version__ = "1.5.2" # pragma: no cover

unstructured_ingest/processes/connectors/onedrive.py

Lines changed: 59 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
from typing import TYPE_CHECKING, Any, AsyncIterator, Optional
99

1010
from dateutil import parser
11-
from pydantic import Field, Secret
11+
from pydantic import Field, Secret, model_validator
1212

1313
from unstructured_ingest.data_types.file_data import (
1414
FileData,
@@ -54,12 +54,39 @@
5454

5555

5656
class OnedriveAccessConfig(AccessConfig):
    """Secret credentials for the OneDrive connector.

    Exactly one authentication mode must be configured:

    * ``client_cred`` (optionally with ``password``) for the MSAL-based flows.
    * ``oauth_token`` for delegated user authentication, where the supplied
      access token is used as-is.

    The combination is validated in ``model_post_init``.
    """

    client_cred: Optional[str] = Field(default=None, description="Microsoft App client secret")
    password: Optional[str] = Field(description="Service account password", default=None)
    oauth_token: Optional[str] = Field(
        default=None,
        description=(
            "OAuth 2.0 access token for delegated user authentication. "
            "Tokens typically expire after ~1 hour; this connector does not "
            "refresh tokens."
        ),
    )

    def model_post_init(self, __context: Any) -> None:
        """Validate that exactly one authentication mode is configured.

        Raises:
            ValueError: if neither ``client_cred`` nor ``oauth_token`` is set, or
                if ``oauth_token`` is combined with ``client_cred`` or ``password``.
        """
        # Chain to the parent hook so post-init logic defined on a base class
        # is not silently skipped by this override.
        super().model_post_init(__context)

        # Use truthiness so empty strings (e.g. from unset env vars) are treated
        # as "not provided" rather than as a configured credential.
        has_client_cred = bool(self.client_cred)
        has_oauth_token = bool(self.oauth_token)
        has_password = bool(self.password)

        if not has_client_cred and not has_oauth_token:
            raise ValueError("either client_cred or oauth_token must be set")

        if has_oauth_token and (has_client_cred or has_password):
            raise ValueError("cannot use both oauth_token and client_cred/password authentication")
5980

6081

6182
class OnedriveConnectionConfig(ConnectionConfig):
62-
client_id: str = Field(description="Microsoft app client ID")
83+
client_id: Optional[str] = Field(
84+
default=None,
85+
description=(
86+
"Microsoft app client ID. Required for app-only and password-grant authentication;"
87+
" not required when using oauth_token."
88+
),
89+
)
6390
user_pname: str = Field(
6491
description="User principal name or service account, usually your Azure AD email."
6592
)
@@ -74,6 +101,25 @@ class OnedriveConnectionConfig(ConnectionConfig):
74101
)
75102
access_config: Secret[OnedriveAccessConfig]
76103

104+
@model_validator(mode="after")
def _require_client_id_without_oauth(self) -> "OnedriveConnectionConfig":
    """Reject configs that have neither an oauth_token nor a client_id.

    The rule spans two models: ``client_id`` is a field of this connection
    config, while ``oauth_token`` lives on the wrapped access config, so it
    is enforced here after both have been populated.

    Raises:
        ValueError: if ``oauth_token`` is falsy and ``client_id`` is falsy.
    """
    delegated_token = self.access_config.get_secret_value().oauth_token
    if delegated_token or self.client_id:
        return self
    raise ValueError("client_id is required when oauth_token is not set")
111+
112+
def _log_oauth_advisory(self) -> None:
    """Log a warning about token expiry when an oauth_token is configured.

    Does nothing when the access config has no (truthy) ``oauth_token``.
    Kept on the connection config so the step prechecks that call it share
    a single message; it is meant to be invoked from ``precheck`` rather
    than from ``get_token`` so the warning is not repeated on every token
    lookup.
    """
    if not self.access_config.get_secret_value().oauth_token:
        return
    logger.warning("Using OAuth token authentication. Tokens expire after ~1 hour.")
122+
77123
def get_drive(self) -> "Drive":
78124
client = self.get_client()
79125
drive = client.users[self.user_pname].drive
@@ -84,7 +130,14 @@ def get_token(self):
84130
from msal import ConfidentialClientApplication
85131
from requests import post
86132

87-
if self.access_config.get_secret_value().password:
133+
access_config = self.access_config.get_secret_value()
134+
135+
if access_config.oauth_token:
136+
# Delegated user authentication: hand the access token through directly.
137+
# Tokens typically expire after ~1 hour; refresh is not handled here.
138+
return {"access_token": access_config.oauth_token, "token_type": "Bearer"}
139+
140+
if access_config.password:
88141
url = f"https://login.microsoftonline.com/{self.tenant}/oauth2/v2.0/token"
89142
headers = {"Content-Type": "application/x-www-form-urlencoded"}
90143
data = {
@@ -160,6 +213,7 @@ class OnedriveIndexer(Indexer):
160213
connector_type: str = CONNECTOR_TYPE
161214

162215
def precheck(self) -> None:
216+
self.connection_config._log_oauth_advisory()
163217
try:
164218
token_resp: dict = self.connection_config.get_token()
165219
if error := token_resp.get("error"):
@@ -358,6 +412,7 @@ class OnedriveUploader(Uploader):
358412
def precheck(self) -> None:
359413
from office365.runtime.client_request_exception import ClientRequestException
360414

415+
self.connection_config._log_oauth_advisory()
361416
try:
362417
token_resp: dict = self.connection_config.get_token()
363418
if error := token_resp.get("error"):

0 commit comments

Comments
 (0)