diff --git a/CHANGELOG.md b/CHANGELOG.md index a9ceb7ecf..b548a5f97 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,9 @@ +## 0.5.12-dev0 + +### Fixes + +* **Added extensive logging to sharepoint connector** + ## 0.5.11 ### Features diff --git a/unstructured_ingest/__version__.py b/unstructured_ingest/__version__.py index 3af59ec2c..37a9412a6 100644 --- a/unstructured_ingest/__version__.py +++ b/unstructured_ingest/__version__.py @@ -1 +1 @@ -__version__ = "0.5.11" # pragma: no cover +__version__ = "0.5.12-dev0" # pragma: no cover diff --git a/unstructured_ingest/v2/processes/connectors/sharepoint.py b/unstructured_ingest/v2/processes/connectors/sharepoint.py index c6586dc6e..20e13248a 100644 --- a/unstructured_ingest/v2/processes/connectors/sharepoint.py +++ b/unstructured_ingest/v2/processes/connectors/sharepoint.py @@ -63,24 +63,72 @@ class SharepointIndexer(OnedriveIndexer): async def run_async(self, **kwargs: Any) -> AsyncIterator[FileData]: from office365.runtime.client_request_exception import ClientRequestException + logger.info(f"[{self.connector_type}] Fetching access token...") token_resp = await asyncio.to_thread(self.connection_config.get_token) + if "error" in token_resp: - raise SourceConnectionError( - f"[{self.connector_type}]: {token_resp['error']} " - f"({token_resp.get('error_description')})" - ) + error_message = f"[{self.connector_type}] Authentication error: {token_resp['error']} \ + ({token_resp.get('error_description')})" + logger.error(error_message) + raise SourceConnectionError(error_message) + + logger.info( + f"[{self.connector_type}] Successfully obtained access token. \ + Connecting to SharePoint site: {self.connection_config.site}" + ) client = await asyncio.to_thread(self.connection_config.get_client) + try: site = client.sites.get_by_url(self.connection_config.site).get().execute_query() + logger.info( + f"[{self.connector_type}] Successfully retrieved site object: {site.properties}" + ) + site_drive_item = site.drive.get().execute_query().root - except ClientRequestException: - logger.info("Site not found") + if site_drive_item is None: + raise ValueError( + f"[{self.connector_type}] \ + No root drive found for site {self.connection_config.site}. \ + Please check site permissions or if the site has a document library." + ) + + logger.info(f"[{self.connector_type}] Successfully retrieved site drive root.") + + except ClientRequestException as e: + logger.error(f"[{self.connector_type}] Failed to fetch SharePoint site: {str(e)}") + raise SourceConnectionError( + f"[{self.connector_type}] Site not found or inaccessible: {str(e)}" + ) + # Check if a path was provided and attempt to retrieve the specific path path = self.index_config.path - # Deprecated sharepoint sdk needed a default path. Microsoft Graph SDK does not. if path and path != LEGACY_DEFAULT_PATH: - site_drive_item = site_drive_item.get_by_path(path).get().execute_query() + logger.info(f"[{self.connector_type}] Fetching site drive item at path: {path}") + + try: + site_drive_item = site_drive_item.get_by_path(path).get().execute_query() + logger.info( + f"[{self.connector_type}] \ + Successfully retrieved site drive item at path: {path}" + ) + except ClientRequestException as e: + logger.error( + f"[{self.connector_type}] Invalid path '{path}'. \ + Please verify the path exists in SharePoint. Error: {str(e)}" + ) + raise ValueError( + f"[{self.connector_type}] Invalid path '{path}' or path does not exist." + ) + + # Final validation before proceeding to file retrieval + if site_drive_item is None: + error_msg = f"[{self.connector_type}] \ + Unable to retrieve site drive item. \ + This may be due to incorrect site URL, \ + missing permissions, or an empty document library." + logger.error(error_msg) + raise ValueError(error_msg) for drive_item in site_drive_item.get_files( recursive=self.index_config.recursive