From 9a2893618b824b775954febdf42c284bb0f21b25 Mon Sep 17 00:00:00 2001 From: Mateusz Kuprowski Date: Fri, 7 Mar 2025 11:31:52 +0100 Subject: [PATCH 1/5] Added improved logging for debugging purposes --- .../v2/processes/connectors/sharepoint.py | 49 +++++++++++++++---- 1 file changed, 40 insertions(+), 9 deletions(-) diff --git a/unstructured_ingest/v2/processes/connectors/sharepoint.py b/unstructured_ingest/v2/processes/connectors/sharepoint.py index c6586dc6e..be5bb58c9 100644 --- a/unstructured_ingest/v2/processes/connectors/sharepoint.py +++ b/unstructured_ingest/v2/processes/connectors/sharepoint.py @@ -63,24 +63,54 @@ class SharepointIndexer(OnedriveIndexer): async def run_async(self, **kwargs: Any) -> AsyncIterator[FileData]: from office365.runtime.client_request_exception import ClientRequestException + logger.info(f"[{self.connector_type}] Fetching access token...") token_resp = await asyncio.to_thread(self.connection_config.get_token) + if "error" in token_resp: - raise SourceConnectionError( - f"[{self.connector_type}]: {token_resp['error']} " - f"({token_resp.get('error_description')})" - ) + error_message = f"[{self.connector_type}] Authentication error: {token_resp['error']} \ + ({token_resp.get('error_description')})" + logger.error(error_message) + raise SourceConnectionError(error_message) + + logger.info(f"[{self.connector_type}] Successfully obtained access token. \ + Connecting to SharePoint site: {self.connection_config.site}") client = await asyncio.to_thread(self.connection_config.get_client) + try: site = client.sites.get_by_url(self.connection_config.site).get().execute_query() + logger.info(f"[{self.connector_type}] Successfully retrieved site object: {site.url}") + site_drive_item = site.drive.get().execute_query().root - except ClientRequestException: - logger.info("Site not found") - + if site_drive_item is None: + raise ValueError(f"[{self.connector_type}] No root drive found for site {self.connection_config.site}.
\ + Please check site permissions or if the site has a document library.") + + logger.info(f"[{self.connector_type}] Successfully retrieved site drive root.") + + except ClientRequestException as e: + logger.error(f"[{self.connector_type}] Failed to fetch SharePoint site: {str(e)}") + raise SourceConnectionError(f"[{self.connector_type}] Site not found or inaccessible: {str(e)}") + + # Check if a path was provided and attempt to retrieve the specific path path = self.index_config.path - # Deprecated sharepoint sdk needed a default path. Microsoft Graph SDK does not. if path and path != LEGACY_DEFAULT_PATH: - site_drive_item = site_drive_item.get_by_path(path).get().execute_query() + logger.info(f"[{self.connector_type}] Fetching site drive item at path: {path}") + + try: + site_drive_item = site_drive_item.get_by_path(path).get().execute_query() + logger.info(f"[{self.connector_type}] Successfully retrieved site drive item at path: {path}") + except ClientRequestException as e: + logger.error(f"[{self.connector_type}] Invalid path '{path}'. \ + Please verify the path exists in SharePoint. Error: {str(e)}") + raise ValueError(f"[{self.connector_type}] Invalid path '{path}' or path does not exist.") + + # Final validation before proceeding to file retrieval + if site_drive_item is None: + error_msg = f"[{self.connector_type}] Unable to retrieve site drive item. \ + This may be due to incorrect site URL, missing permissions, or an empty document library." 
+ logger.error(error_msg) + raise ValueError(error_msg) for drive_item in site_drive_item.get_files( recursive=self.index_config.recursive @@ -89,6 +119,7 @@ async def run_async(self, **kwargs: Any) -> AsyncIterator[FileData]: yield file_data + class SharepointDownloaderConfig(OnedriveDownloaderConfig): pass From 9fb19b5505b31ef1fbb59a2665443146c08afe7b Mon Sep 17 00:00:00 2001 From: Mateusz Kuprowski Date: Fri, 7 Mar 2025 11:41:54 +0100 Subject: [PATCH 2/5] Linter fix --- .../v2/processes/connectors/sharepoint.py | 49 ++++++++++++------- 1 file changed, 32 insertions(+), 17 deletions(-) diff --git a/unstructured_ingest/v2/processes/connectors/sharepoint.py b/unstructured_ingest/v2/processes/connectors/sharepoint.py index be5bb58c9..abc5be962 100644 --- a/unstructured_ingest/v2/processes/connectors/sharepoint.py +++ b/unstructured_ingest/v2/processes/connectors/sharepoint.py @@ -65,33 +65,40 @@ async def run_async(self, **kwargs: Any) -> AsyncIterator[FileData]: logger.info(f"[{self.connector_type}] Fetching access token...") token_resp = await asyncio.to_thread(self.connection_config.get_token) - + if "error" in token_resp: error_message = f"[{self.connector_type}] Authentication error: {token_resp['error']} \ ({token_resp.get('error_description')})" logger.error(error_message) raise SourceConnectionError(error_message) - logger.info(f"[{self.connector_type}] Successfully obtained access token. \ - Connecting to SharePoint site: {self.connection_config.site}") + logger.info( + f"[{self.connector_type}] Successfully obtained access token. 
\ + Connecting to SharePoint site: {self.connection_config.site}" + ) client = await asyncio.to_thread(self.connection_config.get_client) try: site = client.sites.get_by_url(self.connection_config.site).get().execute_query() logger.info(f"[{self.connector_type}] Successfully retrieved site object: {site.url}") - + site_drive_item = site.drive.get().execute_query().root if site_drive_item is None: - raise ValueError(f"[{self.connector_type}] No root drive found for site {self.connection_config.site}. \ - Please check site permissions or if the site has a document library.") - + raise ValueError( + f"[{self.connector_type}] \ + No root drive found for site {self.connection_config.site}. \ + Please check site permissions or if the site has a document library." + ) + logger.info(f"[{self.connector_type}] Successfully retrieved site drive root.") - + except ClientRequestException as e: logger.error(f"[{self.connector_type}] Failed to fetch SharePoint site: {str(e)}") - raise SourceConnectionError(f"[{self.connector_type}] Site not found or inaccessible: {str(e)}") - + raise SourceConnectionError( + f"[{self.connector_type}] Site not found or inaccessible: {str(e)}" + ) + # Check if a path was provided and attempt to retrieve the specific path path = self.index_config.path if path and path != LEGACY_DEFAULT_PATH: @@ -99,16 +106,25 @@ async def run_async(self, **kwargs: Any) -> AsyncIterator[FileData]: try: site_drive_item = site_drive_item.get_by_path(path).get().execute_query() - logger.info(f"[{self.connector_type}] Successfully retrieved site drive item at path: {path}") + logger.info( + f"[{self.connector_type}] \ + Successfully retrieved site drive item at path: {path}" + ) except ClientRequestException as e: - logger.error(f"[{self.connector_type}] Invalid path '{path}'. \ - Please verify the path exists in SharePoint. 
Error: {str(e)}") - raise ValueError(f"[{self.connector_type}] Invalid path '{path}' or path does not exist.") + logger.error( + f"[{self.connector_type}] Invalid path '{path}'. \ + Please verify the path exists in SharePoint. Error: {str(e)}" + ) + raise ValueError( + f"[{self.connector_type}] Invalid path '{path}' or path does not exist." + ) # Final validation before proceeding to file retrieval if site_drive_item is None: - error_msg = f"[{self.connector_type}] Unable to retrieve site drive item. \ - This may be due to incorrect site URL, missing permissions, or an empty document library." + error_msg = f"[{self.connector_type}] \ + Unable to retrieve site drive item. \ + This may be due to incorrect site URL, \ + missing permissions, or an empty document library." logger.error(error_msg) raise ValueError(error_msg) @@ -119,7 +135,6 @@ async def run_async(self, **kwargs: Any) -> AsyncIterator[FileData]: yield file_data - class SharepointDownloaderConfig(OnedriveDownloaderConfig): pass From 099d506c7d537ea2f333d2a49fae18402dc1faec Mon Sep 17 00:00:00 2001 From: Mateusz Kuprowski Date: Fri, 7 Mar 2025 11:43:03 +0100 Subject: [PATCH 3/5] Version bump --- CHANGELOG.md | 6 ++++++ unstructured_ingest/__version__.py | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b3ef03b2f..2b73352c1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,9 @@ +## 0.5.10-dev4 + +### Fixes + +* **Added extensive logging to sharepoint connector** + ## 0.5.10-dev3 ### Enhancements diff --git a/unstructured_ingest/__version__.py b/unstructured_ingest/__version__.py index f20358195..9a7fa7739 100644 --- a/unstructured_ingest/__version__.py +++ b/unstructured_ingest/__version__.py @@ -1 +1 @@ -__version__ = "0.5.10-dev3" # pragma: no cover +__version__ = "0.5.10-dev4" # pragma: no cover From e76231555d79251b908c81c8036a2e425f2982c1 Mon Sep 17 00:00:00 2001 From: Mateusz Kuprowski Date: Fri, 7 Mar 2025 11:57:30 +0100 Subject: [PATCH 4/5] 
Test with props display --- unstructured_ingest/v2/processes/connectors/sharepoint.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unstructured_ingest/v2/processes/connectors/sharepoint.py b/unstructured_ingest/v2/processes/connectors/sharepoint.py index abc5be962..7dd6ddc1d 100644 --- a/unstructured_ingest/v2/processes/connectors/sharepoint.py +++ b/unstructured_ingest/v2/processes/connectors/sharepoint.py @@ -81,7 +81,7 @@ async def run_async(self, **kwargs: Any) -> AsyncIterator[FileData]: try: site = client.sites.get_by_url(self.connection_config.site).get().execute_query() - logger.info(f"[{self.connector_type}] Successfully retrieved site object: {site.url}") + logger.info(f"[{self.connector_type}] Successfully retrieved site object: {site.properties}") site_drive_item = site.drive.get().execute_query().root if site_drive_item is None: From 21d29223bb5c7042461df8b201900aff5f858f97 Mon Sep 17 00:00:00 2001 From: Mateusz Kuprowski Date: Fri, 7 Mar 2025 12:03:12 +0100 Subject: [PATCH 5/5] Black fix --- unstructured_ingest/v2/processes/connectors/sharepoint.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/unstructured_ingest/v2/processes/connectors/sharepoint.py b/unstructured_ingest/v2/processes/connectors/sharepoint.py index 7dd6ddc1d..20e13248a 100644 --- a/unstructured_ingest/v2/processes/connectors/sharepoint.py +++ b/unstructured_ingest/v2/processes/connectors/sharepoint.py @@ -81,7 +81,9 @@ async def run_async(self, **kwargs: Any) -> AsyncIterator[FileData]: try: site = client.sites.get_by_url(self.connection_config.site).get().execute_query() - logger.info(f"[{self.connector_type}] Successfully retrieved site object: {site.properties}") + logger.info( + f"[{self.connector_type}] Successfully retrieved site object: {site.properties}" + ) site_drive_item = site.drive.get().execute_query().root if site_drive_item is None: