File tree Expand file tree Collapse file tree
test/integration/connectors/utils/validation
processes/connectors/fsspec Expand file tree Collapse file tree Original file line number Diff line number Diff line change @@ -169,18 +169,25 @@ def run_expected_download_files_validation(
169169
170170def run_directory_structure_validation (expected_output_dir : Path , download_files : list [str ]):
171171 s3_keys_file = expected_output_dir / "expected_s3_keys.json"
172- with s3_keys_file .open ("r" ) as f :
173- s3_keys = json .load (f )["s3_keys" ]
174172
175- expected_paths = []
176- for s3_key in s3_keys :
177- hash_based_path = generate_hash_based_path (s3_key )
178- if hash_based_path :
179- expected_paths .append (str (hash_based_path ))
180-
181- expected_paths .sort ()
182- download_files .sort ()
183- assert expected_paths == download_files
173+ if s3_keys_file .exists ():
174+ with s3_keys_file .open ("r" ) as f :
175+ s3_keys = json .load (f )["s3_keys" ]
176+
177+ expected_paths = []
178+ for s3_key in s3_keys :
179+ hash_based_path = generate_hash_based_path (s3_key )
180+ if hash_based_path :
181+ expected_paths .append (str (hash_based_path ))
182+
183+ expected_paths .sort ()
184+ download_files .sort ()
185+ assert expected_paths == download_files
186+ else :
187+ directory_record = expected_output_dir / "directory_structure.json"
188+ with directory_record .open ("r" ) as f :
189+ directory_structure = json .load (f )["directory_structure" ]
190+ assert directory_structure == download_files
184191
185192
186193def update_fixtures (
Original file line number Diff line number Diff line change 88from unstructured_ingest .data_types .file_data import FileData
99from unstructured_ingest .interfaces .connector import BaseConnector
1010from unstructured_ingest .interfaces .process import BaseProcess
11- from unstructured_ingest .utils .filesystem import generate_hash_based_path
1211
1312
1413class DownloaderConfig (BaseModel ):
@@ -42,7 +41,8 @@ def get_download_path(self, file_data: FileData) -> Optional[Path]:
4241 if not rel_path :
4342 return None
4443
45- return generate_hash_based_path (rel_path , self .download_dir )
44+ rel_path = rel_path [1 :] if rel_path .startswith ("/" ) else rel_path
45+ return self .download_dir / Path (rel_path )
4646
4747 @staticmethod
4848 def is_float (value : str ):
Original file line number Diff line number Diff line change 2929 UploaderConfig ,
3030)
3131from unstructured_ingest .processes .connectors .fsspec .utils import sterilize_dict
32- from unstructured_ingest .utils .filesystem import mkdir_concurrent_safe
32+ from unstructured_ingest .utils .filesystem import generate_hash_based_path , mkdir_concurrent_safe
3333
3434if TYPE_CHECKING :
3535 from fsspec import AbstractFileSystem
@@ -270,6 +270,16 @@ class FsspecDownloader(Downloader):
270270 download_config : Optional [FsspecDownloaderConfigT ] = field (
271271 default_factory = lambda : FsspecDownloaderConfig ()
272272 )
273+
274+ def get_download_path (self , file_data : FileData ) -> Optional [Path ]:
275+ if not file_data .source_identifiers :
276+ return None
277+
278+ rel_path = file_data .source_identifiers .relative_path
279+ if not rel_path :
280+ return None
281+
282+ return generate_hash_based_path (rel_path , self .download_dir )
273283
274284 def is_async (self ) -> bool :
275285 with self .connection_config .get_client (protocol = self .protocol ) as client :
You can’t perform that action at this time.
0 commit comments