Skip to content

Commit eedf93c

Browse files
committed
target s3
1 parent 3102775 commit eedf93c

3 files changed

Lines changed: 31 additions & 14 deletions

File tree

test/integration/connectors/utils/validation/source.py

Lines changed: 18 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -169,18 +169,25 @@ def run_expected_download_files_validation(
169169

170170
def run_directory_structure_validation(expected_output_dir: Path, download_files: list[str]):
171171
s3_keys_file = expected_output_dir / "expected_s3_keys.json"
172-
with s3_keys_file.open("r") as f:
173-
s3_keys = json.load(f)["s3_keys"]
174172

175-
expected_paths = []
176-
for s3_key in s3_keys:
177-
hash_based_path = generate_hash_based_path(s3_key)
178-
if hash_based_path:
179-
expected_paths.append(str(hash_based_path))
180-
181-
expected_paths.sort()
182-
download_files.sort()
183-
assert expected_paths == download_files
173+
if s3_keys_file.exists():
174+
with s3_keys_file.open("r") as f:
175+
s3_keys = json.load(f)["s3_keys"]
176+
177+
expected_paths = []
178+
for s3_key in s3_keys:
179+
hash_based_path = generate_hash_based_path(s3_key)
180+
if hash_based_path:
181+
expected_paths.append(str(hash_based_path))
182+
183+
expected_paths.sort()
184+
download_files.sort()
185+
assert expected_paths == download_files
186+
else:
187+
directory_record = expected_output_dir / "directory_structure.json"
188+
with directory_record.open("r") as f:
189+
directory_structure = json.load(f)["directory_structure"]
190+
assert directory_structure == download_files
184191

185192

186193
def update_fixtures(

unstructured_ingest/interfaces/downloader.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,6 @@
88
from unstructured_ingest.data_types.file_data import FileData
99
from unstructured_ingest.interfaces.connector import BaseConnector
1010
from unstructured_ingest.interfaces.process import BaseProcess
11-
from unstructured_ingest.utils.filesystem import generate_hash_based_path
1211

1312

1413
class DownloaderConfig(BaseModel):
@@ -42,7 +41,8 @@ def get_download_path(self, file_data: FileData) -> Optional[Path]:
4241
if not rel_path:
4342
return None
4443

45-
return generate_hash_based_path(rel_path, self.download_dir)
44+
rel_path = rel_path[1:] if rel_path.startswith("/") else rel_path
45+
return self.download_dir / Path(rel_path)
4646

4747
@staticmethod
4848
def is_float(value: str):

unstructured_ingest/processes/connectors/fsspec/fsspec.py

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@
2929
UploaderConfig,
3030
)
3131
from unstructured_ingest.processes.connectors.fsspec.utils import sterilize_dict
32-
from unstructured_ingest.utils.filesystem import mkdir_concurrent_safe
32+
from unstructured_ingest.utils.filesystem import generate_hash_based_path, mkdir_concurrent_safe
3333

3434
if TYPE_CHECKING:
3535
from fsspec import AbstractFileSystem
@@ -270,6 +270,16 @@ class FsspecDownloader(Downloader):
270270
download_config: Optional[FsspecDownloaderConfigT] = field(
271271
default_factory=lambda: FsspecDownloaderConfig()
272272
)
273+
274+
def get_download_path(self, file_data: FileData) -> Optional[Path]:
275+
if not file_data.source_identifiers:
276+
return None
277+
278+
rel_path = file_data.source_identifiers.relative_path
279+
if not rel_path:
280+
return None
281+
282+
return generate_hash_based_path(rel_path, self.download_dir)
273283

274284
def is_async(self) -> bool:
275285
with self.connection_config.get_client(protocol=self.protocol) as client:

0 commit comments

Comments
 (0)