Skip to content

Commit 7ede56d

Browse files
committed
Separate the signed-URL ingestion functionality from the original ingest code and add new logging for crawldir signed-URL usage
1 parent a1e00b4 commit 7ede56d

3 files changed

Lines changed: 46 additions & 29 deletions

File tree

alephclient/api.py

Lines changed: 31 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -472,7 +472,6 @@ def ingest_upload(
472472
metadata: Optional[Dict] = None,
473473
sync: bool = False,
474474
index: bool = True,
475-
signed_url: bool = False,
476475
) -> Dict:
477476
"""
478477
Create an empty folder in a collection or upload a document to it
@@ -485,9 +484,6 @@ def ingest_upload(
485484
files, metadata contains foreign_id of the parent. Metadata for a
486485
directory contains foreign_id for itself as well as its parent and the
487486
name of the directory.
488-
signed_url: use the signed URL workflow for file uploads. When True,
489-
files are uploaded via a signed URL instead of multipart ingest.
490-
Directories always use the standard ingest endpoint.
491487
"""
492488
url_path = "collections/{0}/ingest".format(collection_id)
493489
params = {"sync": sync, "index": index}
@@ -496,9 +492,6 @@ def ingest_upload(
496492
data = {"meta": json.dumps(metadata)}
497493
return self._request("POST", url, data=data)
498494

499-
if signed_url:
500-
return self._signed_url_upload(collection_id, file_path, metadata, index)
501-
502495
for attempt in count(1):
503496
try:
504497
with file_path.open("rb") as fh:
@@ -517,27 +510,51 @@ def ingest_upload(
517510
backoff(ae, attempt)
518511
return {}
519512

520-
def _signed_url_upload(
513+
def signed_url_upload(
521514
self,
522515
collection_id: str,
523-
file_path: Path,
524-
metadata: Optional[Dict],
525-
index: bool,
516+
file_path: Optional[Path] = None,
517+
metadata: Optional[Dict] = None,
518+
index: bool = True,
526519
) -> Dict:
520+
"""
521+
Upload a document using the signed URL workflow.
522+
523+
For directories (no file), falls back to the standard ingest endpoint
524+
since there is no file content to upload.
525+
526+
The workflow is:
527+
1. POST /file/uploadUrl -> {url, id}
528+
2. PUT file content to the signed url
529+
3. POST /collections/{id}/document with the upload_id and metadata
530+
531+
params
532+
------
533+
collection_id: id of the collection to upload to
534+
file_path: path of the file to upload. None while creating folders
535+
metadata: dict containing metadata for the file or folders
536+
index: whether to index the document after creation
537+
"""
538+
if not file_path or file_path.is_dir():
539+
return self.ingest_upload(
540+
collection_id, file_path, metadata=metadata, index=index
541+
)
542+
527543
mime_type = mimetypes.guess_type(file_path.name)[0] or MIME
528544
meta = dict(metadata or {})
529545
meta["file_name"] = file_path.name
530546
meta["mime_type"] = mime_type
531547

532548
for attempt in count(1):
533549
try:
534-
# Request a signed upload URL
550+
# Step 1: request a signed upload URL
535551
upload_url = self._make_url("file/uploadUrl")
536552
result = self._request("POST", upload_url)
537553
signed_url = result["url"]
538554
upload_id = result["id"]
555+
log.info("Signed URL [%s]: %s", upload_id, signed_url)
539556

540-
# PUT file content to the signed URL
557+
# Step 2: PUT file content to the signed URL
541558
try:
542559
with file_path.open("rb") as fh:
543560
response = self.session.put(
@@ -549,9 +566,7 @@ def _signed_url_upload(
549566
except (RequestException, HTTPError) as exc:
550567
raise AlephException(exc) from exc
551568

552-
# Create document record.
553-
# The server returns an empty 200 when a document with
554-
# the same foreign_id already exists in the collection.
569+
# Step 3: create the document record
555570
doc_url_path = f"collections/{collection_id}/document"
556571
doc_url = self._make_url(doc_url_path, params={"index": index})
557572
payload = {"upload_id": upload_id, "meta": meta}

alephclient/crawldir.py

Lines changed: 13 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -130,16 +130,20 @@ def ingest_upload(self, path: Path, parent_id: str, foreign_id: str) -> str:
130130
log.info("Upload [%s->%s]: %s", self.collection_id, parent_id, foreign_id)
131131
if parent_id is not None:
132132
metadata["parent_id"] = parent_id
133-
kwargs = {}
134133
if self.signed_url:
135-
kwargs["signed_url"] = True
136-
result = self.api.ingest_upload(
137-
self.collection_id,
138-
path,
139-
metadata=metadata,
140-
index=self.index,
141-
**kwargs,
142-
)
134+
result = self.api.signed_url_upload(
135+
self.collection_id,
136+
path,
137+
metadata=metadata,
138+
index=self.index,
139+
)
140+
else:
141+
result = self.api.ingest_upload(
142+
self.collection_id,
143+
path,
144+
metadata=metadata,
145+
index=self.index,
146+
)
143147
if "id" not in result and not hasattr(result, "id"):
144148
raise AlephException("Upload failed")
145149
return result["id"]

alephclient/tests/test_tasks.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -122,7 +122,7 @@ def test_ingest(self, mocker):
122122
assert call in self.api.ingest_upload.mock_calls
123123

124124
def test_ingest_signed_url(self, mocker):
125-
mocker.patch.object(self.api, "ingest_upload", return_value={"id": 42})
125+
mocker.patch.object(self.api, "signed_url_upload", return_value={"id": 42})
126126
mocker.patch.object(
127127
self.api, "load_collection_by_foreign_id", return_value={"id": 2}
128128
)
@@ -136,6 +136,4 @@ def test_ingest_signed_url(self, mocker):
136136
True,
137137
signed_url=True,
138138
)
139-
assert self.api.ingest_upload.call_count == 6
140-
for call in self.api.ingest_upload.call_args_list:
141-
assert call.kwargs.get("signed_url") is True
139+
assert self.api.signed_url_upload.call_count == 6

0 commit comments

Comments
 (0)