Skip to content

Commit b71bcec

Browse files
committed
more fixes
1 parent 715ccfd commit b71bcec

File tree

1 file changed

+6
-7
lines changed

1 file changed

+6
-7
lines changed

src/unstructured_client/_hooks/custom/split_pdf_hook.py

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -294,7 +294,6 @@ def before_request(
294294
# We need to pass it on to after_success so
295295
# we know which results are ours
296296
operation_id = str(uuid.uuid4())
297-
self.operation_timeouts[operation_id] = _get_request_timeout_seconds(request)
298297

299298
content_type = request.headers.get("Content-Type")
300299
if content_type is None:
@@ -336,16 +335,13 @@ def before_request(
336335
fallback_value=DEFAULT_ALLOW_FAILED,
337336
)
338337

339-
self.concurrency_level[operation_id] = form_utils.get_split_pdf_concurrency_level_param(
338+
concurrency_level = form_utils.get_split_pdf_concurrency_level_param(
340339
form_data,
341340
key=PARTITION_FORM_CONCURRENCY_LEVEL_KEY,
342341
fallback_value=DEFAULT_CONCURRENCY_LEVEL,
343342
max_allowed=MAX_CONCURRENCY_LEVEL,
344343
)
345344

346-
executor = futures.ThreadPoolExecutor(max_workers=1)
347-
self.executors[operation_id] = executor
348-
349345
self.cache_tmp_data_feature = form_utils.get_split_pdf_cache_tmp_data(
350346
form_data,
351347
key=PARTITION_FORM_SPLIT_CACHE_TMP_DATA_KEY,
@@ -367,14 +363,17 @@ def before_request(
367363
page_count = page_range_end - page_range_start + 1
368364

369365
split_size = get_optimal_split_size(
370-
num_pages=page_count, concurrency_level=self.concurrency_level[operation_id]
366+
num_pages=page_count, concurrency_level=concurrency_level
371367
)
372368

373369
# If the doc is small enough, and we aren't slicing it with a page range:
374370
# do not split, just continue with the original request
375371
if split_size >= page_count and page_count == len(pdf.pages):
376372
return request
377373

374+
self.concurrency_level[operation_id] = concurrency_level
375+
self.executors[operation_id] = futures.ThreadPoolExecutor(max_workers=1)
376+
378377
pdf = self._trim_large_pages(pdf, form_data)
379378

380379
pdf.stream.seek(0)
@@ -420,7 +419,7 @@ def before_request(
420419
self.coroutines_to_execute[operation_id].append(coroutine)
421420
set_index += 1
422421

423-
# Track the operation_id so after_error can clean up if the dummy request fails.
422+
self.operation_timeouts[operation_id] = _get_request_timeout_seconds(request)
424423
self.pending_operation_ids[hook_ctx.operation_id] = operation_id
425424

426425
return httpx.Request(

0 commit comments

Comments
 (0)