-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdirect.py
More file actions
863 lines (705 loc) · 31.5 KB
/
direct.py
File metadata and controls
863 lines (705 loc) · 31.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
"""Direct API methods for supported document processing tools.
This file provides convenient methods that wrap the Nutrient Build API
for supported document processing operations.
"""
from typing import TYPE_CHECKING, Any, Protocol
from nutrient_dws.file_handler import FileInput
if TYPE_CHECKING:
from nutrient_dws.builder import BuildAPIWrapper
from nutrient_dws.http_client import HTTPClient
class HasBuildMethod(Protocol):
    """Structural type for objects exposing ``build`` and ``_http_client``.

    Any object that offers both members with these signatures satisfies
    the protocol; no inheritance is required.
    """

    @property
    def _http_client(self) -> "HTTPClient":
        """Return the object's ``HTTPClient`` instance."""
        ...

    def build(self, input_file: FileInput) -> "BuildAPIWrapper":
        """Return a ``BuildAPIWrapper`` for ``input_file``."""
        ...
class DirectAPIMixin:
"""Mixin class containing Direct API methods.
These methods provide a simplified interface to common document
processing operations. They internally use the Build API.
Note: The API automatically converts supported document formats
(DOCX, XLSX, PPTX) to PDF when processing.
"""
def _process_file(
    self,
    tool: str,
    input_file: FileInput,
    output_path: str | None = None,
    **options: Any,
) -> bytes | None:
    """Placeholder for the single-tool processing hook.

    The mixin only declares the signature here; the working implementation
    is supplied by NutrientClient.

    Raises:
        NotImplementedError: Always, when this placeholder itself is called.
    """
    message = "This method is provided by NutrientClient"
    raise NotImplementedError(message)
def convert_to_pdf(
    self,
    input_file: FileInput,
    output_path: str | None = None,
) -> bytes | None:
    """Convert a document to PDF.

    Office documents (DOCX, XLSX, PPTX) are turned into PDFs by relying on
    the API's implicit conversion: uploading a non-PDF document with no
    actions returns it as a PDF.

    Args:
        input_file: Input document (DOCX, XLSX, PPTX, etc).
        output_path: Optional path to save the output PDF.

    Returns:
        Converted PDF as bytes, or None if output_path is provided.

    Raises:
        AuthenticationError: If API key is missing or invalid.
        APIError: For other API errors (e.g., unsupported format).

    Note:
        HTML files are not currently supported by the API.
    """
    # A builder with no actions is all that is needed; the conversion is
    # implicit. At runtime self is NutrientClient, which provides build().
    builder = self.build(input_file)  # type: ignore[attr-defined]
    return builder.execute(output_path)  # type: ignore[no-any-return]
def flatten_annotations(
    self, input_file: FileInput, output_path: str | None = None
) -> bytes | None:
    """Flatten annotations and form fields in a PDF.

    All annotations and form fields become static page content. An Office
    document given as input is converted to PDF first.

    Args:
        input_file: Input file (PDF or Office document).
        output_path: Optional path to save the output file.

    Returns:
        Processed file as bytes, or None if output_path is provided.

    Raises:
        AuthenticationError: If API key is missing or invalid.
        APIError: For other API errors.
    """
    tool_name = "flatten-annotations"
    return self._process_file(tool_name, input_file, output_path)
def rotate_pages(
    self,
    input_file: FileInput,
    output_path: str | None = None,
    degrees: int = 0,
    page_indexes: list[int] | None = None,
) -> bytes | None:
    """Rotate pages in a PDF.

    Either every page or only the listed pages are rotated by ``degrees``.
    An Office document given as input is converted to PDF first.

    Args:
        input_file: Input file (PDF or Office document).
        output_path: Optional path to save the output file.
        degrees: Rotation angle (90, 180, 270, or -90).
        page_indexes: Optional list of page indexes to rotate (0-based).

    Returns:
        Processed file as bytes, or None if output_path is provided.

    Raises:
        AuthenticationError: If API key is missing or invalid.
        APIError: For other API errors.
    """
    # page_indexes is forwarded only when the caller supplied it, so the
    # tool receives no key at all in the "rotate everything" case.
    page_selection: dict[str, Any] = (
        {} if page_indexes is None else {"page_indexes": page_indexes}
    )
    return self._process_file(
        "rotate-pages", input_file, output_path, degrees=degrees, **page_selection
    )
def ocr_pdf(
    self,
    input_file: FileInput,
    output_path: str | None = None,
    language: str = "english",
) -> bytes | None:
    """Apply OCR to a PDF to make it searchable.

    Optical character recognition is run over the PDF so that its text can
    be extracted and searched. An Office document given as input is
    converted to PDF first.

    Args:
        input_file: Input file (PDF or Office document).
        output_path: Optional path to save the output file.
        language: OCR language. Supported: "english", "eng", "deu", "german".
            Default is "english".

    Returns:
        Processed file as bytes, or None if output_path is provided.

    Raises:
        AuthenticationError: If API key is missing or invalid.
        APIError: For other API errors.
    """
    tool_options = {"language": language}
    return self._process_file("ocr-pdf", input_file, output_path, **tool_options)
def watermark_pdf(
    self,
    input_file: FileInput,
    output_path: str | None = None,
    text: str | None = None,
    image_url: str | None = None,
    image_file: FileInput | None = None,
    width: int = 200,
    height: int = 100,
    opacity: float = 1.0,
    position: str = "center",
) -> bytes | None:
    """Add a watermark to a PDF.

    A text or image watermark is added to all pages of the PDF. An Office
    document given as input is converted to PDF first.

    When ``image_file`` is given it takes precedence: the image is uploaded
    alongside the document in a direct ``/build`` request. Otherwise ``text``
    is used if set, and ``image_url`` only when ``text`` is empty.

    Args:
        input_file: Input file (PDF or Office document).
        output_path: Optional path to save the output file.
        text: Text to use as watermark. One of text, image_url, or image_file required.
        image_url: URL of image to use as watermark.
        image_file: Local image file to use as watermark (path, bytes, or file-like object).
            Supported formats: PNG, JPEG, TIFF.
        width: Width of the watermark in points (required).
        height: Height of the watermark in points (required).
        opacity: Opacity of the watermark (0.0 to 1.0).
        position: Position of watermark. One of: "top-left", "top-center",
            "top-right", "center", "bottom-left", "bottom-center",
            "bottom-right".

    Returns:
        Processed file as bytes, or None if output_path is provided.

    Raises:
        AuthenticationError: If API key is missing or invalid.
        APIError: For other API errors.
        ValueError: If none of text, image_url, or image_file is provided.
    """
    if not (text or image_url or image_file):
        raise ValueError("Either text, image_url, or image_file must be provided")

    # Geometry and appearance settings are the same for every watermark kind.
    placement = {
        "width": width,
        "height": height,
        "opacity": opacity,
        "position": position,
    }

    if not image_file:
        # Text and URL watermarks go through the regular single-tool path.
        content = {"text": text} if text else {"image_url": image_url}
        return self._process_file(
            "watermark-pdf", input_file, output_path, **placement, **content
        )

    # A local image has to be uploaded with the document, so the request is
    # assembled by hand instead of going through _process_file.
    from nutrient_dws.file_handler import prepare_file_for_upload, save_file_output

    document_key, document_payload = prepare_file_for_upload(input_file, "file")
    image_key, image_payload = prepare_file_for_upload(image_file, "watermark")
    uploads = {document_key: document_payload, image_key: image_payload}

    # "image" names the multipart field that carries the uploaded watermark.
    watermark_action = {"type": "watermark", **placement, "image": "watermark"}
    request_body = {"parts": [{"file": "file"}], "actions": [watermark_action]}

    # At runtime self is NutrientClient, which provides _http_client.
    response = self._http_client.post(  # type: ignore[attr-defined]
        "/build",
        files=uploads,
        json_data=request_body,
    )

    if not output_path:
        return response  # type: ignore[no-any-return]
    save_file_output(response, output_path)
    return None
def apply_redactions(
    self,
    input_file: FileInput,
    output_path: str | None = None,
) -> bytes | None:
    """Apply redaction annotations to permanently remove content.

    Every redaction annotation present in the PDF is applied, permanently
    removing the content underneath it. An Office document given as input
    is converted to PDF first.

    Args:
        input_file: Input file (PDF or Office document).
        output_path: Optional path to save the output file.

    Returns:
        Processed file as bytes, or None if output_path is provided.

    Raises:
        AuthenticationError: If API key is missing or invalid.
        APIError: For other API errors.
    """
    tool_name = "apply-redactions"
    return self._process_file(tool_name, input_file, output_path)
def split_pdf(
    self,
    input_file: FileInput,
    page_ranges: list[dict[str, int]] | None = None,
    output_paths: list[str] | None = None,
) -> list[bytes]:
    """Split a PDF into multiple documents by page ranges.

    Each entry of ``page_ranges`` is sent as its own ``/build`` request and
    yields one output document.

    Args:
        input_file: Input PDF file.
        page_ranges: List of page range dictionaries, forwarded to the API
            unchanged. Each dict can contain:
            - 'start': Starting page index (0-based)
            - 'end': Ending page index (0-based); omit to run to the last page
            When omitted or empty, a single range covering only the first
            page is used. The document is NOT split into individual pages
            automatically, because the page count is not known client-side.
        output_paths: Optional list of paths to save output files.
            Must match length of page_ranges if provided.

    Returns:
        List of PDF bytes for each split, or empty list if output_paths provided.

    Raises:
        AuthenticationError: If API key is missing or invalid.
        APIError: For other API errors.
        ValueError: If more than 50 page ranges are given, or if page_ranges
            and output_paths differ in length.

    Note:
        NOTE(review): 'end' is treated as inclusive here, matching the API's
        page-range schema (default end of -1 meaning the last page) and the
        single-page ranges built by duplicate_pdf_pages. Confirm against the
        API reference.

    Examples:
        # Extract the first page (default)
        first = client.split_pdf("document.pdf")

        # Split by custom ranges
        parts = client.split_pdf(
            "document.pdf",
            page_ranges=[
                {"start": 0, "end": 4},   # Pages 1-5
                {"start": 5, "end": 9},   # Pages 6-10
                {"start": 10}             # Pages 11 to end
            ]
        )

        # Save to specific files
        client.split_pdf(
            "document.pdf",
            page_ranges=[{"start": 0, "end": 1}, {"start": 2}],
            output_paths=["part1.pdf", "part2.pdf"]
        )
    """
    # Validate first so bad arguments fail before any file or network work.
    if not page_ranges:
        # Default behavior: extract first page only (inclusive end).
        page_ranges = [{"start": 0, "end": 0}]
    if len(page_ranges) > 50:
        raise ValueError("Maximum 50 page ranges allowed")
    if output_paths and len(output_paths) != len(page_ranges):
        raise ValueError("output_paths length must match page_ranges length")

    from nutrient_dws.file_handler import prepare_file_for_upload, save_file_output

    results: list[bytes] = []
    # Process each page range as a separate API call.
    for i, page_range in enumerate(page_ranges):
        # The upload is prepared again for every request.
        # NOTE(review): confirm prepare_file_for_upload can re-read a
        # file-like input on the second and later iterations.
        file_field, file_data = prepare_file_for_upload(input_file, "file")
        files = {file_field: file_data}

        instructions = {"parts": [{"file": "file", "pages": page_range}], "actions": []}

        # At runtime self is NutrientClient, which provides _http_client.
        result = self._http_client.post(  # type: ignore[attr-defined]
            "/build",
            files=files,
            json_data=instructions,
        )

        if output_paths:
            # Lengths were checked above, so index i always exists.
            save_file_output(result, output_paths[i])
        else:
            results.append(result)  # type: ignore[arg-type]

    # results stays empty whenever every part was written to disk.
    return results
def duplicate_pdf_pages(
    self,
    input_file: FileInput,
    page_indexes: list[int],
    output_path: str | None = None,
) -> bytes | None:
    """Duplicate specific pages within a PDF document.

    Creates a new PDF containing the specified pages in the order provided.
    Pages can be duplicated multiple times by including their index multiple times.

    Args:
        input_file: Input PDF file.
        page_indexes: List of page indexes to include (0-based).
            Pages can be repeated to create duplicates.
            Negative indexes are supported (-1 for last page).
        output_path: Optional path to save the output file.

    Returns:
        Processed PDF as bytes, or None if output_path is provided.

    Raises:
        AuthenticationError: If API key is missing or invalid.
        APIError: For other API errors.
        ValueError: If page_indexes is empty.

    Examples:
        # Duplicate first page twice, then include second page
        result = client.duplicate_pdf_pages(
            "document.pdf",
            page_indexes=[0, 0, 1]  # Page 1, Page 1, Page 2
        )

        # Include last page at beginning and end
        result = client.duplicate_pdf_pages(
            "document.pdf",
            page_indexes=[-1, 0, 1, 2, -1]  # Last, First, Second, Third, Last
        )

        # Save to specific file
        client.duplicate_pdf_pages(
            "document.pdf",
            page_indexes=[0, 2, 1],  # Reorder: Page 1, Page 3, Page 2
            output_path="reordered.pdf"
        )
    """
    # Validate first so bad arguments fail before any file or network work.
    if not page_indexes:
        raise ValueError("page_indexes cannot be empty")

    from nutrient_dws.file_handler import prepare_file_for_upload, save_file_output

    file_field, file_data = prepare_file_for_upload(input_file, "file")
    files = {file_field: file_data}

    # One single-page part per requested index, in the requested order.
    # Negative and non-negative indexes are sent the same way: the index is
    # used as both start and end of the range.
    parts = [
        {"file": "file", "pages": {"start": page_index, "end": page_index}}
        for page_index in page_indexes
    ]
    instructions = {"parts": parts, "actions": []}

    # At runtime self is NutrientClient, which provides _http_client.
    result = self._http_client.post(  # type: ignore[attr-defined]
        "/build",
        files=files,
        json_data=instructions,
    )

    if output_path:
        save_file_output(result, output_path)
        return None
    return result  # type: ignore[no-any-return]
def delete_pdf_pages(
    self,
    input_file: FileInput,
    page_indexes: list[int],
    output_path: str | None = None,
) -> bytes | None:
    """Delete specific pages from a PDF document.

    Creates a new PDF with the specified pages removed. The request works
    by selecting all pages except those to be deleted: the gaps between the
    deleted indexes, followed by everything after the last deleted index.

    Args:
        input_file: Input PDF file.
        page_indexes: List of page indexes to delete (0-based). 0 = first page.
            Order and duplicates do not matter; the list is de-duplicated
            and sorted internally.
            Negative indexes are NOT supported.
        output_path: Optional path to save the output file.

    Returns:
        Processed PDF as bytes, or None if output_path is provided.

    Raises:
        AuthenticationError: If API key is missing or invalid.
        APIError: For other API errors.
        ValueError: If page_indexes is empty or contains negative indexes.

    Note:
        The page count is not known client-side, so the pages after the
        last deleted index are always requested with an open-ended range.
        NOTE(review): when the final page of the document is deleted, that
        range starts past the end; confirm how the API responds.
        NOTE(review): range ends are treated as inclusive, matching the
        API's page-range schema (default end of -1 meaning the last page)
        and duplicate_pdf_pages. Confirm against the API reference.

    Examples:
        # Delete the first and third pages (negative indexes not supported)
        result = client.delete_pdf_pages(
            "document.pdf",
            page_indexes=[0, 2]  # Delete pages 1 and 3
        )

        # Delete specific pages (2nd and 4th pages)
        result = client.delete_pdf_pages(
            "document.pdf",
            page_indexes=[1, 3]  # 0-based indexing
        )

        # Save to specific file
        client.delete_pdf_pages(
            "document.pdf",
            page_indexes=[2, 4, 5],
            output_path="pages_deleted.pdf"
        )
    """
    # Validate first so bad arguments fail before any file or network work.
    if not page_indexes:
        raise ValueError("page_indexes cannot be empty")
    negative_indexes = [idx for idx in page_indexes if idx < 0]
    if negative_indexes:
        raise ValueError(
            f"Negative page indexes not yet supported for deletion: {negative_indexes}"
        )

    from nutrient_dws.file_handler import prepare_file_for_upload, save_file_output

    file_field, file_data = prepare_file_for_upload(input_file, "file")
    files = {file_field: file_data}

    # Walk the deleted indexes in ascending order and emit the run of kept
    # pages that precedes each one.
    parts: list[dict[str, Any]] = []
    keep_from = 0
    for delete_index in sorted(set(page_indexes)):
        if keep_from < delete_index:
            # Inclusive end: stop on the page just before the deleted one.
            parts.append(
                {"file": "file", "pages": {"start": keep_from, "end": delete_index - 1}}
            )
        keep_from = delete_index + 1

    # Always keep whatever follows the last deleted page; leaving this out
    # would silently truncate the document.
    parts.append({"file": "file", "pages": {"start": keep_from}})

    instructions = {"parts": parts, "actions": []}

    # At runtime self is NutrientClient, which provides _http_client.
    result = self._http_client.post(  # type: ignore[attr-defined]
        "/build",
        files=files,
        json_data=instructions,
    )

    if output_path:
        save_file_output(result, output_path)
        return None
    return result  # type: ignore[no-any-return]
def merge_pdfs(
    self,
    input_files: list[FileInput],
    output_path: str | None = None,
) -> bytes | None:
    """Merge multiple PDF files into one.

    The files are combined into a single PDF in the order provided.
    Office documents (DOCX, XLSX, PPTX) will be automatically converted
    to PDF before merging.

    Args:
        input_files: List of input files (PDFs or Office documents).
        output_path: Optional path to save the output file.

    Returns:
        Merged PDF as bytes, or None if output_path is provided.

    Raises:
        AuthenticationError: If API key is missing or invalid.
        APIError: For other API errors.
        ValueError: If less than 2 files provided.

    Example:
        # Merge PDFs and Office documents
        client.merge_pdfs([
            "document1.pdf",
            "document2.docx",
            "spreadsheet.xlsx"
        ], "merged.pdf")
    """
    if len(input_files) < 2:
        raise ValueError("At least 2 files required for merge")

    from nutrient_dws.file_handler import prepare_file_for_upload, save_file_output

    # Every input gets its own multipart field, file0, file1, ..., and one
    # part referring to that field; part order is the merge order.
    uploads = {}
    merge_parts = []
    for position, source in enumerate(input_files):
        part_name = f"file{position}"
        upload_key, payload = prepare_file_for_upload(source, part_name)
        uploads[upload_key] = payload
        merge_parts.append({"file": part_name})

    # Merging needs no actions: listing the parts is enough.
    request_body = {"parts": merge_parts, "actions": []}

    # At runtime self is NutrientClient, which provides _http_client.
    merged = self._http_client.post(  # type: ignore[attr-defined]
        "/build",
        files=uploads,
        json_data=request_body,
    )

    if not output_path:
        return merged  # type: ignore[no-any-return]
    save_file_output(merged, output_path)
    return None
def add_page(
    self,
    input_file: FileInput,
    insert_index: int,
    page_count: int = 1,
    page_size: str = "A4",
    orientation: str = "portrait",
    output_path: str | None = None,
) -> bytes | None:
    """Add blank pages to a PDF document.

    Inserts blank pages at the specified insertion index in the document.

    Args:
        input_file: Input PDF file.
        insert_index: Position to insert pages (0-based insertion index).
            0 = insert before first page (at beginning)
            1 = insert before second page (after first page)
            -1 = insert after last page (at end)
        page_count: Number of blank pages to add, from 1 to 100 (default: 1).
        page_size: Page size for new pages. Common values: "A4", "Letter",
            "Legal", "A3", "A5" (default: "A4").
        orientation: Page orientation. Either "portrait" or "landscape"
            (default: "portrait").
        output_path: Optional path to save the output file.

    Returns:
        Processed PDF as bytes, or None if output_path is provided.

    Raises:
        AuthenticationError: If API key is missing or invalid.
        APIError: For other API errors.
        ValueError: If page_count is less than 1 or greater than 100, or if
            insert_index is a negative number other than -1.

    Note:
        NOTE(review): range ends are treated as inclusive, matching the
        API's page-range schema (default end of -1 meaning the last page)
        and duplicate_pdf_pages. Confirm against the API reference.

    Examples:
        # Add a single blank page at the beginning
        result = client.add_page("document.pdf", insert_index=0)

        # Add multiple pages at the end
        result = client.add_page(
            "document.pdf",
            insert_index=-1,  # Insert at end
            page_count=3,
            page_size="Letter",
            orientation="landscape"
        )

        # Add pages before third page and save to file
        client.add_page(
            "document.pdf",
            insert_index=2,  # Insert before third page
            page_count=2,
            output_path="with_blank_pages.pdf"
        )
    """
    # Validate first so bad arguments fail before any file or network work.
    if page_count < 1:
        raise ValueError("page_count must be at least 1")
    if page_count > 100:
        raise ValueError("page_count cannot exceed 100 pages")
    if insert_index < -1:
        raise ValueError("insert_index must be -1 (for end) or a non-negative insertion index")

    from nutrient_dws.file_handler import prepare_file_for_upload, save_file_output

    file_field, file_data = prepare_file_for_upload(input_file, "file")
    files = {file_field: file_data}

    # Part describing the run of blank pages to insert.
    new_page_part: dict[str, Any] = {
        "page": "new",
        "pageCount": page_count,
        "layout": {
            "size": page_size,
            "orientation": orientation,
        },
    }
    whole_document: dict[str, Any] = {"file": "file"}

    parts: list[dict[str, Any]]
    if insert_index == -1:
        # Insert at end: all original pages first, then the new pages.
        parts = [whole_document, new_page_part]
    elif insert_index == 0:
        # Insert at beginning: the new pages first, then all original pages.
        parts = [new_page_part, whole_document]
    else:
        # Insert in the middle: pages 0..insert_index-1, the new pages, then
        # everything from insert_index on. The end is inclusive, so the
        # leading range must stop at insert_index - 1 or that page would be
        # emitted twice.
        parts = [
            {"file": "file", "pages": {"start": 0, "end": insert_index - 1}},
            new_page_part,
            {"file": "file", "pages": {"start": insert_index}},
        ]

    instructions = {"parts": parts, "actions": []}

    # At runtime self is NutrientClient, which provides _http_client.
    result = self._http_client.post(  # type: ignore[attr-defined]
        "/build",
        files=files,
        json_data=instructions,
    )

    if output_path:
        save_file_output(result, output_path)
        return None
    return result  # type: ignore[no-any-return]
def set_page_label(
    self,
    input_file: FileInput,
    labels: list[dict[str, Any]],
    output_path: str | None = None,
) -> bytes | None:
    """Set labels for specific pages in a PDF.

    Custom labels/numbering are assigned to page ranges of the document.
    Each label configuration names a page range and the label text for it;
    the configurations are sent in the ``output.labels`` section of the
    build instructions.

    Args:
        input_file: Input PDF file.
        labels: List of label configurations. Each dict must contain:
            - 'pages': Page range dict with 'start' (required) and optionally 'end'
            - 'label': String label to apply to those pages
            Page ranges use 0-based indexing. Only 'start' and 'end' are
            forwarded; a range without 'end' runs to the end of the document.
        output_path: Optional path to save the output file.

    Returns:
        Processed PDF as bytes, or None if output_path is provided.

    Raises:
        AuthenticationError: If API key is missing or invalid.
        APIError: For other API errors.
        ValueError: If labels list is empty or contains invalid configurations.

    Examples:
        # Set labels for different page ranges
        client.set_page_label(
            "document.pdf",
            labels=[
                {"pages": {"start": 0, "end": 3}, "label": "Introduction"},
                {"pages": {"start": 3, "end": 10}, "label": "Chapter 1"},
                {"pages": {"start": 10}, "label": "Appendix"}
            ],
            output_path="labeled_document.pdf"
        )

        # Set a label starting at the first page
        client.set_page_label(
            "document.pdf",
            labels=[{"pages": {"start": 0, "end": 1}, "label": "Cover Page"}]
        )
    """
    from nutrient_dws.file_handler import prepare_file_for_upload, save_file_output

    if not labels:
        raise ValueError("labels list cannot be empty")

    # Check each configuration and copy only the keys that are forwarded.
    cleaned_labels = []
    for position, entry in enumerate(labels):
        if not isinstance(entry, dict):
            raise ValueError(f"Label configuration {position} must be a dictionary")
        if "pages" not in entry:
            raise ValueError(f"Label configuration {position} missing required 'pages' key")
        if "label" not in entry:
            raise ValueError(f"Label configuration {position} missing required 'label' key")

        page_spec = entry["pages"]
        if not (isinstance(page_spec, dict) and "start" in page_spec):
            raise ValueError(
                f"Label configuration {position} 'pages' must be a dict with 'start' key"
            )

        # 'start' is guaranteed by the check above; 'end' is copied only when
        # the caller gave one, so an open-ended range stays open-ended.
        page_range = {key: page_spec[key] for key in ("start", "end") if key in page_spec}
        cleaned_labels.append({"pages": page_range, "label": entry["label"]})

    upload_key, payload = prepare_file_for_upload(input_file, "file")
    uploads = {upload_key: payload}

    # The labels belong to the output configuration, not to an action.
    request_body = {
        "parts": [{"file": "file"}],
        "actions": [],
        "output": {"labels": cleaned_labels},
    }

    # At runtime self is NutrientClient, which provides _http_client.
    response = self._http_client.post(  # type: ignore[attr-defined]
        "/build",
        files=uploads,
        json_data=request_body,
    )

    if not output_path:
        return response  # type: ignore[no-any-return]
    save_file_output(response, output_path)
    return None