Review note: this patch has been restored to one-row-per-line unified-diff layout.
HttpSourceRequest.reject_zip_url now checks the parsed URL path, so a fragment
(".../report.zip#section") can no longer bypass the ZIP rejection; a regression
test covers that case. Hunk headers for requests.py and the new test file are
adjusted for the added lines. The "index" blob ids are unchanged from the
original patch and so no longer match the new contents of those two files.
Indentation and blank context lines were inferred from the hunk line counts;
verify before applying.

diff --git a/docling/datamodel/service/__init__.py b/docling/datamodel/service/__init__.py
index c380324c6c..a326cf9d3b 100644
--- a/docling/datamodel/service/__init__.py
+++ b/docling/datamodel/service/__init__.py
@@ -25,8 +25,12 @@
     VlmModelLocal,
 )
 from docling.datamodel.service.requests import (
+    AnyHttpSourceRequest,
     BaseChunkDocumentsRequest,
+    BatchConvertSourcesRequest,
+    BatchSourceRequestItem,
     ConvertDocumentsRequest,
+    ConvertSourcesRequest,
     FileSourceRequest,
     GenericChunkDocumentsRequest,
     HttpSourceRequest,
@@ -37,19 +41,23 @@
     make_request_model,
 )
 from docling.datamodel.service.responses import (
+    ArtifactRef,
     ChunkDocumentResponse,
     ChunkedDocumentResult,
     ChunkedDocumentResultItem,
     ClearResponse,
     ConvertDocumentErrorResponse,
     ConvertDocumentResponse,
-    ConvertDocumentResult,
     DoclingTaskResult,
+    DocumentArtifactItem,
+    DocumentResultItem,
     ExportDocumentResponse,
     ExportResult,
     HealthCheckResponse,
     MessageKind,
+    PresignedArtifactResult,
     PresignedUrlConvertDocumentResponse,
+    PresignedUrlConvertResponse,
     ReadinessResponse,
     RemoteTargetResult,
     TaskStatusResponse,
@@ -59,6 +67,7 @@
 from docling.datamodel.service.sources import FileSource, HttpSource, S3Coordinates
 from docling.datamodel.service.targets import (
     InBodyTarget,
+    PresignedUrlTarget,
     PutTarget,
     S3Target,
     Target,
@@ -67,9 +76,13 @@
 from docling.datamodel.service.tasks import TaskProcessingMeta, TaskType
 
 __all__ = [
+    "AnyHttpSourceRequest",
+    "ArtifactRef",
     "BaseChunkDocumentsRequest",
     "BaseChunkerOptions",
     "BaseProgress",
+    "BatchConvertSourcesRequest",
+    "BatchSourceRequestItem",
     "CallbackSpec",
     "ChunkDocumentResponse",
     "ChunkedDocumentResult",
@@ -78,11 +91,13 @@
     "ClearResponse",
     "ConvertDocumentErrorResponse",
     "ConvertDocumentResponse",
-    "ConvertDocumentResult",
     "ConvertDocumentsOptions",
     "ConvertDocumentsRequest",
+    "ConvertSourcesRequest",
     "DoclingTaskResult",
+    "DocumentArtifactItem",
     "DocumentCompletedItem",
+    "DocumentResultItem",
     "ExportDocumentResponse",
     "ExportResult",
     "FailedDocsItem",
@@ -98,7 +113,10 @@
     "MessageKind",
     "PictureDescriptionApi",
     "PictureDescriptionLocal",
+    "PresignedArtifactResult",
     "PresignedUrlConvertDocumentResponse",
+    "PresignedUrlConvertResponse",
+    "PresignedUrlTarget",
     "ProgressCallbackRequest",
     "ProgressCallbackResponse",
     "ProgressDocumentCompleted",
diff --git a/docling/datamodel/service/requests.py b/docling/datamodel/service/requests.py
index 83114c7d26..695cda81ec 100644
--- a/docling/datamodel/service/requests.py
+++ b/docling/datamodel/service/requests.py
@@ -2,7 +2,7 @@
 from functools import cache
 from typing import Annotated, Generic, Literal
 
-from pydantic import BaseModel, Field
+from pydantic import AnyHttpUrl, BaseModel, Field, field_validator
 from typing_extensions import TypeVar
 
 from docling.datamodel.service.callbacks import CallbackSpec
@@ -13,6 +13,7 @@
 from docling.datamodel.service.sources import FileSource, HttpSource, S3Coordinates
 from docling.datamodel.service.targets import (
     InBodyTarget,
+    PresignedUrlTarget,
     PutTarget,
     S3Target,
     ZipTarget,
@@ -25,10 +26,24 @@ class FileSourceRequest(FileSource):
     kind: Literal["file"] = "file"
 
 
-class HttpSourceRequest(HttpSource):
+class AnyHttpSourceRequest(HttpSource):
     kind: Literal["http"] = "http"
 
 
+class HttpSourceRequest(AnyHttpSourceRequest):
+    """HTTP source for convert endpoints — rejects ZIP URLs."""
+
+    @field_validator("url")
+    @classmethod
+    def reject_zip_url(cls, value: AnyHttpUrl) -> AnyHttpUrl:
+        # Check the parsed path component only, so neither a query string nor
+        # a fragment can hide a ".zip" suffix from the check.
+        path = (value.path or "").lower()
+        if path.endswith(".zip"):
+            raise ValueError("ZIP URLs are not accepted on the convert endpoint")
+        return value
+
+
 class S3SourceRequest(S3Coordinates):
     kind: Literal["s3"] = "s3"
 
@@ -36,28 +51,44 @@ class S3SourceRequest(S3Coordinates):
 ## Multipart targets
 class TargetName(str, enum.Enum):
     INBODY = InBodyTarget().kind
+    PRESIGNED_URL = PresignedUrlTarget().kind
     ZIP = ZipTarget().kind
 
 
 ## Aliases
+BatchSourceRequestItem = Annotated[
+    FileSourceRequest | AnyHttpSourceRequest | S3SourceRequest,
+    Field(discriminator="kind"),
+]
+
 SourceRequestItem = Annotated[
-    FileSourceRequest | HttpSourceRequest | S3SourceRequest, Field(discriminator="kind")
+    FileSourceRequest | HttpSourceRequest, Field(discriminator="kind")
 ]
 
 TargetRequest = Annotated[
-    InBodyTarget | ZipTarget | S3Target | PutTarget,
+    InBodyTarget | ZipTarget | S3Target | PutTarget | PresignedUrlTarget,
     Field(discriminator="kind"),
 ]
 
 
 ## Complete Source request
-class ConvertDocumentsRequest(BaseModel):
+class BatchConvertSourcesRequest(BaseModel):
+    options: ConvertDocumentsOptions = ConvertDocumentsOptions()
+    sources: list[BatchSourceRequestItem]
+    target: TargetRequest = InBodyTarget()
+    callbacks: list[CallbackSpec] = []
+
+
+class ConvertSourcesRequest(BaseModel):
     options: ConvertDocumentsOptions = ConvertDocumentsOptions()
     sources: list[SourceRequestItem]
     target: TargetRequest = InBodyTarget()
     callbacks: list[CallbackSpec] = []
 
 
+## Deprecated aliases — will be removed in a future release
+ConvertDocumentsRequest = BatchConvertSourcesRequest
+
 ## Source chunking requests
 
 
diff --git a/docling/datamodel/service/responses.py b/docling/datamodel/service/responses.py
index 70c807137a..f158f2803e 100644
--- a/docling/datamodel/service/responses.py
+++ b/docling/datamodel/service/responses.py
@@ -1,9 +1,10 @@
 import enum
 import warnings
+from datetime import datetime
 from typing import Annotated, Literal, Optional
 
 from docling_core.types.doc.document import DoclingDocument
-from pydantic import BaseModel, Field
+from pydantic import AliasChoices, AnyUrl, BaseModel, ConfigDict, Field
 
 from docling.datamodel.base_models import ConversionStatus, ErrorItem
 from docling.datamodel.service.tasks import TaskProcessingMeta, TaskType
@@ -19,15 +20,41 @@ class ExportDocumentResponse(BaseModel):
     doctags_content: Optional[str] = None
 
 
-class ExportResult(BaseModel):
-    """Container of all exported content."""
+class DocumentResultItem(BaseModel):
+    """Canonical document-level result with legacy ExportResult wire compatibility."""
+
+    model_config = ConfigDict(populate_by_name=True, serialize_by_alias=True)
 
     kind: Literal["ExportResult"] = "ExportResult"
-    content: ExportDocumentResponse
+    document: ExportDocumentResponse = Field(
+        validation_alias=AliasChoices("document", "content"),
+        serialization_alias="content",
+    )
     status: ConversionStatus
     errors: list[ErrorItem] = []
     timings: dict[str, ProfilingItem] = {}
 
+    @property
+    def content(self) -> ExportDocumentResponse:
+        warnings.warn(
+            "DocumentResultItem.content is deprecated; use .document instead.",
+            DeprecationWarning,
+            stacklevel=2,
+        )
+        return self.document
+
+    @content.setter
+    def content(self, value: ExportDocumentResponse) -> None:
+        warnings.warn(
+            "DocumentResultItem.content is deprecated; use .document instead.",
+            DeprecationWarning,
+            stacklevel=2,
+        )
+        self.document = value
+
+
+ExportResult = DocumentResultItem
+
 
 class ZipArchiveResult(BaseModel):
     """Container for a zip archive of the conversion."""
@@ -42,6 +69,27 @@ class RemoteTargetResult(BaseModel):
     kind: Literal["RemoteTargetResult"] = "RemoteTargetResult"
 
 
+class ArtifactRef(BaseModel):
+    artifact_type: Literal[
+        "json", "html", "markdown", "text", "doctags", "resource_bundle"
+    ]
+    mime_type: str
+    uri: AnyUrl
+    url_expires_at: datetime | None = None
+
+
+class DocumentArtifactItem(BaseModel):
+    """Per-document result item for PresignedUrlTarget responses."""
+
+    source_index: int
+    source_uri: str
+    filename: str
+    status: ConversionStatus
+    errors: list[ErrorItem] = []
+    timings: dict[str, ProfilingItem] = {}
+    artifacts: list[ArtifactRef] = []
+
+
 class ChunkedDocumentResultItem(BaseModel):
     """A single chunk of a document with its metadata and content."""
 
@@ -91,8 +139,19 @@ class ChunkedDocumentResult(BaseModel):
     chunking_info: Optional[dict] = None
 
 
+class PresignedArtifactResult(BaseModel):
+    """Internal DoclingTaskResult.result union member for PresignedUrlTarget."""
+
+    kind: Literal["PresignedArtifactResult"] = "PresignedArtifactResult"
+    documents: list[DocumentArtifactItem]
+
+
 ResultType = Annotated[
-    ExportResult | ZipArchiveResult | RemoteTargetResult | ChunkedDocumentResult,
+    ExportResult
+    | ZipArchiveResult
+    | RemoteTargetResult
+    | ChunkedDocumentResult
+    | PresignedArtifactResult,
     Field(discriminator="kind"),
 ]
 
@@ -105,17 +164,6 @@ class DoclingTaskResult(BaseModel):
     num_failed: int
 
 
-class ConvertDocumentResult(DoclingTaskResult):
-    def __init__(self, *args, **kwargs):
-        warnings.warn(
-            "ConvertDocumentResult is deprecated and will be removed in a future version. "
-            "Use DoclingTaskResult instead.",
-            DeprecationWarning,
-            stacklevel=2,
-        )
-        super().__init__(*args, **kwargs)
-
-
 class HealthCheckResponse(BaseModel):
     status: str = "ok"
 
@@ -129,14 +177,40 @@ class ClearResponse(BaseModel):
 
 
 class ConvertDocumentResponse(BaseModel):
+    """Single-document inline response with task-level timing flattened in."""
+
     document: ExportDocumentResponse
     status: ConversionStatus
     errors: list[ErrorItem] = []
+    # Inline convert responses have no outer DoclingTaskResult envelope, so the
+    # task-level elapsed time is flattened onto this response model.
     processing_time: float
     timings: dict[str, ProfilingItem] = {}
 
 
+def _to_convert_document_response(
+    item: DocumentResultItem, processing_time: float
+) -> "ConvertDocumentResponse":
+    return ConvertDocumentResponse(
+        document=item.document,
+        status=item.status,
+        errors=item.errors,
+        processing_time=processing_time,
+        timings=item.timings,
+    )
+
+
 class PresignedUrlConvertDocumentResponse(BaseModel):
+    """Counts-only response model for remote targets without per-document artifacts."""
+
+    processing_time: float
+    num_converted: int
+    num_succeeded: int
+    num_failed: int
+
+
+class PresignedUrlConvertResponse(BaseModel):
+    documents: list[DocumentArtifactItem]
     processing_time: float
     num_converted: int
     num_succeeded: int
diff --git a/docling/datamodel/service/targets.py b/docling/datamodel/service/targets.py
index d384bec8ae..481d4c6154 100644
--- a/docling/datamodel/service/targets.py
+++ b/docling/datamodel/service/targets.py
@@ -22,7 +22,11 @@ class PutTarget(BaseModel):
     url: AnyHttpUrl
 
 
+class PresignedUrlTarget(BaseModel):
+    kind: Literal["presigned_url"] = "presigned_url"
+
+
 Target = Annotated[
-    InBodyTarget | ZipTarget | S3Target | PutTarget,
+    InBodyTarget | ZipTarget | S3Target | PutTarget | PresignedUrlTarget,
     Field(discriminator="kind"),
 ]
diff --git a/tests/test_service_datamodels.py b/tests/test_service_datamodels.py
new file mode 100644
index 0000000000..e0bce70d61
--- /dev/null
+++ b/tests/test_service_datamodels.py
@@ -0,0 +1,137 @@
+import pytest
+from pydantic import TypeAdapter, ValidationError
+
+from docling.datamodel.base_models import ConversionStatus
+from docling.datamodel.service.requests import (
+    AnyHttpSourceRequest,
+    ConvertSourcesRequest,
+    HttpSourceRequest,
+    TargetRequest,
+)
+from docling.datamodel.service.responses import (
+    ArtifactRef,
+    ConvertDocumentResponse,
+    DoclingTaskResult,
+    DocumentArtifactItem,
+    DocumentResultItem,
+    ExportDocumentResponse,
+    ExportResult,
+    PresignedArtifactResult,
+    _to_convert_document_response,
+)
+from docling.datamodel.service.targets import PresignedUrlTarget
+
+
+def test_http_source_request_rejects_zip_urls() -> None:
+    with pytest.raises(ValidationError, match="ZIP URLs are not accepted"):
+        HttpSourceRequest(url="https://example.com/report.zip")
+
+
+def test_http_source_request_rejects_zip_urls_with_fragment() -> None:
+    with pytest.raises(ValidationError, match="ZIP URLs are not accepted"):
+        HttpSourceRequest(url="https://example.com/report.zip#section")
+
+
+def test_any_http_source_request_allows_zip_urls() -> None:
+    request = AnyHttpSourceRequest(url="https://example.com/report.zip")
+
+    assert str(request.url) == "https://example.com/report.zip"
+
+
+def test_convert_sources_request_rejects_s3_sources() -> None:
+    with pytest.raises(ValidationError):
+        ConvertSourcesRequest.model_validate(
+            {
+                "sources": [
+                    {
+                        "kind": "s3",
+                        "endpoint": "s3.example.com",
+                        "access_key": "key",
+                        "secret_key": "secret",
+                        "bucket": "documents",
+                    }
+                ]
+            }
+        )
+
+
+def test_target_request_accepts_presigned_url_target() -> None:
+    parsed = TypeAdapter(TargetRequest).validate_python({"kind": "presigned_url"})
+
+    assert isinstance(parsed, PresignedUrlTarget)
+
+
+def test_document_result_item_maps_to_existing_wire_models() -> None:
+    item = DocumentResultItem(
+        document=ExportDocumentResponse(filename="example.pdf", md_content="# hello"),
+        status=ConversionStatus.SUCCESS,
+    )
+
+    convert_response = _to_convert_document_response(item, processing_time=1.25)
+
+    assert item.model_dump(mode="json") == ExportResult(
+        content=item.document,
+        status=item.status,
+        errors=item.errors,
+        timings=item.timings,
+    ).model_dump(mode="json")
+    assert convert_response.model_dump(mode="json") == ConvertDocumentResponse(
+        document=item.document,
+        status=item.status,
+        errors=item.errors,
+        processing_time=1.25,
+        timings=item.timings,
+    ).model_dump(mode="json")
+
+
+def test_document_result_item_accepts_legacy_content_field() -> None:
+    item = DocumentResultItem.model_validate(
+        {
+            "kind": "ExportResult",
+            "content": {"filename": "example.pdf", "md_content": "# hello"},
+            "status": ConversionStatus.SUCCESS,
+        }
+    )
+
+    assert item.document.filename == "example.pdf"
+    assert item.model_dump(mode="json")["content"]["filename"] == "example.pdf"
+
+
+def test_document_result_item_content_property_warns() -> None:
+    item = DocumentResultItem(
+        document=ExportDocumentResponse(filename="example.pdf"),
+        status=ConversionStatus.SUCCESS,
+    )
+
+    with pytest.warns(DeprecationWarning, match="use \\.document instead"):
+        content = item.content
+
+    assert content.filename == "example.pdf"
+
+
+def test_docling_task_result_accepts_presigned_artifact_results() -> None:
+    result = DoclingTaskResult(
+        result=PresignedArtifactResult(
+            documents=[
+                DocumentArtifactItem(
+                    source_index=0,
+                    source_uri="https://example.com/input.pdf",
+                    filename="input.pdf",
+                    status=ConversionStatus.SUCCESS,
+                    artifacts=[
+                        ArtifactRef(
+                            artifact_type="markdown",
+                            mime_type="text/markdown",
+                            uri="s3://converted/input.md",
+                        )
+                    ],
+                )
+            ]
+        ),
+        processing_time=0.5,
+        num_converted=1,
+        num_succeeded=1,
+        num_failed=0,
+    )
+
+    assert result.result.kind == "PresignedArtifactResult"