Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 20 additions & 2 deletions docling/datamodel/service/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,12 @@
VlmModelLocal,
)
from docling.datamodel.service.requests import (
AnyHttpSourceRequest,
BaseChunkDocumentsRequest,
BatchConvertSourcesRequest,
BatchSourceRequestItem,
ConvertDocumentsRequest,
ConvertSourcesRequest,
FileSourceRequest,
GenericChunkDocumentsRequest,
HttpSourceRequest,
Expand All @@ -37,19 +41,23 @@
make_request_model,
)
from docling.datamodel.service.responses import (
ArtifactRef,
ChunkDocumentResponse,
ChunkedDocumentResult,
ChunkedDocumentResultItem,
ClearResponse,
ConvertDocumentErrorResponse,
ConvertDocumentResponse,
ConvertDocumentResult,
DoclingTaskResult,
DocumentArtifactItem,
DocumentResultItem,
ExportDocumentResponse,
ExportResult,
HealthCheckResponse,
MessageKind,
PresignedArtifactResult,
PresignedUrlConvertDocumentResponse,
PresignedUrlConvertResponse,
ReadinessResponse,
RemoteTargetResult,
TaskStatusResponse,
Expand All @@ -59,6 +67,7 @@
from docling.datamodel.service.sources import FileSource, HttpSource, S3Coordinates
from docling.datamodel.service.targets import (
InBodyTarget,
PresignedUrlTarget,
PutTarget,
S3Target,
Target,
Expand All @@ -67,9 +76,13 @@
from docling.datamodel.service.tasks import TaskProcessingMeta, TaskType

__all__ = [
"AnyHttpSourceRequest",
"ArtifactRef",
"BaseChunkDocumentsRequest",
"BaseChunkerOptions",
"BaseProgress",
"BatchConvertSourcesRequest",
"BatchSourceRequestItem",
"CallbackSpec",
"ChunkDocumentResponse",
"ChunkedDocumentResult",
Expand All @@ -78,11 +91,13 @@
"ClearResponse",
"ConvertDocumentErrorResponse",
"ConvertDocumentResponse",
"ConvertDocumentResult",
"ConvertDocumentsOptions",
"ConvertDocumentsRequest",
"ConvertSourcesRequest",
"DoclingTaskResult",
"DocumentArtifactItem",
"DocumentCompletedItem",
"DocumentResultItem",
"ExportDocumentResponse",
"ExportResult",
"FailedDocsItem",
Expand All @@ -98,7 +113,10 @@
"MessageKind",
"PictureDescriptionApi",
"PictureDescriptionLocal",
"PresignedArtifactResult",
"PresignedUrlConvertDocumentResponse",
"PresignedUrlConvertResponse",
"PresignedUrlTarget",
"ProgressCallbackRequest",
"ProgressCallbackResponse",
"ProgressDocumentCompleted",
Expand Down
39 changes: 34 additions & 5 deletions docling/datamodel/service/requests.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from functools import cache
from typing import Annotated, Generic, Literal

from pydantic import BaseModel, Field
from pydantic import AnyHttpUrl, BaseModel, Field, field_validator
from typing_extensions import TypeVar

from docling.datamodel.service.callbacks import CallbackSpec
Expand All @@ -13,6 +13,7 @@
from docling.datamodel.service.sources import FileSource, HttpSource, S3Coordinates
from docling.datamodel.service.targets import (
InBodyTarget,
PresignedUrlTarget,
PutTarget,
S3Target,
ZipTarget,
Expand All @@ -25,39 +26,67 @@ class FileSourceRequest(FileSource):
kind: Literal["file"] = "file"


class HttpSourceRequest(HttpSource):
class AnyHttpSourceRequest(HttpSource):
kind: Literal["http"] = "http"


class HttpSourceRequest(AnyHttpSourceRequest):
    """HTTP source for convert endpoints — rejects ZIP URLs."""

    @field_validator("url")
    @classmethod
    def reject_zip_url(cls, value: AnyHttpUrl) -> AnyHttpUrl:
        """Reject URLs whose path component ends in ``.zip``.

        Only the path is inspected, so neither a query string nor a fragment
        can hide a ZIP path (``/a.zip#x``) or trigger a false rejection
        (``/page#notes.zip``). The comparison is case-insensitive.

        Args:
            value: The already-validated URL of the HTTP source.

        Returns:
            The unchanged URL when its path does not end in ``.zip``.

        Raises:
            ValueError: If the URL path ends in ``.zip``.
        """
        # Function-local import keeps this block self-contained; the module's
        # top-level imports are left untouched.
        from urllib.parse import urlsplit

        # urlsplit separates query and fragment from the path; splitting on
        # "?" alone would leave a trailing "#fragment" attached to the path.
        path = urlsplit(str(value)).path.lower()
        if path.endswith(".zip"):
            raise ValueError("ZIP URLs are not accepted on the convert endpoint")
        return value


class S3SourceRequest(S3Coordinates):
kind: Literal["s3"] = "s3"


## Multipart targets
class TargetName(str, enum.Enum):
INBODY = InBodyTarget().kind
PRESIGNED_URL = PresignedUrlTarget().kind
ZIP = ZipTarget().kind


## Aliases
BatchSourceRequestItem = Annotated[
FileSourceRequest | AnyHttpSourceRequest | S3SourceRequest,
Field(discriminator="kind"),
]

SourceRequestItem = Annotated[
FileSourceRequest | HttpSourceRequest | S3SourceRequest, Field(discriminator="kind")
FileSourceRequest | HttpSourceRequest, Field(discriminator="kind")
]

TargetRequest = Annotated[
InBodyTarget | ZipTarget | S3Target | PutTarget,
InBodyTarget | ZipTarget | S3Target | PutTarget | PresignedUrlTarget,
Field(discriminator="kind"),
]


## Complete Source request
class ConvertDocumentsRequest(BaseModel):
class BatchConvertSourcesRequest(BaseModel):
options: ConvertDocumentsOptions = ConvertDocumentsOptions()
sources: list[BatchSourceRequestItem]
target: TargetRequest = InBodyTarget()
callbacks: list[CallbackSpec] = []


class ConvertSourcesRequest(BaseModel):
options: ConvertDocumentsOptions = ConvertDocumentsOptions()
sources: list[SourceRequestItem]
target: TargetRequest = InBodyTarget()
callbacks: list[CallbackSpec] = []


## Deprecated aliases — will be removed in a future release
ConvertDocumentsRequest = BatchConvertSourcesRequest

## Source chunking requests


Expand Down
106 changes: 90 additions & 16 deletions docling/datamodel/service/responses.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
import enum
import warnings
from datetime import datetime
from typing import Annotated, Literal, Optional

from docling_core.types.doc.document import DoclingDocument
from pydantic import BaseModel, Field
from pydantic import AliasChoices, AnyUrl, BaseModel, ConfigDict, Field

from docling.datamodel.base_models import ConversionStatus, ErrorItem
from docling.datamodel.service.tasks import TaskProcessingMeta, TaskType
Expand All @@ -19,15 +20,41 @@ class ExportDocumentResponse(BaseModel):
doctags_content: Optional[str] = None


class ExportResult(BaseModel):
"""Container of all exported content."""
class DocumentResultItem(BaseModel):
"""Canonical document-level result with legacy ExportResult wire compatibility."""

model_config = ConfigDict(populate_by_name=True, serialize_by_alias=True)

kind: Literal["ExportResult"] = "ExportResult"
content: ExportDocumentResponse
document: ExportDocumentResponse = Field(
validation_alias=AliasChoices("document", "content"),
serialization_alias="content",
)
status: ConversionStatus
errors: list[ErrorItem] = []
timings: dict[str, ProfilingItem] = {}

@property
def content(self) -> ExportDocumentResponse:
warnings.warn(
"DocumentResultItem.content is deprecated; use .document instead.",
DeprecationWarning,
stacklevel=2,
)
return self.document

@content.setter
def content(self, value: ExportDocumentResponse) -> None:
warnings.warn(
"DocumentResultItem.content is deprecated; use .document instead.",
DeprecationWarning,
stacklevel=2,
)
self.document = value


ExportResult = DocumentResultItem


class ZipArchiveResult(BaseModel):
"""Container for a zip archive of the conversion."""
Expand All @@ -42,6 +69,27 @@ class RemoteTargetResult(BaseModel):
kind: Literal["RemoteTargetResult"] = "RemoteTargetResult"


class ArtifactRef(BaseModel):
artifact_type: Literal[
"json", "html", "markdown", "text", "doctags", "resource_bundle"
]
mime_type: str
uri: AnyUrl
url_expires_at: datetime | None = None


class DocumentArtifactItem(BaseModel):
"""Per-document result item for PresignedUrlTarget responses."""

source_index: int
source_uri: str
filename: str
status: ConversionStatus
errors: list[ErrorItem] = []
timings: dict[str, ProfilingItem] = {}
artifacts: list[ArtifactRef] = []


class ChunkedDocumentResultItem(BaseModel):
"""A single chunk of a document with its metadata and content."""

Expand Down Expand Up @@ -91,8 +139,19 @@ class ChunkedDocumentResult(BaseModel):
chunking_info: Optional[dict] = None


class PresignedArtifactResult(BaseModel):
"""Internal DoclingTaskResult.result union member for PresignedUrlTarget."""

kind: Literal["PresignedArtifactResult"] = "PresignedArtifactResult"
documents: list[DocumentArtifactItem]


ResultType = Annotated[
ExportResult | ZipArchiveResult | RemoteTargetResult | ChunkedDocumentResult,
ExportResult
| ZipArchiveResult
| RemoteTargetResult
| ChunkedDocumentResult
| PresignedArtifactResult,
Field(discriminator="kind"),
]

Expand All @@ -105,17 +164,6 @@ class DoclingTaskResult(BaseModel):
num_failed: int


class ConvertDocumentResult(DoclingTaskResult):
def __init__(self, *args, **kwargs):
warnings.warn(
"ConvertDocumentResult is deprecated and will be removed in a future version. "
"Use DoclingTaskResult instead.",
DeprecationWarning,
stacklevel=2,
)
super().__init__(*args, **kwargs)


class HealthCheckResponse(BaseModel):
status: str = "ok"

Expand All @@ -129,14 +177,40 @@ class ClearResponse(BaseModel):


class ConvertDocumentResponse(BaseModel):
"""Single-document inline response with task-level timing flattened in."""

document: ExportDocumentResponse
status: ConversionStatus
errors: list[ErrorItem] = []
# Inline convert responses have no outer DoclingTaskResult envelope, so the
# task-level elapsed time is flattened onto this response model.
processing_time: float
timings: dict[str, ProfilingItem] = {}


def _to_convert_document_response(
Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same question as for ExportResult: why are both ConvertDocumentResponse and DocumentResultItem needed? They appear to overlap 100%.

If any of this is for backward compatibility, please explain.

item: DocumentResultItem, processing_time: float
) -> "ConvertDocumentResponse":
return ConvertDocumentResponse(
document=item.document,
status=item.status,
errors=item.errors,
processing_time=processing_time,
timings=item.timings,
)


class PresignedUrlConvertDocumentResponse(BaseModel):
Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is currently used for the S3Target and PutTarget responses, so how can it be deprecated? PresignedUrlConvertResponse is not going to replace it: we agreed that an S3 target cannot list the documents it produced by default, since that list may be very large and would need pagination, etc.

"""Counts-only response model for remote targets without per-document artifacts."""

processing_time: float
num_converted: int
num_succeeded: int
num_failed: int


class PresignedUrlConvertResponse(BaseModel):
documents: list[DocumentArtifactItem]
processing_time: float
num_converted: int
num_succeeded: int
Expand Down
6 changes: 5 additions & 1 deletion docling/datamodel/service/targets.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,11 @@ class PutTarget(BaseModel):
url: AnyHttpUrl


class PresignedUrlTarget(BaseModel):
kind: Literal["presigned_url"] = "presigned_url"


Target = Annotated[
InBodyTarget | ZipTarget | S3Target | PutTarget,
InBodyTarget | ZipTarget | S3Target | PutTarget | PresignedUrlTarget,
Field(discriminator="kind"),
]
Loading
Loading