Skip to content

Docling fails to convert docx document #2515

@vishaldasnewtide

Description

@vishaldasnewtide

Bug

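Even though I have disabled OCR (`do_ocr = False`) and picture description, converting a DOCX document fails with a `RuntimeError` saying that no class named 'vlm' was found, as if a VLM model had to be installed.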

`def _initialize_converter(self) -> None:
"""Initialize the document converter with appropriate settings.

Builds a single ``pdf_pipeline_options`` object (feature-rich when
``self.optimize_pdf`` is truthy, minimal otherwise) and stores a
``DoclingConverter`` configured with it on ``self.converter``.

Reads ``self.optimize_pdf``, ``self.enable_ocr`` and ``self.pdf_threads``.
"""
# NOTE(review): the indentation of the lines up to the first blank line
# was flattened when this snippet was pasted; they presumably belong to
# the method body and to the `if` branch below.
if self.optimize_pdf:
# Optimized path: EasyOCR options are always attached, but OCR itself
# only runs when self.enable_ocr is set.
pdf_pipeline_options = PdfPipelineOptions(ocr_options=EasyOcrOptions())
pdf_pipeline_options.do_ocr = self.enable_ocr
pdf_pipeline_options.do_table_structure = True
pdf_pipeline_options.table_structure_options.do_cell_matching = True
# Picture description is switched off for the PDF/image pipeline only.
pdf_pipeline_options.do_picture_description = False
# NOTE(review): this clears the options object entirely rather than just
# disabling the feature — confirm the options model accepts None here.
pdf_pipeline_options.picture_description_options = None

        # Let the library pick the device (AUTO) when OCR is enabled;
        # otherwise pin the pipeline to the CPU.
        device = AcceleratorDevice.AUTO if self.enable_ocr else AcceleratorDevice.CPU
        # The flash-attention flag follows the OCR switch as well.
        cuda_flash = self.enable_ocr

        pdf_pipeline_options.accelerator_options = AcceleratorOptions(
            num_threads=self.pdf_threads,
            device=device,
            cuda_use_flash_attention2=cuda_flash
        )
    else:
        # Minimal path: default options with picture description, OCR and
        # table-structure recognition all turned off.
        pdf_pipeline_options = PdfPipelineOptions()
        pdf_pipeline_options.do_picture_description = False
        pdf_pipeline_options.do_ocr = False
        pdf_pipeline_options.do_table_structure = False

    # Create the converter instance. PDF and image inputs share the same
    # pdf_pipeline_options object built above.
    # NOTE(review): HTML, PPTX, ASCIIDOC, CSV, MD and XLSX are allowed but
    # have no entry in format_options — presumably library defaults apply.
    self.converter = DoclingConverter(
        allowed_formats=[
            InputFormat.PDF,
            InputFormat.IMAGE,
            InputFormat.DOCX,
            InputFormat.HTML,
            InputFormat.PPTX,
            InputFormat.ASCIIDOC,
            InputFormat.CSV,
            InputFormat.MD,
            InputFormat.XLSX
        ],
        format_options={
            InputFormat.PDF: PdfFormatOption(
                pipeline_cls=StandardPdfPipeline,
                backend=PyPdfiumDocumentBackend,
                pipeline_options=pdf_pipeline_options
            ),
            # NOTE(review): no pipeline_options are passed for DOCX, so the
            # do_picture_description = False set on pdf_pipeline_options is
            # not handed to SimplePipeline. The reported traceback fails in
            # SimplePipeline's constructor while creating the picture
            # description model — verify which options DOCX actually gets.
            InputFormat.DOCX: WordFormatOption(
                pipeline_cls=SimplePipeline
            ),
            InputFormat.IMAGE: ImageFormatOption(
                pipeline_cls=StandardPdfPipeline,
                backend=PyPdfiumDocumentBackend,
                pipeline_options=pdf_pipeline_options
            ),

        },
    )

# NOTE(review): file_operation is defined elsewhere; presumably it maps
# exceptions raised here to DOCUMENT_CONVERSION_ERROR — confirm.
@file_operation(error_code=ErrorCodes.DOCUMENT_CONVERSION_ERROR)
def convert_to_markdown(self, input_path: Union[str, Path]) -> str:
    """
    Convert a single file to markdown format.

    Inputs for which ``self._is_text_file`` returns true are read and
    returned as-is; everything else is passed to ``self.converter`` and
    exported to markdown.

    Args:
        input_path: Path to the input file or URL

    Returns:
        Markdown content as string

    Raises:
        ValueError: If a text file cannot be read (the reader returns None)
            or the conversion yields empty markdown. Errors raised by the
            converter itself are not caught in this method.
    """
    # Handle .txt and .json files directly without using docling converter
    if self._is_text_file(input_path):
        content = self._read_text_file(input_path)
        # Only None counts as a read failure; an empty string is returned.
        if content is None:
            raise ValueError(f"Failed to read text file: {self._safe_log_path(input_path)}")
        return content

    # For all other file types, use the docling converter
    conv_result = self.converter.convert(input_path)

    # Get markdown content
    markdown_content = conv_result.document.export_to_markdown()

    # Any falsy export (e.g. an empty string) is treated as a failure.
    if not markdown_content:
        raise ValueError(f"Conversion returned empty content for {self._safe_log_path(input_path)}")

    return markdown_content`

...

Steps to reproduce

File "/app/src/objects_core/utils.py", line 34, in wrapper

return func(*args, **kwargs)

       ^^^^^^^^^^^^^^^^^^^^^

File "/app/src/vector_store_processor/core/docling_converter.py", line 142, in convert_to_markdown

conv_result = self.converter.convert(input_path)

              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

File "/usr/local/lib/python3.11/site-packages/pydantic/_internal/_validate_call.py", line 39, in wrapper_function

return wrapper(*args, **kwargs)

       ^^^^^^^^^^^^^^^^^^^^^^^^

File "/usr/local/lib/python3.11/site-packages/pydantic/_internal/_validate_call.py", line 136, in __call__

res = self.__pydantic_validator__.validate_python(pydantic_core.ArgsKwargs(args, kwargs))

      ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

File "/usr/local/lib/python3.11/site-packages/docling/document_converter.py", line 237, in convert

return next(all_res)

       ^^^^^^^^^^^^^

File "/usr/local/lib/python3.11/site-packages/docling/document_converter.py", line 260, in convert_all

for conv_res in conv_res_iter:

File "/usr/local/lib/python3.11/site-packages/docling/document_converter.py", line 332, in _convert

for item in map(

File "/usr/local/lib/python3.11/site-packages/docling/document_converter.py", line 379, in _process_document

conv_res = self._execute_pipeline(in_doc, raises_on_error=raises_on_error)

           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

File "/usr/local/lib/python3.11/site-packages/docling/document_converter.py", line 400, in _execute_pipeline

pipeline = self._get_pipeline(in_doc.format)

           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

File "/usr/local/lib/python3.11/site-packages/docling/document_converter.py", line 362, in _get_pipeline

self.initialized_pipelines[cache_key] = pipeline_class(

                                        ^^^^^^^^^^^^^^^

File "/usr/local/lib/python3.11/site-packages/docling/pipeline/simple_pipeline.py", line 24, in __init__

super().__init__(pipeline_options)

File "/usr/local/lib/python3.11/site-packages/docling/pipeline/base_pipeline.py", line 144, in __init__

picture_description_model := self._get_picture_description_model(

                             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

File "/usr/local/lib/python3.11/site-packages/docling/pipeline/base_pipeline.py", line 170, in _get_picture_description_model

return factory.create_instance(

       ^^^^^^^^^^^^^^^^^^^^^^^^

File "/usr/local/lib/python3.11/site-packages/docling/models/factories/base_factory.py", line 59, in create_instance

raise RuntimeError(self._err_msg_on_class_not_found(options.kind))

RuntimeError: No class found with the name 'vlm', known classes are:

same error
...

Docling version

2.58.0
...

Python version

3.11.9
...

Metadata

Metadata

Assignees

No one assigned

    Labels

    bug (Something isn't working), good first issue (Issues and pull requests for new contributors)

    Type

    No type
    No fields configured for issues without a type.

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions