Bug
Even though I have set OCR=False, conversion fails with an error asking for a VLM model to be installed.
def _initialize_converter(self) -> None:
    """Initialize the document converter with appropriate settings.

    Builds the PDF pipeline options from ``self.optimize_pdf``,
    ``self.enable_ocr`` and ``self.pdf_threads`` and stores the resulting
    ``DoclingConverter`` on ``self.converter``. The same options object is
    shared by the PDF and IMAGE formats; DOCX is routed to ``SimplePipeline``
    with that pipeline's default options.
    """
    if self.optimize_pdf:
        pdf_pipeline_options = PdfPipelineOptions(ocr_options=EasyOcrOptions())
        pdf_pipeline_options.do_ocr = self.enable_ocr
        pdf_pipeline_options.do_table_structure = True
        pdf_pipeline_options.table_structure_options.do_cell_matching = True
        # Picture description is disabled through the flag only.
        # ``picture_description_options`` is deliberately left at its default:
        # the pipeline constructor still resolves a picture-description model
        # from that options object (it reads ``options.kind``), so replacing
        # it with ``None`` leaves the pipeline with nothing to resolve.
        pdf_pipeline_options.do_picture_description = False
        # Let docling pick the device only when OCR is enabled; otherwise
        # pin the pipeline to the CPU.
        device = AcceleratorDevice.AUTO if self.enable_ocr else AcceleratorDevice.CPU
        cuda_flash = self.enable_ocr
        pdf_pipeline_options.accelerator_options = AcceleratorOptions(
            num_threads=self.pdf_threads,
            device=device,
            cuda_use_flash_attention2=cuda_flash,
        )
    else:
        pdf_pipeline_options = PdfPipelineOptions()
        pdf_pipeline_options.do_picture_description = False
        pdf_pipeline_options.do_ocr = False
        pdf_pipeline_options.do_table_structure = False

    # NOTE(review): the reported traceback fails inside SimplePipeline (not the
    # PDF pipeline) and the error lists no known classes at all. That suggests
    # the picture-description factory has no registered implementations in the
    # failing environment -- confirm that docling's plugin entry points are
    # installed there; the options set above cannot influence that.

    # Create converter instance with separate PDF and image options
    self.converter = DoclingConverter(
        allowed_formats=[
            InputFormat.PDF,
            InputFormat.IMAGE,
            InputFormat.DOCX,
            InputFormat.HTML,
            InputFormat.PPTX,
            InputFormat.ASCIIDOC,
            InputFormat.CSV,
            InputFormat.MD,
            InputFormat.XLSX,
        ],
        format_options={
            InputFormat.PDF: PdfFormatOption(
                pipeline_cls=StandardPdfPipeline,
                backend=PyPdfiumDocumentBackend,
                pipeline_options=pdf_pipeline_options,
            ),
            InputFormat.DOCX: WordFormatOption(
                pipeline_cls=SimplePipeline,
            ),
            InputFormat.IMAGE: ImageFormatOption(
                pipeline_cls=StandardPdfPipeline,
                backend=PyPdfiumDocumentBackend,
                pipeline_options=pdf_pipeline_options,
            ),
        },
    )


@file_operation(error_code=ErrorCodes.DOCUMENT_CONVERSION_ERROR)
def convert_to_markdown(self, input_path: Union[str, Path]) -> str:
    """
    Convert a single file to markdown format.

    Args:
        input_path: Path to the input file or URL

    Returns:
        Markdown content as string

    Raises:
        ValueError: If a text file cannot be read or the conversion yields
            empty content. Other failures surface from the converter itself
            (presumably translated by the ``file_operation`` decorator --
            verify against its definition).
    """
    # Handle .txt and .json files directly without using docling converter
    if self._is_text_file(input_path):
        content = self._read_text_file(input_path)
        if content is None:
            raise ValueError(f"Failed to read text file: {self._safe_log_path(input_path)}")
        return content

    # For all other file types, use the docling converter
    conv_result = self.converter.convert(input_path)

    # Get markdown content; an empty export is treated as a failed conversion
    markdown_content = conv_result.document.export_to_markdown()
    if not markdown_content:
        raise ValueError(f"Conversion returned empty content for {self._safe_log_path(input_path)}")
    return markdown_content
...
Steps to reproduce
File "/app/src/objects_core/utils.py", line 34, in wrapper
return func(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^
File "/app/src/vector_store_processor/core/docling_converter.py", line 142, in convert_to_markdown
conv_result = self.converter.convert(input_path)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/site-packages/pydantic/_internal/_validate_call.py", line 39, in wrapper_function
return wrapper(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/site-packages/pydantic/_internal/_validate_call.py", line 136, in call
res = self.__pydantic_validator__.validate_python(pydantic_core.ArgsKwargs(args, kwargs))
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/site-packages/docling/document_converter.py", line 237, in convert
return next(all_res)
^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/site-packages/docling/document_converter.py", line 260, in convert_all
for conv_res in conv_res_iter:
File "/usr/local/lib/python3.11/site-packages/docling/document_converter.py", line 332, in _convert
File "/usr/local/lib/python3.11/site-packages/docling/document_converter.py", line 379, in _process_document
conv_res = self._execute_pipeline(in_doc, raises_on_error=raises_on_error)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/site-packages/docling/document_converter.py", line 400, in _execute_pipeline
pipeline = self._get_pipeline(in_doc.format)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/site-packages/docling/document_converter.py", line 362, in _get_pipeline
self.initialized_pipelines[cache_key] = pipeline_class(
^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/site-packages/docling/pipeline/simple_pipeline.py", line 24, in init
super().__init__(pipeline_options)
File "/usr/local/lib/python3.11/site-packages/docling/pipeline/base_pipeline.py", line 144, in init
picture_description_model := self._get_picture_description_model(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/site-packages/docling/pipeline/base_pipeline.py", line 170, in _get_picture_description_model
return factory.create_instance(
^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/site-packages/docling/models/factories/base_factory.py", line 59, in create_instance
raise RuntimeError(self._err_msg_on_class_not_found(options.kind))
RuntimeError: No class found with the name 'vlm', known classes are:
same error
...
Docling version
2.58.0
...
Python version
3.11.9
...
Bug
Even though I have set OCR=False, conversion fails with an error asking for a VLM model to be installed.
`def _initialize_converter(self) -> None:
"""Initialize the document converter with appropriate settings."""
if self.optimize_pdf:
pdf_pipeline_options = PdfPipelineOptions(ocr_options=EasyOcrOptions())
pdf_pipeline_options.do_ocr = self.enable_ocr
pdf_pipeline_options.do_table_structure = True
pdf_pipeline_options.table_structure_options.do_cell_matching = True
pdf_pipeline_options.do_picture_description = False
pdf_pipeline_options.picture_description_options = None
...
Steps to reproduce
File "/app/src/objects_core/utils.py", line 34, in wrapper
File "/app/src/vector_store_processor/core/docling_converter.py", line 142, in convert_to_markdown
File "/usr/local/lib/python3.11/site-packages/pydantic/_internal/_validate_call.py", line 39, in wrapper_function
File "/usr/local/lib/python3.11/site-packages/pydantic/_internal/_validate_call.py", line 136, in call
File "/usr/local/lib/python3.11/site-packages/docling/document_converter.py", line 237, in convert
File "/usr/local/lib/python3.11/site-packages/docling/document_converter.py", line 260, in convert_all
File "/usr/local/lib/python3.11/site-packages/docling/document_converter.py", line 332, in _convert
File "/usr/local/lib/python3.11/site-packages/docling/document_converter.py", line 379, in _process_document
File "/usr/local/lib/python3.11/site-packages/docling/document_converter.py", line 400, in _execute_pipeline
File "/usr/local/lib/python3.11/site-packages/docling/document_converter.py", line 362, in _get_pipeline
File "/usr/local/lib/python3.11/site-packages/docling/pipeline/simple_pipeline.py", line 24, in init
File "/usr/local/lib/python3.11/site-packages/docling/pipeline/base_pipeline.py", line 144, in init
File "/usr/local/lib/python3.11/site-packages/docling/pipeline/base_pipeline.py", line 170, in _get_picture_description_model
File "/usr/local/lib/python3.11/site-packages/docling/models/factories/base_factory.py", line 59, in create_instance
RuntimeError: No class found with the name 'vlm', known classes are:
same error
...
Docling version
2.58.0
...
Python version
3.11.9
...