Skip to content

Commit ceb5aaa

Browse files
authored
fix: raise clear errors for missing input files and unavailable Java runtime
Objective: Users get a cryptic CalledProcessError when they pass a non-existent path or run without Java installed. Approach: Add early validation in lazy_load_data() — raise FileNotFoundError for missing paths and RuntimeError with install guidance when Java is missing. Cache only positive Java probe results; catch OSError (not just FileNotFoundError) so that a PermissionError raised while launching java is also treated as "Java unavailable". Evidence: 33 unit tests passed (30 existing + 3 new). Before: CalledProcessError / bare FileNotFoundError from subprocess. After: actionable error messages with the specific path and install instructions.
1 parent a01da1f commit ceb5aaa

2 files changed

Lines changed: 113 additions & 0 deletions

File tree

llama_index/readers/opendataloader_pdf/base.py

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
import logging
55
import re
66
import shutil
7+
import subprocess
78
import tempfile
89
from collections import defaultdict
910
from pathlib import Path
@@ -14,6 +15,34 @@
1415

1516
logger = logging.getLogger(__name__)
1617

18+
19+
# Process-wide cache for the Java probe.  Only a truthy value short-circuits
# the probe, so a stored False (or None) always triggers a fresh check.
_java_found: Optional[bool] = None


def _java_available() -> bool:
    """Return True if a working Java runtime is found on the system PATH.

    The probe runs ``java -version`` and treats a zero exit status as
    success.  A positive result is cached for the lifetime of the process; a
    negative result is re-probed on every call so that installing Java
    mid-session is recognised without a restart.

    Returns:
        True when ``java -version`` exits with status 0.  False when the
        executable cannot be launched, exits with a non-zero status, or does
        not finish within 10 seconds.
    """
    global _java_found  # noqa: PLW0603
    if _java_found:
        return True
    try:
        # Only the exit status matters, so the output is discarded rather
        # than captured and decoded as text.  Decoding the version banner
        # could raise UnicodeDecodeError, which is neither an OSError nor a
        # TimeoutExpired and would therefore escape this probe.
        result = subprocess.run(
            ["java", "-version"],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
            check=False,
            timeout=10,
        )
    except (OSError, subprocess.TimeoutExpired):
        # OSError covers both "executable not found" (FileNotFoundError) and
        # "executable not runnable" (PermissionError).
        return False
    _java_found = result.returncode == 0
    return _java_found
44+
45+
1746
_FORMAT_TO_EXT = {
1847
"json": "json",
1948
"text": "txt",
@@ -180,6 +209,19 @@ def lazy_load_data(
180209
else:
181210
paths = [str(p) for p in file_path]
182211

212+
for p in paths:
213+
if not Path(p).exists():
214+
raise FileNotFoundError(f"Input path does not exist: {p}")
215+
216+
if not _java_available():
217+
raise RuntimeError(
218+
"Java is not found on the system PATH. "
219+
"OpenDataLoader PDF requires Java 11+. "
220+
"Install Java from https://adoptium.net/ and ensure "
221+
"'java' is on your system PATH. "
222+
"Verify with: java -version"
223+
)
224+
183225
ext = _FORMAT_TO_EXT[fmt]
184226

185227
try:

tests/test_readers_opendataloader_pdf.py

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,33 @@
88
from llama_index.core.schema import Document
99
from llama_index.readers.opendataloader_pdf import OpenDataLoaderPDFReader
1010

11+
# Save original before any monkeypatching.
12+
_original_path_exists = Path.exists
13+
14+
15+
@pytest.fixture(autouse=True)
def _bypass_input_validation(monkeypatch):
    """Bypass file-existence and Java checks in unit tests.

    Tests that verify validation behaviour override these explicitly.
    Every path whose suffix is ``.pdf`` is reported as existing; all other
    Path.exists calls delegate to the real implementation.
    """
    monkeypatch.setattr(
        "llama_index.readers.opendataloader_pdf.base._java_available",
        lambda: True,
    )
    monkeypatch.setattr(
        "llama_index.readers.opendataloader_pdf.base._java_found", None
    )

    def _fake_exists(self, *args, **kwargs):
        # Accept and forward any extra arguments so the replacement stays
        # call-compatible with the real Path.exists (Python 3.12 added the
        # ``follow_symlinks`` keyword).
        if self.suffix == ".pdf":
            return True
        return _original_path_exists(self, *args, **kwargs)

    monkeypatch.setattr("pathlib.Path.exists", _fake_exists)
37+
1138

1239
# ---------------------------------------------------------------------------
1340
# TestInit: Verify default and custom parameter values
@@ -552,3 +579,47 @@ def mock_import(name, *args, **kwargs):
552579
with patch("builtins.__import__", side_effect=mock_import):
553580
with pytest.raises(ImportError, match="opendataloader_pdf"):
554581
list(reader.lazy_load_data(file_path="doc.pdf"))
582+
583+
584+
# ---------------------------------------------------------------------------
585+
# TestFileNotFound: Missing input file raises FileNotFoundError
586+
# ---------------------------------------------------------------------------
587+
class TestFileNotFound:
    """Verify that lazy_load_data rejects input paths that do not exist."""

    # Regex the raised FileNotFoundError message must match in every test.
    _MISSING_PATH_PATTERN = r"Input path does not exist: nonexistent\.pdf"

    @pytest.fixture(autouse=True)
    def _real_path_exists(self, monkeypatch):
        """Put the genuine Path.exists back so a missing file is detectable."""
        monkeypatch.setattr("pathlib.Path.exists", _original_path_exists)

    def test_single_missing_file(self) -> None:
        """A missing path passed as a single string is reported by name."""
        pdf_reader = OpenDataLoaderPDFReader()
        expectation = pytest.raises(
            FileNotFoundError, match=self._MISSING_PATH_PATTERN
        )
        with expectation:
            list(pdf_reader.lazy_load_data(file_path="nonexistent.pdf"))

    def test_missing_file_in_list(self) -> None:
        """A missing path passed inside a list is reported by name."""
        pdf_reader = OpenDataLoaderPDFReader()
        expectation = pytest.raises(
            FileNotFoundError, match=self._MISSING_PATH_PATTERN
        )
        with expectation:
            list(pdf_reader.lazy_load_data(file_path=["nonexistent.pdf"]))
610+
611+
612+
# ---------------------------------------------------------------------------
613+
# TestJavaCheck: Missing Java raises RuntimeError
614+
# ---------------------------------------------------------------------------
615+
class TestJavaCheck:
    """Verify that an unavailable Java runtime surfaces as RuntimeError."""

    def test_no_java_raises_runtime_error(self) -> None:
        """With the Java probe patched to return False, loading must fail."""
        probe_target = (
            "llama_index.readers.opendataloader_pdf.base._java_available"
        )
        # The patch wraps the whole test body, including reader construction.
        with patch(probe_target, return_value=False):
            pdf_reader = OpenDataLoaderPDFReader()
            with pytest.raises(RuntimeError, match="Java is not found"):
                list(pdf_reader.lazy_load_data(file_path="doc.pdf"))

0 commit comments

Comments
 (0)