Skip to content

Commit 22d08bd

Browse files
fix: don't pass debug as an argument, as it bypasses the cache
1 parent 6abc778 commit 22d08bd

1 file changed

Lines changed: 2 additions & 10 deletions

File tree

WDoc/utils/loaders.py

Lines changed: 2 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -382,7 +382,6 @@ def load_one_doc(
382382
"""choose the appropriate loader for a file, then load it,
383383
split into documents, add some metadata then return.
384384
The loader is cached"""
385-
debug = is_debug
386385
text_splitter = get_splitter(task, modelname=llm_name)
387386
assert kwargs, "Received an empty dict of arguments to load. Maybe --path is empty?"
388387

@@ -404,7 +403,6 @@ def load_one_doc(
404403

405404
elif filetype == "pdf":
406405
docs = load_pdf(
407-
debug=debug,
408406
text_splitter=text_splitter,
409407
file_hash=file_hash,
410408
doccheck_min_lang_prob=doccheck_min_lang_prob,
@@ -416,7 +414,6 @@ def load_one_doc(
416414
elif filetype == "online_pdf":
417415
docs = load_online_pdf(
418416
text_splitter=text_splitter,
419-
debug=debug,
420417
file_hash=file_hash,
421418
doccheck_min_lang_prob=doccheck_min_lang_prob,
422419
doccheck_min_token=doccheck_min_token,
@@ -456,7 +453,6 @@ def load_one_doc(
456453

457454
elif filetype == "logseq_markdown":
458455
docs = load_logseq_markdown(
459-
debug=debug,
460456
file_hash=file_hash,
461457
text_splitter=text_splitter,
462458
**kwargs,
@@ -797,7 +793,6 @@ def load_youtube_video(
797793
def load_online_pdf(
798794
path: str,
799795
text_splitter: TextSplitter,
800-
debug: bool,
801796
file_hash: str,
802797
pdf_parsers: Union[str, List[str]] = 'pymupdf', # used only if online loading fails
803798
doccheck_min_lang_prob: float = min_lang_prob,
@@ -835,7 +830,6 @@ def load_online_pdf(
835830
docs = load_pdf(
836831
path=temp_file.name,
837832
text_splitter=text_splitter,
838-
debug=debug,
839833
file_hash=file_hasher({"path": temp_file.name}),
840834
pdf_parsers=pdf_parsers,
841835
doccheck_min_lang_prob=doccheck_min_lang_prob,
@@ -1542,7 +1536,6 @@ def eval_load_functions(
15421536
@optional_strip_unexp_args
15431537
@doc_loaders_cache.cache(ignore=["path"])
15441538
def load_logseq_markdown(
1545-
debug: bool,
15461539
path: str,
15471540
file_hash: str,
15481541
text_splitter: TextSplitter,
@@ -2264,7 +2257,6 @@ def _pdf_loader(loader_name: str, path: str, file_hash: str) -> List[Document]:
22642257
def load_pdf(
22652258
path: str,
22662259
text_splitter: TextSplitter,
2267-
debug: bool,
22682260
file_hash: str,
22692261
pdf_parsers: Union[str, List[str]] = 'pymupdf',
22702262
doccheck_min_lang_prob: float = min_lang_prob,
@@ -2304,7 +2296,7 @@ def load_pdf(
23042296
for loader_name in pdf_parsers:
23052297
pbar.desc = f"Parsing PDF {name} with {loader_name}"
23062298
try:
2307-
if debug:
2299+
if is_debug:
23082300
red(f"Trying to parse {path} using {loader_name}")
23092301

23102302
if pdf_loader_max_timeout > 0:
@@ -2390,7 +2382,7 @@ def load_pdf(
23902382

23912383
max_prob = max([v for v in probs.values()])
23922384

2393-
if debug:
2385+
if is_debug:
23942386
yel(f"Language probability after parsing {path}: {probs}")
23952387

23962388
return loaded_docs[[name for name in probs if probs[name] == max_prob][0]]

0 commit comments

Comments
 (0)