Skip to content

Commit bd3c3cc

Browse files
rename loaddoc to doc_loaders
1 parent 360f956 commit bd3c3cc

3 files changed

Lines changed: 23 additions & 23 deletions

File tree

DocToolsLLM/utils/batch_file_loader.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -23,7 +23,7 @@
2323
import json
2424
import dill
2525

26-
from .misc import loaddoc_cache, file_hasher, min_token, get_tkn_length, unlazyload_modules, doc_kwargs_keys, cache_dir
26+
from .misc import doc_loaders_cache, file_hasher, min_token, get_tkn_length, unlazyload_modules, doc_kwargs_keys, cache_dir
2727
from .typechecker import optional_typecheck
2828
from .logger import red, whi, log
2929
from .loaders import load_one_doc, yt_link_regex, load_youtube_playlist, markdownlink_regex, global_temp_dir
@@ -78,7 +78,7 @@ def batch_load_doc(
7878
"""load the input"""
7979
# # remove cache files older than 90 days
8080
# try:
81-
# loaddoc_cache.reduce_size(age_limit=timedelta(90))
81+
# doc_loaders_cache.reduce_size(age_limit=timedelta(90))
8282
# except Exception as err:
8383
# # red(f"Error when reducing cache size: '{err}'")
8484
# pass

DocToolsLLM/utils/loaders.py

Lines changed: 18 additions & 18 deletions
Original file line number | Diff line number | Diff line change
@@ -43,7 +43,7 @@
4343

4444
from unstructured.cleaners.core import clean_extra_whitespace
4545

46-
from .misc import (loaddoc_cache, html_to_text, hasher,
46+
from .misc import (doc_loaders_cache, html_to_text, hasher,
4747
file_hasher, get_splitter, check_docs_tkn_length,
4848
average_word_length, wpm)
4949
from .typechecker import optional_typecheck
@@ -468,7 +468,7 @@ def load_youtube_video(
468468
return docs
469469

470470
@optional_typecheck
471-
@loaddoc_cache.cache
471+
@doc_loaders_cache.cache
472472
def load_online_pdf(debug: bool, task: str, path: str, **kwargs) -> List[Document]:
473473
whi(f"Loading online pdf: '{path}'")
474474

@@ -733,7 +733,7 @@ def load_anki(
733733
return docs
734734

735735
@optional_typecheck
736-
@loaddoc_cache.cache
736+
@doc_loaders_cache.cache
737737
def load_string() -> List[Document]:
738738
whi("Loading string")
739739
content = prompt(
@@ -750,7 +750,7 @@ def load_string() -> List[Document]:
750750
return docs
751751

752752
@optional_typecheck
753-
@loaddoc_cache.cache(ignore=["path"])
753+
@doc_loaders_cache.cache(ignore=["path"])
754754
def load_txt(path: str, file_hash: str) -> List[Document]:
755755
whi(f"Loading txt: '{path}'")
756756
assert Path(path).exists(), f"file not found: '{path}'"
@@ -760,7 +760,7 @@ def load_txt(path: str, file_hash: str) -> List[Document]:
760760
return docs
761761

762762
@optional_typecheck
763-
@loaddoc_cache.cache(ignore=["path"])
763+
@doc_loaders_cache.cache(ignore=["path"])
764764
def load_local_html(
765765
path: str,
766766
file_hash: str,
@@ -810,7 +810,7 @@ def load_local_html(
810810
]
811811
return docs
812812

813-
@loaddoc_cache.cache
813+
@doc_loaders_cache.cache
814814
def eval_load_functions(
815815
load_functions: str,
816816
) -> List[Callable]:
@@ -827,7 +827,7 @@ def eval_load_functions(
827827

828828

829829
@optional_typecheck
830-
@loaddoc_cache.cache(ignore=["path"])
830+
@doc_loaders_cache.cache(ignore=["path"])
831831
def load_logseq_markdown(debug: bool, path: str, file_hash: str) -> List[Document]:
832832
whi(f"Loading logseq markdown file: '{path}'")
833833
assert Path(path).exists(), f"file not found: '{path}'"
@@ -877,7 +877,7 @@ def load_logseq_markdown(debug: bool, path: str, file_hash: str) -> List[Documen
877877
return docs
878878

879879
@optional_typecheck
880-
@loaddoc_cache.cache(ignore=["path"])
880+
@doc_loaders_cache.cache(ignore=["path"])
881881
def load_local_audio(
882882
path: str,
883883
file_hash: str,
@@ -987,7 +987,7 @@ def load_local_audio(
987987
return docs
988988

989989
@optional_typecheck
990-
@loaddoc_cache.cache(ignore=["path"])
990+
@doc_loaders_cache.cache(ignore=["path"])
991991
def load_local_video(
992992
path: str,
993993
file_hash: str,
@@ -1046,7 +1046,7 @@ def load_local_video(
10461046

10471047

10481048
@optional_typecheck
1049-
@loaddoc_cache.cache(ignore=["audio_path"])
1049+
@doc_loaders_cache.cache(ignore=["audio_path"])
10501050
def transcribe_audio_deepgram(
10511051
audio_path: str,
10521052
audio_hash: str,
@@ -1115,7 +1115,7 @@ def transcribe_audio_deepgram(
11151115
return d
11161116

11171117
@optional_typecheck
1118-
@loaddoc_cache.cache(ignore=["audio_path"])
1118+
@doc_loaders_cache.cache(ignore=["audio_path"])
11191119
def transcribe_audio_whisper(
11201120
audio_path: str,
11211121
audio_hash: str,
@@ -1143,7 +1143,7 @@ def transcribe_audio_whisper(
11431143
return transcript
11441144

11451145
@optional_typecheck
1146-
@loaddoc_cache.cache(ignore=["path"])
1146+
@doc_loaders_cache.cache(ignore=["path"])
11471147
def load_epub(
11481148
path: str,
11491149
file_hash: str,
@@ -1161,7 +1161,7 @@ def load_epub(
11611161
return docs
11621162

11631163
@optional_typecheck
1164-
@loaddoc_cache.cache(ignore=["path"])
1164+
@doc_loaders_cache.cache(ignore=["path"])
11651165
def load_powerpoint(
11661166
path: str,
11671167
file_hash: str,
@@ -1178,7 +1178,7 @@ def load_powerpoint(
11781178
]
11791179
return docs
11801180
@optional_typecheck
1181-
@loaddoc_cache.cache(ignore=["path"])
1181+
@doc_loaders_cache.cache(ignore=["path"])
11821182
def load_word_document(
11831183
path: str,
11841184
file_hash: str,
@@ -1198,7 +1198,7 @@ def load_word_document(
11981198
return docs
11991199

12001200
@optional_typecheck
1201-
@loaddoc_cache.cache
1201+
@doc_loaders_cache.cache
12021202
def load_url(path: str, title=None) -> List[Document]:
12031203
whi(f"Loading url: '{path}'")
12041204

@@ -1348,7 +1348,7 @@ def load_url(path: str, title=None) -> List[Document]:
13481348

13491349

13501350
@optional_typecheck
1351-
@loaddoc_cache.cache
1351+
@doc_loaders_cache.cache
13521352
def load_youtube_playlist(playlist_url: str) -> Any:
13531353
with youtube_dl.YoutubeDL({"quiet": False}) as ydl:
13541354
try:
@@ -1362,7 +1362,7 @@ def load_youtube_playlist(playlist_url: str) -> Any:
13621362

13631363

13641364
@optional_typecheck
1365-
@loaddoc_cache.cache(ignore=["loader"])
1365+
@doc_loaders_cache.cache(ignore=["loader"])
13661366
def cached_yt_loader(
13671367
loader: Any,
13681368
path: str,
@@ -1380,7 +1380,7 @@ def cached_yt_loader(
13801380

13811381

13821382
@optional_typecheck
1383-
@loaddoc_cache.cache(ignore=["path"])
1383+
@doc_loaders_cache.cache(ignore=["path"])
13841384
def _pdf_loader(loader_name: str, path: str, file_hash: str) -> str:
13851385
loader = pdf_loaders[loader_name](path)
13861386
content = loader.load()

DocToolsLLM/utils/misc.py

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -54,9 +54,9 @@ def language_detector(text: str) -> float:
5454
def language_detector(text: str) -> None:
5555
return None
5656

57-
loaddoc_cache_dir = (cache_dir / "loaddoc_cache")
58-
loaddoc_cache_dir.mkdir(exist_ok=True)
59-
loaddoc_cache = Memory(loaddoc_cache_dir, verbose=0)
57+
doc_loaders_cache_dir = (cache_dir / "doc_loaders_cache")
58+
doc_loaders_cache_dir.mkdir(exist_ok=True)
59+
doc_loaders_cache = Memory(doc_loaders_cache_dir, verbose=0)
6060
hashdoc_cache_dir = (cache_dir / "hashdoc_cache")
6161
hashdoc_cache_dir.mkdir(exist_ok=True)
6262
hashdoc_cache = Memory(hashdoc_cache_dir, verbose=0)

0 commit comments

Comments
 (0)