Skip to content

Commit fce5ea8

Browse files
fix: storing the loader temp dir to a file is simpler
1 parent 564cc28 commit fce5ea8

4 files changed

Lines changed: 19 additions & 12 deletions

File tree

DocToolsLLM/DocToolsLLM.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@
3030
ankiconnect, debug_chain, model_name_matcher,
3131
average_word_length, wpm, get_splitter,
3232
check_docs_tkn_length, get_tkn_length,
33-
extra_args_keys, disable_internet)
33+
extra_args_keys, disable_internet, loaders_temp_dir_file)
3434
from .utils.prompts import PR_CONDENSE_QUESTION, PR_EVALUATE_DOC, PR_ANSWER_ONE_DOC, PR_COMBINE_INTERMEDIATE_ANSWERS
3535
from .utils.tasks.query import format_chat_history, refilter_docs, check_intermediate_answer, parse_eval_output, query_eval_cache
3636

@@ -154,6 +154,9 @@ def p(message: str) -> None:
154154
red(pyfiglet.figlet_format("DocToolsLLM"))
155155
log.info("Starting DocToolsLLM")
156156

157+
# at startup, erase the stored path that points to the loaders' temporary directory
158+
loaders_temp_dir_file.write_text("")
159+
157160
# make sure the extra args are valid
158161
for k in cli_kwargs:
159162
if k not in self.allowed_extra_keys:

DocToolsLLM/utils/batch_file_loader.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@
2626
from .misc import doc_loaders_cache, file_hasher, min_token, get_tkn_length, unlazyload_modules, doc_kwargs_keys, cache_dir
2727
from .typechecker import optional_typecheck
2828
from .logger import red, whi, log
29-
from .loaders import load_one_doc, yt_link_regex, load_youtube_playlist, markdownlink_regex, global_temp_dir
29+
from .loaders import load_one_doc, yt_link_regex, load_youtube_playlist, markdownlink_regex, loaders_temp_dir_file
3030
from .flags import is_debug
3131

3232

@@ -274,7 +274,7 @@ def load_one_doc_wrapped(**doc_kwargs):
274274
shutil.rmtree(f)
275275
temp_dir = cache_dir / load_temp_name
276276
temp_dir.mkdir(exist_ok=False)
277-
global_temp_dir[0] = temp_dir
277+
loaders_temp_dir_file.write_text(str(temp_dir.absolute().resolve()))
278278

279279
docs = []
280280
t_load = time.time()

DocToolsLLM/utils/loaders.py

Lines changed: 12 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@
4545

4646
from .misc import (doc_loaders_cache, html_to_text, hasher,
4747
file_hasher, get_splitter, check_docs_tkn_length,
48-
average_word_length, wpm, global_temp_dir)
48+
average_word_length, wpm, loaders_temp_dir_file)
4949
from .typechecker import optional_typecheck
5050
from .logger import whi, yel, red, log
5151
from .flags import is_verbose, is_linux
@@ -189,7 +189,11 @@ def load_one_doc(
189189
The loader is cached"""
190190
text_splitter = get_splitter(task)
191191

192-
assert global_temp_dir[0] is temp_dir, f"Error handling temp dir: temp_dir is {temp_dir} but global_temp_dir is {global_temp_dir}"
192+
expected_global_dir = loaders_temp_dir_file.read_text().strip()
193+
assert expected_global_dir, f"Empty loaders_temp_dir_file at {loaders_temp_dir_file}"
194+
expected_global_dir = Path(expected_global_dir)
195+
assert expected_global_dir.exists(), f"File loaders_temp_dir_file not found in {loaders_temp_dir_file} pointing at '{expected_global_dir}'"
196+
assert expected_global_dir == temp_dir, f"Error handling temp dir: temp_dir is {temp_dir} but loaders_temp_dir is {expected_global_dir}"
193197

194198
if filetype == "youtube":
195199
docs = load_youtube_video(**kwargs)
@@ -395,7 +399,7 @@ def load_youtube_video(
395399
)
396400
else:
397401
whi(f"Downloading audio from url: '{path}'")
398-
file_name = global_temp_dir[0] / f"youtube_audio_{uuid.uuid4()}" # without extension!
402+
file_name = load_temp_dir / f"youtube_audio_{uuid.uuid4()}" # without extension!
399403
ydl_opts = {
400404
'format': 'bestaudio/best',
401405
'postprocessors': [{
@@ -410,7 +414,7 @@ def load_youtube_video(
410414
with youtube_dl.YoutubeDL(ydl_opts) as ydl:
411415
ydl.download([path])
412416
candidate = []
413-
for f in global_temp_dir[0].iterdir():
417+
for f in load_temp_dir.iterdir():
414418
if file_name.name in f.name:
415419
candidate.append(f)
416420
assert len(candidate), f"Audio file of {path} failed to download?"
@@ -530,7 +534,7 @@ def load_anki(
530534
original_db = akp.find_db(user=anki_profile)
531535
name = f"{anki_profile}".replace(" ", "_")
532536
random_val = str(uuid.uuid4()).split("-")[-1]
533-
new_db_path = global_temp_dir[0] / f"anki_collection_{name.replace('/', '_')}_{random_val}"
537+
new_db_path = load_temp_dir / f"anki_collection_{name.replace('/', '_')}_{random_val}"
534538
assert not Path(new_db_path).exists(
535539
), f"{new_db_path} already existing!"
536540
shutil.copy(original_db, new_db_path)
@@ -922,8 +926,8 @@ def load_local_audio(
922926
)
923927
red(f"Removed silence from {path.name}: {dur:.1f} -> {new_dur:.1f} in {elapsed:.1f}s")
924928

925-
unsilenced_path_wav = global_temp_dir[0] / f"unsilenced_audio_{uuid.uuid4()}.wav"
926-
unsilenced_path_ogg = global_temp_dir[0] / f"unsilenced_audio_{uuid.uuid4()}.ogg"
929+
unsilenced_path_wav = load_temp_dir / f"unsilenced_audio_{uuid.uuid4()}.wav"
930+
unsilenced_path_ogg = load_temp_dir / f"unsilenced_audio_{uuid.uuid4()}.ogg"
927931
assert not unsilenced_path_wav.exists()
928932
assert not unsilenced_path_ogg.exists()
929933
torchaudio.save(
@@ -1007,7 +1011,7 @@ def load_local_video(
10071011
) -> List[Document]:
10081012
assert Path(path).exists(), f"file not found: '{path}'"
10091013

1010-
audio_path = global_temp_dir[0] / f"audio_from_video_{uuid.uuid4()}.mp3"
1014+
audio_path = load_temp_dir / f"audio_from_video_{uuid.uuid4()}.mp3"
10111015
assert not audio_path.exists()
10121016

10131017
# extract audio from video

DocToolsLLM/utils/misc.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@
3232
RecursiveCharacterTextSplitter = lazy_import.lazy_class('langchain.text_splitter.RecursiveCharacterTextSplitter')
3333

3434
# file storing the path of the directory where the loaders can keep temporary files; emptied at startup and written by the batch loader before load_one_doc reads it
35-
global_temp_dir = [None]
35+
loaders_temp_dir_file = cache_dir / "loaders_temp_dir.txt"
3636

3737
try:
3838
import ftlangdetect

0 commit comments

Comments
 (0)