Skip to content

Commit ef3eaf0

Browse files
committed
improve process output
1 parent 9b30a83 commit ef3eaf0

File tree

1 file changed

+12
-1
lines changed

1 file changed

+12
-1
lines changed

text_preprocessing/preprocessor.py

Lines changed: 12 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -77,6 +77,8 @@ def process_batch_texts(
7777
nlp = load_language_model(language_model, normalize_options)
7878
results = []
7979
text_fetcher = TextFetcher(nlp, **text_fetcher_args) # Initialize text_fetcher with required params
80+
previous_philo_id = None
81+
doc_count = 0
8082
for tokens, _ in text_fetcher(batch_texts, do_nlp=do_nlp, keep_all=keep_all, progress=False):
8183
if isinstance(tokens, PreparedDoc):
8284
spacy_doc = make_spacy_doc(nlp, tokens)
@@ -93,9 +95,18 @@ def process_batch_texts(
9395
results.append(tokens)
9496
if using_gpu:
9597
check_gpu_ram()
98+
current_doc_id = results[-1].metadata.get("philo_id").split()[0]
99+
if previous_philo_id != current_doc_id:
100+
doc_count += 1
96101
if progress:
97102
count += 1
98-
print(f"\r{progress_prefix} {count} texts processed...", end="", flush=True)
103+
if text_fetcher_args["text_object_type"] == "doc":
104+
print(f"\r{progress_prefix} {count} texts processed...", end="", flush=True)
105+
else:
106+
print(
107+
f"\r{progress_prefix} {count} text chunks of {doc_count} documents processed...", end="", flush=True
108+
)
109+
previous_philo_id = current_doc_id
99110
return results
100111

101112

0 commit comments

Comments
 (0)