Skip to content

Commit d0a1f8b

Browse files
committed
another fix to process strings
1 parent cb0442c commit d0a1f8b

File tree

1 file changed

+28
-25
lines changed

1 file changed

+28
-25
lines changed

text_preprocessing/preprocessor.py

Lines changed: 28 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -81,24 +81,25 @@ def process_batch_texts(
8181
tokens = Tokens(tokens, keep_all=keep_all)
8282
if using_gpu:
8383
check_gpu_ram()
84-
current_doc_id = tokens.metadata.get("philo_id").split()[0]
85-
if previous_philo_id != current_doc_id:
86-
progress_info["doc_count"] += 1
87-
if progress_info["progress"] is True:
88-
progress_info["count"] += 1
89-
if text_fetcher_args["text_object_type"] == "doc":
90-
print(
91-
f"\r{progress_info['progress_prefix']} {progress_info['count']} texts processed...",
92-
end="",
93-
flush=True,
94-
)
95-
else:
96-
print(
97-
f"\r{progress_info['progress_prefix']} {progress_info['count']} text chunks of {progress_info['doc_count']} documents processed...",
98-
end="",
99-
flush=True,
100-
)
101-
previous_philo_id = current_doc_id
84+
if text_fetcher_args["is_philo_db"] is True:
85+
current_doc_id = tokens.metadata.get("philo_id").split()[0]
86+
if previous_philo_id != current_doc_id:
87+
progress_info["doc_count"] += 1
88+
if progress_info["progress"] is True:
89+
progress_info["count"] += 1
90+
if text_fetcher_args["text_object_type"] == "doc":
91+
print(
92+
f"\r{progress_info['progress_prefix']} {progress_info['count']} texts processed...",
93+
end="",
94+
flush=True,
95+
)
96+
else:
97+
print(
98+
f"\r{progress_info['progress_prefix']} {progress_info['count']} text chunks of {progress_info['doc_count']} documents processed...",
99+
end="",
100+
flush=True,
101+
)
102+
previous_philo_id = current_doc_id
102103
queue.put(tokens)
103104
queue.put(None)
104105

def process_string(self, text: str, keep_all: bool = True) -> Tokens:
    """Preprocess a single raw string and return its Tokens object.

    NOTE(review): this flips ``is_philo_db``/``is_string`` on the shared
    ``text_fetcher_args`` dict and never restores them — confirm that later
    calls on this instance are not affected by the leftover flags.
    """
    # One-off string: progress reporting stays disabled.
    progress_info = dict(count=0, doc_count=0, progress=False, progress_prefix="")
    self.text_fetcher_args["is_philo_db"] = False  # input is not a PhiloLogic database
    self.text_fetcher_args["is_string"] = True  # route batching through the raw-string path
    token_batches = self.__process_batch([text], keep_all, progress_info)
    return next(token_batches)

@@ -314,7 +315,7 @@ def __init__(
314315
text_object_type="doc",
315316
ngram_config=None,
316317
workers=None,
317-
is_text=False,
318+
is_string=False,
318319
**_, # this is meant to make the constructor accept invalid keywords
319320
):
320321
cls.language = language
@@ -334,7 +335,7 @@ def __init__(
334335
else:
335336
cls.workers = workers
336337
cls.ngram_config = ngram_config
337-
cls.is_text = is_text
338+
cls.is_string = is_string
338339

339340
@classmethod
340341
def __call__(
@@ -363,7 +364,7 @@ def __local_process(cls, args) -> Iterable[PreparedDoc | Tokens]:
363364
text, do_nlp, keep_all, post_func = args
364365
if cls.is_philo_db is True:
365366
text_objects, sent_starts_list, metadata = cls.process_philo_text(text)
366-
elif cls.is_text is True:
367+
elif cls.is_string is True:
367368
text_objects, sent_starts_list, metadata = cls.process_string(text)
368369
else:
369370
text_objects, sent_starts_list, metadata = cls.process_text(text)
@classmethod
def process_text(cls, text: str):
    """Process one document stored on disk.

    ``text`` is a path to a UTF-8 text file. Returns the parallel triple
    ``([tokens], [sent_starts], [metadata])`` with the source path recorded
    under ``"filename"`` in the metadata.
    """
    with open(text, encoding="utf-8") as source_file:
        contents: str = source_file.read()
        token_list, sentence_starts = cls.tokenize_text(contents)
        doc_metadata: dict[str, Any] = {"filename": text}
        return [token_list], [sentence_starts], [doc_metadata]

406409
@classmethod
def process_string(cls, text: str):
    """Process one raw string.

    Returns the parallel triple ``([tokens], [sent_starts], [metadata])``.
    """
    token_list, sentence_starts = cls.tokenize_text(text)
    # NOTE(review): the entire input string is stored under "filename",
    # mirroring process_text's metadata shape — confirm downstream readers
    # expect this for string input.
    string_metadata: dict[str, Any] = {"filename": text}
    return [token_list], [sentence_starts], [string_metadata]
412415

413416
@classmethod
414417
def tokenize_text(cls, doc: str) -> tuple[list[tuple[str, dict[str, str]]], list[bool]]:

0 commit comments

Comments (0)