Skip to content

Commit cb0442c

Browse files
committed
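Another fix to string processing: process_text now delegates to process_string, which returns (tokens, sent_starts, metadata); adds an is_text flag to the text fetcher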
1 parent a896e12 commit cb0442c

File tree

2 files changed

+11
-9
lines changed

2 files changed

+11
-9
lines changed

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55

66
setup(
77
name="text_preprocessing",
8-
version="1.1.1",
8+
version="1.1.1.2",
99
author="The ARTFL Project",
1010
author_email="clovisgladstone@gmail.com",
1111
packages=["text_preprocessing", "text_preprocessing.lang"],

text_preprocessing/preprocessor.py

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -219,6 +219,7 @@ def __init__(
219219
"text_object_type": text_object_type,
220220
"workers": self.workers,
221221
"ngram_config": self.ngram_config,
222+
"is_string": False
222223
}
223224

224225
def __process_batch(self, batch, keep_all, progress_info):
@@ -281,6 +282,7 @@ def process_string(self, text: str, keep_all: bool = True) -> Tokens:
281282
"""Take a string and return a list of preprocessed tokens"""
282283
progress_info = {"count": 0, "doc_count": 0, "progress": False, "progress_prefix": ""}
283284
self.text_fetcher_args["is_philo_db"] = False # Ensure string processing does not expect PhiloLogic format
285+
284286
result = self.__process_batch([text], keep_all, progress_info)
285287
return next(result)
286288

@@ -312,6 +314,7 @@ def __init__(
312314
text_object_type="doc",
313315
ngram_config=None,
314316
workers=None,
317+
is_text=False,
315318
**_, # this is meant to make the constructor accept invalid keywords
316319
):
317320
cls.language = language
@@ -331,6 +334,7 @@ def __init__(
331334
else:
332335
cls.workers = workers
333336
cls.ngram_config = ngram_config
337+
cls.is_text = is_text
334338

335339
@classmethod
336340
def __call__(
@@ -359,6 +363,8 @@ def __local_process(cls, args) -> Iterable[PreparedDoc | Tokens]:
359363
text, do_nlp, keep_all, post_func = args
360364
if cls.is_philo_db is True:
361365
text_objects, sent_starts_list, metadata = cls.process_philo_text(text)
366+
elif cls.is_text is True:
367+
text_objects, sent_starts_list, metadata = cls.process_string(text)
362368
else:
363369
text_objects, sent_starts_list, metadata = cls.process_text(text)
364370
docs = cls.__prepare_docs(text_objects, sent_starts_list, metadata)
@@ -395,18 +401,14 @@ def process_text(cls, text: str):
395401
"""Process one document. Return the transformed document"""
396402
with open(text, encoding="utf-8") as input_text:
397403
doc: str = input_text.read()
398-
tokens, sent_starts = cls.tokenize_text(doc)
399-
metadata: dict[str, Any] = {"filename": text}
400-
return tokens, sent_starts, metadata
404+
return cls.process_string(doc)
401405

402406
@classmethod
403-
def process_string(cls, text: str) -> Doc:
407+
def process_string(cls, text: str) -> tuple[list[tuple[str, dict[str, str]]], list[bool], dict[str, Any]]:
404408
"""Process one string. Return the transformed document"""
405409
tokens, sent_starts = cls.tokenize_text(text)
406-
doc = Doc(cls.model.vocab, [word for word, _ in tokens], sent_starts=sent_starts) # type: ignore
407-
for pos, (_, ext) in enumerate(tokens):
408-
doc[pos]._.ext = ext
409-
return doc
410+
metadata: dict[str, Any] = {"filename": text}
411+
return tokens, sent_starts, metadata
410412

411413
@classmethod
412414
def tokenize_text(cls, doc: str) -> tuple[list[tuple[str, dict[str, str]]], list[bool]]:

0 commit comments

Comments
 (0)