@@ -219,6 +219,7 @@ def __init__(
219219 "text_object_type" : text_object_type ,
220220 "workers" : self .workers ,
221221 "ngram_config" : self .ngram_config ,
222+ "is_string" : False
222223 }
223224
224225 def __process_batch (self , batch , keep_all , progress_info ):
def process_string(self, text: str, keep_all: bool = True) -> Tokens:
    """Preprocess a single raw string and return its tokens.

    The string is submitted as a one-item batch with progress reporting
    turned off, and the first result produced for that batch is returned.
    """
    # Turn off PhiloLogic parsing so the input is not expected in that format.
    # The setting stays in place on self.text_fetcher_args after this call.
    self.text_fetcher_args["is_philo_db"] = False
    # NOTE(review): nothing here marks the input as an in-memory string;
    # confirm the batch step does not treat `text` as a file path.
    silent_progress = {"count": 0, "doc_count": 0, "progress": False, "progress_prefix": ""}
    batch_results = self.__process_batch([text], keep_all, silent_progress)
    return next(batch_results)
286288
@@ -312,6 +314,7 @@ def __init__(
312314 text_object_type = "doc" ,
313315 ngram_config = None ,
314316 workers = None ,
317+ is_text = False ,
315318 ** _ , # this is meant to make the constructor accept invalid keywords
316319 ):
317320 cls .language = language
@@ -331,6 +334,7 @@ def __init__(
331334 else :
332335 cls .workers = workers
333336 cls .ngram_config = ngram_config
337+ cls .is_text = is_text
334338
335339 @classmethod
336340 def __call__ (
@@ -359,6 +363,8 @@ def __local_process(cls, args) -> Iterable[PreparedDoc | Tokens]:
359363 text , do_nlp , keep_all , post_func = args
360364 if cls .is_philo_db is True :
361365 text_objects , sent_starts_list , metadata = cls .process_philo_text (text )
366+ elif cls .is_text is True :
367+ text_objects , sent_starts_list , metadata = cls .process_string (text )
362368 else :
363369 text_objects , sent_starts_list , metadata = cls .process_text (text )
364370 docs = cls .__prepare_docs (text_objects , sent_starts_list , metadata )
def process_text(cls, text: str):
    """Read one UTF-8 text file and return its tokenized content.

    Args:
        text: Path of the file to read.

    Returns:
        The ``(tokens, sent_starts, metadata)`` triple produced by
        ``cls.process_string`` for the file contents, with
        ``metadata["filename"]`` set to the path given in ``text``.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    with open(text, encoding="utf-8") as input_text:
        doc: str = input_text.read()
    tokens, sent_starts, metadata = cls.process_string(doc)
    # process_string records its input under "filename"; here that input is
    # the file's contents, so put the actual path back.
    metadata["filename"] = text
    return tokens, sent_starts, metadata
401405
@classmethod
def process_string(cls, text: str) -> tuple[list[tuple[str, dict[str, str]]], list[bool], dict[str, Any]]:
    """Tokenize one string and return ``(tokens, sentence_starts, metadata)``.

    The metadata dictionary has a single entry, "filename", holding the
    input string itself.
    """
    token_pairs, sentence_flags = cls.tokenize_text(text)
    # NOTE(review): "filename" carries the full input text, not a path;
    # confirm downstream consumers expect that.
    return token_pairs, sentence_flags, {"filename": text}
410412
411413 @classmethod
412414 def tokenize_text (cls , doc : str ) -> tuple [list [tuple [str , dict [str , str ]]], list [bool ]]:
0 commit comments