@@ -81,24 +81,25 @@ def process_batch_texts(
8181 tokens = Tokens (tokens , keep_all = keep_all )
8282 if using_gpu :
8383 check_gpu_ram ()
84- current_doc_id = tokens .metadata .get ("philo_id" ).split ()[0 ]
85- if previous_philo_id != current_doc_id :
86- progress_info ["doc_count" ] += 1
87- if progress_info ["progress" ] is True :
88- progress_info ["count" ] += 1
89- if text_fetcher_args ["text_object_type" ] == "doc" :
90- print (
91- f"\r { progress_info ['progress_prefix' ]} { progress_info ['count' ]} texts processed..." ,
92- end = "" ,
93- flush = True ,
94- )
95- else :
96- print (
97- f"\r { progress_info ['progress_prefix' ]} { progress_info ['count' ]} text chunks of { progress_info ['doc_count' ]} documents processed..." ,
98- end = "" ,
99- flush = True ,
100- )
101- previous_philo_id = current_doc_id
84+ if text_fetcher_args ["is_philo_db" ] is True :
85+ current_doc_id = tokens .metadata .get ("philo_id" ).split ()[0 ]
86+ if previous_philo_id != current_doc_id :
87+ progress_info ["doc_count" ] += 1
88+ if progress_info ["progress" ] is True :
89+ progress_info ["count" ] += 1
90+ if text_fetcher_args ["text_object_type" ] == "doc" :
91+ print (
92+ f"\r { progress_info ['progress_prefix' ]} { progress_info ['count' ]} texts processed..." ,
93+ end = "" ,
94+ flush = True ,
95+ )
96+ else :
97+ print (
98+ f"\r { progress_info ['progress_prefix' ]} { progress_info ['count' ]} text chunks of { progress_info ['doc_count' ]} documents processed..." ,
99+ end = "" ,
100+ flush = True ,
101+ )
102+ previous_philo_id = current_doc_id
102103 queue .put (tokens )
103104 queue .put (None )
104105
def process_string(self, text: str, keep_all: bool = True) -> Tokens:
    """Preprocess a single in-memory string and return its first result.

    The string is submitted as a one-item batch with progress reporting
    switched off, and only the first object produced for that batch is
    returned.
    """
    # NOTE(review): both flags are written to the shared
    # ``self.text_fetcher_args`` and are not restored here, so they stay
    # in effect for later calls on this instance -- confirm that is intended.
    self.text_fetcher_args["is_philo_db"] = False  # input is not in PhiloLogic format
    self.text_fetcher_args["is_string"] = True  # input is raw text, not a file path
    silent_progress = dict(count=0, doc_count=0, progress=False, progress_prefix="")
    return next(self.__process_batch([text], keep_all, silent_progress))
288289
@@ -314,7 +315,7 @@ def __init__(
314315 text_object_type = "doc" ,
315316 ngram_config = None ,
316317 workers = None ,
317- is_text = False ,
318+ is_string = False ,
318319 ** _ , # this is meant to make the constructor accept invalid keywords
319320 ):
320321 cls .language = language
@@ -334,7 +335,7 @@ def __init__(
334335 else :
335336 cls .workers = workers
336337 cls .ngram_config = ngram_config
337- cls .is_text = is_text
338+ cls .is_string = is_string
338339
339340 @classmethod
340341 def __call__ (
@@ -363,7 +364,7 @@ def __local_process(cls, args) -> Iterable[PreparedDoc | Tokens]:
363364 text , do_nlp , keep_all , post_func = args
364365 if cls .is_philo_db is True :
365366 text_objects , sent_starts_list , metadata = cls .process_philo_text (text )
366- elif cls .is_text is True :
367+ elif cls .is_string is True :
367368 text_objects , sent_starts_list , metadata = cls .process_string (text )
368369 else :
369370 text_objects , sent_starts_list , metadata = cls .process_text (text )
def process_text(
    cls, text: str
) -> tuple[list[list[tuple[str, dict[str, str]]]], list[list[bool]], list[dict[str, Any]]]:
    """Read one UTF-8 text file and tokenize its contents.

    Args:
        text: Path of the file to read (not the text itself).

    Returns:
        A 3-tuple of single-element lists ``([tokens], [sent_starts], [metadata])``:
        the two values returned by ``cls.tokenize_text`` for the file contents,
        and a metadata dict whose ``"filename"`` is the path passed in.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    with open(text, encoding="utf-8") as input_text:
        doc: str = input_text.read()
    # The file is closed before tokenizing; only the decoded contents are needed.
    tokens, sent_starts = cls.tokenize_text(doc)
    metadata: dict[str, Any] = {"filename": text}
    # Each result is wrapped in a list of length one -- presumably so a plain
    # file has the same shape as the multi-object readers; confirm with caller.
    return [tokens], [sent_starts], [metadata]
405408
@classmethod
def process_string(
    cls, text: str
) -> tuple[list[list[tuple[str, dict[str, str]]]], list[list[bool]], list[dict[str, Any]]]:
    """Tokenize one in-memory string.

    Args:
        text: The raw text to tokenize.

    Returns:
        A 3-tuple of single-element lists ``([tokens], [sent_starts], [metadata])``:
        the two values returned by ``cls.tokenize_text`` for ``text``, and a
        metadata dict.
    """
    tokens, sent_starts = cls.tokenize_text(text)
    # NOTE(review): there is no file here, so "filename" holds the entire input
    # string -- confirm downstream consumers expect that.
    metadata: dict[str, Any] = {"filename": text}
    return [tokens], [sent_starts], [metadata]
412415
413416 @classmethod
414417 def tokenize_text (cls , doc : str ) -> tuple [list [tuple [str , dict [str , str ]]], list [bool ]]:
0 commit comments