File tree Expand file tree Collapse file tree
torchTextClassifiers/tokenizers Expand file tree Collapse file tree Original file line number Diff line number Diff line change @@ -103,6 +103,11 @@ def __repr__(self):
def __call__(self, text: Union[str, List[str]], **kwargs) -> list:
    """Make the tokenizer instance callable.

    Forwards ``text`` and every keyword argument unchanged to
    ``self.tokenize`` and returns its result.
    """
    tokens = self.tokenize(text, **kwargs)
    return tokens
105105
106+ @classmethod
107+ @abstractmethod
def load_from_s3(cls, s3_path: str, filesystem):
    """Load a tokenizer from ``s3_path`` using ``filesystem``.

    Declared as an abstract classmethod: this base version has no
    behaviour of its own and returns ``None``; concrete tokenizers
    provide the implementation.

    Args:
        s3_path: Location of the saved tokenizer.
        filesystem: Filesystem handle used to read ``s3_path``
            (presumably an fsspec-style object -- confirm with callers).
    """
110+
106111
107112class HuggingFaceTokenizer (BaseTokenizer ):
108113 def __init__ (
Original file line number Diff line number Diff line change @@ -432,11 +432,24 @@ def save_pretrained(self, save_directory: str):
432432 print (f"✓ Tokenizer saved to { save_directory } " )
433433
434434 @classmethod
435- def from_pretrained (cls , directory : str ):
def load_from_s3(cls, s3_path: str, filesystem):
    """Load tokenizer from a saved JSON configuration on a remote filesystem.

    Args:
        s3_path: Location of the JSON configuration; passed unchanged to
            ``filesystem.open``.
        filesystem: Object whose ``open(path, "r")`` returns a text-mode
            file usable as a context manager (presumably an fsspec/s3fs
            filesystem -- confirm against callers).

    Returns:
        Whatever ``cls.build_from_config`` returns for the parsed config.
    """
    # Close the remote handle deterministically instead of leaving it
    # open until garbage collection.
    with filesystem.open(s3_path, "r") as remote_file:
        config = json.load(remote_file)
    return cls.build_from_config(config)
441+
442+ @classmethod
def load(cls, path: str):
    """Build a tokenizer from a JSON configuration file on local disk.

    Args:
        path: Path of the JSON configuration file to read.

    Returns:
        Whatever ``cls.build_from_config`` returns for the parsed config.
    """
    with open(path, "r") as config_file:
        loaded_config = json.load(config_file)
    return cls.build_from_config(loaded_config)
439450
451+ @classmethod
452+ def build_from_config (cls , config ):
440453 tokenizer = cls (
441454 min_count = config ["min_count" ],
442455 min_n = config ["min_n" ],
@@ -468,5 +481,4 @@ def from_pretrained(cls, directory: str):
468481 )
469482 print ("✓ Subword cache built" )
470483
471- print (f"✓ Tokenizer loaded from { directory } " )
472484 return tokenizer
You can’t perform that action at this time.
0 commit comments