Skip to content

Commit 593ce32

Browse files
chore: force all tokenizers to have a load_from_s3 method
1 parent 1ee4798 commit 593ce32

2 files changed

Lines changed: 20 additions & 3 deletions

File tree

torchTextClassifiers/tokenizers/base.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,11 @@ def __repr__(self):
103103
def __call__(self, text: Union[str, List[str]], **kwargs) -> list:
104104
return self.tokenize(text, **kwargs)
105105

106+
@classmethod
@abstractmethod
def load_from_s3(cls, s3_path: str, filesystem):
    """Load a tokenizer from a configuration stored on S3.

    Abstract alternate constructor: every concrete tokenizer must
    implement it so callers can rely on a uniform loading interface.

    Args:
        s3_path: Location of the saved tokenizer configuration on S3.
        filesystem: Filesystem object used to open ``s3_path``
            (presumably fsspec-style, exposing ``open`` — TODO confirm).

    Returns:
        A tokenizer instance of the implementing class.
    """
110+
106111

107112
class HuggingFaceTokenizer(BaseTokenizer):
108113
def __init__(

torchTextClassifiers/tokenizers/ngram.py

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -432,11 +432,24 @@ def save_pretrained(self, save_directory: str):
432432
print(f"✓ Tokenizer saved to {save_directory}")
433433

434434
@classmethod
def load_from_s3(cls, s3_path: str, filesystem):
    """Load tokenizer from a saved configuration on S3.

    Args:
        s3_path: Path of the saved ``tokenizer.json`` on S3.
        filesystem: Filesystem object exposing ``open`` (presumably
            fsspec-style — TODO confirm against callers).

    Returns:
        A tokenizer instance built via ``build_from_config``.
    """
    # Context manager guarantees the remote file handle is closed;
    # the original left the filesystem.open(...) handle dangling.
    with filesystem.open(s3_path, "r") as f:
        config = json.load(f)
    return cls.build_from_config(config)
441+
442+
@classmethod
def load(cls, path: str):
    """Load tokenizer from a saved configuration on local disk.

    Args:
        path: Path to the saved ``tokenizer.json`` file.

    Returns:
        A tokenizer instance built via ``build_from_config``.
    """
    # JSON files are UTF-8 by spec; relying on the platform-default
    # encoding (the original behavior) breaks on e.g. Windows cp1252.
    with open(path, "r", encoding="utf-8") as f:
        config = json.load(f)
    return cls.build_from_config(config)
439450

451+
@classmethod
452+
def build_from_config(cls, config):
440453
tokenizer = cls(
441454
min_count=config["min_count"],
442455
min_n=config["min_n"],
@@ -468,5 +481,4 @@ def from_pretrained(cls, directory: str):
468481
)
469482
print("✓ Subword cache built")
470483

471-
print(f"✓ Tokenizer loaded from {directory}")
472484
return tokenizer

0 commit comments

Comments (0)