1616import itertools
1717import os
1818import string
19+ import warnings
1920from abc import ABC , abstractmethod
2021from contextlib import contextmanager
2122from typing import List , Optional , Union
@@ -386,6 +387,14 @@ def __init__(
386387class HindiCharsTokenizer (BaseCharsTokenizer ):
387388 """Hindi grapheme tokenizer (character-based, no phonemes).
388389 Args:
390+ chars: Explicit character set string. When provided, ``charset_version`` is ignored.
391+ charset_version: Controls which default character set to use (only when ``chars`` is None).
392+ ``2`` (default) — ``case="upper"`` Devanagari + ``ascii_letters``.
393+ Hindi/Devanagari has no case distinction, so ``case="upper"`` avoids duplicating
394+ every code-point. ``ascii_letters`` covers both upper- and lower-case English for
395+ mixed-language text.
396+ ``1`` — legacy ``case="mixed"`` Devanagari + ``ascii_lowercase``. Use this value to
397+ restore models that were trained before the charset fix.
389398 punct: Whether to reserve grapheme for basic punctuation or not.
390399 apostrophe: Whether to use apostrophe or not.
391400 add_blank_at: Add blank to labels in the specified order ("last") or after tokens (any non None),
@@ -404,24 +413,36 @@ class HindiCharsTokenizer(BaseCharsTokenizer):
404413
405414 _LOCALE = "hi-IN"
406415 _PUNCT_LIST = get_ipa_punctuation_list (_LOCALE )
407- _CHARSET_STR = get_grapheme_character_set (locale = _LOCALE , case = "mixed" )
408- _CHARSET_STR += string .ascii_lowercase
416+ _CHARSET_STR = get_grapheme_character_set (locale = _LOCALE , case = "upper" ) + string . ascii_letters
417+ _CHARSET_STR_V1 = get_grapheme_character_set ( locale = _LOCALE , case = "mixed" ) + string .ascii_lowercase
409418
410419 def __init__ (
411420 self ,
412- chars = _CHARSET_STR ,
421+ chars = None ,
422+ charset_version = 2 ,
413423 punct = True ,
414424 apostrophe = True ,
415425 add_blank_at = None ,
416426 pad_with_space = False ,
417427 non_default_punct_list = _PUNCT_LIST ,
418428 text_preprocessing_func = any_locale_text_preprocessing ,
419429 ):
420- logging .warning (
421- "HindiCharsTokenizer: The default character set (case='mixed' + ascii_lowercase) "
422- "is deprecated and will change to (case='upper' + ascii_letters) in the next release. "
423- "Please pass 'chars' explicitly to avoid unexpected behavior."
424- )
430+ if chars is None :
431+ if charset_version == 1 :
432+ warnings .warn (
433+ "HindiCharsTokenizer charset_version=1 (case='mixed' + ascii_lowercase) is deprecated "
434+ "and will be removed in a future release. "
435+ "Migrate to charset_version=2 (case='upper' + ascii_letters) and retrain." ,
436+ DeprecationWarning ,
437+ stacklevel = 2 ,
438+ )
439+ chars = self ._CHARSET_STR_V1
440+ elif charset_version == 2 :
441+ chars = self ._CHARSET_STR
442+ else :
443+ raise ValueError (
444+ f"HindiCharsTokenizer: unsupported charset_version={ charset_version !r} . Use 1 (legacy) or 2."
445+ )
425446 super ().__init__ (
426447 chars = chars ,
427448 punct = punct ,
@@ -466,6 +487,14 @@ def encode(self, text):
466487class ArabicCharsTokenizer (BaseCharsTokenizer ):
467488 """Arabic grapheme tokenizer (character-based, no phonemes).
468489 Args:
490+ chars: Explicit character set string. When provided, ``charset_version`` is ignored.
491+ charset_version: Controls which default character set to use (only when ``chars`` is None).
492+ ``2`` (default) — ``case="upper"`` Arabic + ``ascii_letters``.
493+ Arabic script has no case distinction, so ``case="upper"`` avoids duplicating
494+ every code-point. ``ascii_letters`` covers both upper- and lower-case English for
495+ mixed-language text.
496+ ``1`` — legacy ``case="mixed"`` Arabic + ``ascii_letters``. Use this value to
497+ restore models that were trained before the charset fix.
469498 punct: Whether to reserve grapheme for basic punctuation or not.
470499 apostrophe: Whether to use apostrophe or not.
471500 add_blank_at: Add blank to labels in the specified order ("last") or after tokens (any non None),
@@ -483,19 +512,36 @@ class ArabicCharsTokenizer(BaseCharsTokenizer):
483512
484513 _LOCALE = "ar-MSA"
485514 _PUNCT_LIST = get_ipa_punctuation_list (_LOCALE )
486- _CHARSET_STR = get_grapheme_character_set (locale = _LOCALE , case = "mixed" )
487- _CHARSET_STR += string .ascii_letters
515+ _CHARSET_STR = get_grapheme_character_set (locale = _LOCALE , case = "upper" ) + string . ascii_letters
516+ _CHARSET_STR_V1 = get_grapheme_character_set ( locale = _LOCALE , case = "mixed" ) + string .ascii_letters
488517
489518 def __init__ (
490519 self ,
491- chars = _CHARSET_STR ,
520+ chars = None ,
521+ charset_version = 2 ,
492522 punct = True ,
493523 apostrophe = True ,
494524 add_blank_at = None ,
495525 pad_with_space = False ,
496526 non_default_punct_list = _PUNCT_LIST ,
497527 text_preprocessing_func = any_locale_text_preprocessing ,
498528 ):
529+ if chars is None :
530+ if charset_version == 1 :
531+ warnings .warn (
532+ "ArabicCharsTokenizer charset_version=1 (case='mixed' + ascii_letters) is deprecated "
533+ "and will be removed in a future release. "
534+ "Migrate to charset_version=2 (case='upper' + ascii_letters) and retrain." ,
535+ DeprecationWarning ,
536+ stacklevel = 2 ,
537+ )
538+ chars = self ._CHARSET_STR_V1
539+ elif charset_version == 2 :
540+ chars = self ._CHARSET_STR
541+ else :
542+ raise ValueError (
543+ f"ArabicCharsTokenizer: unsupported charset_version={ charset_version !r} . Use 1 (legacy) or 2."
544+ )
499545 super ().__init__ (
500546 chars = chars ,
501547 punct = punct ,
0 commit comments