Skip to content

Commit aa2692e

Browse files
authored
fix(evo2): replace removed _HuggingFaceTokenizer import (NVIDIA-BioNeMo#1595)
## Summary

Fix `ImportError: cannot import name '_HuggingFaceTokenizer' from 'megatron.bridge.training.tokenizers.tokenizer'` in the Evo2 LoRA fine-tuning notebook CI test.

### Root Cause

Megatron-Bridge v0.4.1 removed the `_HuggingFaceTokenizer` class from `megatron.bridge.training.tokenizers.tokenizer`. The `evo2_classifier.py` script still imported and used it.

### Changes

- **Import**: `_HuggingFaceTokenizer` → `HuggingFaceTokenizer`, now imported from `megatron.core.tokenizers.text.libraries.huggingface_tokenizer`
- **All type annotations and constructor calls** updated to use the new class name
- **Pad token access**: `tokenizer.pad` → `tokenizer.pad_id` (with the `except` clause broadened to also catch `AttributeError`, for backward compatibility)
- The existing `hasattr(tokenizer, "tokenize")` guard already handles the method rename (`tokenize()` → `text_to_ids()`)

### Testing

- Pre-commit passes
- 1 file changed, 8 insertions, 8 deletions (minimal diff)

### CI failure

https://github.com/NVIDIA-BioNeMo/bionemo-framework/actions/runs/26790293271/job/78975043946

Signed-off-by: svc-bionemo <267129667+svc-bionemo@users.noreply.github.com>
Co-authored-by: svc-bionemo <267129667+svc-bionemo@users.noreply.github.com>
1 parent 40d64e0 commit aa2692e

1 file changed

Lines changed: 8 additions & 8 deletions

File tree

bionemo-recipes/recipes/evo2_megatron/examples/evo2_classifier.py

Lines changed: 8 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -68,7 +68,6 @@
6868
from megatron.bridge.training.mixed_precision import get_mixed_precision_config
6969
from megatron.bridge.training.pretrain import pretrain
7070
from megatron.bridge.training.state import GlobalState
71-
from megatron.bridge.training.tokenizers.tokenizer import _HuggingFaceTokenizer
7271
from megatron.bridge.training.utils.checkpoint_utils import (
7372
get_checkpoint_run_config_filename,
7473
read_run_config,
@@ -77,6 +76,7 @@
7776
from megatron.bridge.utils.vocab_utils import calculate_padded_vocab_size
7877
from megatron.core import dist_checkpointing, parallel_state
7978
from megatron.core.num_microbatches_calculator import destroy_num_microbatches_calculator
79+
from megatron.core.tokenizers.text.libraries.huggingface_tokenizer import HuggingFaceTokenizer
8080
from megatron.core.transformer.module import Float16Module
8181
from megatron.core.transformer.spec_utils import ModuleSpec
8282
from torch import Tensor
@@ -315,15 +315,15 @@ def __init__(
315315
self,
316316
sequences: Sequence[str],
317317
labels: Sequence[int],
318-
tokenizer: _HuggingFaceTokenizer,
318+
tokenizer: HuggingFaceTokenizer,
319319
seq_length: int,
320320
pad_token_id: Optional[int] = None,
321321
) -> None:
322322
"""Tokenize once into pre-allocated tensors so ``__getitem__`` is just a slice."""
323323
if pad_token_id is None:
324324
try:
325-
pad_token_id = tokenizer.pad
326-
except NotImplementedError:
325+
pad_token_id = tokenizer.pad_id
326+
except (NotImplementedError, AttributeError):
327327
pad_token_id = None
328328
if pad_token_id is None:
329329
pad_token_id = 1
@@ -378,7 +378,7 @@ class Evo2ClassifierDatasetProvider(DatasetProvider):
378378

379379
def build_datasets(self, context: DatasetBuildContext) -> tuple[Optional[Any], Optional[Any], Optional[Any]]:
380380
"""Tokenize the JSONL splits into :class:`Evo2ClassifierDataset` objects."""
381-
tokenizer = context.tokenizer if context.tokenizer is not None else _HuggingFaceTokenizer(self.tokenizer_path)
381+
tokenizer = context.tokenizer if context.tokenizer is not None else HuggingFaceTokenizer(self.tokenizer_path)
382382

383383
def _build(path: Optional[str]) -> Optional[Evo2ClassifierDataset]:
384384
if path is None:
@@ -706,7 +706,7 @@ def _cleanup_inference_distributed() -> None:
706706

707707
def _build_classifier_from_checkpoint(
708708
trained_ckpt_dir: Path,
709-
) -> tuple[list[nn.Module], _HuggingFaceTokenizer, int]:
709+
) -> tuple[list[nn.Module], HuggingFaceTokenizer, int]:
710710
"""Rebuild a trained classifier model from a saved checkpoint.
711711
712712
Reads the ``run_config.yaml`` saved alongside the trained checkpoint to
@@ -750,9 +750,9 @@ def _build_classifier_from_checkpoint(
750750
# -------------------------------------------------------------------------
751751
tokenizer_dir = resolved_ckpt_dir / "tokenizer"
752752
if tokenizer_dir.exists():
753-
tokenizer = _HuggingFaceTokenizer(str(tokenizer_dir))
753+
tokenizer = HuggingFaceTokenizer(str(tokenizer_dir))
754754
else:
755-
tokenizer = _HuggingFaceTokenizer(DEFAULT_HF_TOKENIZER_MODEL_PATH_512)
755+
tokenizer = HuggingFaceTokenizer(DEFAULT_HF_TOKENIZER_MODEL_PATH_512)
756756

757757
model_provider.vocab_size = tokenizer.vocab_size
758758
model_provider.should_pad_vocab = True

0 commit comments

Comments
 (0)