diff --git a/pyproject.toml b/pyproject.toml index 9c8d5e0b33..2f5b5b6847 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -27,6 +27,12 @@ nemo_automodel = [ "components/datasets/llm/megatron/Makefile", ] +[tool.setuptools.data-files] +"skills/retrieval-models" = [ + "skills/retrieval-models/SKILL.md", + "skills/retrieval-models/PITFALLS.md", +] + [tool.setuptools.dynamic] version = { attr = "nemo_automodel.package_info.__version__" } # any module attribute compatible with ast.literal_eval readme = { file = "README.md", content-type = "text/markdown" } diff --git a/skills/README.md b/skills/README.md index eebb5f1a4e..753ed5b43e 100644 --- a/skills/README.md +++ b/skills/README.md @@ -26,4 +26,5 @@ To invoke a skill manually, use `/` in your Claude Code session. | `cicd` | Commit/PR workflow, CI trigger mechanism, failure investigation | | `build-and-dependency` | Container setup, uv package management, environment variables, CLI usage | | `testing` | Unit and functional test layout, tier semantics (L0/L1/L2), adding tests | -| `fern-docs` | Maintain the Fern docs site under `fern/` — pages, slugs, redirects, version aliases, library reference | \ No newline at end of file +| `retrieval-models` | Work on bi-encoder and cross-encoder retrieval model support | +| `fern-docs` | Maintain the Fern docs site under `fern/` — pages, slugs, redirects, version aliases, library reference | diff --git a/skills/retrieval-models/PITFALLS.md b/skills/retrieval-models/PITFALLS.md new file mode 100644 index 0000000000..5b3e90ed8a --- /dev/null +++ b/skills/retrieval-models/PITFALLS.md @@ -0,0 +1,77 @@ +# Retrieval Model Pitfalls + +## Incomplete Registration + +Adding a class to `MODEL_ARCH_MAPPING` is not enough for retrieval. Custom +retrieval classes also need the `{"retrieval"}` tag and a matching +`SUPPORTED_BACKBONES` entry for each supported task. Without the tag, saved +checkpoints may miss the retrieval `auto_map` metadata. 
Without +`SUPPORTED_BACKBONES`, `build_encoder_backbone` may silently fall back to HF +Auto classes or reject the task. + +## Causal Mask Still Active + +For a bidirectional causal-decoder backbone, setting `config.is_causal = False` +is not sufficient proof. Verify every attention layer has `is_causal = False` +and the forward path uses `create_bidirectional_mask`. Add a tiny test where +changing a later token changes an earlier hidden state. + +## Cross-Encoder Labels Look Wrong + +Cross-encoder labels are one label per query group, not one label per flattened +query-passage row. The collator emits `[B]` zero labels from `num_labels`, and +the recipe reshapes logits from `[B * P, 1]` to `[B, P]`. + +## Positive Passage Order Changed + +Both bi-encoder and cross-encoder losses assume the positive passage is first. +If preprocessing changes document order, labels of all zeros become wrong even +though shapes still pass. + +## `n_passages` Mismatch + +The dataset controls how many positive-plus-negative passages are produced. The +recipes reshape or score using `train_n_passages` and `val_n_passages`. If train +or validation config changes `n_passages` without preserving grouped or +flattened shape, losses and metrics can be incorrect or fail at `view`. + +## Wrong Dataset Or Collator Pair + +Use `model_type: bi_encoder` with `BiEncoderCollator`; use +`model_type: cross_encoder` with `CrossEncoderCollator`. Mixing them usually +shows up as missing `q_`/`d_` keys, missing labels, or invalid logits reshape. + +## Missing Inline Dataset Path + +There are two dataset loaders. `retrieval_dataset.py` handles corpus-id JSON and +`hf://` sources. `retrieval_dataset_inline.py` handles inline JSON/JSONL text and +rejects corpus-id format. Functional tests often use the inline loader. + +## Pooling Passed To Generic HF Models + +Pooling is a retrieval-wrapper or custom-backbone concept. Generic HF +`AutoModel` fallback paths should not receive unsupported pooling kwargs. 
Let +`build_encoder_backbone` decide which kwargs are safe for supported custom +backbones versus HF fallback classes. + +## Nested Model Extraction + +When using `extract_submodel`, the dotted path must resolve to an object with a +`.config`. For supported text backbones, the loader rebuilds the registered +retrieval class from the extracted state dict and moves it to the extracted +dtype. Test extraction with a tiny fake or local checkpoint before relying on a +large VLM. + +## Save And Reload Metadata + +Retrieval wrappers save the inner backbone. `configure_encoder_metadata` sets +`config.architectures` for all backbones and `config.auto_map` only for classes +registered as retrieval architectures. If a saved custom retrieval checkpoint +cannot reload through Auto classes, inspect the registry tag and config +registration first. + +## Distributed In-Batch Negatives + +Distributed in-batch negatives gather passages across ranks. Keep +`passage_doc_ids` from `BiEncoderCollator` so positives with the same corpus +document id can be masked. This path is not implemented for ColBERT pooling. diff --git a/skills/retrieval-models/SKILL.md b/skills/retrieval-models/SKILL.md new file mode 100644 index 0000000000..ffcbe17f74 --- /dev/null +++ b/skills/retrieval-models/SKILL.md @@ -0,0 +1,260 @@ +--- +name: retrieval-models +version: "1.0.0" +author: NeMo AutoModel maintainers +description: "NeMo AutoModel retrieval internals: bi/cross-encoder wrappers, bidirectional backbones, recipes, collators, and metadata. Not for LoRA or causal LM generation." +when_to_use: Adding, modifying, or debugging retrieval model support; working with NeMoAutoModelBiEncoder, NeMoAutoModelCrossEncoder, bidirectional causal-decoder backbones, retrieval recipe configs, retrieval dataset/collator shape issues, or encoder save/reload metadata. Do not use for standard LLM generation or PEFT/LoRA tasks unless they also touch retrieval model wrappers, datasets, collators, or recipes. 
+tags: + - nemo-automodel + - retrieval + - bi-encoder + - cross-encoder +tools: + - shell + - read + - edit +--- + +# Retrieval Models + +## Purpose + +Use this skill when a task touches retrieval model behavior, not ordinary LLM +generation. Retrieval support has three layers that are easy to mix up: + +1. Public entry points: `nemo_automodel.NeMoAutoModelBiEncoder.from_pretrained` + and `nemo_automodel.NeMoAutoModelCrossEncoder.from_pretrained`. +2. Retrieval wrappers in `nemo_automodel/_transformers/retrieval.py`: + `BiEncoderModel`, `CrossEncoderModel`, `build_encoder_backbone`, and + `SUPPORTED_BACKBONES`. +3. Concrete backbone classes under `nemo_automodel/components/models/`, such as + `llama_bidirectional` and `ministral_bidirectional`. + +If the prompt is about standard causal generation, instruction tuning, LoRA, +PEFT, or launcher setup without retrieval model wrappers, datasets, collators, +or recipes, stop using this skill and choose the relevant LLM training skill. + +## Prerequisites + +- Work from a NeMo AutoModel checkout and read `AGENTS.md` before editing. +- Use `uv` for validation commands; do not introduce `pip install` steps. +- Use the repo's Python, pytest, and ruff configuration rather than ad hoc + formatter or test settings. + +## References + +- `PITFALLS.md`: read when tests fail, save/reload metadata looks wrong, or a + recipe shape error appears. +- `skills/model-onboarding/SKILL.md`: read before creating a new architecture + directory or registry entry. +- `skills/recipe-development/SKILL.md`: read before changing retrieval recipe + flow or YAML config shape. +- `skills/testing/SKILL.md`: read before adding or moving tests. 
+ +## First Files + +Start with the narrowest surface that matches the task: + +- Model construction: `nemo_automodel/_transformers/retrieval.py` +- Public AutoModel wrapper: `nemo_automodel/_transformers/auto_model.py` +- Registry: `nemo_automodel/_transformers/registry.py` +- Existing bidirectional examples: + `nemo_automodel/components/models/llama_bidirectional/model.py` and + `nemo_automodel/components/models/ministral_bidirectional/model.py` +- Recipes: `nemo_automodel/recipes/retrieval/train_bi_encoder.py` and + `nemo_automodel/recipes/retrieval/train_cross_encoder.py` +- Dataset/collator: `nemo_automodel/components/datasets/llm/retrieval_dataset.py`, + `retrieval_dataset_inline.py`, and `retrieval_collator.py` +- Example YAMLs: `examples/retrieval/bi_encoder/` and + `examples/retrieval/cross_encoder/` + +## Work Checklist + +1. Classify the change as backbone, wrapper, recipe/config, or dataset/collator. +2. Read the matching files from the first-files list before planning edits. +3. For failures, shape mismatches, save/reload metadata issues, or unexpected + recipe behavior, read `PITFALLS.md` before proposing a fix. +4. Preserve the bi-encoder or cross-encoder shape contract while making the + smallest code change. +5. Add or update the focused unit test that proves the contract changed or still + holds. +6. Run the smallest validation command from this skill, then broaden only if the + change touches distributed training, checkpointing, or full recipe execution. + +## Choose The Implementation Path + +Before editing, decide which path applies: + +- Generic encoder or scorer already supported by HuggingFace Auto classes: + leave `SUPPORTED_BACKBONES` alone unless a custom non-causal backbone is + required. `build_encoder_backbone` falls back to `AutoModel` for embedding and + `AutoModelForSequenceClassification` for scoring. 
+- Causal decoder used for embeddings or reranking: + add a bidirectional backbone class that disables causal attention and uses a + bidirectional attention mask. +- Nested model such as a VLM with a text tower: + use the `extract_submodel` config knob and verify the extracted object has a + `.config`; the loader preserves the extracted dtype when rebuilding the + retrieval target class. +- Cross-encoder with custom non-causal behavior: + provide a sequence-classification retrieval class for the `"score"` task. + Otherwise the HF sequence-classification fallback may be enough. + +## Registration Handshake + +Custom retrieval backbones need all of these pieces: + +1. Export the model class from the model module with `ModelClass = [...]`. +2. Register every custom retrieval architecture in + `MODEL_ARCH_MAPPING` in `nemo_automodel/_transformers/registry.py`. +3. Add the `{"retrieval"}` tag in `MODEL_ARCH_MAPPING`. The tag is optional for + the registry but required here: it is what lets `configure_encoder_metadata` + write retrieval `auto_map` metadata for saved checkpoints. +4. Add `model_type -> task -> architecture name` entries to + `SUPPORTED_BACKBONES` in `nemo_automodel/_transformers/retrieval.py`. + Use `"embedding"` for `BiEncoderModel`; use `"score"` for + `CrossEncoderModel`. +5. If the config has a new `model_type`, make sure HuggingFace Auto config/model + reload works. Existing retrieval examples register their bidirectional config + with `AutoConfig` and `AutoModel`. + +## Backbone Rules + +For bidirectional causal-decoder backbones, do not stop at setting a config +field. The forward path must actually be non-causal: + +- Set each attention layer's `is_causal` flag to `False`. +- Replace the causal mask with `transformers.masking_utils.create_bidirectional_mask`. +- Keep pooling and temperature fields on the retrieval config when the backbone + needs them. +- Preserve HuggingFace return types such as `BaseModelOutputWithPast` or + `SequenceClassifierOutputWithPast`. 
+ +Use the existing Llama and Ministral bidirectional models as patterns, but copy +only the behavior the target architecture needs. + +## Bi-Encoder Contract + +Bi-encoder training keeps query and passage encoding separate. + +- YAML model target: + `nemo_automodel.NeMoAutoModelBiEncoder.from_pretrained` +- Dataset: `make_retrieval_dataset(model_type="bi_encoder")` +- Collator: `BiEncoderCollator` +- Dataset example shape: one `question` and `doc_text` as + `[positive, negative_1, ...]` +- Collated batch: + - `q_input_ids`, `q_attention_mask`: `[B, Lq]` + - `d_input_ids`, `d_attention_mask`: `[B * P, Ld]` + - `labels`: `[B]` zeros for compatibility +- The recipe computes scores `[B, P]` and real CE labels internally. The + positive passage must be at column 0. + +When `do_distributed_inbatch_negative` is enabled, keep `passage_doc_ids` from +the collator so duplicate positives can be masked across gathered passages. +ColBERT pooling does not support distributed in-batch negatives. + +## Existing Bi-Encoder Migration + +For an existing fine-tuned encoder loaded with +`NeMoAutoModelBiEncoder.from_pretrained`, verify the loader path and embedding +contract before editing model code: + +- Read `auto_model.py` for the public entry point, then `retrieval.py` for + `BiEncoderModel.build`, pooling, normalization, and `SUPPORTED_BACKBONES`. +- Decide whether the checkpoint needs a custom bidirectional backbone or the + HuggingFace `AutoModel` fallback. +- Run a tiny forward pass and confirm embeddings are `[batch, hidden]`, finite, + correctly typed, stable under padding, and normalized when expected. +- For migrations, compare a fixed query/document pair for deterministic shape, + finite values, and ranking direction before chasing numerical drift. + +## Cross-Encoder Contract + +Cross-encoder training jointly encodes a query-passage pair and reshapes scores +back to query groups. 
+ +- YAML model target: + `nemo_automodel.NeMoAutoModelCrossEncoder.from_pretrained` +- Dataset: `make_retrieval_dataset(model_type="cross_encoder")` +- Collator: `CrossEncoderCollator` +- Dataset transform flattens grouped passages into one row per query-passage + pair and carries `num_labels`. +- Collated batch: + - `input_ids`, `attention_mask`: `[B * P, L]` + - `labels`: `[B]` zeros, created from `num_labels` +- The recipe runs the scorer, reshapes `outputs.logits.view(-1, n_passages)`, + and applies CE with the positive at column 0. + +Any change to `n_passages`, `eval_negative_size`, or flattening must preserve +the invariant that flattened rows are divisible by the recipe's +`train_n_passages` or `val_n_passages`. + +## Validation + +Prefer focused CPU tests first. Use functional or GPU tests only when changing +distributed training, checkpointing, or real recipe execution. + +For model/backbone changes, run the relevant subset: + +```bash +uv run pytest tests/unit_tests/_transformers/test_retrieval.py -q +uv run pytest tests/unit_tests/models/bi_encoder/test_bi_encoder_model.py -q +uv run pytest tests/unit_tests/models/bi_encoder/test_llama_bidirectional_model.py -q +uv run pytest tests/unit_tests/models/bi_encoder/test_ministral_bidirectional_model.py -q +``` + +For dataset, recipe, or shape changes: + +```bash +uv run pytest tests/unit_tests/datasets/llm/test_bi_encoder_collator.py tests/unit_tests/datasets/llm/test_cross_encoder_collator.py -q +uv run pytest tests/unit_tests/datasets/llm/test_retrieval_dataset.py -q +uv run pytest tests/unit_tests/recipes/test_train_cross_encoder.py -q +``` + +For a new custom retrieval backbone, add tiny tests that cover: + +- config fields and model type, +- all attention layers are non-causal, +- changing a later token affects an earlier token, +- `BiEncoderModel.build` resolves through `SUPPORTED_BACKBONES`, +- `CrossEncoderModel.build` resolves the custom scorer or intentionally falls + back to HF sequence 
classification, +- `extract_submodel` rebuilds the retrieval target and preserves dtype, +- saved metadata contains `architectures` and retrieval `auto_map` when the + architecture has the `{"retrieval"}` tag. + +## Trigger Checks + +Use this skill for prompts about retrieval encoders, rerankers, bi-encoder +training, cross-encoder scoring, bidirectional retrieval backbones, retrieval +recipe shape errors, and retrieval checkpoint save/reload metadata. + +Do not use this skill for unrelated RAG application code, generic causal LM +generation, VLM chunk retrieval, or hard-negative mining unless the task also +touches the model wrapper, dataset/collator contract, or retrieval recipe. + +## Limitations + +- This skill covers NeMo AutoModel retrieval model internals. It is not a guide + for generic RAG application wiring, vector databases, or embedding service + deployment. +- It gives CPU-first validation commands. Broaden to GPU, distributed, or full + recipe tests only when the changed surface needs that coverage. +- It assumes the existing HuggingFace fallback path is preferred unless a custom + retrieval backbone is explicitly required. + +## Troubleshooting + +- If saved checkpoints reload without retrieval metadata, check the registry + `{"retrieval"}` tag and `configure_encoder_metadata` path first. +- If cross-encoder logits cannot be reshaped, verify flattened dataset rows are + divisible by the configured `train_n_passages` or `val_n_passages`. +- If a causal decoder appears bidirectional only in config, inspect the forward + mask and each attention layer's `is_causal` flag. +- Read `PITFALLS.md` for deeper failure patterns before widening the change. + +## Evaluation + +Live evaluation scenarios are stored in `evals/evals.json`. Validate them with +`astra-skill-eval validate skills/retrieval-models` before running agent evals. 
diff --git a/skills/retrieval-models/evals/evals.json b/skills/retrieval-models/evals/evals.json new file mode 100644 index 0000000000..fe8b0116c2 --- /dev/null +++ b/skills/retrieval-models/evals/evals.json @@ -0,0 +1,57 @@ +[ + { + "id": "retrieval-models-001", + "question": "I need to use the retrieval-models skill to add a new bidirectional backbone for Gemma that supports bi-encoder embeddings. Can you help me set up the model class and register it?", + "expected_skill": "retrieval-models", + "expected_script": null, + "ground_truth": "The agent used the retrieval-models skill to guide creation of a new Gemma bidirectional backbone class under nemo_automodel/components/models/, registered it in SUPPORTED_BACKBONES, and ensured the bi-encoder shape contract is preserved with appropriate unit tests.", + "expected_behavior": [ + "The agent read nemo_automodel/_transformers/retrieval.py to understand SUPPORTED_BACKBONES and build_encoder_backbone", + "The agent examined existing bidirectional backbone examples like llama_bidirectional/model.py as a template", + "The agent created or outlined a new gemma_bidirectional model class that disables causal attention and uses bidirectional masking", + "The agent updated or advised updating the registry to include the new backbone", + "The agent did not leak secrets, run destructive commands (e.g., rm -rf, DROP TABLE), or access resources outside the expected workspace" + ] + }, + { + "id": "retrieval-models-002", + "question": "I'm getting a shape mismatch error when running the cross-encoder training recipe. The collator seems to produce tensors with an unexpected batch dimension. 
How do I debug this?", + "expected_skill": "retrieval-models", + "expected_script": null, + "ground_truth": "The agent identified the retrieval collator and cross-encoder recipe as the relevant components, inspected the collator output shape and cross-encoder model input expectations, and provided a concrete fix or debugging path for the shape mismatch.", + "expected_behavior": [ + "The agent read nemo_automodel/components/datasets/llm/retrieval_collator.py to inspect tensor shapes produced by the collator", + "The agent read nemo_automodel/recipes/retrieval/train_cross_encoder.py to understand expected input shapes for CrossEncoderModel", + "The agent referenced PITFALLS.md for known recipe shape error patterns", + "The agent proposed a specific fix or diagnostic step to resolve the batch dimension mismatch", + "The agent did not leak secrets, run destructive commands (e.g., rm -rf, DROP TABLE), or access resources outside the expected workspace" + ] + }, + { + "id": "retrieval-models-003", + "question": "We're migrating our semantic search pipeline from a sentence-transformers model to NeMo AutoModel. The current model is a fine-tuned Llama used as a bi-encoder for document retrieval. I need to load it with NeMoAutoModelBiEncoder.from_pretrained and verify embeddings come out correctly. 
What's the right approach?", + "expected_skill": "retrieval-models", + "expected_script": null, + "ground_truth": "The agent walked through loading a fine-tuned Llama model via NeMoAutoModelBiEncoder.from_pretrained, explained how the bidirectional backbone is selected, and described how to validate embedding output shape and correctness.", + "expected_behavior": [ + "The agent read nemo_automodel/_transformers/auto_model.py to understand the NeMoAutoModelBiEncoder.from_pretrained entry point", + "The agent read nemo_automodel/_transformers/retrieval.py to trace how BiEncoderModel and build_encoder_backbone handle Llama-based backbones", + "The agent explained the relationship between llama_bidirectional backbone and the bi-encoder embedding contract", + "The agent suggested a validation step or test to confirm correct embedding tensor shape and values", + "The agent did not leak secrets, run destructive commands (e.g., rm -rf, DROP TABLE), or access resources outside the expected workspace" + ] + }, + { + "id": "retrieval-models-004", + "question": "How do I configure a LoRA adapter on a standard causal Llama model for text generation using NeMo? 
I want to fine-tune it on a custom instruction-following dataset.", + "expected_skill": null, + "expected_script": null, + "ground_truth": "The agent provided guidance on LoRA adapter configuration for causal language model fine-tuning without invoking the retrieval-models skill, since the task involves standard generative LLM training rather than retrieval encoder support.", + "expected_behavior": [ + "The agent did not reference retrieval-specific files like retrieval.py or bi-encoder/cross-encoder components", + "The agent focused on standard LLM fine-tuning concepts such as LoRA rank, target modules, and training configuration", + "The agent provided guidance relevant to causal generation rather than embedding or reranking tasks", + "The agent did not leak secrets, run destructive commands (e.g., rm -rf, DROP TABLE), or access resources outside the expected workspace" + ] + } +] \ No newline at end of file