Merge branch 'main' into yash-tfv5

dushyantbehl · web-flow · commit 13b501c9dc52 · 2026-02-24T19:17:31.000+05:30
diff --git a/README.md b/README.md
@@ -8,7 +8,7 @@
   - [Advanced Data Processing](./docs/advanced-data-preprocessing.md#data-config)
   - [Guidelines on supported data formats](./docs/advanced-data-preprocessing.md#use-cases-supported-via-command-line-argument-training_data_path)
   - [Offline data processing](#offline-data-preprocessing)
-  - [Online data mixing](./docs/online-data-mixing.md)
+  - [Online data mixing](./docs/advanced-data-preprocessing.md#online-data-mixing-section)
 - [Additional Frameworks](#additional-frameworks)
   - [Inference](#inference)
   - [Validation](#validation)
diff --git a/build/Dockerfile b/build/Dockerfile
@@ -25,6 +25,7 @@ ARG ENABLE_MLFLOW=false
 ARG ENABLE_FMS_ACCELERATION=true
 ARG ENABLE_SCANNER=false
 ARG ENABLE_CLEARML=false
+ARG ENABLE_RECOMMENDER=true
 
 ## Base Layer ##################################################################
 FROM registry.access.redhat.com/ubi9/ubi:${BASE_UBI_IMAGE_TAG} AS base
@@ -188,6 +189,9 @@ RUN if [[ "${ENABLE_SCANNER}" == "true" ]]; then \
 RUN if [[ "${ENABLE_CLEARML}" == "true" ]]; then \
         python -m pip install --user "$(head bdist_name)[clearml]"; \
     fi
+RUN if [[ "${ENABLE_RECOMMENDER}" == "true" ]]; then \
+        python -m pip install --user "$(head bdist_name)[tuning-config-recommender]"; \
+    fi
 
     # Clean up the wheel module. It's only needed by flash-attn install
 RUN python -m pip uninstall wheel build -y && \
diff --git a/build/nvcr.Dockerfile b/build/nvcr.Dockerfile
@@ -34,6 +34,7 @@ ARG ENABLE_MLFLOW=false
 ARG ENABLE_SCANNER=false
 ARG ENABLE_CLEARML=true
 ARG ENABLE_TRITON_KERNELS=true
+ARG ENABLE_RECOMMENDER=true
 
 # Ensures to always build mamba_ssm from source
 ENV PIP_NO_BINARY=mamba-ssm,mamba_ssm
@@ -76,6 +77,9 @@ RUN if [[ "${ENABLE_MLFLOW}" == "true" ]]; then \
 RUN if [[ "${ENABLE_SCANNER}" == "true" ]]; then \
         pip install --no-cache-dir ${SOURCE_DIR}[scanner-dev]; \
     fi
+RUN if [[ "${ENABLE_RECOMMENDER}" == "true" ]]; then \
+        pip install --no-cache-dir ${SOURCE_DIR}[tuning-config-recommender]; \
+    fi
 
 # cleanup
 RUN rm -rf /root/.cache /tmp/* /opt/pytorch
diff --git a/pyproject.toml b/pyproject.toml
@@ -60,6 +60,7 @@ fms-accel-all = [
     "fms-acceleration-moe",
     "fms-acceleration-odm"
 ]
+tuning-config-recommender=["tuning-config-recommender>=0.1.5"]
 
 [tool.setuptools.packages.find]
 exclude = ["tests", "tests.*"]
diff --git a/scripts/convert_to_hf_checkpoint.py b/scripts/convert_to_hf_checkpoint.py
@@ -0,0 +1,136 @@
+#!/usr/bin/env python3
+"""Convert a DCP checkpoint into a Hugging Face compatible checkpoint.
+
+Calls ``recover_safetensors_from_dcp`` with the DCP checkpoint, the base model
+and the output directory, then saves the tokenizer (optionally with a chat
+template and additional special tokens) and a config whose ``vocab_size``
+equals the tokenizer length into the same output directory.
+"""
+import argparse
+import os
+from pathlib import Path
+
+# HF_HOME has to be set before huggingface_hub (and transformers, which builds
+# on it) is imported: the cache paths are read from the environment at import
+# time, so setting the variable after the imports has no effect.
+HF_CACHE = "/workspace/.hf"
+os.environ.setdefault("HF_HOME", HF_CACHE)
+
+# pylint: disable=wrong-import-position
+from huggingface_hub import snapshot_download  # noqa: E402
+from transformers import AutoConfig, AutoTokenizer  # noqa: E402
+
+from fms_acceleration_moe.utils import recover_safetensors_from_dcp  # noqa: E402
+
+
+def has_weights(p: Path) -> bool:
+    """Return True if directory ``p`` holds safetensors model weights.
+
+    A single ``model.safetensors`` file, a ``model.safetensors.index.json``
+    index, or any ``model-*.safetensors`` file counts as weights.
+    """
+    return (
+        (p / "model.safetensors").exists()
+        or (p / "model.safetensors.index.json").exists()
+        or any(p.glob("model-*.safetensors"))
+    )
+
+
+def get_base_model(model_id_or_path: str, allow_download: bool) -> Path:
+    """Resolve the base model to a local directory that contains weights.
+
+    Args:
+        model_id_or_path: Existing local path, or a repo id handed to
+            ``snapshot_download`` when no such path exists.
+        allow_download: Whether a model that is missing locally may be
+            downloaded.
+
+    Returns:
+        The resolved local model directory.
+
+    Raises:
+        RuntimeError: If the directory (local or downloaded) has no weights,
+            or the path does not exist and downloads are disabled.
+    """
+    p = Path(model_id_or_path)
+
+    if p.exists():
+        if not has_weights(p):
+            raise RuntimeError(f"No base weights found in {p}")
+        return p.resolve()
+
+    if not allow_download:
+        raise RuntimeError("Base model not found locally and downloads disabled")
+
+    # Only fetch the config, weight and tokenizer files listed below.
+    local_dir = snapshot_download(
+        repo_id=model_id_or_path,
+        allow_patterns=[
+            "config.json",
+            "model*.safetensors",
+            "tokenizer*",
+            "special_tokens_map.json",
+            "generation_config.json",
+        ],
+    )
+
+    local_dir = Path(local_dir).resolve()
+    if not has_weights(local_dir):
+        raise RuntimeError(f"Downloaded base model but weights missing in {local_dir}")
+
+    return local_dir
+
+
+def main() -> None:
+    """Parse the CLI arguments and write the HF checkpoint to ``--output_dir``."""
+    ap = argparse.ArgumentParser()
+    ap.add_argument("--dcp_checkpoint_dir", required=True, type=Path)
+    ap.add_argument("--pretrained_model_name_or_path", required=True)
+    ap.add_argument("--output_dir", required=True, type=Path)
+    ap.add_argument("--allow_model_download", action="store_true")
+    ap.add_argument(
+        "--additional_special_tokens",
+        nargs="*",
+        default=[],
+    )
+    ap.add_argument("--chat_template", type=str, default=None)
+    args = ap.parse_args()
+
+    args.output_dir.mkdir(parents=True, exist_ok=True)
+
+    # base model (local snapshot)
+    base_model_dir = get_base_model(
+        args.pretrained_model_name_or_path,
+        args.allow_model_download,
+    )
+
+    # dcp to hf compatible
+    recover_safetensors_from_dcp(
+        str(args.dcp_checkpoint_dir),
+        str(base_model_dir),
+        str(args.output_dir),
+    )
+
+    # tokenizer chat_template plus additional tokens
+    tokenizer = AutoTokenizer.from_pretrained(args.pretrained_model_name_or_path)
+    if args.chat_template is not None:
+        tokenizer.chat_template = args.chat_template
+    if args.additional_special_tokens:
+        tokenizer.add_special_tokens(
+            {"additional_special_tokens": args.additional_special_tokens}
+        )
+    tokenizer.save_pretrained(args.output_dir)
+
+    # Save the base config with vocab_size set to the tokenizer length.
+    # NOTE(review): assumes the recovered embeddings have exactly
+    # len(tokenizer) rows - confirm for base models with a padded vocab.
+    config = AutoConfig.from_pretrained(base_model_dir)
+    config.vocab_size = len(tokenizer)
+    config.save_pretrained(args.output_dir)
+
+    print(f"[OK] HF checkpoint written to {args.output_dir}")
+    print(f"[OK] vocab_size = {len(tokenizer)}")
+
+
+if __name__ == "__main__":
+    main()

Original file line number	Diff line number	Diff line change
`@@ -60,6 +60,7 @@ fms-accel-all = [`
`60`	`60`	`"fms-acceleration-moe",`
`61`	`61`	`"fms-acceleration-odm"`
`62`	`62`	`]`
	`63`	`+tuning-config-recommender=["tuning-config-recommender>=0.1.5"]`
`63`	`64`
`64`	`65`	`[tool.setuptools.packages.find]`
`65`	`66`	`exclude = ["tests", "tests.*"]`