Merge branch 'main' into dcp-hf-util

dushyantbehl · web-flow · commit 24a942e2502c · 2026-02-24T18:25:31.000+05:30
diff --git a/README.md b/README.md
@@ -8,7 +8,7 @@
   - [Advanced Data Processing](./docs/advanced-data-preprocessing.md#data-config)
   - [Guidelines on supported data formats](./docs/advanced-data-preprocessing.md#use-cases-supported-via-command-line-argument-training_data_path)
   - [Offline data processing](#offline-data-preprocessing)
-  - [Online data mixing](./docs/online-data-mixing.md)
+  - [Online data mixing](./docs/advanced-data-preprocessing.md#online-data-mixing-section)
 - [Additional Frameworks](#additional-frameworks)
   - [Inference](#inference)
   - [Validation](#validation)
diff --git a/build/Dockerfile b/build/Dockerfile
@@ -25,6 +25,7 @@ ARG ENABLE_MLFLOW=false
 ARG ENABLE_FMS_ACCELERATION=true
 ARG ENABLE_SCANNER=false
 ARG ENABLE_CLEARML=false
+ARG ENABLE_RECOMMENDER=true
 
 ## Base Layer ##################################################################
 FROM registry.access.redhat.com/ubi9/ubi:${BASE_UBI_IMAGE_TAG} AS base
@@ -188,6 +189,9 @@ RUN if [[ "${ENABLE_SCANNER}" == "true" ]]; then \
 RUN if [[ "${ENABLE_CLEARML}" == "true" ]]; then \
         python -m pip install --user "$(head bdist_name)[clearml]"; \
     fi
+RUN if [[ "${ENABLE_RECOMMENDER}" == "true" ]]; then \
+        python -m pip install --user "$(head bdist_name)[tuning-config-recommender]"; \
+    fi
 
     # Clean up the wheel module. It's only needed by flash-attn install
 RUN python -m pip uninstall wheel build -y && \
diff --git a/build/nvcr.Dockerfile b/build/nvcr.Dockerfile
@@ -34,6 +34,7 @@ ARG ENABLE_MLFLOW=false
 ARG ENABLE_SCANNER=false
 ARG ENABLE_CLEARML=true
 ARG ENABLE_TRITON_KERNELS=true
+ARG ENABLE_RECOMMENDER=true
 
 # Ensures to always build mamba_ssm from source
 ENV PIP_NO_BINARY=mamba-ssm,mamba_ssm
@@ -76,6 +77,9 @@ RUN if [[ "${ENABLE_MLFLOW}" == "true" ]]; then \
 RUN if [[ "${ENABLE_SCANNER}" == "true" ]]; then \
         pip install --no-cache-dir ${SOURCE_DIR}[scanner-dev]; \
     fi
+# Optionally install the tuning-config-recommender extra from the source tree, like the other optional extras in this file.
+RUN if [[ "${ENABLE_RECOMMENDER}" == "true" ]]; then \
+        pip install --no-cache-dir "${SOURCE_DIR}[tuning-config-recommender]"; \
+    fi
 
 # cleanup
 RUN rm -rf /root/.cache /tmp/* /opt/pytorch
diff --git a/docs/advanced-data-preprocessing.md b/docs/advanced-data-preprocessing.md
@@ -162,7 +162,8 @@ Each data handler has:
     - `sampling` (optional, float): The sampling ratio (0.0 to 1.0) with which to sample a dataset in case of interleaving.
     - `split` (optional, dict[str: float]): Defines how to split the dataset into training and validation sets. Requires both `train` and `validation` keys.
     - `data_handlers` (optional, list): A list of data handler configurations which preprocess the dataset.
-
+    - `dataset_split_name` (optional, str): Name of the dataset split to load. This is useful for loading HuggingFace datasets whose split names differ from the standard ones (e.g. `train_sft` instead of `train`). If no `dataset_split_name` is provided, `train` is used.
+    - `shuffle` (optional, bool): Whether the dataset is shuffled when it is split into train and validation sets. Defaults to `True`. Change this with caution: set it to `False` only when the dataset is already shuffled.
 
 We do provide some sample `data_configs` here, [predefined_data_configs](../tests/artifacts/predefined_data_configs/).
 
diff --git a/pyproject.toml b/pyproject.toml
@@ -38,12 +38,12 @@ dependencies = [
 "peft>=0.18.0,< 0.19.0",
 "datasets>=4.0.0,<5.0.0",
 "simpleeval>=0.9.13,<2.0",
-"pillow>=11.0.0,<12.0",
+"pillow>=12.1.1",
 "kernels<=0.9.0",
 ]
 
 [project.optional-dependencies]
-dev = ["wheel>=0.42.0,<1.0", "packaging>=23.2,<25", "ninja>=1.11.1.1,<2.0", "scikit-learn>=1.0, <2.0", "boto3>=1.34, <2.0"]
+dev = ["wheel>=0.42.0,<1.0", "packaging>=23.2,<25", "ninja>=1.11.1.1,<2.0", "scikit-learn>=1.0, <2.0", "boto3>=1.34, <2.0", "hf_transfer>=0.1.9"]
 flash-attn = ["flash-attn>=2.8.3"]
 aim = ["aim>=3.19.0,<4.0"]
 mlflow = ["mlflow"]
@@ -60,6 +60,7 @@ fms-accel-all = [
     "fms-acceleration-moe",
     "fms-acceleration-odm"
 ]
+tuning-config-recommender=["tuning-config-recommender>=0.1.5"]
 
 [tool.setuptools.packages.find]
 exclude = ["tests", "tests.*"]
diff --git a/tests/artifacts/predefined_data_configs/__init__.py b/tests/artifacts/predefined_data_configs/__init__.py
@@ -86,3 +86,6 @@
 DATA_CONFIG_SKIP_LARGE_COLUMNS_HANDLER = os.path.join(
     PREDEFINED_DATA_CONFIGS, "skip_large_columns_data_handler_template.yaml"
 )
+DATA_CONFIG_CUSTOM_SPLIT_NAME = os.path.join(
+    PREDEFINED_DATA_CONFIGS, "dataset_with_custom_split.yaml"
+)
diff --git a/tests/artifacts/predefined_data_configs/dataset_with_custom_split.yaml b/tests/artifacts/predefined_data_configs/dataset_with_custom_split.yaml
@@ -0,0 +1,34 @@
+dataprocessor:
+    type: default
+    sampling_stopping_strategy: first_exhausted
+    seed: 66
+datasets:
+  - name: dataset_split_custom_split_name
+    split:
+      train: 0.8
+      validation: 0.2
+    sampling: 0.5
+    dataset_split_name: "train_sft"
+    data_paths:
+      - "FILE_PATH"
+    data_handlers:
+      - name: tokenize_and_apply_chat_template_with_masking
+        arguments:
+          remove_columns: all
+          batched: false
+          fn_kwargs:
+            formatted_text_column_name: "formatted_chat_data"
+            conversation_column: "messages"
+  - name: dataset_wo_split_custom_split_name
+    sampling: 0.5
+    dataset_split_name: "train_sft"
+    data_paths:
+      - "FILE_PATH"
+    data_handlers:
+      - name: tokenize_and_apply_chat_template_with_masking
+        arguments:
+          remove_columns: all
+          batched: false
+          fn_kwargs:
+            formatted_text_column_name: "formatted_chat_data"
+            conversation_column: "messages"
diff --git a/tests/artifacts/testdata/__init__.py b/tests/artifacts/testdata/__init__.py
@@ -86,6 +86,7 @@
 IMAGE_DATASET = os.path.join(JSONL_DATA_DIR, "image_dataset.jsonl")
 EMPTY_DATA = os.path.join(JSON_DATA_DIR, "empty_data.json")
 MALFORMATTED_DATA = os.path.join(JSON_DATA_DIR, "malformatted_data.json")
+CHAT_DATA_HF_HOSTED_CUSTOM_SPLIT = "rom7/test-OpenHermes-2.5-H4"
 
 # Other constants
 CUSTOM_TOKENIZER_TINYLLAMA = os.path.join(
diff --git a/tests/test_sft_trainer.py b/tests/test_sft_trainer.py
@@ -42,6 +42,7 @@
 from tests.artifacts.language_models import MAYKEYE_TINY_LLAMA_CACHED, TINYMIXTRAL_MOE
 from tests.artifacts.predefined_data_configs import (
     CHAT_TEMPLATE_JINJA,
+    DATA_CONFIG_CUSTOM_SPLIT_NAME,
     DATA_CONFIG_DUPLICATE_COLUMNS,
     DATA_CONFIG_INVALID_BASE64_CHAT_TEMPLATE,
     DATA_CONFIG_MULTIPLE_DATASETS_ODM_YAML,
@@ -61,6 +62,7 @@
     GRANITE_3_1_B_CHAT_TEMPLATE,
 )
 from tests.artifacts.testdata import (
+    CHAT_DATA_HF_HOSTED_CUSTOM_SPLIT,
     CHAT_DATA_MULTI_TURN,
     CHAT_DATA_MULTI_TURN_CONVERSATIONS,
     CHAT_DATA_MULTI_TURN_GRANITE_3_1B,
@@ -949,16 +951,16 @@ def test_run_causallm_lora_add_special_tokens():
             ["lm_head"],
             ["embed_tokens"],
             marks=pytest.mark.skipif(
-                version.parse(peft.__version__) <= version.parse("0.18.0"),
-                reason="Not released in PEFT <= 0.18.0",
+                version.parse(peft.__version__) <= version.parse("0.18.1"),
+                reason="Not released in PEFT <= 0.18.1",
             ),
         ),
         pytest.param(
             ["embed_tokens", "lm_head"],
             ["embed_tokens"],
             marks=pytest.mark.skipif(
-                version.parse(peft.__version__) <= version.parse("0.18.0"),
-                reason="Not released in PEFT <= 0.18.0",
+                version.parse(peft.__version__) <= version.parse("0.18.1"),
+                reason="Not released in PEFT <= 0.18.1",
             ),
         ),
     ],
@@ -1010,8 +1012,8 @@ def test_run_causallm_lora_tied_weights_in_modules_to_save(modules_to_save, expe
     ],
 )
 @pytest.mark.skipif(
-    version.parse(peft.__version__) <= version.parse("0.18.0"),
-    reason="Not released in PEFT <= 0.18.0",
+    version.parse(peft.__version__) <= version.parse("0.18.1"),
+    reason="Not released in PEFT <= 0.18.1",
 )
 def test_run_causallm_lora_tied_weights_in_target_modules(target_modules, expected):
     """Check if a model with tied weights in target_modules is correctly trained"""
@@ -1916,6 +1918,61 @@ def test_run_moe_ft_with_save_model_dir(dataset_path):
         assert os.path.exists(os.path.join(save_model_dir))
 
 
+@pytest.mark.parametrize(
+    "datafiles, dataconfigfile",
+    [
+        (
+            [CHAT_DATA_HF_HOSTED_CUSTOM_SPLIT, CHAT_DATA_HF_HOSTED_CUSTOM_SPLIT],
+            DATA_CONFIG_CUSTOM_SPLIT_NAME,
+        )
+    ],
+)
+def test_run_chat_style_ft_using_custom_split_name(datafiles, dataconfigfile):
+    """Check if we can select custom split for a dataset."""
+    with tempfile.TemporaryDirectory() as tempdir:
+        data_args = copy.deepcopy(DATA_ARGS)
+        data_args.training_data_path = None
+        data_args.response_template = None
+        data_args.dataset_text_field = None
+        data_args.chat_template = CHAT_TEMPLATE_JINJA
+
+        model_args = copy.deepcopy(MODEL_ARGS)
+        model_args.model_name_or_path = TINYMIXTRAL_MOE
+        model_args.tokenizer_name_or_path = TINYMIXTRAL_MOE
+
+        train_args = copy.deepcopy(TRAIN_ARGS)
+        train_args.output_dir = tempdir
+
+        with tempfile.NamedTemporaryFile(
+            "w", delete=False, suffix=".yaml"
+        ) as temp_yaml_file:
+            with open(dataconfigfile, "r", encoding="utf-8") as f:
+                data = yaml.safe_load(f)
+            datasets = data["datasets"]
+            for i, d in enumerate(datasets):
+                d["data_paths"] = [datafiles[i]]
+            yaml.dump(data, temp_yaml_file)
+            data_args.data_config_path = temp_yaml_file.name
+
+        sft_trainer.train(model_args, data_args, train_args)
+
+        # validate the configs
+        _validate_training(tempdir)
+        checkpoint_path = _get_checkpoint_path(tempdir)
+
+        # Load the model
+        loaded_model = TunedCausalLM.load(checkpoint_path, MODEL_NAME)
+
+        # Run inference on the text
+        output_inference = loaded_model.run(
+            '<|user|>\nProvide two rhyming words for the word "love"\n\
+            <nopace></s><|assistant|>',
+            max_new_tokens=50,
+        )
+        assert len(output_inference) > 0
+        assert 'Provide two rhyming words for the word "love"' in output_inference
+
+
 ############################# Helper functions #############################
 def _test_run_causallm_ft(training_args, model_args, data_args, tempdir):
     train_args = copy.deepcopy(training_args)
diff --git a/tox.ini b/tox.ini
@@ -5,7 +5,7 @@ envlist = py, lint, fmt
 description = run unit tests
 deps =
     pytest>=7
-    .[aim,mlflow,clearml,scanner-dev]
+    .[mlflow,clearml,scanner-dev]
 commands =
     pytest {posargs:tests}
 
@@ -56,9 +56,9 @@ commands =
 
 [testenv:accel]
 description = run all unit tests including those requiring GPU support
-deps = 
+deps =
     pytest>=7
-    .[aim,mlflow,clearml,scanner-dev,fms-accel-all]
+    .[mlflow,clearml,scanner-dev,fms-accel-all]
 setenv =
     CUDA_VISIBLE_DEVICES=0
 commands_pre = 
@@ -74,9 +74,9 @@ commands =
 
 [testenv:gpu]
 description = run all unit tests including those requiring GPU support
-deps = 
+deps =
     pytest>=7
-    .[aim,mlflow,clearml,scanner-dev,fms-accel-all]
+    .[mlflow,clearml,scanner-dev,fms-accel-all]
 setenv =
     CUDA_VISIBLE_DEVICES=0
 commands_pre = 
diff --git a/tuning/data/data_config.py b/tuning/data/data_config.py
@@ -40,6 +40,8 @@ class DataSetConfig:
     sampling: Optional[float] = None
     data_handlers: Optional[List[DataHandlerConfig]] = None
     split: Optional[Dict[str, float]] = None
+    dataset_split_name: Optional[str] = None
+    shuffle: Optional[bool] = True
 
 
 @dataclass
@@ -123,6 +125,10 @@ def _validate_dataset_config(dataset_config) -> DataSetConfig:
                 isinstance(value, (float, int)) and 0.0 <= value <= 1.0
             ), f"split ratio for '{key}' must be a float in [0.0, 1.0], got {value}"
         c.split = {k: float(v) for k, v in split.items()}
+    if "dataset_split_name" in kwargs:
+        c.dataset_split_name = kwargs["dataset_split_name"]
+    if "shuffle" in kwargs:
+        c.shuffle = kwargs["shuffle"]
     return c
 
 
diff --git a/tuning/data/data_processors.py b/tuning/data/data_processors.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 
 # Standard
-from typing import Dict, List, Tuple, Union
+from typing import Dict, List, Optional, Tuple, Union
 import logging
 import os
 
@@ -102,8 +102,8 @@ def load_dataset(
         self,
         datasetconfig: DataSetConfig,
         streaming: bool,
-        splitName: str = None,
-        datafile: str = None,
+        splitName: Optional[str] = None,
+        datafile: Optional[str] = None,
         **kwargs,
     ):
 
@@ -112,6 +112,10 @@ def load_dataset(
         if (not datafile) and (not datasetconfig):
             raise ValueError("Either datafile or datasetconfig must be set")
 
+        effective_split = splitName
+        if datasetconfig and datasetconfig.dataset_split_name:
+            effective_split = datasetconfig.dataset_split_name
+
         def _load_dataset(
             data_path=None,
             builder=None,
@@ -133,8 +137,8 @@ def _load_dataset(
             """
 
             load_kwargs = {**kwargs}
-            if splitName is not None:
-                load_kwargs["split"] = splitName
+            if effective_split is not None:
+                load_kwargs["split"] = effective_split
             if data_dir is not None:
                 load_kwargs["data_dir"] = data_dir
             if data_files is not None:
@@ -402,6 +406,7 @@ def split_dataset(
 
         if isinstance(dataset, (DatasetDict, IterableDatasetDict)):
             splits = dataset.keys()
+            # Other splits are ignored; only "train" or the user-provided split name is used
             if len(splits) == 1 and train_split in splits:
                 d = dataset[train_split]
             else:
@@ -434,7 +439,7 @@ def split_dataset(
         split_datasets = d.train_test_split(
             train_size=train_size if train_size > 0.0 else None,
             test_size=eval_size if eval_size > 0.0 else None,
-            shuffle=True,
+            shuffle=dataset_config.shuffle,
             seed=seed,
         )
 
diff --git a/tuning/data/utils.py b/tuning/data/utils.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 
 # Standard
-from typing import List, Union
+from typing import List, Optional, Union
 import io
 import json
 import logging
@@ -39,7 +39,7 @@ def get_extension(file_path: str) -> str:
     return ext.lower()
 
 
-def get_loader_for_filepath(file_path: str) -> str:
+def get_loader_for_filepath(file_path: str) -> Optional[str]:
     ext = get_extension(file_path)
     if ext in (".txt", ".md"):
         return "text"
@@ -49,7 +49,8 @@ def get_loader_for_filepath(file_path: str) -> str:
         return "arrow"
     if ext in (".parquet",):
         return "parquet"
-    return ext
+
+    return None
 
 
 def load_yaml_or_json(file_path: str) -> dict:
diff --git a/tuning/sft_trainer.py b/tuning/sft_trainer.py
@@ -23,6 +23,7 @@
 import traceback
 
 # Third Party
+from accelerate.utils import set_seed
 from huggingface_hub.utils._validators import HFValidationError
 from peft import LoraConfig
 from peft.utils.other import fsdp_auto_wrap_policy
@@ -501,6 +502,9 @@ def train(
         peft_config=peft_config,
     )
 
+    # Set seed for accelerate processes
+    set_seed(training_args.seed, device_specific=True)
+
     # We track additional metrics and experiment metadata after trainer object creation
     # this ensure that the process is not repeated multiple times for FSDP runs.
     if trainer.is_world_process_zero():

Original file line number	Diff line number	Diff line change
`@@ -86,3 +86,6 @@`
`86`	`86`	`DATA_CONFIG_SKIP_LARGE_COLUMNS_HANDLER = os.path.join(`
`87`	`87`	`PREDEFINED_DATA_CONFIGS, "skip_large_columns_data_handler_template.yaml"`
`88`	`88`	`)`
	`89`	`+DATA_CONFIG_CUSTOM_SPLIT_NAME = os.path.join(`
	`90`	`+ PREDEFINED_DATA_CONFIGS, "dataset_with_custom_split.yaml"`
	`91`	`+)`