From 3803eccd9e71992894121a31b9d71343561bd985 Mon Sep 17 00:00:00 2001 From: Will Johnson Date: Tue, 22 Apr 2025 16:47:04 -0400 Subject: [PATCH 1/2] deps: upgrade trl Signed-off-by: Will Johnson --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index f67483d550..09732281ce 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -34,7 +34,7 @@ dependencies = [ "sentencepiece>=0.1.99,<0.3", "tokenizers>=0.13.3,<1.0", "tqdm>=4.66.2,<5.0", -"trl>=0.13,<0.17", +"trl>=0.13,<0.18", "peft>=0.8.0,<0.14", "protobuf>=5.28.0,<6.0.0", "datasets>=2.15.0,<4.0", From 364a82cc802b96307f203872e09bacffac819c9f Mon Sep 17 00:00:00 2001 From: Will Johnson Date: Fri, 25 Apr 2025 00:01:51 -0400 Subject: [PATCH 2/2] docs: offline data preprocessing note Signed-off-by: Will Johnson --- docs/offline-data-preprocessing.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/offline-data-preprocessing.md b/docs/offline-data-preprocessing.md index f235d5aa44..53e77d1950 100644 --- a/docs/offline-data-preprocessing.md +++ b/docs/offline-data-preprocessing.md @@ -37,6 +37,8 @@ python scripts/offline_data_processing.py \ Additionally, once the offline data processing is complete, users can leverage the shards stored in `output_dir` for tuning by passing it through the `--training_data_path` flag or passing it via `data_paths` argument in data config yaml, provided they find the sharded datasets beneficial for training. +**NOTE**: The offline data preprocessing script is not compatible with processing image datasets for vision models. + ## Example Usage ### Applying Chat Template