From 245bb1e27b4567e7520e181bf7d3b0a0f2b2ab6d Mon Sep 17 00:00:00 2001 From: KKKKKKKevin <115385420+kevin-mindverse@users.noreply.github.com> Date: Wed, 30 Apr 2025 16:13:18 +0800 Subject: [PATCH 1/6] fix:hotfix use_previous_params problem (#320) Co-authored-by: Ye Xiangle --- lpm_kernel/api/domains/trainprocess/routes.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/lpm_kernel/api/domains/trainprocess/routes.py b/lpm_kernel/api/domains/trainprocess/routes.py index af6425e4..c8a12f2e 100644 --- a/lpm_kernel/api/domains/trainprocess/routes.py +++ b/lpm_kernel/api/domains/trainprocess/routes.py @@ -288,7 +288,6 @@ def retrain(): data_synthesis_mode: Mode for data synthesis (optional) use_cuda: Whether to use CUDA for training (optional) is_cot: Whether to use Chain of Thought (optional) - use_previous_params: Whether to use previous training parameters (optional, default True) Returns: Response: JSON response @@ -318,7 +317,7 @@ def retrain(): is_cot = data.get("is_cot", None) # Log the received parameters - logger.info(f"Retrain parameters: model_name={model_name}, learning_rate={learning_rate}, number_of_epochs={number_of_epochs}, concurrency_threads={concurrency_threads}, data_synthesis_mode={data_synthesis_mode}, use_cuda={use_cuda}, is_cot={is_cot}, use_previous_params={use_previous_params}") + logger.info(f"Retrain parameters: model_name={model_name}, learning_rate={learning_rate}, number_of_epochs={number_of_epochs}, concurrency_threads={concurrency_threads}, data_synthesis_mode={data_synthesis_mode}, use_cuda={use_cuda}, is_cot={is_cot}") # Create training service instance train_service = TrainProcessService(current_model_name=model_name) From be2f2d3084d2ddde6680e63d7c4166d90d3bf1fe Mon Sep 17 00:00:00 2001 From: Zachary Pitroda <30330004+zpitroda@users.noreply.github.com> Date: Wed, 30 Apr 2025 14:18:27 -0400 Subject: [PATCH 2/6] Update LoRA config Updated LoRA config and removed deepspeed references --- lpm_kernel/L2/dpo/dpo_train.py | 22 ++++---- lpm_kernel/L2/train.py | 95 +++++++++++++--------------------- 2 files changed, 45 insertions(+), 72 deletions(-) diff --git a/lpm_kernel/L2/dpo/dpo_train.py b/lpm_kernel/L2/dpo/dpo_train.py index fb5d0bc1..6fba2f58 100644 --- a/lpm_kernel/L2/dpo/dpo_train.py +++ b/lpm_kernel/L2/dpo/dpo_train.py @@ -5,7 +5,7 @@ from transformers import AutoTokenizer, AutoModelForCausalLM from datasets import Dataset from trl import DPOConfig, DPOTrainer -from peft import LoraConfig, AutoPeftModelForCausalLM, get_peft_model +from peft import LoraConfig from datetime import datetime, timedelta # from clearml import Task @@ -61,12 +61,14 @@ def train(args): lora_config = None else: lora_config = LoraConfig( - r=args.lora_r, - lora_alpha=args.lora_alpha, - lora_dropout=args.lora_dropout, + r=16, # Reduced from 64 for better efficiency + lora_alpha=16, # Adjusted to maintain 2:1 alpha:r ratio + lora_dropout=0.0, # Set to 0.0 for Unsloth compatibility bias="none", - target_modules="all-linear", + target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"], # Target specific attention layers instead of all linear task_type="CAUSAL_LM", + inference_mode=False, + fan_in_fan_out=False ) training_args = DPOConfig( @@ -126,13 +128,9 @@ def train(args): parser.add_argument('--beta', type=float, default=0.1) # LoRA arguments - parser.add_argument('--lora_r', type=int, default=64) - parser.add_argument('--lora_alpha', type=int, default=128) - parser.add_argument('--lora_dropout', type=float, default=0.1) - - # DeepSpeed arguments - parser.add_argument('--deepspeed', type=str, default=None) - parser.add_argument('--local_rank', type=int, default=-1) + parser.add_argument('--lora_r', type=int, default=16) + parser.add_argument('--lora_alpha', type=int, default=16) + parser.add_argument('--lora_dropout', type=float, default=0.0) args = parser.parse_args() diff --git a/lpm_kernel/L2/train.py b/lpm_kernel/L2/train.py index 54a00ee6..a70cd7b8 100644 --- a/lpm_kernel/L2/train.py +++ b/lpm_kernel/L2/train.py @@ -204,30 +204,27 @@ def main(model_args, data_args, training_args): training_args = memory_manager.optimize_training_args(training_args) # --- Accelerate optimizer state offloading logic --- - # Enable optimizer state offload to CPU if VRAM is low and not using DeepSpeed + # Enable optimizer state offload to CPU if VRAM is low vram_total = memory_manager.get_memory_info().get("vram_total_gb", 0) use_accelerate_offload = False if torch.cuda.is_available() and model_args.use_cuda and vram_total > 0 and vram_total < 16: - # Only set if not already using DeepSpeed - if not hasattr(training_args, "deepspeed") or training_args.deepspeed is None: - logger.info("Enabling Hugging Face Accelerate optimizer state offload to CPU for low VRAM GPUs") - accelerate_config = { - "compute_environment": "LOCAL_MACHINE", - "deepspeed_config": None, - "distributed_type": "NO", - "downcast_bf16": False, - "fsdp_config": {}, - "main_training_function": "main", - "mixed_precision": "no", - "num_machines": 1, - "num_processes": 1, - "use_cpu": False, - "zero3_init_flag": False, - "offload_optimizer_device": "cpu", - "offload_param_device": "none" - } - training_args.accelerate_config = accelerate_config - use_accelerate_offload = True + logger.info("Enabling Hugging Face Accelerate optimizer state offload to CPU for low VRAM GPUs") + accelerate_config = { + "compute_environment": "LOCAL_MACHINE", + "distributed_type": "NO", + "downcast_bf16": False, + "fsdp_config": {}, + "main_training_function": "main", + "mixed_precision": "no", + "num_machines": 1, + "num_processes": 1, + "use_cpu": False, + "zero3_init_flag": False, + "offload_optimizer_device": "cpu", + "offload_param_device": "none" + } + training_args.accelerate_config = accelerate_config + use_accelerate_offload = True # Model loading with device_map="auto" for automatic offloading logger.info(f"Loading model with automatic memory management from {model_args.model_name_or_path}") @@ -306,45 +303,23 @@ def main(model_args, data_args, training_args): "add_special_tokens": data_args.add_special_tokens, } - # Use DeepSpeed to handle meta tensors if available - try: - # Only configure DeepSpeed if meta tensors are present and DeepSpeed is available - if hasattr(model, "is_meta") and model.is_meta: - logger.info("Model has meta tensors, checking DeepSpeed availability") - # First verify DeepSpeed is properly installed and importable - try: - import deepspeed - logger.info("DeepSpeed is available, configuring for meta tensor handling") - - # Configure with appropriate settings for meta tensors - training_args.deepspeed = { - "zero_stage": 3, - "offload_optimizer": { - "device": "cpu" - }, - "offload_param": { - "device": "cpu" - }, - "zero3_init_flag": True, - "zero_force_ds_cpu_optimizer": False - } - logger.info("DeepSpeed configured for meta tensor handling") - except ImportError: - logger.warning("DeepSpeed is not available, meta tensors will be handled differently") - # If DeepSpeed isn't available, use alternative approach to handle meta tensors - if torch.cuda.is_available() and model_args.use_cuda: - logger.info("Initializing meta tensors on GPU") - # Use device_map instead of DeepSpeed for meta tensor initialization - from accelerate import init_empty_weights - with init_empty_weights(): - model.to_empty(device="cuda") - else: - logger.info("Initializing meta tensors on CPU") - model.to_empty(device="cpu") - except Exception as e: - logger.warning(f"Could not configure meta tensor handling: {e}") - logger.warning(traceback.format_exc()) - + # Handle meta tensors if present without using DeepSpeed + if hasattr(model, "is_meta") and model.is_meta: + logger.info("Model has meta tensors, initializing properly") + try: + # Initialize meta tensors on appropriate device + if torch.cuda.is_available() and model_args.use_cuda: + logger.info("Initializing meta tensors on GPU") + from accelerate import init_empty_weights + with init_empty_weights(): + model.to_empty(device="cuda") + else: + logger.info("Initializing meta tensors on CPU") + model.to_empty(device="cpu") + except Exception as e: + logger.warning(f"Could not initialize meta tensors: {e}") + logger.warning(traceback.format_exc()) + trainer = SFTTrainer( model=model, tokenizer=tokenizer, From a7bec9313c2b1e414e5c623cdc4d710cb1cd4b9b Mon Sep 17 00:00:00 2001 From: Zachary Pitroda <30330004+zpitroda@users.noreply.github.com> Date: Wed, 30 Apr 2025 14:41:46 -0400 Subject: [PATCH 3/6] Add unsloth --- lpm_kernel/L2/dpo/dpo_train.py | 59 ++++++++++++++++++++++++++-------- pyproject.toml | 1 + 2 files changed, 47 insertions(+), 13 deletions(-) diff --git a/lpm_kernel/L2/dpo/dpo_train.py b/lpm_kernel/L2/dpo/dpo_train.py index 6fba2f58..95cf095b 100644 --- a/lpm_kernel/L2/dpo/dpo_train.py +++ b/lpm_kernel/L2/dpo/dpo_train.py @@ -2,7 +2,13 @@ import argparse import torch -from transformers import AutoTokenizer, AutoModelForCausalLM +try: + from unsloth import FastLanguageModel, FastTokenizer + UNSLOTH_AVAILABLE = True +except ImportError: + from transformers import AutoTokenizer, AutoModelForCausalLM + UNSLOTH_AVAILABLE = False + from datasets import Dataset from trl import DPOConfig, DPOTrainer from peft import LoraConfig @@ -20,6 +26,19 @@ def get_east_eight_time_formatted(): # task = Task.init(project_name="mind_dpo", task_name="qwen25-instruct-" + get_east_eight_time_formatted()) +def get_supported_dtype(): + # Try bf16, fallback to f16 + if torch.cuda.is_available(): + if torch.cuda.is_bf16_supported(): + return torch.bfloat16, "bfloat16" + else: + return torch.float16, "float16" + try: + _ = torch.zeros(1, dtype=torch.bfloat16) + return torch.bfloat16, "bfloat16" + except Exception: + return torch.float16, "float16" + def training_data_processor(args, SYS = "You are a helpful assistant.\n\n"): with open(args.training_data_path, "r", encoding="utf-8") as f: data = json.load(f) @@ -33,7 +52,10 @@ def training_data_processor(args, SYS = "You are a helpful assistant.\n\n"): "chosen": [data_point["chosen"] for data_point in data], "rejected": [data_point["rejected"] for data_point in data] } - tokenizer = AutoTokenizer.from_pretrained(args.base_model_path, padding_side="left") + if UNSLOTH_AVAILABLE: + tokenizer = FastTokenizer.from_pretrained(args.base_model_path, padding_side="left") + else: + tokenizer = AutoTokenizer.from_pretrained(args.base_model_path, padding_side="left") training_data = { "prompt": tokenizer.apply_chat_template(training_data["prompt"], tokenize=False), "chosen": training_data["chosen"], @@ -42,13 +64,24 @@ def training_data_processor(args, SYS = "You are a helpful assistant.\n\n"): return training_data def train(args): - tokenizer = AutoTokenizer.from_pretrained(args.base_model_path, padding_side="left") - model = AutoModelForCausalLM.from_pretrained( - args.base_model_path, - trust_remote_code=True, - ignore_mismatched_sizes=True, - torch_dtype=torch.float32, # CPU doesn't support bfloat16 -) + dtype, dtype_str = get_supported_dtype() + if UNSLOTH_AVAILABLE: + tokenizer = FastTokenizer.from_pretrained(args.base_model_path, padding_side="left") + model = FastLanguageModel.from_pretrained( + model_name=args.base_model_path, + dtype=dtype_str, + load_in_4bit=False, + load_in_8bit=False, + device_map="cpu" if not torch.cuda.is_available() else "auto" + ) + else: + tokenizer = AutoTokenizer.from_pretrained(args.base_model_path, padding_side="left") + model = AutoModelForCausalLM.from_pretrained( + args.base_model_path, + trust_remote_code=True, + ignore_mismatched_sizes=True, + torch_dtype=dtype, + ) time_str = get_east_eight_time_formatted() # merged_model = model.merge_and_unload() @@ -61,11 +94,11 @@ def train(args): lora_config = None else: lora_config = LoraConfig( - r=16, # Reduced from 64 for better efficiency - lora_alpha=16, # Adjusted to maintain 2:1 alpha:r ratio - lora_dropout=0.0, # Set to 0.0 for Unsloth compatibility + r=args.lora_r, + lora_alpha=args.lora_alpha, + lora_dropout=args.lora_dropout, bias="none", - target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"], # Target specific attention layers instead of all linear + target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"], task_type="CAUSAL_LM", inference_mode=False, fan_in_fan_out=False diff --git a/pyproject.toml b/pyproject.toml index 2de8e583..4849fc17 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -49,6 +49,7 @@ trl = "0.13.0" gguf = "0.10.0" datasets = "3.3.2" jiter = "0.8.2" +unsloth = "2025.4.3" # Documentation environment dependencies # Use 'poetry install --with docs' to install documentation dependencies From 880d0cf7deba172f00dc62b9d2ab683dcd366d4e Mon Sep 17 00:00:00 2001 From: Zachary Pitroda <30330004+zpitroda@users.noreply.github.com> Date: Wed, 30 Apr 2025 14:45:41 -0400 Subject: [PATCH 4/6] Update peft --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 4849fc17..7d8d64bc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -44,7 +44,7 @@ pandas = "2.2.3" fnllm = {extras = ["azure", "openai"], version = "0.1.2"} transformers = "4.47.1" torch = "2.5.1" -peft = "0.14.0" +peft = "0.15.2" trl = "0.13.0" gguf = "0.10.0" datasets = "3.3.2" From 53f3499e75ba9b8d62e96d9e5ce0908b868caec2 Mon Sep 17 00:00:00 2001 From: Zachary Pitroda <30330004+zpitroda@users.noreply.github.com> Date: Wed, 30 Apr 2025 16:51:12 -0400 Subject: [PATCH 5/6] Empty meta tensors Ensure meta tensors are all moved correctly when split between gpu and cpu --- lpm_kernel/L2/train.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/lpm_kernel/L2/train.py b/lpm_kernel/L2/train.py index a70cd7b8..a85ed7aa 100644 --- a/lpm_kernel/L2/train.py +++ b/lpm_kernel/L2/train.py @@ -270,13 +270,19 @@ def main(model_args, data_args, training_args): model, peft_config, tokenizer = create_and_prepare_model( model_args, data_args, training_args, model_kwargs=model_kwargs ) - - # If model has meta tensors, handle them properly - if hasattr(model, "is_meta") and model.is_meta: - logger.info("Model has meta tensors, using to_empty() to properly initialize") + + # Robustly check for meta tensors and handle them + def has_meta_tensors(model): + return any( + (hasattr(p, 'device') and getattr(p, 'device', None) is not None and getattr(p, 'device').type == 'meta') + for p in list(model.parameters()) + list(model.buffers()) + ) + + if has_meta_tensors(model): + logger.info("Model has parameters on meta device, using to_empty() to properly initialize") device = "cuda" if torch.cuda.is_available() and model_args.use_cuda else "cpu" model = model.to_empty(device=device) - + # Apply gradient checkpointing for memory efficiency if training_args.gradient_checkpointing and hasattr(model, "gradient_checkpointing_enable"): logger.info("Enabling gradient checkpointing for memory efficiency") @@ -303,7 +309,6 @@ def main(model_args, data_args, training_args): "add_special_tokens": data_args.add_special_tokens, } - # Handle meta tensors if present without using DeepSpeed if hasattr(model, "is_meta") and model.is_meta: logger.info("Model has meta tensors, initializing properly") try: From 6599ab1eff85320c4aba27a04da1498c5ad615cc Mon Sep 17 00:00:00 2001 From: Zachary Pitroda <30330004+zpitroda@users.noreply.github.com> Date: Wed, 30 Apr 2025 18:47:51 -0400 Subject: [PATCH 6/6] Update dpo_train.py --- lpm_kernel/L2/dpo/dpo_train.py | 32 +++++++++++++++++++++++--------- 1 file changed, 23 insertions(+), 9 deletions(-) diff --git a/lpm_kernel/L2/dpo/dpo_train.py b/lpm_kernel/L2/dpo/dpo_train.py index 95cf095b..0a729556 100644 --- a/lpm_kernel/L2/dpo/dpo_train.py +++ b/lpm_kernel/L2/dpo/dpo_train.py @@ -72,8 +72,25 @@ def train(args): dtype=dtype_str, load_in_4bit=False, load_in_8bit=False, - device_map="cpu" if not torch.cuda.is_available() else "auto" + device_map="auto" if torch.cuda.is_available() else "cpu" ) + # Apply LoRA with Unsloth's optimized method if requested + if args.lora_r > 0: + model = FastLanguageModel.get_peft_model( + model, + lora_alpha=args.lora_alpha, + lora_dropout=args.lora_dropout, + r=args.lora_r, + target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"], + use_gradient_checkpointing=False, + random_state=42, + max_seq_length=args.max_length, + ) + # Use FastDPOTrainer if available, else fallback + try: + from unsloth import FastDPOTrainer as DPOTrainerImpl + except ImportError: + from trl import DPOTrainer as DPOTrainerImpl else: tokenizer = AutoTokenizer.from_pretrained(args.base_model_path, padding_side="left") model = AutoModelForCausalLM.from_pretrained( @@ -82,17 +99,15 @@ def train(args): ignore_mismatched_sizes=True, torch_dtype=dtype, ) + DPOTrainerImpl = DPOTrainer time_str = get_east_eight_time_formatted() - # merged_model = model.merge_and_unload() - # merged_model.save_pretrained(merged_model) - data_dict = training_data_processor(args) dataset = Dataset.from_dict(data_dict) - if args.lora_r == 0: - lora_config = None - else: + # Only use LoRA config for non-Unsloth + lora_config = None + if not UNSLOTH_AVAILABLE and args.lora_r > 0: lora_config = LoraConfig( r=args.lora_r, lora_alpha=args.lora_alpha, @@ -128,7 +143,7 @@ def train(args): beta=args.beta, ) - dpo_trainer = DPOTrainer( + dpo_trainer = DPOTrainerImpl( model, tokenizer=tokenizer, args=training_args, @@ -137,7 +152,6 @@ def train(args): ) dpo_trainer.train() - dpo_trainer.save_model()