diff --git a/configs/lfm2.5-1.2b-instruct-dflash.json b/configs/lfm2.5-1.2b-instruct-dflash.json
new file mode 100755
index 00000000..ad48edc5
--- /dev/null
+++ b/configs/lfm2.5-1.2b-instruct-dflash.json
@@ -0,0 +1,42 @@
+{
+  "architectures": ["DFlashDraftModel"],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoModel": "dflash.DFlashDraftModel"
+  },
+  "block_size": 16,
+  "bos_token_id": 1,
+  "dflash_config": {
+    "mask_token_id": 16,
+    "target_layer_ids": [1, 4, 7, 10, 13]
+  },
+  "dtype": "bfloat16",
+  "eos_token_id": 7,
+  "head_dim": 64,
+  "hidden_act": "silu",
+  "hidden_size": 2048,
+  "initializer_range": 0.02,
+  "intermediate_size": 6144,
+  "layer_types": [
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention"
+  ],
+  "max_position_embeddings": 128000,
+  "model_type": "qwen3",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 8,
+  "num_key_value_heads": 8,
+  "num_target_layers": 16,
+  "rms_norm_eps": 1e-05,
+  "rope_theta": 1000000.0,
+  "tie_word_embeddings": true,
+  "use_cache": true,
+  "vocab_size": 65536
+}
\ No newline at end of file
diff --git a/examples/run_lfm2.5_1.2b_instruct_dflash_online.sh b/examples/run_lfm2.5_1.2b_instruct_dflash_online.sh
new file mode 100755
--- /dev/null
+++ b/examples/run_lfm2.5_1.2b_instruct_dflash_online.sh
@@ -0,0 +1,49 @@
+#!/bin/bash
+#
+# Launch online DFlash draft-model training against LiquidAI/LFM2.5-1.2B-Instruct.
+#
+# Usage: run_lfm2.5_1.2b_instruct_dflash_online.sh [NUM_GPUS] [ATTENTION_BACKEND]
+#   NUM_GPUS           value passed to torchrun --nproc_per_node (default: 8)
+#   ATTENTION_BACKEND  value passed to --attention-backend (default: flex_attention)
+
+# Resolve the repository root from this script's own location so the script can
+# be started from any working directory.
+SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
+ROOT_DIR=$(dirname -- "$SCRIPT_DIR")
+export TORCHINDUCTOR_CACHE_DIR="$ROOT_DIR/cache/compiled_kernels"
+export SPECFORGE_DATA_NUM_PROC=32
+NUM_GPUS=${1:-8}
+
+ATTENTION_BACKEND=${2:-flex_attention}
+
+# Every expansion below is quoted so that a checkout path containing spaces or
+# glob characters is passed through as a single argument. Each option appears
+# exactly once. The commented-out --resume on the last line is an opt-in flag:
+# remove the leading "#" to pass it.
+torchrun \
+    --standalone \
+    --nproc_per_node "$NUM_GPUS" \
+    "$ROOT_DIR/scripts/train_dflash.py" \
+    --target-model-path LiquidAI/LFM2.5-1.2B-Instruct \
+    --draft-config-path "$ROOT_DIR/configs/lfm2.5-1.2b-instruct-dflash.json" \
+    --train-data-path "$ROOT_DIR/cache/dataset/perfectblend_lfm2.5-1.2b_regen.jsonl" \
+    --output-dir "$ROOT_DIR/outputs/lfm2.5-1.2b-instruct-dflash-perfectblend-8layers" \
+    --num-epochs 6 \
+    --batch-size 2 \
+    --accumulation-steps 2 \
+    --learning-rate 6e-4 \
+    --warmup-ratio 0.04 \
+    --max-grad-norm 1.0 \
+    --max-length 4096 \
+    --chat-template lfm \
+    --attention-backend "$ATTENTION_BACKEND" \
+    --num-anchors 512 \
+    --loss-decay-gamma 7.0 \
+    --log-interval 50 \
+    --save-interval 10000 \
+    --report-to wandb \
+    --wandb-project specforge-lfm2.5-1.2b-instruct-dflash \
+    --target-model-backend sglang \
+    --block-size 16 \
+    --wandb-name lfm2.5-1.2b-instruct-dflash-perfectblend-8layers \
+    # --resume
diff --git a/specforge/data/template.py b/specforge/data/template.py
index 50be7e77..0e50dda9 100644
--- a/specforge/data/template.py
+++ b/specforge/data/template.py
@@ -119,6 +119,16 @@ def get_all_template_names(self) -> List[str]:
     ),
 )
 
+TEMPLATE_REGISTRY.register(
+    name="lfm",
+    template=ChatTemplate(
+        assistant_header="<|im_start|>assistant\n",
+        user_header="<|im_start|>user\n",
+        system_prompt="",
+        end_of_turn_token="<|im_end|>\n",
+    ),
+)
+
 TEMPLATE_REGISTRY.register(
     name="qwen2-vl",
     template=ChatTemplate(