{
    "model_name": "meta-llama/Llama-3.2-1B",
    "save_model_name": "llama-1b-legal-qlora",
    "assistant_only_masking": false,
    "use_qlora": true,
    "deepspeed_version": 1,
    "dataset_config": {
        "dataset_name": "NebulaSense/Legal_Clause_Instructions",
        "instruction_column": "Instruction",
        "input_column": "Input",
        "output_column": "Output",
        "max_length": 2048,
        "sample_size": null,
        "validation_size": 0.1,
        "test_size": null
    },
    "quantization_config": {
        "load_in_4bit": true
    },
    "lora_config": {
        "r": 8,
        "lora_alpha": 32,
        "target_modules": [
            "q_proj",
            "k_proj",
            "v_proj",
            "o_proj"
        ],
        "lora_dropout": 0.05,
        "bias": "none"
    },
    "training_args": {
        "output_dir": "./checkpoints",
        "per_device_train_batch_size": 4,
        "num_train_epochs": 2,
        "learning_rate": 5e-4,
        "logging_steps": 4,
        "save_strategy": "epoch",
        "eval_strategy": "epoch",
        "warmup_steps": 0,
        "lr_scheduler_type": "cosine",
        "optim": "adamw_torch",
        "report_to": null,
        "remove_unused_columns": false,
        "dataloader_drop_last": true,
        "gradient_checkpointing": false,
        "max_grad_norm": 1.0,
        "metric_for_best_model": "eval_loss",
        "greater_is_better": false,
        "fp16": true,
        "logging_dir": "./logs",
        "logging_first_step": true,
        "log_level": "info",
        "disable_tqdm": false
    },
    "early_stopping": {
        "early_stopping_patience": 2,
        "early_stopping_threshold": 0.05
    }
}