-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathDockerfile.train
More file actions
208 lines (144 loc) · 4.27 KB
/
Copy pathDockerfile.train
File metadata and controls
208 lines (144 loc) · 4.27 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
# Base image; going by its tag: PyTorch 1.13.0, Python 3.9.12, CUDA 11.7.1, Ubuntu 20.04.
FROM cnstark/pytorch:1.13.0-py3.9.12-cuda11.7.1-ubuntu20.04

# ---------------------------------------------------------------------------
# Model parameters
#
# Each lowercase build argument (override with `--build-arg name=value`) is
# mirrored into an uppercase environment variable that stays set in the image.
# ---------------------------------------------------------------------------

# Dataset identifier.
ARG dataset_name="wasertech/OneOS"
ENV DATASET_NAME=${dataset_name}

# Base model identifier.
ARG base_model_name="TinyPixel/Llama-2-7B-bf16-sharded"
ENV BASE_MODEL_NAME=${base_model_name}

# Output location and name of the resulting model.
ARG output_dir="output"
ENV OUTPUT_MODEL_PATH=${output_dir}
ARG model_name="assistant-llama2-7b-bf16"
ENV OUTPUT_MODEL_NAME=${model_name}

# Flag exported as DONT_WARN_IGNORE_CHARS (default 0).
ARG disable_no_ignore_characters_warning=0
ENV DONT_WARN_IGNORE_CHARS=${disable_no_ignore_characters_warning}
# Training hyper-parameters
#
# Every lowercase build argument below can be overridden with
# `--build-arg name=value` and is exported as the uppercase environment
# variable that follows it.
ARG distribute_training=0
ENV DISTRIBUTE_TRAIN=$distribute_training
ARG train_batch_size=16
ENV TRAIN_BATCH_SIZE=$train_batch_size
ARG eval_batch_size=8
ENV EVAL_BATCH_SIZE=$eval_batch_size
# NOTE: the build-arg is spelled "opm_nproc" (not "omp_nproc"); the name is
# kept unchanged so existing `--build-arg opm_nproc=...` invocations still work.
ARG opm_nproc=1
ENV OMP_NUM_THREADS=$opm_nproc
ARG epochs=1
ENV EPOCHS=$epochs
ARG learning_rate="1.41e-5"
ENV LEARNING_RATE=$learning_rate
# NOTE(review): "SEQENCE_LENGTH" is misspelled, but it is presumably read under
# this exact name by the entrypoint script - confirm before renaming.
ARG seq_len=8192
ENV SEQENCE_LENGTH=$seq_len
ARG dropout=0.0
ENV DROPOUT=$dropout
ARG amp=1
ENV AMP=$amp
ARG freeze_encoder=1
ENV FREEZE_ENCODER=$freeze_encoder
# Gradient Accumulation Steps
ARG gas=2
ENV GAS=$gas
# NOTE: the build-arg is spelled "gradien_checkpointing"; kept unchanged for
# compatibility with existing build commands.
ARG gradien_checkpointing=1
ENV GRAD_CHECK=$gradien_checkpointing
ARG use_peft=0
ENV USE_PEFT=$use_peft
ARG use_4bit=0
ENV USE_4BIT=$use_4bit
ARG use_8bit=0
ENV USE_8BIT=$use_8bit
# Declared with ARG like every other parameter, so that
# `--build-arg push_to_hub=1` is honoured and no lowercase variable is left
# behind in the runtime environment.
ARG push_to_hub=0
ENV PUSH_TO_HUB=$push_to_hub
# Warm-up: the ratio is used in preference to the step count.
# Set WARMUP_RATIO=0 to use WARMUP_STEPS instead.
ARG warm_up_steps=500
ENV WARMUP_STEPS=${warm_up_steps}
ARG warm_up_ratio=0.1
ENV WARMUP_RATIO=${warm_up_ratio}

# Checkpoint saving and evaluation cadence.
ARG save_steps=400
ENV SAVE_STEPS=${save_steps}
ARG eval_steps=100
ENV EVAL_STEPS=${eval_steps}
ARG eval_strategy="steps"
ENV EVAL_STRAT=${eval_strategy}

# Exported as MAX_CHECKPOINTS (default 3).
ARG max_amount_checkpoints=3
ENV MAX_CHECKPOINTS=${max_amount_checkpoints}
# Data processing workers.
# Too many workers can run out of memory: use at most one worker per CPU core.
# Example: with 12 cores (24 threads) and 100 GB of RAM, use 12 workers.
# More may work, but one worker per thread is likely to cause OOM issues.
ARG nproc=12
ENV NPROC=${nproc}

# Distributed training only (more than one GPU); requires DISTRIBUTE_TRAIN=1.
# Example: set NPROC_PER_GPU=2 to use 2 GPUs.
ARG nproc_per_gpu=0
ENV NPROC_PER_GPU=${nproc_per_gpu}

# Hub API token.
# NOTE(review): a value passed through ARG is visible in `docker history`, and
# the ENV copy persists in the image. Prefer leaving this empty at build time
# and supplying HUB_API_TOKEN when the container is started.
ARG hub_token=""
ENV HUB_API_TOKEN=${hub_token}

# Numeric user and group IDs exported as UID / GID.
ARG uid=999
ENV UID=${uid}
ARG gid=999
ENV GID=${gid}
# Make sure we can extract filenames with UTF-8 chars
ENV LANG=C.UTF-8

# Avoid the interactive keyboard-configuration step during package installs
ENV DEBIAN_FRONTEND=noninteractive

# Filesystem layout: home directory of the training user, the working
# directory used at run time, and the name of the Python virtualenv.
# (key=value form replaces the deprecated space-separated `ENV key value`.)
ENV HOMEDIR=/home/trainer \
    WORKDIR=/mnt \
    VIRTUAL_ENV_NAME=llm-train

# Kept as separate instructions: a variable can only be substituted in
# instructions that come after the one defining it.
ENV VIRTUAL_ENV=$HOMEDIR/$VIRTUAL_ENV_NAME
ENV PATH="$VIRTUAL_ENV/bin:$PATH"

# Debug aid: print the build environment, with the hub token (both the build
# argument and its exported variable) removed so it never lands in build logs.
RUN env -u HUB_API_TOKEN -u hub_token env
# Get basic packages: build toolchain, Python tooling, sudo, locales, and
# git-lfs (needed for uploading models to the HuggingFace hub).
#
# `update` and `install` run in the same layer so the install never uses a
# stale package index, and the index is removed in that same layer so it does
# not end up in the image.
#
# Currently disabled packages, kept here for reference:
#   ffmpeg libboost-all-dev zlib1g-dev libbz2-dev liblzma-dev unzip pixz sox
#   libsox-fmt-all xz-utils
RUN apt-get -qq update \
    && apt-get -qq install -y --no-install-recommends \
        build-essential \
        ca-certificates \
        cmake \
        curl \
        g++ \
        git \
        git-lfs \
        locales \
        locales-all \
        pkg-config \
        python3 \
        python3-pip \
        sudo \
        virtualenv \
        wget \
    && rm -rf /var/lib/apt/lists/*
# Set up the "trainer" account: a group with the requested GID, a system user
# with the requested UID, and a sudoers drop-in granting passwordless sudo.
RUN groupadd --gid $GID trainer \
    && adduser --system --uid $UID --group trainer
RUN printf '%s\n' "trainer ALL=(root) NOPASSWD:ALL" > /etc/sudoers.d/trainer \
    && chmod 0440 /etc/sudoers.d/trainer
# Below that point, nothing requires being root
USER trainer
WORKDIR $HOMEDIR

# Create the virtualenv at $VIRTUAL_ENV (i.e. $HOMEDIR/$VIRTUAL_ENV_NAME).
# $VIRTUAL_ENV/bin is already at the front of PATH (set together with the
# other path variables earlier in this file), so PATH is not modified again
# here, and the working directory is still $HOMEDIR for the following steps.
RUN virtualenv --python=/usr/bin/python3 "$VIRTUAL_ENV"
# Python training stack, installed with the `pip` found first on PATH.
#
# The original order and one-RUN-per-package layout are kept on purpose, so
# that later installs can still adjust what earlier ones pulled in.
# --no-cache-dir keeps pip's download/wheel cache out of the image layers.
#
# NOTE(review): none of these are pinned (several track the default branch of
# their repository), so two builds may produce different images. Consider
# pinning versions or commits once a known-good set is identified.
RUN pip install --no-cache-dir 'git+https://github.com/huggingface/trl.git'
RUN pip install --no-cache-dir 'git+https://github.com/huggingface/transformers.git'
RUN pip install --no-cache-dir accelerate
RUN pip install --no-cache-dir 'git+https://github.com/huggingface/peft.git'
RUN pip install --no-cache-dir 'git+https://github.com/huggingface/datasets.git'
RUN pip install --no-cache-dir bitsandbytes
RUN pip install --no-cache-dir einops
RUN pip install --no-cache-dir wandb
RUN pip install --no-cache-dir scipy
# Install AutoAWQ
RUN pip install --no-cache-dir autoawq
RUN pip install --no-cache-dir 'git+https://github.com/huggingface/tokenizers.git#egg=tokenizers'
# Install UnSloth (currently disabled)
# RUN pip install "unsloth[cu121_torch211] @ git+https://github.com/unslothai/unsloth.git"
# Working directory for the running container.
WORKDIR $WORKDIR
# Copy now so that docker build can leverage caches
COPY --chown=trainer:trainer . $HOMEDIR/
# Exec (JSON) form with an explicit `exec`: the shell is only used to expand
# $HOMEDIR and is then replaced by run.sh, so run.sh receives the stop signal
# sent by `docker stop` directly. As with the previous shell-form entrypoint,
# extra `docker run` arguments are not forwarded to run.sh.
ENTRYPOINT ["/bin/sh", "-c", "exec \"$HOMEDIR/run.sh\""]