-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsbatch_finetuning_v2.sh
More file actions
201 lines (164 loc) · 7.37 KB
/
sbatch_finetuning_v2.sh
File metadata and controls
201 lines (164 loc) · 7.37 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
#!/bin/bash
#SBATCH --job-name=v2
#SBATCH --output=sbatch_output/%j_resume_v2.out
#SBATCH --error=sbatch_output/%j_resume_v2.err
#SBATCH --account=iscrc_magnify
#SBATCH --time=24:00:00
#SBATCH --mem=300G
#SBATCH --partition=boost_usr_prod
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1
#SBATCH --gres=gpu:4
#SBATCH --cpus-per-task=8
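# Alternative log file names for from-scratch runs (swap these into the
# --output/--error directives above):
#   --output=sbatch_output/%j_from_scratch_v2.out
#   --error=sbatch_output/%j_from_scratch_v2.err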
# Report which GPUs are visible to this job.
echo "CUDA devices: $CUDA_VISIBLE_DEVICES"

# Root for every cache / temp directory configured below.
# Use a location with more disk space - typically /leonardo_work has more
# quota than /leonardo_scratch/fast.
LARGE_CACHE_BASE="/leonardo_work/IscrC_MAGNIFY/cassano/temp_cache"

# --- NCCL: longer timeout and verbose debugging ---
# NOTE(review): NCCL_TIMEOUT is meant as a 30-minute limit (1800 s); confirm
# that the installed PyTorch/NCCL or the training script actually reads it.
# NOTE(review): NCCL_DEBUG=INFO with NCCL_DEBUG_SUBSYS=ALL is a debugging
# configuration and is presumably very verbose - confirm it is still wanted.
export NCCL_BLOCKING_WAIT=1
export NCCL_TIMEOUT=1800
export NCCL_DEBUG=INFO
export NCCL_DEBUG_SUBSYS=ALL

# --- CUDA memory management ---
# NOTE(review): CUDA_LAUNCH_BLOCKING=1 is normally a debugging aid that makes
# kernel launches synchronous - confirm it is intended for full training runs.
export CUDA_LAUNCH_BLOCKING=1
export PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:128

# --- Redirect ALL temporary files to the larger storage location ---
export TMPDIR="${LARGE_CACHE_BASE}/tmp"
export TMP="${LARGE_CACHE_BASE}/tmp"
export TEMP="${LARGE_CACHE_BASE}/tmp"

# --- HuggingFace cache directories ---
export HF_DATASETS_CACHE="${LARGE_CACHE_BASE}/hf_datasets"
export HF_DATASETS_DOWNLOADED_DATASETS_PATH="${LARGE_CACHE_BASE}/hf_datasets/downloads"
export HF_HOME="${LARGE_CACHE_BASE}/hf_home"
export TRANSFORMERS_CACHE="${LARGE_CACHE_BASE}/transformers"
export HF_HUB_CACHE="${LARGE_CACHE_BASE}/hf_hub"

# --- Torch and PyTorch caches ---
export TORCH_HOME="${LARGE_CACHE_BASE}/torch"
export TORCH_CACHE="${LARGE_CACHE_BASE}/torch_cache"

# --- Python caches ---
# PYTHONDONTWRITEBYTECODE=1 disables .pyc file creation.
# NOTE(review): with writing disabled, PYTHONPYCACHEPREFIX presumably has
# little effect - confirm both are needed.
export PYTHONPYCACHEPREFIX="${LARGE_CACHE_BASE}/pycache"
export PYTHONDONTWRITEBYTECODE=1

# --- Weights & Biases (offline: nothing is uploaded during the job) ---
export WANDB_MODE="offline"
export WANDB_DIR="${LARGE_CACHE_BASE}/wandb"
export WANDB_CACHE_DIR="${LARGE_CACHE_BASE}/wandb_cache"

# --- Other libraries that create temp files ---
export MPLCONFIGDIR="${LARGE_CACHE_BASE}/matplotlib"
export NUMBA_CACHE_DIR="${LARGE_CACHE_BASE}/numba"
export JUPYTER_RUNTIME_DIR="${LARGE_CACHE_BASE}/jupyter"

# PyArrow (used by datasets library)
export ARROW_TMPDIR="${LARGE_CACHE_BASE}/arrow_tmp"

# CUDA cache (if using GPU compilation)
export CUDA_CACHE_PATH="${LARGE_CACHE_BASE}/cuda_cache"

# --- datasets library: offline mode and cache size limit ---
# NOTE(review): an earlier note here said these settings let shuffle
# operations use the large cache directory; that redirection appears to come
# from HF_DATASETS_CACHE above rather than from these two variables - confirm.
# NOTE(review): confirm the installed datasets version honors
# HF_DATASETS_CACHE_MAX_SIZE.
export HF_DATASETS_OFFLINE=1
export HF_DATASETS_CACHE_MAX_SIZE="100GB"

# --- Create all cache directories ---
# Quoted so that a path containing whitespace or glob characters is passed to
# mkdir as a single argument. A failure is reported but does not stop the job,
# matching the previous behavior of continuing regardless.
cache_dirs=(
  "${TMPDIR}" "${HF_DATASETS_CACHE}" "${TRANSFORMERS_CACHE}" "${WANDB_DIR}"
  "${HF_HOME}" "${TORCH_HOME}" "${PYTHONPYCACHEPREFIX}" "${MPLCONFIGDIR}"
  "${NUMBA_CACHE_DIR}" "${JUPYTER_RUNTIME_DIR}" "${ARROW_TMPDIR}" "${CUDA_CACHE_PATH}"
  "${HF_HUB_CACHE}" "${WANDB_CACHE_DIR}" "${TORCH_CACHE}"
)
mkdir -p -- "${cache_dirs[@]}" \
  || echo "WARNING: could not create one or more cache directories under ${LARGE_CACHE_BASE}" >&2
# Set permissions on the whole cache tree.
# The ${VAR:?} form aborts the script if LARGE_CACHE_BASE is unset or empty,
# so the recursive chmod and the deletions below can never run against an
# unintended location.
# NOTE(review): 755 applied recursively also marks regular files executable
# and world-readable - confirm that is intended.
chmod -R 755 "${LARGE_CACHE_BASE:?LARGE_CACHE_BASE must be set}"

# Clean up any existing temporary files first.
# Errors are deliberately ignored (2>/dev/null || true): this is best-effort
# housekeeping and must not stop the job.
# NOTE(review): if another job uses the same cache concurrently, removing its
# *.lock files may interfere with it - confirm jobs never overlap.
echo "Cleaning up existing temporary files..."
for stale_pattern in "*.tmp" "*.lock" "*partial*"; do
  find "${LARGE_CACHE_BASE:?}" -name "${stale_pattern}" -delete 2>/dev/null || true
done

# Check available disk space in both locations
echo "Checking disk space:"
echo "Source data location:"
df -h /leonardo_scratch/fast/IscrC_MAGNIFY/cassano/
echo "Temporary files location:"
df -h /leonardo_work/IscrC_MAGNIFY/cassano/
# --- Input / output locations for the training run ---

# Python training script (updated to match the modified version)
SCRIPT_NAME="/leonardo/home/userexternal/ecassano/projects/SAeUron_finetuning/scripts/sae_finetuning_v2.py"

# Path to SAE checkpoint directory
CHECKPOINT_PATH="/leonardo_work/IscrC_MAGNIFY/cassano/saeuron/sae_checkpoints/unet.up_blocks.1.attentions.1"

# Directory containing concept activations (each in separate pkl file)
ACTIVATIONS_DIR="/leonardo_scratch/fast/IscrC_MAGNIFY/cassano/finetuning_activations/objects"

# JSON file with the scores (path corrected to match Python script expectations)
SCORES_JSON_PATH="/leonardo_work/IscrC_MAGNIFY/cassano/saeuron/scores/objects/non_finetuned/scores.json"

# Directory to save models and logs - reflects the Binary CE version; the
# directory name encodes the loss weights used for this run.
SAVE_DIR="/leonardo_work/IscrC_MAGNIFY/cassano/saeuron/sae_checkpoints/object_concept_optimized/v2/ce_weight_3.0_sparsity_0.01"

# Make sure directories exist (quoted so the path is always a single argument).
mkdir -p -- "${SAVE_DIR}"
# NOTE(review): the #SBATCH --output/--error paths already point into
# sbatch_output; Slurm presumably opens those files before this line runs, so
# the directory likely has to exist at submission time - confirm.
mkdir -p sbatch_output
# Activate the environment.
# The path is relative, so it is resolved against the job's working directory.
# Stop here if activation fails: continuing would launch training with the
# wrong interpreter (or none at all).
venv_activate="../../envs/saeuron_cassano/bin/activate"
if ! source "${venv_activate}"; then
  echo "ERROR: could not activate environment: ${venv_activate}" >&2
  exit 1
fi

# Display GPU info
nvidia-smi

# Print configuration for verification.
# The loss weights shown here are literals; keep them in sync with the values
# passed to the training command.
printf '%s\n' \
  "=== TRAINING CONFIGURATION ===" \
  "Script: ${SCRIPT_NAME}" \
  "Checkpoint: ${CHECKPOINT_PATH}" \
  "Activations: ${ACTIVATIONS_DIR}" \
  "Scores JSON: ${SCORES_JSON_PATH}" \
  "Save Directory: ${SAVE_DIR}" \
  "Loss Configuration:" \
  " - Reconstruction Weight: 1.0" \
  " - Binary Cross-Entropy Weight: 3.0 (applied BEFORE topk)" \
  " - Sparsity Weight: 0.01" \
  "Resume Mode: ENABLED (will continue from latest checkpoint)" \
  "Wandb: OFFLINE mode, incremental logging on resume" \
  "================================"
# Run training with Binary Cross-Entropy loss (computed BEFORE topk)
echo "Running training with Binary Cross-Entropy loss (pre-topk)..."
echo "Loss will only be applied to assigned concept latents"
echo "Shuffle operations will use: ${HF_DATASETS_CACHE}"

# Arguments for the training script, kept in an array so every value is
# passed as exactly one argument regardless of its content.
train_args=(
  --checkpoint_path "${CHECKPOINT_PATH}"
  --activations_dir "${ACTIVATIONS_DIR}"
  --device cuda
  --learning_rate 5e-6
  --scores_json_path "${SCORES_JSON_PATH}"
  --num_epochs 100
  --reconstruction_weight 1.0
  --cross_entropy_weight 3.0
  --sparsity_weight 0.01
  --batch_size 128
  --save_dir "${SAVE_DIR}"
  --seed 42
  --validation_split 0.2
  --mixed_batches
  --num_gpus 4
  --gradient_accumulation_steps 1
  --mixed_precision
  --resume
  --patience 5
  --use_float16
)

# Capture the launcher's exit status instead of discarding it, so a failed run
# is neither announced as completed nor reported to Slurm as successful.
train_status=0
torchrun --nproc_per_node=4 "${SCRIPT_NAME}" "${train_args[@]}" || train_status=$?

# Alternative configurations (add to / change in train_args above):
# For training from scratch with random concept assignments:
# --from_scratch
# For different loss weights:
# --cross_entropy_weight 1.0 # Lower BCE weight
# --cross_entropy_weight 5.0 # Higher BCE weight
# For different batch sizes if memory issues:
# --batch_size 64 # Smaller batch size
# --batch_size 256 # Larger batch size (if memory allows)

echo ""
if (( train_status == 0 )); then
  echo "=== TRAINING COMPLETED ==="
  echo "Binary Cross-Entropy loss was applied BEFORE topk selection"
  echo "Only assigned concept latents were influenced by the loss function"
  echo "Wandb logs are incremental and will continue previous run if resumed"
else
  echo "=== TRAINING FAILED (torchrun exit code ${train_status}) ===" >&2
fi

# Clean up temporary files after the run, whether it succeeded or not.
# Best-effort: errors are deliberately ignored. The ${VAR:?} guard aborts
# rather than letting find run without a starting directory.
echo "Cleaning up temporary files..."
for stale_pattern in "*.tmp" "*.lock"; do
  find "${LARGE_CACHE_BASE:?}" -name "${stale_pattern}" -delete 2>/dev/null || true
done

# Optionally, remove the entire temp cache directory if you want to save space
# Uncomment the next line if you want to clean everything after the job
# rm -rf -- "${LARGE_CACHE_BASE:?}"

echo "Job completed at $(date)"
echo "Results have been saved to: ${SAVE_DIR}"
echo "Model checkpoints include incremental saves with optimizer state"
echo "Final disk usage:"
df -h /leonardo_work/IscrC_MAGNIFY/cassano/
df -h /leonardo_scratch/fast/IscrC_MAGNIFY/cassano/

# NOTE(review): the wandb hints below point at ${SAVE_DIR}/wandb; confirm that
# the training script really writes its wandb run there.
echo ""
echo "=== NEXT STEPS ==="
echo "1. Check training logs in: ${SAVE_DIR}/wandb"
echo "2. Latest model checkpoint: ${SAVE_DIR}/latest/"
echo "3. Epoch-specific checkpoints: ${SAVE_DIR}/epoch_*/"
echo "4. To resume training, simply re-run this script with --resume flag"
echo "5. Wandb logs can be synced when online: wandb sync ${SAVE_DIR}/wandb"

# Propagate the training result as the job's exit code.
exit "${train_status}"