Create sft.slurm
Browse files
    	
        sft.slurm
    ADDED
    
    | @@ -0,0 +1,39 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
#!/bin/bash
# sft.slurm — SLURM batch script: supervised fine-tuning (SFT) of
# meta-llama/Llama-3.2-1B-Instruct on the APIGen function-calling dataset
# (plaguss/apigen-synth-trl), via TRL's examples/scripts/sft.py launched
# through Accelerate with DeepSpeed ZeRO-3 on a single 8-GPU node.
#
# Usage: sbatch sft.slurm   (submit from the repository root — see notes below)
#
# NOTE(review): the ./logs directory must exist *before* submission: Slurm
# opens the --output/--error files when the job starts, before this script
# body runs, and fails silently if the directory is missing.
#SBATCH --job-name=apigen-fine-tune
#SBATCH --partition=hopper-prod
#SBATCH --qos=normal
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1
#SBATCH --gpus-per-node=8
#SBATCH --output=./logs/%x-%j.out
#SBATCH --error=./logs/%x-%j.err
#SBATCH --time=02-00:00:00

# -e: abort on the first failed command; -x: trace commands for debugging;
# pipefail: a pipeline fails if any stage fails (not just the last one).
# (-u is deliberately omitted: venv 'activate' scripts may dereference
# unset variables such as $PS1.)
set -exo pipefail

module load cuda/12.1

# Activate the project virtualenv. The path is relative, so the job must be
# submitted from the repository root.
source .venv/bin/activate

# Single task drives `accelerate launch`, which itself spawns one process
# per GPU (8) according to the DeepSpeed ZeRO-3 config.
srun --nodes=1 --ntasks=1 --export=ALL,ACCELERATE_LOG_LEVEL=info accelerate launch --config_file examples/accelerate_configs/deepspeed_zero3.yaml examples/scripts/sft.py \
    --run_name=Llama-3.2-1B-Instruct-APIGen-FC-v0.1 \
    --model_name_or_path="meta-llama/Llama-3.2-1B-Instruct" \
    --dataset_name="plaguss/apigen-synth-trl" \
    --report_to="wandb" \
    --learning_rate=5.0e-06 \
    --lr_scheduler_type="cosine" \
    --per_device_train_batch_size=6 \
    --per_device_eval_batch_size=6 \
    --do_eval \
    --eval_strategy="steps" \
    --gradient_accumulation_steps=2 \
    --output_dir="data/Llama-3.2-1B-Instruct-APIGen-FC-v0.1" \
    --logging_steps=5 \
    --eval_steps=50 \
    --num_train_epochs=2 \
    --max_steps=-1 \
    --warmup_steps=50 \
    --max_seq_length=2048 \
    --push_to_hub \
    --gradient_checkpointing \
    --bf16

