Upload llama-1B/16_GPUS/dp-2_tp-4_pp-2_mbz-256
Browse files
llama-1B/16_GPUS/dp-2_tp-4_pp-2_mbz-256/bench.slurm
CHANGED
|
@@ -1,16 +1,16 @@
|
|
| 1 |
#!/bin/bash
|
| 2 |
|
| 3 |
#SBATCH --job-name=bench_cluster
|
| 4 |
-
#SBATCH --time=
|
| 5 |
#SBATCH --partition=hopper-prod
|
| 6 |
#SBATCH --nodes=2
|
| 7 |
#SBATCH --gres=gpu:8
|
| 8 |
-
#SBATCH --qos=
|
| 9 |
#SBATCH --ntasks-per-node=1
|
| 10 |
#SBATCH --cpus-per-task=96
|
| 11 |
#SBATCH --exclusive
|
| 12 |
-
#SBATCH --output=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/
|
| 13 |
-
#SBATCH --error=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/
|
| 14 |
|
| 15 |
# Function to update status based on squeue output
|
| 16 |
update_status() {
|
|
@@ -53,7 +53,7 @@ huggingface-cli login --token $HUGGINGFACE_TOKEN
|
|
| 53 |
|
| 54 |
|
| 55 |
NANOTRON_REPO="/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron"
|
| 56 |
-
CMD="$NANOTRON_REPO/run_train.py --config-file /fsx/ferdinandmom/ferdinand-hf/bench_cluster/
|
| 57 |
|
| 58 |
LAUNCHER="torchrun \
|
| 59 |
--nproc_per_node 8 \
|
|
@@ -72,7 +72,7 @@ cd ..
|
|
| 72 |
job_id=${SLURM_JOB_ID}
|
| 73 |
|
| 74 |
# Update status to "pending" or "running" in the background
|
| 75 |
-
update_status $job_id /fsx/ferdinandmom/ferdinand-hf/bench_cluster/
|
| 76 |
|
| 77 |
# Run the main command
|
| 78 |
srun -u $LAUNCHER $CMD
|
|
@@ -80,28 +80,28 @@ exit_status=$?
|
|
| 80 |
|
| 81 |
# Update status based on the exit status of `srun`
|
| 82 |
if [ $exit_status -eq 0 ]; then
|
| 83 |
-
printf "completed" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/
|
| 84 |
else
|
| 85 |
-
if grep -q "OutOfMemoryError" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/
|
| 86 |
-
printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/
|
| 87 |
-
elif grep -q " CUDA error: an illegal memory access" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/
|
| 88 |
-
printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/
|
| 89 |
-
elif grep -q "Timeout at NCCL" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/
|
| 90 |
-
printf "timeout" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/
|
| 91 |
else
|
| 92 |
-
printf "fail" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/
|
| 93 |
fi
|
| 94 |
fi
|
| 95 |
|
| 96 |
# Run the report script if the job completed successfully
|
| 97 |
if [ $exit_status -eq 0 ]; then
|
| 98 |
-
python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/
|
| 99 |
-
python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/
|
| 100 |
fi
|
| 101 |
|
| 102 |
|
| 103 |
# Push to hub the folder using huggingface_cli
|
| 104 |
-
huggingface-cli upload nanotron/bench_cluster /fsx/ferdinandmom/ferdinand-hf/bench_cluster/
|
| 105 |
|
| 106 |
# Verify the upload
|
| 107 |
if [ $? -eq 0 ]; then
|
|
|
|
| 1 |
#!/bin/bash
|
| 2 |
|
| 3 |
#SBATCH --job-name=bench_cluster
|
| 4 |
+
#SBATCH --time=01:30:00
|
| 5 |
#SBATCH --partition=hopper-prod
|
| 6 |
#SBATCH --nodes=2
|
| 7 |
#SBATCH --gres=gpu:8
|
| 8 |
+
#SBATCH --qos=normal
|
| 9 |
#SBATCH --ntasks-per-node=1
|
| 10 |
#SBATCH --cpus-per-task=96
|
| 11 |
#SBATCH --exclusive
|
| 12 |
+
#SBATCH --output=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/tmp/bench_cluster/llama-1B/16_GPUS/dp-2_tp-4_pp-2_mbz-256/log.out
|
| 13 |
+
#SBATCH --error=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/tmp/bench_cluster/llama-1B/16_GPUS/dp-2_tp-4_pp-2_mbz-256/log.out
|
| 14 |
|
| 15 |
# Function to update status based on squeue output
|
| 16 |
update_status() {
|
|
|
|
| 53 |
|
| 54 |
|
| 55 |
NANOTRON_REPO="/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron"
|
| 56 |
+
CMD="$NANOTRON_REPO/run_train.py --config-file /fsx/ferdinandmom/ferdinand-hf/bench_cluster/tmp/bench_cluster/llama-1B/16_GPUS/dp-2_tp-4_pp-2_mbz-256/config.yaml"
|
| 57 |
|
| 58 |
LAUNCHER="torchrun \
|
| 59 |
--nproc_per_node 8 \
|
|
|
|
| 72 |
job_id=${SLURM_JOB_ID}
|
| 73 |
|
| 74 |
# Update status to "pending" or "running" in the background
|
| 75 |
+
update_status $job_id /fsx/ferdinandmom/ferdinand-hf/bench_cluster/tmp/bench_cluster/llama-1B/16_GPUS/dp-2_tp-4_pp-2_mbz-256/status.txt &
|
| 76 |
|
| 77 |
# Run the main command
|
| 78 |
srun -u $LAUNCHER $CMD
|
|
|
|
| 80 |
|
| 81 |
# Update status based on the exit status of `srun`
|
| 82 |
if [ $exit_status -eq 0 ]; then
|
| 83 |
+
printf "completed" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/tmp/bench_cluster/llama-1B/16_GPUS/dp-2_tp-4_pp-2_mbz-256/status.txt
|
| 84 |
else
|
| 85 |
+
if grep -q "OutOfMemoryError" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/tmp/bench_cluster/llama-1B/16_GPUS/dp-2_tp-4_pp-2_mbz-256/log.out; then
|
| 86 |
+
printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/tmp/bench_cluster/llama-1B/16_GPUS/dp-2_tp-4_pp-2_mbz-256/status.txt
|
| 87 |
+
elif grep -q " CUDA error: an illegal memory access" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/tmp/bench_cluster/llama-1B/16_GPUS/dp-2_tp-4_pp-2_mbz-256/log.out; then
|
| 88 |
+
printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/tmp/bench_cluster/llama-1B/16_GPUS/dp-2_tp-4_pp-2_mbz-256/status.txt
|
| 89 |
+
elif grep -q "Timeout at NCCL" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/tmp/bench_cluster/llama-1B/16_GPUS/dp-2_tp-4_pp-2_mbz-256/log.out; then
|
| 90 |
+
printf "timeout" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/tmp/bench_cluster/llama-1B/16_GPUS/dp-2_tp-4_pp-2_mbz-256/status.txt
|
| 91 |
else
|
| 92 |
+
printf "fail" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/tmp/bench_cluster/llama-1B/16_GPUS/dp-2_tp-4_pp-2_mbz-256/status.txt
|
| 93 |
fi
|
| 94 |
fi
|
| 95 |
|
| 96 |
# Run the report script if the job completed successfully
|
| 97 |
if [ $exit_status -eq 0 ]; then
|
| 98 |
+
python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/tmp/bench_cluster/llama-1B/16_GPUS/dp-2_tp-4_pp-2_mbz-256 --is_logs
|
| 99 |
+
python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/tmp/bench_cluster/llama-1B/16_GPUS/dp-2_tp-4_pp-2_mbz-256 --is_profiler
|
| 100 |
fi
|
| 101 |
|
| 102 |
|
| 103 |
# Push to hub the folder using huggingface_cli
|
| 104 |
+
huggingface-cli upload nanotron/bench_cluster /fsx/ferdinandmom/ferdinand-hf/bench_cluster/tmp/bench_cluster/llama-1B/16_GPUS/dp-2_tp-4_pp-2_mbz-256 llama-1B/16_GPUS/dp-2_tp-4_pp-2_mbz-256 --commit-message "Upload llama-1B/16_GPUS/dp-2_tp-4_pp-2_mbz-256"
|
| 105 |
|
| 106 |
# Verify the upload
|
| 107 |
if [ $? -eq 0 ]; then
|
llama-1B/16_GPUS/dp-2_tp-4_pp-2_mbz-256/log.out
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
llama-1B/16_GPUS/dp-2_tp-4_pp-2_mbz-256/status.txt
CHANGED
|
@@ -1 +1 @@
|
|
| 1 |
-
|
|
|
|
| 1 |
+
oom
|