move filter to before saving so it doesn't happen every time, update runpod manual script
- README.md +1 -1
- scripts/setup-runpod.sh +3 -3
- src/axolotl/utils/data.py +12 -12
README.md
CHANGED
@@ -155,7 +155,7 @@ use_cpu: false
 - Once you start your runpod, and SSH into it:
 ```shell
 export TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6+PTX"
-source <(curl -s https://raw.githubusercontent.com/
+source <(curl -s https://raw.githubusercontent.com/OpenAccess-AI-Collective/axolotl/dev/scripts/setup-runpod.sh)
 ```
 
 - Once the setup script completes
scripts/setup-runpod.sh
CHANGED
@@ -29,14 +29,14 @@ fi
 # install flash-attn and deepspeed from pre-built wheels for this specific container b/c these take forever to install
 mkdir -p /workspace/wheels
 cd /workspace/wheels
-curl -L -O https://github.com/
-curl -L -O https://github.com/
+curl -L -O https://github.com/OpenAccess-AI-Collective/axolotl/raw/wheels/wheels/deepspeed-0.9.2%2B7ddc3b01-cp38-cp38-linux_x86_64.whl
+curl -L -O https://github.com/OpenAccess-AI-Collective/axolotl/raw/wheels/wheels/flash_attn-1.0.4-cp38-cp38-linux_x86_64.whl
 pip install deepspeed-0.9.2%2B7ddc3b01-cp38-cp38-linux_x86_64.whl
 pip install flash_attn-1.0.4-cp38-cp38-linux_x86_64.whl
 pip install "peft @ git+https://github.com/huggingface/peft.git@main" --force-reinstall --no-dependencies
 
 cd /workspace/
-git clone https://github.com/
+git clone https://github.com/OpenAccess-AI-Collective/axolotl.git
 cd axolotl
 pip install -e .[int4]
 mkdir -p ~/.cache/huggingface/accelerate/
src/axolotl/utils/data.py
CHANGED
@@ -198,6 +198,18 @@ def load_prepare_datasets(tokenizer, cfg, default_dataset_prepared_path):
     )
     dataset = Dataset.from_list([_ for _ in constant_len_dataset])
 
+    # filter out bad data
+    dataset = Dataset.from_list(
+        [
+            d
+            for d in dataset
+            if len(d["input_ids"]) < cfg.sequence_len
+            and len(d["input_ids"]) > 0
+            and len(d["input_ids"]) == len(d["attention_mask"])
+            and len(d["input_ids"]) == len(d["labels"])
+        ]
+    )
+
     if cfg.local_rank == 0:
         logging.info(
             f"Saving packed prepared dataset to disk... {prepared_ds_path}"
@@ -208,18 +220,6 @@ def load_prepare_datasets(tokenizer, cfg, default_dataset_prepared_path):
         tokenizer, cfg, default_dataset_prepared_path
     )
 
-    # filter out bad data
-    dataset = Dataset.from_list(
-        [
-            d
-            for d in dataset
-            if len(d["input_ids"]) < cfg.sequence_len
-            and len(d["input_ids"]) > 0
-            and len(d["input_ids"]) == len(d["attention_mask"])
-            and len(d["input_ids"]) == len(d["labels"])
-        ]
-    )
-
     if cfg.dataset_shard_num and cfg.dataset_shard_idx is not None:
         logging.info(
             f"Using index #{cfg.dataset_shard_idx} of {cfg.dataset_shard_num} shards"