shuffle and split dataset after save/load

Changed files:
- FAQS.md +1 -1
- ds_config.json +5 -5
- src/axolotl/utils/data.py +4 -5
- src/axolotl/utils/models.py +1 -1
FAQS.md

@@ -1,4 +1,4 @@
 # FAQs
 
 - Can you train StableLM with this? Yes, but only with a single GPU atm. Multi GPU support is coming soon! Just waiting on this [PR](https://github.com/huggingface/transformers/pull/22874)
--
+- Will this work with Deepspeed? That's still a WIP, but setting `export ACCELERATE_USE_DEEPSPEED=true` should work in some cases
ds_config.json

@@ -11,11 +11,10 @@
         "min_loss_scale": 1
     },
     "scheduler": {
-        "type": "
+        "type": "OneCycle",
         "params": {
-            "
-            "
-            "warmup_num_steps": "auto"
+            "cycle_min_lr": 1e-7,
+            "cycle_max_lr": 1e-4
         }
     },
     "zero_optimization": {
@@ -25,7 +24,8 @@
         "allgather_bucket_size": 5e8,
         "contiguous_gradients": true,
         "reduce_bucket_size": "auto",
-        "reduce_scatter": true
+        "reduce_scatter": true,
+        "stage3_gather_16bit_weights_on_model_save": true
     },
     "gradient_accumulation_steps": "auto",
     "gradient_clipping": "auto",
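The scheduler swap to OneCycle and the added `stage3_gather_16bit_weights_on_model_save` flag only take effect once this JSON is handed to the training run. As a rough illustration (not part of this commit), a config file like this is commonly passed through `transformers.TrainingArguments`; the output directory and batch sizes below are placeholder values:

```python
# Rough illustration (not repo code): wiring a DeepSpeed JSON config such as
# ds_config.json into a Hugging Face Trainer run. Paths and sizes are placeholders.
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./out",                 # placeholder
    per_device_train_batch_size=1,      # placeholder
    gradient_accumulation_steps=4,      # "auto" fields in the JSON resolve from these args
    fp16=True,
    deepspeed="ds_config.json",         # the config shown above
)
```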
src/axolotl/utils/data.py

@@ -119,16 +119,15 @@ def load_prepare_datasets(tokenizer, cfg, default_dataset_prepared_path):
             seq_length=max_packed_sequence_len,
         )
         logging.info("merging, packing, shuffling, and splitting master dataset")
-
-        # re-split when loading again
-        dataset = Dataset.from_list([_ for _ in constant_len_dataset]).train_test_split(
-            test_size=cfg.val_set_size, shuffle=True, seed=42
-        )
+        dataset = Dataset.from_list([_ for _ in constant_len_dataset]).shuffle(seed=42)
 
         if cfg.local_rank == 0:
             logging.info(f"Saving prepared dataset to disk... {prepared_ds_path}")
             dataset.save_to_disk(prepared_ds_path)
 
+    dataset = dataset.train_test_split(
+        test_size=cfg.val_set_size, shuffle=False
+    )
     train_dataset = dataset["train"]
     eval_dataset = dataset["test"]
 
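In isolation, the new ordering (shuffle the packed dataset once before it is cached, then split with `shuffle=False` so the same train/test membership comes back whether the cache was just written or reloaded) looks roughly like the standalone sketch below; the toy records and the 0.1 test size are illustrative, not values from the repo:

```python
# Standalone sketch of the new ordering (toy data, not repo code):
# shuffle once before caching, then split deterministically afterwards so the
# split is identical on a fresh run and on a cache reload.
from datasets import Dataset, load_from_disk

packed = Dataset.from_list([{"input_ids": [i]} for i in range(100)])

dataset = packed.shuffle(seed=42)             # shuffle before save_to_disk
dataset.save_to_disk("/tmp/prepared_ds")      # the cached copy is already shuffled
dataset = load_from_disk("/tmp/prepared_ds")  # reloading yields the same order

split = dataset.train_test_split(test_size=0.1, shuffle=False)
train_dataset, eval_dataset = split["train"], split["test"]
print(len(train_dataset), len(eval_dataset))  # 90 10
```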
src/axolotl/utils/models.py

@@ -75,7 +75,7 @@ def load_model(
     snapshot_download_kwargs = {}
     if cfg.base_model_ignore_patterns:
         snapshot_download_kwargs["ignore_patterns"] = cfg.base_model_ignore_patterns
-    cache_model_path = Path(snapshot_download(base_model, **
+    cache_model_path = Path(snapshot_download(base_model, **snapshot_download_kwargs))
     files = (
         list(cache_model_path.glob("*.pt"))
         + list(cache_model_path.glob("*.safetensors"))
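For reference, the snapshot-download-plus-glob pattern being repaired here can be exercised on its own roughly as follows; the repo id and ignore pattern are placeholders, not values from the axolotl config:

```python
# Standalone sketch of the pattern above (placeholder repo id and ignore pattern):
# download a model snapshot, optionally skipping some files, then glob the local
# snapshot directory for checkpoint weights.
from pathlib import Path
from huggingface_hub import snapshot_download

snapshot_download_kwargs = {}
ignore_patterns = ["*.msgpack"]  # stands in for cfg.base_model_ignore_patterns
if ignore_patterns:
    snapshot_download_kwargs["ignore_patterns"] = ignore_patterns

cache_model_path = Path(snapshot_download("gpt2", **snapshot_download_kwargs))
files = (
    list(cache_model_path.glob("*.pt"))
    + list(cache_model_path.glob("*.safetensors"))
)
print(files)
```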