Note the oscillating training-loss pattern when using group_by_length
Browse files
README.md
CHANGED
|
@@ -427,7 +427,8 @@ save_safetensors:
|
|
| 427 |
# whether to mask out or include the human's prompt from the training labels
|
| 428 |
train_on_inputs: false
|
| 429 |
# group similarly sized data to minimize padding
|
| 430 |
-
# may be slower to start as it must download and sort the entire dataset
|
|
|
|
| 431 |
group_by_length: false
|
| 432 |
|
| 433 |
# Whether to use gradient checkpointing https://huggingface.co/docs/transformers/v4.18.0/en/performance#gradient-checkpointing
|
|
|
|
| 427 |
# whether to mask out or include the human's prompt from the training labels
|
| 428 |
train_on_inputs: false
|
| 429 |
# group similarly sized data to minimize padding
|
| 430 |
+
# may be slower to start, as it must download and sort the entire dataset
|
| 431 |
+
# note that training loss may have an oscillating pattern with this enabled
|
| 432 |
group_by_length: false
|
| 433 |
|
| 434 |
# Whether to use gradient checkpointing https://huggingface.co/docs/transformers/v4.18.0/en/performance#gradient-checkpointing
|