recommend padding when using sample packing (#531)
Browse files
- examples/code-llama/13b/lora.yml +1 -0
- examples/code-llama/13b/qlora.yml +1 -0
- examples/code-llama/34b/lora.yml +1 -0
- examples/code-llama/34b/qlora.yml +1 -0
- examples/code-llama/7b/lora.yml +1 -0
- examples/code-llama/7b/qlora.yml +1 -0
- examples/llama-2/lora.yml +1 -0
- examples/llama-2/qlora.yml +1 -0
- examples/llama-2/relora.yml +1 -0
- src/axolotl/utils/config.py +5 -0
- tests/test_validation.py +14 -0
examples/code-llama/13b/lora.yml
CHANGED
|
@@ -17,6 +17,7 @@ output_dir: ./lora-out
|
|
| 17 |
|
| 18 |
sequence_len: 100000
|
| 19 |
sample_packing: true
|
|
|
|
| 20 |
|
| 21 |
adapter: lora
|
| 22 |
lora_model_dir:
|
|
|
|
| 17 |
|
| 18 |
sequence_len: 100000
|
| 19 |
sample_packing: true
|
| 20 |
+
pad_to_sequence_len: true
|
| 21 |
|
| 22 |
adapter: lora
|
| 23 |
lora_model_dir:
|
examples/code-llama/13b/qlora.yml
CHANGED
|
@@ -20,6 +20,7 @@ lora_model_dir:
|
|
| 20 |
|
| 21 |
sequence_len: 100000
|
| 22 |
sample_packing: true
|
|
|
|
| 23 |
|
| 24 |
lora_r: 32
|
| 25 |
lora_alpha: 16
|
|
|
|
| 20 |
|
| 21 |
sequence_len: 100000
|
| 22 |
sample_packing: true
|
| 23 |
+
pad_to_sequence_len: true
|
| 24 |
|
| 25 |
lora_r: 32
|
| 26 |
lora_alpha: 16
|
examples/code-llama/34b/lora.yml
CHANGED
|
@@ -17,6 +17,7 @@ output_dir: ./lora-out
|
|
| 17 |
|
| 18 |
sequence_len: 100000
|
| 19 |
sample_packing: true
|
|
|
|
| 20 |
|
| 21 |
adapter: lora
|
| 22 |
lora_model_dir:
|
|
|
|
| 17 |
|
| 18 |
sequence_len: 100000
|
| 19 |
sample_packing: true
|
| 20 |
+
pad_to_sequence_len: true
|
| 21 |
|
| 22 |
adapter: lora
|
| 23 |
lora_model_dir:
|
examples/code-llama/34b/qlora.yml
CHANGED
|
@@ -20,6 +20,7 @@ lora_model_dir:
|
|
| 20 |
|
| 21 |
sequence_len: 100000
|
| 22 |
sample_packing: true
|
|
|
|
| 23 |
|
| 24 |
lora_r: 32
|
| 25 |
lora_alpha: 16
|
|
|
|
| 20 |
|
| 21 |
sequence_len: 100000
|
| 22 |
sample_packing: true
|
| 23 |
+
pad_to_sequence_len: true
|
| 24 |
|
| 25 |
lora_r: 32
|
| 26 |
lora_alpha: 16
|
examples/code-llama/7b/lora.yml
CHANGED
|
@@ -17,6 +17,7 @@ output_dir: ./lora-out
|
|
| 17 |
|
| 18 |
sequence_len: 100000
|
| 19 |
sample_packing: true
|
|
|
|
| 20 |
|
| 21 |
adapter: lora
|
| 22 |
lora_model_dir:
|
|
|
|
| 17 |
|
| 18 |
sequence_len: 100000
|
| 19 |
sample_packing: true
|
| 20 |
+
pad_to_sequence_len: true
|
| 21 |
|
| 22 |
adapter: lora
|
| 23 |
lora_model_dir:
|
examples/code-llama/7b/qlora.yml
CHANGED
|
@@ -20,6 +20,7 @@ lora_model_dir:
|
|
| 20 |
|
| 21 |
sequence_len: 100000
|
| 22 |
sample_packing: true
|
|
|
|
| 23 |
|
| 24 |
lora_r: 32
|
| 25 |
lora_alpha: 16
|
|
|
|
| 20 |
|
| 21 |
sequence_len: 100000
|
| 22 |
sample_packing: true
|
| 23 |
+
pad_to_sequence_len: true
|
| 24 |
|
| 25 |
lora_r: 32
|
| 26 |
lora_alpha: 16
|
examples/llama-2/lora.yml
CHANGED
|
@@ -17,6 +17,7 @@ output_dir: ./lora-out
|
|
| 17 |
|
| 18 |
sequence_len: 4096
|
| 19 |
sample_packing: true
|
|
|
|
| 20 |
|
| 21 |
adapter: lora
|
| 22 |
lora_model_dir:
|
|
|
|
| 17 |
|
| 18 |
sequence_len: 4096
|
| 19 |
sample_packing: true
|
| 20 |
+
pad_to_sequence_len: true
|
| 21 |
|
| 22 |
adapter: lora
|
| 23 |
lora_model_dir:
|
examples/llama-2/qlora.yml
CHANGED
|
@@ -20,6 +20,7 @@ lora_model_dir:
|
|
| 20 |
|
| 21 |
sequence_len: 4096
|
| 22 |
sample_packing: true
|
|
|
|
| 23 |
|
| 24 |
lora_r: 32
|
| 25 |
lora_alpha: 16
|
|
|
|
| 20 |
|
| 21 |
sequence_len: 4096
|
| 22 |
sample_packing: true
|
| 23 |
+
pad_to_sequence_len: true
|
| 24 |
|
| 25 |
lora_r: 32
|
| 26 |
lora_alpha: 16
|
examples/llama-2/relora.yml
CHANGED
|
@@ -20,6 +20,7 @@ lora_model_dir:
|
|
| 20 |
|
| 21 |
sequence_len: 4096
|
| 22 |
sample_packing: true
|
|
|
|
| 23 |
|
| 24 |
lora_r: 8
|
| 25 |
lora_alpha: 16
|
|
|
|
| 20 |
|
| 21 |
sequence_len: 4096
|
| 22 |
sample_packing: true
|
| 23 |
+
pad_to_sequence_len: true
|
| 24 |
|
| 25 |
lora_r: 8
|
| 26 |
lora_alpha: 16
|
src/axolotl/utils/config.py
CHANGED
|
@@ -97,6 +97,11 @@ def validate_config(cfg):
|
|
| 97 |
)
|
| 98 |
)
|
| 99 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 100 |
if cfg.gradient_accumulation_steps and cfg.batch_size:
|
| 101 |
raise ValueError(
|
| 102 |
"please set only one of gradient_accumulation_steps or batch_size"
|
|
|
|
| 97 |
)
|
| 98 |
)
|
| 99 |
|
| 100 |
+
if cfg.sample_packing and not cfg.pad_to_sequence_len:
|
| 101 |
+
LOG.warning(
|
| 102 |
+
"`pad_to_sequence_len: true` is recommended when using sample_packing"
|
| 103 |
+
)
|
| 104 |
+
|
| 105 |
if cfg.gradient_accumulation_steps and cfg.batch_size:
|
| 106 |
raise ValueError(
|
| 107 |
"please set only one of gradient_accumulation_steps or batch_size"
|
tests/test_validation.py
CHANGED
|
@@ -328,6 +328,20 @@ class ValidationTest(unittest.TestCase):
|
|
| 328 |
for record in self._caplog.records
|
| 329 |
)
|
| 330 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 331 |
cfg = DictDefault(
|
| 332 |
{
|
| 333 |
"max_packed_sequence_len": 2048,
|
|
|
|
| 328 |
for record in self._caplog.records
|
| 329 |
)
|
| 330 |
|
| 331 |
+
cfg = DictDefault(
|
| 332 |
+
{
|
| 333 |
+
"sample_packing": True,
|
| 334 |
+
"pad_to_sequence_len": None,
|
| 335 |
+
}
|
| 336 |
+
)
|
| 337 |
+
with self._caplog.at_level(logging.WARNING):
|
| 338 |
+
validate_config(cfg)
|
| 339 |
+
assert any(
|
| 340 |
+
"`pad_to_sequence_len: true` is recommended when using sample_packing"
|
| 341 |
+
in record.message
|
| 342 |
+
for record in self._caplog.records
|
| 343 |
+
)
|
| 344 |
+
|
| 345 |
cfg = DictDefault(
|
| 346 |
{
|
| 347 |
"max_packed_sequence_len": 2048,
|