fix(preprocess): Make sure dataset not loaded from cache when using preprocess cli (#1136)
Browse files
- src/axolotl/utils/data.py +10 -2
src/axolotl/utils/data.py
CHANGED
|
@@ -152,7 +152,11 @@ def load_tokenized_prepared_datasets(
|
|
| 152 |
|
| 153 |
if dataset:
|
| 154 |
...
|
| 155 |
-
elif cfg.dataset_prepared_path and any(prepared_ds_path.glob("*")):
|
|
|
|
|
|
|
|
|
|
|
|
|
| 156 |
LOG.info(f"Loading prepared dataset from disk at {prepared_ds_path}...")
|
| 157 |
dataset = load_from_disk(str(prepared_ds_path))
|
| 158 |
LOG.info("Prepared dataset loaded from disk...")
|
|
@@ -465,7 +469,11 @@ def load_prepare_datasets(
|
|
| 465 |
|
| 466 |
if dataset:
|
| 467 |
...
|
| 468 |
-
elif cfg.dataset_prepared_path and any(prepared_ds_path.glob("*")):
|
|
|
|
|
|
|
|
|
|
|
|
|
| 469 |
LOG.info(
|
| 470 |
f"Loading prepared packed dataset from disk at {prepared_ds_path}..."
|
| 471 |
)
|
|
|
|
| 152 |
|
| 153 |
if dataset:
|
| 154 |
...
|
| 155 |
+
elif (
|
| 156 |
+
cfg.dataset_prepared_path
|
| 157 |
+
and any(prepared_ds_path.glob("*"))
|
| 158 |
+
and not cfg.is_preprocess
|
| 159 |
+
):
|
| 160 |
LOG.info(f"Loading prepared dataset from disk at {prepared_ds_path}...")
|
| 161 |
dataset = load_from_disk(str(prepared_ds_path))
|
| 162 |
LOG.info("Prepared dataset loaded from disk...")
|
|
|
|
| 469 |
|
| 470 |
if dataset:
|
| 471 |
...
|
| 472 |
+
elif (
|
| 473 |
+
cfg.dataset_prepared_path
|
| 474 |
+
and any(prepared_ds_path.glob("*"))
|
| 475 |
+
and not cfg.is_preprocess
|
| 476 |
+
):
|
| 477 |
LOG.info(
|
| 478 |
f"Loading prepared packed dataset from disk at {prepared_ds_path}..."
|
| 479 |
)
|