Ethan Smith
committed on
Fix bug in dataset loading (#284)
Browse files
* Fix bug in dataset loading
This fixes a bug when loading datasets: `d.data_files` can be a list, which cannot be passed directly to `hf_hub_download`.
* Check type of data_files, and load accordingly
- src/axolotl/utils/data.py +20 -5
src/axolotl/utils/data.py
CHANGED
|
@@ -205,11 +205,26 @@ def load_tokenized_prepared_datasets(
|
|
| 205 |
use_auth_token=use_auth_token,
|
| 206 |
)
|
| 207 |
else:
|
| 208 |
-
|
| 209 |
-
|
| 210 |
-
|
| 211 |
-
|
| 212 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 213 |
ds = load_dataset(
|
| 214 |
"json", name=d.name, data_files=fp, streaming=False, split=None
|
| 215 |
)
|
|
|
|
| 205 |
use_auth_token=use_auth_token,
|
| 206 |
)
|
| 207 |
else:
|
| 208 |
+
if isinstance(d.data_files, str):
|
| 209 |
+
fp = hf_hub_download(
|
| 210 |
+
repo_id=d.path,
|
| 211 |
+
repo_type="dataset",
|
| 212 |
+
filename=d.data_files,
|
| 213 |
+
)
|
| 214 |
+
elif isinstance(d.data_files, list):
|
| 215 |
+
fp = []
|
| 216 |
+
for file in d.data_files:
|
| 217 |
+
fp.append(
|
| 218 |
+
hf_hub_download(
|
| 219 |
+
repo_id=d.path,
|
| 220 |
+
repo_type="dataset",
|
| 221 |
+
filename=file,
|
| 222 |
+
)
|
| 223 |
+
)
|
| 224 |
+
else:
|
| 225 |
+
raise ValueError(
|
| 226 |
+
"data_files must be either a string or list of strings"
|
| 227 |
+
)
|
| 228 |
ds = load_dataset(
|
| 229 |
"json", name=d.name, data_files=fp, streaming=False, split=None
|
| 230 |
)
|