add option for wikipedia
utils.py CHANGED

@@ -112,7 +112,32 @@ def load_hf_dataset(ds_name: str, ds_config: str = None, ds_split: str = "train"
 
     return ds
 
+def download_wikipedia(ds_name, ds_config):
+    ds = load_dataset(ds_name, ds_config, streaming=True, split="train")
+
+    def gen():
+        for example in ds:
+            yield {"text": example["text"]}
+
+    ds2 = Dataset.from_generator(gen)
+
+    chunk_size = 200_000
+
+    filenames = []
+
+    Path("wiki_chunks").mkdir(exist_ok=True)
+
+    for chunk_num, start_idx in enumerate(range(0, len(ds2), chunk_size)):
+        end_idx = min(start_idx + chunk_size, len(ds2))
+
+        temp = ds2.select(range(start_idx, end_idx))
+
+        temp.to_parquet(f"wiki_chunks/chunk_{chunk_num}")
+        filenames.append(f"wiki_chunks/chunk_{chunk_num}")
+
+    return load_dataset("parquet", data_files=filenames, split="train")
+
 
 def get_model_and_tokenizer(model_name: str, optimization_level: str, progress):
     """
     Load the model and tokenizer from the HuggingFace Hub.
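For context: the diff shows only the new function, so the names it relies on (load_dataset and Dataset from the datasets library, Path from pathlib) and the call site inside load_hf_dataset must live elsewhere in utils.py. Below is a minimal sketch, not the Space's actual code, of how the commit's "option for wikipedia" might be wired up; the ds_name == "wikipedia" dispatch and the example config string are assumptions for illustration.

    from pathlib import Path

    from datasets import Dataset, load_dataset

    # Hypothetical call site (not part of this commit): route Wikipedia
    # dumps through the chunked-parquet path added above, fall back to a
    # plain load_dataset call otherwise.
    def load_hf_dataset(ds_name: str, ds_config: str = None, ds_split: str = "train"):
        if ds_name == "wikipedia":
            # Streams the dump, materializes it via a generator, writes it
            # out in 200_000-row parquet chunks, and reloads the chunks as
            # one Arrow-backed dataset.
            return download_wikipedia(ds_name, ds_config)
        return load_dataset(ds_name, ds_config, split=ds_split)

    # Usage (the config string is only an example of a Wikipedia dump name):
    # ds = load_hf_dataset("wikipedia", "20220301.en")

One plausible motivation for the parquet round trip: Dataset.from_generator already caches the streamed examples to disk while it consumes them, and re-exporting in fixed-size chunks then shards the materialized dump into manageable files before it is reloaded as a single memory-mapped dataset.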