add option for wikipedia
utils.py CHANGED

@@ -112,7 +112,32 @@ def load_hf_dataset(ds_name: str, ds_config: str = None, ds_split: str = "train"
 
     return ds
 
+def download_wikipedia(ds_name, ds_config):
+    ds = load_dataset(ds_name, ds_config, streaming=True, split="train")
+
+    def gen():
+        for example in ds:
+            yield {"text": example["text"]}
+
+    ds2 = Dataset.from_generator(gen)
+
+    chunk_size = 200_000
+
+    filenames = []
+
+    Path("wiki_chunks").mkdir(exist_ok=True)
+
+    for chunk_num, start_idx in enumerate(range(0, len(ds2), chunk_size)):
+        end_idx = min(start_idx + chunk_size, len(ds2))
+
+        temp = ds2.select(range(start_idx, end_idx))
+
+        temp.to_parquet(f"wiki_chunks/chunk_{chunk_num}")
+        filenames.append(f"wiki_chunks/chunk_{chunk_num}")
+
+    return load_dataset("parquet", data_files=filenames, split="train")
+
 
 def get_model_and_tokenizer(model_name: str, optimization_level: str, progress):
     """
     Load the model and tokenizer from the HuggingFace Hub.
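For context: the diff shows only the new function, so the names it relies on (load_dataset and Dataset from the datasets library, Path from pathlib) and the call site inside load_hf_dataset must live elsewhere in utils.py. Below is a minimal sketch, not the Space's actual code, of how the commit's "option for wikipedia" might be wired up; the ds_name == "wikipedia" dispatch and the example config string are assumptions for illustration.

    from pathlib import Path

    from datasets import Dataset, load_dataset

    # Hypothetical call site (not part of this commit): route Wikipedia
    # dumps through the chunked-parquet path added above, fall back to a
    # plain load_dataset call otherwise.
    def load_hf_dataset(ds_name: str, ds_config: str = None, ds_split: str = "train"):
        if ds_name == "wikipedia":
            # Streams the dump, materializes it via a generator, writes it
            # out in 200_000-row parquet chunks, and reloads the chunks as
            # one Arrow-backed dataset.
            return download_wikipedia(ds_name, ds_config)
        return load_dataset(ds_name, ds_config, split=ds_split)

    # Usage (the config string is only an example of a Wikipedia dump name):
    # ds = load_hf_dataset("wikipedia", "20220301.en")

One plausible motivation for the parquet round trip: Dataset.from_generator already caches the streamed examples to disk while it consumes them, and re-exporting in fixed-size chunks then shards the materialized dump into manageable files before it is reloaded as a single memory-mapped dataset.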