Phương committed
Commit · 0dc226d
1 Parent(s): 099442b
Upload folder using huggingface_hub

- jsonl.py +19 -0
- jsonl_to_parquet.py +10 -0
- my_tokenizer.model +3 -0
- my_tokenizer.vocab +0 -0
- tmp_tf_gcs_fs_pointer_53013 +0 -0
jsonl.py ADDED
@@ -0,0 +1,19 @@
+import sentencepiece as spm
+
+input_file = "output.txt"
+model_prefix = "my_tokenizer"
+vocab_size = 18816
+model_type = "word"
+input_sentence_size = 1000000
+shuffle_input_sentence = True
+
+pad_token = '<pad>'
+bos_token = '<start>'
+eos_token = '<end>'
+unk_token = '<unk>'
+
+spm.SentencePieceTrainer.train(
+    f"--input={input_file} --model_prefix={model_prefix} --vocab_size={vocab_size} --model_type={model_type} --input_sentence_size={input_sentence_size} --shuffle_input_sentence={shuffle_input_sentence} --max_sentence_length=40000 --pad_id=0 --pad_piece={pad_token} --unk_id=1 --unk_piece={unk_token} --bos_id=2 --bos_piece={bos_token} --eos_id=3 --eos_piece={eos_token}"
+)
+
+tokenizer = spm.SentencePieceProcessor(model_file=f"{model_prefix}.model")
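Note (not part of the commit): a minimal sketch of how the tokenizer trained above could be used once my_tokenizer.model exists on disk; the sample sentence is invented for illustration.

import sentencepiece as spm

tokenizer = spm.SentencePieceProcessor(model_file="my_tokenizer.model")

# Special ids mirror the trainer flags above: pad=0, unk=1, bos=2, eos=3.
print(tokenizer.pad_id(), tokenizer.unk_id(), tokenizer.bos_id(), tokenizer.eos_id())

ids = tokenizer.encode("a calm piano melody", out_type=int)  # word-level pieces -> ids
print(ids)
print(tokenizer.decode(ids))  # ids -> text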
jsonl_to_parquet.py ADDED
@@ -0,0 +1,10 @@
+from datasets import load_dataset
+
+# Load the dataset with only the "text" column
+dataset = load_dataset("nRuaif/MusicLM", split="train", columns=["text"], streaming=True)
+
+# Open a file to write the text data to
+with open("output.txt", "w") as f:
+    # Stream through the dataset and write the "text" column to the file, separated by newline
+    for example in dataset:
+        f.write(example["text"] + "\n")
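Note (not part of the commit): despite its name, jsonl_to_parquet.py as committed only dumps the "text" column to output.txt for the tokenizer trainer. A hypothetical sketch of writing the same streamed column to a Parquet file as well, assuming pyarrow is installed, could look like this.

import pyarrow as pa
import pyarrow.parquet as pq
from datasets import load_dataset

dataset = load_dataset("nRuaif/MusicLM", split="train", columns=["text"], streaming=True)

schema = pa.schema([("text", pa.string())])
with pq.ParquetWriter("output.parquet", schema) as writer:
    batch = []
    for example in dataset:
        batch.append(example["text"])
        if len(batch) == 10_000:  # flush in chunks to keep memory bounded
            writer.write_table(pa.table({"text": batch}, schema=schema))
            batch = []
    if batch:  # flush the final partial chunk
        writer.write_table(pa.table({"text": batch}, schema=schema))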
my_tokenizer.model ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b9a99c4bc4789fe99b2d681e4580277310b6afb7c5879dd06542461054a2cad2
+size 578324
my_tokenizer.vocab ADDED
The diff for this file is too large to render. See raw diff.
tmp_tf_gcs_fs_pointer_53013 ADDED
Binary file (8 Bytes).