Phương committed
Commit · 0dc226d
1 Parent(s): 099442b
Upload folder using huggingface_hub

- jsonl.py +19 -0
- jsonl_to_parquet.py +10 -0
- my_tokenizer.model +3 -0
- my_tokenizer.vocab +0 -0
- tmp_tf_gcs_fs_pointer_53013 +0 -0
jsonl.py ADDED
@@ -0,0 +1,19 @@
+import sentencepiece as spm
+
+input_file = "output.txt"
+model_prefix = "my_tokenizer"
+vocab_size = 18816
+model_type = "word"
+input_sentence_size = 1000000
+shuffle_input_sentence = True
+
+pad_token = '<pad>'
+bos_token = '<start>'
+eos_token = '<end>'
+unk_token = '<unk>'
+
+spm.SentencePieceTrainer.train(
+    f"--input={input_file} --model_prefix={model_prefix} --vocab_size={vocab_size} --model_type={model_type} --input_sentence_size={input_sentence_size} --shuffle_input_sentence={shuffle_input_sentence} --max_sentence_length=40000 --pad_id=0 --pad_piece={pad_token} --unk_id=1 --unk_piece={unk_token} --bos_id=2 --bos_piece={bos_token} --eos_id=3 --eos_piece={eos_token}"
+)
+
+tokenizer = spm.SentencePieceProcessor(model_file=f"{model_prefix}.model")
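Note (not part of the commit): a minimal sketch of how the tokenizer trained above could be used once my_tokenizer.model exists on disk; the sample sentence is invented for illustration.

import sentencepiece as spm

tokenizer = spm.SentencePieceProcessor(model_file="my_tokenizer.model")

# Special ids mirror the trainer flags above: pad=0, unk=1, bos=2, eos=3.
print(tokenizer.pad_id(), tokenizer.unk_id(), tokenizer.bos_id(), tokenizer.eos_id())

ids = tokenizer.encode("a calm piano melody", out_type=int)  # word-level pieces -> ids
print(ids)
print(tokenizer.decode(ids))  # ids -> text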
jsonl_to_parquet.py ADDED
@@ -0,0 +1,10 @@
+from datasets import load_dataset
+
+# Load the dataset with only the "text" column
+dataset = load_dataset("nRuaif/MusicLM", split="train", columns=["text"], streaming=True)
+
+# Open a file to write the text data to
+with open("output.txt", "w") as f:
+    # Stream through the dataset and write the "text" column to the file, separated by newline
+    for example in dataset:
+        f.write(example["text"] + "\n")
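Note (not part of the commit): despite its name, jsonl_to_parquet.py as committed only dumps the "text" column to output.txt for the tokenizer trainer. A hypothetical sketch of writing the same streamed column to a Parquet file as well, assuming pyarrow is installed, could look like this.

import pyarrow as pa
import pyarrow.parquet as pq
from datasets import load_dataset

dataset = load_dataset("nRuaif/MusicLM", split="train", columns=["text"], streaming=True)

schema = pa.schema([("text", pa.string())])
with pq.ParquetWriter("output.parquet", schema) as writer:
    batch = []
    for example in dataset:
        batch.append(example["text"])
        if len(batch) == 10_000:  # flush in chunks to keep memory bounded
            writer.write_table(pa.table({"text": batch}, schema=schema))
            batch = []
    if batch:  # flush the final partial chunk
        writer.write_table(pa.table({"text": batch}, schema=schema))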
my_tokenizer.model ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b9a99c4bc4789fe99b2d681e4580277310b6afb7c5879dd06542461054a2cad2
+size 578324
my_tokenizer.vocab ADDED
The diff for this file is too large to render. See raw diff.
tmp_tf_gcs_fs_pointer_53013 ADDED
Binary file (8 Bytes).