import multiprocessing
import time

from arguments import PretokenizationArguments
from datasets import load_dataset

from transformers import AutoTokenizer, HfArgumentParser
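

# Tokenize one example with the globally loaded tokenizer and record its
# characters-per-token ratio, a rough measure of how efficiently the
# tokenizer compresses this example's content.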
def tokenize(example):
    output = {}
    output["input_ids"] = tokenizer(example["content"], truncation=False)["input_ids"]
    output["ratio_char_token"] = len(example["content"]) / len(output["input_ids"])
    return output
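

# `PretokenizationArguments` comes from the local `arguments` module. A
# minimal sketch consistent with how it is used in this script (field names
# are taken from the code below; types and defaults are assumptions):
#
#     @dataclass
#     class PretokenizationArguments:
#         tokenizer_dir: str = "..."        # pretrained tokenizer to load
#         dataset_name: str = "..."         # dataset to pretokenize
#         tokenized_data_repo: str = "..."  # Hub repo for the tokenized output
#         num_workers: int | None = None    # processes for ds.map
#
# Parse the arguments and fall back to all available CPU cores when the
# number of worker processes is not set explicitly.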
parser = HfArgumentParser(PretokenizationArguments)
args = parser.parse_args()
if args.num_workers is None:
    args.num_workers = multiprocessing.cpu_count()
tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_dir)
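
# Load the raw training split and time how long the download/load takes.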
t_start = time.time()
ds = load_dataset(args.dataset_name, split="train")
print(f"Dataset loaded in {time.time()-t_start:.2f}s")
t_start = time.time()
ds = ds.map(
    tokenize,
    num_proc=args.num_workers,
    remove_columns=[
        "repo_name",
        "path",
        "copies",
        "size",
        "content",
        "license",
        "hash",
        "line_mean",
        "line_max",
        "alpha_frac",
        "autogenerated",
    ],
)
print(f"Dataset tokenized in {time.time()-t_start:.2f}s")
t_start = time.time()
ds.push_to_hub(args.tokenized_data_repo)
print(f"Data pushed to the hub in {time.time()-t_start:.2f}s")