Spaces:
Runtime error
Runtime error
| from typing import List | |
| import random | |
| import argparse | |
| import json | |
| from huggingface_hub import hf_hub_download | |
| from datasets import Dataset | |
| from sonicverse.constants import ROLE_ASSISTANT, ROLE_USER | |
# Prompt templates for the user turn of a pretraining conversation. Every
# entry contains the literal "<video>" marker exactly once. Entries are kept
# unique so that uniform sampling (random.choice) does not favor any phrasing.
PRETRAIN_PHRASES = [
    "Repeat the content of the video <video>",
    "What is occurring in the video? <video>",
    "<video>. What happened?",
    "Convert <video> to text",
    "What is being depicted in <video>?",
    "What is the content of <video>?",
    "Describe what occurs in the video. <video>",
    "What is the video about? <video>",
    "<video>. Tell me what occurs in the video.",
    "Give me a summary of <video>",
    "<video>. Detail what is happening in the video.",
    "Tell me about <video>",
]
| def _timestamp_to_seconds(timestamp: str): | |
| parts = timestamp.split(":") | |
| seconds = float(parts[-1]) | |
| seconds += float(parts[-2]) * 60 | |
| seconds += float(parts[-3]) * 60 * 60 | |
| return seconds | |
def _write_convo(row) -> dict:
    """Build one pretraining example from a caption row.

    Args:
        row: Mapping with the keys ``"YoutubeID"``, ``"Start_timestamp"``,
            ``"End_timestamp"`` and ``"Caption"``.

    Returns:
        A dict with two entries: ``"videos"``, a one-element list describing
        the clip by YouTube URL plus start and end time in seconds, and
        ``"messages"``, a user prompt drawn at random from
        ``PRETRAIN_PHRASES`` followed by the assistant turn holding the
        caption.

    Raises:
        KeyError: If ``row`` lacks one of the keys listed above.
    """
    video = {
        "url": "https://www.youtube.com/watch?v=" + row["YoutubeID"],
        "start_time": _timestamp_to_seconds(row["Start_timestamp"]),
        "end_time": _timestamp_to_seconds(row["End_timestamp"]),
    }

    # The clip is deliberately not fetched or validated here; an unavailable
    # video is left to fail wherever the clip is actually loaded.
    example = {
        "videos": [video],
    }

    phrase = random.choice(PRETRAIN_PHRASES)
    example["messages"] = [
        {
            "role": ROLE_USER,
            "content": phrase,
        },
        {
            "role": ROLE_ASSISTANT,
            "content": row["Caption"],
        },
    ]
    return example
def main(args):
    """Download the InternVid captions and save them as a conversation dataset.

    Fetches ``caption.jsonl`` from the ``OpenGVLab/InternVid`` dataset repo,
    optionally subsamples it, converts each row with ``_write_convo`` and
    writes the result with ``Dataset.save_to_disk``.

    Args:
        args: Parsed CLI namespace providing ``max_examples`` (upper bound on
            the number of rows kept) and ``output_folder`` (destination
            directory for the saved dataset).
    """
    path = hf_hub_download(
        repo_id="OpenGVLab/InternVid", filename="caption.jsonl", repo_type="dataset"
    )

    # Pin the encoding so decoding does not depend on the platform's locale
    # default, and skip blank lines, which json.loads would reject.
    with open(path, "r", encoding="utf-8") as f:
        rows = [json.loads(line) for line in f if line.strip()]

    print("Dataset size:", len(rows))

    # Only subsample when there is more data than requested; random.sample
    # would raise if asked for more items than exist.
    if len(rows) > args.max_examples:
        rows = random.sample(rows, k=args.max_examples)

    def gen(subset_rows):
        """Yield one example per row, skipping rows that fail to convert."""
        for row in subset_rows:
            try:
                yield _write_convo(row)
            except Exception as e:
                # Best effort: report the failing row and keep going rather
                # than aborting the whole build.
                print(e)

    ds = Dataset.from_generator(gen, gen_kwargs={"subset_rows": rows}, num_proc=5)
    ds.save_to_disk(args.output_folder)
if __name__ == "__main__":
    # Command-line entry point: collect the output location and the example
    # cap, then hand the parsed namespace to main().
    cli = argparse.ArgumentParser()
    cli.add_argument(
        "-o",
        "--output_folder",
        type=str,
        default="/data/xclip-internvid-pretrain",
    )
    cli.add_argument(
        "-n",
        "--max_examples",
        type=int,
        default=500_000,
    )
    main(cli.parse_args())