import os as __os  # the "__" prefix keeps the alias from being exported
from copy import deepcopy as __deepcopy
import itertools as __itertools

data_root = "DATAS/TRAIN_TEST"
anno_root_it = f"{data_root}/magic_jsons"

# ============== pretraining datasets ==============
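# Each corpus entry is [annotation_json, media_root] for image datasets, plus a
# trailing "video" media-type flag for video datasets.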
available_corpus = dict(
    # image
    # caption_coco=[
    #     f"{anno_root_it}/image/caption/coco/train.json",
    #     f"{data_root}/images/coco",
    # ],
    # caption_llava=[
    #     f"{anno_root_it}/image/caption/llava/train.json",
    #     f"{data_root}/images/coco",
    # ],
    # caption_minigpt4=[
    #     f"{anno_root_it}/image/caption/minigpt4/train.json",
    #     f"{data_root}/images/minigpt4_align/image",
    # ],
    # caption_paragraph_captioning=[
    #     f"{anno_root_it}/image/caption/paragraph_captioning/train.json",
    #     f"{data_root}/images/m3it/image-paragraph-captioning",
    # ],
    # caption_textcaps=[
    #     f"{anno_root_it}/image/caption/textcaps/train.json",
    #     f"{data_root}/images/textcaps",
    # ],
    # classification_imagenet=[
    #     f"{anno_root_it}/image/classification/imagenet/train.json",
    #     f"{data_root}/images/m3it/imagenet",
    # ],
    # classification_coco_itm=[
    #     f"{anno_root_it}/image/classification/coco_itm/train.json",
    #     f"{data_root}/images/coco",
    # ],
    # conversation_llava=[
    #     f"{anno_root_it}/image/conversation/llava/train.json",
    #     f"{data_root}/images/coco",
    # ],
    # reasoning_clevr=[
    #     f"{anno_root_it}/image/reasoning/clevr/train.json",
    #     f"{data_root}/images/m3it/clevr",
    # ],
    # reasoning_visual_mrc=[
    #     f"{anno_root_it}/image/reasoning/visual_mrc/train.json",
    #     f"{data_root}/images/m3it/visual_mrc",
    # ],
    # reasoning_llava=[
    #     f"{anno_root_it}/image/reasoning/llava/train.json",
    #     f"{data_root}/images/coco",
    # ],
    # vqa_vqav2=[
    #     f"{anno_root_it}/image/vqa/vqav2/train.json",
    #     f"{data_root}/images/m3it/vqav2",
    # ],
    # vqa_gqa=[
    #     f"{anno_root_it}/image/vqa/gqa/train.json",
    #     f"{data_root}/images/gqa/images",
    # ],
    # vqa_okvqa=[
    #     f"{anno_root_it}/image/vqa/okvqa/train.json",
    #     f"{data_root}/images/m3it/okvqa",
    # ],
    # vqa_a_okvqa=[
    #     f"{anno_root_it}/image/vqa/a_okvqa/train.json",
    #     f"{data_root}/images/m3it/a_okvqa",
    # ],
    # vqa_viquae=[
    #     f"{anno_root_it}/image/vqa/viquae/train.json",
    #     f"{data_root}/images/viquae_images",
    # ],
    # vqa_ocr_vqa=[
    #     f"{anno_root_it}/image/vqa/ocr_vqa/train.json",
    #     f"{data_root}/images/ocr_vqa/images",
    # ],
    # vqa_text_vqa=[
    #     f"{anno_root_it}/image/vqa/text_vqa/train.json",
    #     f"{data_root}/images/textvqa",
    # ],
    # vqa_st_vqa=[
    #     f"{anno_root_it}/image/vqa/st_vqa/train.json",
    #     f"{data_root}/images/m3it/st-vqa",
    # ],
    # vqa_docvqa=[
    #     f"{anno_root_it}/image/vqa/docvqa/train.json",
    #     f"{data_root}/images/docvqa",
    # ],
    # origin_llava=[
    #     f"{anno_root_it}/image/origin_llava/train.json",
    #     f"{data_root}/images",
    # ],
    # video
    caption_textvr=[
        f"{anno_root_it}/video/caption/textvr/train.json",
        f"{data_root}/videos/TextVR",
        "video",
    ],
    caption_videochat=[
        f"{anno_root_it}/video/caption/videochat/train.json",
        f"{data_root}/videos/webvid_10m",
        "video",
    ],  # not ready, need to read from hdfs
    caption_webvid=[
        f"{anno_root_it}/video/caption/webvid/train.json",
        f"{data_root}/videos/webvid_10m",
        "video",
    ],  # not ready, need to read from hdfs
    caption_youcook2=[
        f"{anno_root_it}/video/caption/youcook2/train.json",
        f"{data_root}/videos/YouCook2/split_videos",
        "video",
    ],
    classification_k710=[
        f"{anno_root_it}/video/classification/k710/train.json",
        f"{data_root}/videos/kinetics",
        "video",
    ],
    classification_ssv2=[
        f"{anno_root_it}/video/classification/ssv2/train.json",
        f"{data_root}/videos/20bn-something-something-v2",
        "video",
    ],
    conversation_videochat1=[
        f"{anno_root_it}/video/conversation/videochat1/train.json",
        f"{data_root}/videos/webvid_10m",
        "video",
    ],  # not ready, need to read from hdfs
    conversation_videochat2=[
        f"{anno_root_it}/video/conversation/videochat2/train.json",
        f"{data_root}/videos/InternVid-10M-FLT/videos",
        "video",
    ],
    conversation_videochatgpt=[
        f"{anno_root_it}/video/conversation/videochatgpt/train.json",
        f"{data_root}/videos/AVideo_ChatGPT",
        "video",
    ],
    reasoning_next_qa=[
        f"{anno_root_it}/video/reasoning/next_qa/train.json",
        f"{data_root}/videos/NExTVideo",
        "video",
    ],
    reasoning_clevrer_qa=[
        f"{anno_root_it}/video/reasoning/clevrer_qa/train.json",
        f"{data_root}/videos/CLEVRER",
        "video",
    ],
    reasoning_clevrer_mc=[
        f"{anno_root_it}/video/reasoning/clevrer_mc/train.json",
        f"{data_root}/videos/CLEVRER",
        "video",
    ],
    vqa_ego_qa=[
        f"{anno_root_it}/video/vqa/ego_qa/train.json",
        f"{data_root}/videos/ego4d_data/split_videos",
        "video",
    ],
    vqa_tgif_frame_qa=[
        f"{anno_root_it}/video/vqa/tgif_frame_qa/train.json",
        f"{data_root}/videos/tgif",
        "video",
    ],
    vqa_tgif_transition_qa=[
        f"{anno_root_it}/video/vqa/tgif_transition_qa/train.json",
        f"{data_root}/videos/tgif",
        "video",
    ],
    vqa_webvid_qa=[
        f"{anno_root_it}/video/vqa/webvid_qa/train.json",
        f"{data_root}/videos/webvid_10m",
        "video",
    ],  # not ready, need to read from hdfs
    origin_videochatgpt=[
        f"{anno_root_it}/video/origin_videochatgpt/train.json",
        f"{data_root}/videos/Video_ChatGPT",
        "video",
    ],
)
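
# Minimal sanity check (a sketch, not part of the training pipeline): warn about
# missing annotation files or media roots for the per-dataset entries above,
# assuming the [annotation_json, media_root, ...] layout noted near the top.
def __check_corpus_paths(corpus):
    for name, spec in corpus.items():
        for path in spec[:2]:  # annotation json, then media root
            if not __os.path.exists(path):
                print(f"[{name}] missing path: {path}")
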
| available_corpus["videochat2_instruction_full"] = [ | |
| available_corpus["caption_coco"], | |
| available_corpus["caption_llava"], | |
| available_corpus["caption_minigpt4"], | |
| available_corpus["caption_paragraph_captioning"], | |
| available_corpus["caption_textcaps"], | |
| available_corpus["classification_imagenet"], | |
| available_corpus["classification_coco_itm"], | |
| available_corpus["conversation_llava"], | |
| available_corpus["reasoning_clevr"], | |
| available_corpus["reasoning_visual_mrc"], | |
| available_corpus["reasoning_llava"], | |
| available_corpus["vqa_vqav2"], | |
| available_corpus["vqa_gqa"], | |
| available_corpus["vqa_okvqa"], | |
| available_corpus["vqa_a_okvqa"], | |
| available_corpus["vqa_viquae"], | |
| available_corpus["vqa_ocr_vqa"], | |
| available_corpus["vqa_text_vqa"], | |
| available_corpus["vqa_st_vqa"], | |
| available_corpus["vqa_docvqa"], | |
| available_corpus["caption_textvr"], | |
| available_corpus["caption_youcook2"], | |
| available_corpus["classification_k710"], | |
| available_corpus["classification_ssv2"], | |
| available_corpus["conversation_videochat2"], | |
| available_corpus["conversation_videochatgpt"], | |
| available_corpus["reasoning_next_qa"], | |
| available_corpus["reasoning_clevrer_qa"], | |
| available_corpus["reasoning_clevrer_mc"], | |
| available_corpus["vqa_ego_qa"], | |
| available_corpus["vqa_tgif_frame_qa"], | |
| available_corpus["vqa_tgif_transition_qa"], | |
| available_corpus["conversation_videochat1"], | |
| available_corpus["vqa_webvid_qa"], | |
| available_corpus["caption_videochat"], | |
| available_corpus["caption_webvid"], | |
| ] | |
| available_corpus["videochat2_video"] = [ | |
| available_corpus["caption_textvr"], | |
| available_corpus["caption_youcook2"], | |
| available_corpus["classification_k710"], | |
| available_corpus["classification_ssv2"], | |
| available_corpus["conversation_videochat2"], | |
| available_corpus["conversation_videochatgpt"], | |
| available_corpus["reasoning_next_qa"], | |
| available_corpus["reasoning_clevrer_qa"], | |
| available_corpus["reasoning_clevrer_mc"], | |
| available_corpus["vqa_ego_qa"], | |
| available_corpus["vqa_tgif_frame_qa"], | |
| available_corpus["vqa_tgif_transition_qa"], | |
| available_corpus["conversation_videochat1"], | |
| available_corpus["vqa_webvid_qa"], | |
| available_corpus["caption_videochat"], | |
| available_corpus["caption_webvid"], | |
| ] | |
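
# Note: the grouped lists above alias the per-dataset entries rather than
# copying them, so mutate a __deepcopy when tweaking one group in isolation,
# e.g. (illustrative only; "my_corpus" is a hypothetical name):
#   my_corpus = __deepcopy(available_corpus["videochat2_video"])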

# ============== for debug ==============
available_corpus["videochat2_instruction_debug"] = [
    # available_corpus["caption_minigpt4"],
    available_corpus["caption_textvr"],
    # available_corpus["vqa_ego_qa"],
    # available_corpus["classification_k710"],
    # available_corpus["reasoning_next_qa"],
    # available_corpus["caption_youcook2"],
    # available_corpus["caption_textcaps"],  # realistic captions, focusing on real-life text
    # available_corpus["caption_textvr"],  # good realistic captioning, also focusing on text
]
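
# Illustrative consumption sketch (hypothetical, not part of this config): a
# training script would select a group by name and unpack each entry, e.g.
#   for anno_json, media_root, media_type in available_corpus["videochat2_instruction_debug"]:
#       load_dataset(anno_json, media_root, media_type)  # hypothetical loader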

if __name__ == "__main__":
    # Report corpus sizes; only the group keys defined above are available here.
    print(len(list(
        __itertools.chain(
            available_corpus["videochat2_video"],
            available_corpus["videochat2_instruction_debug"],
        )
    )))
    print(len(available_corpus["videochat2_instruction_full"]))