Spaces:
Paused
Paused
| # Copyright 2025 Bytedance Ltd. and/or its affiliates. | |
| # SPDX-License-Identifier: Apache-2.0 | |
| from .interleave_datasets import UnifiedEditIterableDataset | |
| from .t2i_dataset import T2IIterableDataset | |
| from .vlm_dataset import SftJSONLIterableDataset | |
# Maps each dataset-type key to the iterable dataset class imported above.
# The keys here are the same top-level keys used in DATASET_INFO.
# NOTE(review): presumably looked up by key when building the training data
# pipeline -- confirm against the caller.
DATASET_REGISTRY = {
    't2i_pretrain': T2IIterableDataset,
    'vlm_sft': SftJSONLIterableDataset,
    'unified_edit': UnifiedEditIterableDataset,
}
# Per-dataset configuration, organised as:
#     DATASET_INFO[<dataset type>][<dataset name>] -> dict of settings
# The first-level keys match the keys of DATASET_REGISTRY.
# NOTE(review): the 'your_data_path/...' values look like placeholders that
# must be replaced with real locations before use -- confirm.
DATASET_INFO = {
    't2i_pretrain': {
        't2i': {
            # path of the parquet files
            'data_dir': 'your_data_path/bagel_example/t2i',
            # number of data units to be sharded across all ranks and workers
            'num_files': 10,
            # number of total samples in the dataset
            'num_total_samples': 1000,
        },
    },
    'unified_edit': {
        'seedxedit_multi': {
            'data_dir': 'your_data_path/bagel_example/editing/seedxedit_multi',
            'num_files': 10,
            'num_total_samples': 1000,
            # information of the parquet files
            'parquet_info_path': 'your_data_path/bagel_example/editing/parquet_info/seedxedit_multi_nas.json',
        },
    },
    'vlm_sft': {
        'llava_ov': {
            'data_dir': 'your_data_path/bagel_example/vlm/images',
            'jsonl_path': 'your_data_path/bagel_example/vlm/llava_ov_si.jsonl',
            'num_total_samples': 1000,
        },
    },
}