Upload 14 files

- data/__init__.py +2 -0
- data/configs/example.yaml +45 -0
- data/data_utils.py +177 -0
- data/dataset_base.py +620 -0
- data/dataset_info.py +39 -0
- data/distributed_iterable_dataset.py +58 -0
- data/interleave_datasets/__init__.py +5 -0
- data/interleave_datasets/edit_dataset.py +72 -0
- data/interleave_datasets/interleave_t2i_dataset.py +212 -0
- data/parquet_utils.py +90 -0
- data/t2i_dataset.py +128 -0
- data/transforms.py +287 -0
- data/video_utils.py +165 -0
- data/vlm_dataset.py +195 -0
    	
data/__init__.py
ADDED

@@ -0,0 +1,2 @@
+# Copyright 2025 Bytedance Ltd. and/or its affiliates.
+# SPDX-License-Identifier: Apache-2.0
data/configs/example.yaml
ADDED

@@ -0,0 +1,45 @@
+t2i_pretrain:
+  dataset_names:
+  - t2i
+  image_transform_args:
+    image_stride: 16
+    max_image_size: 1024
+    min_image_size: 512
+  is_mandatory: true
+  num_used_data: # The sum should be larger than NUM_GPUS x NUM_WORKERS
+  - 10
+  weight: 1
+
+unified_edit:
+  dataset_names:
+  - seedxedit_multi
+  image_transform_args:
+    image_stride: 16
+    max_image_size: 1024
+    min_image_size: 512
+  vit_image_transform_args:
+    image_stride: 14
+    max_image_size: 518
+    min_image_size: 224
+  is_mandatory: false
+  num_used_data:
+  - 10
+  weight: 1
+
+vlm_sft:
+  dataset_names:
+  - llava_ov
+  image_transform_args:
+    image_stride: 14
+    max_image_size: 980
+    min_image_size: 378
+    max_pixels: 2_007_040
+  frame_sampler_args:
+    max_num_frames: 12
+    min_num_frames: 8
+  is_mandatory: true
+  shuffle_lines: True
+  shuffle_seed: 0
+  num_used_data:
+  - 1000
+  weight: 1
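As a rough usage sketch (not part of the commit; assumes PyYAML, path illustrative): each top-level key names one dataset group, and its value is the argument dict consumed by PackedDataset.build_datasets in data/dataset_base.py below.

# Illustrative only, not part of the commit.
import yaml

with open("data/configs/example.yaml") as f:
    grouped = yaml.safe_load(f)

print(grouped["t2i_pretrain"]["weight"])         # 1
print(grouped["vlm_sft"]["frame_sampler_args"])  # {'max_num_frames': 12, 'min_num_frames': 8}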
    	
data/data_utils.py
ADDED

@@ -0,0 +1,177 @@
+# Copyright 2025 Bytedance Ltd. and/or its affiliates.
+# SPDX-License-Identifier: Apache-2.0
+
+
+import math
+import random
+from PIL import Image
+
+import torch
+from torch.nn.attention.flex_attention import or_masks, and_masks
+
+
+def create_sparse_mask(document_lens, split_lens, attn_modes, device):
+    def causal_mask(b, h, q_idx, kv_idx):
+        return q_idx >= kv_idx
+
+    def full_and_noise_mask(b, h, q_idx, kv_idx):
+        return (full_and_noise_seq_id[q_idx] == full_and_noise_seq_id[kv_idx]) & (full_and_noise_seq_id[q_idx] >= 0)
+
+    def remove_noise_mask(b, h, q_idx, kv_idx):
+        return (~((noise_seq_id[kv_idx] >= 0) & (noise_seq_id[q_idx] != noise_seq_id[kv_idx])))
+
+    def sample_mask(b, h, q_idx, kv_idx):
+        return document_id[q_idx] == document_id[kv_idx]
+
+    full_and_noise_tmp = []
+    noise_tmp = []
+
+    for i, (length, mode) in enumerate(zip(split_lens, attn_modes)):
+        value = i if mode in ['full', 'noise'] else -1
+        full_and_noise_tmp.extend([value] * length)
+        value_noise = i if mode == 'noise' else -1
+        noise_tmp.extend([value_noise] * length)
+
+    full_and_noise_seq_id = torch.Tensor(full_and_noise_tmp).to(device)
+    noise_seq_id = torch.Tensor(noise_tmp).to(device)
+
+    document_id = torch.cat([torch.full((l,), i) for i, l in enumerate(document_lens, start=1)]).to(device)
+
+    return and_masks(or_masks(causal_mask, full_and_noise_mask), remove_noise_mask, sample_mask)
+
+
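create_sparse_mask returns a flex-attention mask_mod predicate rather than a dense mask. A minimal sketch of consuming it (assumes PyTorch >= 2.5; all lengths illustrative):

# Illustrative only, not part of the commit.
from torch.nn.attention.flex_attention import create_block_mask, flex_attention

# One packed 128-token document: a 64-token causal split then a 64-token full split.
mask_mod = create_sparse_mask(
    document_lens=[128], split_lens=[64, 64],
    attn_modes=['causal', 'full'], device='cpu',
)
block_mask = create_block_mask(mask_mod, B=None, H=None, Q_LEN=128, KV_LEN=128, device='cpu')

q = k = v = torch.randn(1, 4, 128, 64)  # (batch, heads, seq, head_dim)
out = flex_attention(q, k, v, block_mask=block_mask)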
+def patchify(image, patch_size):
+    p = patch_size
+    c, h, w = image.shape
+    assert h % p == 0 and w % p == 0
+    image = image.reshape(c, h // p, p, w // p, p)
+    image = torch.einsum("chpwq->hwpqc", image)
+    image = image.reshape(-1, p**2 * c)
+    return image
+
+
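For intuition: a 3x32x32 image with patch_size=16 gives a 2x2 patch grid, i.e. 4 tokens of 16*16*3 = 768 values each.

# Illustrative only, not part of the commit.
img = torch.randn(3, 32, 32)
tokens = patchify(img, patch_size=16)
print(tokens.shape)  # torch.Size([4, 768]) -- (num_patches, p*p*c)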
+def get_flattened_position_ids_extrapolate(img_h, img_w, patch_size, max_num_patches_per_side):
+    num_patches_h, num_patches_w = img_h // patch_size, img_w // patch_size
+    coords_h = torch.arange(0, num_patches_h)
+    coords_w = torch.arange(0, num_patches_w)
+    pos_ids = (coords_h[:, None] * max_num_patches_per_side + coords_w).flatten()
+    return pos_ids
+
+
+def get_flattened_position_ids_interpolate(img_h, img_w, patch_size, max_num_patches_per_side):
+    num_patches_h, num_patches_w = img_h // patch_size, img_w // patch_size
+    boundaries = torch.arange(1 / max_num_patches_per_side, 1.0, 1 / max_num_patches_per_side)
+    fractional_coords_h = torch.arange(0, 1 - 1e-6, 1 / num_patches_h)
+    fractional_coords_w = torch.arange(0, 1 - 1e-6, 1 / num_patches_w)
+    bucket_coords_h = torch.bucketize(fractional_coords_h, boundaries, right=True)
+    bucket_coords_w = torch.bucketize(fractional_coords_w, boundaries, right=True)
+    pos_ids = (bucket_coords_h[:, None] * max_num_patches_per_side + bucket_coords_w).flatten()
+    return pos_ids
+
+
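The two variants place a small patch grid differently inside the max_num_patches_per_side table: extrapolate keeps unit steps from the top-left corner, while interpolate stretches the grid indices across the whole table. A toy comparison (a 2x2 grid in a 4x4 table):

# Illustrative only, not part of the commit.
ids_ex = get_flattened_position_ids_extrapolate(28, 28, 14, max_num_patches_per_side=4)
ids_in = get_flattened_position_ids_interpolate(28, 28, 14, max_num_patches_per_side=4)
print(ids_ex)  # tensor([0, 1, 4, 5])  -- packed into the top-left corner
print(ids_in)  # tensor([0, 2, 8, 10]) -- spread over the full table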
+def prepare_attention_mask_per_sample(split_lens, attn_modes, device="cpu"):
+    """
+    split_lens: A list of ints. Each int indicates the length of a split within
+        a sample, where each sample contains multiple splits with different attn modes.
+    attn_modes: whether to use full attn in each split.
+    """
+    sample_len = sum(split_lens)
+    attention_mask = torch.zeros((sample_len, sample_len), dtype=torch.bool, device=device)
+
+    csum = 0
+    for s, attn_mode in zip(split_lens, attn_modes):
+        assert attn_mode in ['causal', 'full', 'noise']
+        if attn_mode == "causal":
+            attention_mask[csum:csum + s, csum:csum + s] = torch.ones((s, s), device=device).tril()
+            attention_mask[csum:csum + s, :csum] = 1
+        else:
+            attention_mask[csum:csum + s, csum:csum + s] = torch.ones((s, s))
+            attention_mask[csum:csum + s, :csum] = 1
+        csum += s
+
+    csum = 0
+    for s, attn_mode in zip(split_lens, attn_modes):
+        if attn_mode == "noise":
+            attention_mask[:, csum : csum + s] = torch.zeros((sample_len, s))
+            attention_mask[csum : csum + s, csum : csum + s] = torch.ones((s, s))
+        csum += s
+
+    attention_mask = torch.zeros_like(attention_mask, dtype=torch.float).masked_fill_(
+        ~attention_mask, float("-inf")
+    )
+
+    return attention_mask
+
+
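A toy call shows the resulting additive mask: 0.0 marks allowed positions, -inf blocked ones. With a 2-token causal split followed by a 2-token full split:

# Illustrative only, not part of the commit.
mask = prepare_attention_mask_per_sample([2, 2], ['causal', 'full'])
print(mask)
# tensor([[0., -inf, -inf, -inf],
#         [0., 0., -inf, -inf],
#         [0., 0., 0., 0.],
#         [0., 0., 0., 0.]])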
+def split_integer_exp_decay(S, ng_sample_decay=1.0):
+    if ng_sample_decay == 1.0:
+        N = random.randint(1, S)
+    else:
+        base = (1 - ng_sample_decay) / (1 - math.pow(ng_sample_decay, S))
+        p = [base * math.pow(ng_sample_decay, i) for i in range(S)]
+        N = random.choices(list(range(1, S + 1)), p, k=1)[0]
+    cumsum = [0] + sorted(random.sample(range(1, S), N - 1)) + [S]
+    result = [cumsum[i+1] - cumsum[i] for i in range(len(cumsum) - 1)]
+    return result, cumsum
+
+
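This samples a random composition of S: first the number of parts N (uniform, or geometrically decaying when ng_sample_decay < 1), then N positive parts summing to S.

# Illustrative only, not part of the commit.
random.seed(0)
parts, cuts = split_integer_exp_decay(10)
print(sum(parts))  # 10 -- the parts always sum to S; cuts are the split points starting at 0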
+def pil_img2rgb(image):
+    if image.mode == "RGBA" or image.info.get("transparency", None) is not None:
+        image = image.convert("RGBA")
+        white = Image.new(mode="RGB", size=image.size, color=(255, 255, 255))
+        white.paste(image, mask=image.split()[3])
+        image = white
+    else:
+        image = image.convert("RGB")
+
+    return image
+
+
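Unlike a plain convert("RGB"), this composites transparency onto a white background:

# Illustrative only, not part of the commit.
rgba = Image.new("RGBA", (4, 4), (255, 0, 0, 0))  # fully transparent red
rgb = pil_img2rgb(rgba)
print(rgb.mode, rgb.getpixel((0, 0)))  # RGB (255, 255, 255)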
+def add_special_tokens(tokenizer):
+    all_special_tokens = []
+    for k, v in tokenizer.special_tokens_map.items():
+        if isinstance(v, str):
+            all_special_tokens.append(v)
+        elif isinstance(v, list):
+            all_special_tokens += v
+
+    new_tokens = []
+
+    if '<|im_start|>' not in all_special_tokens:
+        new_tokens.append('<|im_start|>')
+
+    if '<|im_end|>' not in all_special_tokens:
+        new_tokens.append('<|im_end|>')
+
+    if '<|vision_start|>' not in all_special_tokens:
+        new_tokens.append('<|vision_start|>')
+
+    if '<|vision_end|>' not in all_special_tokens:
+        new_tokens.append('<|vision_end|>')
+
+    num_new_tokens = tokenizer.add_tokens(new_tokens)
+    bos_token_id = tokenizer.convert_tokens_to_ids('<|im_start|>')
+    eos_token_id = tokenizer.convert_tokens_to_ids('<|im_end|>')
+    start_of_image = tokenizer.convert_tokens_to_ids('<|vision_start|>')
+    end_of_image = tokenizer.convert_tokens_to_ids('<|vision_end|>')
+
+    new_token_ids = dict(
+        bos_token_id=bos_token_id,
+        eos_token_id=eos_token_id,
+        start_of_image=start_of_image,
+        end_of_image=end_of_image,
+    )
+
+    return tokenizer, new_token_ids, num_new_tokens
+
+
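A usage sketch (assumes a Hugging Face tokenizer; the checkpoint name is illustrative). The returned new_token_ids dict is what PackedDataset below receives as special_tokens:

# Illustrative only, not part of the commit.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B")  # any HF tokenizer
tokenizer, new_token_ids, num_new_tokens = add_special_tokens(tokenizer)
print(sorted(new_token_ids))  # ['bos_token_id', 'end_of_image', 'eos_token_id', 'start_of_image']
# If num_new_tokens > 0, the model's input/output embeddings must be resized to match.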
+def len2weight(x, loss_reduction='square'):
+    if x == 0:
+        return x
+    if loss_reduction == 'token':
+        return 1
+    if loss_reduction == 'sample':
+        return 1 / x
+    if loss_reduction == 'square':
+        return 1 / (x ** 0.5)
+    raise NotImplementedError(loss_reduction)
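len2weight maps a text-split length to a per-token CE weight; with the default 'square' reduction a split of length n gets 1/sqrt(n) per token:

# Illustrative only, not part of the commit.
print([len2weight(n) for n in (1, 4, 16)])     # [1.0, 0.5, 0.25]
print(len2weight(8, loss_reduction='sample'))  # 0.125 -- every sample contributes equally overall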
    	
data/dataset_base.py
ADDED

@@ -0,0 +1,620 @@
+# Copyright 2025 Bytedance Ltd. and/or its affiliates.
+# SPDX-License-Identifier: Apache-2.0
+
+
+import random
+import json
+
+import numpy as np
+import torch
+
+from .data_utils import (
+    get_flattened_position_ids_interpolate,
+    get_flattened_position_ids_extrapolate,
+    len2weight,
+    patchify,
+    prepare_attention_mask_per_sample,
+)
+from .dataset_info import DATASET_INFO, DATASET_REGISTRY
+from .transforms import ImageTransform
+from .video_utils import FrameSampler
+
+
+class DataConfig:
+    def __init__(
+        self,
+        grouped_datasets,
+        text_cond_dropout_prob=0.1,
+        vit_cond_dropout_prob=0.4,
+        vae_cond_dropout_prob=0.1,
+        vae_image_downsample=16,
+        max_latent_size=32,
+        vit_patch_size=14,
+        max_num_patch_per_side=70,
+    ):
+        self.grouped_datasets = grouped_datasets
+        self.text_cond_dropout_prob = text_cond_dropout_prob
+        self.vit_cond_dropout_prob = vit_cond_dropout_prob
+        self.vit_patch_size = vit_patch_size
+        self.max_num_patch_per_side = max_num_patch_per_side
+        self.vae_cond_dropout_prob = vae_cond_dropout_prob
+        self.vae_image_downsample = vae_image_downsample
+        self.max_latent_size = max_latent_size
+
+
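Connecting this to data/configs/example.yaml (sketch, not part of the commit): the parsed YAML dict is passed through as grouped_datasets, and the dropout/size fields keep their defaults unless overridden.

# Illustrative only; `grouped` is the parsed example.yaml dict from the earlier snippet.
data_config = DataConfig(grouped_datasets=grouped)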
+class PackedDataset(torch.utils.data.IterableDataset):
+    def __init__(
+        self,
+        data_config,
+        tokenizer,
+        special_tokens,
+        local_rank,
+        world_size,
+        num_workers,
+        expected_num_tokens=32768,
+        max_num_tokens_per_sample=16384,
+        max_num_tokens=36864,
+        prefer_buffer_before=16384,
+        max_buffer_size=50,
+        interpolate_pos=False,
+        use_flex=False,
+        data_status=None,
+    ):
+        super().__init__()
+        self.expected_num_tokens = expected_num_tokens
+        self.max_num_tokens_per_sample = max_num_tokens_per_sample
+        self.prefer_buffer_before = prefer_buffer_before
+        self.max_num_tokens = max_num_tokens
+        self.max_buffer_size = max_buffer_size
+        self.tokenizer = tokenizer
+        self.local_rank = local_rank
+        self.world_size = world_size
+        self.num_workers = num_workers
+        self.use_flex = use_flex
+        for k, v in special_tokens.items():
+            setattr(self, k, v)
+
+        grouped_datasets, is_mandatory, grouped_weights = self.build_datasets(
+            data_config.grouped_datasets, data_status
+        )
+        self.grouped_datasets = grouped_datasets
+        self.dataset_iters = [iter(dataset) for dataset in grouped_datasets]
+        self.is_mandatory = is_mandatory
+        self.grouped_weights = grouped_weights
+        self.data_config = data_config
+        self.interpolate_pos = interpolate_pos
+        if self.interpolate_pos:
+            self.get_flattened_position_ids = get_flattened_position_ids_interpolate
+        else:
+            self.get_flattened_position_ids = get_flattened_position_ids_extrapolate
+
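A single-process construction sketch (illustrative arguments), reusing add_special_tokens from data/data_utils.py:

# Illustrative only, not part of the commit.
dataset = PackedDataset(
    data_config,
    tokenizer=tokenizer,
    special_tokens=new_token_ids,  # becomes self.bos_token_id, self.start_of_image, ...
    local_rank=0, world_size=1, num_workers=1,
)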
+    def build_datasets(self, datasets_metainfo, data_status):
+        datasets = []
+        is_mandatory = []
+        grouped_weights = []
+        for grouped_dataset_name, dataset_args in datasets_metainfo.items():
+            is_mandatory.append(dataset_args.pop('is_mandatory', False))
+            grouped_weights.append(dataset_args.pop('weight', 0.0))
+
+            if 'frame_sampler_args' in dataset_args.keys():
+                frame_sampler = FrameSampler(**dataset_args.pop('frame_sampler_args'))
+                dataset_args['frame_sampler'] = frame_sampler
+            if 'image_transform_args' in dataset_args.keys():
+                transform = ImageTransform(**dataset_args.pop('image_transform_args'))
+                dataset_args['transform'] = transform
+            if 'vit_image_transform_args' in dataset_args.keys():
+                vit_transform = ImageTransform(**dataset_args.pop('vit_image_transform_args'))
+                dataset_args['vit_transform'] = vit_transform
+
+            assert 'dataset_names' in dataset_args.keys()
+            dataset_names = dataset_args.pop('dataset_names')
+            dataset_args['data_dir_list'] = []
+            for item in dataset_names:
+                if self.local_rank == 0:
+                    print(f'Preparing Dataset {grouped_dataset_name}/{item}')
+                meta_info = DATASET_INFO[grouped_dataset_name][item]
+                dataset_args['data_dir_list'].append(meta_info['data_dir'])
+
+                if "parquet_info_path" in meta_info.keys():
+                    if 'parquet_info' not in dataset_args.keys():
+                        dataset_args['parquet_info'] = {}
+                    with open(meta_info['parquet_info_path'], 'r') as f:
+                        parquet_info = json.load(f)
+                    dataset_args['parquet_info'].update(parquet_info)
+
+                if 'json_dir' in meta_info.keys():
+                    # parquet/tar with json
+                    if 'json_dir_list' not in dataset_args.keys():
+                        dataset_args['json_dir_list'] = [meta_info['json_dir']]
+                    else:
+                        dataset_args['json_dir_list'].append(meta_info['json_dir'])
+
+                if 'jsonl_path' in meta_info.keys():
+                    # jsonl with jpeg
+                    if 'jsonl_path_list' not in dataset_args.keys():
+                        dataset_args['jsonl_path_list'] = [meta_info['jsonl_path']]
+                    else:
+                        dataset_args['jsonl_path_list'].append(meta_info['jsonl_path'])
+
+            resume_data_status = dataset_args.pop('resume_data_status', True)
+            if data_status is not None and grouped_dataset_name in data_status.keys() and resume_data_status:
+                data_status_per_group = data_status[grouped_dataset_name]
+            else:
+                data_status_per_group = None
+            dataset = DATASET_REGISTRY[grouped_dataset_name](
+                dataset_name=grouped_dataset_name,
+                tokenizer=self.tokenizer,
+                local_rank=self.local_rank,
+                world_size=self.world_size,
+                num_workers=self.num_workers,
+                data_status=data_status_per_group,
+                **dataset_args
+            )
+            datasets.append(dataset)
+
+        return datasets, is_mandatory, grouped_weights
+
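build_datasets only reads a handful of keys per entry, so a DATASET_INFO record presumably looks roughly like this (hypothetical; data/dataset_info.py is not expanded in this view):

# Hypothetical shape inferred from the lookups above -- not the actual file contents.
DATASET_INFO = {
    't2i_pretrain': {
        't2i': {
            'data_dir': '/path/to/shards',              # required
            'parquet_info_path': '/path/to/info.json',  # optional
            # 'json_dir' / 'jsonl_path' select the other storage layouts
        },
    },
}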
+    def set_epoch(self, seed):
+        for dataset in self.grouped_datasets:
+            dataset.set_epoch(seed)
+
+    def set_sequence_status(self):
+        sequence_status = dict(
+            curr                        = 0,
+            sample_lens                 = list(),
+            packed_position_ids         = list(),
+            nested_attention_masks      = list(),
+            split_lens                  = list(),
+            attn_modes                  = list(),
+            packed_text_ids             = list(),
+            packed_text_indexes         = list(),
+            packed_label_ids            = list(),
+            ce_loss_indexes             = list(),
+            ce_loss_weights             = list(),
+            vae_image_tensors           = list(),
+            packed_latent_position_ids  = list(),
+            vae_latent_shapes           = list(),
+            packed_vae_token_indexes    = list(),
+            packed_timesteps            = list(),
+            mse_loss_indexes            = list(),
+            packed_vit_tokens           = list(),
+            vit_token_seqlens           = list(),
+            packed_vit_position_ids     = list(),
+            packed_vit_token_indexes    = list(),
+        )
+        return sequence_status
+
+    def to_tensor(self, sequence_status):
+        data = dict(
+            sequence_length=sum(sequence_status['sample_lens']),
+            sample_lens=sequence_status['sample_lens'],
+            packed_text_ids=torch.tensor(sequence_status['packed_text_ids']),
+            packed_text_indexes=torch.tensor(sequence_status['packed_text_indexes']),
+            packed_position_ids=torch.tensor(sequence_status['packed_position_ids']),
+        )
+        if not self.use_flex:
+            data['nested_attention_masks'] = sequence_status['nested_attention_masks']
+        else:
+            sequence_len = data['sequence_length']
+            pad_len = self.max_num_tokens - sequence_len
+            data['split_lens'] = sequence_status['split_lens'] + [pad_len]
+            data['attn_modes'] = sequence_status['attn_modes'] + ['causal']
+            data['sample_lens'] += [pad_len]
+
+        # if the model has a convnet vae (e.g., as visual tokenizer)
+        if len(sequence_status['vae_image_tensors']) > 0:
+            image_tensors = sequence_status.pop('vae_image_tensors')
+            image_sizes = [item.shape for item in image_tensors]
+            max_image_size = [max(item) for item in list(zip(*image_sizes))]
+            padded_images = torch.zeros(size=(len(image_tensors), *max_image_size))
+            for i, image_tensor in enumerate(image_tensors):
+                padded_images[i, :, :image_tensor.shape[1], :image_tensor.shape[2]] = image_tensor
+
+            data['padded_images'] = padded_images
+            data['patchified_vae_latent_shapes'] = sequence_status['vae_latent_shapes']
+            data['packed_latent_position_ids'] = torch.cat(sequence_status['packed_latent_position_ids'], dim=0)
+            data['packed_vae_token_indexes'] = torch.tensor(sequence_status['packed_vae_token_indexes'])
+
+        # if the model has a vit (e.g., as visual tokenizer)
+        if len(sequence_status['packed_vit_tokens']) > 0:
+            data['packed_vit_tokens'] = torch.cat(sequence_status['packed_vit_tokens'], dim=0)
+            data['packed_vit_position_ids'] = torch.cat(sequence_status['packed_vit_position_ids'], dim=0)
+            data['packed_vit_token_indexes'] = torch.tensor(sequence_status['packed_vit_token_indexes'])
+            data['vit_token_seqlens'] = torch.tensor(sequence_status['vit_token_seqlens'])
+
+        # if the model is required to perform visual generation
+        if len(sequence_status['packed_timesteps']) > 0:
+            data['packed_timesteps'] = torch.tensor(sequence_status['packed_timesteps'])
+            data['mse_loss_indexes'] = torch.tensor(sequence_status['mse_loss_indexes'])
+
+        # if the model is required to perform text generation
+        if len(sequence_status['packed_label_ids']) > 0:
+            data['packed_label_ids'] = torch.tensor(sequence_status['packed_label_ids'])
+            data['ce_loss_indexes'] = torch.tensor(sequence_status['ce_loss_indexes'])
+            data['ce_loss_weights'] = torch.tensor(sequence_status['ce_loss_weights'])
+
+        return data
+
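The yielded dict is keyed by modality, so a training step can branch on what is present (sketch, continuing the construction above):

# Illustrative only, not part of the commit.
batch = next(iter(dataset))
print(batch['sequence_length'], len(batch['sample_lens']))
if 'padded_images' in batch:    # VAE inputs were packed
    print(batch['padded_images'].shape)
if 'ce_loss_indexes' in batch:  # text-loss positions were packed
    print(batch['packed_label_ids'].shape)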
+    def __iter__(self):
+        total_weights = sum(self.grouped_weights)
+        assert total_weights > 0.0
+        group_cumprobs = [sum(self.grouped_weights[:i + 1]) / total_weights
+                          for i in range(len(self.grouped_weights))]
+        sequence_status = self.set_sequence_status()
+        batch_data_indexes = []
+
+        buffer = []
+        while True:
+            # Ensure at least one sample from each group
+            if sequence_status['curr'] == 0:
+                for group_index, group_iter in enumerate(self.dataset_iters):
+                    if self.is_mandatory[group_index]:
+                        while True:
+                            sample = next(group_iter)
+                            # if a sample is too long, skip it
+                            num_tokens = sample['num_tokens'] + 2 * len(sample['sequence_plan'])
+                            if num_tokens < self.max_num_tokens_per_sample:
+                                sequence_status = self.pack_sequence(sample, sequence_status)
+                                batch_data_indexes.append(sample['data_indexes'])
+                                break
+                            else:
+                                print(f"skip a sample with length {num_tokens}")
+                                continue
+
+            if sequence_status['curr'] < self.prefer_buffer_before and len(buffer) > 0:
+                sample = buffer.pop(0)
+                sample_from_buffer = True
+            else:
+                # sample normally across all groups
+                n = random.random()
+                group_index = 0
+                for i, cumprob in enumerate(group_cumprobs):
+                    if n < cumprob:
+                        group_index = i
+                        break
+                sample = next(self.dataset_iters[group_index])
+                sample_from_buffer = False
+
+            # if a sample is too long, skip it
+            num_tokens = sample['num_tokens'] + 2 * len(sample['sequence_plan'])
+            if num_tokens > self.max_num_tokens_per_sample:
+                print(f"skip a sample with length {num_tokens}")
+                continue
+
+            if sequence_status['curr'] + num_tokens > self.max_num_tokens:
+                if len(buffer) < self.max_buffer_size and not sample_from_buffer:
+                    buffer.append(sample)
+                else:
+                    print(f"Yielding data with length {sum(sequence_status['sample_lens'])}")
+                    data = self.to_tensor(sequence_status)
+                    data['batch_data_indexes'] = batch_data_indexes
+                    yield data
+                    sequence_status = self.set_sequence_status()
+                    batch_data_indexes = []
+                continue
+
+            sequence_status = self.pack_sequence(sample, sequence_status)
+            batch_data_indexes.append(sample['data_indexes'])
+
+            if sequence_status['curr'] >= self.expected_num_tokens:
+                data = self.to_tensor(sequence_status)
+                data['batch_data_indexes'] = batch_data_indexes
+                yield data
+                sequence_status = self.set_sequence_status()
+                batch_data_indexes = []
+
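Group selection is a cumulative-probability draw over the configured weights; e.g. weights [1, 1, 2] give cut points [0.25, 0.5, 1.0], so the last group is picked about half the time (buffered samples take priority while the packed length is below prefer_buffer_before). The same logic in isolation:

# Illustrative only, not part of the commit.
weights = [1, 1, 2]
total = sum(weights)
cumprobs = [sum(weights[:i + 1]) / total for i in range(len(weights))]
print(cumprobs)  # [0.25, 0.5, 1.0]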
+    def pack_sequence(self, sample, sequence_status):
+        image_tensor_list = sample['image_tensor_list']
+        text_ids_list = sample['text_ids_list']
+        sequence_plan = sample['sequence_plan']
+
+        split_lens, attn_modes = list(), list()
+        curr = sequence_status['curr']
+        curr_rope_id = 0
+        sample_lens = 0
+
+        for item in sequence_plan:
+            split_start = item.get('split_start', True)
+            if split_start:
+                curr_split_len = 0
+
+            if item['type'] == 'text':
+                text_ids = text_ids_list.pop(0)
+                if item['enable_cfg'] == 1 and random.random() < self.data_config.text_cond_dropout_prob:
+                    continue
+
+                shifted_text_ids = [self.bos_token_id] + text_ids
+                sequence_status['packed_text_ids'].extend(shifted_text_ids)
+                sequence_status['packed_text_indexes'].extend(range(curr, curr + len(shifted_text_ids)))
+                if item['loss'] == 1:
+                    sequence_status['ce_loss_indexes'].extend(range(curr, curr + len(shifted_text_ids)))
+                    sequence_status['ce_loss_weights'].extend(
+                        [len2weight(len(shifted_text_ids))] * len(shifted_text_ids)
+                    )
+                    sequence_status['packed_label_ids'].extend(text_ids + [self.eos_token_id])
+                curr += len(shifted_text_ids)
+                curr_split_len += len(shifted_text_ids)
+
+                # add a <|im_end|> token
+                sequence_status['packed_text_ids'].append(self.eos_token_id)
+                sequence_status['packed_text_indexes'].append(curr)
+                if item['special_token_loss'] == 1:  # <|im_end|> may have loss
+                    sequence_status['ce_loss_indexes'].append(curr)
+                    sequence_status['ce_loss_weights'].append(1.0)
+                    sequence_status['packed_label_ids'].append(item['special_token_label'])
+                curr += 1
+                curr_split_len += 1
+
+                # update sequence status
+                attn_modes.append("causal")
+                sequence_status['packed_position_ids'].extend(range(curr_rope_id, curr_rope_id + curr_split_len))
+                curr_rope_id += curr_split_len
+
+            elif item['type'] == 'vit_image':
+                image_tensor = image_tensor_list.pop(0)
+                if item['enable_cfg'] == 1 and random.random() < self.data_config.vit_cond_dropout_prob:
+                    curr_rope_id += 1
+                    continue
+
+                # add a <|startofimage|> token
+                sequence_status['packed_text_ids'].append(self.start_of_image)
+                sequence_status['packed_text_indexes'].append(curr)
+                curr += 1
+                curr_split_len += 1
+
+                # preprocess image
+                vit_tokens = patchify(image_tensor, self.data_config.vit_patch_size)
+                num_img_tokens = vit_tokens.shape[0]
+                sequence_status['packed_vit_token_indexes'].extend(range(curr, curr + num_img_tokens))
+                curr += num_img_tokens
+                curr_split_len += num_img_tokens
+
+                sequence_status['packed_vit_tokens'].append(vit_tokens)
+                sequence_status['vit_token_seqlens'].append(num_img_tokens)
+                sequence_status['packed_vit_position_ids'].append(
+                    self.get_flattened_position_ids(
+                        image_tensor.size(1), image_tensor.size(2),
+                        self.data_config.vit_patch_size,
+                        max_num_patches_per_side=self.data_config.max_num_patch_per_side
+                    )
+                )
+
+                # add a <|endofimage|> token
+                sequence_status['packed_text_ids'].append(self.end_of_image)
+                sequence_status['packed_text_indexes'].append(curr)
+                if item['special_token_loss'] == 1:  # <|endofimage|> may have loss
+                    sequence_status['ce_loss_indexes'].append(curr)
+                    sequence_status['ce_loss_weights'].append(1.0)
+                    sequence_status['packed_label_ids'].append(item['special_token_label'])
+                curr += 1
+                curr_split_len += 1
+
+                # update sequence status
+                attn_modes.append("full")
+                sequence_status['packed_position_ids'].extend([curr_rope_id] * curr_split_len)
         | 
| 395 | 
            +
                            curr_rope_id += 1
         | 
| 396 | 
            +
             | 
| 397 | 
            +
                        elif item['type'] == 'vae_image':
         | 
| 398 | 
            +
                            image_tensor = image_tensor_list.pop(0)
         | 
| 399 | 
            +
                            if item['enable_cfg'] == 1 and random.random() < self.data_config.vae_cond_dropout_prob:
         | 
| 400 | 
            +
                                # FIXME fix vae dropout in video2video setting.
         | 
| 401 | 
            +
                                curr_rope_id += 1
         | 
| 402 | 
            +
                                continue
         | 
| 403 | 
            +
             | 
| 404 | 
            +
                            # add a <|startofimage|> token
         | 
| 405 | 
            +
                            sequence_status['packed_text_ids'].append(self.start_of_image)
         | 
| 406 | 
            +
                            sequence_status['packed_text_indexes'].append(curr)
         | 
| 407 | 
            +
                            curr += 1
         | 
| 408 | 
            +
                            curr_split_len += 1
         | 
| 409 | 
            +
             | 
| 410 | 
            +
                            # preprocess image
         | 
| 411 | 
            +
                            sequence_status['vae_image_tensors'].append(image_tensor)
         | 
| 412 | 
            +
                            sequence_status['packed_latent_position_ids'].append(
         | 
| 413 | 
            +
                                self.get_flattened_position_ids(
         | 
| 414 | 
            +
                                    image_tensor.size(1), image_tensor.size(2),
         | 
| 415 | 
            +
                                    self.data_config.vae_image_downsample, 
         | 
| 416 | 
            +
                                    max_num_patches_per_side=self.data_config.max_latent_size
         | 
| 417 | 
            +
                                )
         | 
| 418 | 
            +
                            )
         | 
| 419 | 
            +
                            H, W = image_tensor.shape[1:]
         | 
| 420 | 
            +
                            h = H // self.data_config.vae_image_downsample
         | 
| 421 | 
            +
                            w = W // self.data_config.vae_image_downsample
         | 
| 422 | 
            +
                            sequence_status['vae_latent_shapes'].append((h, w))
         | 
| 423 | 
            +
             | 
| 424 | 
            +
                            num_img_tokens = w * h
         | 
| 425 | 
            +
                            sequence_status['packed_vae_token_indexes'].extend(range(curr, curr + num_img_tokens))
         | 
| 426 | 
            +
                            if item['loss'] == 1:
         | 
| 427 | 
            +
                                sequence_status['mse_loss_indexes'].extend(range(curr, curr + num_img_tokens))
         | 
| 428 | 
            +
                                if split_start:
         | 
| 429 | 
            +
                                    timestep = np.random.randn()
         | 
| 430 | 
            +
                            else:
         | 
| 431 | 
            +
                                timestep = float('-inf')
         | 
| 432 | 
            +
             | 
| 433 | 
            +
                            sequence_status['packed_timesteps'].extend([timestep] * num_img_tokens)
         | 
| 434 | 
            +
                            curr += num_img_tokens
         | 
| 435 | 
            +
                            curr_split_len += num_img_tokens
         | 
| 436 | 
            +
             | 
| 437 | 
            +
                            # add a <|endofimage|> token
         | 
| 438 | 
            +
                            sequence_status['packed_text_ids'].append(self.end_of_image)
         | 
| 439 | 
            +
                            sequence_status['packed_text_indexes'].append(curr)
         | 
| 440 | 
            +
                            # <|endofimage|> may have loss
         | 
| 441 | 
            +
                            if item['special_token_loss'] == 1:
         | 
| 442 | 
            +
                                sequence_status['ce_loss_indexes'].append(curr)
         | 
| 443 | 
            +
                                sequence_status['ce_loss_weights'].append(1.0)
         | 
| 444 | 
            +
                                sequence_status['packed_label_ids'].append(item['special_token_label'])
         | 
| 445 | 
            +
                            curr += 1
         | 
| 446 | 
            +
                            curr_split_len += 1
         | 
| 447 | 
            +
             | 
| 448 | 
            +
                            # update sequence status
         | 
| 449 | 
            +
                            if split_start:
         | 
| 450 | 
            +
                                if item['loss'] == 1 and 'frame_delta' not in item.keys():
         | 
| 451 | 
            +
                                    attn_modes.append("noise")
         | 
| 452 | 
            +
                                else:
         | 
| 453 | 
            +
                                    attn_modes.append("full")
         | 
| 454 | 
            +
                            sequence_status['packed_position_ids'].extend([curr_rope_id] * (num_img_tokens + 2))
         | 
| 455 | 
            +
                            if 'frame_delta' in item.keys():
         | 
| 456 | 
            +
                                curr_rope_id += item['frame_delta']
         | 
| 457 | 
            +
                            elif item['loss'] == 0:
         | 
| 458 | 
            +
                                curr_rope_id += 1
         | 
| 459 | 
            +
             | 
| 460 | 
            +
                        if item.get('split_end', True):
         | 
| 461 | 
            +
                            split_lens.append(curr_split_len)
         | 
| 462 | 
            +
                            sample_lens += curr_split_len
         | 
| 463 | 
            +
             | 
| 464 | 
            +
                    sequence_status['curr'] = curr
         | 
| 465 | 
            +
                    sequence_status['sample_lens'].append(sample_lens)
         | 
| 466 | 
            +
                    # prepare attention mask
         | 
| 467 | 
            +
                    if not self.use_flex:
         | 
| 468 | 
            +
                        sequence_status['nested_attention_masks'].append(
         | 
| 469 | 
            +
                            prepare_attention_mask_per_sample(split_lens, attn_modes)
         | 
| 470 | 
            +
                        )
         | 
| 471 | 
            +
                    else:
         | 
| 472 | 
            +
                        sequence_status['split_lens'].extend(split_lens)
         | 
| 473 | 
            +
                        sequence_status['attn_modes'].extend(attn_modes)
         | 
| 474 | 
            +
             | 
| 475 | 
            +
                    return sequence_status
         | 
| 476 | 
            +
             | 
| 477 | 
            +
             | 
| 478 | 
            +
            class SimpleCustomBatch:
         | 
| 479 | 
            +
                def __init__(self, batch):
         | 
| 480 | 
            +
                    data = batch[0]
         | 
| 481 | 
            +
                    self.batch_data_indexes = data['batch_data_indexes']
         | 
| 482 | 
            +
                    self.sequence_length = data["sequence_length"]
         | 
| 483 | 
            +
                    self.sample_lens = data["sample_lens"]
         | 
| 484 | 
            +
                    self.packed_text_ids = data["packed_text_ids"]
         | 
| 485 | 
            +
                    self.packed_text_indexes = data["packed_text_indexes"]
         | 
| 486 | 
            +
                    self.packed_position_ids = data["packed_position_ids"]
         | 
| 487 | 
            +
             | 
| 488 | 
            +
                    self.use_flex = "nested_attention_masks" not in data.keys()
         | 
| 489 | 
            +
             | 
| 490 | 
            +
                    if self.use_flex:
         | 
| 491 | 
            +
                        self.split_lens = data["split_lens"]
         | 
| 492 | 
            +
                        self.attn_modes = data["attn_modes"]
         | 
| 493 | 
            +
                    else:
         | 
| 494 | 
            +
                        self.nested_attention_masks = data["nested_attention_masks"]
         | 
| 495 | 
            +
             | 
| 496 | 
            +
                    if "padded_images" in data.keys():
         | 
| 497 | 
            +
                        self.padded_images = data["padded_images"]
         | 
| 498 | 
            +
                        self.patchified_vae_latent_shapes = data["patchified_vae_latent_shapes"]
         | 
| 499 | 
            +
                        self.packed_latent_position_ids = data["packed_latent_position_ids"]
         | 
| 500 | 
            +
                        self.packed_vae_token_indexes = data["packed_vae_token_indexes"]
         | 
| 501 | 
            +
             | 
| 502 | 
            +
                    if "packed_vit_tokens" in data.keys():
         | 
| 503 | 
            +
                        self.packed_vit_tokens = data["packed_vit_tokens"]
         | 
| 504 | 
            +
                        self.packed_vit_position_ids = data["packed_vit_position_ids"]
         | 
| 505 | 
            +
                        self.packed_vit_token_indexes = data["packed_vit_token_indexes"]
         | 
| 506 | 
            +
                        self.vit_token_seqlens = data["vit_token_seqlens"]
         | 
| 507 | 
            +
             | 
| 508 | 
            +
                    if "packed_timesteps" in data.keys():
         | 
| 509 | 
            +
                        self.packed_timesteps = data["packed_timesteps"]
         | 
| 510 | 
            +
                        self.mse_loss_indexes = data["mse_loss_indexes"]
         | 
| 511 | 
            +
             | 
| 512 | 
            +
                    if "packed_label_ids" in data.keys():
         | 
| 513 | 
            +
                        self.packed_label_ids = data["packed_label_ids"]
         | 
| 514 | 
            +
                        self.ce_loss_indexes = data["ce_loss_indexes"]
         | 
| 515 | 
            +
                        self.ce_loss_weights = data["ce_loss_weights"]
         | 
| 516 | 
            +
             | 
| 517 | 
            +
                def pin_memory(self):
         | 
| 518 | 
            +
                    self.packed_text_ids = self.packed_text_ids.pin_memory()
         | 
| 519 | 
            +
                    self.packed_text_indexes = self.packed_text_indexes.pin_memory()
         | 
| 520 | 
            +
                    self.packed_position_ids = self.packed_position_ids.pin_memory()
         | 
| 521 | 
            +
             | 
| 522 | 
            +
                    if not self.use_flex:
         | 
| 523 | 
            +
                        self.nested_attention_masks = [item.pin_memory() for item in self.nested_attention_masks]
         | 
| 524 | 
            +
             | 
| 525 | 
            +
                    if hasattr(self, 'padded_images'):
         | 
| 526 | 
            +
                        self.padded_images = self.padded_images.pin_memory()
         | 
| 527 | 
            +
                        self.packed_vae_token_indexes = self.packed_vae_token_indexes.pin_memory()
         | 
| 528 | 
            +
                        self.packed_latent_position_ids = self.packed_latent_position_ids.pin_memory()
         | 
| 529 | 
            +
             | 
| 530 | 
            +
                    if hasattr(self, 'packed_timesteps'):
         | 
| 531 | 
            +
                        self.packed_timesteps = self.packed_timesteps.pin_memory()
         | 
| 532 | 
            +
                        self.mse_loss_indexes = self.mse_loss_indexes.pin_memory()
         | 
| 533 | 
            +
             | 
| 534 | 
            +
                    if hasattr(self, 'packed_vit_tokens'):
         | 
| 535 | 
            +
                        self.packed_vit_tokens = self.packed_vit_tokens.pin_memory()
         | 
| 536 | 
            +
                        self.packed_vit_position_ids = self.packed_vit_position_ids.pin_memory()
         | 
| 537 | 
            +
                        self.packed_vit_token_indexes = self.packed_vit_token_indexes.pin_memory()
         | 
| 538 | 
            +
                        self.vit_token_seqlens = self.vit_token_seqlens.pin_memory()
         | 
| 539 | 
            +
             | 
| 540 | 
            +
                    if hasattr(self, 'packed_label_ids'):
         | 
| 541 | 
            +
                        self.packed_label_ids = self.packed_label_ids.pin_memory()
         | 
| 542 | 
            +
                        self.ce_loss_indexes = self.ce_loss_indexes.pin_memory()
         | 
| 543 | 
            +
                        self.ce_loss_weights = self.ce_loss_weights.pin_memory()
         | 
| 544 | 
            +
             | 
| 545 | 
            +
                    return self
         | 
| 546 | 
            +
             | 
| 547 | 
            +
                def cuda(self, device):
         | 
| 548 | 
            +
                    self.packed_text_ids = self.packed_text_ids.to(device)
         | 
| 549 | 
            +
                    self.packed_text_indexes = self.packed_text_indexes.to(device)
         | 
| 550 | 
            +
                    self.packed_position_ids = self.packed_position_ids.to(device)
         | 
| 551 | 
            +
             | 
| 552 | 
            +
                    if not self.use_flex:
         | 
| 553 | 
            +
                        self.nested_attention_masks = [item.to(device) for item in self.nested_attention_masks]
         | 
| 554 | 
            +
             | 
| 555 | 
            +
                    if hasattr(self, 'padded_images'):
         | 
| 556 | 
            +
                        self.padded_images = self.padded_images.to(device)
         | 
| 557 | 
            +
                        self.packed_vae_token_indexes = self.packed_vae_token_indexes.to(device)
         | 
| 558 | 
            +
                        self.packed_latent_position_ids = self.packed_latent_position_ids.to(device)
         | 
| 559 | 
            +
             | 
| 560 | 
            +
                    if hasattr(self, 'packed_timesteps'):
         | 
| 561 | 
            +
                        self.packed_timesteps = self.packed_timesteps.to(device)
         | 
| 562 | 
            +
                        self.mse_loss_indexes = self.mse_loss_indexes.to(device)
         | 
| 563 | 
            +
             | 
| 564 | 
            +
                    if hasattr(self, 'packed_vit_tokens'):
         | 
| 565 | 
            +
                        self.packed_vit_tokens = self.packed_vit_tokens.to(device)
         | 
| 566 | 
            +
                        self.packed_vit_position_ids = self.packed_vit_position_ids.to(device)
         | 
| 567 | 
            +
                        self.packed_vit_token_indexes = self.packed_vit_token_indexes.to(device)
         | 
| 568 | 
            +
                        self.vit_token_seqlens = self.vit_token_seqlens.to(device)
         | 
| 569 | 
            +
             | 
| 570 | 
            +
                    if hasattr(self, 'packed_label_ids'):
         | 
| 571 | 
            +
                        self.packed_label_ids = self.packed_label_ids.to(device)
         | 
| 572 | 
            +
                        self.ce_loss_indexes = self.ce_loss_indexes.to(device)
         | 
| 573 | 
            +
                        self.ce_loss_weights = self.ce_loss_weights.to(device)
         | 
| 574 | 
            +
             | 
| 575 | 
            +
                    return self
         | 
| 576 | 
            +
             | 
| 577 | 
            +
                def to_dict(self):
         | 
| 578 | 
            +
                    data = dict(
         | 
| 579 | 
            +
                        sequence_length = self.sequence_length,
         | 
| 580 | 
            +
                        sample_lens = self.sample_lens,
         | 
| 581 | 
            +
                        packed_text_ids = self.packed_text_ids,
         | 
| 582 | 
            +
                        packed_text_indexes = self.packed_text_indexes,
         | 
| 583 | 
            +
                        packed_position_ids = self.packed_position_ids,
         | 
| 584 | 
            +
                        batch_data_indexes = self.batch_data_indexes,
         | 
| 585 | 
            +
                    )
         | 
| 586 | 
            +
             | 
| 587 | 
            +
                    if not self.use_flex:
         | 
| 588 | 
            +
                        data['nested_attention_masks'] = self.nested_attention_masks
         | 
| 589 | 
            +
                    else:
         | 
| 590 | 
            +
                        data['split_lens'] = self.split_lens
         | 
| 591 | 
            +
                        data['attn_modes'] = self.attn_modes
         | 
| 592 | 
            +
             | 
| 593 | 
            +
                    if hasattr(self, 'padded_images'):
         | 
| 594 | 
            +
                        data['padded_images'] = self.padded_images
         | 
| 595 | 
            +
                        data['patchified_vae_latent_shapes'] = self.patchified_vae_latent_shapes
         | 
| 596 | 
            +
                        data['packed_latent_position_ids'] = self.packed_latent_position_ids
         | 
| 597 | 
            +
                        data['packed_vae_token_indexes'] = self.packed_vae_token_indexes
         | 
| 598 | 
            +
             | 
| 599 | 
            +
                    if hasattr(self, 'packed_vit_tokens'):
         | 
| 600 | 
            +
                        data['packed_vit_tokens'] = self.packed_vit_tokens
         | 
| 601 | 
            +
                        data['packed_vit_position_ids'] = self.packed_vit_position_ids
         | 
| 602 | 
            +
                        data['packed_vit_token_indexes'] = self.packed_vit_token_indexes
         | 
| 603 | 
            +
                        data['vit_token_seqlens'] = self.vit_token_seqlens
         | 
| 604 | 
            +
             | 
| 605 | 
            +
                    if hasattr(self, 'packed_timesteps'):
         | 
| 606 | 
            +
                        data['packed_timesteps'] = self.packed_timesteps
         | 
| 607 | 
            +
                        data['mse_loss_indexes'] = self.mse_loss_indexes
         | 
| 608 | 
            +
             | 
| 609 | 
            +
                    if hasattr(self, 'packed_label_ids'):
         | 
| 610 | 
            +
                        data['packed_label_ids'] = self.packed_label_ids
         | 
| 611 | 
            +
                        data['ce_loss_indexes'] = self.ce_loss_indexes
         | 
| 612 | 
            +
                        data['ce_loss_weights'] = self.ce_loss_weights
         | 
| 613 | 
            +
             | 
| 614 | 
            +
                    return data
         | 
| 615 | 
            +
             | 
| 616 | 
            +
             | 
| 617 | 
            +
            def collate_wrapper():
         | 
| 618 | 
            +
                def collate_fn(batch):
         | 
| 619 | 
            +
                    return SimpleCustomBatch(batch)
         | 
| 620 | 
            +
                return collate_fn
         | 
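
Usage note (not part of the uploaded files): SimpleCustomBatch follows PyTorch's documented custom-batch pattern, so a minimal sketch of how it plugs into a DataLoader looks like the following. The packed_dataset variable and the device string are placeholders; batch_size stays at 1 because each element is already a fully packed sequence.

# Illustrative sketch only; packed_dataset is an assumed iterable dataset
# that yields one packed dict per step.
from torch.utils.data import DataLoader

loader = DataLoader(
    packed_dataset,
    batch_size=1,                  # each element is already a packed sequence
    collate_fn=collate_wrapper(),  # wraps the dict into a SimpleCustomBatch
    num_workers=8,
    pin_memory=True,               # triggers SimpleCustomBatch.pin_memory() in the loader
)

for batch in loader:
    batch = batch.cuda('cuda:0')    # move all packed tensors to the GPU
    model_inputs = batch.to_dict()  # plain dict view consumed by the model forward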
    	
data/dataset_info.py
ADDED
@@ -0,0 +1,39 @@
# Copyright 2025 Bytedance Ltd. and/or its affiliates.
# SPDX-License-Identifier: Apache-2.0

from .interleave_datasets import UnifiedEditIterableDataset
from .t2i_dataset import T2IIterableDataset
from .vlm_dataset import SftJSONLIterableDataset


DATASET_REGISTRY = {
    't2i_pretrain': T2IIterableDataset,
    'vlm_sft': SftJSONLIterableDataset,
    'unified_edit': UnifiedEditIterableDataset,
}


DATASET_INFO = {
    't2i_pretrain': {
        't2i': {
            'data_dir': 'your_data_path/bagel_example/t2i',  # path of the parquet files
            'num_files': 10,  # number of data units to be sharded across all ranks and workers
            'num_total_samples': 1000,  # number of total samples in the dataset
        },
    },
    'unified_edit': {
        'seedxedit_multi': {
            'data_dir': 'your_data_path/bagel_example/editing/seedxedit_multi',
            'num_files': 10,
            'num_total_samples': 1000,
            'parquet_info_path': 'your_data_path/bagel_example/editing/parquet_info/seedxedit_multi_nas.json',  # information of the parquet files
        },
    },
    'vlm_sft': {
        'llava_ov': {
            'data_dir': 'your_data_path/bagel_example/vlm/images',
            'jsonl_path': 'your_data_path/bagel_example/vlm/llava_ov_si.jsonl',
            'num_total_samples': 1000,
        },
    },
}
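
A quick sketch (illustrative, not in the upload) of how a new dataset would be wired in: add a metadata entry to DATASET_INFO under the matching group key, and resolve the dataset class through DATASET_REGISTRY. The 't2i_extra' name and its paths below are placeholders.

# Hypothetical registration; keys mirror the existing 't2i' entry.
DATASET_INFO['t2i_pretrain']['t2i_extra'] = {
    'data_dir': 'your_data_path/bagel_example/t2i_extra',
    'num_files': 10,
    'num_total_samples': 1000,
}

dataset_cls = DATASET_REGISTRY['t2i_pretrain']  # resolves to T2IIterableDataset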
    	
data/distributed_iterable_dataset.py
ADDED
@@ -0,0 +1,58 @@
# Copyright 2025 Bytedance Ltd. and/or its affiliates.
# SPDX-License-Identifier: Apache-2.0

import random
import torch


class DistributedIterableDataset(torch.utils.data.IterableDataset):
    def __init__(self, dataset_name, local_rank=0, world_size=1, num_workers=8):
        self.dataset_name = dataset_name
        self.local_rank = local_rank
        self.world_size = world_size
        self.num_workers = num_workers
        self.rng = random.Random()
        self.data_paths = None

    def get_data_paths(self, *args, **kwargs):
        raise NotImplementedError

    def set_epoch(self, seed=42):
        if self.data_paths is None:
            return

        if isinstance(self.data_paths[0], tuple):
            data_paths = sorted(self.data_paths, key=lambda x: (x[0], x[1]))
        elif isinstance(self.data_paths[0], str):
            data_paths = sorted(self.data_paths)
        else:
            raise ValueError(f"Unknown data_paths type: {type(self.data_paths[0])}")

        self.rng.seed(seed)
        self.rng.shuffle(data_paths)

        num_files_per_rank = len(data_paths) // self.world_size
        local_start = self.local_rank * num_files_per_rank
        local_end = (self.local_rank + 1) * num_files_per_rank
        self.num_files_per_rank = num_files_per_rank
        self.data_paths_per_rank = data_paths[local_start:local_end]

    def get_data_paths_per_worker(self):
        if self.data_paths is None:
            return None

        info = torch.utils.data.get_worker_info()
        if info is None:
            # Single worker: use all files assigned to the rank
            return self.data_paths_per_rank, 0

        worker_id = info.id
        num_files_per_worker = self.num_files_per_rank // info.num_workers
        start = num_files_per_worker * worker_id
        end = num_files_per_worker * (worker_id + 1)
        data_paths_per_worker = self.data_paths_per_rank[start:end]

        return data_paths_per_worker[::-1], worker_id

    def __iter__(self):
        raise NotImplementedError
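
To make the contract above concrete, here is a minimal hypothetical subclass (not part of the upload): it supplies data_paths via get_data_paths, calls set_epoch once paths exist, and consumes get_data_paths_per_worker inside __iter__, looping forever the way the other iterable datasets in this upload do.

# Assumed example; ToyJsonlIterableDataset and its paths are placeholders.
import json


class ToyJsonlIterableDataset(DistributedIterableDataset):
    def __init__(self, dataset_name, jsonl_paths, local_rank=0, world_size=1, num_workers=8):
        super().__init__(dataset_name, local_rank, world_size, num_workers)
        self.data_paths = self.get_data_paths(jsonl_paths)
        self.set_epoch()

    def get_data_paths(self, jsonl_paths):
        return list(jsonl_paths)  # one shard per .jsonl file

    def __iter__(self):
        paths_per_worker, worker_id = self.get_data_paths_per_worker()
        while True:  # repeat indefinitely; the trainer decides when to stop
            for path in paths_per_worker:
                with open(path) as f:
                    for line in f:
                        yield json.loads(line)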
    	
data/interleave_datasets/__init__.py
ADDED
@@ -0,0 +1,5 @@
# Copyright 2025 Bytedance Ltd. and/or its affiliates.
# SPDX-License-Identifier: Apache-2.0

from .edit_dataset import UnifiedEditIterableDataset
    	
data/interleave_datasets/edit_dataset.py
ADDED
@@ -0,0 +1,72 @@
|  | |
| 1 | 
            +
            # Copyright 2025 Bytedance Ltd. and/or its affiliates.
         | 
| 2 | 
            +
            # SPDX-License-Identifier: Apache-2.0
         | 
| 3 | 
            +
             | 
| 4 | 
            +
            import io
         | 
| 5 | 
            +
            import random
         | 
| 6 | 
            +
            from PIL import Image, ImageFile, PngImagePlugin
         | 
| 7 | 
            +
             | 
| 8 | 
            +
            from .interleave_t2i_dataset import InterleavedBaseIterableDataset, ParquetStandardIterableDataset
         | 
| 9 | 
            +
            from ..data_utils import pil_img2rgb
         | 
| 10 | 
            +
             | 
| 11 | 
            +
             | 
| 12 | 
            +
            Image.MAX_IMAGE_PIXELS = 200000000
         | 
| 13 | 
            +
            ImageFile.LOAD_TRUNCATED_IMAGES = True
         | 
| 14 | 
            +
            MaximumDecompressedSize = 1024
         | 
| 15 | 
            +
            MegaByte = 2 ** 20
         | 
| 16 | 
            +
            PngImagePlugin.MAX_TEXT_CHUNK = MaximumDecompressedSize * MegaByte
         | 
| 17 | 
            +
             | 
| 18 | 
            +
             | 
| 19 | 
            +
            class UnifiedEditIterableDataset(InterleavedBaseIterableDataset, ParquetStandardIterableDataset):
         | 
| 20 | 
            +
             | 
| 21 | 
            +
                def parse_row(self, row):
         | 
| 22 | 
            +
                    image_num = len(row["image_list"])
         | 
| 23 | 
            +
                    # randomly choose start and end, return [0, 1] when only two images
         | 
| 24 | 
            +
                    start_idx = random.choice(range(image_num - 1))
         | 
| 25 | 
            +
                    max_end = min(start_idx + 3, image_num)
         | 
| 26 | 
            +
                    end_idx = random.choice(range(start_idx + 1, max_end))
         | 
| 27 | 
            +
             | 
| 28 | 
            +
                    data = self._init_data()
         | 
| 29 | 
            +
                    data = self._add_image(
         | 
| 30 | 
            +
                        data, 
         | 
| 31 | 
            +
                        pil_img2rgb(Image.open(io.BytesIO(row["image_list"][start_idx]))),
         | 
| 32 | 
            +
                        need_loss=False, 
         | 
| 33 | 
            +
                        need_vae=True, 
         | 
| 34 | 
            +
                        need_vit=True, 
         | 
| 35 | 
            +
                    )
         | 
| 36 | 
            +
             | 
| 37 | 
            +
                    if end_idx - start_idx > 1 and random.random() < 0.5: # concat multiple insturction
         | 
| 38 | 
            +
                        if end_idx == image_num - 1:
         | 
| 39 | 
            +
                            end_idx -= 1
         | 
| 40 | 
            +
             | 
| 41 | 
            +
                        instruction = ""
         | 
| 42 | 
            +
                        for idx in range(start_idx + 1, end_idx + 1):
         | 
| 43 | 
            +
                            instruction += random.choice(row["instruction_list"][idx-1]) + ". "
         | 
| 44 | 
            +
                        data = self._add_text(data, instruction.rstrip(), need_loss=False)
         | 
| 45 | 
            +
                        data = self._add_image(
         | 
| 46 | 
            +
                            data, 
         | 
| 47 | 
            +
                            pil_img2rgb(Image.open(io.BytesIO(row["image_list"][end_idx]))),
         | 
| 48 | 
            +
                            need_loss=True, 
         | 
| 49 | 
            +
                            need_vae=False, 
         | 
| 50 | 
            +
                            need_vit=False,
         | 
| 51 | 
            +
                        )
         | 
| 52 | 
            +
                    else:
         | 
| 53 | 
            +
                        for idx in range(start_idx + 1, end_idx + 1):
         | 
| 54 | 
            +
                            instruction = random.choice(row["instruction_list"][idx-1])
         | 
| 55 | 
            +
                            data = self._add_text(data, instruction, need_loss=False)
         | 
| 56 | 
            +
                            if idx != end_idx:
         | 
| 57 | 
            +
                                data = self._add_image(
         | 
| 58 | 
            +
                                    data, 
         | 
| 59 | 
            +
                                    pil_img2rgb(Image.open(io.BytesIO(row["image_list"][idx]))),
         | 
| 60 | 
            +
                                    need_loss=True, 
         | 
| 61 | 
            +
                                    need_vae=True, 
         | 
| 62 | 
            +
                                    need_vit=True,
         | 
| 63 | 
            +
                                )
         | 
| 64 | 
            +
                            else:
         | 
| 65 | 
            +
                                data = self._add_image(
         | 
| 66 | 
            +
                                    data, 
         | 
| 67 | 
            +
                                    pil_img2rgb(Image.open(io.BytesIO(row["image_list"][idx]))),
         | 
| 68 | 
            +
                                    need_loss=True, 
         | 
| 69 | 
            +
                                    need_vae=False, 
         | 
| 70 | 
            +
                                    need_vit=False,
         | 
| 71 | 
            +
                                )
         | 
| 72 | 
            +
                    return data
         | 
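
For intuition (not from the upload): the window sampled at the top of parse_row spans at most two edit steps, so every training sample conditions on one image and supervises one or two subsequent edits. A tiny self-check under an assumed image_num illustrates the bounds:

# Illustrative check of the index-sampling window; image_num is a placeholder.
import random

image_num = 4
start_idx = random.choice(range(image_num - 1))                             # 0..2
end_idx = random.choice(range(start_idx + 1, min(start_idx + 3, image_num)))
assert 1 <= end_idx - start_idx <= 2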
    	
data/interleave_datasets/interleave_t2i_dataset.py
ADDED
@@ -0,0 +1,212 @@
| 1 | 
            +
            # Copyright 2025 Bytedance Ltd. and/or its affiliates.
         | 
| 2 | 
            +
            # SPDX-License-Identifier: Apache-2.0
         | 
| 3 | 
            +
             | 
| 4 | 
            +
            import pyarrow.parquet as pq
         | 
| 5 | 
            +
             | 
| 6 | 
            +
            from ..distributed_iterable_dataset import DistributedIterableDataset
         | 
| 7 | 
            +
            from ..parquet_utils import get_parquet_data_paths, init_arrow_pf_fs
         | 
| 8 | 
            +
             | 
| 9 | 
            +
             | 
| 10 | 
            +
            class InterleavedBaseIterableDataset(DistributedIterableDataset):
         | 
| 11 | 
            +
             | 
| 12 | 
            +
                def _init_data(self):
         | 
| 13 | 
            +
                    data = {
         | 
| 14 | 
            +
                        'sequence_plan': [],
         | 
| 15 | 
            +
                        'text_ids_list': [],
         | 
| 16 | 
            +
                        'image_tensor_list': [],
         | 
| 17 | 
            +
                        'num_tokens': 0,
         | 
| 18 | 
            +
                    }
         | 
| 19 | 
            +
                    return data
         | 
| 20 | 
            +
             | 
| 21 | 
            +
                def _add_text(self, data, text, need_loss, enable_cfg=True):
         | 
| 22 | 
            +
                    text_ids = self.tokenizer.encode(text)
         | 
| 23 | 
            +
                    data['num_tokens'] += len(text_ids)
         | 
| 24 | 
            +
                    data['text_ids_list'].append(text_ids)
         | 
| 25 | 
            +
                    data['sequence_plan'].append(
         | 
| 26 | 
            +
                        {
         | 
| 27 | 
            +
                            'type': 'text',
         | 
| 28 | 
            +
                            'enable_cfg': int(enable_cfg),
         | 
| 29 | 
            +
                            'loss': int(need_loss),
         | 
| 30 | 
            +
                            'special_token_loss': 0,
         | 
| 31 | 
            +
                            'special_token_label': None,
         | 
| 32 | 
            +
                        }
         | 
| 33 | 
            +
                    )
         | 
| 34 | 
            +
                    return data
         | 
| 35 | 
            +
             | 
| 36 | 
            +
                def _add_image(self, data, image, need_loss, need_vae, need_vit, enable_cfg=True):
         | 
| 37 | 
            +
                    assert need_loss or need_vae or need_vit
         | 
| 38 | 
            +
             | 
| 39 | 
            +
                    if need_loss:
         | 
| 40 | 
            +
                        data['sequence_plan'].append(
         | 
| 41 | 
            +
                            {
         | 
| 42 | 
            +
                                'type': 'vae_image', 
         | 
| 43 | 
            +
                                'enable_cfg': 0, 
         | 
| 44 | 
            +
                                'loss': 1, 
         | 
| 45 | 
            +
                                'special_token_loss': 0,
         | 
| 46 | 
            +
                                'special_token_label': None,
         | 
| 47 | 
            +
                            }
         | 
| 48 | 
            +
                        )
         | 
| 49 | 
            +
             | 
| 50 | 
            +
                        image_tensor = self.transform(image)
         | 
| 51 | 
            +
                        height, width = image_tensor.shape[1:]
         | 
| 52 | 
            +
                        data['num_tokens'] += width * height // self.transform.stride ** 2
         | 
| 53 | 
            +
                        data['image_tensor_list'].append(image_tensor)
         | 
| 54 | 
            +
             | 
| 55 | 
            +
                    if need_vae:
         | 
| 56 | 
            +
                        data['sequence_plan'].append(
         | 
| 57 | 
            +
                            {
         | 
| 58 | 
            +
                                'type': 'vae_image', 
         | 
| 59 | 
            +
                                'enable_cfg': int(enable_cfg), 
         | 
| 60 | 
            +
                                'loss': 0, 
         | 
| 61 | 
            +
                                'special_token_loss': 0,
         | 
| 62 | 
            +
                                'special_token_label': None,
         | 
| 63 | 
            +
                            }
         | 
| 64 | 
            +
                        )
         | 
| 65 | 
            +
             | 
| 66 | 
            +
                        image_tensor = self.transform(image)
         | 
| 67 | 
            +
                        height, width = image_tensor.shape[1:]
         | 
| 68 | 
            +
                        data['num_tokens'] += width * height // self.transform.stride ** 2
         | 
| 69 | 
            +
                        data['image_tensor_list'].append(image_tensor.clone())
         | 
| 70 | 
            +
             | 
| 71 | 
            +
                    if need_vit:
         | 
| 72 | 
            +
                        data['sequence_plan'].append(
         | 
| 73 | 
            +
                            {
         | 
| 74 | 
            +
                                'type': 'vit_image',
         | 
| 75 | 
            +
                                'enable_cfg': int(enable_cfg), 
         | 
| 76 | 
            +
                                'loss': 0,
         | 
| 77 | 
            +
                                'special_token_loss': 0,
         | 
| 78 | 
            +
                                'special_token_label': None,
         | 
| 79 | 
            +
                            },
         | 
| 80 | 
            +
                        )
         | 
| 81 | 
            +
                        vit_image_tensor = self.vit_transform(image)
         | 
| 82 | 
            +
                        height, width = vit_image_tensor.shape[1:]
         | 
| 83 | 
            +
                        data['num_tokens'] += width * height // self.vit_transform.stride ** 2
         | 
| 84 | 
            +
                        data['image_tensor_list'].append(vit_image_tensor)
         | 
| 85 | 
            +
             | 
| 86 | 
            +
                    return data
         | 
| 87 | 
            +
             | 
| 88 | 
            +
                def _add_video(self, data, frames, frame_indexes, need_loss, need_vae, enable_cfg=True):
         | 
| 89 | 
            +
                    assert int(need_loss) + int(need_vae) == 1
         | 
| 90 | 
            +
             | 
| 91 | 
            +
                    if need_loss:
         | 
| 92 | 
            +
                        for idx, (image, frame_idx) in enumerate(zip(frames, frame_indexes)):
         | 
| 93 | 
            +
                            current_sequence_plan = {
         | 
| 94 | 
            +
                                'type': 'vae_image', 
         | 
| 95 | 
            +
                                'enable_cfg': 0, 
         | 
| 96 | 
            +
                                'loss': 1, 
         | 
| 97 | 
            +
                                'special_token_loss': 0,
         | 
| 98 | 
            +
                                'special_token_label': None,
         | 
| 99 | 
            +
                                'split_start': idx == 0,
         | 
| 100 | 
            +
                                'split_end': idx == len(frames) - 1,
         | 
| 101 | 
            +
                            }
         | 
| 102 | 
            +
                            if idx < len(frame_indexes) - 1:
         | 
| 103 | 
            +
                                current_sequence_plan['frame_delta'] = frame_indexes[idx + 1] - frame_idx
         | 
| 104 | 
            +
                            data['sequence_plan'].append(current_sequence_plan)
         | 
| 105 | 
            +
                            image_tensor = self.transform(image)
         | 
| 106 | 
            +
                            height, width = image_tensor.shape[1:]
         | 
| 107 | 
            +
                            data['image_tensor_list'].append(image_tensor)
         | 
| 108 | 
            +
                            data['num_tokens'] += width * height // self.transform.stride ** 2
         | 
| 109 | 
            +
             | 
| 110 | 
            +
                    elif need_vae:
         | 
| 111 | 
            +
                        for idx, (image, frame_idx) in enumerate(zip(frames, frame_indexes)):
         | 
| 112 | 
            +
                            current_sequence_plan = {
         | 
| 113 | 
            +
                                'type': 'vae_image', 
         | 
| 114 | 
            +
                                'enable_cfg': int(enable_cfg), 
         | 
| 115 | 
            +
                                'loss': 0, 
         | 
| 116 | 
            +
                                'special_token_loss': 0,
         | 
| 117 | 
            +
                                'special_token_label': None,
         | 
| 118 | 
            +
                                'split_start': idx == 0,
         | 
| 119 | 
            +
                                'split_end': idx == len(frames) - 1,
         | 
| 120 | 
            +
                            }
         | 
| 121 | 
            +
                            if idx < len(frame_indexes) - 1:
         | 
| 122 | 
            +
                                current_sequence_plan['frame_delta'] = frame_indexes[idx + 1] - frame_idx
         | 
| 123 | 
            +
                            data['sequence_plan'].append(current_sequence_plan)
         | 
| 124 | 
            +
                            image_tensor = self.transform(image)
         | 
| 125 | 
            +
                            height, width = image_tensor.shape[1:]
         | 
| 126 | 
            +
                            data['image_tensor_list'].append(image_tensor)
         | 
| 127 | 
            +
                            data['num_tokens'] += width * height // self.transform.stride ** 2
         | 
| 128 | 
            +
             | 
| 129 | 
            +
                    return data
         | 
| 130 | 
            +
             | 
| 131 | 
            +
             | 
| 132 | 
            +
            class ParquetStandardIterableDataset(DistributedIterableDataset):
         | 
| 133 | 
            +
             | 
| 134 | 
            +
                def __init__(
         | 
| 135 | 
            +
                    self, dataset_name, transform, tokenizer, vit_transform, 
         | 
| 136 | 
            +
                    data_dir_list, num_used_data, parquet_info,
         | 
| 137 | 
            +
                    local_rank=0, world_size=1, num_workers=8, data_status=None,
         | 
| 138 | 
            +
                ):
         | 
| 139 | 
            +
                    """
         | 
| 140 | 
            +
                    data_dir_list: list of data directories contains parquet files
         | 
| 141 | 
            +
                    num_used_data: list of number of sampled data paths for each data directory
         | 
| 142 | 
            +
                    vit_transform: input transform for vit model.
         | 
| 143 | 
            +
                    """
         | 
| 144 | 
            +
        super().__init__(dataset_name, local_rank, world_size, num_workers)
        self.transform = transform
        self.vit_transform = vit_transform
        self.tokenizer = tokenizer
        self.data_status = data_status
        self.data_paths = self.get_data_paths(data_dir_list, num_used_data, parquet_info)
        self.set_epoch()

    def get_data_paths(self, data_dir_list, num_used_data, parquet_info):
        row_groups = []
        for data_dir, num_data_path in zip(data_dir_list, num_used_data):
            data_paths = get_parquet_data_paths([data_dir], [num_data_path])
            for data_path in data_paths:
                if data_path in parquet_info:
                    num_row_groups = parquet_info[data_path]['num_row_groups']
                    for rg_idx in range(num_row_groups):
                        row_groups.append((data_path, rg_idx))
        return row_groups

    def parse_row(self, row):
        raise NotImplementedError

    def __iter__(self):
        file_paths_per_worker, worker_id = self.get_data_paths_per_worker()
        if self.data_status is not None:
            global_row_group_start_id = self.data_status[worker_id][0]
            row_start_id = self.data_status[worker_id][1] + 1
        else:
            global_row_group_start_id = 0
            row_start_id = 0

        print(
            f"rank-{self.local_rank} worker-{worker_id} dataset-{self.dataset_name}: "
            f"resuming data at global_rg#{global_row_group_start_id}, row#{row_start_id}"
        )

        while True:
            file_paths_per_worker_ = file_paths_per_worker[global_row_group_start_id:]
            for global_row_group_idx, (parquet_file_path, row_group_id) in enumerate(
                file_paths_per_worker_, start=global_row_group_start_id
            ):
                fs = init_arrow_pf_fs(parquet_file_path)
                with fs.open_input_file(parquet_file_path) as f:
                    try:
                        fr = pq.ParquetFile(f)
                        df = fr.read_row_group(row_group_id).to_pandas()
                        df = df.iloc[row_start_id:]
                    except Exception as e:
                        print(f'Error {e} in rg#{row_group_id}, {parquet_file_path}')
                        continue

                    for row_idx, row in df.iterrows():
                        try:
                            data = self.parse_row(row)
                            if len(data) == 0:
                                continue
                            data['data_indexes'] = {
                                "data_indexes": [global_row_group_idx, row_idx],
                                "worker_id": worker_id,
                                "dataset_name": self.dataset_name,
                            }
                        except Exception as e:
                            print(f'Error {e} in rg#{row_group_id}, {parquet_file_path}')
                            continue
                        yield data

                    row_start_id = 0
            global_row_group_start_id = 0
            print(f"{self.dataset_name} repeat in rank-{self.local_rank} worker-{worker_id}")
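Concrete datasets only need to override `parse_row`; sharding, checkpoint resume, and row-group iteration all come from the machinery above. A minimal hypothetical sketch (the base-class name and the 'image'/'caption' column names are assumptions for illustration, not from the repo):

import io
from PIL import Image
from .data_utils import pil_img2rgb

# 'ParquetRowGroupDataset' stands in for the base class shown above.
class MyCaptionedImageDataset(ParquetRowGroupDataset):
    def parse_row(self, row):
        image = pil_img2rgb(Image.open(io.BytesIO(row['image'])))  # assumed column
        text_ids = self.tokenizer.encode(row['caption'])           # assumed column
        if len(text_ids) == 0:
            return {}  # empty dict -> __iter__ skips this row
        return {
            'image_tensor_list': [self.transform(image)],
            'text_ids_list': [text_ids],
        }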
    	
data/parquet_utils.py
ADDED
@@ -0,0 +1,90 @@
# Copyright 2025 Bytedance Ltd. and/or its affiliates.
# SPDX-License-Identifier: Apache-2.0


import os
import subprocess
import logging

import pyarrow.fs as pf
import torch.distributed as dist

logger = logging.getLogger(__name__)


def get_parquet_data_paths(data_dir_list, num_sampled_data_paths, rank=0, world_size=1):
    num_data_dirs = len(data_dir_list)
    if world_size > 1:
        chunk_size = (num_data_dirs + world_size - 1) // world_size
        start_idx = rank * chunk_size
        end_idx = min(start_idx + chunk_size, num_data_dirs)
        local_data_dir_list = data_dir_list[start_idx:end_idx]
        local_num_sampled_data_paths = num_sampled_data_paths[start_idx:end_idx]
    else:
        local_data_dir_list = data_dir_list
        local_num_sampled_data_paths = num_sampled_data_paths

    local_data_paths = []
    for data_dir, num_data_path in zip(local_data_dir_list, local_num_sampled_data_paths):
        if data_dir.startswith("hdfs://"):
            files = hdfs_ls_cmd(data_dir)
            data_paths_per_dir = [
                file for file in files if file.endswith(".parquet")
            ]
        else:
            files = os.listdir(data_dir)
            data_paths_per_dir = [
                os.path.join(data_dir, name)
                for name in files
                if name.endswith(".parquet")
            ]
        repeat = num_data_path // len(data_paths_per_dir)
        data_paths_per_dir = data_paths_per_dir * (repeat + 1)
        local_data_paths.extend(data_paths_per_dir[:num_data_path])

    if world_size > 1:
        gather_list = [None] * world_size
        dist.all_gather_object(gather_list, local_data_paths)

        combined_chunks = []
        for chunk_list in gather_list:
            if chunk_list is not None:
                combined_chunks.extend(chunk_list)
    else:
        combined_chunks = local_data_paths

    return combined_chunks


# NOTE: customize this function for your cluster
def get_hdfs_host():
    return "hdfs://xxx"


# NOTE: customize this function for your cluster
def get_hdfs_block_size():
    return 134217728  # 128 MB


# NOTE: customize this function for your cluster
def get_hdfs_extra_conf():
    return None


def init_arrow_pf_fs(parquet_file_path):
    if parquet_file_path.startswith("hdfs://"):
        fs = pf.HadoopFileSystem(
            host=get_hdfs_host(),
            port=0,
            buffer_size=get_hdfs_block_size(),
            extra_conf=get_hdfs_extra_conf(),
        )
    else:
        fs = pf.LocalFileSystem()
    return fs


def hdfs_ls_cmd(dir):
    # The HDFS CLI subcommand is '-ls' (the bare 'ls' form is not accepted).
    result = subprocess.run(["hdfs", "dfs", "-ls", dir], capture_output=True, text=True).stdout
    return ['hdfs://' + i.split('hdfs://')[-1].strip() for i in result.split('\n') if 'hdfs://' in i]
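A quick, hedged sketch of how these helpers compose (the directory path is a placeholder): enumerate parquet files, open one through the appropriate filesystem, and read a single row group at a time.

import pyarrow.parquet as pq

paths = get_parquet_data_paths(["/data/t2i"], [10])  # repeats/truncates the listing to 10 paths
fs = init_arrow_pf_fs(paths[0])                      # LocalFileSystem or HadoopFileSystem
with fs.open_input_file(paths[0]) as f:
    fr = pq.ParquetFile(f)
    df = fr.read_row_group(0).to_pandas()            # one row group at a time keeps memory bounded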
    	
data/t2i_dataset.py
ADDED
@@ -0,0 +1,128 @@
# Copyright 2025 Bytedance Ltd. and/or its affiliates.
# SPDX-License-Identifier: Apache-2.0

import io
import json
import pyarrow.parquet as pq
import random
from PIL import Image

from .data_utils import pil_img2rgb
from .distributed_iterable_dataset import DistributedIterableDataset
from .parquet_utils import get_parquet_data_paths, init_arrow_pf_fs

Image.MAX_IMAGE_PIXELS = 20_000_000


class T2IIterableDataset(DistributedIterableDataset):
    def __init__(
        self, dataset_name, transform, tokenizer, data_dir_list, num_used_data,
        local_rank=0, world_size=1, num_workers=8, data_status=None,
    ):
        """
        data_dir_list: list of data directories containing parquet files
        num_used_data: list of numbers of sampled data paths, one per data directory
        """
        super().__init__(dataset_name, local_rank, world_size, num_workers)
        self.transform = transform
        self.tokenizer = tokenizer
        self.data_status = data_status
        self.data_paths = self.get_data_paths(data_dir_list, num_used_data)
        self.set_epoch()

    def get_data_paths(self, data_dir_list, num_used_data):
        return get_parquet_data_paths(data_dir_list, num_used_data)

    def __iter__(self):
        data_paths_per_worker, worker_id = self.get_data_paths_per_worker()
        if self.data_status is not None:
            parquet_start_id = self.data_status[worker_id][0]
            row_group_start_id = self.data_status[worker_id][1]
            row_start_id = self.data_status[worker_id][2] + 1
        else:
            parquet_start_id = 0
            row_group_start_id = 0
            row_start_id = 0
        transform_stride = self.transform.stride

        print(
            f"rank-{self.local_rank} worker-{worker_id} dataset-{self.dataset_name}: "
            f"resuming data at parquet#{parquet_start_id}, rg#{row_group_start_id}, row#{row_start_id}"
        )

        while True:
            data_paths_per_worker_ = data_paths_per_worker[parquet_start_id:]
            for parquet_idx, parquet_file_path in enumerate(data_paths_per_worker_, start=parquet_start_id):
                fs = init_arrow_pf_fs(parquet_file_path)
                with fs.open_input_file(parquet_file_path) as f:
                    fr = pq.ParquetFile(f)
                    row_group_ids = list(range(fr.num_row_groups))
                    row_group_ids_ = row_group_ids[row_group_start_id:]

                    for row_group_id in row_group_ids_:
                        df = fr.read_row_group(row_group_id).to_pandas()
                        df = df.iloc[row_start_id:]

                        for row_idx, row in df.iterrows():
                            num_tokens = 0
                            try:
                                image_byte = row['image']
                                image = pil_img2rgb(Image.open(io.BytesIO(image_byte)))
                            except Exception as e:
                                print(f'Error: {e} in rg#{row_group_id}, {parquet_file_path}')
                                continue
                            image_tensor = self.transform(image)
                            height, width = image_tensor.shape[1:]
                            num_tokens += width * height // transform_stride ** 2

                            try:
                                caption_dict = row['captions']
                                caption_dict = json.loads(caption_dict)
                            except Exception as e:
                                print(f'Error: {e} in rg#{row_group_id}, {parquet_file_path}')
                                continue

                            caps_token = [self.tokenizer.encode(v) for _, v in caption_dict.items()]
                            if len(caps_token) == 0:
                                print(f'no caption in rg#{row_group_id}, {parquet_file_path}')
                                caption_token = self.tokenizer.encode(' ')
                            else:
                                caption_token = random.choice(caps_token)

                            sequence_plan, text_ids_list = [], []
                            text_ids = caption_token
                            num_tokens += len(caption_token)
                            text_ids_list.append(text_ids)
                            sequence_plan.append({
                                'type': 'text',
                                'enable_cfg': 1,
                                'loss': 0,
                                'special_token_loss': 0,
                                'special_token_label': None,
                            })

                            sequence_plan.append({
                                'type': 'vae_image',
                                'enable_cfg': 0,
                                'loss': 1,
                                'special_token_loss': 0,
                                'special_token_label': None,
                            })

                            sample = dict(
                                image_tensor_list=[image_tensor],
                                text_ids_list=text_ids_list,
                                num_tokens=num_tokens,
                                sequence_plan=sequence_plan,
                                data_indexes={
                                    "data_indexes": [parquet_idx, row_group_id, row_idx],
                                    "worker_id": worker_id,
                                    "dataset_name": self.dataset_name,
                                }
                            )
                            yield sample

                        row_start_id = 0
                    row_group_start_id = 0
            parquet_start_id = 0
            print(f"{self.dataset_name} repeat in rank-{self.local_rank} worker-{worker_id}")
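To make the `num_tokens` bookkeeping concrete: with an image stride of 16, a 1024x688 image contributes 1024 * 688 / 16^2 image tokens, plus the caption length. A small check mirroring the arithmetic above (values are illustrative):

# Worked example of the token accounting (values illustrative).
height, width = 688, 1024           # image_tensor.shape[1:] after the transform
transform_stride = 16               # self.transform.stride
image_tokens = width * height // transform_stride ** 2
assert image_tokens == 2752         # 1024 * 688 / 256
# total num_tokens = image_tokens + len(caption_token)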
    	
data/transforms.py
ADDED
@@ -0,0 +1,287 @@
# Copyright 2025 Bytedance Ltd. and/or its affiliates.
# SPDX-License-Identifier: Apache-2.0

import random
from PIL import Image

import cv2
import numpy as np
import torch
from torchvision import transforms
from torchvision.transforms import functional as F
from torchvision.transforms import InterpolationMode


class MaxLongEdgeMinShortEdgeResize(torch.nn.Module):
    """Resize the input image so that its longest side and shortest side are within a specified range,
    ensuring that both sides are divisible by a specified stride.

    Args:
        max_size (int): Maximum size for the longest edge of the image.
        min_size (int): Minimum size for the shortest edge of the image.
        stride (int): Value by which the height and width of the image must be divisible.
        max_pixels (int): Maximum number of pixels for the full image.
        interpolation (InterpolationMode): Desired interpolation enum defined by
            :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.BILINEAR``.
            If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.NEAREST_EXACT``,
            ``InterpolationMode.BILINEAR``, and ``InterpolationMode.BICUBIC`` are supported.
            The corresponding Pillow integer constants, e.g., ``PIL.Image.BILINEAR``, are also accepted.
        antialias (bool, optional): Whether to apply antialiasing (default is True).
    """

    def __init__(
        self,
        max_size: int,
        min_size: int,
        stride: int,
        max_pixels: int,
        interpolation=InterpolationMode.BICUBIC,
        antialias=True
    ):
        super().__init__()
        self.max_size = max_size
        self.min_size = min_size
        self.stride = stride
        self.max_pixels = max_pixels
        self.interpolation = interpolation
        self.antialias = antialias

    def _make_divisible(self, value, stride):
        """Ensure the value is divisible by the stride."""
        return max(stride, int(round(value / stride) * stride))

    def _apply_scale(self, width, height, scale):
        new_width = round(width * scale)
        new_height = round(height * scale)
        new_width = self._make_divisible(new_width, self.stride)
        new_height = self._make_divisible(new_height, self.stride)
        return new_width, new_height

    def forward(self, img, img_num=1):
        """
        Args:
            img (PIL Image): Image to be resized.
            img_num (int): Number of images in the sample; the max_pixels budget is divided by it.
        Returns:
            PIL Image or Tensor: Rescaled image with divisible dimensions.
        """
        if isinstance(img, torch.Tensor):
            height, width = img.shape[-2:]
        else:
            width, height = img.size

        scale = min(self.max_size / max(width, height), 1.0)
        scale = max(scale, self.min_size / min(width, height))
        new_width, new_height = self._apply_scale(width, height, scale)

        # Ensure the number of pixels does not exceed max_pixels
        if new_width * new_height > self.max_pixels / img_num:
            scale = self.max_pixels / img_num / (new_width * new_height)
            new_width, new_height = self._apply_scale(new_width, new_height, scale)

        # Ensure the longest edge does not exceed max_size
        if max(new_width, new_height) > self.max_size:
            scale = self.max_size / max(new_width, new_height)
            new_width, new_height = self._apply_scale(new_width, new_height, scale)

        return F.resize(img, (new_height, new_width), self.interpolation, antialias=self.antialias)
             | 
| 90 | 
            +
            class ImageTransform:
         | 
| 91 | 
            +
                def __init__(
         | 
| 92 | 
            +
                    self, 
         | 
| 93 | 
            +
                    max_image_size, 
         | 
| 94 | 
            +
                    min_image_size, 
         | 
| 95 | 
            +
                    image_stride, 
         | 
| 96 | 
            +
                    max_pixels=14*14*9*1024,
         | 
| 97 | 
            +
                    image_mean=[0.5, 0.5, 0.5], 
         | 
| 98 | 
            +
                    image_std=[0.5, 0.5, 0.5]
         | 
| 99 | 
            +
                ):
         | 
| 100 | 
            +
                    self.stride = image_stride
         | 
| 101 | 
            +
             | 
| 102 | 
            +
                    self.resize_transform = MaxLongEdgeMinShortEdgeResize(
         | 
| 103 | 
            +
                        max_size=max_image_size, 
         | 
| 104 | 
            +
                        min_size=min_image_size, 
         | 
| 105 | 
            +
                        stride=image_stride,
         | 
| 106 | 
            +
                        max_pixels=max_pixels,
         | 
| 107 | 
            +
                    )
         | 
| 108 | 
            +
                    self.to_tensor_transform = transforms.ToTensor()
         | 
| 109 | 
            +
                    self.normalize_transform = transforms.Normalize(mean=image_mean, std=image_std, inplace=True)
         | 
| 110 | 
            +
             | 
| 111 | 
            +
                def __call__(self, img, img_num=1):
         | 
| 112 | 
            +
                    img = self.resize_transform(img, img_num=img_num)
         | 
| 113 | 
            +
                    img = self.to_tensor_transform(img)
         | 
| 114 | 
            +
                    img = self.normalize_transform(img)
         | 
| 115 | 
            +
                    return img
         | 
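A minimal usage sketch, with illustrative sizes (pick values matching your config):

from PIL import Image

transform = ImageTransform(max_image_size=1024, min_image_size=512, image_stride=16)
img = Image.new('RGB', (800, 600))   # placeholder image
tensor = transform(img)              # (3, H, W), H and W divisible by 16, values in [-1, 1]
# Multi-image samples split the pixel budget across images:
# tensor = transform(img, img_num=2)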
def decolorization(image):
    gray_image = image.convert('L')
    return Image.merge(image.mode, [gray_image] * 3) if image.mode in ('RGB', 'L') else gray_image


def downscale(image, scale_factor):
    new_width = int(round(image.width * scale_factor))
    new_height = int(round(image.height * scale_factor))
    new_width = max(1, new_width)
    new_height = max(1, new_height)
    return image.resize((new_width, new_height), resample=Image.BICUBIC)


def crop(image, crop_factors):
    target_h, target_w = crop_factors
    img_w, img_h = image.size

    if target_h > img_h or target_w > img_w:
        raise ValueError("Crop size exceeds image dimensions")

    x = random.randint(0, img_w - target_w)
    y = random.randint(0, img_h - target_h)

    return image.crop((x, y, x + target_w, y + target_h)), [[x, y], [x + target_w, y + target_h]]


def motion_blur_opencv(image, kernel_size=15, angle=0):
    # Linear kernel
    kernel = np.zeros((kernel_size, kernel_size), dtype=np.float32)
    kernel[kernel_size // 2, :] = np.ones(kernel_size, dtype=np.float32)

    # Rotate the kernel
    center = (kernel_size / 2 - 0.5, kernel_size / 2 - 0.5)
    M = cv2.getRotationMatrix2D(center, angle, 1)
    rotated_kernel = cv2.warpAffine(kernel, M, (kernel_size, kernel_size))

    # Normalize the kernel
    rotated_kernel /= rotated_kernel.sum() if rotated_kernel.sum() != 0 else 1

    img = np.array(image)
    if img.ndim == 2:
        blurred = cv2.filter2D(img, -1, rotated_kernel, borderType=cv2.BORDER_REFLECT)
    else:
        # For color images, convolve each channel independently
        blurred = np.zeros_like(img)
        for c in range(img.shape[2]):
            blurred[..., c] = cv2.filter2D(img[..., c], -1, rotated_kernel, borderType=cv2.BORDER_REFLECT)

    return Image.fromarray(blurred.astype(np.uint8))


def shuffle_patch(image, num_splits, gap_size=2):
    """Split the image into patches (sizes need not divide evenly), shuffle them,
    and reassemble with a gap between patches."""
    h_splits, w_splits = num_splits
    img_w, img_h = image.size

    base_patch_h = img_h // h_splits
    patch_heights = [base_patch_h] * (h_splits - 1)
    patch_heights.append(img_h - sum(patch_heights))

    base_patch_w = img_w // w_splits
    patch_widths = [base_patch_w] * (w_splits - 1)
    patch_widths.append(img_w - sum(patch_widths))

    patches = []
    current_y = 0
    for i in range(h_splits):
        current_x = 0
        patch_h = patch_heights[i]
        for j in range(w_splits):
            patch_w = patch_widths[j]
            patch = image.crop((current_x, current_y, current_x + patch_w, current_y + patch_h))
            patches.append(patch)
            current_x += patch_w
        current_y += patch_h

    random.shuffle(patches)

    total_width = sum(patch_widths) + (w_splits - 1) * gap_size
    total_height = sum(patch_heights) + (h_splits - 1) * gap_size
    new_image = Image.new(image.mode, (total_width, total_height), color=(255, 255, 255))

    current_y = 0  # starting Y coordinate of the current row
    patch_idx = 0  # index of the patch being placed
    for i in range(h_splits):
        current_x = 0  # starting X coordinate of the current column
        patch_h = patch_heights[i]  # height of patches in the current row
        for j in range(w_splits):
            # Take the next shuffled patch
            patch = patches[patch_idx]
            patch_w = patch_widths[j]  # width of patches in the current column
            # Paste the patch with its top-left corner at (current_x, current_y)
            new_image.paste(patch, (current_x, current_y))
            # Advance X (next patch starts after the current patch width plus the gap)
            current_x += patch_w + gap_size
            patch_idx += 1
        # Advance Y (next row starts after the current row height plus the gap)
        current_y += patch_h + gap_size

    return new_image


def inpainting(image, num_splits, blank_ratio=0.3, blank_color=(255, 255, 255)):
    """
    Split the image into patches and blank out a random subset, for the inpainting task.

    Args:
        image: PIL.Image, input image (RGB mode)
        num_splits: (h_splits, w_splits), number of vertical and horizontal splits
        blank_ratio: float, fraction of patches to blank out (0-1)
        blank_color: tuple, RGB color of the blanked regions (e.g., white (255, 255, 255))

    Returns:
        PIL.Image, the reassembled image
    """
    h_splits, w_splits = num_splits
    img_w, img_h = image.size

    base_patch_h = img_h // h_splits
    patch_heights = [base_patch_h] * (h_splits - 1)
    patch_heights.append(img_h - sum(patch_heights))

    base_patch_w = img_w // w_splits
    patch_widths = [base_patch_w] * (w_splits - 1)
    patch_widths.append(img_w - sum(patch_widths))

    patches = []
    current_y = 0
    for i in range(h_splits):
        current_x = 0
        patch_h = patch_heights[i]
        for j in range(w_splits):
            patch_w = patch_widths[j]
            patch = image.crop((current_x, current_y, current_x + patch_w, current_y + patch_h))
            patches.append(patch)
            current_x += patch_w
        current_y += patch_h

    total_patches = h_splits * w_splits
    num_blank = int(total_patches * blank_ratio)
    num_blank = max(0, min(num_blank, total_patches))
    blank_indices = random.sample(range(total_patches), num_blank)

    processed_patches = []
    for idx, patch in enumerate(patches):
        if idx in blank_indices:
            blank_patch = Image.new("RGB", patch.size, color=blank_color)
            processed_patches.append(blank_patch)
        else:
            processed_patches.append(patch)

    # Build the result image (same size as the original)
    result_image = Image.new("RGB", (img_w, img_h))
    current_y = 0
    patch_idx = 0
    for i in range(h_splits):
        current_x = 0
        patch_h = patch_heights[i]
        for j in range(w_splits):
            # Take the processed patch
            patch = processed_patches[patch_idx]
            patch_w = patch_widths[j]
            # Paste it back in its original position
            result_image.paste(patch, (current_x, current_y))
            current_x += patch_w
            patch_idx += 1
        current_y += patch_h

    return result_image
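These degradation ops compose naturally into source/target pairs for editing-style data; a hedged sketch (path and parameters are illustrative):

# Sketch: derive degraded source images from a clean target.
from PIL import Image
target = Image.open('example.jpg').convert('RGB')    # placeholder path

blurred = motion_blur_opencv(target, kernel_size=15, angle=30)
gray = decolorization(target)
shuffled = shuffle_patch(target, num_splits=(3, 3), gap_size=2)
masked = inpainting(target, num_splits=(4, 4), blank_ratio=0.3)
cropped, box = crop(target, (256, 256))              # also returns the crop box corners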
    	
data/video_utils.py
ADDED
@@ -0,0 +1,165 @@
# Copyright (c) 2023 OpenGVLab
# Copyright (c) 2025 Bytedance Ltd. and/or its affiliates.
# SPDX-License-Identifier: MIT
#
# This file has been modified by ByteDance Ltd. and/or its affiliates. on 2025-05-20.
#
# Original file was released under MIT, with the full license text
# available at https://github.com/OpenGVLab/InternVL/blob/main/LICENSE.
#
# This modified file is released under the same license.


import io
import os
import random
import re

import numpy as np
import decord
from PIL import Image


def get_frame_indices(num_frames, vlen, sample='rand', fix_start=None, input_fps=1, max_num_frames=-1):
    if sample in ['rand', 'middle']:  # uniform sampling
        acc_samples = min(num_frames, vlen)
        # Split the video into `acc_samples` intervals, and sample from each interval.
        intervals = np.linspace(start=0, stop=vlen, num=acc_samples + 1).astype(int)
        ranges = []
        for idx, interv in enumerate(intervals[:-1]):
            ranges.append((interv, intervals[idx + 1] - 1))
        if sample == 'rand':
            try:
                frame_indices = [random.choice(range(x[0], x[1])) for x in ranges]
            except IndexError:  # an interval can be empty when vlen is small
                frame_indices = np.random.permutation(vlen)[:acc_samples]
                frame_indices.sort()
                frame_indices = list(frame_indices)
        elif fix_start is not None:
            frame_indices = [x[0] + fix_start for x in ranges]
        elif sample == 'middle':
            frame_indices = [(x[0] + x[1]) // 2 for x in ranges]
        else:
            raise NotImplementedError

        if len(frame_indices) < num_frames:  # pad with the last frame
            padded_frame_indices = [frame_indices[-1]] * num_frames
            padded_frame_indices[:len(frame_indices)] = frame_indices
            frame_indices = padded_frame_indices
    elif 'fps' in sample:  # e.g. fps0.5: sequentially sample frames at 0.5 fps
        output_fps = float(sample[3:])
        duration = float(vlen) / input_fps
        delta = 1 / output_fps  # gap between frames; this is also the clip length each frame represents
        frame_seconds = np.arange(0 + delta / 2, duration + delta / 2, delta)
        frame_indices = np.around(frame_seconds * input_fps).astype(int)
        frame_indices = [e for e in frame_indices if e < vlen]
        if max_num_frames > 0 and len(frame_indices) > max_num_frames:
            frame_indices = frame_indices[:max_num_frames]
    else:
        raise ValueError
    return frame_indices
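A hedged trace of the fps mode, which is easy to misread: frames are taken at the midpoints of 1/output_fps windows. For a 10-second clip at 30 fps with sample='fps2':

# Illustrative trace: 300 frames at 30 fps, sampled at 2 fps.
idx = get_frame_indices(num_frames=20, vlen=300, sample='fps2', input_fps=30)
# delta = 0.5 s, frame_seconds = [0.25, 0.75, ..., 9.75]
# indices ~= around(second * 30) = [8, 22, 38, 52, ...], 20 frames total
assert len(idx) == 20 and idx[0] == 8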
| 63 | 
            +
            def read_frames_decord(video_path, num_frames, sample='rand', fix_start=None, clip=None, min_num_frames=4):
         | 
| 64 | 
            +
                video_reader = decord.VideoReader(video_path, num_threads=1)
         | 
| 65 | 
            +
                vlen = len(video_reader)
         | 
| 66 | 
            +
                fps = video_reader.get_avg_fps()
         | 
| 67 | 
            +
                duration = vlen / float(fps)
         | 
| 68 | 
            +
                if clip:
         | 
| 69 | 
            +
                    start, end = clip
         | 
| 70 | 
            +
                    duration = end - start
         | 
| 71 | 
            +
                    vlen = int(duration * fps)
         | 
| 72 | 
            +
                    start_index = int(start * fps)
         | 
| 73 | 
            +
             | 
| 74 | 
            +
                t_num_frames = np.random.randint(min_num_frames, num_frames + 1)
         | 
| 75 | 
            +
             | 
| 76 | 
            +
                frame_indices = get_frame_indices(
         | 
| 77 | 
            +
                    t_num_frames, vlen, sample=sample, fix_start=fix_start,
         | 
| 78 | 
            +
                    input_fps=fps
         | 
| 79 | 
            +
                )
         | 
| 80 | 
            +
                if clip:
         | 
| 81 | 
            +
                    frame_indices = [f + start_index for f in frame_indices]
         | 
| 82 | 
            +
                frames = video_reader.get_batch(frame_indices).asnumpy()  # (T, H, W, C), np.uint8
         | 
| 83 | 
            +
                frames = [Image.fromarray(frames[i]) for i in range(frames.shape[0])]
         | 
| 84 | 
            +
                return frames
         | 
| 85 | 
            +
             | 
| 86 | 
            +
             | 
| 87 | 
            +
def extract_frame_number(filename):
    # Extract the numeric part from the filename using regular expressions
    match = re.search(r'_(\d+)\.jpg$', filename)
    return int(match.group(1)) if match else -1


def sort_frames(frame_paths):
    # Extract filenames from each path and sort by their numeric part
    return sorted(frame_paths, key=lambda x: extract_frame_number(os.path.basename(x)))

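The numeric key matters because plain lexicographic sorting would misorder frame dumps; for example (hypothetical filenames):

paths = ['vid/frame_10.jpg', 'vid/frame_2.jpg', 'vid/frame_1.jpg']
print(sort_frames(paths))
# ['vid/frame_1.jpg', 'vid/frame_2.jpg', 'vid/frame_10.jpg'] -- sorted() alone would put _10 before _2
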
def read_frames_folder(video_path, num_frames, sample='rand', fix_start=None, min_num_frames=4):
    image_list = sort_frames(list(os.listdir(video_path)))
    frames = []
    for image in image_list:
        fp = os.path.join(video_path, image)
        frame = Image.open(fp).convert('RGB')
        frames.append(frame)
    vlen = len(frames)

    t_num_frames = np.random.randint(min_num_frames, num_frames + 1)

    if vlen > t_num_frames:
        frame_indices = get_frame_indices(
            t_num_frames, vlen, sample=sample, fix_start=fix_start
        )
        frames = [frames[i] for i in frame_indices]
    return frames

class FrameSampler:
    def __init__(self, max_num_frames=-1, min_num_frames=8, sample='rand'):
        self.max_num_frames = max_num_frames
        self.min_num_frames = min_num_frames
        self.sample = sample

    def __call__(self, file_name):
        # Paths ending with '/' are treated as folders of pre-extracted frames;
        # everything else is decoded as a video file via decord.
        fn = read_frames_folder if file_name.endswith('/') else read_frames_decord
        frames = fn(file_name, num_frames=self.max_num_frames, min_num_frames=self.min_num_frames, sample=self.sample)
        return frames

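A brief usage sketch (paths hypothetical); the trailing slash is the only signal that routes a sample to the frame-folder reader instead of decord:

sampler = FrameSampler(max_num_frames=16, min_num_frames=8, sample='rand')
clip_frames = sampler('videos/clip.mp4')        # decoded with decord
folder_frames = sampler('videos/clip_frames/')  # read as sorted *_N.jpg frames
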
def decode_video_byte(video_bytes):
    video_stream = io.BytesIO(video_bytes)
    vr = decord.VideoReader(video_stream)
    return vr

def sample_mp4_frames(mp4_p, n_frames=None, fps=None, return_frame_indices=False, random_sample=False):
    if isinstance(mp4_p, str):
        vr = decord.VideoReader(mp4_p, num_threads=1)
    elif isinstance(mp4_p, decord.video_reader.VideoReader):
        vr = mp4_p
    else:
        raise TypeError(f'Unsupported input type: {type(mp4_p)}')
    video_fps = vr.get_avg_fps()  # average frame rate of the video
    video_duration = len(vr) / video_fps
    if n_frames is not None:
        if random_sample:
            frame_indices = sorted(random.sample(range(len(vr)), n_frames))
        else:
            frame_indices = np.linspace(0, len(vr) - 1, n_frames, dtype=int).tolist()
    else:
        frame_indices = [int(i) for i in np.arange(0, len(vr) - 1, video_fps / fps)]
    frames = vr.get_batch(frame_indices).asnumpy()  # convert to a numpy array
    frames = [Image.fromarray(frame).convert("RGB") for frame in frames]
    if not return_frame_indices:
        return frames, video_duration
    else:
        return frames, video_duration, frame_indices

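A short end-to-end sketch combining decode_video_byte with sample_mp4_frames, assuming the MP4 bytes come from some binary payload (the path here is hypothetical):

with open('videos/clip.mp4', 'rb') as f:
    video_bytes = f.read()
vr = decode_video_byte(video_bytes)
frames, duration = sample_mp4_frames(vr, n_frames=8)  # 8 evenly spaced PIL frames
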
def sample_mp4_frames_by_indices(mp4_p, frame_indices: list):
    if isinstance(mp4_p, str):
        vr = decord.VideoReader(mp4_p, num_threads=1)
    elif isinstance(mp4_p, decord.video_reader.VideoReader):
        vr = mp4_p
    else:
        raise TypeError(f'Unsupported input type: {type(mp4_p)}')
    # sample the frames in frame_indices
    frames = vr.get_batch(frame_indices).asnumpy()  # convert to a numpy array
    frames = [Image.fromarray(frame).convert("RGB") for frame in frames]
    return frames

        data/vlm_dataset.py
    ADDED
    
@@ -0,0 +1,195 @@
# Copyright 2025 Bytedance Ltd. and/or its affiliates.
# SPDX-License-Identifier: Apache-2.0

import json
import os
import traceback
from PIL import Image, ImageFile, PngImagePlugin

from .data_utils import pil_img2rgb
from .distributed_iterable_dataset import DistributedIterableDataset


Image.MAX_IMAGE_PIXELS = 200000000
ImageFile.LOAD_TRUNCATED_IMAGES = True
MaximumDecompressedSize = 1024
MegaByte = 2 ** 20
PngImagePlugin.MAX_TEXT_CHUNK = MaximumDecompressedSize * MegaByte

class SftJSONLIterableDataset(DistributedIterableDataset):
    def __init__(
        self, dataset_name, transform, tokenizer, frame_sampler,
        jsonl_path_list, data_dir_list, num_used_data,
        local_rank=0, world_size=1, num_workers=8, data_status=None,
        shuffle_lines=False, shuffle_seed=0,
    ):
        """
        jsonl_path_list: list of jsonl file paths
        data_dir_list: list of image directories containing the images of each jsonl file
        num_used_data: list of the number of data points sampled from each jsonl file
        """
        super().__init__(dataset_name, local_rank, world_size, num_workers)
        self.transform = transform
        self.tokenizer = tokenizer
        self.frame_sampler = frame_sampler
        self.data_status = data_status
        self.data_paths = self.get_data_paths(
            jsonl_path_list,
            data_dir_list,
            num_used_data,
            shuffle_lines,
            shuffle_seed,
        )
        self.set_epoch()

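For reference, each jsonl line is expected to follow the LLaVA-style conversation schema consumed by change_format below. This record is illustrative, not drawn from a real dataset:

# One jsonl line, as parsed by json.loads (hypothetical content):
{
    "image": "coco/000000123456.jpg",  # may also be a list of images, or a "video" key instead
    "conversations": [
        {"from": "human", "value": "<image>\nWhat is in the picture?"},
        {"from": "gpt", "value": "A cat sitting on a windowsill."},
    ],
}
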
    def get_data_paths(
        self,
        jsonl_path_list,
        data_dir_list,
        num_used_data,
        shuffle_lines,
        shuffle_seed,
    ):
        data_paths = []
        for jsonl_path, image_dir, num_data_point in zip(
            jsonl_path_list, data_dir_list, num_used_data
        ):
            with open(jsonl_path, 'r') as f:
                raw_data = f.readlines()
            if shuffle_lines:
                self.rng.seed(shuffle_seed)
                self.rng.shuffle(raw_data)
            raw_data = raw_data[:num_data_point]
            data_paths.extend([(json_data, image_dir) for json_data in raw_data])
        return data_paths

    def change_format(self, data, num_images):
        elements = []
        for conversation in data['conversations']:
            if conversation['from'] == 'human':
                if '<image>' not in conversation['value']:
                    elements.append({
                        'type': 'text',
                        'has_loss': 0,
                        'text': conversation['value'],
                    })
                else:
                    text_list = conversation['value'].split('<image>')
                    for idx, text in enumerate(text_list):
                        if text.strip() != '':
                            elements.append({
                                'type': 'text',
                                'has_loss': 0,
                                'text': text.strip(),
                            })
                        if (idx != len(text_list) - 1) and (idx < num_images):
                            elements.append({'type': 'image'})
            elif conversation['from'] == 'gpt':
                elements.append({
                    'type': 'text',
                    'has_loss': 1,
                    'text': conversation['value'],
                })
        return elements

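To make the element layout concrete: for the illustrative record above, change_format(data_item, num_images=1) produces, in order (human text carries has_loss=0, assistant text has_loss=1):

# {'type': 'image'}  <- the leading '<image>' placeholder
# {'type': 'text', 'has_loss': 0, 'text': 'What is in the picture?'}
# {'type': 'text', 'has_loss': 1, 'text': 'A cat sitting on a windowsill.'}
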
    def __iter__(self):
        data_paths_per_worker, worker_id = self.get_data_paths_per_worker()
        if self.data_status is not None:
            row_start_id = self.data_status[worker_id] + 1
        else:
            row_start_id = 0
        transform_stride = self.transform.stride

        print(
            f"rank-{self.local_rank} worker-{worker_id} dataset-{self.dataset_name}: "
            f"resuming data at row#{row_start_id}"
        )

        while True:
            data_paths_per_worker_ = data_paths_per_worker[row_start_id:]
            for row_idx, (data, image_dir) in enumerate(data_paths_per_worker_, start=row_start_id):
                num_tokens = 0
                image_tensor_list = []
                text_ids_list = []
                sequence_plan = []

                try:
                    data_item = json.loads(data)
                    raw_images = None
                    if 'image' in data_item:
                        if isinstance(data_item['image'], list):
                            raw_images = [
                                pil_img2rgb(Image.open(os.path.join(image_dir, image)))
                                for image in data_item['image']
                            ]
                        else:
                            raw_images = [
                                pil_img2rgb(Image.open(os.path.join(image_dir, data_item['image'])))
                            ]
                    elif 'video' in data_item:
                        raw_images = self.frame_sampler(os.path.join(image_dir, data_item['video']))
                        special_tokens = '<image>' * len(raw_images)
                        for item in data_item['conversations']:
                            if '<video>' in item['value']:
                                item['value'] = item['value'].replace('<video>', special_tokens)
                                break
                        else:
                            # for/else: raise only if no turn contained a <video> placeholder
                            raise ValueError("Cannot find <video> in the conversation!")
                except Exception:
                    traceback.print_exc()
                    continue

                if raw_images:
                    for raw_image in raw_images:
                        image_tensor = self.transform(raw_image, img_num=len(raw_images))
                        image_tensor_list.append(image_tensor)
                        height, width = image_tensor.shape[1:]
                        num_tokens += width * height // transform_stride ** 2

                elements = self.change_format(data_item, len(image_tensor_list))

                for item in elements:
                    if item['type'] == 'text':
                        text_data = item['text']
                        text_ids = self.tokenizer.encode(text_data)
                        if len(text_ids) > 0:
                            text_ids_list.append(text_ids)
                            num_tokens += len(text_ids)
                            current_plan = {
                                'type': 'text',
                                'enable_cfg': 0,
                                'loss': item['has_loss'],
                                'special_token_loss': 0,
                                'special_token_label': None,
                            }
                            sequence_plan.append(current_plan)
                    elif item['type'] == 'image':
                        current_plan = {
                            'type': 'vit_image',
                            'enable_cfg': 0,
                            'loss': 0,
                            'special_token_loss': 0,
                            'special_token_label': None,
                        }
                        sequence_plan.append(current_plan)

                has_loss = [item['loss'] for item in sequence_plan]
                if sum(has_loss) == 0:
                    print('No loss defined, skipped.')
                    continue

                yield dict(
                    image_tensor_list=image_tensor_list,
                    text_ids_list=text_ids_list,
                    sequence_plan=sequence_plan,
                    num_tokens=num_tokens,
                    data_indexes={
                        "data_indexes": row_idx,
                        "worker_id": worker_id,
                        "dataset_name": self.dataset_name,
                    }
                )

            row_start_id = 0
            print(f"{self.dataset_name} repeat in rank-{self.local_rank} worker-{worker_id}")

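Finally, a hedged sketch of how this dataset might be wired up; the transform, tokenizer, and paths are placeholders (real settings come from data/configs/*.yaml and the training entry point), and FrameSampler is the helper defined in data/video_utils.py:

dataset = SftJSONLIterableDataset(
    dataset_name='llava_ov',
    transform=image_transform,    # placeholder: any callable with a .stride attribute
    tokenizer=tokenizer,          # placeholder: any tokenizer exposing .encode()
    frame_sampler=FrameSampler(max_num_frames=16),
    jsonl_path_list=['anno/llava_ov.jsonl'],
    data_dir_list=['images/llava_ov'],
    num_used_data=[10],
)
sample = next(iter(dataset))
print(sample['num_tokens'], len(sample['sequence_plan']))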