# configuration_eat.py
from transformers import PretrainedConfig


class EATConfig(PretrainedConfig):
    model_type = "eat"

    def __init__(
        self,
        embed_dim=768,
        depth=12,
        num_heads=12,
        patch_size=16,
        stride=16,
        in_chans=1,
        mel_bins=128,
        max_length=768,
        num_classes=527,
        model_variant="pretrain",  # or "finetune"
        mlp_ratio=4.0,
        qkv_bias=True,
        drop_rate=0.0,
        attn_drop_rate=0.0,
        activation_dropout=0.0,
        post_mlp_drop=0.0,
        start_drop_path_rate=0.0,
        end_drop_path_rate=0.0,
        layer_norm_first=False,
        norm_eps=1e-6,
        norm_affine=True,
        fixed_positions=True,
        img_size=(1024, 128),  # (target_length, mel_bins)
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.embed_dim = embed_dim
        self.depth = depth
        self.num_heads = num_heads
        self.patch_size = patch_size
        self.stride = stride
        self.in_chans = in_chans
        self.mel_bins = mel_bins
        self.max_length = max_length
        self.num_classes = num_classes
        self.model_variant = model_variant
        self.mlp_ratio = mlp_ratio
        self.qkv_bias = qkv_bias
        self.drop_rate = drop_rate
        self.attn_drop_rate = attn_drop_rate
        self.activation_dropout = activation_dropout
        self.post_mlp_drop = post_mlp_drop
        self.start_drop_path_rate = start_drop_path_rate
        self.end_drop_path_rate = end_drop_path_rate
        self.layer_norm_first = layer_norm_first
        self.norm_eps = norm_eps
        self.norm_affine = norm_affine
        self.fixed_positions = fixed_positions
        self.img_size = img_size
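

# Minimal usage sketch (an illustrative assumption, not part of the original
# file): build a fine-tuning config with a few overrides and round-trip it
# through save_pretrained / from_pretrained, both inherited from
# PretrainedConfig. The directory name "./eat-config" is arbitrary.
if __name__ == "__main__":
    config = EATConfig(model_variant="finetune", num_classes=50)
    config.save_pretrained("./eat-config")  # writes ./eat-config/config.json
    reloaded = EATConfig.from_pretrained("./eat-config")
    assert reloaded.model_variant == "finetune"
    assert reloaded.num_classes == 50
    print(reloaded)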