# Spaces:
# Runtime error
# Runtime error
# Model override layered on top of the base config listed in `_base_`.
# Only the `model` settings are redefined in this file; nothing else is
# set here.
_base_ = ['./san-vit-b16_voc12aug-640x640.py']

model = dict(
    type='MultimodalEncoderDecoder',
    # NOTE(review): the checkpoint filename mentions "vit_base_p16_224",
    # while the image encoder below is configured with patch_size=14,
    # embed_dims=1024 and img_size=(336, 336). Confirm that this is the
    # intended checkpoint for this variant.
    pretrained='pretrain/jx_vit_base_p16_224-80ecf9dd.pth',
    encoder_resolution=0.7,
    # Image branch: 336x336 input split into 14x14 patches
    # (336 / 14 = 24 patches per side), 1024-dim embeddings,
    # 18 layers with 16 heads each.
    image_encoder=dict(
        type='VisionTransformer',
        img_size=(336, 336),
        patch_size=14,
        patch_pad=0,
        embed_dims=1024,
        num_layers=18,
        num_heads=16,
        # Every sixth layer index (0-based 5, 11, 17); presumably the
        # layers whose features are exposed downstream -- TODO confirm.
        out_indices=(5, 11, 17),
    ),
    # Text branch: 768-dim embeddings, 12 layers, 12 heads, 768-dim output.
    text_encoder=dict(
        type='CLIPTextEncoder',
        embed_dims=768,
        num_layers=12,
        num_heads=12,
        output_dims=768,
    ),
    decode_head=dict(
        type='SideAdapterCLIPHead',
        # clip_channels equals image_encoder.embed_dims (1024) above.
        san_cfg=dict(clip_channels=1024, cfg_decoder=dict(num_heads=16)),
        # embed_dims/num_heads equal the image encoder's (1024 / 16);
        # out_dims equals text_encoder.output_dims (768).
        maskgen_cfg=dict(
            num_layers=6,
            embed_dims=1024,
            num_heads=16,
            out_dims=768,
        ),
    ),
)