# Spaces: Runtime error (hosting-page status banner captured with the file; not part of the config)
from torch.nn import GroupNorm, ReLU
from mmdet.models import MSDeformAttnPixelDecoder, CrossEntropyLoss, DiceLoss, FocalLoss
from mmdet.models.task_modules.assigners import HungarianAssigner, ClassificationCost, CrossEntropyLossCost, DiceCost
from mmdet.models.task_modules.samplers import MaskPseudoSampler

from seg.models.detectors import Mask2formerVideo
from seg.models.fusion_head import OMGFusionHead
from seg.models.heads import Mask2FormerVideoHead
from seg.models.backbones import OpenCLIPBackbone

# Number of "thing" and "stuff" categories; both values are passed to the
# panoptic head and to the panoptic fusion head in the model definition.
num_things_classes = 80
num_stuff_classes = 53

# Joined as f'{ov_model_name}_{ov_datasets_name}' to form the panoptic head's
# ov_classifier_name.
ov_model_name = 'convnext_large_d_320'
ov_datasets_name = 'CocoPanopticOVDataset'

# Model definition: a Mask2formerVideo detector assembled from an OpenCLIP
# backbone, a Mask2FormerVideoHead panoptic head and an OMGFusionHead.
model = dict(
    type=Mask2formerVideo,
    data_preprocessor=None,  # to fill
    backbone=dict(
        type=OpenCLIPBackbone,
        # Reuse the shared constant so the backbone name and the
        # ov_classifier_name built from it below cannot drift apart.
        model_name=ov_model_name,
        # NOTE(review): presumably keeps the backbone weights frozen --
        # confirm against OpenCLIPBackbone.
        fix=True,
        init_cfg=dict(
            type='clip_pretrain',
            checkpoint='laion2b_s29b_b131k_ft_soup',
        ),
    ),
    panoptic_head=dict(
        # Head weights are read from the local checkpoint; only keys under
        # the 'panoptic_head.' prefix are selected.
        init_cfg=dict(
            type='Pretrained',
            checkpoint='./models/omg_seg_convl.pth',
            prefix='panoptic_head.',
        ),
        type=Mask2FormerVideoHead,
        sphere_cls=True,
        ov_classifier_name=f'{ov_model_name}_{ov_datasets_name}',
        logit=None,
        enable_box_query=True,
        in_channels=[192, 384, 768, 1536],  # pass to pixel_decoder inside
        strides=[4, 8, 16, 32],
        feat_channels=256,
        out_channels=256,
        num_things_classes=num_things_classes,
        num_stuff_classes=num_stuff_classes,
        num_queries=300,
        # Same value as pixel_decoder.num_outs and the encoder's num_levels.
        num_transformer_feat_level=3,
        pixel_decoder=dict(
            type=MSDeformAttnPixelDecoder,
            num_outs=3,
            norm_cfg=dict(type=GroupNorm, num_groups=32),
            act_cfg=dict(type=ReLU),
            encoder=dict(  # DeformableDetrTransformerEncoder
                num_layers=6,
                layer_cfg=dict(  # DeformableDetrTransformerEncoderLayer
                    self_attn_cfg=dict(  # MultiScaleDeformableAttention
                        embed_dims=256,
                        num_heads=8,
                        num_levels=3,
                        num_points=4,
                        dropout=0.0,
                        batch_first=True,
                    ),
                    ffn_cfg=dict(
                        embed_dims=256,
                        feedforward_channels=1024,
                        num_fcs=2,
                        ffn_drop=0.0,
                        act_cfg=dict(type=ReLU, inplace=True),
                    ),
                ),
            ),
            positional_encoding=dict(num_feats=128, normalize=True),
        ),
        enforce_decoder_input_project=False,
        positional_encoding=dict(num_feats=128, normalize=True),
        transformer_decoder=dict(  # Mask2FormerTransformerDecoder
            return_intermediate=True,
            num_layers=9,
            layer_cfg=dict(  # Mask2FormerTransformerDecoderLayer
                self_attn_cfg=dict(  # MultiheadAttention
                    embed_dims=256,
                    num_heads=8,
                    dropout=0.0,
                    batch_first=True,
                ),
                cross_attn_cfg=dict(  # MultiheadAttention
                    embed_dims=256,
                    num_heads=8,
                    dropout=0.0,
                    batch_first=True,
                ),
                ffn_cfg=dict(
                    embed_dims=256,
                    feedforward_channels=2048,
                    num_fcs=2,
                    ffn_drop=0.0,
                    # Class object rather than the string 'ReLU', matching
                    # every other `type=` entry in this config.
                    act_cfg=dict(type=ReLU, inplace=True),
                ),
            ),
            init_cfg=None,
        ),
        loss_cls=dict(
            type=CrossEntropyLoss,
            use_sigmoid=False,
            loss_weight=2.0,
            reduction='mean',
            class_weight=None,  # [1.0] * num_classes + [0.1]
        ),
        loss_mask=dict(
            type=CrossEntropyLoss,
            use_sigmoid=True,
            reduction='mean',
            loss_weight=5.0,
        ),
        loss_dice=dict(
            type=DiceLoss,
            use_sigmoid=True,
            activate=True,
            reduction='mean',
            naive_dice=True,
            eps=1.0,
            loss_weight=5.0,
        ),
        loss_iou=dict(
            type=FocalLoss,
            use_sigmoid=True,
            loss_weight=2.0,
            reduction='mean',
        ),
    ),
    panoptic_fusion_head=dict(
        type=OMGFusionHead,
        num_things_classes=num_things_classes,
        num_stuff_classes=num_stuff_classes,
        loss_panoptic=None,
        init_cfg=None,
    ),
    train_cfg=dict(
        num_points=12544,
        oversample_ratio=3.0,
        importance_sample_ratio=0.75,
        assigner=dict(
            type=HungarianAssigner,
            # Cost weights carry the same values as the loss weights of
            # loss_cls / loss_mask / loss_dice above (2.0 / 5.0 / 5.0).
            match_costs=[
                dict(type=ClassificationCost, weight=2.0),
                dict(type=CrossEntropyLossCost, weight=5.0, use_sigmoid=True),
                dict(type=DiceCost, weight=5.0, pred_act=True, eps=1.0),
            ],
        ),
        sampler=dict(type=MaskPseudoSampler),
    ),
    test_cfg=dict(
        panoptic_on=True,
        semantic_on=False,
        instance_on=True,
        # max_per_image is for instance segmentation.
        max_per_image=100,
        iou_thr=0.8,
        # In Mask2Former's panoptic postprocessing, mask areas whose score
        # is less than 0.5 are filtered out.
        filter_low_score=True,
        object_mask_thr=0.,
    ),
    init_cfg=None,
)