"""Gradio Demo for 3D-MOOD."""

import spaces
import gradio as gr
import gc
import os

import numpy as np
import torch
from PIL import Image

from vis4d.data.transforms.base import compose
from vis4d.data.transforms.normalize import NormalizeImages
from vis4d.data.transforms.resize import ResizeImages, ResizeIntrinsics
from vis4d.data.transforms.to_tensor import ToTensor
from vis4d.common.ckpt import load_model_checkpoint
from vis4d.op.fpp.fpn import FPN
from vis4d.vis.image.functional import imshow_bboxes3d

from opendet3d.data.transforms.pad import CenterPadImages, CenterPadIntrinsics
from opendet3d.data.transforms.resize import GenResizeParameters
from opendet3d.model.detect3d.grounding_dino_3d import GroundingDINO3D
from opendet3d.op.base.swin import SwinTransformer
from opendet3d.op.detect3d.grounding_dino_3d import (
    GroundingDINO3DCoder,
    GroundingDINO3DHead,
    RoI2Det3D,
    UniDepthHead,
)
from opendet3d.op.fpp.channel_mapper import ChannelMapper


def get_3d_mood_swin_base(
    max_per_image: int = 100, score_thres: float = 0.1
) -> GroundingDINO3D:
    """Get the config of Swin-Base."""
    basemodel = SwinTransformer(
        convert_weights=True,
        pretrain_img_size=384,
        embed_dims=128,
        depths=[2, 2, 18, 2],
        num_heads=[4, 8, 16, 32],
        window_size=12,
        drop_path_rate=0.3,
        out_indices=(0, 1, 2, 3),
    )

    neck = ChannelMapper(
        in_channels=[256, 512, 1024],
        out_channels=256,
        num_outs=4,
        kernel_size=1,
        norm="GroupNorm",
        num_groups=32,
        activation=None,
        bias=True,
    )

    depth_fpn = FPN(
        in_channels_list=[128, 256, 512, 1024],
        out_channels=256,
        extra_blocks=None,
        start_index=0,
    )

    depth_head = UniDepthHead(input_dims=[256, 256, 256, 256])

    box_coder = GroundingDINO3DCoder()

    bbox3d_head = GroundingDINO3DHead(box_coder=box_coder)

    roi2det3d = RoI2Det3D(
        nms=True,
        class_agnostic_nms=True,
        max_per_img=max_per_image,
        score_threshold=score_thres,
    )

    return GroundingDINO3D(
        basemodel=basemodel,
        neck=neck,
        bbox3d_head=bbox3d_head,
        roi2det3d=roi2det3d,
        fpn=depth_fpn,
        depth_head=depth_head,
    )

@spaces.GPU
def run_3d_mood(image, text_prompts, score_thres, fx, fy, cx, cy):
    """Run 3D-MOOD demo."""

    gc.collect()

    device = "cuda" if torch.cuda.is_available() else "cpu"

    # Data
    images = image.astype(np.float32)[None, ...]
    intrinsics = np.array([[fx, 0, cx], [0, fy, cy], [0, 0, 1]]).astype(np.float32)

    input_texts = text_prompts.split(".")

    class_id_mapping = {i: txt for i, txt in enumerate(input_texts)}

    data_dict = {
        "images": images,
        "original_images": images,
        "input_hw": (images.shape[1], images.shape[2]),
        "original_hw": (images.shape[1], images.shape[2]),
        "intrinsics": intrinsics,
        "original_intrinsics": intrinsics,
    }
    # Transform
    preprocess_transforms = compose(
        transforms=[
            GenResizeParameters(shape=(800, 1333)),
            ResizeImages(),
            ResizeIntrinsics(),
            NormalizeImages(),
            CenterPadImages(stride=1, shape=(800, 1333), update_input_hw=True),
            CenterPadIntrinsics(),
        ]
    )

    data = preprocess_transforms([data_dict])[0]

    # Convert to Tensor
    to_tensor = ToTensor()
    data = to_tensor([data])[0]

    # Model
    model = get_3d_mood_swin_base(score_thres=score_thres).to(device)

    load_model_checkpoint(
        model,
        weights="https://huggingface.co/RoyYang0714/3D-MOOD/resolve/main/gdino3d_swin-b_120e_omni3d_834c97.pt",
        rev_keys=[(r"^model\.", ""), (r"^module\.", "")],
    )

    model.eval()

    # Run predict
    with torch.no_grad():
        boxes, boxes3d, scores, class_ids, depth_maps, categories = model(
            images=data["images"].to(device),
            input_hw=[data["input_hw"]],
            original_hw=[data["original_hw"]],
            intrinsics=data["intrinsics"].to(device)[None],
            padding=[data["padding"]],
            input_texts=[input_texts],
        )

    # Save the prediction for visualization
    imshow_bboxes3d(
        image=data["original_images"].cpu(),
        boxes3d=[b.cpu() for b in boxes3d],
        intrinsics=data["original_intrinsics"].cpu().numpy(),
        scores=[s.cpu() for s in scores],
        class_ids=[c.cpu() for c in class_ids],
        class_id_mapping=class_id_mapping,
        file_path="./output.png",
        n_colors=len(class_id_mapping),
    )

    output = Image.open("./output.png")
    os.remove("./output.png")

    return output


demo = gr.Blocks()

with demo:
    gr.HTML(
    """
    <h1>3D-MOOD: Lifting 2D to 3D for Monocular Open-Set Object Detection</h1>
    <p><a href="https://github.com/cvg/3D-MOOD">🌟 GitHub Repository</a> | <a href="https://royyang0714.github.io/3D-MOOD">🚀 Project Page</a></p>
    <div style="font-size: 16px; line-height: 1.5;">
        <p>Upload one image, camera parameters and language prompts to run the 3D object detection in the wild!</p>
        <p><strong>PLEASE NOTE: </strong>We are using ZeroGPU thanks to HuggingFace community Grant. However, while running on HuggingFace Space, it will take extra time to load the model for each inference. For faster visualization, please consider using a local machine to run our demo from our GitHub repository.</p>
    </div>
    """
    )
    with gr.Row():
        with gr.Column(scale=1):
            fx = gr.Number(label="fx")
            fy = gr.Number(label="fy")
            cx = gr.Number(label="cx")
            cy = gr.Number(label="cy")
            text_prompts = gr.Textbox(label="Language Prompt")
            score_thres = gr.Number(label="Score Threshold")
            submit_btn = gr.Button("Run 3D-MOOD", scale=1, variant="primary")

        with gr.Column(scale=2):
            input_image = gr.Image(label="Upload Image")

        with gr.Column(scale=2):
            detection_output = gr.Image(label="Detection Results.")

    gr.Examples(
        examples=[["rgb.png", "chair.table", 0.1, 518.8579, 519.4696, 325.58246, 253.73616]],
        inputs=[
            input_image,
            text_prompts,
            score_thres,
            fx,
            fy,
            cx,
            cy,
        ],
        outputs=[detection_output],
        fn=run_3d_mood,
        cache_examples=False,
        examples_per_page=50,
    )

    submit_btn.click(
        fn=run_3d_mood,
        inputs=[
            input_image,
            text_prompts,
            score_thres,
            fx,
            fy,
            cx,
            cy,
        ],
        outputs=[detection_output],
    )

demo.launch()