import gradio as gr
from PIL import Image
from transformers import Qwen3VLForConditionalGeneration, AutoProcessor

# --- Configuration ---
MODEL_PATH = "Qwen/Qwen3-VL-2B-Instruct"

# --- Model and Processor Loading ---
print("Loading model and processor... This will take a few minutes on a CPU.")
processor = AutoProcessor.from_pretrained(MODEL_PATH, trust_remote_code=True)
model = Qwen3VLForConditionalGeneration.from_pretrained(
    MODEL_PATH,
    trust_remote_code=True,
    dtype="auto",       # Let transformers pick an appropriate dtype
    device_map="auto",  # Places the model on whatever device is available (CPU here)
)
print("Model and processor loaded successfully.")


# --- Inference Function ---
def process_and_generate(image_input, text_prompt):
    """Process the image and text prompt and generate a response from the model."""
    if image_input is None or not text_prompt.strip():
        return "Please provide both an image and a text prompt."

    # Convert Gradio's numpy array to a PIL Image
    pil_image = Image.fromarray(image_input)

    # Prepare the chat-style messages payload the processor expects
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": pil_image},
                {"type": "text", "text": text_prompt},
            ],
        }
    ]

    print("Processing inputs and generating response... This will be slow.")
    try:
        # Tokenize the prompt and image into model inputs
        inputs = processor.apply_chat_template(
            messages,
            tokenize=True,
            add_generation_prompt=True,
            return_dict=True,
            return_tensors="pt",
        )
        inputs = inputs.to(model.device)

        # Generate the output sequence
        generated_ids = model.generate(**inputs, max_new_tokens=1024)

        # Keep only the newly generated tokens by trimming off the prompt
        generated_ids_trimmed = [
            out_ids[len(in_ids):]
            for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
        ]

        # Decode the trimmed IDs to text
        output_text = processor.batch_decode(
            generated_ids_trimmed,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False,
        )

        # batch_decode returns a list; return the first (and only) element
        return output_text[0]

    except Exception as e:
        return f"An error occurred during generation: {e}"


# --- Gradio Interface ---
with gr.Blocks() as demo:
    gr.Markdown(
        """
        # Qwen3-VL-2B-Instruct CPU Demo
        This Space runs the `Qwen/Qwen3-VL-2B-Instruct` model using the standard `transformers` library.
        **Warning:** Running this on a free CPU Space is **very slow**.
        Duplicate this Space to get a private instance without queueing.
        """
    )
    with gr.Row():
        with gr.Column():
            image_input = gr.Image(type="numpy", label="Upload Image")
            text_prompt = gr.Textbox(label="Prompt", placeholder="e.g., Describe this image in detail.")
            submit_button = gr.Button("Generate Response")
        with gr.Column():
            output_text = gr.Textbox(label="Model Output", lines=10, interactive=False)

    submit_button.click(
        fn=process_and_generate,
        inputs=[image_input, text_prompt],
        outputs=output_text,
    )

if __name__ == "__main__":
    demo.launch()
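

# --- Optional helper: loading an image from a URL (illustrative sketch) ---
# The original script imported `requests` and `io.BytesIO` without using them,
# which suggests URL-based image input was intended; a minimal helper along
# these lines could supply it. `load_image_from_url` is a hypothetical name,
# not part of the app above.
#
# import requests
# from io import BytesIO
#
# def load_image_from_url(url: str) -> Image.Image:
#     response = requests.get(url, timeout=30)
#     response.raise_for_status()  # Fail loudly on HTTP errors
#     return Image.open(BytesIO(response.content)).convert("RGB")
#
# Usage: pass the returned PIL image directly in the messages payload,
# e.g. {"type": "image", "image": load_image_from_url("https://...")}.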
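

# --- Optional extension: token streaming (illustrative sketch, not wired in) ---
# On a CPU a full generation can take minutes, so streaming partial output as
# it is produced improves the experience. One way to do this is transformers'
# TextIteratorStreamer combined with a Gradio generator function. The function
# name `process_and_stream` is hypothetical; to try it, bind it to the button
# in place of `process_and_generate`.
#
# from threading import Thread
# from transformers import TextIteratorStreamer
#
# def process_and_stream(image_input, text_prompt):
#     if image_input is None or not text_prompt.strip():
#         yield "Please provide both an image and a text prompt."
#         return
#     messages = [{
#         "role": "user",
#         "content": [
#             {"type": "image", "image": Image.fromarray(image_input)},
#             {"type": "text", "text": text_prompt},
#         ],
#     }]
#     inputs = processor.apply_chat_template(
#         messages, tokenize=True, add_generation_prompt=True,
#         return_dict=True, return_tensors="pt",
#     ).to(model.device)
#     streamer = TextIteratorStreamer(
#         processor.tokenizer, skip_prompt=True, skip_special_tokens=True
#     )
#     # generate() blocks, so run it in a background thread and consume the
#     # streamer here, yielding the growing text back to Gradio.
#     thread = Thread(
#         target=model.generate,
#         kwargs=dict(**inputs, streamer=streamer, max_new_tokens=1024),
#     )
#     thread.start()
#     partial = ""
#     for chunk in streamer:
#         partial += chunk
#         yield partial
#     thread.join()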