Spaces:

prithivMLmods
/

Qwen3-VL-Outpost

Running on Zero

App Files Files Community

prithivMLmods committed 4 days ago

Commit

83c8eb3

verified ·

1 Parent(s): 38b4e1b

update app

Browse files

Files changed (1) hide show

app.py +30 -18

app.py CHANGED Viewed

@@ -102,6 +102,24 @@ MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
 # Load Qwen3-VL-4B-Instruct
 MODEL_ID_Q = "Qwen/Qwen3-VL-4B-Instruct"
 processor_q = AutoProcessor.from_pretrained(MODEL_ID_Q, trust_remote_code=True)
@@ -120,15 +138,6 @@ model_y = Qwen3VLForConditionalGeneration.from_pretrained(
     torch_dtype=torch.float16
 ).to(device).eval()
-# Load Qwen3-VL-2B-Instruct
-MODEL_ID_L = "Qwen/Qwen3-VL-2B-Instruct"
-processor_l = AutoProcessor.from_pretrained(MODEL_ID_L, trust_remote_code=True)
-model_l = Qwen3VLForConditionalGeneration.from_pretrained(
-    MODEL_ID_L,
-    trust_remote_code=True,
-    torch_dtype=torch.float16
-).to(device).eval()
 def downsample_video(video_path):
     """
     Downsamples the video to evenly spaced frames.
@@ -161,13 +170,14 @@ def generate_image(model_name: str, text: str, image: Image.Image,
     """
     Generates responses using the selected model for image input.
     """
-    if model_name == "Qwen3-VL-4B-Instruct":
         processor, model = processor_q, model_q
     elif model_name == "Qwen3-VL-8B-Instruct":
         processor, model = processor_y, model_y
-    elif model_name == "Qwen3-VL-2B-Instruct":
-        processor, model = processor_l, model_l
     else:
         yield "Invalid model selected.", "Invalid model selected."
         return
@@ -200,12 +210,14 @@ def generate_video(model_name: str, text: str, video_path: str,
     """
     Generates responses using the selected model for video input.
     """
-    if model_name == "Qwen3-VL-4B-Instruct":
         processor, model = processor_q, model_q
     elif model_name == "Qwen3-VL-8B-Instruct":
         processor, model = processor_y, model_y
-    elif model_name == "Qwen3-VL-2B-Instruct":
-        processor, model = processor_l, model_l
     else:
         yield "Invalid model selected.", "Invalid model selected."
         return
@@ -295,12 +307,12 @@ with gr.Blocks(css=css, theme=steel_blue_theme) as demo:
         with gr.Column(scale=3):
             gr.Markdown("## Output", elem_id="output-title")
-            output = gr.Textbox(label="Raw Output Stream", interactive=False, lines=11, show_copy_button=True)
             with gr.Accordion("(Result.md)", open=False):
                 markdown_output = gr.Markdown()
             model_choice = gr.Radio(
-                choices=["Qwen3-VL-4B-Instruct", "Qwen3-VL-2B-Instruct", "Qwen3-VL-8B-Instruct"],
                 label="Select Model",
                 value="Qwen3-VL-4B-Instruct"
             )

 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+# Load Qwen2.5-VL-7B-Instruct
+MODEL_ID_M = "Qwen/Qwen2.5-VL-7B-Instruct"
+processor_m = AutoProcessor.from_pretrained(MODEL_ID_M, trust_remote_code=True)
+model_m = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+    MODEL_ID_M,
+    trust_remote_code=True,
+    torch_dtype=torch.float16
+).to(device).eval()
+# Load Qwen2.5-VL-3B-Instruct
+MODEL_ID_X = "Qwen/Qwen2.5-VL-3B-Instruct"
+processor_x = AutoProcessor.from_pretrained(MODEL_ID_X, trust_remote_code=True)
+model_x = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+    MODEL_ID_X,
+    trust_remote_code=True,
+    torch_dtype=torch.float16
+).to(device).eval()
 # Load Qwen3-VL-4B-Instruct
 MODEL_ID_Q = "Qwen/Qwen3-VL-4B-Instruct"
 processor_q = AutoProcessor.from_pretrained(MODEL_ID_Q, trust_remote_code=True)
     torch_dtype=torch.float16
 ).to(device).eval()
 def downsample_video(video_path):
     """
     Downsamples the video to evenly spaced frames.
     """
     Generates responses using the selected model for image input.
     """
+    if model_name == "Qwen2.5-VL-7B-Instruct":
+        processor, model = processor_m, model_m
+    elif model_name == "Qwen2.5-VL-3B-Instruct":
+        processor, model = processor_x, model_x
+    elif model_name == "Qwen3-VL-4B-Instruct":
         processor, model = processor_q, model_q
     elif model_name == "Qwen3-VL-8B-Instruct":
         processor, model = processor_y, model_y
     else:
         yield "Invalid model selected.", "Invalid model selected."
         return
     """
     Generates responses using the selected model for video input.
     """
+    if model_name == "Qwen2.5-VL-7B-Instruct":
+        processor, model = processor_m, model_m
+    elif model_name == "Qwen2.5-VL-3B-Instruct":
+        processor, model = processor_x, model_x
+    elif model_name == "Qwen3-VL-4B-Instruct":
         processor, model = processor_q, model_q
     elif model_name == "Qwen3-VL-8B-Instruct":
         processor, model = processor_y, model_y
     else:
         yield "Invalid model selected.", "Invalid model selected."
         return
         with gr.Column(scale=3):
             gr.Markdown("## Output", elem_id="output-title")
+            output = gr.Textbox(label="Raw Output Stream", interactive=False, lines=14, show_copy_button=True)
             with gr.Accordion("(Result.md)", open=False):
                 markdown_output = gr.Markdown()
             model_choice = gr.Radio(
+                choices=["Qwen3-VL-4B-Instruct", "Qwen3-VL-8B-Instruct", "Qwen2.5-VL-3B-Instruct", "Qwen2.5-VL-7B-Instruct"],
                 label="Select Model",
                 value="Qwen3-VL-4B-Instruct"
             )