Spaces:

prithivMLmods
/

Qwen3-VL-Outpost

Running on Zero

App Files Files Community

prithivMLmods committed on Mar 23

Commit

17b06a6

verified ·

1 Parent(s): bb456cd

Update app.py

Browse files

Files changed (1) hide show

app.py +91 -13

app.py CHANGED Viewed

@@ -1,5 +1,11 @@
 import gradio as gr
-from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration, TextIteratorStreamer
 from transformers.image_utils import load_image
 from threading import Thread
 import time
@@ -9,6 +15,12 @@ import cv2
 import numpy as np
 from PIL import Image
 def progress_bar_html(label: str) -> str:
     """
     Returns an HTML snippet for a thin progress bar with a label.
@@ -29,6 +41,9 @@ def progress_bar_html(label: str) -> str:
 </style>
     '''
 def downsample_video(video_path):
     """
     Downsamples the video to 10 evenly spaced frames.
@@ -54,19 +69,40 @@ def downsample_video(video_path):
     vidcap.release()
     return frames
-MODEL_ID = "Qwen/Qwen2.5-VL-7B-Instruct"  # Alternatively: "Qwen/Qwen2.5-VL-3B-Instruct"
-processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
-model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
-    MODEL_ID,
     trust_remote_code=True,
     torch_dtype=torch.bfloat16
 ).to("cuda").eval()
 @spaces.GPU
 def model_inference(input_dict, history):
     text = input_dict["text"]
-    files = input_dict["files"]
     if text.strip().lower().startswith("@video-infer"):
         # Remove the tag from the query.
         text = text[len("@video-infer"):].strip()
@@ -103,7 +139,7 @@ def model_inference(input_dict, history):
         # Set up streaming generation.
         streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
         generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=1024)
-        thread = Thread(target=model.generate, kwargs=generation_kwargs)
         thread.start()
         buffer = ""
         yield progress_bar_html("Processing video with Qwen2.5VL Model")
@@ -113,6 +149,46 @@ def model_inference(input_dict, history):
             yield buffer
         return
     if len(files) > 1:
         images = [load_image(image) for image in files]
     elif len(files) == 1:
@@ -120,9 +196,6 @@ def model_inference(input_dict, history):
     else:
         images = []
-    if text == "" and not images:
-        gr.Error("Please input a query and optionally image(s).")
-        return
     if text == "" and images:
         gr.Error("Please input a text query along with the image(s).")
         return
@@ -145,7 +218,7 @@ def model_inference(input_dict, history):
     ).to("cuda")
     streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
     generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=1024)
-    thread = Thread(target=model.generate, kwargs=generation_kwargs)
     thread.start()
     buffer = ""
     yield progress_bar_html("Processing with Qwen2.5VL Model")
@@ -154,11 +227,15 @@ def model_inference(input_dict, history):
         time.sleep(0.01)
         yield buffer
 examples = [
     [{"text": "Describe the Image?", "files": ["example_images/document.jpg"]}],
     [{"text": "@video-infer Explain the content of the Advertisement", "files": ["example_images/videoplayback.mp4"]}],
     [{"text": "@video-infer Explain the content of the video in detail", "files": ["example_images/breakfast.mp4"]}],
-    [{"text": "@video-infer Explain the content of the video.", "files": ["example_images/sky.mp4"]}],
 ]
 demo = gr.ChatInterface(
@@ -172,4 +249,5 @@ demo = gr.ChatInterface(
     cache_examples=False,
 )
-demo.launch(debug=True)

 import gradio as gr
+from transformers import (
+    AutoProcessor,
+    Qwen2_5_VLForConditionalGeneration,
+    TextIteratorStreamer,
+    AutoModelForCausalLM,
+    AutoTokenizer,
+)
 from transformers.image_utils import load_image
 from threading import Thread
 import time
 import numpy as np
 from PIL import Image
+# A constant for token length limit
+MAX_INPUT_TOKEN_LENGTH = 4096
+# -----------------------
+# Progress Bar Helper
+# -----------------------
 def progress_bar_html(label: str) -> str:
     """
     Returns an HTML snippet for a thin progress bar with a label.
 </style>
     '''
+# -----------------------
+# Video Downsampling Helper
+# -----------------------
 def downsample_video(video_path):
     """
     Downsamples the video to 10 evenly spaced frames.
     vidcap.release()
     return frames
+# -----------------------
+# Qwen2.5-VL Multimodal Setup
+# -----------------------
+MODEL_ID_QWEN = "Qwen/Qwen2.5-VL-7B-Instruct"  # Alternatively: "Qwen/Qwen2.5-VL-3B-Instruct"
+processor = AutoProcessor.from_pretrained(MODEL_ID_QWEN, trust_remote_code=True)
+qwen_model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+    MODEL_ID_QWEN,
     trust_remote_code=True,
     torch_dtype=torch.bfloat16
 ).to("cuda").eval()
+# -----------------------
+# DeepHermes Text Generation Setup
+# -----------------------
+text_model_id = "prithivMLmods/DeepHermes-3-Llama-3-3B-Preview-abliterated"
+text_tokenizer = AutoTokenizer.from_pretrained(text_model_id)
+text_model = AutoModelForCausalLM.from_pretrained(
+    text_model_id,
+    device_map="auto",
+    torch_dtype=torch.bfloat16,
+)
+text_model.eval()
+# -----------------------
+# Main Inference Function
+# -----------------------
 @spaces.GPU
 def model_inference(input_dict, history):
     text = input_dict["text"]
+    files = input_dict.get("files", [])
+    # -----------------------
+    # Video Inference Branch
+    # -----------------------
     if text.strip().lower().startswith("@video-infer"):
         # Remove the tag from the query.
         text = text[len("@video-infer"):].strip()
         # Set up streaming generation.
         streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
         generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=1024)
+        thread = Thread(target=qwen_model.generate, kwargs=generation_kwargs)
         thread.start()
         buffer = ""
         yield progress_bar_html("Processing video with Qwen2.5VL Model")
             yield buffer
         return
+    # -----------------------
+    # Text-Only Inference Branch (using DeepHermes text generation)
+    # -----------------------
+    if not files:
+        # Prepare a simple conversation for text-only input.
+        conversation = [{"role": "user", "content": text}]
+        # Here we use the text tokenizer’s chat template method.
+        input_ids = text_tokenizer.apply_chat_template(
+            conversation, add_generation_prompt=True, return_tensors="pt"
+        )
+        # Trim if necessary.
+        if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
+            input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
+            gr.Warning(f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")
+        input_ids = input_ids.to(text_model.device)
+        streamer = TextIteratorStreamer(text_tokenizer, skip_prompt=True, skip_special_tokens=True)
+        generation_kwargs = {
+            "input_ids": input_ids,
+            "streamer": streamer,
+            "max_new_tokens": 1024,
+            "do_sample": True,
+            "top_p": 0.9,
+            "top_k": 50,
+            "temperature": 0.6,
+            "num_beams": 1,
+            "repetition_penalty": 1.2,
+        }
+        thread = Thread(target=text_model.generate, kwargs=generation_kwargs)
+        thread.start()
+        buffer = ""
+        yield progress_bar_html("Processing with DeepHermes Text Generation Model")
+        for new_text in streamer:
+            buffer += new_text
+            time.sleep(0.01)
+            yield buffer
+        return
+    # -----------------------
+    # Multimodal (Image) Inference Branch with Qwen2.5-VL
+    # -----------------------
     if len(files) > 1:
         images = [load_image(image) for image in files]
     elif len(files) == 1:
     else:
         images = []
     if text == "" and images:
         gr.Error("Please input a text query along with the image(s).")
         return
     ).to("cuda")
     streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
     generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=1024)
+    thread = Thread(target=qwen_model.generate, kwargs=generation_kwargs)
     thread.start()
     buffer = ""
     yield progress_bar_html("Processing with Qwen2.5VL Model")
         time.sleep(0.01)
         yield buffer
+# -----------------------
+# Gradio Chat Interface
+# -----------------------
 examples = [
     [{"text": "Describe the Image?", "files": ["example_images/document.jpg"]}],
+    [{"text": "Tell me a story about a brave knight in a faraway kingdom."}],
     [{"text": "@video-infer Explain the content of the Advertisement", "files": ["example_images/videoplayback.mp4"]}],
     [{"text": "@video-infer Explain the content of the video in detail", "files": ["example_images/breakfast.mp4"]}],
 ]
 demo = gr.ChatInterface(
     cache_examples=False,
 )
+if __name__ == "__main__":
+    demo.launch(debug=True)