llava-onevision

Runtime error

App Files Files Community

merve HF Staff committed on Jul 17, 2024

Commit

99ad72b

verified ·

1 Parent(s): 9bbf94b

Update app.py

Browse files

Files changed (1) hide show

app.py +58 -26

app.py CHANGED Viewed

@@ -6,15 +6,20 @@ import time
 from PIL import Image
 import torch
 import cv2
-import spaces
-model_id = "llava-hf/llava-interleave-qwen-7b-hf"
 processor = LlavaProcessor.from_pretrained(model_id)
 model = LlavaForConditionalGeneration.from_pretrained(model_id, torch_dtype=torch.float16)
 model.to("cuda")
-def sample_frames(video_file, num_frames) :
     video = cv2.VideoCapture(video_file)
     total_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
     interval = total_frames // num_frames
@@ -31,9 +36,16 @@ def sample_frames(video_file, num_frames) :
 @spaces.GPU
 def bot_streaming(message, history):
-  if message["files"]:
-    image = message["files"][-1]
   else:
     # if there's no image uploaded for this turn, look for images in the past turns
     # kept inside tuples, take the last one
@@ -41,28 +53,44 @@ def bot_streaming(message, history):
       if type(hist[0])==tuple:
         image = hist[0][0]
-  txt = message["text"]
-  img = message["files"]
-  ext_buffer =f"'user\ntext': '{txt}', 'files': '{img}' assistant"
-  if image is None:
       gr.Error("You need to upload an image or video for LLaVA to work.")
   video_extensions = ("avi", "mp4", "mov", "mkv", "flv", "wmv", "mjpeg")
   image_extensions = Image.registered_extensions()
   image_extensions = tuple([ex for ex, f in image_extensions.items()])
-  if image.endswith(video_extensions):
-      image = sample_frames(image, 12)
-      image_tokens = "<image>" * 13
-      prompt = f"<|im_start|>user {image_tokens}\n{message}<|im_end|><|im_start|>assistant"
-  elif image.endswith(image_extensions):
-      image = Image.open(image).convert("RGB")
-      prompt = f"<|im_start|>user <image>\n{message}<|im_end|><|im_start|>assistant"
   inputs = processor(prompt, image, return_tensors="pt").to("cuda", torch.float16)
-  streamer = TextIteratorStreamer(processor, **{"skip_special_tokens": True})
   generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=100)
   generated_text = ""
@@ -75,15 +103,19 @@ def bot_streaming(message, history):
   for new_text in streamer:
     buffer += new_text
-    print(buffer)
     generated_text_without_prompt = buffer[len(ext_buffer):]
     time.sleep(0.01)
     yield generated_text_without_prompt
-demo = gr.ChatInterface(fn=bot_streaming, title="LLaVA Interleave", examples=[{"text": "What is on the flower?", "files":["./bee.jpg"]},
-                                                                      {"text": "How to make this pastry?", "files":["./baklava.png"]},
-                                                                      {"text": "What type of cats are these?", "files":["./cats.mp4"]}],
-                        description="Try [LLaVA Interleave](https://huggingface.co/docs/transformers/main/en/model_doc/llava) in this demo (more specifically, the [Qwen-1.5-7B variant](https://huggingface.co/llava-hf/llava-interleave-qwen-7b-hf)). Upload an image or a video, and start chatting about it, or simply try one of the examples below. If you don't upload an image, you will receive an error.",
-                        stop_btn="Stop Generation", multimodal=True)
 demo.launch(debug=True)

 from PIL import Image
 import torch
 import cv2
+import spaces
+model_id = "llava-hf/llava-interleave-qwen-0.5b-hf"
 processor = LlavaProcessor.from_pretrained(model_id)
 model = LlavaForConditionalGeneration.from_pretrained(model_id, torch_dtype=torch.float16)
 model.to("cuda")
+def replace_video_with_images(text, frames):
+  return text.replace("<video>", "<image>" * frames)
+def sample_frames(video_file, num_frames):
     video = cv2.VideoCapture(video_file)
     total_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
     interval = total_frames // num_frames
 @spaces.GPU
 def bot_streaming(message, history):
+  txt = message.text
+  ext_buffer = f"user\n{txt} assistant"
+  if message.files:
+    if len(message.files) == 1:
+      image = [message.files[0].path]
+    # interleaved images or video
+    elif len(message.files) > 1:
+      image = [msg.path for msg in message.files]
   else:
     # if there's no image uploaded for this turn, look for images in the past turns
     # kept inside tuples, take the last one
       if type(hist[0])==tuple:
         image = hist[0][0]
+  if message.files is None:
       gr.Error("You need to upload an image or video for LLaVA to work.")
   video_extensions = ("avi", "mp4", "mov", "mkv", "flv", "wmv", "mjpeg")
   image_extensions = Image.registered_extensions()
   image_extensions = tuple([ex for ex, f in image_extensions.items()])
+  if len(image) == 1:
+    if image[0].endswith(video_extensions):
+        image = sample_frames(image[0], 12)
+        image_tokens = "<image>" * 13
+        prompt = f"<|im_start|>user {image_tokens}\n{message.text}<|im_end|><|im_start|>assistant"
+    elif image[0].endswith(image_extensions):
+        image = Image.open(image[0]).convert("RGB")
+        prompt = f"<|im_start|>user <image>\n{message.text}<|im_end|><|im_start|>assistant"
+  elif len(image) > 1:
+    image_list = []
+    user_prompt = message.text
+    for img in image:
+      if img.endswith(image_extensions):
+        img = Image.open(img).convert("RGB")
+        image_list.append(img)
+      elif img.endswith(video_extensions):
+        frames = sample_frames(img, 6)
+        for frame in frames:
+          image_list.append(frame)
+    toks = "<image>" * len(image_list)
+    prompt = "<|im_start|>user"+ toks + f"\n{user_prompt}<|im_end|><|im_start|>assistant"
+    image = image_list
   inputs = processor(prompt, image, return_tensors="pt").to("cuda", torch.float16)
+  streamer = TextIteratorStreamer(processor, **{"max_new_tokens": 200, "skip_special_tokens": True})
   generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=100)
   generated_text = ""
   for new_text in streamer:
     buffer += new_text
     generated_text_without_prompt = buffer[len(ext_buffer):]
     time.sleep(0.01)
     yield generated_text_without_prompt
+demo = gr.ChatInterface(fn=bot_streaming, title="LLaVA Interleave", examples=[
+     {"text": "What are these cats doing?", "files":["./cats.mp4"]},
+     {"text": "The input contains two videos, are the cats in this video and this video doing the same thing?", "files":["./cats_1.mp4", "./cats_2.mp4"]},
+     {"text": "What is on the flower?", "files":["./bee.jpg"]},
+     {"text": "There are two images in the input. What is the relationship between this image and this image?", "files":["./bee.jpg", "./depth-bee.png"]},
+    {"text": "How to make this pastry?", "files":["./baklava.png"]}],
+      textbox=gr.MultimodalTextbox(file_count="multiple"),
+      description="Try [LLaVA Interleave](https://huggingface.co/docs/transformers/main/en/model_doc/llava) in this demo (more specifically, the [Qwen-1.5-7B variant](https://huggingface.co/llava-hf/llava-interleave-qwen-7b-hf)). Upload an image or a video, and start chatting about it, or simply try one of the examples below. If you don't upload an image, you will receive an error. ",
+      stop_btn="Stop Generation", multimodal=True)
 demo.launch(debug=True)