Llama-3.2-Vision-Free

Runtime error

App Files Files Community

akhaliq HF Staff committed on Sep 26, 2024

Commit

b6ef90b

verified ·

1 Parent(s): d7dd1ce

Update app.py

Browse files

Files changed (1) hide show

app.py +58 -47

app.py CHANGED Viewed

@@ -1,9 +1,11 @@
 import gradio as gr
-from gradio_multimodalchatbot import MultimodalChatbot
-from gradio.data_classes import FileData
 import os
 from together import Together
 import base64
 # Initialize Together client
 client = Together()
@@ -16,61 +18,70 @@ def encode_image(image_path):
     with open(image_path, "rb") as image_file:
         return base64.b64encode(image_file.read()).decode('utf-8')
-def call_llama_vision_api(prompt: str, image_path: str) -> str:
-    getDescriptionPrompt = "You are a UX/UI designer. Describe the attached screenshot or UI mockup in detail. I will feed in the output you give me to a coding model that will attempt to recreate this mockup, so please think step by step and describe the UI in detail. Pay close attention to background color, text color, font size, font family, padding, margin, border, etc. Match the colors and sizes exactly. Make sure to mention every part of the screenshot including any headers, footers, etc. Use the exact text from the screenshot."
-    base64_image = encode_image(image_path)
-    messages = [
-        {
-            "role": "user",
-            "content": [
-                {"type": "text", "text": getDescriptionPrompt + "\n\n" + prompt},
-                {
-                    "type": "image_url",
-                    "image_url": {
-                        "url": f"data:image/jpeg;base64,{base64_image}"
-                    },
-                },
-            ],
-        }
-    ]
     stream = client.chat.completions.create(
         model="meta-llama/Llama-3.2-11B-Vision-Instruct-Turbo",
         messages=messages,
         stream=True,
     )
-    response = ""
     for chunk in stream:
-        content = chunk.choices[0].delta.content or ""
-        response += content
-    return response
-def chat(message, history):
-    user_message = message["text"]
-    files = message.get("files", [])
-    if files and files[0]["file"].path:
-        image_path = files[0]["file"].path
-        response = call_llama_vision_api(user_message, image_path)
-    else:
-        response = "I'm sorry, but I need an image to analyze. Please upload an image along with your question."
-    history.append((message, {"text": response, "files": []}))
-    return history
-with gr.Blocks() as demo:
-    gr.Markdown("# Llama 3.2 Vision Multimodal Chatbot Demo")
-    gr.Markdown("Upload an image and enter your message to analyze using the Llama 3.2 Vision model.")
-    chatbot = MultimodalChatbot(
-        value=[],
-        height=800,
-    )
-    chatbot.submit(chat, [chatbot.messages, chatbot.messages], [chatbot.messages])
 if __name__ == "__main__":
-    demo.launch()

 import gradio as gr
+from PIL import Image
+import requests
 import os
 from together import Together
 import base64
+from threading import Thread
+import time
 # Initialize Together client
 client = Together()
     with open(image_path, "rb") as image_file:
         return base64.b64encode(image_file.read()).decode('utf-8')
+def bot_streaming(message, history, max_new_tokens=250):
+    txt = message["text"]
+    messages = []
+    images = []
+    for i, msg in enumerate(history):
+        if isinstance(msg[0], tuple):
+            messages.append({"role": "user", "content": [{"type": "text", "text": history[i+1][0]}, {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{encode_image(msg[0][0])}"}}]})
+            messages.append({"role": "assistant", "content": [{"type": "text", "text": history[i+1][1]}]})
+        elif isinstance(history[i-1], tuple) and isinstance(msg[0], str):
+            pass
+        elif isinstance(history[i-1][0], str) and isinstance(msg[0], str):
+            messages.append({"role": "user", "content": [{"type": "text", "text": msg[0]}]})
+            messages.append({"role": "assistant", "content": [{"type": "text", "text": msg[1]}]})
+    if len(message["files"]) == 1:
+        if isinstance(message["files"][0], str):  # examples
+            image_path = message["files"][0]
+        else:  # regular input
+            image_path = message["files"][0]["path"]
+        messages.append({"role": "user", "content": [{"type": "text", "text": txt}, {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{encode_image(image_path)}"}}]})
+    else:
+        messages.append({"role": "user", "content": [{"type": "text", "text": txt}]})
     stream = client.chat.completions.create(
         model="meta-llama/Llama-3.2-11B-Vision-Instruct-Turbo",
         messages=messages,
+        max_tokens=max_new_tokens,
         stream=True,
     )
+    buffer = ""
     for chunk in stream:
+        if chunk.choices[0].delta.content is not None:
+            buffer += chunk.choices[0].delta.content
+            time.sleep(0.01)
+            yield buffer
+demo = gr.ChatInterface(
+    fn=bot_streaming,
+    title="Multimodal Llama",
+    examples=[
+        [{"text": "Which era does this piece belong to? Give details about the era.", "files":["./examples/rococo.jpg"]}, 200],
+        [{"text": "Where do the droughts happen according to this diagram?", "files":["./examples/weather_events.png"]}, 250],
+        [{"text": "What happens when you take out white cat from this chain?", "files":["./examples/ai2d_test.jpg"]}, 250],
+        [{"text": "Which company was this invoice addressed to?", "files":["./examples/invoice.png"]}, 250],
+        [{"text": "Where to find this monument? Can you give me other recommendations around the area?", "files":["./examples/wat_arun.jpg"]}, 250],
+    ],
+    textbox=gr.MultimodalTextbox(),
+    additional_inputs=[
+        gr.Slider(
+            minimum=10,
+            maximum=500,
+            value=250,
+            step=10,
+            label="Maximum number of new tokens to generate",
+        )
+    ],
+    cache_examples=False,
+    description="Try Multimodal Llama by Meta with the Together API in this demo. Upload an image, and start chatting about it, or simply try one of the examples below.",
+    stop_btn="Stop Generation",
+    fill_height=True,
+    multimodal=True
+)
 if __name__ == "__main__":
+    demo.launch(debug=True)