smirki committed on
Commit d8d468c · verified · 1 Parent(s): c2cc818

Update app.py

Files changed (1)
  1. app.py +115 -146
app.py CHANGED
@@ -1,17 +1,4 @@
import subprocess
-
- # Minimal essential installs (FlashAttention pinned version, skipping cuda build)
- subprocess.run(
-     "pip install flash-attn==2.7.0.post2 --no-build-isolation",
-     env={"FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
-     shell=True
- )
- subprocess.run("pip install transformers 'accelerate>=0.26.0' gradio==3.30.0", shell=True)
-
- # Optional: This can boost performance on some systems.
- import torch
- torch.backends.cudnn.benchmark = True
-
import os
import re
import logging
@@ -19,103 +6,75 @@ import base64
from threading import Thread
from typing import List

+ import torch
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

- # ----------------------------------------------------------------------
- # 1. Setup Model & Tokenizer
- # ----------------------------------------------------------------------
- model_name = "smirki/UIGEN-T1.1-Qwen-7B" # change as needed
-
+ # Setup logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

+ # Optional: Performance boost
+ torch.backends.cudnn.benchmark = True
+
+ # Model setup
+ model_name = "smirki/UIGEN-T1.1-Qwen-7B"
+
logger.info("Loading model and tokenizer...")
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
-     device_map="auto", # attempts to automatically place the model on GPU
+     device_map="auto",
    trust_remote_code=True,
)
- model.eval() # disable dropout for faster inference
+ model.eval()

tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    trust_remote_code=True
)
- logger.info("Model and tokenizer loaded successfully.")

- # ----------------------------------------------------------------------
- # 2. Two-Phase Prompt Templates
- # ----------------------------------------------------------------------
+ # Prompt templates
s1_inference_prompt_think_only = """<|im_start|>user
{question}<|im_end|>
<|im_start|>assistant
<|im_start|>think
"""

- # ----------------------------------------------------------------------
- # 3. Generation Parameter Setup
- # ----------------------------------------------------------------------
+ # Constants
THINK_MAX_NEW_TOKENS = 2048
ANSWER_MAX_NEW_TOKENS = 2048

def initialize_gen_kwargs():
    return {
-         "max_new_tokens": 512, # default; updated dynamically for think/answer
+         "max_new_tokens": 512,
        "do_sample": True,
        "temperature": 0.7,
        "top_p": 0.9,
        "repetition_penalty": 1.05,
-         # "eos_token_id": model.generation_config.eos_token_id,
        "pad_token_id": tokenizer.pad_token_id,
        "use_cache": True,
-         "streamer": None, # will attach actual streamer at runtime
    }

- # ----------------------------------------------------------------------
- # 4. Helper to submit chat
- # ----------------------------------------------------------------------
- def submit_chat(chatbot, text_input):
-     if not text_input.strip():
-         return chatbot, ""
-     chatbot.append((text_input, ""))
-     logger.info(f"New chat prompt: {text_input}")
-     return chatbot, ""
-
- # ----------------------------------------------------------------------
- # 5. Artifacts Handling
- # ----------------------------------------------------------------------
def extract_html_code_block(text: str) -> str:
-     """
-     Extract the first ```html ... ``` code block (if any).
-     """
    pattern = r"```html\s*(.*?)\s*```"
    match = re.search(pattern, text, re.DOTALL)
-     if match:
-         return match.group(1).strip()
-     return text.strip()
+     return match.group(1).strip() if match else text.strip()

def send_to_sandbox(html_code: str) -> str:
-     """
-     Renders the extracted HTML in an iframe.
-     """
    encoded_html = base64.b64encode(html_code.encode("utf-8")).decode("utf-8")
    data_uri = f"data:text/html;charset=utf-8;base64,{encoded_html}"
    return f'<iframe src="{data_uri}" width="100%" height="920px"></iframe>'

- # ----------------------------------------------------------------------
- # 6. The Two-Phase Streaming Inference
- # ----------------------------------------------------------------------
- def ovis_chat(chatbot: List[List[str]]):
-     """
-     1) Think Phase: produce chain-of-thought (hidden to user).
-     2) Answer Phase: produce final user-facing answer + HTML artifact if present.
-     """
-     # Phase 1: "think" phase
-     last_query = chatbot[-1][0]
-     formatted_think_prompt = s1_inference_prompt_think_only.format(question=last_query)
+ def chat_stream(history: List[List[str]], text: str):
+     if not text.strip():
+         return history
+
+     history.append([text, ""])
+     logger.info(f"New chat prompt: {text}")

+     # Think Phase
+     formatted_think_prompt = s1_inference_prompt_think_only.format(question=text)
    input_ids_think = tokenizer.encode(formatted_think_prompt, return_tensors="pt").to(model.device)
    attention_mask_think = (input_ids_think != tokenizer.pad_token_id).to(model.device)

@@ -127,24 +86,27 @@ def ovis_chat(chatbot: List[List[str]]):
    full_think = ""
    try:
        with torch.inference_mode():
-             thread_think = Thread(
-                 target=lambda: model.generate(input_ids=input_ids_think, attention_mask=attention_mask_think, **gen_kwargs_think)
+             thread = Thread(
+                 target=lambda: model.generate(
+                     input_ids=input_ids_think,
+                     attention_mask=attention_mask_think,
+                     **gen_kwargs_think
+                 )
            )
-             thread_think.start()
-             # Streaming tokens from 'think' phase
+             thread.start()
+
            for new_text in think_streamer:
                full_think += new_text
-                 # We won't log each token to reduce overhead.
-                 # Update partial chain-of-thought display:
-                 chatbot[-1][1] = f"<|im_start|>think\n{full_think.strip()}"
-                 yield chatbot, ""
-             thread_think.join()
+                 history[-1][1] = f"<|im_start|>think\n{full_think.strip()}"
+                 yield history
+             thread.join()
    except Exception as e:
        logger.error(f"Error during think phase: {e}")
-         yield chatbot, f"Error in think phase: {str(e)}"
+         history[-1][1] = f"Error in think phase: {str(e)}"
+         yield history
        return

-     # Phase 2: "answer" phase
+     # Answer Phase
    new_prompt = (
        formatted_think_prompt
        + full_think.strip()
@@ -161,101 +123,92 @@
    full_answer = ""
    try:
        with torch.inference_mode():
-             thread_answer = Thread(
-                 target=lambda: model.generate(input_ids=input_ids_answer, attention_mask=attention_mask_answer, **gen_kwargs_answer)
+             thread = Thread(
+                 target=lambda: model.generate(
+                     input_ids=input_ids_answer,
+                     attention_mask=attention_mask_answer,
+                     **gen_kwargs_answer
+                 )
            )
-             thread_answer.start()
-             # Streaming tokens from 'answer' phase
+             thread.start()
+
            for new_text in answer_streamer:
                full_answer += new_text
-                 # For the UI, display both think + answer
                display_text = (
                    f"<|im_start|>think\n{full_think.strip()}\n\n"
                    f"<|im_start|>answer\n{full_answer.strip()}"
                )
-                 chatbot[-1][1] = display_text
-                 yield chatbot, ""
-             thread_answer.join()
+                 history[-1][1] = display_text
+                 yield history
+             thread.join()
    except Exception as e:
        logger.error(f"Error during answer phase: {e}")
-         yield chatbot, f"Error in answer phase: {str(e)}"
+         history[-1][1] = f"Error in answer phase: {str(e)}"
+         yield history
        return

-     # Finally, parse out any HTML artifact from the final answer
-     html_code = extract_html_code_block(full_answer)
-     sandbox_iframe = send_to_sandbox(html_code)
-     yield chatbot, sandbox_iframe
+ def process_artifact(history: List[List[str]]):
+     if not history or not history[-1][1]:
+         return ""
+     html_code = extract_html_code_block(history[-1][1])
+     return send_to_sandbox(html_code)

- # ----------------------------------------------------------------------
- # 7. Clearing
- # ----------------------------------------------------------------------
def clear_chat():
    return [], "", ""

- # ----------------------------------------------------------------------
- # 8. Gradio UI Setup
- # ----------------------------------------------------------------------
- css_code = """
+ # Gradio UI
+ css = """
.left_header {
-     display: flex;
-     flex-direction: column;
-     justify-content: center;
-     align-items: center;
+     display: flex;
+     flex-direction: column;
+     justify-content: center;
+     align-items: center;
}
.right_panel {
-     margin-top: 16px;
-     border: 1px solid #BFBFC4;
-     border-radius: 8px;
-     overflow: hidden;
+     margin-top: 16px;
+     border: 1px solid #BFBFC4;
+     border-radius: 8px;
+     overflow: hidden;
}
.render_header {
-     height: 30px;
-     width: 100%;
-     padding: 5px 16px;
-     background-color: #f5f5f5;
+     height: 30px;
+     width: 100%;
+     padding: 5px 16px;
+     background-color: #f5f5f5;
}
.header_btn {
-     display: inline-block;
-     height: 10px;
-     width: 10px;
-     border-radius: 50%;
-     margin-right: 4px;
- }
- .render_header > .header_btn:nth-child(1) {
-     background-color: #f5222d;
- }
- .render_header > .header_btn:nth-child(2) {
-     background-color: #faad14;
- }
- .render_header > .header_btn:nth-child(3) {
-     background-color: #52c41a;
+     display: inline-block;
+     height: 10px;
+     width: 10px;
+     border-radius: 50%;
+     margin-right: 4px;
}
+ .render_header > .header_btn:nth-child(1) { background-color: #f5222d; }
+ .render_header > .header_btn:nth-child(2) { background-color: #faad14; }
+ .render_header > .header_btn:nth-child(3) { background-color: #52c41a; }
.right_content {
-     height: 920px;
-     display: flex;
-     flex-direction: column;
-     justify-content: center;
-     align-items: center;
- }
- .html_content {
-     width: 100%;
-     height: 920px;
+     height: 920px;
+     display: flex;
+     flex-direction: column;
+     justify-content: center;
+     align-items: center;
}
+ .html_content { width: 100%; height: 920px; }
"""

- svg_content = """
+ svg_logo = """
<svg width="40" height="40" viewBox="0 0 45 45" fill="none" xmlns="http://www.w3.org/2000/svg">
-     <circle cx="22.5" cy="22.5" r="22.5" fill="#5572F9"/>
-     <path d="M22.5 11.25L26.25 16.875H18.75L22.5 11.25Z" fill="white"/>
-     <path d="M22.5 33.75L26.25 28.125H18.75L22.5 33.75Z" fill="white"/>
-     <path d="M28.125 22.5L22.5 28.125L16.875 22.5L22.5 16.875L28.125 22.5Z" fill="white"/>
+     <circle cx="22.5" cy="22.5" r="22.5" fill="#5572F9"/>
+     <path d="M22.5 11.25L26.25 16.875H18.75L22.5 11.25Z" fill="white"/>
+     <path d="M22.5 33.75L26.25 28.125H18.75L22.5 33.75Z" fill="white"/>
+     <path d="M28.125 22.5L22.5 28.125L16.875 22.5L22.5 16.875L28.125 22.5Z" fill="white"/>
</svg>
"""

- with gr.Blocks(title=model_name.split('/')[-1], css=css_code) as demo:
+ with gr.Blocks(title=model_name.split('/')[-1], css=css) as demo:
    gr.HTML(f"""
    <div class="left_header" style="margin-bottom: 20px;">
-         {svg_content}
+         {svg_logo}
        <h1>{model_name.split('/')[-1]} - Chat + Artifacts</h1>
        <p>(Two-phase chain-of-thought with artifact extraction)</p>
    </div>
@@ -277,6 +230,7 @@ with gr.Blocks(title=model_name.split('/')[-1], css=css_code) as demo:
            with gr.Row():
                submit_btn = gr.Button("Send", variant="primary")
                clear_btn = gr.Button("Clear", variant="secondary")
+
        with gr.Column(scale=6):
            gr.HTML('<div class="render_header"><span class="header_btn"></span><span class="header_btn"></span><span class="header_btn"></span></div>')
            artifact_html = gr.HTML(
@@ -284,23 +238,38 @@ with gr.Blocks(title=model_name.split('/')[-1], css=css_code) as demo:
                elem_classes="html_content"
            )

-     # Button logic
-     submit_btn.click(
-         submit_chat, [chatbot, text_input], [chatbot, text_input]
+     # Event handlers
+     text_input.submit(
+         fn=chat_stream,
+         inputs=[chatbot, text_input],
+         outputs=chatbot
+     ).then(
+         fn=lambda: "",
+         outputs=text_input
    ).then(
-         ovis_chat, [chatbot], [chatbot, artifact_html]
+         fn=process_artifact,
+         inputs=[chatbot],
+         outputs=artifact_html
    )

-     text_input.submit(
-         submit_chat, [chatbot, text_input], [chatbot, text_input]
+     submit_btn.click(
+         fn=chat_stream,
+         inputs=[chatbot, text_input],
+         outputs=chatbot
+     ).then(
+         fn=lambda: "",
+         outputs=text_input
    ).then(
-         ovis_chat, [chatbot], [chatbot, artifact_html]
+         fn=process_artifact,
+         inputs=[chatbot],
+         outputs=artifact_html
    )

    clear_btn.click(
-         clear_chat,
+         fn=clear_chat,
        outputs=[chatbot, text_input, artifact_html]
    )

- logger.info("Launching Gradio demo...")
- demo.queue(default_concurrency_count=1).launch(server_name="0.0.0.0", share=True)
+ if __name__ == "__main__":
+     logger.info("Launching Gradio demo...")
+     demo.queue(concurrency_limit=1).launch(server_name="0.0.0.0", share=True)
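
Note for reviewers: the artifact panel still renders generated HTML through send_to_sandbox, which embeds the markup as a base64 data: URI inside an iframe. A minimal standalone sketch of that technique, for illustration only (the helper name html_to_iframe below is hypothetical and not part of this commit):

```python
import base64

def html_to_iframe(html_code: str) -> str:
    # Encode the HTML so it can be embedded as a data: URI; the browser
    # then renders it as an isolated document inside the iframe.
    encoded = base64.b64encode(html_code.encode("utf-8")).decode("utf-8")
    return (
        f'<iframe src="data:text/html;charset=utf-8;base64,{encoded}" '
        'width="100%" height="920px"></iframe>'
    )

if __name__ == "__main__":
    print(html_to_iframe("<h1>Hello artifact</h1>"))
```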