smirki committed
Commit 46823cb · verified · 1 Parent(s): 91b3f5a

Update app.py

Files changed (1)
  app.py  +84 -63
app.py CHANGED
@@ -1,47 +1,57 @@
 import subprocess
-subprocess.run(
-    'pip install flash-attn==2.7.0.post2 --no-build-isolation',
-    env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"},
-    shell=True
-)
-subprocess.run(
-    'pip install transformers',
-    shell=True
-)
-
-import spaces
 import os
 import re
 import logging
+import base64
 from typing import List
 from threading import Thread
-import base64

 import torch
 import gradio as gr
 from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

+# Install packages (if needed). Adjust or remove if your environment already has them.
+subprocess.run(
+    ["pip", "install", "flash-attn==2.7.0.post2", "--no-build-isolation"]
+)
+subprocess.run(["pip", "install", "transformers"])
+
+# Optional: set up CUDA-specific environment vars if you need them
+# os.environ["FLASH_ATTENTION_SKIP_CUDA_BUILD"] = "TRUE"
+
 # ----------------------------------------------------------------------
-# 1. Setup Model & Tokenizer
+# 1. Setup Logging
 # ----------------------------------------------------------------------
-model_name = 'smirki/UIGEN-T1.1-Qwen-7B' # Change as needed
-use_thread = True # Generation happens in a background thread
-
 logger = logging.getLogger(__name__)
-logging.getLogger("httpx").setLevel(logging.WARNING)
 logging.basicConfig(level=logging.INFO)
+logging.getLogger("httpx").setLevel(logging.WARNING)
+
+# ----------------------------------------------------------------------
+# 2. Model & Tokenizer Initialization
+# ----------------------------------------------------------------------
+model_name = "smirki/UIGEN-T1.1-Qwen-7B" # adjust as needed

-logger.info("Loading model and tokenizer...")
+logger.info("Loading model & tokenizer...")
 model = AutoModelForCausalLM.from_pretrained(
     model_name,
-    torch_dtype=torch.bfloat16,
+    device_map="auto", # auto-shard across available GPU(s)
+    torch_dtype=torch.bfloat16, # or torch.float16, depending on your hardware
     trust_remote_code=True
-).to("cuda")
+)
 tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+
+# Optional speed-up via torch.compile (requires PyTorch ≥ 2.0).
+# Comment out if you run into any compatibility issues.
+try:
+    model = torch.compile(model)
+    logger.info("Model compiled with torch.compile for potential speed-up.")
+except Exception as e:
+    logger.warning(f"Could not compile model: {e}")
+
 logger.info("Model and tokenizer loaded successfully.")

 # ----------------------------------------------------------------------
-# 2. Two-Phase Prompt Templates
+# 3. Two-Phase Prompt Templates
 # ----------------------------------------------------------------------
 s1_inference_prompt_think_only = """<|im_start|>user
 {question}<|im_end|>
@@ -49,29 +59,27 @@ s1_inference_prompt_think_only = """<|im_start|>user
 <|im_start|>think
 """

-# ----------------------------------------------------------------------
-# 3. Generation Parameter Setup
-# ----------------------------------------------------------------------
 THINK_MAX_NEW_TOKENS = 12000
 ANSWER_MAX_NEW_TOKENS = 12000

 def initialize_gen_kwargs():
+    """Common generation parameters for both phases; tweak as necessary."""
     return {
-        "max_new_tokens": 1024, # default; will be overwritten per phase
+        "max_new_tokens": 1024, # will be updated for each phase
         "do_sample": True,
         "temperature": 0.7,
         "top_p": 0.9,
         "repetition_penalty": 1.05,
-        # "eos_token_id": model.generation_config.eos_token_id, # Removed to avoid premature stopping
         "pad_token_id": tokenizer.pad_token_id,
         "use_cache": True,
-        "streamer": None # dynamically added
+        "streamer": None # will be replaced with TextIteratorStreamer
     }

 # ----------------------------------------------------------------------
 # 4. Helper to submit chat
 # ----------------------------------------------------------------------
 def submit_chat(chatbot, text_input):
+    """Adds the user query to the Chatbot list, clearing the textbox."""
     if not text_input.strip():
         return chatbot, ""
     response = ""
@@ -83,6 +91,10 @@ def submit_chat(chatbot, text_input):
 # 5. Artifacts Handling
 # ----------------------------------------------------------------------
 def extract_html_code_block(text: str) -> str:
+    """
+    Extracts the first ```html ... ``` block from the model's answer.
+    If none found, returns the entire text stripped.
+    """
     pattern = r'```html\s*(.*?)\s*```'
     match = re.search(pattern, text, re.DOTALL)
     if match:
@@ -93,6 +105,10 @@ def extract_html_code_block(text: str) -> str:
     return text.strip()

 def send_to_sandbox(html_code: str) -> str:
+    """
+    Converts HTML code into a base64-encoded Data URI embedded in an iframe,
+    which can be displayed in Gradio’s HTML component.
+    """
     encoded_html = base64.b64encode(html_code.encode('utf-8')).decode('utf-8')
     data_uri = f"data:text/html;charset=utf-8;base64,{encoded_html}"
     return f'<iframe src="{data_uri}" width="100%" height="920px"></iframe>'
@@ -100,18 +116,27 @@ def send_to_sandbox(html_code: str) -> str:
 # ----------------------------------------------------------------------
 # 6. The Two-Phase Streaming Inference
 # ----------------------------------------------------------------------
-@spaces.GPU
 def ovis_chat(chatbot: List[List[str]]):
+    """
+    Main two-phase pipeline:
+      1) "Think" phase (hidden chain-of-thought)
+      2) "Answer" phase
+    Yields intermediate partial results for real-time streaming in Gradio.
+    """
     logger.info("Starting two-phase generation...")
-    # Phase 1: "think" phase
-    last_query = chatbot[-1][0]
+
+    # -- Phase 1: "think" --
+    last_query = chatbot[-1][0] # latest user query
     formatted_think_prompt = s1_inference_prompt_think_only.format(question=last_query)
-    logger.info("Formatted think prompt.")
-
-    input_ids_think = tokenizer.encode(formatted_think_prompt, return_tensors="pt").to(model.device)
-    attention_mask_think = torch.ne(input_ids_think, tokenizer.pad_token_id).to(model.device)
+
+    # Prepare input
+    input_ids_think = tokenizer.encode(formatted_think_prompt, return_tensors="pt")
+    attention_mask_think = torch.ne(input_ids_think, tokenizer.pad_token_id)
+    # Move to correct device automatically if using device_map="auto"
+    # or if single GPU, you can do e.g. input_ids_think = input_ids_think.cuda()
     think_inputs = {"input_ids": input_ids_think, "attention_mask": attention_mask_think}
-
+
+    # Generation params
     gen_kwargs_think = initialize_gen_kwargs()
     gen_kwargs_think["max_new_tokens"] = THINK_MAX_NEW_TOKENS
     think_streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
@@ -120,27 +145,30 @@ def ovis_chat(chatbot: List[List[str]]):
     full_think = ""
     try:
         with torch.inference_mode():
-            logger.info("Starting think phase generation thread...")
             thread_think = Thread(target=lambda: model.generate(**think_inputs, **gen_kwargs_think))
             thread_think.start()
+            # Stream partial tokens as they arrive
             for new_text in think_streamer:
                 full_think += new_text
-                logger.info(f"Think phase token: {new_text.strip()}")
+                # If you don’t need every single token logged, skip or reduce:
+                # logger.debug(f"Think token: {new_text.strip()}")
+
+                # Show partial chain-of-thought in the Chatbot’s assistant window
                 display_text = f"<|im_start|>think\n{full_think.strip()}"
                 chatbot[-1][1] = display_text
                 yield chatbot, ""
         thread_think.join()
-        logger.info("Think phase completed.")
     except Exception as e:
         logger.error("Error during think phase: " + str(e))
         yield chatbot, f"Error in think phase: {str(e)}"
         return
+    logger.info("Think phase completed.")

-    # Phase 2: "answer" phase
+    # -- Phase 2: "answer" --
     new_prompt = formatted_think_prompt + full_think.strip() + "\n<|im_start|>answer\n"
-    logger.info("Constructed prompt for answer phase.")
-    input_ids_answer = tokenizer.encode(new_prompt, return_tensors="pt").to(model.device)
-    attention_mask_answer = torch.ne(input_ids_answer, tokenizer.pad_token_id).to(model.device)
+
+    input_ids_answer = tokenizer.encode(new_prompt, return_tensors="pt")
+    attention_mask_answer = torch.ne(input_ids_answer, tokenizer.pad_token_id)
     answer_inputs = {"input_ids": input_ids_answer, "attention_mask": attention_mask_answer}

     gen_kwargs_answer = initialize_gen_kwargs()
@@ -151,12 +179,12 @@ def ovis_chat(chatbot: List[List[str]]):
     full_answer = ""
     try:
         with torch.inference_mode():
-            logger.info("Starting answer phase generation thread...")
             thread_answer = Thread(target=lambda: model.generate(**answer_inputs, **gen_kwargs_answer))
             thread_answer.start()
             for new_text in answer_streamer:
                 full_answer += new_text
-                logger.info(f"Answer phase token: {new_text.strip()}")
+                # logger.debug(f"Answer token: {new_text.strip()}")
+
                 display_text = (
                     f"<|im_start|>think\n{full_think.strip()}\n\n"
                     f"<|im_start|>answer\n{full_answer.strip()}"
@@ -164,13 +192,16 @@ def ovis_chat(chatbot: List[List[str]]):
                 chatbot[-1][1] = display_text
                 yield chatbot, ""
         thread_answer.join()
-        logger.info("Answer phase completed.")
     except Exception as e:
         logger.error("Error during answer phase: " + str(e))
         yield chatbot, f"Error in answer phase: {str(e)}"
         return
+    logger.info("Answer phase completed.")

+    # Logging the final conversation
     log_conversation(chatbot)
+
+    # Extract HTML code if any & display
     html_code = extract_html_code_block(full_answer)
     sandbox_iframe = send_to_sandbox(html_code)
     yield chatbot, sandbox_iframe
@@ -197,21 +228,18 @@ css_code = """
     justify-content: center;
     align-items: center;
 }
-
 .right_panel {
     margin-top: 16px;
     border: 1px solid #BFBFC4;
     border-radius: 8px;
     overflow: hidden;
 }
-
 .render_header {
     height: 30px;
     width: 100%;
     padding: 5px 16px;
     background-color: #f5f5f5;
 }
-
 .header_btn {
     display: inline-block;
     height: 10px;
@@ -219,18 +247,15 @@ css_code = """
     border-radius: 50%;
     margin-right: 4px;
 }
-
 .render_header > .header_btn:nth-child(1) {
     background-color: #f5222d;
 }
-
 .render_header > .header_btn:nth-child(2) {
     background-color: #faad14;
 }
 .render_header > .header_btn:nth-child(3) {
     background-color: #52c41a;
 }
-
 .right_content {
     height: 920px;
     display: flex;
@@ -238,7 +263,6 @@ css_code = """
     justify-content: center;
     align-items: center;
 }
-
 .html_content {
     width: 100%;
     height: 920px;
@@ -265,11 +289,7 @@ with gr.Blocks(title=model_name.split('/')[-1], css=css_code) as demo:

     with gr.Row():
         with gr.Column(scale=4):
-            chatbot = gr.Chatbot(
-                label="Chat",
-                height=520,
-                show_copy_button=True
-            )
+            chatbot = gr.Chatbot(label="Chat", height=520, show_copy_button=True)
             with gr.Row():
                 text_input = gr.Textbox(
                     label="Prompt",
@@ -280,11 +300,12 @@ with gr.Blocks(title=model_name.split('/')[-1], css=css_code) as demo:
                 submit_btn = gr.Button("Send", variant="primary")
                 clear_btn = gr.Button("Clear", variant="secondary")
         with gr.Column(scale=6):
-            gr.HTML('<div class="render_header"><span class="header_btn"></span><span class="header_btn"></span><span class="header_btn"></span></div>')
-            artifact_html = gr.HTML(
-                value="",
-                elem_classes="html_content"
+            gr.HTML(
+                '<div class="render_header">'
+                '<span class="header_btn"></span><span class="header_btn"></span><span class="header_btn"></span>'
+                '</div>'
             )
+            artifact_html = gr.HTML(value="", elem_classes="html_content")

     submit_btn.click(
         submit_chat, [chatbot, text_input], [chatbot, text_input]
@@ -303,5 +324,5 @@ with gr.Blocks(title=model_name.split('/')[-1], css=css_code) as demo:
         outputs=[chatbot, text_input, artifact_html]
     )

-    logger.info("Launching demo with GPU support...")
-    demo.queue(default_concurrency_limit=1).launch(server_name="0.0.0.0", share=True)
+    logger.info("Launching Gradio app. Please wait...")
+    demo.queue(concurrency_count=10).launch(server_name="0.0.0.0", share=True)