Improve cancel generation with robust UI state management and orchestrator pattern
app.py
CHANGED
@@ -334,7 +334,7 @@ def load_pipeline(model_name):
         model=repo,
         tokenizer=tokenizer,
         trust_remote_code=True,
-        torch_dtype=dtype,
+        dtype=dtype, # Use `dtype` instead of deprecated `torch_dtype`
         device_map="auto",
         use_cache=True, # Enable past-key-value caching
         token=access_token)
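The hunk above only swaps the keyword passed to the text-generation pipeline. As a rough illustration, a loader following the new convention might look like the sketch below; `repo` and `access_token` mirror names from the diff, while the wrapper function and the dtype choice are assumptions, and `dtype=` is only accepted on recent transformers releases that deprecate `torch_dtype`.

```python
# Hypothetical loader sketch; only the kwargs visible in the diff are taken from app.py.
import torch
from transformers import AutoTokenizer, pipeline

def load_pipeline_sketch(repo: str, access_token: str | None = None):
    tokenizer = AutoTokenizer.from_pretrained(repo, token=access_token)
    dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32  # illustrative choice
    return pipeline(
        "text-generation",
        model=repo,
        tokenizer=tokenizer,
        trust_remote_code=True,
        dtype=dtype,        # replaces the deprecated `torch_dtype` kwarg
        device_map="auto",
        use_cache=True,     # enable past-key-value caching
        token=access_token,
    )
```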
@@ -509,12 +509,14 @@ def chat_response(user_msg, chat_history, system_prompt,
         thought_buf = ''
         answer_buf = ''
         in_thought = False
+        assistant_message_started = False
 
         # Stream tokens
         for chunk in streamer:
             # Check for cancellation signal
             if cancel_event.is_set():
-                history[-1]['
+                if assistant_message_started and history and history[-1]['role'] == 'assistant':
+                    history[-1]['content'] += " [Generation Canceled]"
                 yield history, debug
                 break
 
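The cancellation check assumes `cancel_event` is a module-level `threading.Event` that the UI sets while the streaming loop polls it between chunks. A minimal sketch of that pattern with `TextIteratorStreamer` follows; the model, tokenizer, and generation settings are placeholders, and only the control flow mirrors `chat_response`.

```python
# Sketch of the cancel_event polling pattern used in chat_response.
# Model/tokenizer are supplied by the caller; generation runs in a worker
# thread so the main thread can consume the streamer and check the flag.
import threading
from transformers import TextIteratorStreamer

cancel_event = threading.Event()

def stream_reply(model, tokenizer, prompt, max_new_tokens=256):
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    worker = threading.Thread(
        target=model.generate,
        kwargs=dict(**inputs, max_new_tokens=max_new_tokens, streamer=streamer),
    )
    worker.start()

    answer = ""
    for text in streamer:
        if cancel_event.is_set():
            answer += " [Generation Canceled]"  # annotate the partial reply, as the diff does
            yield answer
            break
        answer += text
        yield answer
```

Breaking out of the loop only stops the UI stream; the underlying `generate()` call is shut down separately, which is what the `cancels=[submit_event]` wiring later in this diff is for.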
@@ -523,21 +525,14 @@ def chat_response(user_msg, chat_history, system_prompt,
             # Detect start of thinking
             if not in_thought and '<think>' in text:
                 in_thought = True
-
-                history.append({
-                    'role': 'assistant',
-                    'content': '',
-                    'metadata': {'title': '💭 Thought'}
-                })
-                # Capture after opening tag
+                history.append({'role': 'assistant', 'content': '', 'metadata': {'title': '💭 Thought'}})
+                assistant_message_started = True
                 after = text.split('<think>', 1)[1]
                 thought_buf += after
-                # If closing tag in same chunk
                 if '</think>' in thought_buf:
                     before, after2 = thought_buf.split('</think>', 1)
                     history[-1]['content'] = before.strip()
                     in_thought = False
-                    # Start answer buffer
                     answer_buf = after2
                     history.append({'role': 'assistant', 'content': answer_buf})
                 else:
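The single-line `history.append(...)` above relies on the Gradio Chatbot "messages" format, where an assistant message carrying a `metadata` title is rendered as a collapsible bubble. A toy, self-contained example of that convention (the replies are fabricated; only the message structure matters):

```python
# Toy demo of the 'metadata': {'title': ...} convention used for the thought bubble.
import gradio as gr

def fake_reply(user_msg, history):
    history = history + [{"role": "user", "content": user_msg}]
    history.append({"role": "assistant",
                    "content": "Reasoning about the question...",
                    "metadata": {"title": "💭 Thought"}})  # collapsible in the Chatbot
    history.append({"role": "assistant", "content": f"You said: {user_msg}"})
    return history, ""

with gr.Blocks() as demo:
    chat = gr.Chatbot(type="messages")
    txt = gr.Textbox(label="Message")
    txt.submit(fake_reply, inputs=[txt, chat], outputs=[chat, txt])

if __name__ == "__main__":
    demo.launch()
```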
@@ -545,14 +540,12 @@ def chat_response(user_msg, chat_history, system_prompt,
                     yield history, debug
                     continue
 
-            # Continue thought streaming
             if in_thought:
                 thought_buf += text
                 if '</think>' in thought_buf:
                     before, after2 = thought_buf.split('</think>', 1)
                     history[-1]['content'] = before.strip()
                     in_thought = False
-                    # Start answer buffer
                     answer_buf = after2
                     history.append({'role': 'assistant', 'content': answer_buf})
                 else:
@@ -561,8 +554,10 @@ def chat_response(user_msg, chat_history, system_prompt,
                     continue
 
             # Stream answer
-            if not answer_buf:
+            if not answer_buf and not assistant_message_started:
                 history.append({'role': 'assistant', 'content': ''})
+                assistant_message_started = True
+
             answer_buf += text
             history[-1]['content'] = answer_buf
             yield history, debug
@@ -573,7 +568,6 @@ def chat_response(user_msg, chat_history, system_prompt,
         history.append({'role': 'assistant', 'content': f"Error: {e}"})
         yield history, debug
     finally:
-        # Final cleanup
         gc.collect()
 
 
@@ -583,21 +577,14 @@ def update_default_prompt(enable_search):
 def update_duration_estimate(model_name, enable_search, max_results, max_chars, max_tokens, search_timeout):
     """Calculate and format the estimated GPU duration for current settings."""
     try:
-
-        dummy_msg = ""
-        dummy_history = []
-        dummy_system_prompt = ""
-
+        dummy_msg, dummy_history, dummy_system_prompt = "", [], ""
         duration = get_duration(dummy_msg, dummy_history, dummy_system_prompt,
                                 enable_search, max_results, max_chars, model_name,
                                 max_tokens, 0.7, 40, 0.9, 1.2, search_timeout)
-
         model_size = MODELS[model_name].get("params_b", 4.0)
-
-
-        return f"⏱️ **Estimated GPU Time: {duration:.1f} seconds**\n\n" \
-               f"📊 **Model Size:** {model_size:.1f}B parameters\n" \
-               f"🔍 **Web Search:** {'Enabled' if enable_search else 'Disabled'}"
+        return (f"⏱️ **Estimated GPU Time: {duration:.1f} seconds**\n\n"
+                f"📊 **Model Size:** {model_size:.1f}B parameters\n"
+                f"🔍 **Web Search:** {'Enabled' if enable_search else 'Disabled'}")
     except Exception as e:
         return f"⚠️ Error calculating estimate: {e}"
 
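`update_duration_estimate` reuses `get_duration`, which on a ZeroGPU Space is typically the same callable handed to `spaces.GPU(duration=...)` so the quota request and the UI estimate stay in sync. The sketch below shows that wiring under that assumption; the estimation formula and parameter weights are invented for illustration, and only the argument list matches the diff.

```python
# Assumed wiring between get_duration and ZeroGPU's dynamic duration feature.
# The arithmetic is illustrative; app.py's real estimator is not shown in the diff.
import spaces

def get_duration(user_msg, chat_history, system_prompt,
                 enable_search, max_results, max_chars, model_name,
                 max_tokens, temperature, top_k, top_p, repetition_penalty,
                 search_timeout):
    base = 10.0                                   # warm-up / model load allowance
    decode = max_tokens / 50.0                    # crude tokens-per-second budget
    search = search_timeout if enable_search else 0.0
    return base + decode + search                 # seconds of GPU time to request

@spaces.GPU(duration=get_duration)                # ZeroGPU calls it with the same args
def chat_response(user_msg, chat_history, system_prompt,
                  enable_search, max_results, max_chars, model_name,
                  max_tokens, temperature, top_k, top_p, repetition_penalty,
                  search_timeout):
    ...  # streaming generation as in app.py
```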
@@ -613,10 +600,7 @@ with gr.Blocks(title="LLM Inference with ZeroGPU") as demo:
             search_chk = gr.Checkbox(label="Enable Web Search", value=False)
             sys_prompt = gr.Textbox(label="System Prompt", lines=3, value=update_default_prompt(search_chk.value))
 
-
-            duration_display = gr.Markdown(value=update_duration_estimate(
-                "Qwen3-1.7B", False, 4, 50, 1024, 5.0
-            ))
+            duration_display = gr.Markdown(value=update_duration_estimate("Qwen3-1.7B", False, 4, 50, 1024, 5.0))
 
             gr.Markdown("### Generation Parameters")
             max_tok = gr.Slider(64, 16384, value=1024, step=32, label="Max Tokens")
@@ -641,58 +625,75 @@ with gr.Blocks(title="LLM Inference with ZeroGPU") as demo:
 
     # Group all inputs for cleaner event handling
     chat_inputs = [txt, chat, sys_prompt, search_chk, mr, mc, model_dd, max_tok, temp, k, p, rp, st]
-
-
-
+    # Group all UI components that change state
+    interactive_components = [txt, submit_btn, cancel_btn, chat, dbg]
+
+    def submit_and_manage_ui(user_msg, chat_history, *args):
+        """
+        An orchestrator function that manages the UI state and calls the backend chat function.
+        It uses a try...finally block to ensure the UI is always reset.
+        """
+        # Immediately update UI to a "generating" state
         yield {
+            # Add the user's message to the chat and a placeholder for the response
+            chat: chat_history + [[user_msg, None]],
+            txt: gr.update(value="", interactive=False),
             submit_btn: gr.update(interactive=False),
             cancel_btn: gr.update(visible=True),
-            txt: gr.update(interactive=False, value=""), # Clear textbox and disable
         }
-
-
+
+        try:
+            # Package the arguments for the backend function
+            backend_args = [user_msg, chat_history] + list(args)
+            # Stream the response from the backend
+            for response_chunk in chat_response(*backend_args):
+                yield {
+                    chat: response_chunk[0],
+                    dbg: response_chunk[1],
+                }
+        except Exception as e:
+            print(f"An error occurred during generation: {e}")
+        finally:
+            # Always reset the UI to an "idle" state, regardless of completion or cancellation
+            print("Resetting UI state.")
             yield {
+                txt: gr.update(interactive=True),
+                submit_btn: gr.update(interactive=True),
+                cancel_btn: gr.update(visible=False),
             }
 
-    def reset_ui_after_generation():
-        # Update UI back to "idle" state
-        return {
-            submit_btn: gr.update(interactive=True),
-            cancel_btn: gr.update(visible=False),
-            txt: gr.update(interactive=True), # Re-enable textbox
-        }
-
     def set_cancel_flag():
+        """Called by the cancel button, sets the global event."""
         cancel_event.set()
         print("Cancellation signal sent.")
 
-    #
+    # Event for submitting text via Enter key
     submit_event = txt.submit(
-        fn=
+        fn=submit_and_manage_ui,
         inputs=chat_inputs,
-        outputs=
-    )
-
+        outputs=interactive_components,
+    )
+
+    # Event for submitting text via the "Submit" button
     submit_btn.click(
-        fn=
+        fn=submit_and_manage_ui,
        inputs=chat_inputs,
-        outputs=
-    )
+        outputs=interactive_components,
+    )
 
-    #
+    # Event for the "Cancel" button. It calls the flag-setting function
+    # and, crucially, cancels the long-running submit_event.
     cancel_btn.click(
         fn=set_cancel_flag,
-        cancels=[submit_event]
+        cancels=[submit_event]
     )
-
-    #
+
+    # Listeners for updating the duration estimate
     duration_inputs = [model_dd, search_chk, mr, mc, max_tok, st]
     for component in duration_inputs:
         component.change(fn=update_duration_estimate, inputs=duration_inputs, outputs=duration_display)
 
-    # Other event listeners
+    # Other minor event listeners
     search_chk.change(fn=update_default_prompt, inputs=search_chk, outputs=sys_prompt)
     clr.click(fn=lambda: ([], "", ""), outputs=[chat, txt, dbg])
 
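Taken together, the final hunk replaces the separate `reset_ui_after_generation` helper with a single orchestrator generator: it yields dictionaries of component updates, streams the backend output, and uses `try...finally` so the controls return to their idle state no matter how generation ends. A condensed, self-contained sketch of the same pattern, with a dummy backend standing in for `chat_response`:

```python
# Condensed sketch of the orchestrator pattern: one generator event handler
# yields dicts of component updates and a try/finally restores the idle state.
# dummy_backend stands in for chat_response; component names mirror the diff.
import time
import threading
import gradio as gr

cancel_event = threading.Event()

def dummy_backend(user_msg, history):
    text = ""
    for word in f"Echoing: {user_msg}".split():
        if cancel_event.is_set():
            text += " [Generation Canceled]"
            yield history + [{"role": "assistant", "content": text}]
            return
        text += word + " "
        time.sleep(0.2)
        yield history + [{"role": "assistant", "content": text}]

def submit_and_manage_ui(user_msg, history):
    cancel_event.clear()                      # fresh flag for every run
    history = history + [{"role": "user", "content": user_msg}]
    # "Generating" state: lock the inputs, reveal Cancel
    yield {chat: history,
           txt: gr.update(value="", interactive=False),
           submit_btn: gr.update(interactive=False),
           cancel_btn: gr.update(visible=True)}
    try:
        for updated in dummy_backend(user_msg, history):
            yield {chat: updated}
    finally:
        # Always return to the "idle" state, even after cancellation
        yield {txt: gr.update(interactive=True),
               submit_btn: gr.update(interactive=True),
               cancel_btn: gr.update(visible=False)}

with gr.Blocks() as demo:
    chat = gr.Chatbot(type="messages")
    txt = gr.Textbox(label="Message")
    submit_btn = gr.Button("Submit")
    cancel_btn = gr.Button("Cancel", visible=False)

    ui_outputs = [chat, txt, submit_btn, cancel_btn]
    submit_event = txt.submit(submit_and_manage_ui, inputs=[txt, chat], outputs=ui_outputs)
    submit_btn.click(submit_and_manage_ui, inputs=[txt, chat], outputs=ui_outputs)
    # Set the flag for the streaming loop *and* abort the running event
    cancel_btn.click(fn=cancel_event.set, cancels=[submit_event])

if __name__ == "__main__":
    demo.launch()
```

The Cancel button does two things at once: `cancel_event.set()` lets the streaming loop exit cleanly, while `cancels=[submit_event]` tells Gradio to abort the running event itself.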