Apply history flattening before it goes into the prompt
Browse files
app.py
CHANGED
|
@@ -109,6 +109,30 @@ def retrieve_context(query, max_results=6, max_chars_per_result=600):
|
|
| 109 |
except Exception:
|
| 110 |
return ""
|
| 111 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 112 |
# ------------------------------
|
| 113 |
# Chat Response Generation with ZeroGPU using Pipeline
|
| 114 |
# ------------------------------
|
|
@@ -120,7 +144,8 @@ def chat_response(user_message, chat_history, system_prompt, enable_search,
|
|
| 120 |
|
| 121 |
- Appends the user's message to the conversation history.
|
| 122 |
- Optionally retrieves web search context and inserts it as an additional system message.
|
| 123 |
-
-
|
|
|
|
| 124 |
- Returns the updated conversation history and a debug message.
|
| 125 |
"""
|
| 126 |
cancel_event.clear()
|
|
@@ -131,7 +156,6 @@ def chat_response(user_message, chat_history, system_prompt, enable_search,
|
|
| 131 |
|
| 132 |
# Retrieve web search context if enabled.
|
| 133 |
debug_message = ""
|
| 134 |
-
retrieved_context = ""
|
| 135 |
if enable_search:
|
| 136 |
debug_message = "Initiating web search..."
|
| 137 |
yield conversation, debug_message
|
|
@@ -155,23 +179,26 @@ def chat_response(user_message, chat_history, system_prompt, enable_search,
|
|
| 155 |
conversation.append({"role": "assistant", "content": ""})
|
| 156 |
|
| 157 |
try:
|
|
|
|
|
|
|
|
|
|
| 158 |
# Load the pipeline (cached) for the selected model.
|
| 159 |
pipe = load_pipeline(model_name)
|
| 160 |
|
| 161 |
-
#
|
| 162 |
response = pipe(
|
| 163 |
-
|
| 164 |
max_new_tokens=max_tokens,
|
| 165 |
temperature=temperature,
|
| 166 |
top_k=top_k,
|
| 167 |
top_p=top_p,
|
| 168 |
repetition_penalty=repeat_penalty,
|
| 169 |
)
|
| 170 |
-
|
| 171 |
-
|
| 172 |
-
|
| 173 |
-
|
| 174 |
-
|
| 175 |
|
| 176 |
# Update the conversation history.
|
| 177 |
conversation[-1]["content"] = assistant_text
|
|
|
|
| 109 |
except Exception:
|
| 110 |
return ""
|
| 111 |
|
| 112 |
+
# ----------------------------------------------------------------------------
|
| 113 |
+
# NEW HELPER FUNCTION: Format Conversation History into a Clean Prompt
|
| 114 |
+
# ----------------------------------------------------------------------------
|
| 115 |
+
def format_conversation(conversation, system_prompt):
    """Flatten a chat history into a single plain-text prompt string.

    Args:
        conversation: List of message dicts, each with a 'role' key
            ('user', 'assistant', or 'system') and a 'content' key.
            Messages with any other role are silently skipped, matching
            the original behavior.
        system_prompt: Instruction text prepended to the prompt. If it
            is empty (or whitespace-only), no leading blank line is
            emitted.

    Returns:
        A single prompt string with "User: " / "Assistant: " labelled
        turns, always ending with the cue "Assistant: " so the model
        continues as the assistant.

    Notes:
        Empty-content messages are skipped. This fixes the case where
        the caller appends a blank assistant placeholder
        ({"role": "assistant", "content": ""}) before generation: the
        original code emitted a dangling "Assistant: \n" line and then
        suppressed the final cue, so the prompt ended with a newline
        after "Assistant:" instead of with the cue itself.
    """
    # Start with the system prompt only when it is non-empty, so an
    # empty prompt does not produce a spurious leading newline.
    system_text = system_prompt.strip() if system_prompt else ""
    prompt = system_text + "\n" if system_text else ""

    for msg in conversation:
        content = msg.get("content", "").strip()
        # Skip blank placeholder messages (e.g. the empty assistant
        # turn appended just before generation).
        if not content:
            continue
        role = msg.get("role")
        if role == "user":
            prompt += "User: " + content + "\n"
        elif role == "assistant":
            prompt += "Assistant: " + content + "\n"
        elif role == "system":
            prompt += content + "\n"

    # Always end with the assistant cue so the model's continuation is
    # the new reply; callers slice it off with len(prompt).
    if not prompt.endswith("Assistant: "):
        prompt += "Assistant: "
    return prompt
|
| 135 |
+
|
| 136 |
# ------------------------------
|
| 137 |
# Chat Response Generation with ZeroGPU using Pipeline
|
| 138 |
# ------------------------------
|
|
|
|
| 144 |
|
| 145 |
- Appends the user's message to the conversation history.
|
| 146 |
- Optionally retrieves web search context and inserts it as an additional system message.
|
| 147 |
+
- Converts the conversation into a formatted prompt to avoid leaking role labels.
|
| 148 |
+
- Uses the cached pipeline to generate a response.
|
| 149 |
- Returns the updated conversation history and a debug message.
|
| 150 |
"""
|
| 151 |
cancel_event.clear()
|
|
|
|
| 156 |
|
| 157 |
# Retrieve web search context if enabled.
|
| 158 |
debug_message = ""
|
|
|
|
| 159 |
if enable_search:
|
| 160 |
debug_message = "Initiating web search..."
|
| 161 |
yield conversation, debug_message
|
|
|
|
| 179 |
conversation.append({"role": "assistant", "content": ""})
|
| 180 |
|
| 181 |
try:
|
| 182 |
+
# Format the entire conversation into a single prompt (this fixes both issues).
|
| 183 |
+
prompt_text = format_conversation(conversation, system_prompt)
|
| 184 |
+
|
| 185 |
# Load the pipeline (cached) for the selected model.
|
| 186 |
pipe = load_pipeline(model_name)
|
| 187 |
|
| 188 |
+
# Generate a response using the formatted prompt.
|
| 189 |
response = pipe(
|
| 190 |
+
prompt_text,
|
| 191 |
max_new_tokens=max_tokens,
|
| 192 |
temperature=temperature,
|
| 193 |
top_k=top_k,
|
| 194 |
top_p=top_p,
|
| 195 |
repetition_penalty=repeat_penalty,
|
| 196 |
)
|
| 197 |
+
|
| 198 |
+
# Extract the generated text.
|
| 199 |
+
generated = response[0]["generated_text"]
|
| 200 |
+
# Remove the prompt portion so we only keep the new assistant reply.
|
| 201 |
+
assistant_text = generated[len(prompt_text):].strip()
|
| 202 |
|
| 203 |
# Update the conversation history.
|
| 204 |
conversation[-1]["content"] = assistant_text
|