Spaces: Build error
Update app.py
app.py CHANGED
Old version (removed lines; several are truncated in the diff view):

@@ -1,62 +1,140 @@
- # app.py (
- from transformers import AutoTokenizer, AutoModelForCausalLM
- tokenizer
- model
- print(f"Error loading model '{
- tokenizer, model
- if model is None:
- max_length=
- ).to(device)
- pad_token_id=tokenizer.eos_token_id,
@@ -64,7 +142,7 @@ def generate_editorial(problem_statement: str, max_new_tokens: int, temperature:
- demo = gr.Interface(
@@ -73,8 +151,8 @@ demo = gr.Interface(
- title="Codeforces Editorial Assistant (
- description="Paste a Codeforces problem statement and get a generated editorial from neuralnets/cf_codebot.",
@@ -87,4 +165,4 @@ demo = gr.Interface(
-
New version of app.py:

# app.py (Revised for Unsloth LoRA Gemma Model)
import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM  # kept for reference; not used directly once Unsloth loads the model
import torch
# Unsloth loads the base model; PEFT attaches the fine-tuned LoRA adapters
from unsloth import FastLanguageModel
from peft import PeftModel

# --- Configuration ---
BASE_MODEL_NAME = "unsloth/gemma-3-4b-it"
ADAPTER_MODEL_NAME = "neuralnets/cf_codebot"  # Your friend's fine-tuned adapters
# --- Model Loading ---
# This block runs once when the Space starts up.
try:
    # Load the base model and tokenizer with Unsloth's optimized loader.
    # It returns both the model and the tokenizer in one call.
    # max_seq_length sets the context window; 2048 is a common default for Gemma.
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = BASE_MODEL_NAME,
        max_seq_length = 2048,       # Max context length the model can handle
        dtype = torch.bfloat16,      # Computation dtype; fast on recent GPUs
        load_in_4bit = True,         # Load in 4-bit to save memory (bitsandbytes 4-bit generally requires a CUDA GPU)
    )

    # Attach your friend's fine-tuned LoRA adapters to the base model.
    # PeftModel.from_pretrained reads the adapter config from the Hub repo, so the
    # LoRA rank, alpha and target modules used in training do not need to be re-specified here.
    model = PeftModel.from_pretrained(model, ADAPTER_MODEL_NAME)

    # Set model to evaluation mode.
    # FastLanguageModel.for_inference(model) could also be called here to enable
    # Unsloth's faster inference path.
    model.eval()

    # Note: with load_in_4bit=True, device placement is handled by bitsandbytes/accelerate,
    # so there is no need to call model.to(device) explicitly.

    print(f"Base model '{BASE_MODEL_NAME}' and adapters '{ADAPTER_MODEL_NAME}' loaded successfully.")

except Exception as e:
    print(f"Error loading model '{BASE_MODEL_NAME}' or adapters '{ADAPTER_MODEL_NAME}': {e}")
    print("Using a dummy function for demonstration purposes.")
    tokenizer, model = None, None  # Indicate model not loaded
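As an alternative to the two-step load above, Unsloth can also resolve the base model and attach the adapters in a single call, provided neuralnets/cf_codebot is published as a PEFT LoRA adapter repo (adapter_config.json plus adapter weights). A minimal sketch under that assumption:

# Minimal sketch (assumes the Hub repo contains PEFT adapter files):
# FastLanguageModel.from_pretrained can be pointed at the adapter repo itself;
# it reads the adapter config, downloads the matching base model, and attaches
# the trained LoRA weights.
from unsloth import FastLanguageModel

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "neuralnets/cf_codebot",  # adapter repo instead of the base model
    max_seq_length = 2048,
    load_in_4bit = True,
)
FastLanguageModel.for_inference(model)  # enables Unsloth's faster inference path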
# --- Inference Function ---
def generate_editorial(problem_statement: str, max_new_tokens: int, temperature: float, top_p: float) -> str:
    if model is None or tokenizer is None:  # If the model failed to load, fall back to a dummy response
        print("Model not loaded, using dummy generation.")
        if "watermelon" in problem_statement.lower():
            return "To be able to split the watermelon such that each part is even..."
        return "This is a placeholder editorial based on your problem statement.\n(Model failed to load, check logs)"

    try:
        # Construct the prompt in the chat format the instruction-tuned model expects.
        # This is CRUCIAL for instruction-tuned models like Gemma-IT: the format must
        # match what the model was trained on. Generic instruction models often use
        #   "### Instruction:\n{problem_statement}\n\n### Response:\n"
        # but tokenizer.apply_chat_template produces the correct format for this model.
        messages = [
            {"role": "user", "content": problem_statement}
        ]

        # Apply the chat template. add_generation_prompt=True appends the tokens that
        # open the model's turn, producing something like
        # <bos><start_of_turn>user ... <end_of_turn>\n<start_of_turn>model\n
        input_text = tokenizer.apply_chat_template(
            messages,
            tokenize=False,  # We want the string, not token IDs
            add_generation_prompt=True
        )

        # Tokenize the input string
        inputs = tokenizer(
            input_text,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=tokenizer.model_max_length  # Use the model's max length
        ).to(model.device)  # Ensure inputs are on the same device as the model

        # Generate text
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            num_return_sequences=1,
            do_sample=True,
            top_k=50,
            top_p=top_p,
            temperature=temperature,
            pad_token_id=tokenizer.eos_token_id,  # Ensure pad_token_id is set
            # Generation stops at the tokenizer's EOS token by default; if the model
            # emits "<end_of_turn>" instead, it is stripped from the output below.
        )

        # Decode the generated text. skip_special_tokens=False keeps the chat markers
        # so the model's turn can be split out of the full sequence below.
        generated_sequence = tokenizer.decode(outputs[0], skip_special_tokens=False)

        # Extract only the model's response. The chat template typically produces
        # "<bos><start_of_turn>user\n{problem_statement}<end_of_turn>\n<start_of_turn>model\n",
        # so everything after the last response marker is the model's reply.
        response_start_marker = "<start_of_turn>model\n"  # or similar, depending on the template
        if response_start_marker in generated_sequence:
            editorial_content = generated_sequence.split(response_start_marker)[-1].strip()
        else:
            # Fallback if the marker is not found, or if the output starts with the input
            editorial_content = generated_sequence.strip()
            if editorial_content.startswith(input_text):
                editorial_content = editorial_content[len(input_text):].strip()

        # Remove any lingering special tokens such as <end_of_turn> or the EOS token
        editorial_content = editorial_content.replace("<end_of_turn>", "").replace(tokenizer.eos_token, "").strip()

        return editorial_content

    except Exception as e:
        # ... (one unchanged line omitted in the diff)
        return f"An error occurred during editorial generation: {e}"
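For a quick sanity check outside the Gradio UI, the function can also be called directly. A small usage sketch; the problem text below is a made-up paraphrase (roughly the "watermelon" problem referenced in the dummy branch), not a real Codeforces statement:

# Standalone smoke test for generate_editorial (run manually; not part of the app's normal flow)
sample_problem = (
    "Given an integer w, the weight of a watermelon, determine whether it can be "
    "split into two parts so that each part has a positive even weight."
)
print(generate_editorial(sample_problem, max_new_tokens=256, temperature=0.7, top_p=0.95))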
# --- Gradio Interface Setup ---
iface = gr.Interface(
    fn=generate_editorial,
    inputs=[
        gr.Textbox(lines=10, label="Problem Statement", placeholder="Paste your problem statement here...", autofocus=True),
        # ... (unchanged input components omitted in the diff)
        gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)"),
    ],
    outputs=gr.Textbox(label="Generated Editorial"),
    title="Codeforces Editorial Assistant (Gemma LoRA)",
    description="Paste a Codeforces problem statement and get a generated editorial from neuralnets/cf_codebot (Gemma-3-4b-it LoRA).",
    flagging_mode="auto",  # Updated from allow_flagging
    examples=[
        [
        # ... (unchanged example rows omitted in the diff)
)

if __name__ == "__main__":
    iface.launch()
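Finally, the Space can only build if its Python dependencies are declared. A minimal requirements.txt sketch covering the imports used above; the package list is inferred from those imports, versions are left unpinned, and unsloth plus bitsandbytes 4-bit loading generally require a GPU-enabled Space:

# requirements.txt (illustrative, inferred from the imports in app.py)
gradio
torch
transformers
peft
accelerate
bitsandbytes
unsloth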