#!/usr/bin/env python3

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer



# Load model and tokenizer (same as server.py)
model_name = "models/Llama-3.2-1B-Instruct"
tok = AutoTokenizer.from_pretrained(model_name)
lm = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,  # dtype/device are assumptions; mirror server.py if it differs
    device_map="auto",
)



def chat_current(system_prompt: str, user_prompt: str) -> str:
    """
    Current implementation (same as server.py) - will show warnings
    """
    print("🔴 Running CURRENT implementation (with warnings)...")
    
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]

    input_ids = tok.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt"
    ).to(lm.device)
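    # Note: this returns only the input_ids tensor (no attention_mask), so the
    # generate() call below has to infer the mask and fall back to eos as the
    # pad token. That fallback is what triggers the warnings this script
    # demonstrates.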

    with torch.inference_mode():
        output_ids = lm.generate(
            input_ids,  # No attention_mask, no pad_token_id
            max_new_tokens=2048,
            do_sample=True,
            temperature=0.2,
            repetition_penalty=1.1,
            top_k=100,
            top_p=0.95,
        )

    answer = tok.decode(
        output_ids[0][input_ids.shape[-1]:],
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True,
    )
    return answer.strip()


def chat_fixed(system_prompt: str, user_prompt: str) -> str:
    """
    Fixed implementation - proper attention mask and pad token
    """
    print("🟢 Running FIXED implementation (no warnings)...")
    
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]

    # Get both input_ids and attention_mask
    inputs = tok.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt",
        return_dict=True  # Returns dict with input_ids and attention_mask
    )
    
    # Move to device
    input_ids = inputs["input_ids"].to(lm.device)
    attention_mask = inputs["attention_mask"].to(lm.device)

    with torch.inference_mode():
        output_ids = lm.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,  # Proper attention mask
            pad_token_id=tok.eos_token_id,  # Explicit pad token
            max_new_tokens=2048,
            do_sample=True,
            temperature=0.2,
            repetition_penalty=1.1,
            top_k=100,
            top_p=0.95,
        )

    answer = tok.decode(
        output_ids[0][input_ids.shape[-1]:],
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True,
    )
    return answer.strip()
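

# Sketch (assumption, not part of server.py): the same attention_mask /
# pad_token_id handling matters even more once prompts are batched, because
# padding is then unavoidable. chat_fixed_batch is an illustrative helper name;
# it reuses the module-level `tok` and `lm` and pads on the left, as required
# for decoder-only generation.
def chat_fixed_batch(system_prompt: str, user_prompts: list[str]) -> list[str]:
    """Batched variant of chat_fixed (illustrative sketch)."""
    prompts = [
        tok.apply_chat_template(
            [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": p},
            ],
            add_generation_prompt=True,
            tokenize=False,
        )
        for p in user_prompts
    ]

    tok.padding_side = "left"          # decoder-only models need left padding
    if tok.pad_token is None:
        tok.pad_token = tok.eos_token  # Llama tokenizers ship without a pad token

    # The templated strings already contain special tokens, so don't add them again
    inputs = tok(
        prompts,
        return_tensors="pt",
        padding=True,
        add_special_tokens=False,
    ).to(lm.device)

    with torch.inference_mode():
        output_ids = lm.generate(
            **inputs,                       # input_ids + attention_mask
            pad_token_id=tok.eos_token_id,
            max_new_tokens=2048,
            do_sample=True,
            temperature=0.2,
            repetition_penalty=1.1,
            top_k=100,
            top_p=0.95,
        )

    # With left padding every row has the same prompt length, so one slice works
    new_tokens = output_ids[:, inputs["input_ids"].shape[-1]:]
    return [tok.decode(t, skip_special_tokens=True).strip() for t in new_tokens]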




def compare_generations():
    """Compare both implementations"""
    system_prompt = "You are a helpful assistant who tries to help answer the user's question."
    user_prompt = "Create a report on anxiety at work. How do I manage time and stress effectively?"
    
    print("=" * 60)
    print("COMPARING GENERATION METHODS")
    print("=" * 60)
    print(f"System: {system_prompt}")
    print(f"User: {user_prompt}")
    print("=" * 60)
    
    # Test current implementation
    print("\n" + "=" * 60)
    current_output = chat_current(system_prompt, user_prompt)
    print(f"CURRENT OUTPUT:\n{current_output}")
    
    print("\n" + "=" * 60)
    # Test fixed implementation  
    fixed_output = chat_fixed(system_prompt, user_prompt)
    print(f"FIXED OUTPUT:\n{fixed_output}")
    
    print("\n" + "=" * 60)
    print("COMPARISON:")
    print(f"Outputs are identical: {current_output == fixed_output}")
    print(f"Current length: {len(current_output)} chars")
    print(f"Fixed length: {len(fixed_output)} chars")


if __name__ == "__main__":
    # Set pad token for the fixed version
    if tok.pad_token is None:
        tok.pad_token = tok.eos_token

    compare_generations()



def filter_by_word_count(data, max_words=3):
    """Return only phrases with word count <= max_words."""
    return {k: v for k, v in data.items() if len(v.split()) <= max_words}
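
# Example (illustrative data, not from the original script):
#   filter_by_word_count({"a": "time management tips", "b": "coping with workplace anxiety and stress"})
#   -> {"a": "time management tips"}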



def filter_by_keyword(data, keyword):
    """Return phrases containing a specific keyword."""
    return {k: v for k, v in data.items() if keyword.lower() in v.lower()}
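
# Example (illustrative data, not from the original script):
#   filter_by_keyword({"a": "time management tips", "b": "managing stress at work"}, "stress")
#   -> {"b": "managing stress at work"}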




example_prompt = "Score this as 5 points on a scale from 5 to 10. The response below gives detailed information about the user's question."
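# Hypothetical usage (assumption, not part of the original workflow): the
# scoring prompt above could be passed as the system prompt to either chat
# function, with the text to be rated supplied as the user prompt, e.g.:
#   score = chat_fixed(example_prompt, "Take regular breaks and keep a prioritized task list.")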