"""Gradio chat front-end for the Friday-V1 causal language model.

Loads the tokenizer and an 8-bit quantized copy of the model at import time,
then serves a single-turn text-in / text-out interface.
"""

import gradio as gr
import torch  # noqa: F401  # retained from the original script; not referenced directly
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

model_id = "saikrishnagorijala/friday-V1"

# Sampling settings forwarded to ``model.generate`` on every request.
GENERATION_KWARGS = {
    "max_new_tokens": 200,
    "do_sample": True,
    "temperature": 1.2,
    "top_p": 0.9,
}

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Define quantization config for 8-bit inference.
# NOTE: the previous ``bnb_8bit_use_double_quant`` / ``bnb_8bit_quant_type`` /
# ``bnb_8bit_compute_dtype`` arguments are not BitsAndBytesConfig options --
# NF4, double quantization and compute dtype exist only as ``bnb_4bit_*``
# settings for ``load_in_4bit=True``. They were dropped so the config states
# what is actually applied (plain LLM.int8 quantization).
bnb_config = BitsAndBytesConfig(load_in_8bit=True)

# Load model with quantization_config
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    quantization_config=bnb_config,
)


def chat(prompt: str) -> str:
    """Generate a sampled reply to *prompt*.

    Args:
        prompt: The user's message as plain text.

    Returns:
        The newly generated text only (the prompt is not echoed back), or an
        empty string when *prompt* is empty or whitespace-only.
    """
    # Guard: some tokenizers yield an empty ``input_ids`` tensor for an empty
    # string, which ``generate`` cannot handle.
    if not prompt or not prompt.strip():
        return ""

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(**inputs, **GENERATION_KWARGS)

    # For a causal LM the returned sequence starts with the prompt tokens;
    # drop them so only the model's continuation is shown to the user.
    prompt_length = inputs["input_ids"].shape[-1]
    reply_tokens = outputs[0][prompt_length:]
    return tokenizer.decode(reply_tokens, skip_special_tokens=True)


demo = gr.Interface(
    fn=chat,
    inputs="text",
    outputs="text",
    title="Friday-V1 Chatbot",
)
demo.launch()