import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# Requires: gradio, torch, transformers, plus bitsandbytes and accelerate
# (bitsandbytes for 4-bit loading, accelerate for device_map="auto"; CUDA GPU needed)
model_id = "TildeAI/TildeOpen-30b"

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=False)

# Load model with 4-bit NF4 quantization so the 30B model fits on a single GPU
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
)

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto",
    torch_dtype=torch.bfloat16,
)

def chat(message, max_new_tokens=256):
    inputs = tokenizer(message, return_tensors="pt").to(model.device)
    outputs = model.generate(
        **inputs,
        max_new_tokens=int(max_new_tokens),  # Gradio sliders pass floats; generate expects an int
        repetition_penalty=1.2,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
    )
    # Decode only the newly generated tokens, not the echoed prompt
    new_tokens = outputs[0][inputs["input_ids"].shape[-1]:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True)

# Gradio UI
demo = gr.Interface(
    fn=chat,
    inputs=[
        gr.Textbox(label="Ask something"),
        gr.Slider(50, 1024, value=256, step=1, label="Max new tokens"),
    ],
    outputs="text",
    title="TildeOpen-30b Chat (HF Space)",
)

if __name__ == "__main__":
    demo.launch(share=True)
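# ---------------------------------------------------------------------------
# Optional: streaming output (a minimal sketch, not part of the original app).
# It uses transformers' TextIteratorStreamer to yield partial text so the
# response appears token by token in the Gradio Textbox. `chat_stream` is a
# hypothetical name; to enable it, move this function above the
# `demo = gr.Interface(...)` block and pass `fn=chat_stream` instead of
# `fn=chat` (gr.Interface accepts generator functions for streaming).
# ---------------------------------------------------------------------------
from threading import Thread
from transformers import TextIteratorStreamer

def chat_stream(message, max_new_tokens=256):
    inputs = tokenizer(message, return_tensors="pt").to(model.device)
    # skip_prompt=True drops the echoed input; decode kwargs mirror chat()
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    generation_kwargs = dict(
        **inputs,
        streamer=streamer,
        max_new_tokens=int(max_new_tokens),
        repetition_penalty=1.2,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
    )
    # generate() blocks until finished, so run it in a background thread
    # and yield the accumulated text as chunks arrive from the streamer
    Thread(target=model.generate, kwargs=generation_kwargs).start()
    partial = ""
    for chunk in streamer:
        partial += chunk
        yield partial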