import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

model_id = "TildeAI/TildeOpen-30b"

# Load the tokenizer (use_fast=False selects the slow Python tokenizer)
tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=False)

# Load the model in 4-bit NF4 quantization with double quantization;
# bfloat16 is used as the compute dtype for de-quantized operations.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
)

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto",           # requires the accelerate package
    torch_dtype=torch.bfloat16,  # dtype for the non-quantized modules
)
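# Rough sizing note (an estimate, not from the source): with NF4 4-bit weights,
# a ~30B-parameter model occupies roughly 15-16 GB of VRAM before the KV cache,
# so this Space needs a 24 GB-class GPU (e.g. A10G-large) or better.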
def chat(message, max_new_tokens=256):
    inputs = tokenizer(message, return_tensors="pt").to(model.device)
    outputs = model.generate(
        **inputs,
        max_new_tokens=int(max_new_tokens),  # the Gradio slider delivers a float
        repetition_penalty=1.2,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
    )
    # Decode only the newly generated tokens; decoding outputs[0] directly
    # would echo the prompt back at the start of every answer.
    new_tokens = outputs[0][inputs["input_ids"].shape[1]:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True)
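# Optional smoke test, left commented out so the Space boots straight into
# the UI. The prompt is an illustrative example, not taken from the source.
# print(chat("Summarise the history of Riga in two sentences.", max_new_tokens=64))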
# Gradio UI
demo = gr.Interface(
    fn=chat,
    inputs=[
        gr.Textbox(label="Ask something"),
        gr.Slider(50, 1024, value=256, step=1, label="Max new tokens"),
    ],
    outputs="text",
    title="TildeOpen-30b Chat (HF Space)",
)

if __name__ == "__main__":
    demo.launch()  # share=True is unnecessary on Spaces, which are already public
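A minimal sketch of the Space's requirements.txt (the source page does not show this file; the package names follow from the imports above, and no versions are pinned here):

    gradio
    torch
    transformers
    accelerate      # needed for device_map="auto"
    bitsandbytes    # needed for load_in_4bit quantization
    sentencepiece   # likely needed by the slow (use_fast=False) tokenizer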