# ── [ I M P O R T S ]
import accelerate  # not referenced directly; keeps accelerate available to transformers
import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig
# ── [ M O D E L ]
microsoft_model = None
microsoft_tokenizer = None

def load_model():
    """Lazily load the BitNet model and tokenizer, caching them in module globals."""
    global microsoft_model, microsoft_tokenizer
    if microsoft_model is None or microsoft_tokenizer is None:
        model_id = "microsoft/bitnet-b1.58-2B-4T"
        microsoft_tokenizer = AutoTokenizer.from_pretrained(model_id)
        config = AutoConfig.from_pretrained(model_id)
        microsoft_model = AutoModelForCausalLM.from_pretrained(
            model_id,
            config=config,
            torch_dtype=torch.bfloat16,
        )
    return microsoft_model, microsoft_tokenizer
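
# A minimal usage sketch (illustrative only; not called by the app). The first
# call downloads and loads the weights; subsequent calls return the cached
# globals, so calling load_model() on every request is cheap.
def _load_model_example():
    model, tokenizer = load_model()    # loads on first call
    model_again, _ = load_model()      # returns the cached instance
    assert model is model_again
    return model, tokenizer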
# ── [ C O N V E R S A T I O N - H I S T O R Y ]
def manage_history(history):
    """Trim history to at most 3 turns and 300 total characters."""
    # Limit to 3 turns (each turn is user + assistant = 2 messages)
    max_messages = 6  # 3 turns * 2 messages per turn
    if len(history) > max_messages:
        history = history[-max_messages:]
    # Limit total character count to 300, dropping the oldest messages first
    total_chars = sum(len(msg["content"]) for msg in history)
    while total_chars > 300 and history:
        history.pop(0)  # Remove oldest message
        total_chars = sum(len(msg["content"]) for msg in history)
    return history
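
# A quick sketch of the trimming behavior (illustrative only; not called by the
# app): with 4 full turns (8 messages), the turn cap keeps only the last 6, and
# the 300-character budget can then drop more messages from the front.
def _manage_history_example():
    history = []
    for i in range(4):
        history.append({"role": "user", "content": f"question {i}"})
        history.append({"role": "assistant", "content": f"answer {i}"})
    trimmed = manage_history(history)
    assert len(trimmed) <= 6
    assert sum(len(m["content"]) for m in trimmed) <= 300
    return trimmed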
# ── [ G E N E R A T E - R E S P O N S E ]
def generate_response(user_input, system_prompt, max_new_tokens, temperature, top_p, top_k, history):
    model, tokenizer = load_model()
    # Build the prompt from the system prompt, the prior turns, and the new message
    messages = [
        {"role": "system", "content": system_prompt},
        *history,
        {"role": "user", "content": user_input},
    ]
    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    chat_input = tokenizer(prompt, return_tensors="pt").to(model.device)
    # Generate Response
    chat_outputs = model.generate(
        **chat_input,
        max_new_tokens=max_new_tokens,
        temperature=temperature,
        top_p=top_p,
        top_k=top_k,
        do_sample=True,
    )
    # Decode only the newly generated tokens (everything after the prompt)
    response = tokenizer.decode(chat_outputs[0][chat_input["input_ids"].shape[-1]:], skip_special_tokens=True)
    # Update History
    history.append({"role": "user", "content": user_input})
    history.append({"role": "assistant", "content": response})
    # Manage History Limits
    history = manage_history(history)
    return history, history
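
# A minimal sketch of calling generate_response outside Gradio (illustrative
# only; the argument values below are assumptions, not defaults defined
# anywhere else in this file):
def _generate_response_example():
    history, _ = generate_response(
        user_input="What is 1-bit quantization?",
        system_prompt="You are a helpful AI assistant.",
        max_new_tokens=50,
        temperature=0.7,
        top_p=0.9,
        top_k=50,
        history=[],
    )
    return history[-1]["content"]  # the assistant's reply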
# ── [ G R A D I O - I N T E R F A C E ]
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# BitNet b1.58 2B4T Demo")
    with gr.Row():
        with gr.Column():
            gr.Markdown("""
            ## About BitNet b1.58 2B4T
            BitNet b1.58 2B4T is the first open-source, native 1-bit large language model at the 2-billion-parameter scale,
            developed by Microsoft Research. Trained on 4 trillion tokens, it achieves performance comparable to
            full-precision models of similar size while offering significant efficiency gains in memory, energy, and latency.
            Features include:
            - Transformer-based architecture with BitLinear layers
            - Native 1.58-bit weights and 8-bit activations
            - Maximum context length of 4096 tokens
            - Optimized for efficient inference with bitnet.cpp
            """)
        with gr.Column():
            gr.Markdown("""
            ## About Tonic AI
            Tonic AI is a vibrant community of AI enthusiasts and developers who are always building cool demos and
            pushing the boundaries of what's possible with AI. We're passionate about creating innovative, accessible,
            and engaging AI experiences for everyone. Join us in exploring the future of AI!
            """)
    with gr.Row():
        with gr.Column():
            user_input = gr.Textbox(label="Your Message", placeholder="Type your message here...")
            system_prompt = gr.Textbox(
                label="System Prompt",
                value="You are a helpful AI assistant.",
                placeholder="Enter system prompt...",
            )
            with gr.Accordion("Advanced Options", open=False):
                max_new_tokens = gr.Slider(
                    minimum=10,
                    maximum=500,
                    value=50,
                    step=10,
                    label="Max New Tokens",
                )
                temperature = gr.Slider(
                    minimum=0.1,
                    maximum=2.0,
                    value=0.7,
                    step=0.1,
                    label="Temperature",
                )
                top_p = gr.Slider(
                    minimum=0.1,
                    maximum=1.0,
                    value=0.9,
                    step=0.05,
                    label="Top P",
                )
                top_k = gr.Slider(
                    minimum=1,
                    maximum=100,
                    value=50,
                    step=1,
                    label="Top K",
                )
            submit_btn = gr.Button("Send")
        with gr.Column():
            chatbot = gr.Chatbot(label="Conversation", type="messages")
            chat_history = gr.State([])
    submit_btn.click(
        fn=generate_response,
        inputs=[
            user_input,
            system_prompt,
            max_new_tokens,
            temperature,
            top_p,
            top_k,
            chat_history,
        ],
        outputs=[chatbot, chat_history],
    )
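
    # A possible refinement (sketch only, not wired up here): clear the message
    # box after each send by chaining a follow-up callback on the click event,
    # e.g. submit_btn.click(...).then(lambda: "", outputs=user_input)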
# ── [ M A I N ]
if __name__ == "__main__":
    load_model()  # warm the model cache before the UI starts serving requests
    demo.launch(ssr_mode=False, share=False)