Spaces:

davanstrien
/

magpie

Running on Zero

App Files Files Community

magpie / app.py

davanstrien HF Staff

Super-squash branch 'main' using huggingface_hub

5188e86 verified over 1 year ago

raw

history blame

2.97 kB

	import gradio as gr
	import transformers
	import torch
	import json
	from transformers import AutoTokenizer
	import os
	from huggingface_hub import login
	import spaces

	HF_TOKEN = os.getenv("HF_TOKEN")
	login(HF_TOKEN)
	# Load the model
	model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
	tokenizer = AutoTokenizer.from_pretrained(model_id, add_special_tokens=True)

	pipeline = transformers.pipeline(
	"text-generation",
	model=model_id,
	model_kwargs={"torch_dtype": torch.bfloat16},
	device="cuda",
	)

	# Load the model configuration
	with open("model_configs.json", "r") as f:
	model_configs = json.load(f)
	model_config = model_configs[model_id]

	# Extract instruction
	extract_input = model_config["extract_input"]


	@spaces.GPU
	def generate_instruction_response():
	terminators = [
	tokenizer.eos_token_id,
	tokenizer.convert_tokens_to_ids("<\|eot_id\|>"),
	]

	instruction = pipeline(
	extract_input,
	max_new_tokens=2048,
	eos_token_id=terminators,
	do_sample=True,
	temperature=1,
	top_p=1,
	)

	sanitized_instruction = instruction[0]["generated_text"][
	len(extract_input) :
	].split("\n")[0]

	response_template = f"""<\|begin_of_text\|><\|start_header_id\|>user<\|end_header_id\|>\n\n{sanitized_instruction}<\|eot_id\|><\|start_header_id\|>assistant<\|end_header_id\|>\n\n"""

	response = pipeline(
	response_template,
	max_new_tokens=2048,
	eos_token_id=terminators,
	do_sample=True,
	temperature=1,
	top_p=1,
	)

	user_message = sanitized_instruction
	assistant_response = response[0]["generated_text"][len(response_template) :]

	return user_message, assistant_response


	title = "Magpie demo"
	description = """
	This Gradio demo allows you to explore the approach outlined in the Magpie paper. "Magpie is a data synthesis pipeline that generates high-quality alignment data. Magpie does not rely on prompt engineering or seed questions. Instead, it directly constructs instruction data by prompting aligned LLMs with a pre-query template for sampling instructions." Essentially, instead of prompting the model with a question or a starting query, this approach relies on the pre-query template of the model to generate instructions. Essentially, you are giving the model only the template up to the point where a user instruction would start, and then the model generates the instruction and the response.

	In this demo, you can see how the model generates a user instruction and a model response.

	You can learn more about the approach [in the paper](https://huggingface.co/papers/2406.08464).
	"""
	# Create the Gradio interface
	iface = gr.Interface(
	fn=generate_instruction_response,
	inputs=[],
	outputs=[
	gr.Text(label="Generated User Instruction"),
	gr.Text(label="Generated Model Response"),
	],
	title=title,
	description=description,
	)

	# Launch the app
	iface.launch(debug=True)