import concurrent.futures
import gc
import json
import os

from datasets import Dataset, load_dataset
from huggingface_hub import HfApi
from huggingface_hub.utils import RepositoryNotFoundError
from openai import OpenAI
from tqdm.auto import tqdm
from vllm import LLM, SamplingParams

OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]
DATASET_HUGGINGFACE_WORKSPACE = os.environ["DATASET_HUGGINGFACE_WORKSPACE"]
MODEL_HUGGINGFACE_WORKSPACE = os.environ["MODEL_HUGGINGFACE_WORKSPACE"]
# Parse IS_DUMMY as a boolean: os.environ.get() returns a string, so any non-empty
# value (even "false") would otherwise evaluate as truthy.
IS_DUMMY = os.environ.get("IS_DUMMY", "false").lower() in ("true", "1", "yes")

print("====== EVAL PARAMETERS ======")  # noqa
print(f"{DATASET_HUGGINGFACE_WORKSPACE=}")  # noqa
print(f"{MODEL_HUGGINGFACE_WORKSPACE=}")  # noqa
print(f"{IS_DUMMY=}")  # noqa
print("=============================")  # noqa

def generate_answers(model_id: str, dataset_name: str):
    """Generate an answer for every test instruction with vLLM and push the results to the Hub."""

    def format(sample):
        # Alpaca-style prompt template.
        return "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\n{}\n\n### Response:\n".format(
            sample["instruction"]
        )

    dataset = load_dataset(dataset_name, split="test")
    if IS_DUMMY:
        try:
            dataset = dataset.select(range(10))
        except Exception:
            print("Dummy mode active. Failed to trim the dataset to 10 samples.")  # noqa
    print(f"Dataset size: {len(dataset)}")  # noqa
    dataset = dataset.map(lambda sample: {"prompt": format(sample)})

    print(f"Generating answers for {model_id}")  # noqa
    llm = LLM(model=model_id, max_model_len=2048)
    sampling_params = SamplingParams(temperature=0.8, top_p=0.95, min_p=0.05, max_tokens=2048)
    outputs = llm.generate(dataset["prompt"], sampling_params)
    answers = [output.outputs[0].text for output in outputs]

    dataset = dataset.add_column("answers", answers)
    print(f"Uploading results for {model_id}")  # noqa
    dataset.push_to_hub(f"{DATASET_HUGGINGFACE_WORKSPACE}/{model_id.split('/')[-1]}-results")

    gc.collect()
    return dataset

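# Note (assumption, not in the original script): gc.collect() alone may not free the
# GPU memory held by the vLLM engine, so loading the next model in the same process
# can run out of memory. A common workaround is to also drop the engine explicitly
# inside generate_answers(), e.g.:
#   del llm
#   torch.cuda.empty_cache()  # requires `import torch`
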
def evaluate_answer(instruction: str, answer: str, client: OpenAI) -> dict:
    """Ask the LLM judge (gpt-4o-mini) to score one answer for accuracy and style."""
    prompt = f"""You are an expert judge. Please evaluate the quality of a given answer to an instruction based on two criteria:
1. Accuracy: How factually correct is the information presented in the answer? You are a technical expert in this topic.
2. Style: Is the tone and writing style appropriate for a blog post or social media content? It should use simple but technical words and avoid formal or academic language.

Accuracy scale:
1 (Poor): Contains factual errors or misleading information
2 (Good): Mostly accurate with minor errors or omissions
3 (Excellent): Highly accurate and comprehensive

Style scale:
1 (Poor): Too formal, uses some overly complex words
2 (Good): Good balance of technical content and accessibility, but still uses formal words and expressions
3 (Excellent): Perfectly accessible language for blog/social media, uses simple but precise technical terms when necessary

Example of bad style: The Llama2 7B model constitutes a noteworthy progression in the field of artificial intelligence, serving as the successor to its predecessor, the original Llama architecture.
Example of excellent style: Llama2 7B outperforms the original Llama model across multiple benchmarks.

Instruction: {instruction}

Answer: {answer}

Provide your evaluation in JSON format with the following structure:
{{
    "accuracy": {{
        "analysis": "...",
        "score": 0
    }},
    "style": {{
        "analysis": "...",
        "score": 0
    }}
}}
"""
    completion = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {
                "role": "system",
                "content": "You are a helpful assistant who evaluates answers based on accuracy and style. Provide your response in JSON format with a short analysis and score for each criterion.",
            },
            {"role": "user", "content": prompt},
        ],
        response_format={"type": "json_object"},
        max_tokens=1000,
        temperature=0.9,
    )

    # Parse the structured output
    return json.loads(completion.choices[0].message.content)

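# Illustrative example of the dict evaluate_answer() returns (values are made up,
# but the keys follow the JSON schema requested in the prompt above):
# {
#     "accuracy": {"analysis": "The benchmark claims check out.", "score": 3},
#     "style": {"analysis": "Reads well for a blog post.", "score": 2},
# }
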
def evaluate_batch(batch, start_index):
    client = OpenAI(api_key=OPENAI_API_KEY)
    return [(i, evaluate_answer(instr, ans, client)) for i, (instr, ans) in enumerate(batch, start=start_index)]

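# evaluate_batch() builds its own OpenAI client, so each worker thread spawned by
# evaluate_answers() below uses an independent client rather than sharing one.
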
def evaluate_answers(model_id: str, num_threads: int = 10, batch_size: int = 5) -> Dataset:
    """Score every generated answer with the LLM judge and push the enriched dataset to the Hub."""
    # Load the dataset of generated answers
    dataset = load_dataset(f"{DATASET_HUGGINGFACE_WORKSPACE}/{model_id.split('/')[-1]}-results", split="all")

    # Create batches of instruction-answer pairs with their original indices
    batches = [
        (i, list(zip(dataset["instruction"][i : i + batch_size], dataset["answers"][i : i + batch_size], strict=False)))
        for i in range(0, len(dataset), batch_size)
    ]

    evaluations = [None] * len(dataset)
    with concurrent.futures.ThreadPoolExecutor(max_workers=num_threads) as executor:
        futures = [executor.submit(evaluate_batch, batch, start_index) for start_index, batch in batches]
        for future in tqdm(concurrent.futures.as_completed(futures), total=len(futures)):
            for index, evaluation in future.result():
                evaluations[index] = evaluation

    # Replace the 'evaluation' column if it exists, otherwise add it
    if "evaluation" in dataset.column_names:
        dataset = dataset.remove_columns(["evaluation"])
    dataset = dataset.add_column("evaluation", evaluations)

    # Post-process evaluations into flat accuracy/style score columns
    accuracy_scores = []
    style_scores = []
    for evaluation in dataset["evaluation"]:
        try:
            eval_dict = json.loads(evaluation) if isinstance(evaluation, str) else evaluation
            accuracy_score = eval_dict["accuracy"]["score"]
            style_score = eval_dict["style"]["score"]
            accuracy_scores.append(accuracy_score)
            style_scores.append(style_score)
        except (json.JSONDecodeError, KeyError, TypeError):
            # If there's an error, append None to maintain alignment
            accuracy_scores.append(None)
            style_scores.append(None)

    # Add new columns to the dataset, replacing them if they already exist
    if "accuracy" in dataset.column_names:
        dataset = dataset.remove_columns(["accuracy"])
    dataset = dataset.add_column("accuracy", accuracy_scores)
    if "style" in dataset.column_names:
        dataset = dataset.remove_columns(["style"])
    dataset = dataset.add_column("style", style_scores)

    dataset.push_to_hub(f"{DATASET_HUGGINGFACE_WORKSPACE}/{model_id.split('/')[-1]}-results")
    return dataset

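# Example call (hypothetical, mirroring the defaults): re-score an existing results
# dataset with 10 threads and batches of 5 instruction-answer pairs.
#   evaluate_answers("mlabonne/TwinLlama-3.1-8B", num_threads=10, batch_size=5)
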
def check_if_huggingface_model_exists(model_id: str, default_value: str) -> str:
    api = HfApi()
    try:
        api.model_info(model_id)
        print(f"Found model on HF: '{model_id}'.")  # noqa
    except RepositoryNotFoundError:
        print(f"Model '{model_id}' does not exist.")  # noqa
        model_id = default_value
        print(f"Defaulting to '{model_id}'")  # noqa
        print("Train your own model to avoid this behavior.")  # noqa
    return model_id


def check_if_huggingface_dataset_exists(dataset_id: str, default_value: str) -> str:
    api = HfApi()
    try:
        api.dataset_info(dataset_id)
        print(f"Found dataset on HF: '{dataset_id}'.")  # noqa
    except RepositoryNotFoundError:
        print(f"Dataset '{dataset_id}' does not exist.")  # noqa
        dataset_id = default_value
        print(f"Defaulting to '{dataset_id}'")  # noqa
        print("Use a valid dataset or create your own to avoid this behavior.")  # noqa
    return dataset_id

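# Models to compare: judging by their names, the fine-tuned twin (SFT), its
# DPO-aligned variant, and the off-the-shelf Llama-3.1-8B-Instruct baseline.
# If they are missing from MODEL_HUGGINGFACE_WORKSPACE, the public mlabonne
# checkpoints are used as fallbacks.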
model_ids = [
    check_if_huggingface_model_exists(
        f"{MODEL_HUGGINGFACE_WORKSPACE}/TwinLlama-3.1-8B", default_value="mlabonne/TwinLlama-3.1-8B"
    ),
    check_if_huggingface_model_exists(
        f"{MODEL_HUGGINGFACE_WORKSPACE}/TwinLlama-3.1-8B-DPO", default_value="mlabonne/TwinLlama-3.1-8B-DPO"
    ),
    "meta-llama/Llama-3.1-8B-Instruct",
]

if __name__ == "__main__":
    # Run generation
    for model_id in model_ids:
        dataset_name = check_if_huggingface_dataset_exists(
            f"{DATASET_HUGGINGFACE_WORKSPACE}/llmtwin", default_value="mlabonne/llmtwin"
        )
        generate_answers(model_id, dataset_name=dataset_name)

    # Run evaluation
    for model_id in model_ids:
        evaluate_answers(model_id)

    # Analyze results (skip rows where the judge's output could not be parsed and
    # the score is therefore None)
    for model_id in model_ids:
        dataset = load_dataset(f"{DATASET_HUGGINGFACE_WORKSPACE}/{model_id.split('/')[-1]}-results", split="all")
        accuracy_scores = [score for score in dataset["accuracy"] if score is not None]
        style_scores = [score for score in dataset["style"] if score is not None]
        score = sum(accuracy_scores) / len(accuracy_scores)
        print(f"{model_id.split('/')[-1]} - Accuracy: {score:.2f}")  # noqa
        score = sum(style_scores) / len(style_scores)
        print(f"{model_id.split('/')[-1]} - Style: {score:.2f}")  # noqa