import os
import json

from dotenv import load_dotenv
from openai import OpenAI

load_dotenv()
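# Environment variables the script expects in .env (read by getQwenClient and
# generateResponse below):
#   OPENAI_API_KEY   - API key for the OpenAI-compatible endpoint
#   OPENAI_API_BASE  - base URL of the endpoint (e.g. a self-hosted Qwen server)
#   MODEL            - model name passed to chat.completions.create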
# --------------------------------------------------------------------------------
# PROMPTS & CLIENT UTILS
# --------------------------------------------------------------------------------

COVERAGE_PROMPT = '''
Here is an academic survey about the topic "[TOPIC]":
---
[SURVEY]
---
<instruction>
Please evaluate this survey about the topic "[TOPIC]" based on the criterion provided below and give a score from 1 to 5 according to the score description:
---
Criterion Description: Coverage assesses the extent to which the survey encapsulates all relevant aspects of the topic.
---
Score 1 Description: The survey has very limited coverage, only touching on a small portion of the topic and lacking discussion on key areas.
Score 2 Description: The survey covers some parts of the topic but has noticeable omissions, with significant areas either underrepresented or missing.
Score 3 Description: The survey is generally comprehensive but still misses a few key points.
Score 4 Description: The survey covers most key areas comprehensively, with only very minor topics left out.
Score 5 Description: The survey comprehensively covers all key and peripheral topics, providing detailed discussions and extensive information.
---
Return the score without any other information:
'''

STRUCTURE_PROMPT = '''
Here is an academic survey about the topic "[TOPIC]":
---
[SURVEY]
---
<instruction>
Please evaluate this survey about the topic "[TOPIC]" based on the criterion provided below and give a score from 1 to 5 according to the score description:
---
Criterion Description: Structure evaluates the logical organization and coherence of sections and subsections.
---
Score 1 Description: The survey lacks logic, with no clear connections between sections.
Score 2 Description: The survey has weak logical flow with some disordered content.
Score 3 Description: The survey has a generally reasonable logical structure.
Score 4 Description: The survey has good logical consistency, with content well arranged.
Score 5 Description: The survey is tightly structured and logically clear.
---
Return the score without any other information:
'''

RELEVANCE_PROMPT = '''
Here is an academic survey about the topic "[TOPIC]":
---
[SURVEY]
---
<instruction>
Please evaluate this survey about the topic "[TOPIC]" based on the criterion provided below and give a score from 1 to 5 according to the score description:
---
Criterion Description: Relevance measures how well the content aligns with the research topic.
---
Score 1 Description: The content is outdated or unrelated to the field.
Score 2 Description: The survey is somewhat on topic but with several digressions.
Score 3 Description: The survey is generally on topic, despite a few unrelated details.
Score 4 Description: The survey is mostly on topic and focused.
Score 5 Description: The survey is exceptionally focused and entirely on topic.
---
Return the score without any other information:
'''
def getQwenClient():
    """Build an OpenAI-compatible client from environment variables."""
    openai_api_key = os.getenv("OPENAI_API_KEY")
    openai_api_base = os.getenv("OPENAI_API_BASE")
    client = OpenAI(
        api_key=openai_api_key,
        base_url=openai_api_base,
    )
    return client
def generateResponse(client, prompt):
    """Stream a chat completion and return the concatenated response text."""
    chat_response = client.chat.completions.create(
        model=os.environ.get("MODEL"),
        max_tokens=128,
        temperature=0.5,
        stop="<|im_end|>",
        stream=True,
        messages=[{"role": "user", "content": prompt}],
    )
    text = ""
    for chunk in chat_response:
        # Some stream chunks (e.g. the final usage chunk) carry no choices/content.
        if chunk.choices and chunk.choices[0].delta.content:
            text += chunk.choices[0].delta.content
    return text
def evaluate_survey(topic, survey_content, client, prompt_template):
    prompt = prompt_template.replace("[TOPIC]", topic).replace("[SURVEY]", survey_content)
    response = generateResponse(client, prompt)
    return response.strip()

def evaluate_coverage(topic, survey_content, client):
    return evaluate_survey(topic, survey_content, client, COVERAGE_PROMPT)

def evaluate_structure(topic, survey_content, client):
    return evaluate_survey(topic, survey_content, client, STRUCTURE_PROMPT)

def evaluate_relevance(topic, survey_content, client):
    return evaluate_survey(topic, survey_content, client, RELEVANCE_PROMPT)
# --------------------------------------------------------------------------------
# MAIN LOGIC
# --------------------------------------------------------------------------------
if __name__ == "__main__":
    client = getQwenClient()

    category_folders = [
        "Computer Science",
        "Mathematics",
        "Physics",
        "Statistics",
        "Electrical Engineering and Systems Science",
        "Quantitative Biology",
        "Quantitative Finance",
        "Economics"
    ]

    evaluation_results = {}

    for category in category_folders:
        if not os.path.isdir(category):
            # If the folder doesn't exist, skip it
            print(f"Skipping: '{category}' - directory not found.")
            continue

        # Initialize a dict for this category in the results
        evaluation_results[category] = {}

        # For each .md file found in this category folder
        for filename in os.listdir(category):
            # We only want .md files that follow the naming pattern "survey_{topic}.md"
            if filename.lower().endswith(".md") and filename.startswith("survey_"):
                # Extract the topic from the filename,
                # e.g. "survey_LLM for In-Context Learning.md" -> "LLM for In-Context Learning"
                topic = filename[len("survey_"):-len(".md")]
                md_file_path = os.path.join(category, filename)
                if not os.path.isfile(md_file_path):
                    continue

                # Read the content of the survey file
                with open(md_file_path, "r", encoding="utf-8") as f:
                    survey_content = f.read()

                # Evaluate
                try:
                    coverage_score = evaluate_coverage(topic, survey_content, client)
                    structure_score = evaluate_structure(topic, survey_content, client)
                    relevance_score = evaluate_relevance(topic, survey_content, client)

                    # Store in nested dictionary: results[category][topic] = ...
                    evaluation_results[category][topic] = {
                        "coverage": coverage_score,
                        "structure": structure_score,
                        "relevance": relevance_score
                    }
                    print(f"Evaluated: {category} / {topic}")
                except Exception as e:
                    print(f"Error evaluating '{category} / {topic}': {e}")

    # Write everything to a single JSON file
    output_file = "evaluation_results.json"
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(evaluation_results, f, indent=4, ensure_ascii=False)

    print(f"Evaluation completed. Results saved to: {output_file}")