import os
import json

from dotenv import load_dotenv
from openai import OpenAI

load_dotenv()
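# Environment variables the script expects in .env (read by getQwenClient and
# generateResponse below):
#   OPENAI_API_KEY   - API key for the OpenAI-compatible endpoint
#   OPENAI_API_BASE  - base URL of the endpoint (e.g. a self-hosted Qwen server)
#   MODEL            - model name passed to chat.completions.create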
# --------------------------------------------------------------------------------
# PROMPTS & CLIENT UTILS
# --------------------------------------------------------------------------------

COVERAGE_PROMPT = '''
Here is an academic survey about the topic "[TOPIC]":
---
[SURVEY]
---
<instruction>
Please evaluate this survey about the topic "[TOPIC]" based on the criterion provided below and give a score from 1 to 5 according to the score description:
---
Criterion Description: Coverage assesses the extent to which the survey encapsulates all relevant aspects of the topic.
---
Score 1 Description: The survey has very limited coverage, only touching on a small portion of the topic and lacking discussion on key areas.
Score 2 Description: The survey covers some parts of the topic but has noticeable omissions, with significant areas either underrepresented or missing.
Score 3 Description: The survey is generally comprehensive but still misses a few key points.
Score 4 Description: The survey covers most key areas comprehensively, with only very minor topics left out.
Score 5 Description: The survey comprehensively covers all key and peripheral topics, providing detailed discussions and extensive information.
---
Return the score without any other information:
'''

STRUCTURE_PROMPT = '''
Here is an academic survey about the topic "[TOPIC]":
---
[SURVEY]
---
<instruction>
Please evaluate this survey about the topic "[TOPIC]" based on the criterion provided below and give a score from 1 to 5 according to the score description:
---
Criterion Description: Structure evaluates the logical organization and coherence of sections and subsections.
---
Score 1 Description: The survey lacks logic, with no clear connections between sections.
Score 2 Description: The survey has weak logical flow with some disordered content.
Score 3 Description: The survey has a generally reasonable logical structure.
Score 4 Description: The survey has good logical consistency, with content well arranged.
Score 5 Description: The survey is tightly structured and logically clear.
---
Return the score without any other information:
'''

RELEVANCE_PROMPT = '''
Here is an academic survey about the topic "[TOPIC]":
---
[SURVEY]
---
<instruction>
Please evaluate this survey about the topic "[TOPIC]" based on the criterion provided below and give a score from 1 to 5 according to the score description:
---
Criterion Description: Relevance measures how well the content aligns with the research topic.
---
Score 1 Description: The content is outdated or unrelated to the field.
Score 2 Description: The survey is somewhat on topic but with several digressions.
Score 3 Description: The survey is generally on topic, despite a few unrelated details.
Score 4 Description: The survey is mostly on topic and focused.
Score 5 Description: The survey is exceptionally focused and entirely on topic.
---
Return the score without any other information:
'''
def getQwenClient():
    """Build an OpenAI-compatible client from environment variables."""
    openai_api_key = os.getenv("OPENAI_API_KEY")
    openai_api_base = os.getenv("OPENAI_API_BASE")
    client = OpenAI(
        api_key=openai_api_key,
        base_url=openai_api_base,
    )
    return client
def generateResponse(client, prompt):
    """Stream a chat completion and return the concatenated response text."""
    chat_response = client.chat.completions.create(
        model=os.environ.get("MODEL"),
        max_tokens=128,
        temperature=0.5,
        stop="<|im_end|>",
        stream=True,
        messages=[{"role": "user", "content": prompt}],
    )
    text = ""
    for chunk in chat_response:
        # Some stream chunks (e.g. the final usage chunk) carry no choices/content.
        if chunk.choices and chunk.choices[0].delta.content:
            text += chunk.choices[0].delta.content
    return text
def evaluate_survey(topic, survey_content, client, prompt_template):
    prompt = prompt_template.replace("[TOPIC]", topic).replace("[SURVEY]", survey_content)
    response = generateResponse(client, prompt)
    return response.strip()

def evaluate_coverage(topic, survey_content, client):
    return evaluate_survey(topic, survey_content, client, COVERAGE_PROMPT)

def evaluate_structure(topic, survey_content, client):
    return evaluate_survey(topic, survey_content, client, STRUCTURE_PROMPT)

def evaluate_relevance(topic, survey_content, client):
    return evaluate_survey(topic, survey_content, client, RELEVANCE_PROMPT)
# --------------------------------------------------------------------------------
# MAIN LOGIC
# --------------------------------------------------------------------------------
if __name__ == "__main__":
    client = getQwenClient()

    category_folders = [
        "Computer Science",
        "Mathematics",
        "Physics",
        "Statistics",
        "Electrical Engineering and Systems Science",
        "Quantitative Biology",
        "Quantitative Finance",
        "Economics"
    ]

    evaluation_results = {}

    for category in category_folders:
        if not os.path.isdir(category):
            # If the folder doesn't exist, skip it
            print(f"Skipping: '{category}' - directory not found.")
            continue

        # Initialize a dict for this category in the results
        evaluation_results[category] = {}

        # For each .md file found in this category folder
        for filename in os.listdir(category):
            # We only want .md files that follow the naming pattern "survey_{topic}.md"
            if filename.lower().endswith(".md") and filename.startswith("survey_"):
                # Extract the topic from the filename,
                # e.g. "survey_LLM for In-Context Learning.md" -> "LLM for In-Context Learning"
                topic = filename[len("survey_"):-len(".md")]
                md_file_path = os.path.join(category, filename)
                if not os.path.isfile(md_file_path):
                    continue

                # Read the content of the survey file
                with open(md_file_path, "r", encoding="utf-8") as f:
                    survey_content = f.read()

                # Evaluate
                try:
                    coverage_score = evaluate_coverage(topic, survey_content, client)
                    structure_score = evaluate_structure(topic, survey_content, client)
                    relevance_score = evaluate_relevance(topic, survey_content, client)

                    # Store in nested dictionary: results[category][topic] = ...
                    evaluation_results[category][topic] = {
                        "coverage": coverage_score,
                        "structure": structure_score,
                        "relevance": relevance_score
                    }
                    print(f"Evaluated: {category} / {topic}")
                except Exception as e:
                    print(f"Error evaluating '{category} / {topic}': {e}")

    # Write everything to a single JSON file
    output_file = "evaluation_results.json"
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(evaluation_results, f, indent=4, ensure_ascii=False)

    print(f"Evaluation completed. Results saved to: {output_file}")