import gradio as gr
from datasets import load_dataset, get_dataset_config_names
import random
import re
from typing import List, Tuple
import logging

# Set up logging
logging.basicConfig(
    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
)
# Popular evaluation datasets with their configurations
EVAL_DATASETS = {
    "openai/gsm8k": {
        "name": "GSM8K - Grade School Math",
        "type": "qa",
        "config": "main",
        "question_field": "question",
        "answer_field": "answer",
        "split": "train",
    },
    "cais/mmlu": {
        "name": "MMLU - Massive Multitask Language Understanding",
        "type": "multiple_choice",
        "config": "all",
        "question_field": "question",
        "choices_field": "choices",
        "answer_field": "answer",
        "split": "test",
    },
    "allenai/ai2_arc": {
        "name": "AI2 ARC - Science Questions",
        "type": "multiple_choice",
        "config": "ARC-Challenge",
        "question_field": "question",
        "choices_field": "choices",
        "answer_field": "answerKey",
        "split": "train",
    },
    "Rowan/hellaswag": {
        "name": "HellaSwag - Commonsense NLI",
        "type": "multiple_choice",
        "question_field": "ctx",
        "choices_field": "endings",
        "answer_field": "label",
        "split": "train",
    },
    "allenai/winogrande": {
        "name": "WinoGrande - Winograd Schema",
        "type": "binary_choice",
        "config": "winogrande_xl",
        "question_field": "sentence",
        "option1_field": "option1",
        "option2_field": "option2",
        "answer_field": "answer",
        "split": "train",
    },
    "google/boolq": {
        "name": "BoolQ - Boolean Questions",
        "type": "true_false",
        "question_field": "question",
        "context_field": "passage",
        "answer_field": "answer",
        "split": "train",
    },
    "rajpurkar/squad": {
        "name": "SQuAD - Reading Comprehension",
        "type": "extractive_qa",
        "question_field": "question",
        "context_field": "context",
        "answer_field": "answers",
        "split": "train",
    },
    "allenai/piqa": {
        "name": "PIQA - Physical Reasoning",
        "type": "binary_choice",
        "question_field": "goal",
        "option1_field": "sol1",
        "option2_field": "sol2",
        "answer_field": "label",
        "split": "train",
    },
}
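
# To register another benchmark, append an entry that follows the same schema.
# The entry below is purely illustrative (hypothetical dataset id; the field
# names would have to match the real dataset's columns):
#
# "org/my_eval_set": {
#     "name": "My Eval Set - Example Entry",
#     "type": "multiple_choice",
#     "config": "default",  # omit if the dataset needs no config
#     "question_field": "question",
#     "choices_field": "choices",
#     "answer_field": "answer",
#     "split": "validation",
# },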


class QuizApp:
    def __init__(self):
        self.current_dataset = None
        self.current_dataset_name = None
        self.questions = []
        self.current_question_idx = 0
        self.score = 0
        self.total_questions = 0

    def load_dataset_questions(self, dataset_name: str, num_questions: int = 10):
        """Load random questions from the selected dataset"""
        try:
            config = EVAL_DATASETS[dataset_name]

            # Try to load dataset with config if specified
            try:
                if "config" in config:
                    dataset = load_dataset(
                        dataset_name, config["config"], split=config["split"]
                    )
                else:
                    dataset = load_dataset(dataset_name, split=config["split"])
            except ValueError as e:
                # If config is missing, try to get available configs
                if "Config name is missing" in str(e):
                    configs = get_dataset_config_names(dataset_name)
                    # Use first config or "all" if available
                    if "all" in configs:
                        selected_config = "all"
                    else:
                        selected_config = configs[0]
                    print(
                        f"Auto-selected config '{selected_config}' for {dataset_name}"
                    )
                    dataset = load_dataset(
                        dataset_name, selected_config, split=config["split"]
                    )
                else:
                    raise e

            # Sample random questions
            total_examples = len(dataset)
            num_questions = min(num_questions, total_examples)
            indices = random.sample(range(total_examples), num_questions)

            self.questions = []
            for idx in indices:
                example = dataset[idx]
                self.questions.append(example)

            self.current_dataset = config
            self.current_dataset_name = dataset_name
            self.current_question_idx = 0
            self.score = 0
            self.total_questions = len(self.questions)

            return True, f"Loaded {num_questions} questions from {config['name']}"
        except Exception as e:
            return False, f"Error loading dataset: {str(e)}"

    def get_current_question(self) -> Tuple[str, List[str], str]:
        """Get the current question formatted for display"""
        if not self.questions or self.current_question_idx >= len(self.questions):
            return "", [], ""

        question_data = self.questions[self.current_question_idx]
        config = self.current_dataset

        logging.info(f"\n{'=' * 60}")
        logging.info(f"Dataset: {self.current_dataset_name}")
        logging.info(
            f"Question {self.current_question_idx + 1}/{self.total_questions}"
        )
        logging.info(f"Raw question data: {repr(question_data)}")
        logging.info(f"{'=' * 60}\n")

        # Format question based on dataset type
        question_type = config["type"]

        if question_type == "multiple_choice":
            question = question_data[config["question_field"]]
            choices = question_data[config["choices_field"]]
            # Some datasets (e.g. AI2 ARC) store choices as parallel lists in a
            # dict ({"text": [...], "label": [...]}); display the text list
            if isinstance(choices, dict) and "text" in choices:
                choices = choices["text"]
            if config["answer_field"] in question_data:
                answer = question_data[config["answer_field"]]
            else:
                answer = ""
            # Format choices with letters
            formatted_choices = [
                f"{chr(65 + i)}. {choice}" for i, choice in enumerate(choices)
            ]
            return question, formatted_choices, question_type

        elif question_type == "true_false":
            question = question_data[config["question_field"]]
            if "context_field" in config:
                context = question_data[config["context_field"]]
                question = f"Context: {context}\n\nQuestion: {question}"
            return question, ["True", "False"], question_type

        elif question_type == "binary_choice":
            question = question_data[config["question_field"]]
            option1 = question_data[config["option1_field"]]
            option2 = question_data[config["option2_field"]]
            return question, [f"A. {option1}", f"B. {option2}"], question_type

        elif question_type == "qa" or question_type == "extractive_qa":
            question = question_data[config["question_field"]]
            if "context_field" in config and config["context_field"] in question_data:
                context = question_data[config["context_field"]]
                question = f"Context: {context[:500]}...\n\nQuestion: {question}"
            return question, [], question_type

        return "", [], ""

    def format_answer(self, answer: str, dataset_name: str) -> str:
        """Format answer based on dataset type for better readability"""

        # Convert <<equation>> to show the math clearly: extract the equation
        # and its result, and show just the result with the equation in
        # parentheses
        def format_equation(match):
            equation = match.group(1)
            # Check if it's in the format "calculation=result"
            if "=" in equation:
                parts = equation.split("=")
                if len(parts) == 2:
                    calculation, result = parts[0], parts[1]
                    return f"{result} (={calculation})"
            return f"[{equation}]"

        answer = re.sub(r"<<([^>]+)>>", format_equation, answer)

        # Dataset-specific formatting
        if dataset_name == "openai/gsm8k":
            # Format the final answer line
            answer = answer.replace("####", "\n\nFinal Answer:")
            # Ensure proper line breaks after periods for readability
            answer = re.sub(r"\. (?=[A-Z])", ".\n", answer)

        return answer
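
    # Illustrative note (hypothetical input): for a GSM8K-style solution such
    # as "Tom buys <<3*4=12>>12 eggs. #### 12", format_answer rewrites each
    # "<<calc=result>>" span as "result (=calc)" and turns the "####" marker
    # into a "Final Answer:" line.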

    def check_answer(self, user_answer: str) -> Tuple[bool, str]:
        """Check if the user's answer is correct"""
        if not self.questions or self.current_question_idx >= len(self.questions):
            return False, "No question available"

        question_data = self.questions[self.current_question_idx]
        config = self.current_dataset
        question_type = config["type"]

        if question_type == "multiple_choice":
            correct_answer_idx = question_data[config["answer_field"]]
            # Some datasets (e.g. HellaSwag) store the 0-based answer index as
            # a string such as "2", so normalize digit strings to integers
            if isinstance(correct_answer_idx, str) and correct_answer_idx.isdigit():
                correct_answer_idx = int(correct_answer_idx)

            # Handle both numeric and letter answers
            if isinstance(correct_answer_idx, int):
                correct_letter = chr(65 + correct_answer_idx)
            else:
                correct_letter = str(correct_answer_idx)

            # Take the first character of the user's choice ("A. ..." -> "A");
            # slicing avoids an IndexError on a whitespace-only answer
            user_letter = user_answer.strip().upper()[:1] if user_answer else ""
            is_correct = user_letter == correct_letter

            if is_correct:
                return True, "✅ Correct!"
            else:
                choices = question_data[config["choices_field"]]
                # As above, unwrap dict-shaped choices (e.g. AI2 ARC)
                if isinstance(choices, dict) and "text" in choices:
                    choices = choices["text"]
                correct_choice = (
                    choices[correct_answer_idx]
                    if isinstance(correct_answer_idx, int)
                    else correct_answer_idx
                )
                logging.info(f"Raw answer (multiple choice): {repr(correct_choice)}")
                formatted_answer = self.format_answer(
                    correct_choice, self.current_dataset_name
                )
                return (
                    False,
                    f"❌ Incorrect\n\nThe correct answer was {correct_letter}:\n\n{formatted_answer}",
                )
        elif question_type == "true_false":
            correct_answer = question_data[config["answer_field"]]
            user_bool = user_answer.lower().strip() == "true"
            is_correct = user_bool == correct_answer

            if is_correct:
                return True, "✅ Correct!"
            else:
                return (
                    False,
                    f"❌ Incorrect\n\nThe correct answer was {correct_answer}",
                )
        elif question_type == "binary_choice":
            correct_answer = question_data[config["answer_field"]]
            # Normalize to a 0-based index: PIQA's "label" is already 0/1,
            # while WinoGrande's "answer" is the string "1" or "2"
            if isinstance(correct_answer, str) and correct_answer in ("1", "2"):
                correct_answer_idx = int(correct_answer) - 1
            else:
                correct_answer_idx = int(correct_answer)
            user_idx = 0 if user_answer.strip().upper().startswith("A") else 1
            is_correct = user_idx == correct_answer_idx

            if is_correct:
                return True, "✅ Correct!"
            else:
                correct_letter = "A" if correct_answer_idx == 0 else "B"
                option_field = (
                    config["option1_field"]
                    if correct_answer_idx == 0
                    else config["option2_field"]
                )
                correct_option = question_data[option_field]
                logging.info(f"Raw answer (binary choice): {repr(correct_option)}")
                formatted_answer = self.format_answer(
                    correct_option, self.current_dataset_name
                )
                return (
                    False,
                    f"❌ Incorrect\n\nThe correct answer was {correct_letter}:\n\n{formatted_answer}",
                )
        elif question_type in ["qa", "extractive_qa"]:
            # For QA we do a simple check - a real app would want more
            # sophisticated answer matching
            correct_answer = question_data[config["answer_field"]]

            if isinstance(correct_answer, dict) and "text" in correct_answer:
                correct_answer = (
                    correct_answer["text"][0] if correct_answer["text"] else ""
                )
            elif isinstance(correct_answer, list) and len(correct_answer) > 0:
                correct_answer = (
                    correct_answer[0]["text"]
                    if isinstance(correct_answer[0], dict)
                    else str(correct_answer[0])
                )
            else:
                correct_answer = str(correct_answer)

            # For GSM8K and similar datasets, extract the final answer after "####"
            if "####" in correct_answer:
                final_answer_match = re.search(r"####\s*(.+)", correct_answer)
                if final_answer_match:
                    final_answer = final_answer_match.group(1).strip()
                else:
                    final_answer = correct_answer
            else:
                final_answer = correct_answer

            # Pre-define the extracted number lists so the logging below still
            # works when the user submitted an empty answer
            correct_numbers = []
            user_numbers = []

            # First check if the user answer is empty
            if not user_answer or not user_answer.strip():
                is_correct = False
            else:
                # Extract numbers from both answers for comparison (dropping
                # thousands separators such as "1,080" first)
                correct_numbers = re.findall(
                    r"-?\d+\.?\d*", final_answer.replace(",", "")
                )
                user_numbers = re.findall(r"-?\d+\.?\d*", user_answer.replace(",", ""))

                # Check if the answers match
                is_correct = False

                # If both have numbers, compare the numbers
                if correct_numbers and user_numbers:
                    # Convert to float for comparison to handle decimals
                    try:
                        # Take the last number in each answer as the final answer
                        correct_num = float(correct_numbers[-1])
                        user_num = float(user_numbers[-1])
                        # Small tolerance for float comparison
                        is_correct = abs(correct_num - user_num) < 0.0001
                    except ValueError:
                        # Fall back to string comparison
                        is_correct = correct_numbers[-1] == user_numbers[-1]
                elif correct_numbers and not user_numbers:
                    # The correct answer is numeric but the user answer is not
                    is_correct = False
                else:
                    # Fall back to substring matching for non-numeric answers,
                    # but ensure both strings are non-empty
                    is_correct = (
                        user_answer.lower().strip() in correct_answer.lower()
                        or correct_answer.lower() in user_answer.lower().strip()
                    ) and len(user_answer.strip()) > 0

            if is_correct:
                return True, "✅ Correct!"
            else:
                logging.info(f"Raw answer (QA): {repr(correct_answer)}")
                logging.info(f"Extracted final answer: {repr(final_answer)}")
                logging.info(
                    f"Correct numbers: {correct_numbers}, User numbers: {user_numbers}"
                )
                formatted_answer = self.format_answer(
                    correct_answer, self.current_dataset_name
                )
                # Debug: log the formatted answer
                logging.info(f"Formatted answer: {repr(formatted_answer)}")
                return (
                    False,
                    f"❌ Incorrect\n\nThe correct answer was:\n\n{formatted_answer}",
                )

        return False, "Unknown question type"


# Create global quiz app instance
quiz_app = QuizApp()


def create_dataset_display():
    """Create the dataset listing display"""
    dataset_info = []
    for dataset_id, config in EVAL_DATASETS.items():
        dataset_info.append(
            f"**{config['name']}**\n- Dataset: {dataset_id}\n- Type: {config['type']}"
        )
    return "\n\n".join(dataset_info)


def start_quiz(dataset_choice: str, num_questions: int):
    """Start a new quiz with the selected dataset"""
    # Extract dataset ID from the choice
    dataset_id = None
    for did, config in EVAL_DATASETS.items():
        if config["name"] in dataset_choice:
            dataset_id = did
            break

    if not dataset_id:
        return (
            "Please select a dataset",
            gr.update(visible=False),  # question_display
            gr.update(visible=False),  # answer_radio
            gr.update(visible=False),  # answer_textbox
            gr.update(visible=False),  # submit_button
            gr.update(visible=False),  # progress_text
        )

    success, message = quiz_app.load_dataset_questions(dataset_id, num_questions)

    if success:
        question, choices, q_type = quiz_app.get_current_question()

        if q_type in ["multiple_choice", "true_false", "binary_choice"]:
            return (
                message,
                gr.update(value=question, visible=True),  # question_display
                gr.update(choices=choices, visible=True, value=None),  # answer_radio
                gr.update(visible=False),  # answer_textbox
                gr.update(visible=True),  # submit_button
                gr.update(
                    value=f"Question 1/{quiz_app.total_questions}", visible=True
                ),  # progress_text
            )
        else:
            return (
                message,
                gr.update(value=question, visible=True),  # question_display
                gr.update(visible=False),  # answer_radio
                gr.update(visible=True, value=""),  # answer_textbox
                gr.update(visible=True),  # submit_button
                gr.update(
                    value=f"Question 1/{quiz_app.total_questions}", visible=True
                ),  # progress_text
            )
    else:
        return (
            message,
            gr.update(visible=False),  # question_display
            gr.update(visible=False),  # answer_radio
            gr.update(visible=False),  # answer_textbox
            gr.update(visible=False),  # submit_button
            gr.update(visible=False),  # progress_text
        )


def submit_answer(answer_choice, answer_text):
    """Submit answer and show feedback"""
    # Determine which answer to use
    if answer_choice:
        answer = answer_choice
    else:
        answer = answer_text

    is_correct, feedback = quiz_app.check_answer(answer)
    if is_correct:
        quiz_app.score += 1

    return gr.update(value=feedback, visible=True), gr.update(visible=True)


def next_question():
    """Move to the next question"""
    quiz_app.current_question_idx += 1

    if quiz_app.current_question_idx >= quiz_app.total_questions:
        # Quiz complete
        final_score = (
            f"🎉 Quiz Complete!\n\n"
            f"Your score: {quiz_app.score}/{quiz_app.total_questions} "
            f"({quiz_app.score / quiz_app.total_questions * 100:.1f}%)"
        )
        return (
            gr.update(value=final_score, visible=True),  # feedback_display
            "",  # question_display
            gr.update(visible=False),  # answer_radio
            gr.update(visible=False),  # answer_textbox
            gr.update(visible=False),  # submit_button
            gr.update(visible=False),  # next_button
            "Quiz Complete",  # progress_text
        )

    question, choices, q_type = quiz_app.get_current_question()

    if q_type in ["multiple_choice", "true_false", "binary_choice"]:
        return (
            gr.update(value="", visible=False),  # Clear feedback
            gr.update(value=question),  # question_display
            gr.update(choices=choices, visible=True, value=None),  # answer_radio
            gr.update(visible=False),  # answer_textbox
            gr.update(visible=True),  # submit_button
            gr.update(visible=False),  # next_button
            gr.update(
                value=f"Question {quiz_app.current_question_idx + 1}/{quiz_app.total_questions}"
            ),  # progress_text
        )
    else:
        return (
            gr.update(value="", visible=False),  # Clear feedback
            gr.update(value=question),  # question_display
            gr.update(visible=False),  # answer_radio
            gr.update(visible=True, value=""),  # answer_textbox
            gr.update(visible=True),  # submit_button
            gr.update(visible=False),  # next_button
            gr.update(
                value=f"Question {quiz_app.current_question_idx + 1}/{quiz_app.total_questions}"
            ),  # progress_text
        )


# Create Gradio interface
with gr.Blocks(title="HuggingFace Evaluation Dataset Quiz") as demo:
    gr.Markdown("# 🤗 Evaluation Dataset Quiz")
    gr.Markdown(
        "Test yourself with questions from popular HuggingFace evaluation datasets!"
    )

    # Dataset Selection Section
    with gr.Row():
        dataset_dropdown = gr.Dropdown(
            choices=[config["name"] for config in EVAL_DATASETS.values()],
            label="Select Dataset",
            value=list(EVAL_DATASETS.values())[0]["name"],
        )
        num_questions_slider = gr.Slider(
            minimum=5, maximum=20, value=10, step=1, label="Number of Questions"
        )

    start_button = gr.Button("Start Quiz", variant="primary")
    status_message = gr.Textbox(label="Status", interactive=False)

    # Quiz Section - shown when quiz starts
    gr.Markdown("---")  # Separator

    progress_text = gr.Textbox(
        label="Progress", value="0/0", interactive=False, visible=False
    )
    question_display = gr.Textbox(
        label="Question", lines=5, interactive=False, visible=False
    )

    # Answer inputs (one will be visible at a time)
    answer_radio = gr.Radio(label="Select your answer", visible=False)
    answer_textbox = gr.Textbox(label="Type your answer (Raw number)", visible=False)

    submit_button = gr.Button("Submit Answer", variant="primary", visible=False)

    feedback_display = gr.Textbox(
        label="Feedback",
        visible=False,
        lines=10,
        max_lines=20,
        interactive=False,
    )

    next_button = gr.Button("Next Question", visible=False)

    # Connect events
    start_button.click(
        start_quiz,
        inputs=[dataset_dropdown, num_questions_slider],
        outputs=[
            status_message,
            question_display,
            answer_radio,
            answer_textbox,
            submit_button,
            progress_text,
        ],
    )

    submit_button.click(
        submit_answer,
        inputs=[answer_radio, answer_textbox],
        outputs=[feedback_display, next_button],
    )

    next_button.click(
        next_question,
        outputs=[
            feedback_display,
            question_display,
            answer_radio,
            answer_textbox,
            submit_button,
            next_button,
            progress_text,
        ],
    )

if __name__ == "__main__":
    demo.launch()
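
# A minimal smoke-test sketch for exercising QuizApp outside the Gradio UI
# (assumes network access to the Hugging Face Hub so load_dataset can run):
#
#   app = QuizApp()
#   ok, msg = app.load_dataset_questions("openai/gsm8k", num_questions=3)
#   print(msg)
#   if ok:
#       question, choices, q_type = app.get_current_question()
#       print(q_type, question[:80])
#       print(app.check_answer("42"))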