diff --git "a/main_log.txt" "b/main_log.txt" new file mode 100644--- /dev/null +++ "b/main_log.txt" @@ -0,0 +1,7974 @@ +🔥 Starting benchmark for meta-llama_Llama-3.2-1B-Instruct +Passed argument batch_size = auto:1. Detecting largest batch size +Determined largest batch size: 2 +Passed argument batch_size = auto. Detecting largest batch size +Determined Largest batch size: 1 +hf (pretrained=/home/jaymin/Documents/llm/llm_models/meta-llama_Llama-3.2-1B-Instruct), gen_kwargs: (None), limit: None, num_fewshot: None, batch_size: auto (2) +| Tasks |Version| Filter |n-shot| Metric | | Value | |Stderr| +|----------------------------------------------------------|------:|-----------------|-----:|-----------|---|------:|---|-----:| +|anli_r1 | 1|none | 0|acc |↑ | 0.3380|± |0.0150| +|anli_r2 | 1|none | 0|acc |↑ | 0.3340|± |0.0149| +|anli_r3 | 1|none | 0|acc |↑ | 0.3725|± |0.0140| +|arc_challenge | 1|none | 0|acc |↑ | 0.3567|± |0.0140| +| | |none | 0|acc_norm |↑ | 0.3805|± |0.0142| +|bbh | 3|get-answer | |exact_match|↑ | 0.3781|± |0.0055| +| - bbh_cot_fewshot_boolean_expressions | 4|get-answer | 3|exact_match|↑ | 0.7600|± |0.0271| +| - bbh_cot_fewshot_causal_judgement | 4|get-answer | 3|exact_match|↑ | 0.4225|± |0.0362| +| - bbh_cot_fewshot_date_understanding | 4|get-answer | 3|exact_match|↑ | 0.5320|± |0.0316| +| - bbh_cot_fewshot_disambiguation_qa | 4|get-answer | 3|exact_match|↑ | 0.4760|± |0.0316| +| - bbh_cot_fewshot_dyck_languages | 4|get-answer | 3|exact_match|↑ | 0.0280|± |0.0105| +| - bbh_cot_fewshot_formal_fallacies | 4|get-answer | 3|exact_match|↑ | 0.5200|± |0.0317| +| - bbh_cot_fewshot_geometric_shapes | 4|get-answer | 3|exact_match|↑ | 0.2640|± |0.0279| +| - bbh_cot_fewshot_hyperbaton | 4|get-answer | 3|exact_match|↑ | 0.5760|± |0.0313| +| - bbh_cot_fewshot_logical_deduction_five_objects | 4|get-answer | 3|exact_match|↑ | 0.2320|± |0.0268| +| - bbh_cot_fewshot_logical_deduction_seven_objects | 4|get-answer | 3|exact_match|↑ | 0.1440|± |0.0222| +| - bbh_cot_fewshot_logical_deduction_three_objects | 4|get-answer | 3|exact_match|↑ | 0.3680|± |0.0306| +| - bbh_cot_fewshot_movie_recommendation | 4|get-answer | 3|exact_match|↑ | 0.4720|± |0.0316| +| - bbh_cot_fewshot_multistep_arithmetic_two | 4|get-answer | 3|exact_match|↑ | 0.2400|± |0.0271| +| - bbh_cot_fewshot_navigate | 4|get-answer | 3|exact_match|↑ | 0.6080|± |0.0309| +| - bbh_cot_fewshot_object_counting | 4|get-answer | 3|exact_match|↑ | 0.5320|± |0.0316| +| - bbh_cot_fewshot_penguins_in_a_table | 4|get-answer | 3|exact_match|↑ | 0.4315|± |0.0411| +| - bbh_cot_fewshot_reasoning_about_colored_objects | 4|get-answer | 3|exact_match|↑ | 0.2720|± |0.0282| +| - bbh_cot_fewshot_ruin_names | 4|get-answer | 3|exact_match|↑ | 0.4800|± |0.0317| +| - bbh_cot_fewshot_salient_translation_error_detection | 4|get-answer | 3|exact_match|↑ | 0.2320|± |0.0268| +| - bbh_cot_fewshot_snarks | 4|get-answer | 3|exact_match|↑ | 0.4607|± |0.0375| +| - bbh_cot_fewshot_sports_understanding | 4|get-answer | 3|exact_match|↑ | 0.7320|± |0.0281| +| - bbh_cot_fewshot_temporal_sequences | 4|get-answer | 3|exact_match|↑ | 0.2160|± |0.0261| +| - bbh_cot_fewshot_tracking_shuffled_objects_five_objects | 4|get-answer | 3|exact_match|↑ | 0.1840|± |0.0246| +| - bbh_cot_fewshot_tracking_shuffled_objects_seven_objects| 4|get-answer | 3|exact_match|↑ | 0.0960|± |0.0187| +| - bbh_cot_fewshot_tracking_shuffled_objects_three_objects| 4|get-answer | 3|exact_match|↑ | 0.3080|± |0.0293| +| - bbh_cot_fewshot_web_of_lies | 4|get-answer | 3|exact_match|↑ | 0.6040|± |0.0310| +| - 
bbh_cot_fewshot_word_sorting | 4|get-answer | 3|exact_match|↑ | 0.0760|± |0.0168| +|boolq | 2|none | 0|acc |↑ | 0.6948|± |0.0081| +|drop | 3|none | 0|em |↑ | 0.0497|± |0.0022| +| | |none | 0|f1 |↑ | 0.1635|± |0.0029| +|gpqa_diamond_cot_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.1111|± |0.0224| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_diamond_cot_zeroshot | 1|flexible-extract | 0|exact_match|↑ | 0.1010|± |0.0215| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_diamond_generative_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.2727|± |0.0317| +| | |strict-match | 0|exact_match|↑ | 0.0051|± |0.0051| +|gpqa_diamond_n_shot | 2|none | 0|acc |↑ | 0.2576|± |0.0312| +| | |none | 0|acc_norm |↑ | 0.2576|± |0.0312| +|gpqa_diamond_zeroshot | 1|none | 0|acc |↑ | 0.2121|± |0.0291| +| | |none | 0|acc_norm |↑ | 0.2121|± |0.0291| +|gpqa_extended_cot_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.1007|± |0.0129| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_extended_cot_zeroshot | 1|flexible-extract | 0|exact_match|↑ | 0.0934|± |0.0125| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_extended_generative_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.2198|± |0.0177| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_extended_n_shot | 2|none | 0|acc |↑ | 0.2656|± |0.0189| +| | |none | 0|acc_norm |↑ | 0.2656|± |0.0189| +|gpqa_extended_zeroshot | 1|none | 0|acc |↑ | 0.2802|± |0.0192| +| | |none | 0|acc_norm |↑ | 0.2802|± |0.0192| +|gpqa_main_cot_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.0982|± |0.0141| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_main_cot_zeroshot | 1|flexible-extract | 0|exact_match|↑ | 0.0982|± |0.0141| +| | |strict-match | 0|exact_match|↑ | 0.0022|± |0.0022| +|gpqa_main_generative_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.2321|± |0.0200| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_main_n_shot | 2|none | 0|acc |↑ | 0.2790|± |0.0212| +| | |none | 0|acc_norm |↑ | 0.2790|± |0.0212| +|gpqa_main_zeroshot | 1|none | 0|acc |↑ | 0.2746|± |0.0211| +| | |none | 0|acc_norm |↑ | 0.2746|± |0.0211| +|gsm8k | 3|flexible-extract | 5|exact_match|↑ | 0.3374|± |0.0130| +| | |strict-match | 5|exact_match|↑ | 0.3374|± |0.0130| +|hellaswag | 1|none | 0|acc |↑ | 0.4512|± |0.0050| +| | |none | 0|acc_norm |↑ | 0.6088|± |0.0049| +|mmlu | 2|none | |acc |↑ | 0.4589|± |0.0041| +| - humanities | 2|none | |acc |↑ | 0.4389|± |0.0071| +| - formal_logic | 1|none | 0|acc |↑ | 0.3095|± |0.0413| +| - high_school_european_history | 1|none | 0|acc |↑ | 0.6242|± |0.0378| +| - high_school_us_history | 1|none | 0|acc |↑ | 0.5784|± |0.0347| +| - high_school_world_history | 1|none | 0|acc |↑ | 0.6540|± |0.0310| +| - international_law | 1|none | 0|acc |↑ | 0.5950|± |0.0448| +| - jurisprudence | 1|none | 0|acc |↑ | 0.5185|± |0.0483| +| - logical_fallacies | 1|none | 0|acc |↑ | 0.4540|± |0.0391| +| - moral_disputes | 1|none | 0|acc |↑ | 0.4595|± |0.0268| +| - moral_scenarios | 1|none | 0|acc |↑ | 0.3318|± |0.0157| +| - philosophy | 1|none | 0|acc |↑ | 0.5177|± |0.0284| +| - prehistory | 1|none | 0|acc |↑ | 0.5278|± |0.0278| +| - professional_law | 1|none | 0|acc |↑ | 0.3651|± |0.0123| +| - world_religions | 1|none | 0|acc |↑ | 0.5848|± |0.0378| +| - other | 2|none | |acc |↑ | 0.5182|± |0.0088| +| - business_ethics | 1|none | 0|acc |↑ | 0.4500|± |0.0500| +| - clinical_knowledge | 1|none | 0|acc |↑ | 0.4679|± |0.0307| +| - college_medicine | 1|none | 0|acc |↑ | 0.3815|± |0.0370| +| - 
global_facts | 1|none | 0|acc |↑ | 0.3200|± |0.0469| +| - human_aging | 1|none | 0|acc |↑ | 0.5381|± |0.0335| +| - management | 1|none | 0|acc |↑ | 0.5340|± |0.0494| +| - marketing | 1|none | 0|acc |↑ | 0.6795|± |0.0306| +| - medical_genetics | 1|none | 0|acc |↑ | 0.4700|± |0.0502| +| - miscellaneous | 1|none | 0|acc |↑ | 0.6003|± |0.0175| +| - nutrition | 1|none | 0|acc |↑ | 0.5588|± |0.0284| +| - professional_accounting | 1|none | 0|acc |↑ | 0.3546|± |0.0285| +| - professional_medicine | 1|none | 0|acc |↑ | 0.5588|± |0.0302| +| - virology | 1|none | 0|acc |↑ | 0.4157|± |0.0384| +| - social sciences | 2|none | |acc |↑ | 0.5080|± |0.0088| +| - econometrics | 1|none | 0|acc |↑ | 0.2281|± |0.0395| +| - high_school_geography | 1|none | 0|acc |↑ | 0.5556|± |0.0354| +| - high_school_government_and_politics | 1|none | 0|acc |↑ | 0.5181|± |0.0361| +| - high_school_macroeconomics | 1|none | 0|acc |↑ | 0.4103|± |0.0249| +| - high_school_microeconomics | 1|none | 0|acc |↑ | 0.4538|± |0.0323| +| - high_school_psychology | 1|none | 0|acc |↑ | 0.6294|± |0.0207| +| - human_sexuality | 1|none | 0|acc |↑ | 0.5344|± |0.0437| +| - professional_psychology | 1|none | 0|acc |↑ | 0.4265|± |0.0200| +| - public_relations | 1|none | 0|acc |↑ | 0.4727|± |0.0478| +| - security_studies | 1|none | 0|acc |↑ | 0.5388|± |0.0319| +| - sociology | 1|none | 0|acc |↑ | 0.6468|± |0.0338| +| - us_foreign_policy | 1|none | 0|acc |↑ | 0.7100|± |0.0456| +| - stem | 2|none | |acc |↑ | 0.3825|± |0.0085| +| - abstract_algebra | 1|none | 0|acc |↑ | 0.2400|± |0.0429| +| - anatomy | 1|none | 0|acc |↑ | 0.4815|± |0.0432| +| - astronomy | 1|none | 0|acc |↑ | 0.5395|± |0.0406| +| - college_biology | 1|none | 0|acc |↑ | 0.4931|± |0.0418| +| - college_chemistry | 1|none | 0|acc |↑ | 0.3700|± |0.0485| +| - college_computer_science | 1|none | 0|acc |↑ | 0.3600|± |0.0482| +| - college_mathematics | 1|none | 0|acc |↑ | 0.2800|± |0.0451| +| - college_physics | 1|none | 0|acc |↑ | 0.2549|± |0.0434| +| - computer_security | 1|none | 0|acc |↑ | 0.4800|± |0.0502| +| - conceptual_physics | 1|none | 0|acc |↑ | 0.4340|± |0.0324| +| - electrical_engineering | 1|none | 0|acc |↑ | 0.5448|± |0.0415| +| - elementary_mathematics | 1|none | 0|acc |↑ | 0.2910|± |0.0234| +| - high_school_biology | 1|none | 0|acc |↑ | 0.4968|± |0.0284| +| - high_school_chemistry | 1|none | 0|acc |↑ | 0.3547|± |0.0337| +| - high_school_computer_science | 1|none | 0|acc |↑ | 0.4600|± |0.0501| +| - high_school_mathematics | 1|none | 0|acc |↑ | 0.2630|± |0.0268| +| - high_school_physics | 1|none | 0|acc |↑ | 0.2980|± |0.0373| +| - high_school_statistics | 1|none | 0|acc |↑ | 0.3472|± |0.0325| +| - machine_learning | 1|none | 0|acc |↑ | 0.3125|± |0.0440| +|nq_open | 4|remove_whitespace| 0|exact_match|↑ | 0.0565|± |0.0038| +|openbookqa | 1|none | 0|acc |↑ | 0.2440|± |0.0192| +| | |none | 0|acc_norm |↑ | 0.3460|± |0.0213| +|piqa | 1|none | 0|acc |↑ | 0.7437|± |0.0102| +| | |none | 0|acc_norm |↑ | 0.7421|± |0.0102| +|qnli | 1|none | 0|acc |↑ | 0.4946|± |0.0068| +|sciq | 1|none | 0|acc |↑ | 0.9390|± |0.0076| +| | |none | 0|acc_norm |↑ | 0.8970|± |0.0096| +|triviaqa | 3|remove_whitespace| 0|exact_match|↑ | 0.2499|± |0.0032| +|truthfulqa_gen | 3|none | 0|bleu_acc |↑ | 0.3647|± |0.0169| +| | |none | 0|bleu_diff |↑ |-2.3149|± |0.6149| +| | |none | 0|bleu_max |↑ |17.3007|± |0.6267| +| | |none | 0|rouge1_acc |↑ | 0.3550|± |0.0168| +| | |none | 0|rouge1_diff|↑ |-4.7074|± |0.9203| +| | |none | 0|rouge1_max |↑ |38.8035|± |0.8508| +| | |none | 0|rouge2_acc |↑ | 0.2411|± |0.0150| +| | |none | 
0|rouge2_diff|↑ |-4.5701|± |0.9279| +| | |none | 0|rouge2_max |↑ |22.7998|± |0.8999| +| | |none | 0|rougeL_acc |↑ | 0.3672|± |0.0169| +| | |none | 0|rougeL_diff|↑ |-4.7792|± |0.9300| +| | |none | 0|rougeL_max |↑ |36.3004|± |0.8466| +|truthfulqa_mc1 | 2|none | 0|acc |↑ | 0.2717|± |0.0156| +|truthfulqa_mc2 | 3|none | 0|acc |↑ | 0.4383|± |0.0144| +|winogrande | 1|none | 0|acc |↑ | 0.6014|± |0.0138| + +| Groups |Version| Filter |n-shot| Metric | |Value | |Stderr| +|------------------|------:|----------|------|-----------|---|-----:|---|-----:| +|bbh | 3|get-answer| |exact_match|↑ |0.3781|± |0.0055| +|mmlu | 2|none | |acc |↑ |0.4589|± |0.0041| +| - humanities | 2|none | |acc |↑ |0.4389|± |0.0071| +| - other | 2|none | |acc |↑ |0.5182|± |0.0088| +| - social sciences| 2|none | |acc |↑ |0.5080|± |0.0088| +| - stem | 2|none | |acc |↑ |0.3825|± |0.0085| + +meta-llama_Llama-3.2-1B-Instruct: 3h 31m 2s +✅ Benchmark completed for meta-llama_Llama-3.2-1B-Instruct + +🔥 Starting benchmark for meta-llama_Llama-3.2-3B-Instruct +Passed argument batch_size = auto:1. Detecting largest batch size +Determined largest batch size: 2 +Passed argument batch_size = auto. Detecting largest batch size +Determined Largest batch size: 1 +hf (pretrained=/home/jaymin/Documents/llm/llm_models/meta-llama_Llama-3.2-3B-Instruct), gen_kwargs: (None), limit: None, num_fewshot: None, batch_size: auto (2) +| Tasks |Version| Filter |n-shot| Metric | | Value | |Stderr| +|----------------------------------------------------------|------:|-----------------|-----:|-----------|---|------:|---|-----:| +|anli_r1 | 1|none | 0|acc |↑ | 0.4470|± |0.0157| +|anli_r2 | 1|none | 0|acc |↑ | 0.4180|± |0.0156| +|anli_r3 | 1|none | 0|acc |↑ | 0.4308|± |0.0143| +|arc_challenge | 1|none | 0|acc |↑ | 0.4326|± |0.0145| +| | |none | 0|acc_norm |↑ | 0.4590|± |0.0146| +|bbh | 3|get-answer | |exact_match|↑ | 0.5564|± |0.0056| +| - bbh_cot_fewshot_boolean_expressions | 4|get-answer | 3|exact_match|↑ | 0.8840|± |0.0203| +| - bbh_cot_fewshot_causal_judgement | 4|get-answer | 3|exact_match|↑ | 0.5134|± |0.0366| +| - bbh_cot_fewshot_date_understanding | 4|get-answer | 3|exact_match|↑ | 0.7080|± |0.0288| +| - bbh_cot_fewshot_disambiguation_qa | 4|get-answer | 3|exact_match|↑ | 0.5560|± |0.0315| +| - bbh_cot_fewshot_dyck_languages | 4|get-answer | 3|exact_match|↑ | 0.0600|± |0.0151| +| - bbh_cot_fewshot_formal_fallacies | 4|get-answer | 3|exact_match|↑ | 0.3800|± |0.0308| +| - bbh_cot_fewshot_geometric_shapes | 4|get-answer | 3|exact_match|↑ | 0.3840|± |0.0308| +| - bbh_cot_fewshot_hyperbaton | 4|get-answer | 3|exact_match|↑ | 0.7120|± |0.0287| +| - bbh_cot_fewshot_logical_deduction_five_objects | 4|get-answer | 3|exact_match|↑ | 0.4640|± |0.0316| +| - bbh_cot_fewshot_logical_deduction_seven_objects | 4|get-answer | 3|exact_match|↑ | 0.3880|± |0.0309| +| - bbh_cot_fewshot_logical_deduction_three_objects | 4|get-answer | 3|exact_match|↑ | 0.7880|± |0.0259| +| - bbh_cot_fewshot_movie_recommendation | 4|get-answer | 3|exact_match|↑ | 0.6520|± |0.0302| +| - bbh_cot_fewshot_multistep_arithmetic_two | 4|get-answer | 3|exact_match|↑ | 0.5640|± |0.0314| +| - bbh_cot_fewshot_navigate | 4|get-answer | 3|exact_match|↑ | 0.7720|± |0.0266| +| - bbh_cot_fewshot_object_counting | 4|get-answer | 3|exact_match|↑ | 0.8280|± |0.0239| +| - bbh_cot_fewshot_penguins_in_a_table | 4|get-answer | 3|exact_match|↑ | 0.5822|± |0.0410| +| - bbh_cot_fewshot_reasoning_about_colored_objects | 4|get-answer | 3|exact_match|↑ | 0.6560|± |0.0301| +| - bbh_cot_fewshot_ruin_names | 4|get-answer | 
3|exact_match|↑ | 0.5640|± |0.0314| +| - bbh_cot_fewshot_salient_translation_error_detection | 4|get-answer | 3|exact_match|↑ | 0.4480|± |0.0315| +| - bbh_cot_fewshot_snarks | 4|get-answer | 3|exact_match|↑ | 0.5899|± |0.0370| +| - bbh_cot_fewshot_sports_understanding | 4|get-answer | 3|exact_match|↑ | 0.8440|± |0.0230| +| - bbh_cot_fewshot_temporal_sequences | 4|get-answer | 3|exact_match|↑ | 0.4960|± |0.0317| +| - bbh_cot_fewshot_tracking_shuffled_objects_five_objects | 4|get-answer | 3|exact_match|↑ | 0.3600|± |0.0304| +| - bbh_cot_fewshot_tracking_shuffled_objects_seven_objects| 4|get-answer | 3|exact_match|↑ | 0.2040|± |0.0255| +| - bbh_cot_fewshot_tracking_shuffled_objects_three_objects| 4|get-answer | 3|exact_match|↑ | 0.5440|± |0.0316| +| - bbh_cot_fewshot_web_of_lies | 4|get-answer | 3|exact_match|↑ | 0.8480|± |0.0228| +| - bbh_cot_fewshot_word_sorting | 4|get-answer | 3|exact_match|↑ | 0.2440|± |0.0272| +|boolq | 2|none | 0|acc |↑ | 0.7847|± |0.0072| +|drop | 3|none | 0|em |↑ | 0.0259|± |0.0016| +| | |none | 0|f1 |↑ | 0.1554|± |0.0025| +|gpqa_diamond_cot_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.0960|± |0.0210| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_diamond_cot_zeroshot | 1|flexible-extract | 0|exact_match|↑ | 0.1162|± |0.0228| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_diamond_generative_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.2172|± |0.0294| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_diamond_n_shot | 2|none | 0|acc |↑ | 0.3030|± |0.0327| +| | |none | 0|acc_norm |↑ | 0.3030|± |0.0327| +|gpqa_diamond_zeroshot | 1|none | 0|acc |↑ | 0.2828|± |0.0321| +| | |none | 0|acc_norm |↑ | 0.2828|± |0.0321| +|gpqa_extended_cot_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.1136|± |0.0136| +| | |strict-match | 0|exact_match|↑ | 0.0018|± |0.0018| +|gpqa_extended_cot_zeroshot | 1|flexible-extract | 0|exact_match|↑ | 0.1374|± |0.0147| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_extended_generative_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.2326|± |0.0181| +| | |strict-match | 0|exact_match|↑ | 0.0018|± |0.0018| +|gpqa_extended_n_shot | 2|none | 0|acc |↑ | 0.2802|± |0.0192| +| | |none | 0|acc_norm |↑ | 0.2802|± |0.0192| +|gpqa_extended_zeroshot | 1|none | 0|acc |↑ | 0.3407|± |0.0203| +| | |none | 0|acc_norm |↑ | 0.3407|± |0.0203| +|gpqa_main_cot_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.1317|± |0.0160| +| | |strict-match | 0|exact_match|↑ | 0.0045|± |0.0032| +|gpqa_main_cot_zeroshot | 1|flexible-extract | 0|exact_match|↑ | 0.1183|± |0.0153| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_main_generative_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.2344|± |0.0200| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_main_n_shot | 2|none | 0|acc |↑ | 0.3147|± |0.0220| +| | |none | 0|acc_norm |↑ | 0.3147|± |0.0220| +|gpqa_main_zeroshot | 1|none | 0|acc |↑ | 0.3281|± |0.0222| +| | |none | 0|acc_norm |↑ | 0.3281|± |0.0222| +|gsm8k | 3|flexible-extract | 5|exact_match|↑ | 0.6497|± |0.0131| +| | |strict-match | 5|exact_match|↑ | 0.6422|± |0.0132| +|hellaswag | 1|none | 0|acc |↑ | 0.5225|± |0.0050| +| | |none | 0|acc_norm |↑ | 0.7054|± |0.0045| +|mmlu | 2|none | |acc |↑ | 0.6052|± |0.0040| +| - humanities | 2|none | |acc |↑ | 0.5949|± |0.0070| +| - formal_logic | 1|none | 0|acc |↑ | 0.3968|± |0.0438| +| - high_school_european_history | 1|none | 0|acc |↑ | 0.7333|± |0.0345| +| - high_school_us_history | 1|none | 0|acc |↑ | 0.7500|± |0.0304| +| - 
high_school_world_history | 1|none | 0|acc |↑ | 0.7975|± |0.0262| +| - international_law | 1|none | 0|acc |↑ | 0.7355|± |0.0403| +| - jurisprudence | 1|none | 0|acc |↑ | 0.6204|± |0.0469| +| - logical_fallacies | 1|none | 0|acc |↑ | 0.7239|± |0.0351| +| - moral_disputes | 1|none | 0|acc |↑ | 0.6503|± |0.0257| +| - moral_scenarios | 1|none | 0|acc |↑ | 0.5955|± |0.0164| +| - philosophy | 1|none | 0|acc |↑ | 0.6559|± |0.0270| +| - prehistory | 1|none | 0|acc |↑ | 0.6512|± |0.0265| +| - professional_law | 1|none | 0|acc |↑ | 0.4622|± |0.0127| +| - world_religions | 1|none | 0|acc |↑ | 0.7602|± |0.0327| +| - other | 2|none | |acc |↑ | 0.6598|± |0.0082| +| - business_ethics | 1|none | 0|acc |↑ | 0.5600|± |0.0499| +| - clinical_knowledge | 1|none | 0|acc |↑ | 0.6226|± |0.0298| +| - college_medicine | 1|none | 0|acc |↑ | 0.5896|± |0.0375| +| - global_facts | 1|none | 0|acc |↑ | 0.3400|± |0.0476| +| - human_aging | 1|none | 0|acc |↑ | 0.5830|± |0.0331| +| - management | 1|none | 0|acc |↑ | 0.7670|± |0.0419| +| - marketing | 1|none | 0|acc |↑ | 0.8761|± |0.0216| +| - medical_genetics | 1|none | 0|acc |↑ | 0.7400|± |0.0441| +| - miscellaneous | 1|none | 0|acc |↑ | 0.7535|± |0.0154| +| - nutrition | 1|none | 0|acc |↑ | 0.6634|± |0.0271| +| - professional_accounting | 1|none | 0|acc |↑ | 0.4752|± |0.0298| +| - professional_medicine | 1|none | 0|acc |↑ | 0.7463|± |0.0264| +| - virology | 1|none | 0|acc |↑ | 0.4518|± |0.0387| +| - social sciences | 2|none | |acc |↑ | 0.6675|± |0.0083| +| - econometrics | 1|none | 0|acc |↑ | 0.3947|± |0.0460| +| - high_school_geography | 1|none | 0|acc |↑ | 0.7273|± |0.0317| +| - high_school_government_and_politics | 1|none | 0|acc |↑ | 0.7513|± |0.0312| +| - high_school_macroeconomics | 1|none | 0|acc |↑ | 0.5590|± |0.0252| +| - high_school_microeconomics | 1|none | 0|acc |↑ | 0.6218|± |0.0315| +| - high_school_psychology | 1|none | 0|acc |↑ | 0.7651|± |0.0182| +| - human_sexuality | 1|none | 0|acc |↑ | 0.6794|± |0.0409| +| - professional_psychology | 1|none | 0|acc |↑ | 0.6111|± |0.0197| +| - public_relations | 1|none | 0|acc |↑ | 0.6091|± |0.0467| +| - security_studies | 1|none | 0|acc |↑ | 0.6612|± |0.0303| +| - sociology | 1|none | 0|acc |↑ | 0.8109|± |0.0277| +| - us_foreign_policy | 1|none | 0|acc |↑ | 0.8200|± |0.0386| +| - stem | 2|none | |acc |↑ | 0.5059|± |0.0086| +| - abstract_algebra | 1|none | 0|acc |↑ | 0.3100|± |0.0465| +| - anatomy | 1|none | 0|acc |↑ | 0.6000|± |0.0423| +| - astronomy | 1|none | 0|acc |↑ | 0.6776|± |0.0380| +| - college_biology | 1|none | 0|acc |↑ | 0.7083|± |0.0380| +| - college_chemistry | 1|none | 0|acc |↑ | 0.3600|± |0.0482| +| - college_computer_science | 1|none | 0|acc |↑ | 0.4700|± |0.0502| +| - college_mathematics | 1|none | 0|acc |↑ | 0.3400|± |0.0476| +| - college_physics | 1|none | 0|acc |↑ | 0.3529|± |0.0476| +| - computer_security | 1|none | 0|acc |↑ | 0.7000|± |0.0461| +| - conceptual_physics | 1|none | 0|acc |↑ | 0.5106|± |0.0327| +| - electrical_engineering | 1|none | 0|acc |↑ | 0.5793|± |0.0411| +| - elementary_mathematics | 1|none | 0|acc |↑ | 0.4127|± |0.0254| +| - high_school_biology | 1|none | 0|acc |↑ | 0.7065|± |0.0259| +| - high_school_chemistry | 1|none | 0|acc |↑ | 0.5369|± |0.0351| +| - high_school_computer_science | 1|none | 0|acc |↑ | 0.6100|± |0.0490| +| - high_school_mathematics | 1|none | 0|acc |↑ | 0.3667|± |0.0294| +| - high_school_physics | 1|none | 0|acc |↑ | 0.4040|± |0.0401| +| - high_school_statistics | 1|none | 0|acc |↑ | 0.4167|± |0.0336| +| - machine_learning | 1|none | 0|acc |↑ | 0.5000|± 
|0.0475| +|nq_open | 4|remove_whitespace| 0|exact_match|↑ | 0.1391|± |0.0058| +|openbookqa | 1|none | 0|acc |↑ | 0.2740|± |0.0200| +| | |none | 0|acc_norm |↑ | 0.3580|± |0.0215| +|piqa | 1|none | 0|acc |↑ | 0.7552|± |0.0100| +| | |none | 0|acc_norm |↑ | 0.7552|± |0.0100| +|qnli | 1|none | 0|acc |↑ | 0.5451|± |0.0067| +|sciq | 1|none | 0|acc |↑ | 0.9520|± |0.0068| +| | |none | 0|acc_norm |↑ | 0.9320|± |0.0080| +|triviaqa | 3|remove_whitespace| 0|exact_match|↑ | 0.3389|± |0.0035| +|truthfulqa_gen | 3|none | 0|bleu_acc |↑ | 0.5581|± |0.0174| +| | |none | 0|bleu_diff |↑ |13.1234|± |1.1545| +| | |none | 0|bleu_max |↑ |35.5541|± |0.8709| +| | |none | 0|rouge1_acc |↑ | 0.5508|± |0.0174| +| | |none | 0|rouge1_diff|↑ |18.8892|± |1.6130| +| | |none | 0|rouge1_max |↑ |60.6706|± |0.9979| +| | |none | 0|rouge2_acc |↑ | 0.5067|± |0.0175| +| | |none | 0|rouge2_diff|↑ |19.4222|± |1.7185| +| | |none | 0|rouge2_max |↑ |47.9947|± |1.2171| +| | |none | 0|rougeL_acc |↑ | 0.5361|± |0.0175| +| | |none | 0|rougeL_diff|↑ |18.4665|± |1.6326| +| | |none | 0|rougeL_max |↑ |58.5696|± |1.0349| +|truthfulqa_mc1 | 2|none | 0|acc |↑ | 0.3268|± |0.0164| +|truthfulqa_mc2 | 3|none | 0|acc |↑ | 0.4976|± |0.0148| +|winogrande | 1|none | 0|acc |↑ | 0.6709|± |0.0132| + +| Groups |Version| Filter |n-shot| Metric | |Value | |Stderr| +|------------------|------:|----------|------|-----------|---|-----:|---|-----:| +|bbh | 3|get-answer| |exact_match|↑ |0.5564|± |0.0056| +|mmlu | 2|none | |acc |↑ |0.6052|± |0.0040| +| - humanities | 2|none | |acc |↑ |0.5949|± |0.0070| +| - other | 2|none | |acc |↑ |0.6598|± |0.0082| +| - social sciences| 2|none | |acc |↑ |0.6675|± |0.0083| +| - stem | 2|none | |acc |↑ |0.5059|± |0.0086| + +meta-llama_Llama-3.2-3B-Instruct: 7h 12m 29s +✅ Benchmark completed for meta-llama_Llama-3.2-3B-Instruct + +🔥 Starting benchmark for meta-llama_Llama-3.1-8B-Instruct +Passed argument batch_size = auto:1. Detecting largest batch size +Determined largest batch size: 1 +Passed argument batch_size = auto. 
Detecting largest batch size +Determined Largest batch size: 1 +hf (pretrained=/home/jaymin/Documents/llm/llm_models/meta-llama_Llama-3.1-8B-Instruct), gen_kwargs: (None), limit: None, num_fewshot: None, batch_size: auto (1) +| Tasks |Version| Filter |n-shot| Metric | | Value | |Stderr| +|----------------------------------------------------------|------:|-----------------|-----:|-----------|---|------:|---|-----:| +|anli_r1 | 1|none | 0|acc |↑ | 0.4820|± |0.0158| +|anli_r2 | 1|none | 0|acc |↑ | 0.4670|± |0.0158| +|anli_r3 | 1|none | 0|acc |↑ | 0.4433|± |0.0143| +|arc_challenge | 1|none | 0|acc |↑ | 0.5179|± |0.0146| +| | |none | 0|acc_norm |↑ | 0.5503|± |0.0145| +|bbh | 3|get-answer | |exact_match|↑ | 0.7156|± |0.0051| +| - bbh_cot_fewshot_boolean_expressions | 4|get-answer | 3|exact_match|↑ | 0.9160|± |0.0176| +| - bbh_cot_fewshot_causal_judgement | 4|get-answer | 3|exact_match|↑ | 0.5775|± |0.0362| +| - bbh_cot_fewshot_date_understanding | 4|get-answer | 3|exact_match|↑ | 0.8000|± |0.0253| +| - bbh_cot_fewshot_disambiguation_qa | 4|get-answer | 3|exact_match|↑ | 0.6160|± |0.0308| +| - bbh_cot_fewshot_dyck_languages | 4|get-answer | 3|exact_match|↑ | 0.1200|± |0.0206| +| - bbh_cot_fewshot_formal_fallacies | 4|get-answer | 3|exact_match|↑ | 0.4920|± |0.0317| +| - bbh_cot_fewshot_geometric_shapes | 4|get-answer | 3|exact_match|↑ | 0.5160|± |0.0317| +| - bbh_cot_fewshot_hyperbaton | 4|get-answer | 3|exact_match|↑ | 0.8480|± |0.0228| +| - bbh_cot_fewshot_logical_deduction_five_objects | 4|get-answer | 3|exact_match|↑ | 0.5400|± |0.0316| +| - bbh_cot_fewshot_logical_deduction_seven_objects | 4|get-answer | 3|exact_match|↑ | 0.4280|± |0.0314| +| - bbh_cot_fewshot_logical_deduction_three_objects | 4|get-answer | 3|exact_match|↑ | 0.8400|± |0.0232| +| - bbh_cot_fewshot_movie_recommendation | 4|get-answer | 3|exact_match|↑ | 0.7800|± |0.0263| +| - bbh_cot_fewshot_multistep_arithmetic_two | 4|get-answer | 3|exact_match|↑ | 0.6680|± |0.0298| +| - bbh_cot_fewshot_navigate | 4|get-answer | 3|exact_match|↑ | 0.9080|± |0.0183| +| - bbh_cot_fewshot_object_counting | 4|get-answer | 3|exact_match|↑ | 0.8760|± |0.0209| +| - bbh_cot_fewshot_penguins_in_a_table | 4|get-answer | 3|exact_match|↑ | 0.7945|± |0.0336| +| - bbh_cot_fewshot_reasoning_about_colored_objects | 4|get-answer | 3|exact_match|↑ | 0.7600|± |0.0271| +| - bbh_cot_fewshot_ruin_names | 4|get-answer | 3|exact_match|↑ | 0.7680|± |0.0268| +| - bbh_cot_fewshot_salient_translation_error_detection | 4|get-answer | 3|exact_match|↑ | 0.5720|± |0.0314| +| - bbh_cot_fewshot_snarks | 4|get-answer | 3|exact_match|↑ | 0.6517|± |0.0358| +| - bbh_cot_fewshot_sports_understanding | 4|get-answer | 3|exact_match|↑ | 0.9400|± |0.0151| +| - bbh_cot_fewshot_temporal_sequences | 4|get-answer | 3|exact_match|↑ | 0.8840|± |0.0203| +| - bbh_cot_fewshot_tracking_shuffled_objects_five_objects | 4|get-answer | 3|exact_match|↑ | 0.8280|± |0.0239| +| - bbh_cot_fewshot_tracking_shuffled_objects_seven_objects| 4|get-answer | 3|exact_match|↑ | 0.8000|± |0.0253| +| - bbh_cot_fewshot_tracking_shuffled_objects_three_objects| 4|get-answer | 3|exact_match|↑ | 0.8400|± |0.0232| +| - bbh_cot_fewshot_web_of_lies | 4|get-answer | 3|exact_match|↑ | 1.0000|± |0.0000| +| - bbh_cot_fewshot_word_sorting | 4|get-answer | 3|exact_match|↑ | 0.5360|± |0.0316| +|boolq | 2|none | 0|acc |↑ | 0.8416|± |0.0064| +|drop | 3|none | 0|em |↑ | 0.0448|± |0.0021| +| | |none | 0|f1 |↑ | 0.1937|± |0.0028| +|gpqa_diamond_cot_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.1010|± |0.0215| +| | |strict-match 
| 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_diamond_cot_zeroshot | 1|flexible-extract | 0|exact_match|↑ | 0.1313|± |0.0241| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_diamond_generative_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.3182|± |0.0332| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_diamond_n_shot | 2|none | 0|acc |↑ | 0.3535|± |0.0341| +| | |none | 0|acc_norm |↑ | 0.3535|± |0.0341| +|gpqa_diamond_zeroshot | 1|none | 0|acc |↑ | 0.3232|± |0.0333| +| | |none | 0|acc_norm |↑ | 0.3232|± |0.0333| +|gpqa_extended_cot_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.1374|± |0.0147| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_extended_cot_zeroshot | 1|flexible-extract | 0|exact_match|↑ | 0.1703|± |0.0161| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_extended_generative_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.2894|± |0.0194| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_extended_n_shot | 2|none | 0|acc |↑ | 0.3150|± |0.0199| +| | |none | 0|acc_norm |↑ | 0.3150|± |0.0199| +|gpqa_extended_zeroshot | 1|none | 0|acc |↑ | 0.3132|± |0.0199| +| | |none | 0|acc_norm |↑ | 0.3132|± |0.0199| +|gpqa_main_cot_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.1362|± |0.0162| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_main_cot_zeroshot | 1|flexible-extract | 0|exact_match|↑ | 0.1272|± |0.0158| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_main_generative_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.2723|± |0.0211| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_main_n_shot | 2|none | 0|acc |↑ | 0.3393|± |0.0224| +| | |none | 0|acc_norm |↑ | 0.3393|± |0.0224| +|gpqa_main_zeroshot | 1|none | 0|acc |↑ | 0.3438|± |0.0225| +| | |none | 0|acc_norm |↑ | 0.3438|± |0.0225| +|gsm8k | 3|flexible-extract | 5|exact_match|↑ | 0.7779|± |0.0114| +| | |strict-match | 5|exact_match|↑ | 0.7544|± |0.0119| +|hellaswag | 1|none | 0|acc |↑ | 0.5909|± |0.0049| +| | |none | 0|acc_norm |↑ | 0.7921|± |0.0040| +|mmlu | 2|none | |acc |↑ | 0.6793|± |0.0038| +| - humanities | 2|none | |acc |↑ | 0.6427|± |0.0067| +| - formal_logic | 1|none | 0|acc |↑ | 0.4762|± |0.0447| +| - high_school_european_history | 1|none | 0|acc |↑ | 0.7636|± |0.0332| +| - high_school_us_history | 1|none | 0|acc |↑ | 0.8431|± |0.0255| +| - high_school_world_history | 1|none | 0|acc |↑ | 0.8565|± |0.0228| +| - international_law | 1|none | 0|acc |↑ | 0.8182|± |0.0352| +| - jurisprudence | 1|none | 0|acc |↑ | 0.7778|± |0.0402| +| - logical_fallacies | 1|none | 0|acc |↑ | 0.7914|± |0.0319| +| - moral_disputes | 1|none | 0|acc |↑ | 0.7457|± |0.0234| +| - moral_scenarios | 1|none | 0|acc |↑ | 0.5721|± |0.0165| +| - philosophy | 1|none | 0|acc |↑ | 0.7203|± |0.0255| +| - prehistory | 1|none | 0|acc |↑ | 0.7438|± |0.0243| +| - professional_law | 1|none | 0|acc |↑ | 0.5039|± |0.0128| +| - world_religions | 1|none | 0|acc |↑ | 0.8363|± |0.0284| +| - other | 2|none | |acc |↑ | 0.7422|± |0.0075| +| - business_ethics | 1|none | 0|acc |↑ | 0.6800|± |0.0469| +| - clinical_knowledge | 1|none | 0|acc |↑ | 0.7849|± |0.0253| +| - college_medicine | 1|none | 0|acc |↑ | 0.6936|± |0.0351| +| - global_facts | 1|none | 0|acc |↑ | 0.3800|± |0.0488| +| - human_aging | 1|none | 0|acc |↑ | 0.7040|± |0.0306| +| - management | 1|none | 0|acc |↑ | 0.8155|± |0.0384| +| - marketing | 1|none | 0|acc |↑ | 0.8932|± |0.0202| +| - medical_genetics | 1|none | 0|acc |↑ | 0.7800|± |0.0416| +| - miscellaneous | 1|none | 0|acc |↑ | 
0.8404|± |0.0131| +| - nutrition | 1|none | 0|acc |↑ | 0.7549|± |0.0246| +| - professional_accounting | 1|none | 0|acc |↑ | 0.5532|± |0.0297| +| - professional_medicine | 1|none | 0|acc |↑ | 0.7831|± |0.0250| +| - virology | 1|none | 0|acc |↑ | 0.5181|± |0.0389| +| - social sciences | 2|none | |acc |↑ | 0.7689|± |0.0074| +| - econometrics | 1|none | 0|acc |↑ | 0.5000|± |0.0470| +| - high_school_geography | 1|none | 0|acc |↑ | 0.7929|± |0.0289| +| - high_school_government_and_politics | 1|none | 0|acc |↑ | 0.8756|± |0.0238| +| - high_school_macroeconomics | 1|none | 0|acc |↑ | 0.6795|± |0.0237| +| - high_school_microeconomics | 1|none | 0|acc |↑ | 0.7941|± |0.0263| +| - high_school_psychology | 1|none | 0|acc |↑ | 0.8587|± |0.0149| +| - human_sexuality | 1|none | 0|acc |↑ | 0.7863|± |0.0360| +| - professional_psychology | 1|none | 0|acc |↑ | 0.7173|± |0.0182| +| - public_relations | 1|none | 0|acc |↑ | 0.6818|± |0.0446| +| - security_studies | 1|none | 0|acc |↑ | 0.7510|± |0.0277| +| - sociology | 1|none | 0|acc |↑ | 0.8607|± |0.0245| +| - us_foreign_policy | 1|none | 0|acc |↑ | 0.8700|± |0.0338| +| - stem | 2|none | |acc |↑ | 0.5845|± |0.0084| +| - abstract_algebra | 1|none | 0|acc |↑ | 0.3400|± |0.0476| +| - anatomy | 1|none | 0|acc |↑ | 0.6889|± |0.0400| +| - astronomy | 1|none | 0|acc |↑ | 0.7566|± |0.0349| +| - college_biology | 1|none | 0|acc |↑ | 0.8125|± |0.0326| +| - college_chemistry | 1|none | 0|acc |↑ | 0.4600|± |0.0501| +| - college_computer_science | 1|none | 0|acc |↑ | 0.5800|± |0.0496| +| - college_mathematics | 1|none | 0|acc |↑ | 0.3400|± |0.0476| +| - college_physics | 1|none | 0|acc |↑ | 0.4314|± |0.0493| +| - computer_security | 1|none | 0|acc |↑ | 0.7500|± |0.0435| +| - conceptual_physics | 1|none | 0|acc |↑ | 0.6000|± |0.0320| +| - electrical_engineering | 1|none | 0|acc |↑ | 0.6552|± |0.0396| +| - elementary_mathematics | 1|none | 0|acc |↑ | 0.4868|± |0.0257| +| - high_school_biology | 1|none | 0|acc |↑ | 0.8065|± |0.0225| +| - high_school_chemistry | 1|none | 0|acc |↑ | 0.6404|± |0.0338| +| - high_school_computer_science | 1|none | 0|acc |↑ | 0.7500|± |0.0435| +| - high_school_mathematics | 1|none | 0|acc |↑ | 0.4148|± |0.0300| +| - high_school_physics | 1|none | 0|acc |↑ | 0.4636|± |0.0407| +| - high_school_statistics | 1|none | 0|acc |↑ | 0.5463|± |0.0340| +| - machine_learning | 1|none | 0|acc |↑ | 0.4643|± |0.0473| +|nq_open | 4|remove_whitespace| 0|exact_match|↑ | 0.1776|± |0.0064| +|openbookqa | 1|none | 0|acc |↑ | 0.3360|± |0.0211| +| | |none | 0|acc_norm |↑ | 0.4320|± |0.0222| +|piqa | 1|none | 0|acc |↑ | 0.8009|± |0.0093| +| | |none | 0|acc_norm |↑ | 0.8063|± |0.0092| +|qnli | 1|none | 0|acc |↑ | 0.5014|± |0.0068| +|sciq | 1|none | 0|acc |↑ | 0.9670|± |0.0057| +| | |none | 0|acc_norm |↑ | 0.9620|± |0.0060| +|triviaqa | 3|remove_whitespace| 0|exact_match|↑ | 0.5182|± |0.0037| +|truthfulqa_gen | 3|none | 0|bleu_acc |↑ | 0.6255|± |0.0169| +| | |none | 0|bleu_diff |↑ |15.3392|± |1.0952| +| | |none | 0|bleu_max |↑ |36.1393|± |0.8796| +| | |none | 0|rouge1_acc |↑ | 0.6083|± |0.0171| +| | |none | 0|rouge1_diff|↑ |21.4366|± |1.5980| +| | |none | 0|rouge1_max |↑ |60.7499|± |0.9981| +| | |none | 0|rouge2_acc |↑ | 0.5606|± |0.0174| +| | |none | 0|rouge2_diff|↑ |23.0331|± |1.6393| +| | |none | 0|rouge2_max |↑ |48.3161|± |1.2088| +| | |none | 0|rougeL_acc |↑ | 0.6083|± |0.0171| +| | |none | 0|rougeL_diff|↑ |21.4950|± |1.6133| +| | |none | 0|rougeL_max |↑ |58.8871|± |1.0298| +|truthfulqa_mc1 | 2|none | 0|acc |↑ | 0.3660|± |0.0169| +|truthfulqa_mc2 | 3|none | 0|acc |↑ | 
0.5412|± |0.0150| +|winogrande | 1|none | 0|acc |↑ | 0.7388|± |0.0123| + +| Groups |Version| Filter |n-shot| Metric | |Value | |Stderr| +|------------------|------:|----------|------|-----------|---|-----:|---|-----:| +|bbh | 3|get-answer| |exact_match|↑ |0.7156|± |0.0051| +|mmlu | 2|none | |acc |↑ |0.6793|± |0.0038| +| - humanities | 2|none | |acc |↑ |0.6427|± |0.0067| +| - other | 2|none | |acc |↑ |0.7422|± |0.0075| +| - social sciences| 2|none | |acc |↑ |0.7689|± |0.0074| +| - stem | 2|none | |acc |↑ |0.5845|± |0.0084| + +meta-llama_Llama-3.1-8B-Instruct: 12h 19m 31s +✅ Benchmark completed for meta-llama_Llama-3.1-8B-Instruct + +🔥 Starting benchmark for meta-llama_Llama-2-7b-chat-hf +Passed argument batch_size = auto:1. Detecting largest batch size +Determined largest batch size: 4 +Passed argument batch_size = auto. Detecting largest batch size +Determined Largest batch size: 4 +hf (pretrained=/home/jaymin/Documents/llm/llm_models/meta-llama_Llama-2-7b-chat-hf), gen_kwargs: (None), limit: None, num_fewshot: None, batch_size: auto (4) +| Tasks |Version| Filter |n-shot| Metric | | Value | |Stderr| +|----------------------------------------------------------|------:|-----------------|-----:|-----------|---|------:|---|-----:| +|anli_r1 | 1|none | 0|acc |↑ | 0.4170|± |0.0156| +|anli_r2 | 1|none | 0|acc |↑ | 0.4100|± |0.0156| +|anli_r3 | 1|none | 0|acc |↑ | 0.4075|± |0.0142| +|arc_challenge | 1|none | 0|acc |↑ | 0.4411|± |0.0145| +| | |none | 0|acc_norm |↑ | 0.4428|± |0.0145| +|bbh | 3|get-answer | |exact_match|↑ | 0.4013|± |0.0055| +| - bbh_cot_fewshot_boolean_expressions | 4|get-answer | 3|exact_match|↑ | 0.6520|± |0.0302| +| - bbh_cot_fewshot_causal_judgement | 4|get-answer | 3|exact_match|↑ | 0.5455|± |0.0365| +| - bbh_cot_fewshot_date_understanding | 4|get-answer | 3|exact_match|↑ | 0.5680|± |0.0314| +| - bbh_cot_fewshot_disambiguation_qa | 4|get-answer | 3|exact_match|↑ | 0.3920|± |0.0309| +| - bbh_cot_fewshot_dyck_languages | 4|get-answer | 3|exact_match|↑ | 0.0360|± |0.0118| +| - bbh_cot_fewshot_formal_fallacies | 4|get-answer | 3|exact_match|↑ | 0.5280|± |0.0316| +| - bbh_cot_fewshot_geometric_shapes | 4|get-answer | 3|exact_match|↑ | 0.3280|± |0.0298| +| - bbh_cot_fewshot_hyperbaton | 4|get-answer | 3|exact_match|↑ | 0.5440|± |0.0316| +| - bbh_cot_fewshot_logical_deduction_five_objects | 4|get-answer | 3|exact_match|↑ | 0.3560|± |0.0303| +| - bbh_cot_fewshot_logical_deduction_seven_objects | 4|get-answer | 3|exact_match|↑ | 0.1960|± |0.0252| +| - bbh_cot_fewshot_logical_deduction_three_objects | 4|get-answer | 3|exact_match|↑ | 0.6080|± |0.0309| +| - bbh_cot_fewshot_movie_recommendation | 4|get-answer | 3|exact_match|↑ | 0.6800|± |0.0296| +| - bbh_cot_fewshot_multistep_arithmetic_two | 4|get-answer | 3|exact_match|↑ | 0.0200|± |0.0089| +| - bbh_cot_fewshot_navigate | 4|get-answer | 3|exact_match|↑ | 0.6040|± |0.0310| +| - bbh_cot_fewshot_object_counting | 4|get-answer | 3|exact_match|↑ | 0.4960|± |0.0317| +| - bbh_cot_fewshot_penguins_in_a_table | 4|get-answer | 3|exact_match|↑ | 0.3288|± |0.0390| +| - bbh_cot_fewshot_reasoning_about_colored_objects | 4|get-answer | 3|exact_match|↑ | 0.3840|± |0.0308| +| - bbh_cot_fewshot_ruin_names | 4|get-answer | 3|exact_match|↑ | 0.4720|± |0.0316| +| - bbh_cot_fewshot_salient_translation_error_detection | 4|get-answer | 3|exact_match|↑ | 0.4320|± |0.0314| +| - bbh_cot_fewshot_snarks | 4|get-answer | 3|exact_match|↑ | 0.5056|± |0.0376| +| - bbh_cot_fewshot_sports_understanding | 4|get-answer | 3|exact_match|↑ | 0.9040|± |0.0187| +| - 
bbh_cot_fewshot_temporal_sequences | 4|get-answer | 3|exact_match|↑ | 0.1160|± |0.0203| +| - bbh_cot_fewshot_tracking_shuffled_objects_five_objects | 4|get-answer | 3|exact_match|↑ | 0.1680|± |0.0237| +| - bbh_cot_fewshot_tracking_shuffled_objects_seven_objects| 4|get-answer | 3|exact_match|↑ | 0.1560|± |0.0230| +| - bbh_cot_fewshot_tracking_shuffled_objects_three_objects| 4|get-answer | 3|exact_match|↑ | 0.3040|± |0.0292| +| - bbh_cot_fewshot_web_of_lies | 4|get-answer | 3|exact_match|↑ | 0.4840|± |0.0317| +| - bbh_cot_fewshot_word_sorting | 4|get-answer | 3|exact_match|↑ | 0.0640|± |0.0155| +|boolq | 2|none | 0|acc |↑ | 0.7979|± |0.0070| +|drop | 3|none | 0|em |↑ | 0.0358|± |0.0019| +| | |none | 0|f1 |↑ | 0.1175|± |0.0025| +|gpqa_diamond_cot_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.1717|± |0.0269| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_diamond_cot_zeroshot | 1|flexible-extract | 0|exact_match|↑ | 0.1717|± |0.0269| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_diamond_generative_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.2980|± |0.0326| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_diamond_n_shot | 2|none | 0|acc |↑ | 0.2929|± |0.0324| +| | |none | 0|acc_norm |↑ | 0.2929|± |0.0324| +|gpqa_diamond_zeroshot | 1|none | 0|acc |↑ | 0.2525|± |0.0310| +| | |none | 0|acc_norm |↑ | 0.2525|± |0.0310| +|gpqa_extended_cot_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.2216|± |0.0178| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_extended_cot_zeroshot | 1|flexible-extract | 0|exact_match|↑ | 0.2106|± |0.0175| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_extended_generative_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.2546|± |0.0187| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_extended_n_shot | 2|none | 0|acc |↑ | 0.2637|± |0.0189| +| | |none | 0|acc_norm |↑ | 0.2637|± |0.0189| +|gpqa_extended_zeroshot | 1|none | 0|acc |↑ | 0.2692|± |0.0190| +| | |none | 0|acc_norm |↑ | 0.2692|± |0.0190| +|gpqa_main_cot_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.1629|± |0.0175| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_main_cot_zeroshot | 1|flexible-extract | 0|exact_match|↑ | 0.1786|± |0.0181| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_main_generative_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.2522|± |0.0205| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_main_n_shot | 2|none | 0|acc |↑ | 0.2946|± |0.0216| +| | |none | 0|acc_norm |↑ | 0.2946|± |0.0216| +|gpqa_main_zeroshot | 1|none | 0|acc |↑ | 0.2612|± |0.0208| +| | |none | 0|acc_norm |↑ | 0.2612|± |0.0208| +|gsm8k | 3|flexible-extract | 5|exact_match|↑ | 0.2320|± |0.0116| +| | |strict-match | 5|exact_match|↑ | 0.2320|± |0.0116| +|hellaswag | 1|none | 0|acc |↑ | 0.5779|± |0.0049| +| | |none | 0|acc_norm |↑ | 0.7548|± |0.0043| +|mmlu | 2|none | |acc |↑ | 0.4636|± |0.0040| +| - humanities | 2|none | |acc |↑ | 0.4332|± |0.0069| +| - formal_logic | 1|none | 0|acc |↑ | 0.2381|± |0.0381| +| - high_school_european_history | 1|none | 0|acc |↑ | 0.5818|± |0.0385| +| - high_school_us_history | 1|none | 0|acc |↑ | 0.6618|± |0.0332| +| - high_school_world_history | 1|none | 0|acc |↑ | 0.6203|± |0.0316| +| - international_law | 1|none | 0|acc |↑ | 0.5950|± |0.0448| +| - jurisprudence | 1|none | 0|acc |↑ | 0.5741|± |0.0478| +| - logical_fallacies | 1|none | 0|acc |↑ | 0.5767|± |0.0388| +| - moral_disputes | 1|none | 0|acc |↑ | 0.5058|± |0.0269| +| - moral_scenarios | 
1|none | 0|acc |↑ | 0.2425|± |0.0143| +| - philosophy | 1|none | 0|acc |↑ | 0.5273|± |0.0284| +| - prehistory | 1|none | 0|acc |↑ | 0.5463|± |0.0277| +| - professional_law | 1|none | 0|acc |↑ | 0.3592|± |0.0123| +| - world_religions | 1|none | 0|acc |↑ | 0.6901|± |0.0355| +| - other | 2|none | |acc |↑ | 0.5488|± |0.0086| +| - business_ethics | 1|none | 0|acc |↑ | 0.4500|± |0.0500| +| - clinical_knowledge | 1|none | 0|acc |↑ | 0.5509|± |0.0306| +| - college_medicine | 1|none | 0|acc |↑ | 0.3815|± |0.0370| +| - global_facts | 1|none | 0|acc |↑ | 0.4000|± |0.0492| +| - human_aging | 1|none | 0|acc |↑ | 0.5830|± |0.0331| +| - management | 1|none | 0|acc |↑ | 0.6796|± |0.0462| +| - marketing | 1|none | 0|acc |↑ | 0.7564|± |0.0281| +| - medical_genetics | 1|none | 0|acc |↑ | 0.4800|± |0.0502| +| - miscellaneous | 1|none | 0|acc |↑ | 0.6897|± |0.0165| +| - nutrition | 1|none | 0|acc |↑ | 0.4902|± |0.0286| +| - professional_accounting | 1|none | 0|acc |↑ | 0.3652|± |0.0287| +| - professional_medicine | 1|none | 0|acc |↑ | 0.4154|± |0.0299| +| - virology | 1|none | 0|acc |↑ | 0.4639|± |0.0388| +| - social sciences | 2|none | |acc |↑ | 0.5304|± |0.0087| +| - econometrics | 1|none | 0|acc |↑ | 0.2982|± |0.0430| +| - high_school_geography | 1|none | 0|acc |↑ | 0.5909|± |0.0350| +| - high_school_government_and_politics | 1|none | 0|acc |↑ | 0.6839|± |0.0336| +| - high_school_macroeconomics | 1|none | 0|acc |↑ | 0.4000|± |0.0248| +| - high_school_microeconomics | 1|none | 0|acc |↑ | 0.3613|± |0.0312| +| - high_school_psychology | 1|none | 0|acc |↑ | 0.6349|± |0.0206| +| - human_sexuality | 1|none | 0|acc |↑ | 0.5649|± |0.0435| +| - professional_psychology | 1|none | 0|acc |↑ | 0.4673|± |0.0202| +| - public_relations | 1|none | 0|acc |↑ | 0.5364|± |0.0478| +| - security_studies | 1|none | 0|acc |↑ | 0.4980|± |0.0320| +| - sociology | 1|none | 0|acc |↑ | 0.7413|± |0.0310| +| - us_foreign_policy | 1|none | 0|acc |↑ | 0.7100|± |0.0456| +| - stem | 2|none | |acc |↑ | 0.3600|± |0.0084| +| - abstract_algebra | 1|none | 0|acc |↑ | 0.3000|± |0.0461| +| - anatomy | 1|none | 0|acc |↑ | 0.4444|± |0.0429| +| - astronomy | 1|none | 0|acc |↑ | 0.4934|± |0.0407| +| - college_biology | 1|none | 0|acc |↑ | 0.4514|± |0.0416| +| - college_chemistry | 1|none | 0|acc |↑ | 0.2500|± |0.0435| +| - college_computer_science | 1|none | 0|acc |↑ | 0.3200|± |0.0469| +| - college_mathematics | 1|none | 0|acc |↑ | 0.3100|± |0.0465| +| - college_physics | 1|none | 0|acc |↑ | 0.1961|± |0.0395| +| - computer_security | 1|none | 0|acc |↑ | 0.6000|± |0.0492| +| - conceptual_physics | 1|none | 0|acc |↑ | 0.4000|± |0.0320| +| - electrical_engineering | 1|none | 0|acc |↑ | 0.4483|± |0.0414| +| - elementary_mathematics | 1|none | 0|acc |↑ | 0.2751|± |0.0230| +| - high_school_biology | 1|none | 0|acc |↑ | 0.4935|± |0.0284| +| - high_school_chemistry | 1|none | 0|acc |↑ | 0.3350|± |0.0332| +| - high_school_computer_science | 1|none | 0|acc |↑ | 0.4000|± |0.0492| +| - high_school_mathematics | 1|none | 0|acc |↑ | 0.2704|± |0.0271| +| - high_school_physics | 1|none | 0|acc |↑ | 0.2781|± |0.0366| +| - high_school_statistics | 1|none | 0|acc |↑ | 0.2685|± |0.0302| +| - machine_learning | 1|none | 0|acc |↑ | 0.3571|± |0.0455| +|nq_open | 4|remove_whitespace| 0|exact_match|↑ | 0.0668|± |0.0042| +|openbookqa | 1|none | 0|acc |↑ | 0.3340|± |0.0211| +| | |none | 0|acc_norm |↑ | 0.4380|± |0.0222| +|piqa | 1|none | 0|acc |↑ | 0.7644|± |0.0099| +| | |none | 0|acc_norm |↑ | 0.7715|± |0.0098| +|qnli | 1|none | 0|acc |↑ | 0.5801|± |0.0067| +|sciq | 1|none | 
0|acc |↑ | 0.9400|± |0.0075| +| | |none | 0|acc_norm |↑ | 0.8780|± |0.0104| +|triviaqa | 3|remove_whitespace| 0|exact_match|↑ | 0.1904|± |0.0029| +|truthfulqa_gen | 3|none | 0|bleu_acc |↑ | 0.4517|± |0.0174| +| | |none | 0|bleu_diff |↑ |-1.6714|± |0.6136| +| | |none | 0|bleu_max |↑ |20.5268|± |0.7001| +| | |none | 0|rouge1_acc |↑ | 0.4468|± |0.0174| +| | |none | 0|rouge1_diff|↑ |-1.6626|± |0.7580| +| | |none | 0|rouge1_max |↑ |45.4458|± |0.8003| +| | |none | 0|rouge2_acc |↑ | 0.3807|± |0.0170| +| | |none | 0|rouge2_diff|↑ |-3.1513|± |0.8822| +| | |none | 0|rouge2_max |↑ |30.2564|± |0.8906| +| | |none | 0|rougeL_acc |↑ | 0.4480|± |0.0174| +| | |none | 0|rougeL_diff|↑ |-1.9429|± |0.7563| +| | |none | 0|rougeL_max |↑ |42.1653|± |0.8032| +|truthfulqa_mc1 | 2|none | 0|acc |↑ | 0.3023|± |0.0161| +|truthfulqa_mc2 | 3|none | 0|acc |↑ | 0.4532|± |0.0156| +|winogrande | 1|none | 0|acc |↑ | 0.6646|± |0.0133| + +| Groups |Version| Filter |n-shot| Metric | |Value | |Stderr| +|------------------|------:|----------|------|-----------|---|-----:|---|-----:| +|bbh | 3|get-answer| |exact_match|↑ |0.4013|± |0.0055| +|mmlu | 2|none | |acc |↑ |0.4636|± |0.0040| +| - humanities | 2|none | |acc |↑ |0.4332|± |0.0069| +| - other | 2|none | |acc |↑ |0.5488|± |0.0086| +| - social sciences| 2|none | |acc |↑ |0.5304|± |0.0087| +| - stem | 2|none | |acc |↑ |0.3600|± |0.0084| + +meta-llama_Llama-2-7b-chat-hf: 6h 58m 9s +✅ Benchmark completed for meta-llama_Llama-2-7b-chat-hf + +🔥 Starting benchmark for mistralai_Mistral-Nemo-Instruct-2407 +Passed argument batch_size = auto:1. Detecting largest batch size +Determined largest batch size: 1 +mistralai_Mistral-Nemo-Instruct-2407: 0h 7m 53s +✅ Benchmark completed for mistralai_Mistral-Nemo-Instruct-2407 + +🔥 Starting benchmark for mistralai_Ministral-8B-Instruct-2410 +Passed argument batch_size = auto:1. Detecting largest batch size +Determined largest batch size: 1 +Passed argument batch_size = auto. 
Detecting largest batch size +Determined Largest batch size: 1 +hf (pretrained=/home/jaymin/Documents/llm/llm_models/mistralai_Ministral-8B-Instruct-2410), gen_kwargs: (None), limit: None, num_fewshot: None, batch_size: auto (1) +| Tasks |Version| Filter |n-shot| Metric | | Value | |Stderr| +|----------------------------------------------------------|------:|-----------------|-----:|-----------|---|------:|---|-----:| +|anli_r1 | 1|none | 0|acc |↑ | 0.4880|± |0.0158| +|anli_r2 | 1|none | 0|acc |↑ | 0.4870|± |0.0158| +|anli_r3 | 1|none | 0|acc |↑ | 0.4658|± |0.0144| +|arc_challenge | 1|none | 0|acc |↑ | 0.5452|± |0.0146| +| | |none | 0|acc_norm |↑ | 0.5623|± |0.0145| +|bbh | 3|get-answer | |exact_match|↑ | 0.6925|± |0.0051| +| - bbh_cot_fewshot_boolean_expressions | 4|get-answer | 3|exact_match|↑ | 0.8920|± |0.0197| +| - bbh_cot_fewshot_causal_judgement | 4|get-answer | 3|exact_match|↑ | 0.6096|± |0.0358| +| - bbh_cot_fewshot_date_understanding | 4|get-answer | 3|exact_match|↑ | 0.8200|± |0.0243| +| - bbh_cot_fewshot_disambiguation_qa | 4|get-answer | 3|exact_match|↑ | 0.6000|± |0.0310| +| - bbh_cot_fewshot_dyck_languages | 4|get-answer | 3|exact_match|↑ | 0.0560|± |0.0146| +| - bbh_cot_fewshot_formal_fallacies | 4|get-answer | 3|exact_match|↑ | 0.5400|± |0.0316| +| - bbh_cot_fewshot_geometric_shapes | 4|get-answer | 3|exact_match|↑ | 0.6240|± |0.0307| +| - bbh_cot_fewshot_hyperbaton | 4|get-answer | 3|exact_match|↑ | 0.9600|± |0.0124| +| - bbh_cot_fewshot_logical_deduction_five_objects | 4|get-answer | 3|exact_match|↑ | 0.5800|± |0.0313| +| - bbh_cot_fewshot_logical_deduction_seven_objects | 4|get-answer | 3|exact_match|↑ | 0.4280|± |0.0314| +| - bbh_cot_fewshot_logical_deduction_three_objects | 4|get-answer | 3|exact_match|↑ | 0.9080|± |0.0183| +| - bbh_cot_fewshot_movie_recommendation | 4|get-answer | 3|exact_match|↑ | 0.7200|± |0.0285| +| - bbh_cot_fewshot_multistep_arithmetic_two | 4|get-answer | 3|exact_match|↑ | 0.6360|± |0.0305| +| - bbh_cot_fewshot_navigate | 4|get-answer | 3|exact_match|↑ | 0.8160|± |0.0246| +| - bbh_cot_fewshot_object_counting | 4|get-answer | 3|exact_match|↑ | 0.9000|± |0.0190| +| - bbh_cot_fewshot_penguins_in_a_table | 4|get-answer | 3|exact_match|↑ | 0.7260|± |0.0370| +| - bbh_cot_fewshot_reasoning_about_colored_objects | 4|get-answer | 3|exact_match|↑ | 0.8000|± |0.0253| +| - bbh_cot_fewshot_ruin_names | 4|get-answer | 3|exact_match|↑ | 0.6400|± |0.0304| +| - bbh_cot_fewshot_salient_translation_error_detection | 4|get-answer | 3|exact_match|↑ | 0.4800|± |0.0317| +| - bbh_cot_fewshot_snarks | 4|get-answer | 3|exact_match|↑ | 0.6517|± |0.0358| +| - bbh_cot_fewshot_sports_understanding | 4|get-answer | 3|exact_match|↑ | 0.9240|± |0.0168| +| - bbh_cot_fewshot_temporal_sequences | 4|get-answer | 3|exact_match|↑ | 0.5760|± |0.0313| +| - bbh_cot_fewshot_tracking_shuffled_objects_five_objects | 4|get-answer | 3|exact_match|↑ | 0.8000|± |0.0253| +| - bbh_cot_fewshot_tracking_shuffled_objects_seven_objects| 4|get-answer | 3|exact_match|↑ | 0.7720|± |0.0266| +| - bbh_cot_fewshot_tracking_shuffled_objects_three_objects| 4|get-answer | 3|exact_match|↑ | 0.8000|± |0.0253| +| - bbh_cot_fewshot_web_of_lies | 4|get-answer | 3|exact_match|↑ | 1.0000|± |0.0000| +| - bbh_cot_fewshot_word_sorting | 4|get-answer | 3|exact_match|↑ | 0.4200|± |0.0313| +|boolq | 2|none | 0|acc |↑ | 0.8602|± |0.0061| +|drop | 3|none | 0|em |↑ | 0.0229|± |0.0015| +| | |none | 0|f1 |↑ | 0.0714|± |0.0021| +|gpqa_diamond_cot_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.1566|± |0.0259| +| | 
|strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_diamond_cot_zeroshot | 1|flexible-extract | 0|exact_match|↑ | 0.2121|± |0.0291| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_diamond_generative_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.2828|± |0.0321| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_diamond_n_shot | 2|none | 0|acc |↑ | 0.3081|± |0.0329| +| | |none | 0|acc_norm |↑ | 0.3081|± |0.0329| +|gpqa_diamond_zeroshot | 1|none | 0|acc |↑ | 0.3384|± |0.0337| +| | |none | 0|acc_norm |↑ | 0.3384|± |0.0337| +|gpqa_extended_cot_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.1978|± |0.0171| +| | |strict-match | 0|exact_match|↑ | 0.0018|± |0.0018| +|gpqa_extended_cot_zeroshot | 1|flexible-extract | 0|exact_match|↑ | 0.2546|± |0.0187| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_extended_generative_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.3095|± |0.0198| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_extended_n_shot | 2|none | 0|acc |↑ | 0.3645|± |0.0206| +| | |none | 0|acc_norm |↑ | 0.3645|± |0.0206| +|gpqa_extended_zeroshot | 1|none | 0|acc |↑ | 0.2985|± |0.0196| +| | |none | 0|acc_norm |↑ | 0.2985|± |0.0196| +|gpqa_main_cot_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.2299|± |0.0199| +| | |strict-match | 0|exact_match|↑ | 0.0022|± |0.0022| +|gpqa_main_cot_zeroshot | 1|flexible-extract | 0|exact_match|↑ | 0.2254|± |0.0198| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_main_generative_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.3058|± |0.0218| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_main_n_shot | 2|none | 0|acc |↑ | 0.3125|± |0.0219| +| | |none | 0|acc_norm |↑ | 0.3125|± |0.0219| +|gpqa_main_zeroshot | 1|none | 0|acc |↑ | 0.3415|± |0.0224| +| | |none | 0|acc_norm |↑ | 0.3415|± |0.0224| +|gsm8k | 3|flexible-extract | 5|exact_match|↑ | 0.7786|± |0.0114| +| | |strict-match | 5|exact_match|↑ | 0.7748|± |0.0115| +|hellaswag | 1|none | 0|acc |↑ | 0.5959|± |0.0049| +| | |none | 0|acc_norm |↑ | 0.7911|± |0.0041| +|mmlu | 2|none | |acc |↑ | 0.6407|± |0.0038| +| - humanities | 2|none | |acc |↑ | 0.5792|± |0.0068| +| - formal_logic | 1|none | 0|acc |↑ | 0.4365|± |0.0444| +| - high_school_european_history | 1|none | 0|acc |↑ | 0.7515|± |0.0337| +| - high_school_us_history | 1|none | 0|acc |↑ | 0.8480|± |0.0252| +| - high_school_world_history | 1|none | 0|acc |↑ | 0.8186|± |0.0251| +| - international_law | 1|none | 0|acc |↑ | 0.7851|± |0.0375| +| - jurisprudence | 1|none | 0|acc |↑ | 0.7870|± |0.0396| +| - logical_fallacies | 1|none | 0|acc |↑ | 0.7546|± |0.0338| +| - moral_disputes | 1|none | 0|acc |↑ | 0.6792|± |0.0251| +| - moral_scenarios | 1|none | 0|acc |↑ | 0.3453|± |0.0159| +| - philosophy | 1|none | 0|acc |↑ | 0.7042|± |0.0259| +| - prehistory | 1|none | 0|acc |↑ | 0.6821|± |0.0259| +| - professional_law | 1|none | 0|acc |↑ | 0.4922|± |0.0128| +| - world_religions | 1|none | 0|acc |↑ | 0.8012|± |0.0306| +| - other | 2|none | |acc |↑ | 0.7123|± |0.0079| +| - business_ethics | 1|none | 0|acc |↑ | 0.6200|± |0.0488| +| - clinical_knowledge | 1|none | 0|acc |↑ | 0.6792|± |0.0287| +| - college_medicine | 1|none | 0|acc |↑ | 0.6590|± |0.0361| +| - global_facts | 1|none | 0|acc |↑ | 0.4400|± |0.0499| +| - human_aging | 1|none | 0|acc |↑ | 0.6906|± |0.0310| +| - management | 1|none | 0|acc |↑ | 0.7767|± |0.0412| +| - marketing | 1|none | 0|acc |↑ | 0.8675|± |0.0222| +| - medical_genetics | 1|none | 0|acc |↑ | 0.7100|± |0.0456| +| - miscellaneous | 1|none 
| 0|acc |↑ | 0.8186|± |0.0138| +| - nutrition | 1|none | 0|acc |↑ | 0.7549|± |0.0246| +| - professional_accounting | 1|none | 0|acc |↑ | 0.5142|± |0.0298| +| - professional_medicine | 1|none | 0|acc |↑ | 0.7206|± |0.0273| +| - virology | 1|none | 0|acc |↑ | 0.5542|± |0.0387| +| - social sciences | 2|none | |acc |↑ | 0.7439|± |0.0077| +| - econometrics | 1|none | 0|acc |↑ | 0.4386|± |0.0467| +| - high_school_geography | 1|none | 0|acc |↑ | 0.7980|± |0.0286| +| - high_school_government_and_politics | 1|none | 0|acc |↑ | 0.8860|± |0.0229| +| - high_school_macroeconomics | 1|none | 0|acc |↑ | 0.6641|± |0.0239| +| - high_school_microeconomics | 1|none | 0|acc |↑ | 0.7017|± |0.0297| +| - high_school_psychology | 1|none | 0|acc |↑ | 0.8532|± |0.0152| +| - human_sexuality | 1|none | 0|acc |↑ | 0.7481|± |0.0381| +| - professional_psychology | 1|none | 0|acc |↑ | 0.6667|± |0.0191| +| - public_relations | 1|none | 0|acc |↑ | 0.6818|± |0.0446| +| - security_studies | 1|none | 0|acc |↑ | 0.7469|± |0.0278| +| - sociology | 1|none | 0|acc |↑ | 0.8358|± |0.0262| +| - us_foreign_policy | 1|none | 0|acc |↑ | 0.8700|± |0.0338| +| - stem | 2|none | |acc |↑ | 0.5614|± |0.0085| +| - abstract_algebra | 1|none | 0|acc |↑ | 0.3500|± |0.0479| +| - anatomy | 1|none | 0|acc |↑ | 0.6741|± |0.0405| +| - astronomy | 1|none | 0|acc |↑ | 0.7237|± |0.0364| +| - college_biology | 1|none | 0|acc |↑ | 0.7708|± |0.0351| +| - college_chemistry | 1|none | 0|acc |↑ | 0.4600|± |0.0501| +| - college_computer_science | 1|none | 0|acc |↑ | 0.5900|± |0.0494| +| - college_mathematics | 1|none | 0|acc |↑ | 0.3700|± |0.0485| +| - college_physics | 1|none | 0|acc |↑ | 0.3725|± |0.0481| +| - computer_security | 1|none | 0|acc |↑ | 0.7000|± |0.0461| +| - conceptual_physics | 1|none | 0|acc |↑ | 0.5745|± |0.0323| +| - electrical_engineering | 1|none | 0|acc |↑ | 0.5517|± |0.0414| +| - elementary_mathematics | 1|none | 0|acc |↑ | 0.4577|± |0.0257| +| - high_school_biology | 1|none | 0|acc |↑ | 0.7968|± |0.0229| +| - high_school_chemistry | 1|none | 0|acc |↑ | 0.5320|± |0.0351| +| - high_school_computer_science | 1|none | 0|acc |↑ | 0.7200|± |0.0451| +| - high_school_mathematics | 1|none | 0|acc |↑ | 0.3889|± |0.0297| +| - high_school_physics | 1|none | 0|acc |↑ | 0.4371|± |0.0405| +| - high_school_statistics | 1|none | 0|acc |↑ | 0.5972|± |0.0334| +| - machine_learning | 1|none | 0|acc |↑ | 0.5179|± |0.0474| +|nq_open | 4|remove_whitespace| 0|exact_match|↑ | 0.1576|± |0.0061| +|openbookqa | 1|none | 0|acc |↑ | 0.3640|± |0.0215| +| | |none | 0|acc_norm |↑ | 0.4660|± |0.0223| +|piqa | 1|none | 0|acc |↑ | 0.8096|± |0.0092| +| | |none | 0|acc_norm |↑ | 0.8232|± |0.0089| +|qnli | 1|none | 0|acc |↑ | 0.4950|± |0.0068| +|sciq | 1|none | 0|acc |↑ | 0.9680|± |0.0056| +| | |none | 0|acc_norm |↑ | 0.9560|± |0.0065| +|triviaqa | 3|remove_whitespace| 0|exact_match|↑ | 0.5278|± |0.0037| +|truthfulqa_gen | 3|none | 0|bleu_acc |↑ | 0.0734|± |0.0091| +| | |none | 0|bleu_diff |↑ |-1.3264|± |0.3132| +| | |none | 0|bleu_max |↑ | 4.1401|± |0.4048| +| | |none | 0|rouge1_acc |↑ | 0.0759|± |0.0093| +| | |none | 0|rouge1_diff|↑ |-2.7245|± |0.4932| +| | |none | 0|rouge1_max |↑ | 9.7318|± |0.7309| +| | |none | 0|rouge2_acc |↑ | 0.0661|± |0.0087| +| | |none | 0|rouge2_diff|↑ |-2.5882|± |0.5097| +| | |none | 0|rouge2_max |↑ | 6.3193|± |0.5845| +| | |none | 0|rougeL_acc |↑ | 0.0722|± |0.0091| +| | |none | 0|rougeL_diff|↑ |-2.8073|± |0.4921| +| | |none | 0|rougeL_max |↑ | 9.1139|± |0.6993| +|truthfulqa_mc1 | 2|none | 0|acc |↑ | 0.3256|± |0.0164| +|truthfulqa_mc2 | 3|none | 
0|acc |↑ | 0.4867|± |0.0147| +|winogrande | 1|none | 0|acc |↑ | 0.7380|± |0.0124| + +| Groups |Version| Filter |n-shot| Metric | |Value | |Stderr| +|------------------|------:|----------|------|-----------|---|-----:|---|-----:| +|bbh | 3|get-answer| |exact_match|↑ |0.6925|± |0.0051| +|mmlu | 2|none | |acc |↑ |0.6407|± |0.0038| +| - humanities | 2|none | |acc |↑ |0.5792|± |0.0068| +| - other | 2|none | |acc |↑ |0.7123|± |0.0079| +| - social sciences| 2|none | |acc |↑ |0.7439|± |0.0077| +| - stem | 2|none | |acc |↑ |0.5614|± |0.0085| + +mistralai_Ministral-8B-Instruct-2410: 10h 46m 2s +✅ Benchmark completed for mistralai_Ministral-8B-Instruct-2410 + +🔥 Starting benchmark for mistralai_Ministral-8B-Instruct-2410 +Passed argument batch_size = auto:1. Detecting largest batch size +Determined largest batch size: 1 +Passed argument batch_size = auto. Detecting largest batch size +Determined Largest batch size: 1 +hf (pretrained=/home/jaymin/Documents/llm/llm_models/mistralai_Ministral-8B-Instruct-2410), gen_kwargs: (None), limit: None, num_fewshot: None, batch_size: auto (1) +| Tasks |Version| Filter |n-shot| Metric | | Value | |Stderr| +|----------------------------------------------------------|------:|-----------------|-----:|-----------|---|------:|---|-----:| +|anli_r1 | 1|none | 0|acc |↑ | 0.4880|± |0.0158| +|anli_r2 | 1|none | 0|acc |↑ | 0.4870|± |0.0158| +|anli_r3 | 1|none | 0|acc |↑ | 0.4658|± |0.0144| +|arc_challenge | 1|none | 0|acc |↑ | 0.5452|± |0.0146| +| | |none | 0|acc_norm |↑ | 0.5623|± |0.0145| +|bbh | 3|get-answer | |exact_match|↑ | 0.6925|± |0.0051| +| - bbh_cot_fewshot_boolean_expressions | 4|get-answer | 3|exact_match|↑ | 0.8920|± |0.0197| +| - bbh_cot_fewshot_causal_judgement | 4|get-answer | 3|exact_match|↑ | 0.6096|± |0.0358| +| - bbh_cot_fewshot_date_understanding | 4|get-answer | 3|exact_match|↑ | 0.8200|± |0.0243| +| - bbh_cot_fewshot_disambiguation_qa | 4|get-answer | 3|exact_match|↑ | 0.6000|± |0.0310| +| - bbh_cot_fewshot_dyck_languages | 4|get-answer | 3|exact_match|↑ | 0.0560|± |0.0146| +| - bbh_cot_fewshot_formal_fallacies | 4|get-answer | 3|exact_match|↑ | 0.5400|± |0.0316| +| - bbh_cot_fewshot_geometric_shapes | 4|get-answer | 3|exact_match|↑ | 0.6240|± |0.0307| +| - bbh_cot_fewshot_hyperbaton | 4|get-answer | 3|exact_match|↑ | 0.9600|± |0.0124| +| - bbh_cot_fewshot_logical_deduction_five_objects | 4|get-answer | 3|exact_match|↑ | 0.5800|± |0.0313| +| - bbh_cot_fewshot_logical_deduction_seven_objects | 4|get-answer | 3|exact_match|↑ | 0.4280|± |0.0314| +| - bbh_cot_fewshot_logical_deduction_three_objects | 4|get-answer | 3|exact_match|↑ | 0.9080|± |0.0183| +| - bbh_cot_fewshot_movie_recommendation | 4|get-answer | 3|exact_match|↑ | 0.7200|± |0.0285| +| - bbh_cot_fewshot_multistep_arithmetic_two | 4|get-answer | 3|exact_match|↑ | 0.6360|± |0.0305| +| - bbh_cot_fewshot_navigate | 4|get-answer | 3|exact_match|↑ | 0.8160|± |0.0246| +| - bbh_cot_fewshot_object_counting | 4|get-answer | 3|exact_match|↑ | 0.9000|± |0.0190| +| - bbh_cot_fewshot_penguins_in_a_table | 4|get-answer | 3|exact_match|↑ | 0.7260|± |0.0370| +| - bbh_cot_fewshot_reasoning_about_colored_objects | 4|get-answer | 3|exact_match|↑ | 0.8000|± |0.0253| +| - bbh_cot_fewshot_ruin_names | 4|get-answer | 3|exact_match|↑ | 0.6400|± |0.0304| +| - bbh_cot_fewshot_salient_translation_error_detection | 4|get-answer | 3|exact_match|↑ | 0.4800|± |0.0317| +| - bbh_cot_fewshot_snarks | 4|get-answer | 3|exact_match|↑ | 0.6517|± |0.0358| +| - bbh_cot_fewshot_sports_understanding | 4|get-answer | 3|exact_match|↑ | 
0.9240|± |0.0168| +| - bbh_cot_fewshot_temporal_sequences | 4|get-answer | 3|exact_match|↑ | 0.5760|± |0.0313| +| - bbh_cot_fewshot_tracking_shuffled_objects_five_objects | 4|get-answer | 3|exact_match|↑ | 0.8000|± |0.0253| +| - bbh_cot_fewshot_tracking_shuffled_objects_seven_objects| 4|get-answer | 3|exact_match|↑ | 0.7720|± |0.0266| +| - bbh_cot_fewshot_tracking_shuffled_objects_three_objects| 4|get-answer | 3|exact_match|↑ | 0.8000|± |0.0253| +| - bbh_cot_fewshot_web_of_lies | 4|get-answer | 3|exact_match|↑ | 1.0000|± |0.0000| +| - bbh_cot_fewshot_word_sorting | 4|get-answer | 3|exact_match|↑ | 0.4200|± |0.0313| +|boolq | 2|none | 0|acc |↑ | 0.8602|± |0.0061| +|drop | 3|none | 0|em |↑ | 0.0229|± |0.0015| +| | |none | 0|f1 |↑ | 0.0714|± |0.0021| +|gpqa_diamond_cot_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.1566|± |0.0259| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_diamond_cot_zeroshot | 1|flexible-extract | 0|exact_match|↑ | 0.2121|± |0.0291| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_diamond_generative_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.2828|± |0.0321| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_diamond_n_shot | 2|none | 0|acc |↑ | 0.3081|± |0.0329| +| | |none | 0|acc_norm |↑ | 0.3081|± |0.0329| +|gpqa_diamond_zeroshot | 1|none | 0|acc |↑ | 0.3384|± |0.0337| +| | |none | 0|acc_norm |↑ | 0.3384|± |0.0337| +|gpqa_extended_cot_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.1978|± |0.0171| +| | |strict-match | 0|exact_match|↑ | 0.0018|± |0.0018| +|gpqa_extended_cot_zeroshot | 1|flexible-extract | 0|exact_match|↑ | 0.2546|± |0.0187| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_extended_generative_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.3095|± |0.0198| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_extended_n_shot | 2|none | 0|acc |↑ | 0.3645|± |0.0206| +| | |none | 0|acc_norm |↑ | 0.3645|± |0.0206| +|gpqa_extended_zeroshot | 1|none | 0|acc |↑ | 0.2985|± |0.0196| +| | |none | 0|acc_norm |↑ | 0.2985|± |0.0196| +|gpqa_main_cot_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.2299|± |0.0199| +| | |strict-match | 0|exact_match|↑ | 0.0022|± |0.0022| +|gpqa_main_cot_zeroshot | 1|flexible-extract | 0|exact_match|↑ | 0.2254|± |0.0198| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_main_generative_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.3058|± |0.0218| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_main_n_shot | 2|none | 0|acc |↑ | 0.3125|± |0.0219| +| | |none | 0|acc_norm |↑ | 0.3125|± |0.0219| +|gpqa_main_zeroshot | 1|none | 0|acc |↑ | 0.3415|± |0.0224| +| | |none | 0|acc_norm |↑ | 0.3415|± |0.0224| +|gsm8k | 3|flexible-extract | 5|exact_match|↑ | 0.7786|± |0.0114| +| | |strict-match | 5|exact_match|↑ | 0.7748|± |0.0115| +|hellaswag | 1|none | 0|acc |↑ | 0.5959|± |0.0049| +| | |none | 0|acc_norm |↑ | 0.7911|± |0.0041| +|mmlu | 2|none | |acc |↑ | 0.6407|± |0.0038| +| - humanities | 2|none | |acc |↑ | 0.5792|± |0.0068| +| - formal_logic | 1|none | 0|acc |↑ | 0.4365|± |0.0444| +| - high_school_european_history | 1|none | 0|acc |↑ | 0.7515|± |0.0337| +| - high_school_us_history | 1|none | 0|acc |↑ | 0.8480|± |0.0252| +| - high_school_world_history | 1|none | 0|acc |↑ | 0.8186|± |0.0251| +| - international_law | 1|none | 0|acc |↑ | 0.7851|± |0.0375| +| - jurisprudence | 1|none | 0|acc |↑ | 0.7870|± |0.0396| +| - logical_fallacies | 1|none | 0|acc |↑ | 0.7546|± |0.0338| +| - moral_disputes | 1|none | 0|acc |↑ | 0.6792|± |0.0251| 
+| - moral_scenarios | 1|none | 0|acc |↑ | 0.3453|± |0.0159| +| - philosophy | 1|none | 0|acc |↑ | 0.7042|± |0.0259| +| - prehistory | 1|none | 0|acc |↑ | 0.6821|± |0.0259| +| - professional_law | 1|none | 0|acc |↑ | 0.4922|± |0.0128| +| - world_religions | 1|none | 0|acc |↑ | 0.8012|± |0.0306| +| - other | 2|none | |acc |↑ | 0.7123|± |0.0079| +| - business_ethics | 1|none | 0|acc |↑ | 0.6200|± |0.0488| +| - clinical_knowledge | 1|none | 0|acc |↑ | 0.6792|± |0.0287| +| - college_medicine | 1|none | 0|acc |↑ | 0.6590|± |0.0361| +| - global_facts | 1|none | 0|acc |↑ | 0.4400|± |0.0499| +| - human_aging | 1|none | 0|acc |↑ | 0.6906|± |0.0310| +| - management | 1|none | 0|acc |↑ | 0.7767|± |0.0412| +| - marketing | 1|none | 0|acc |↑ | 0.8675|± |0.0222| +| - medical_genetics | 1|none | 0|acc |↑ | 0.7100|± |0.0456| +| - miscellaneous | 1|none | 0|acc |↑ | 0.8186|± |0.0138| +| - nutrition | 1|none | 0|acc |↑ | 0.7549|± |0.0246| +| - professional_accounting | 1|none | 0|acc |↑ | 0.5142|± |0.0298| +| - professional_medicine | 1|none | 0|acc |↑ | 0.7206|± |0.0273| +| - virology | 1|none | 0|acc |↑ | 0.5542|± |0.0387| +| - social sciences | 2|none | |acc |↑ | 0.7439|± |0.0077| +| - econometrics | 1|none | 0|acc |↑ | 0.4386|± |0.0467| +| - high_school_geography | 1|none | 0|acc |↑ | 0.7980|± |0.0286| +| - high_school_government_and_politics | 1|none | 0|acc |↑ | 0.8860|± |0.0229| +| - high_school_macroeconomics | 1|none | 0|acc |↑ | 0.6641|± |0.0239| +| - high_school_microeconomics | 1|none | 0|acc |↑ | 0.7017|± |0.0297| +| - high_school_psychology | 1|none | 0|acc |↑ | 0.8532|± |0.0152| +| - human_sexuality | 1|none | 0|acc |↑ | 0.7481|± |0.0381| +| - professional_psychology | 1|none | 0|acc |↑ | 0.6667|± |0.0191| +| - public_relations | 1|none | 0|acc |↑ | 0.6818|± |0.0446| +| - security_studies | 1|none | 0|acc |↑ | 0.7469|± |0.0278| +| - sociology | 1|none | 0|acc |↑ | 0.8358|± |0.0262| +| - us_foreign_policy | 1|none | 0|acc |↑ | 0.8700|± |0.0338| +| - stem | 2|none | |acc |↑ | 0.5614|± |0.0085| +| - abstract_algebra | 1|none | 0|acc |↑ | 0.3500|± |0.0479| +| - anatomy | 1|none | 0|acc |↑ | 0.6741|± |0.0405| +| - astronomy | 1|none | 0|acc |↑ | 0.7237|± |0.0364| +| - college_biology | 1|none | 0|acc |↑ | 0.7708|± |0.0351| +| - college_chemistry | 1|none | 0|acc |↑ | 0.4600|± |0.0501| +| - college_computer_science | 1|none | 0|acc |↑ | 0.5900|± |0.0494| +| - college_mathematics | 1|none | 0|acc |↑ | 0.3700|± |0.0485| +| - college_physics | 1|none | 0|acc |↑ | 0.3725|± |0.0481| +| - computer_security | 1|none | 0|acc |↑ | 0.7000|± |0.0461| +| - conceptual_physics | 1|none | 0|acc |↑ | 0.5745|± |0.0323| +| - electrical_engineering | 1|none | 0|acc |↑ | 0.5517|± |0.0414| +| - elementary_mathematics | 1|none | 0|acc |↑ | 0.4577|± |0.0257| +| - high_school_biology | 1|none | 0|acc |↑ | 0.7968|± |0.0229| +| - high_school_chemistry | 1|none | 0|acc |↑ | 0.5320|± |0.0351| +| - high_school_computer_science | 1|none | 0|acc |↑ | 0.7200|± |0.0451| +| - high_school_mathematics | 1|none | 0|acc |↑ | 0.3889|± |0.0297| +| - high_school_physics | 1|none | 0|acc |↑ | 0.4371|± |0.0405| +| - high_school_statistics | 1|none | 0|acc |↑ | 0.5972|± |0.0334| +| - machine_learning | 1|none | 0|acc |↑ | 0.5179|± |0.0474| +|nq_open | 4|remove_whitespace| 0|exact_match|↑ | 0.1576|± |0.0061| +|openbookqa | 1|none | 0|acc |↑ | 0.3640|± |0.0215| +| | |none | 0|acc_norm |↑ | 0.4660|± |0.0223| +|piqa | 1|none | 0|acc |↑ | 0.8096|± |0.0092| +| | |none | 0|acc_norm |↑ | 0.8232|± |0.0089| +|qnli | 1|none | 0|acc |↑ | 0.4950|± 
|0.0068| +|sciq | 1|none | 0|acc |↑ | 0.9680|± |0.0056| +| | |none | 0|acc_norm |↑ | 0.9560|± |0.0065| +|triviaqa | 3|remove_whitespace| 0|exact_match|↑ | 0.5278|± |0.0037| +|truthfulqa_gen | 3|none | 0|bleu_acc |↑ | 0.0734|± |0.0091| +| | |none | 0|bleu_diff |↑ |-1.3264|± |0.3132| +| | |none | 0|bleu_max |↑ | 4.1401|± |0.4048| +| | |none | 0|rouge1_acc |↑ | 0.0759|± |0.0093| +| | |none | 0|rouge1_diff|↑ |-2.7245|± |0.4932| +| | |none | 0|rouge1_max |↑ | 9.7318|± |0.7309| +| | |none | 0|rouge2_acc |↑ | 0.0661|± |0.0087| +| | |none | 0|rouge2_diff|↑ |-2.5882|± |0.5097| +| | |none | 0|rouge2_max |↑ | 6.3193|± |0.5845| +| | |none | 0|rougeL_acc |↑ | 0.0722|± |0.0091| +| | |none | 0|rougeL_diff|↑ |-2.8073|± |0.4921| +| | |none | 0|rougeL_max |↑ | 9.1139|± |0.6993| +|truthfulqa_mc1 | 2|none | 0|acc |↑ | 0.3256|± |0.0164| +|truthfulqa_mc2 | 3|none | 0|acc |↑ | 0.4867|± |0.0147| +|winogrande | 1|none | 0|acc |↑ | 0.7380|± |0.0124| + +| Groups |Version| Filter |n-shot| Metric | |Value | |Stderr| +|------------------|------:|----------|------|-----------|---|-----:|---|-----:| +|bbh | 3|get-answer| |exact_match|↑ |0.6925|± |0.0051| +|mmlu | 2|none | |acc |↑ |0.6407|± |0.0038| +| - humanities | 2|none | |acc |↑ |0.5792|± |0.0068| +| - other | 2|none | |acc |↑ |0.7123|± |0.0079| +| - social sciences| 2|none | |acc |↑ |0.7439|± |0.0077| +| - stem | 2|none | |acc |↑ |0.5614|± |0.0085| + +mistralai_Ministral-8B-Instruct-2410: 10h 46m 19s +✅ Benchmark completed for mistralai_Ministral-8B-Instruct-2410 + +🔥 Starting benchmark for google_gemma-3-4b-it +Passed argument batch_size = auto:1. Detecting largest batch size +Determined largest batch size: 4 +Passed argument batch_size = auto. Detecting largest batch size +Determined Largest batch size: 4 +hf (pretrained=/home/jaymin/Documents/llm/llm_models/google_gemma-3-4b-it), gen_kwargs: (None), limit: None, num_fewshot: None, batch_size: auto (4) +| Tasks |Version| Filter |n-shot| Metric | | Value | |Stderr| +|----------------------------------------------------------|------:|-----------------|-----:|-----------|---|------:|---|-----:| +|anli_r1 | 1|none | 0|acc |↑ | 0.4920|± |0.0158| +|anli_r2 | 1|none | 0|acc |↑ | 0.4710|± |0.0158| +|anli_r3 | 1|none | 0|acc |↑ | 0.4683|± |0.0144| +|arc_challenge | 1|none | 0|acc |↑ | 0.5341|± |0.0146| +| | |none | 0|acc_norm |↑ | 0.5708|± |0.0145| +|bbh | 3|get-answer | |exact_match|↑ | 0.7094|± |0.0050| +| - bbh_cot_fewshot_boolean_expressions | 4|get-answer | 3|exact_match|↑ | 0.9000|± |0.0190| +| - bbh_cot_fewshot_causal_judgement | 4|get-answer | 3|exact_match|↑ | 0.5775|± |0.0362| +| - bbh_cot_fewshot_date_understanding | 4|get-answer | 3|exact_match|↑ | 0.7760|± |0.0264| +| - bbh_cot_fewshot_disambiguation_qa | 4|get-answer | 3|exact_match|↑ | 0.5640|± |0.0314| +| - bbh_cot_fewshot_dyck_languages | 4|get-answer | 3|exact_match|↑ | 0.5440|± |0.0316| +| - bbh_cot_fewshot_formal_fallacies | 4|get-answer | 3|exact_match|↑ | 0.3440|± |0.0301| +| - bbh_cot_fewshot_geometric_shapes | 4|get-answer | 3|exact_match|↑ | 0.2280|± |0.0266| +| - bbh_cot_fewshot_hyperbaton | 4|get-answer | 3|exact_match|↑ | 0.9760|± |0.0097| +| - bbh_cot_fewshot_logical_deduction_five_objects | 4|get-answer | 3|exact_match|↑ | 0.6040|± |0.0310| +| - bbh_cot_fewshot_logical_deduction_seven_objects | 4|get-answer | 3|exact_match|↑ | 0.4880|± |0.0317| +| - bbh_cot_fewshot_logical_deduction_three_objects | 4|get-answer | 3|exact_match|↑ | 0.9080|± |0.0183| +| - bbh_cot_fewshot_movie_recommendation | 4|get-answer | 3|exact_match|↑ | 0.7000|± |0.0290| 
+| - bbh_cot_fewshot_multistep_arithmetic_two | 4|get-answer | 3|exact_match|↑ | 0.8400|± |0.0232| +| - bbh_cot_fewshot_navigate | 4|get-answer | 3|exact_match|↑ | 0.8680|± |0.0215| +| - bbh_cot_fewshot_object_counting | 4|get-answer | 3|exact_match|↑ | 0.9120|± |0.0180| +| - bbh_cot_fewshot_penguins_in_a_table | 4|get-answer | 3|exact_match|↑ | 0.7466|± |0.0361| +| - bbh_cot_fewshot_reasoning_about_colored_objects | 4|get-answer | 3|exact_match|↑ | 0.7240|± |0.0283| +| - bbh_cot_fewshot_ruin_names | 4|get-answer | 3|exact_match|↑ | 0.6760|± |0.0297| +| - bbh_cot_fewshot_salient_translation_error_detection | 4|get-answer | 3|exact_match|↑ | 0.5240|± |0.0316| +| - bbh_cot_fewshot_snarks | 4|get-answer | 3|exact_match|↑ | 0.6404|± |0.0361| +| - bbh_cot_fewshot_sports_understanding | 4|get-answer | 3|exact_match|↑ | 0.8880|± |0.0200| +| - bbh_cot_fewshot_temporal_sequences | 4|get-answer | 3|exact_match|↑ | 0.8600|± |0.0220| +| - bbh_cot_fewshot_tracking_shuffled_objects_five_objects | 4|get-answer | 3|exact_match|↑ | 0.8080|± |0.0250| +| - bbh_cot_fewshot_tracking_shuffled_objects_seven_objects| 4|get-answer | 3|exact_match|↑ | 0.7840|± |0.0261| +| - bbh_cot_fewshot_tracking_shuffled_objects_three_objects| 4|get-answer | 3|exact_match|↑ | 0.9760|± |0.0097| +| - bbh_cot_fewshot_web_of_lies | 4|get-answer | 3|exact_match|↑ | 0.9960|± |0.0040| +| - bbh_cot_fewshot_word_sorting | 4|get-answer | 3|exact_match|↑ | 0.2640|± |0.0279| +|boolq | 2|none | 0|acc |↑ | 0.8398|± |0.0064| +|drop | 3|none | 0|em |↑ | 0.0055|± |0.0008| +| | |none | 0|f1 |↑ | 0.0893|± |0.0018| +|gpqa_diamond_cot_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.1667|± |0.0266| +| | |strict-match | 0|exact_match|↑ | 0.0152|± |0.0087| +|gpqa_diamond_cot_zeroshot | 1|flexible-extract | 0|exact_match|↑ | 0.1515|± |0.0255| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_diamond_generative_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.3384|± |0.0337| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_diamond_n_shot | 2|none | 0|acc |↑ | 0.3434|± |0.0338| +| | |none | 0|acc_norm |↑ | 0.3434|± |0.0338| +|gpqa_diamond_zeroshot | 1|none | 0|acc |↑ | 0.3535|± |0.0341| +| | |none | 0|acc_norm |↑ | 0.3535|± |0.0341| +|gpqa_extended_cot_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.1740|± |0.0162| +| | |strict-match | 0|exact_match|↑ | 0.0147|± |0.0051| +|gpqa_extended_cot_zeroshot | 1|flexible-extract | 0|exact_match|↑ | 0.1612|± |0.0158| +| | |strict-match | 0|exact_match|↑ | 0.0018|± |0.0018| +|gpqa_extended_generative_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.2985|± |0.0196| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_extended_n_shot | 2|none | 0|acc |↑ | 0.2711|± |0.0190| +| | |none | 0|acc_norm |↑ | 0.2711|± |0.0190| +|gpqa_extended_zeroshot | 1|none | 0|acc |↑ | 0.3407|± |0.0203| +| | |none | 0|acc_norm |↑ | 0.3407|± |0.0203| +|gpqa_main_cot_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.1496|± |0.0169| +| | |strict-match | 0|exact_match|↑ | 0.0067|± |0.0039| +|gpqa_main_cot_zeroshot | 1|flexible-extract | 0|exact_match|↑ | 0.1496|± |0.0169| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_main_generative_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.3125|± |0.0219| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_main_n_shot | 2|none | 0|acc |↑ | 0.3080|± |0.0218| +| | |none | 0|acc_norm |↑ | 0.3080|± |0.0218| +|gpqa_main_zeroshot | 1|none | 0|acc |↑ | 0.2879|± |0.0214| +| | |none | 0|acc_norm |↑ | 0.2879|± |0.0214| +|gsm8k | 
3|flexible-extract | 5|exact_match|↑ | 0.7665|± |0.0117| +| | |strict-match | 5|exact_match|↑ | 0.7619|± |0.0117| +|hellaswag | 1|none | 0|acc |↑ | 0.5599|± |0.0050| +| | |none | 0|acc_norm |↑ | 0.7414|± |0.0044| +|mmlu | 2|none | |acc |↑ | 0.5756|± |0.0039| +| - humanities | 2|none | |acc |↑ | 0.5163|± |0.0068| +| - formal_logic | 1|none | 0|acc |↑ | 0.3571|± |0.0429| +| - high_school_european_history | 1|none | 0|acc |↑ | 0.7455|± |0.0340| +| - high_school_us_history | 1|none | 0|acc |↑ | 0.7451|± |0.0306| +| - high_school_world_history | 1|none | 0|acc |↑ | 0.7468|± |0.0283| +| - international_law | 1|none | 0|acc |↑ | 0.7438|± |0.0398| +| - jurisprudence | 1|none | 0|acc |↑ | 0.7037|± |0.0441| +| - logical_fallacies | 1|none | 0|acc |↑ | 0.7178|± |0.0354| +| - moral_disputes | 1|none | 0|acc |↑ | 0.6301|± |0.0260| +| - moral_scenarios | 1|none | 0|acc |↑ | 0.2480|± |0.0144| +| - philosophy | 1|none | 0|acc |↑ | 0.6592|± |0.0269| +| - prehistory | 1|none | 0|acc |↑ | 0.6821|± |0.0259| +| - professional_law | 1|none | 0|acc |↑ | 0.4237|± |0.0126| +| - world_religions | 1|none | 0|acc |↑ | 0.7778|± |0.0319| +| - other | 2|none | |acc |↑ | 0.6369|± |0.0083| +| - business_ethics | 1|none | 0|acc |↑ | 0.6000|± |0.0492| +| - clinical_knowledge | 1|none | 0|acc |↑ | 0.6566|± |0.0292| +| - college_medicine | 1|none | 0|acc |↑ | 0.5723|± |0.0377| +| - global_facts | 1|none | 0|acc |↑ | 0.2900|± |0.0456| +| - human_aging | 1|none | 0|acc |↑ | 0.6278|± |0.0324| +| - management | 1|none | 0|acc |↑ | 0.7282|± |0.0441| +| - marketing | 1|none | 0|acc |↑ | 0.8462|± |0.0236| +| - medical_genetics | 1|none | 0|acc |↑ | 0.6300|± |0.0485| +| - miscellaneous | 1|none | 0|acc |↑ | 0.7573|± |0.0153| +| - nutrition | 1|none | 0|acc |↑ | 0.6438|± |0.0274| +| - professional_accounting | 1|none | 0|acc |↑ | 0.3901|± |0.0291| +| - professional_medicine | 1|none | 0|acc |↑ | 0.5772|± |0.0300| +| - virology | 1|none | 0|acc |↑ | 0.5060|± |0.0389| +| - social sciences | 2|none | |acc |↑ | 0.6744|± |0.0083| +| - econometrics | 1|none | 0|acc |↑ | 0.4649|± |0.0469| +| - high_school_geography | 1|none | 0|acc |↑ | 0.7020|± |0.0326| +| - high_school_government_and_politics | 1|none | 0|acc |↑ | 0.8135|± |0.0281| +| - high_school_macroeconomics | 1|none | 0|acc |↑ | 0.5718|± |0.0251| +| - high_school_microeconomics | 1|none | 0|acc |↑ | 0.6387|± |0.0312| +| - high_school_psychology | 1|none | 0|acc |↑ | 0.7780|± |0.0178| +| - human_sexuality | 1|none | 0|acc |↑ | 0.6641|± |0.0414| +| - professional_psychology | 1|none | 0|acc |↑ | 0.5948|± |0.0199| +| - public_relations | 1|none | 0|acc |↑ | 0.6455|± |0.0458| +| - security_studies | 1|none | 0|acc |↑ | 0.6980|± |0.0294| +| - sociology | 1|none | 0|acc |↑ | 0.7612|± |0.0301| +| - us_foreign_policy | 1|none | 0|acc |↑ | 0.8100|± |0.0394| +| - stem | 2|none | |acc |↑ | 0.5071|± |0.0086| +| - abstract_algebra | 1|none | 0|acc |↑ | 0.3200|± |0.0469| +| - anatomy | 1|none | 0|acc |↑ | 0.5481|± |0.0430| +| - astronomy | 1|none | 0|acc |↑ | 0.6908|± |0.0376| +| - college_biology | 1|none | 0|acc |↑ | 0.6875|± |0.0388| +| - college_chemistry | 1|none | 0|acc |↑ | 0.4000|± |0.0492| +| - college_computer_science | 1|none | 0|acc |↑ | 0.4700|± |0.0502| +| - college_mathematics | 1|none | 0|acc |↑ | 0.3700|± |0.0485| +| - college_physics | 1|none | 0|acc |↑ | 0.3725|± |0.0481| +| - computer_security | 1|none | 0|acc |↑ | 0.6600|± |0.0476| +| - conceptual_physics | 1|none | 0|acc |↑ | 0.5404|± |0.0326| +| - electrical_engineering | 1|none | 0|acc |↑ | 0.5310|± |0.0416| +| - 
elementary_mathematics | 1|none | 0|acc |↑ | 0.4841|± |0.0257| +| - high_school_biology | 1|none | 0|acc |↑ | 0.7065|± |0.0259| +| - high_school_chemistry | 1|none | 0|acc |↑ | 0.5074|± |0.0352| +| - high_school_computer_science | 1|none | 0|acc |↑ | 0.7200|± |0.0451| +| - high_school_mathematics | 1|none | 0|acc |↑ | 0.3815|± |0.0296| +| - high_school_physics | 1|none | 0|acc |↑ | 0.3245|± |0.0382| +| - high_school_statistics | 1|none | 0|acc |↑ | 0.4074|± |0.0335| +| - machine_learning | 1|none | 0|acc |↑ | 0.3571|± |0.0455| +|nq_open | 4|remove_whitespace| 0|exact_match|↑ | 0.1094|± |0.0052| +|openbookqa | 1|none | 0|acc |↑ | 0.3640|± |0.0215| +| | |none | 0|acc_norm |↑ | 0.4660|± |0.0223| +|piqa | 1|none | 0|acc |↑ | 0.7628|± |0.0099| +| | |none | 0|acc_norm |↑ | 0.7720|± |0.0098| +|qnli | 1|none | 0|acc |↑ | 0.5660|± |0.0067| +|sciq | 1|none | 0|acc |↑ | 0.9550|± |0.0066| +| | |none | 0|acc_norm |↑ | 0.9310|± |0.0080| +|triviaqa | 3|remove_whitespace| 0|exact_match|↑ | 0.3148|± |0.0035| +|truthfulqa_gen | 3|none | 0|bleu_acc |↑ | 0.4333|± |0.0173| +| | |none | 0|bleu_diff |↑ |-1.4479|± |0.5140| +| | |none | 0|bleu_max |↑ |18.0994|± |0.6738| +| | |none | 0|rouge1_acc |↑ | 0.4235|± |0.0173| +| | |none | 0|rouge1_diff|↑ |-2.6851|± |0.6899| +| | |none | 0|rouge1_max |↑ |41.5023|± |0.8412| +| | |none | 0|rouge2_acc |↑ | 0.3195|± |0.0163| +| | |none | 0|rouge2_diff|↑ |-4.2870|± |0.7901| +| | |none | 0|rouge2_max |↑ |25.1379|± |0.9049| +| | |none | 0|rougeL_acc |↑ | 0.4247|± |0.0173| +| | |none | 0|rougeL_diff|↑ |-2.9853|± |0.6819| +| | |none | 0|rougeL_max |↑ |38.8207|± |0.8446| +|truthfulqa_mc1 | 2|none | 0|acc |↑ | 0.3488|± |0.0167| +|truthfulqa_mc2 | 3|none | 0|acc |↑ | 0.5188|± |0.0160| +|winogrande | 1|none | 0|acc |↑ | 0.7009|± |0.0129| + +| Groups |Version| Filter |n-shot| Metric | |Value | |Stderr| +|------------------|------:|----------|------|-----------|---|-----:|---|-----:| +|bbh | 3|get-answer| |exact_match|↑ |0.7094|± |0.0050| +|mmlu | 2|none | |acc |↑ |0.5756|± |0.0039| +| - humanities | 2|none | |acc |↑ |0.5163|± |0.0068| +| - other | 2|none | |acc |↑ |0.6369|± |0.0083| +| - social sciences| 2|none | |acc |↑ |0.6744|± |0.0083| +| - stem | 2|none | |acc |↑ |0.5071|± |0.0086| + +google_gemma-3-4b-it: 4h 51m 14s +✅ Benchmark completed for google_gemma-3-4b-it + +🔥 Starting benchmark for google_gemma-3-1b-it +Passed argument batch_size = auto:1. Detecting largest batch size +Determined largest batch size: 1 +Passed argument batch_size = auto. 
Detecting largest batch size +Determined Largest batch size: 1 +hf (pretrained=/home/jaymin/Documents/llm/llm_models/google_gemma-3-1b-it), gen_kwargs: (None), limit: None, num_fewshot: None, batch_size: auto (1) +| Tasks |Version| Filter |n-shot| Metric | | Value | |Stderr| +|----------------------------------------------------------|------:|-----------------|-----:|-----------|---|-------:|---|-----:| +|anli_r1 | 1|none | 0|acc |↑ | 0.3320|± |0.0149| +|anli_r2 | 1|none | 0|acc |↑ | 0.3540|± |0.0151| +|anli_r3 | 1|none | 0|acc |↑ | 0.3567|± |0.0138| +|arc_challenge | 1|none | 0|acc |↑ | 0.3532|± |0.0140| +| | |none | 0|acc_norm |↑ | 0.3805|± |0.0142| +|bbh | 3|get-answer | |exact_match|↑ | 0.3823|± |0.0053| +| - bbh_cot_fewshot_boolean_expressions | 4|get-answer | 3|exact_match|↑ | 0.8320|± |0.0237| +| - bbh_cot_fewshot_causal_judgement | 4|get-answer | 3|exact_match|↑ | 0.5134|± |0.0366| +| - bbh_cot_fewshot_date_understanding | 4|get-answer | 3|exact_match|↑ | 0.2680|± |0.0281| +| - bbh_cot_fewshot_disambiguation_qa | 4|get-answer | 3|exact_match|↑ | 0.3800|± |0.0308| +| - bbh_cot_fewshot_dyck_languages | 4|get-answer | 3|exact_match|↑ | 0.0920|± |0.0183| +| - bbh_cot_fewshot_formal_fallacies | 4|get-answer | 3|exact_match|↑ | 0.5040|± |0.0317| +| - bbh_cot_fewshot_geometric_shapes | 4|get-answer | 3|exact_match|↑ | 0.1240|± |0.0209| +| - bbh_cot_fewshot_hyperbaton | 4|get-answer | 3|exact_match|↑ | 0.5240|± |0.0316| +| - bbh_cot_fewshot_logical_deduction_five_objects | 4|get-answer | 3|exact_match|↑ | 0.2280|± |0.0266| +| - bbh_cot_fewshot_logical_deduction_seven_objects | 4|get-answer | 3|exact_match|↑ | 0.1560|± |0.0230| +| - bbh_cot_fewshot_logical_deduction_three_objects | 4|get-answer | 3|exact_match|↑ | 0.3960|± |0.0310| +| - bbh_cot_fewshot_movie_recommendation | 4|get-answer | 3|exact_match|↑ | 0.3080|± |0.0293| +| - bbh_cot_fewshot_multistep_arithmetic_two | 4|get-answer | 3|exact_match|↑ | 0.6560|± |0.0301| +| - bbh_cot_fewshot_navigate | 4|get-answer | 3|exact_match|↑ | 0.7440|± |0.0277| +| - bbh_cot_fewshot_object_counting | 4|get-answer | 3|exact_match|↑ | 0.5800|± |0.0313| +| - bbh_cot_fewshot_penguins_in_a_table | 4|get-answer | 3|exact_match|↑ | 0.1712|± |0.0313| +| - bbh_cot_fewshot_reasoning_about_colored_objects | 4|get-answer | 3|exact_match|↑ | 0.1600|± |0.0232| +| - bbh_cot_fewshot_ruin_names | 4|get-answer | 3|exact_match|↑ | 0.3640|± |0.0305| +| - bbh_cot_fewshot_salient_translation_error_detection | 4|get-answer | 3|exact_match|↑ | 0.1160|± |0.0203| +| - bbh_cot_fewshot_snarks | 4|get-answer | 3|exact_match|↑ | 0.5112|± |0.0376| +| - bbh_cot_fewshot_sports_understanding | 4|get-answer | 3|exact_match|↑ | 0.6120|± |0.0309| +| - bbh_cot_fewshot_temporal_sequences | 4|get-answer | 3|exact_match|↑ | 0.3040|± |0.0292| +| - bbh_cot_fewshot_tracking_shuffled_objects_five_objects | 4|get-answer | 3|exact_match|↑ | 0.2240|± |0.0264| +| - bbh_cot_fewshot_tracking_shuffled_objects_seven_objects| 4|get-answer | 3|exact_match|↑ | 0.1600|± |0.0232| +| - bbh_cot_fewshot_tracking_shuffled_objects_three_objects| 4|get-answer | 3|exact_match|↑ | 0.5040|± |0.0317| +| - bbh_cot_fewshot_web_of_lies | 4|get-answer | 3|exact_match|↑ | 0.8360|± |0.0235| +| - bbh_cot_fewshot_word_sorting | 4|get-answer | 3|exact_match|↑ | 0.0360|± |0.0118| +|boolq | 2|none | 0|acc |↑ | 0.7581|± |0.0075| +|drop | 3|none | 0|em |↑ | 0.0018|± |0.0004| +| | |none | 0|f1 |↑ | 0.0762|± |0.0017| +|gpqa_diamond_cot_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.1263|± |0.0237| +| | |strict-match | 
0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_diamond_cot_zeroshot | 1|flexible-extract | 0|exact_match|↑ | 0.1364|± |0.0245| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_diamond_generative_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.2172|± |0.0294| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_diamond_n_shot | 2|none | 0|acc |↑ | 0.2626|± |0.0314| +| | |none | 0|acc_norm |↑ | 0.2626|± |0.0314| +|gpqa_diamond_zeroshot | 1|none | 0|acc |↑ | 0.2374|± |0.0303| +| | |none | 0|acc_norm |↑ | 0.2374|± |0.0303| +|gpqa_extended_cot_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.1300|± |0.0144| +| | |strict-match | 0|exact_match|↑ | 0.0018|± |0.0018| +|gpqa_extended_cot_zeroshot | 1|flexible-extract | 0|exact_match|↑ | 0.1520|± |0.0154| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_extended_generative_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.2601|± |0.0188| +| | |strict-match | 0|exact_match|↑ | 0.0037|± |0.0026| +|gpqa_extended_n_shot | 2|none | 0|acc |↑ | 0.2491|± |0.0185| +| | |none | 0|acc_norm |↑ | 0.2491|± |0.0185| +|gpqa_extended_zeroshot | 1|none | 0|acc |↑ | 0.2637|± |0.0189| +| | |none | 0|acc_norm |↑ | 0.2637|± |0.0189| +|gpqa_main_cot_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.1183|± |0.0153| +| | |strict-match | 0|exact_match|↑ | 0.0022|± |0.0022| +|gpqa_main_cot_zeroshot | 1|flexible-extract | 0|exact_match|↑ | 0.1607|± |0.0174| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_main_generative_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.2344|± |0.0200| +| | |strict-match | 0|exact_match|↑ | 0.0045|± |0.0032| +|gpqa_main_n_shot | 2|none | 0|acc |↑ | 0.2679|± |0.0209| +| | |none | 0|acc_norm |↑ | 0.2679|± |0.0209| +|gpqa_main_zeroshot | 1|none | 0|acc |↑ | 0.2656|± |0.0209| +| | |none | 0|acc_norm |↑ | 0.2656|± |0.0209| +|gsm8k | 3|flexible-extract | 5|exact_match|↑ | 0.2502|± |0.0119| +| | |strict-match | 5|exact_match|↑ | 0.2472|± |0.0119| +|hellaswag | 1|none | 0|acc |↑ | 0.4338|± |0.0049| +| | |none | 0|acc_norm |↑ | 0.5783|± |0.0049| +|mmlu | 2|none | |acc |↑ | 0.3859|± |0.0040| +| - humanities | 2|none | |acc |↑ | 0.3626|± |0.0069| +| - formal_logic | 1|none | 0|acc |↑ | 0.3492|± |0.0426| +| - high_school_european_history | 1|none | 0|acc |↑ | 0.4909|± |0.0390| +| - high_school_us_history | 1|none | 0|acc |↑ | 0.4706|± |0.0350| +| - high_school_world_history | 1|none | 0|acc |↑ | 0.4726|± |0.0325| +| - international_law | 1|none | 0|acc |↑ | 0.5372|± |0.0455| +| - jurisprudence | 1|none | 0|acc |↑ | 0.4722|± |0.0483| +| - logical_fallacies | 1|none | 0|acc |↑ | 0.4417|± |0.0390| +| - moral_disputes | 1|none | 0|acc |↑ | 0.4220|± |0.0266| +| - moral_scenarios | 1|none | 0|acc |↑ | 0.2335|± |0.0141| +| - philosophy | 1|none | 0|acc |↑ | 0.4244|± |0.0281| +| - prehistory | 1|none | 0|acc |↑ | 0.4414|± |0.0276| +| - professional_law | 1|none | 0|acc |↑ | 0.3057|± |0.0118| +| - world_religions | 1|none | 0|acc |↑ | 0.5029|± |0.0383| +| - other | 2|none | |acc |↑ | 0.4335|± |0.0087| +| - business_ethics | 1|none | 0|acc |↑ | 0.3700|± |0.0485| +| - clinical_knowledge | 1|none | 0|acc |↑ | 0.4226|± |0.0304| +| - college_medicine | 1|none | 0|acc |↑ | 0.3815|± |0.0370| +| - global_facts | 1|none | 0|acc |↑ | 0.3600|± |0.0482| +| - human_aging | 1|none | 0|acc |↑ | 0.4529|± |0.0334| +| - management | 1|none | 0|acc |↑ | 0.5631|± |0.0491| +| - marketing | 1|none | 0|acc |↑ | 0.6239|± |0.0317| +| - medical_genetics | 1|none | 0|acc |↑ | 0.4100|± |0.0494| +| - miscellaneous | 1|none | 0|acc |↑ | 
0.5147|± |0.0179| +| - nutrition | 1|none | 0|acc |↑ | 0.4150|± |0.0282| +| - professional_accounting | 1|none | 0|acc |↑ | 0.2730|± |0.0266| +| - professional_medicine | 1|none | 0|acc |↑ | 0.2941|± |0.0277| +| - virology | 1|none | 0|acc |↑ | 0.3795|± |0.0378| +| - social sciences | 2|none | |acc |↑ | 0.4482|± |0.0088| +| - econometrics | 1|none | 0|acc |↑ | 0.2193|± |0.0389| +| - high_school_geography | 1|none | 0|acc |↑ | 0.5000|± |0.0356| +| - high_school_government_and_politics | 1|none | 0|acc |↑ | 0.4870|± |0.0361| +| - high_school_macroeconomics | 1|none | 0|acc |↑ | 0.3282|± |0.0238| +| - high_school_microeconomics | 1|none | 0|acc |↑ | 0.3782|± |0.0315| +| - high_school_psychology | 1|none | 0|acc |↑ | 0.5321|± |0.0214| +| - human_sexuality | 1|none | 0|acc |↑ | 0.5038|± |0.0439| +| - professional_psychology | 1|none | 0|acc |↑ | 0.3644|± |0.0195| +| - public_relations | 1|none | 0|acc |↑ | 0.4727|± |0.0478| +| - security_studies | 1|none | 0|acc |↑ | 0.5388|± |0.0319| +| - sociology | 1|none | 0|acc |↑ | 0.5970|± |0.0347| +| - us_foreign_policy | 1|none | 0|acc |↑ | 0.6000|± |0.0492| +| - stem | 2|none | |acc |↑ | 0.3130|± |0.0081| +| - abstract_algebra | 1|none | 0|acc |↑ | 0.2700|± |0.0446| +| - anatomy | 1|none | 0|acc |↑ | 0.4593|± |0.0430| +| - astronomy | 1|none | 0|acc |↑ | 0.3684|± |0.0393| +| - college_biology | 1|none | 0|acc |↑ | 0.3403|± |0.0396| +| - college_chemistry | 1|none | 0|acc |↑ | 0.3100|± |0.0465| +| - college_computer_science | 1|none | 0|acc |↑ | 0.2600|± |0.0441| +| - college_mathematics | 1|none | 0|acc |↑ | 0.2300|± |0.0423| +| - college_physics | 1|none | 0|acc |↑ | 0.2059|± |0.0402| +| - computer_security | 1|none | 0|acc |↑ | 0.4600|± |0.0501| +| - conceptual_physics | 1|none | 0|acc |↑ | 0.3745|± |0.0316| +| - electrical_engineering | 1|none | 0|acc |↑ | 0.4345|± |0.0413| +| - elementary_mathematics | 1|none | 0|acc |↑ | 0.2302|± |0.0217| +| - high_school_biology | 1|none | 0|acc |↑ | 0.4323|± |0.0282| +| - high_school_chemistry | 1|none | 0|acc |↑ | 0.2857|± |0.0318| +| - high_school_computer_science | 1|none | 0|acc |↑ | 0.3100|± |0.0465| +| - high_school_mathematics | 1|none | 0|acc |↑ | 0.2370|± |0.0259| +| - high_school_physics | 1|none | 0|acc |↑ | 0.2252|± |0.0341| +| - high_school_statistics | 1|none | 0|acc |↑ | 0.2083|± |0.0277| +| - machine_learning | 1|none | 0|acc |↑ | 0.3750|± |0.0460| +|nq_open | 4|remove_whitespace| 0|exact_match|↑ | 0.0357|± |0.0031| +|openbookqa | 1|none | 0|acc |↑ | 0.3020|± |0.0206| +| | |none | 0|acc_norm |↑ | 0.3880|± |0.0218| +|piqa | 1|none | 0|acc |↑ | 0.7182|± |0.0105| +| | |none | 0|acc_norm |↑ | 0.7209|± |0.0105| +|qnli | 1|none | 0|acc |↑ | 0.4941|± |0.0068| +|sciq | 1|none | 0|acc |↑ | 0.9040|± |0.0093| +| | |none | 0|acc_norm |↑ | 0.8580|± |0.0110| +|triviaqa | 3|remove_whitespace| 0|exact_match|↑ | 0.1897|± |0.0029| +|truthfulqa_gen | 3|none | 0|bleu_acc |↑ | 0.3060|± |0.0161| +| | |none | 0|bleu_diff |↑ | -7.1778|± |0.7355| +| | |none | 0|bleu_max |↑ | 23.2944|± |0.7624| +| | |none | 0|rouge1_acc |↑ | 0.2644|± |0.0154| +| | |none | 0|rouge1_diff|↑ |-10.0231|± |0.7875| +| | |none | 0|rouge1_max |↑ | 46.4515|± |0.9083| +| | |none | 0|rouge2_acc |↑ | 0.2044|± |0.0141| +| | |none | 0|rouge2_diff|↑ |-11.5180|± |0.9589| +| | |none | 0|rouge2_max |↑ | 30.6640|± |0.9977| +| | |none | 0|rougeL_acc |↑ | 0.2570|± |0.0153| +| | |none | 0|rougeL_diff|↑ |-10.3014|± |0.7848| +| | |none | 0|rougeL_max |↑ | 43.9439|± |0.9131| +|truthfulqa_mc1 | 2|none | 0|acc |↑ | 0.2460|± |0.0151| +|truthfulqa_mc2 | 3|none | 
0|acc |↑ | 0.3875|± |0.0152| +|winogrande | 1|none | 0|acc |↑ | 0.5896|± |0.0138| + +| Groups |Version| Filter |n-shot| Metric | |Value | |Stderr| +|------------------|------:|----------|------|-----------|---|-----:|---|-----:| +|bbh | 3|get-answer| |exact_match|↑ |0.3823|± |0.0053| +|mmlu | 2|none | |acc |↑ |0.3859|± |0.0040| +| - humanities | 2|none | |acc |↑ |0.3626|± |0.0069| +| - other | 2|none | |acc |↑ |0.4335|± |0.0087| +| - social sciences| 2|none | |acc |↑ |0.4482|± |0.0088| +| - stem | 2|none | |acc |↑ |0.3130|± |0.0081| + +google_gemma-3-1b-it: 6h 50m 53s +✅ Benchmark completed for google_gemma-3-1b-it + +🔥 Starting benchmark for google_gemma-3-12b-it +Passed argument batch_size = auto:1. Detecting largest batch size +Determined largest batch size: 2 +Passed argument batch_size = auto. Detecting largest batch size +Determined Largest batch size: 2 +hf (pretrained=/home/jaymin/Documents/llm/llm_models/google_gemma-3-12b-it), gen_kwargs: (None), limit: None, num_fewshot: None, batch_size: auto (2) +| Tasks |Version| Filter |n-shot| Metric | | Value | |Stderr| +|----------------------------------------------------------|------:|-----------------|-----:|-----------|---|------:|---|-----:| +|anli_r1 | 1|none | 0|acc |↑ | 0.6030|± |0.0155| +|anli_r2 | 1|none | 0|acc |↑ | 0.5600|± |0.0157| +|anli_r3 | 1|none | 0|acc |↑ | 0.5958|± |0.0142| +|arc_challenge | 1|none | 0|acc |↑ | 0.6084|± |0.0143| +| | |none | 0|acc_norm |↑ | 0.6109|± |0.0142| +|bbh | 3|get-answer | |exact_match|↑ | 0.8019|± |0.0044| +| - bbh_cot_fewshot_boolean_expressions | 4|get-answer | 3|exact_match|↑ | 0.9160|± |0.0176| +| - bbh_cot_fewshot_causal_judgement | 4|get-answer | 3|exact_match|↑ | 0.5829|± |0.0362| +| - bbh_cot_fewshot_date_understanding | 4|get-answer | 3|exact_match|↑ | 0.8880|± |0.0200| +| - bbh_cot_fewshot_disambiguation_qa | 4|get-answer | 3|exact_match|↑ | 0.7640|± |0.0269| +| - bbh_cot_fewshot_dyck_languages | 4|get-answer | 3|exact_match|↑ | 0.7080|± |0.0288| +| - bbh_cot_fewshot_formal_fallacies | 4|get-answer | 3|exact_match|↑ | 0.6000|± |0.0310| +| - bbh_cot_fewshot_geometric_shapes | 4|get-answer | 3|exact_match|↑ | 0.3440|± |0.0301| +| - bbh_cot_fewshot_hyperbaton | 4|get-answer | 3|exact_match|↑ | 0.9960|± |0.0040| +| - bbh_cot_fewshot_logical_deduction_five_objects | 4|get-answer | 3|exact_match|↑ | 0.7960|± |0.0255| +| - bbh_cot_fewshot_logical_deduction_seven_objects | 4|get-answer | 3|exact_match|↑ | 0.4800|± |0.0317| +| - bbh_cot_fewshot_logical_deduction_three_objects | 4|get-answer | 3|exact_match|↑ | 0.9600|± |0.0124| +| - bbh_cot_fewshot_movie_recommendation | 4|get-answer | 3|exact_match|↑ | 0.8480|± |0.0228| +| - bbh_cot_fewshot_multistep_arithmetic_two | 4|get-answer | 3|exact_match|↑ | 0.9520|± |0.0135| +| - bbh_cot_fewshot_navigate | 4|get-answer | 3|exact_match|↑ | 0.9760|± |0.0097| +| - bbh_cot_fewshot_object_counting | 4|get-answer | 3|exact_match|↑ | 0.9680|± |0.0112| +| - bbh_cot_fewshot_penguins_in_a_table | 4|get-answer | 3|exact_match|↑ | 0.8836|± |0.0266| +| - bbh_cot_fewshot_reasoning_about_colored_objects | 4|get-answer | 3|exact_match|↑ | 0.9160|± |0.0176| +| - bbh_cot_fewshot_ruin_names | 4|get-answer | 3|exact_match|↑ | 0.7920|± |0.0257| +| - bbh_cot_fewshot_salient_translation_error_detection | 4|get-answer | 3|exact_match|↑ | 0.5360|± |0.0316| +| - bbh_cot_fewshot_snarks | 4|get-answer | 3|exact_match|↑ | 0.8371|± |0.0278| +| - bbh_cot_fewshot_sports_understanding | 4|get-answer | 3|exact_match|↑ | 0.9680|± |0.0112| +| - bbh_cot_fewshot_temporal_sequences | 
4|get-answer | 3|exact_match|↑ | 0.6760|± |0.0297| +| - bbh_cot_fewshot_tracking_shuffled_objects_five_objects | 4|get-answer | 3|exact_match|↑ | 0.9720|± |0.0105| +| - bbh_cot_fewshot_tracking_shuffled_objects_seven_objects| 4|get-answer | 3|exact_match|↑ | 0.7320|± |0.0281| +| - bbh_cot_fewshot_tracking_shuffled_objects_three_objects| 4|get-answer | 3|exact_match|↑ | 1.0000|± |0.0000| +| - bbh_cot_fewshot_web_of_lies | 4|get-answer | 3|exact_match|↑ | 1.0000|± |0.0000| +| - bbh_cot_fewshot_word_sorting | 4|get-answer | 3|exact_match|↑ | 0.5480|± |0.0315| +|boolq | 2|none | 0|acc |↑ | 0.8746|± |0.0058| +|drop | 3|none | 0|em |↑ | 0.0214|± |0.0015| +| | |none | 0|f1 |↑ | 0.1396|± |0.0023| +|gpqa_diamond_cot_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.1616|± |0.0262| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_diamond_cot_zeroshot | 1|flexible-extract | 0|exact_match|↑ | 0.0909|± |0.0205| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_diamond_generative_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.2374|± |0.0303| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_diamond_n_shot | 2|none | 0|acc |↑ | 0.3434|± |0.0338| +| | |none | 0|acc_norm |↑ | 0.3434|± |0.0338| +|gpqa_diamond_zeroshot | 1|none | 0|acc |↑ | 0.3384|± |0.0337| +| | |none | 0|acc_norm |↑ | 0.3384|± |0.0337| +|gpqa_extended_cot_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.1575|± |0.0156| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_extended_cot_zeroshot | 1|flexible-extract | 0|exact_match|↑ | 0.1410|± |0.0149| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_extended_generative_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.2436|± |0.0184| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_extended_n_shot | 2|none | 0|acc |↑ | 0.3278|± |0.0201| +| | |none | 0|acc_norm |↑ | 0.3278|± |0.0201| +|gpqa_extended_zeroshot | 1|none | 0|acc |↑ | 0.3077|± |0.0198| +| | |none | 0|acc_norm |↑ | 0.3077|± |0.0198| +|gpqa_main_cot_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.1763|± |0.0180| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_main_cot_zeroshot | 1|flexible-extract | 0|exact_match|↑ | 0.1518|± |0.0170| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_main_generative_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.2277|± |0.0198| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_main_n_shot | 2|none | 0|acc |↑ | 0.3371|± |0.0224| +| | |none | 0|acc_norm |↑ | 0.3371|± |0.0224| +|gpqa_main_zeroshot | 1|none | 0|acc |↑ | 0.3371|± |0.0224| +| | |none | 0|acc_norm |↑ | 0.3371|± |0.0224| +|gsm8k | 3|flexible-extract | 5|exact_match|↑ | 0.8848|± |0.0088| +| | |strict-match | 5|exact_match|↑ | 0.8772|± |0.0090| +|hellaswag | 1|none | 0|acc |↑ | 0.6266|± |0.0048| +| | |none | 0|acc_norm |↑ | 0.8188|± |0.0038| +|mmlu | 2|none | |acc |↑ | 0.7161|± |0.0036| +| - humanities | 2|none | |acc |↑ | 0.6387|± |0.0065| +| - formal_logic | 1|none | 0|acc |↑ | 0.5556|± |0.0444| +| - high_school_european_history | 1|none | 0|acc |↑ | 0.8606|± |0.0270| +| - high_school_us_history | 1|none | 0|acc |↑ | 0.8431|± |0.0255| +| - high_school_world_history | 1|none | 0|acc |↑ | 0.8945|± |0.0200| +| - international_law | 1|none | 0|acc |↑ | 0.8595|± |0.0317| +| - jurisprudence | 1|none | 0|acc |↑ | 0.8056|± |0.0383| +| - logical_fallacies | 1|none | 0|acc |↑ | 0.8344|± |0.0292| +| - moral_disputes | 1|none | 0|acc |↑ | 0.7717|± |0.0226| +| - moral_scenarios | 1|none | 0|acc |↑ | 0.3676|± |0.0161| 
+| - philosophy | 1|none | 0|acc |↑ | 0.7910|± |0.0231| +| - prehistory | 1|none | 0|acc |↑ | 0.8148|± |0.0216| +| - professional_law | 1|none | 0|acc |↑ | 0.5424|± |0.0127| +| - world_religions | 1|none | 0|acc |↑ | 0.8421|± |0.0280| +| - other | 2|none | |acc |↑ | 0.7692|± |0.0073| +| - business_ethics | 1|none | 0|acc |↑ | 0.7700|± |0.0423| +| - clinical_knowledge | 1|none | 0|acc |↑ | 0.7962|± |0.0248| +| - college_medicine | 1|none | 0|acc |↑ | 0.7225|± |0.0341| +| - global_facts | 1|none | 0|acc |↑ | 0.4800|± |0.0502| +| - human_aging | 1|none | 0|acc |↑ | 0.7668|± |0.0284| +| - management | 1|none | 0|acc |↑ | 0.8350|± |0.0368| +| - marketing | 1|none | 0|acc |↑ | 0.9017|± |0.0195| +| - medical_genetics | 1|none | 0|acc |↑ | 0.8300|± |0.0378| +| - miscellaneous | 1|none | 0|acc |↑ | 0.8608|± |0.0124| +| - nutrition | 1|none | 0|acc |↑ | 0.7680|± |0.0242| +| - professional_accounting | 1|none | 0|acc |↑ | 0.5461|± |0.0297| +| - professional_medicine | 1|none | 0|acc |↑ | 0.8088|± |0.0239| +| - virology | 1|none | 0|acc |↑ | 0.5723|± |0.0385| +| - social sciences | 2|none | |acc |↑ | 0.8213|± |0.0068| +| - econometrics | 1|none | 0|acc |↑ | 0.6053|± |0.0460| +| - high_school_geography | 1|none | 0|acc |↑ | 0.8535|± |0.0252| +| - high_school_government_and_politics | 1|none | 0|acc |↑ | 0.9326|± |0.0181| +| - high_school_macroeconomics | 1|none | 0|acc |↑ | 0.7821|± |0.0209| +| - high_school_microeconomics | 1|none | 0|acc |↑ | 0.8487|± |0.0233| +| - high_school_psychology | 1|none | 0|acc |↑ | 0.8954|± |0.0131| +| - human_sexuality | 1|none | 0|acc |↑ | 0.8321|± |0.0328| +| - professional_psychology | 1|none | 0|acc |↑ | 0.7712|± |0.0170| +| - public_relations | 1|none | 0|acc |↑ | 0.7091|± |0.0435| +| - security_studies | 1|none | 0|acc |↑ | 0.7633|± |0.0272| +| - sociology | 1|none | 0|acc |↑ | 0.8806|± |0.0229| +| - us_foreign_policy | 1|none | 0|acc |↑ | 0.9100|± |0.0288| +| - stem | 2|none | |acc |↑ | 0.6768|± |0.0080| +| - abstract_algebra | 1|none | 0|acc |↑ | 0.4300|± |0.0498| +| - anatomy | 1|none | 0|acc |↑ | 0.7037|± |0.0394| +| - astronomy | 1|none | 0|acc |↑ | 0.8487|± |0.0292| +| - college_biology | 1|none | 0|acc |↑ | 0.8819|± |0.0270| +| - college_chemistry | 1|none | 0|acc |↑ | 0.4600|± |0.0501| +| - college_computer_science | 1|none | 0|acc |↑ | 0.5900|± |0.0494| +| - college_mathematics | 1|none | 0|acc |↑ | 0.5100|± |0.0502| +| - college_physics | 1|none | 0|acc |↑ | 0.6275|± |0.0481| +| - computer_security | 1|none | 0|acc |↑ | 0.8000|± |0.0402| +| - conceptual_physics | 1|none | 0|acc |↑ | 0.7745|± |0.0273| +| - electrical_engineering | 1|none | 0|acc |↑ | 0.6690|± |0.0392| +| - elementary_mathematics | 1|none | 0|acc |↑ | 0.6455|± |0.0246| +| - high_school_biology | 1|none | 0|acc |↑ | 0.8677|± |0.0193| +| - high_school_chemistry | 1|none | 0|acc |↑ | 0.6847|± |0.0327| +| - high_school_computer_science | 1|none | 0|acc |↑ | 0.8300|± |0.0378| +| - high_school_mathematics | 1|none | 0|acc |↑ | 0.4926|± |0.0305| +| - high_school_physics | 1|none | 0|acc |↑ | 0.5430|± |0.0407| +| - high_school_statistics | 1|none | 0|acc |↑ | 0.6713|± |0.0320| +| - machine_learning | 1|none | 0|acc |↑ | 0.5893|± |0.0467| +|nq_open | 4|remove_whitespace| 0|exact_match|↑ | 0.1571|± |0.0061| +|openbookqa | 1|none | 0|acc |↑ | 0.4220|± |0.0221| +| | |none | 0|acc_norm |↑ | 0.4980|± |0.0224| +|piqa | 1|none | 0|acc |↑ | 0.8014|± |0.0093| +| | |none | 0|acc_norm |↑ | 0.7807|± |0.0097| +|qnli | 1|none | 0|acc |↑ | 0.7457|± |0.0059| +|sciq | 1|none | 0|acc |↑ | 0.9720|± |0.0052| +| | |none 
| 0|acc_norm |↑ | 0.9540|± |0.0066| +|triviaqa | 3|remove_whitespace| 0|exact_match|↑ | 0.2752|± |0.0033| +|truthfulqa_gen | 3|none | 0|bleu_acc |↑ | 0.4786|± |0.0175| +| | |none | 0|bleu_diff |↑ |-0.4518|± |0.3853| +| | |none | 0|bleu_max |↑ |12.5016|± |0.5371| +| | |none | 0|rouge1_acc |↑ | 0.5141|± |0.0175| +| | |none | 0|rouge1_diff|↑ |-0.2991|± |0.5781| +| | |none | 0|rouge1_max |↑ |35.1025|± |0.7280| +| | |none | 0|rouge2_acc |↑ | 0.4125|± |0.0172| +| | |none | 0|rouge2_diff|↑ |-1.6691|± |0.6548| +| | |none | 0|rouge2_max |↑ |20.2480|± |0.7443| +| | |none | 0|rougeL_acc |↑ | 0.4957|± |0.0175| +| | |none | 0|rougeL_diff|↑ |-0.8130|± |0.5698| +| | |none | 0|rougeL_max |↑ |31.3688|± |0.7201| +|truthfulqa_mc1 | 2|none | 0|acc |↑ | 0.4051|± |0.0172| +|truthfulqa_mc2 | 3|none | 0|acc |↑ | 0.5812|± |0.0160| +|winogrande | 1|none | 0|acc |↑ | 0.7443|± |0.0123| + +| Groups |Version| Filter |n-shot| Metric | |Value | |Stderr| +|------------------|------:|----------|------|-----------|---|-----:|---|-----:| +|bbh | 3|get-answer| |exact_match|↑ |0.8019|± |0.0044| +|mmlu | 2|none | |acc |↑ |0.7161|± |0.0036| +| - humanities | 2|none | |acc |↑ |0.6387|± |0.0065| +| - other | 2|none | |acc |↑ |0.7692|± |0.0073| +| - social sciences| 2|none | |acc |↑ |0.8213|± |0.0068| +| - stem | 2|none | |acc |↑ |0.6768|± |0.0080| + +google_gemma-3-12b-it: 15h 46m 6s +✅ Benchmark completed for google_gemma-3-12b-it + +🔥 Starting benchmark for meta-llama_Llama-2-13b-chat-hf +Passed argument batch_size = auto:1. Detecting largest batch size +Determined largest batch size: 1 +Passed argument batch_size = auto. Detecting largest batch size +Determined Largest batch size: 1 +hf (pretrained=/home/jaymin/Documents/llm/llm_models/meta-llama_Llama-2-13b-chat-hf), gen_kwargs: (None), limit: None, num_fewshot: None, batch_size: auto (1) +| Tasks |Version| Filter |n-shot| Metric | | Value | |Stderr| +|----------------------------------------------------------|------:|-----------------|-----:|-----------|---|------:|---|-----:| +|anli_r1 | 1|none | 0|acc |↑ | 0.4300|± |0.0157| +|anli_r2 | 1|none | 0|acc |↑ | 0.4300|± |0.0157| +|anli_r3 | 1|none | 0|acc |↑ | 0.4142|± |0.0142| +|arc_challenge | 1|none | 0|acc |↑ | 0.4616|± |0.0146| +| | |none | 0|acc_norm |↑ | 0.5017|± |0.0146| +|bbh | 3|get-answer | |exact_match|↑ | 0.4780|± |0.0055| +| - bbh_cot_fewshot_boolean_expressions | 4|get-answer | 3|exact_match|↑ | 0.7200|± |0.0285| +| - bbh_cot_fewshot_causal_judgement | 4|get-answer | 3|exact_match|↑ | 0.5829|± |0.0362| +| - bbh_cot_fewshot_date_understanding | 4|get-answer | 3|exact_match|↑ | 0.6600|± |0.0300| +| - bbh_cot_fewshot_disambiguation_qa | 4|get-answer | 3|exact_match|↑ | 0.4560|± |0.0316| +| - bbh_cot_fewshot_dyck_languages | 4|get-answer | 3|exact_match|↑ | 0.0640|± |0.0155| +| - bbh_cot_fewshot_formal_fallacies | 4|get-answer | 3|exact_match|↑ | 0.5240|± |0.0316| +| - bbh_cot_fewshot_geometric_shapes | 4|get-answer | 3|exact_match|↑ | 0.4200|± |0.0313| +| - bbh_cot_fewshot_hyperbaton | 4|get-answer | 3|exact_match|↑ | 0.5840|± |0.0312| +| - bbh_cot_fewshot_logical_deduction_five_objects | 4|get-answer | 3|exact_match|↑ | 0.3960|± |0.0310| +| - bbh_cot_fewshot_logical_deduction_seven_objects | 4|get-answer | 3|exact_match|↑ | 0.3480|± |0.0302| +| - bbh_cot_fewshot_logical_deduction_three_objects | 4|get-answer | 3|exact_match|↑ | 0.7320|± |0.0281| +| - bbh_cot_fewshot_movie_recommendation | 4|get-answer | 3|exact_match|↑ | 0.6640|± |0.0299| +| - bbh_cot_fewshot_multistep_arithmetic_two | 4|get-answer | 3|exact_match|↑ 
| 0.0320|± |0.0112| +| - bbh_cot_fewshot_navigate | 4|get-answer | 3|exact_match|↑ | 0.6440|± |0.0303| +| - bbh_cot_fewshot_object_counting | 4|get-answer | 3|exact_match|↑ | 0.5640|± |0.0314| +| - bbh_cot_fewshot_penguins_in_a_table | 4|get-answer | 3|exact_match|↑ | 0.4452|± |0.0413| +| - bbh_cot_fewshot_reasoning_about_colored_objects | 4|get-answer | 3|exact_match|↑ | 0.5440|± |0.0316| +| - bbh_cot_fewshot_ruin_names | 4|get-answer | 3|exact_match|↑ | 0.4120|± |0.0312| +| - bbh_cot_fewshot_salient_translation_error_detection | 4|get-answer | 3|exact_match|↑ | 0.4400|± |0.0315| +| - bbh_cot_fewshot_snarks | 4|get-answer | 3|exact_match|↑ | 0.7191|± |0.0338| +| - bbh_cot_fewshot_sports_understanding | 4|get-answer | 3|exact_match|↑ | 0.9280|± |0.0164| +| - bbh_cot_fewshot_temporal_sequences | 4|get-answer | 3|exact_match|↑ | 0.2000|± |0.0253| +| - bbh_cot_fewshot_tracking_shuffled_objects_five_objects | 4|get-answer | 3|exact_match|↑ | 0.2160|± |0.0261| +| - bbh_cot_fewshot_tracking_shuffled_objects_seven_objects| 4|get-answer | 3|exact_match|↑ | 0.1840|± |0.0246| +| - bbh_cot_fewshot_tracking_shuffled_objects_three_objects| 4|get-answer | 3|exact_match|↑ | 0.3520|± |0.0303| +| - bbh_cot_fewshot_web_of_lies | 4|get-answer | 3|exact_match|↑ | 0.9480|± |0.0141| +| - bbh_cot_fewshot_word_sorting | 4|get-answer | 3|exact_match|↑ | 0.2080|± |0.0257| +|boolq | 2|none | 0|acc |↑ | 0.8165|± |0.0068| +|drop | 3|none | 0|em |↑ | 0.0073|± |0.0009| +| | |none | 0|f1 |↑ | 0.0915|± |0.0020| +|gpqa_diamond_cot_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.1212|± |0.0233| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_diamond_cot_zeroshot | 1|flexible-extract | 0|exact_match|↑ | 0.1414|± |0.0248| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_diamond_generative_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.2071|± |0.0289| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_diamond_n_shot | 2|none | 0|acc |↑ | 0.2424|± |0.0305| +| | |none | 0|acc_norm |↑ | 0.2424|± |0.0305| +|gpqa_diamond_zeroshot | 1|none | 0|acc |↑ | 0.2222|± |0.0296| +| | |none | 0|acc_norm |↑ | 0.2222|± |0.0296| +|gpqa_extended_cot_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.1630|± |0.0158| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_extended_cot_zeroshot | 1|flexible-extract | 0|exact_match|↑ | 0.2033|± |0.0172| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_extended_generative_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.1960|± |0.0170| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_extended_n_shot | 2|none | 0|acc |↑ | 0.2875|± |0.0194| +| | |none | 0|acc_norm |↑ | 0.2875|± |0.0194| +|gpqa_extended_zeroshot | 1|none | 0|acc |↑ | 0.2766|± |0.0192| +| | |none | 0|acc_norm |↑ | 0.2766|± |0.0192| +|gpqa_main_cot_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.1741|± |0.0179| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_main_cot_zeroshot | 1|flexible-extract | 0|exact_match|↑ | 0.1786|± |0.0181| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_main_generative_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.1607|± |0.0174| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_main_n_shot | 2|none | 0|acc |↑ | 0.3192|± |0.0220| +| | |none | 0|acc_norm |↑ | 0.3192|± |0.0220| +|gpqa_main_zeroshot | 1|none | 0|acc |↑ | 0.2991|± |0.0217| +| | |none | 0|acc_norm |↑ | 0.2991|± |0.0217| +|gsm8k | 3|flexible-extract | 5|exact_match|↑ | 0.3556|± |0.0132| +| | |strict-match | 
5|exact_match|↑ | 0.3472|± |0.0131| +|hellaswag | 1|none | 0|acc |↑ | 0.6071|± |0.0049| +| | |none | 0|acc_norm |↑ | 0.7967|± |0.0040| +|mmlu | 2|none | |acc |↑ | 0.5313|± |0.0040| +| - humanities | 2|none | |acc |↑ | 0.4978|± |0.0068| +| - formal_logic | 1|none | 0|acc |↑ | 0.2381|± |0.0381| +| - high_school_european_history | 1|none | 0|acc |↑ | 0.6667|± |0.0368| +| - high_school_us_history | 1|none | 0|acc |↑ | 0.7402|± |0.0308| +| - high_school_world_history | 1|none | 0|acc |↑ | 0.7257|± |0.0290| +| - international_law | 1|none | 0|acc |↑ | 0.7190|± |0.0410| +| - jurisprudence | 1|none | 0|acc |↑ | 0.6944|± |0.0445| +| - logical_fallacies | 1|none | 0|acc |↑ | 0.6871|± |0.0364| +| - moral_disputes | 1|none | 0|acc |↑ | 0.6012|± |0.0264| +| - moral_scenarios | 1|none | 0|acc |↑ | 0.2760|± |0.0150| +| - philosophy | 1|none | 0|acc |↑ | 0.6463|± |0.0272| +| - prehistory | 1|none | 0|acc |↑ | 0.6265|± |0.0269| +| - professional_law | 1|none | 0|acc |↑ | 0.4003|± |0.0125| +| - world_religions | 1|none | 0|acc |↑ | 0.7719|± |0.0322| +| - other | 2|none | |acc |↑ | 0.6061|± |0.0084| +| - business_ethics | 1|none | 0|acc |↑ | 0.5400|± |0.0501| +| - clinical_knowledge | 1|none | 0|acc |↑ | 0.5887|± |0.0303| +| - college_medicine | 1|none | 0|acc |↑ | 0.4220|± |0.0377| +| - global_facts | 1|none | 0|acc |↑ | 0.3100|± |0.0465| +| - human_aging | 1|none | 0|acc |↑ | 0.6233|± |0.0325| +| - management | 1|none | 0|acc |↑ | 0.6893|± |0.0458| +| - marketing | 1|none | 0|acc |↑ | 0.7991|± |0.0262| +| - medical_genetics | 1|none | 0|acc |↑ | 0.5800|± |0.0496| +| - miscellaneous | 1|none | 0|acc |↑ | 0.7663|± |0.0151| +| - nutrition | 1|none | 0|acc |↑ | 0.6111|± |0.0279| +| - professional_accounting | 1|none | 0|acc |↑ | 0.4078|± |0.0293| +| - professional_medicine | 1|none | 0|acc |↑ | 0.4963|± |0.0304| +| - virology | 1|none | 0|acc |↑ | 0.4639|± |0.0388| +| - social sciences | 2|none | |acc |↑ | 0.6136|± |0.0085| +| - econometrics | 1|none | 0|acc |↑ | 0.2456|± |0.0405| +| - high_school_geography | 1|none | 0|acc |↑ | 0.6515|± |0.0339| +| - high_school_government_and_politics | 1|none | 0|acc |↑ | 0.7565|± |0.0310| +| - high_school_macroeconomics | 1|none | 0|acc |↑ | 0.5026|± |0.0254| +| - high_school_microeconomics | 1|none | 0|acc |↑ | 0.5126|± |0.0325| +| - high_school_psychology | 1|none | 0|acc |↑ | 0.7156|± |0.0193| +| - human_sexuality | 1|none | 0|acc |↑ | 0.6412|± |0.0421| +| - professional_psychology | 1|none | 0|acc |↑ | 0.5408|± |0.0202| +| - public_relations | 1|none | 0|acc |↑ | 0.6273|± |0.0463| +| - security_studies | 1|none | 0|acc |↑ | 0.6571|± |0.0304| +| - sociology | 1|none | 0|acc |↑ | 0.7512|± |0.0306| +| - us_foreign_policy | 1|none | 0|acc |↑ | 0.8100|± |0.0394| +| - stem | 2|none | |acc |↑ | 0.4272|± |0.0085| +| - abstract_algebra | 1|none | 0|acc |↑ | 0.3000|± |0.0461| +| - anatomy | 1|none | 0|acc |↑ | 0.5185|± |0.0432| +| - astronomy | 1|none | 0|acc |↑ | 0.5789|± |0.0402| +| - college_biology | 1|none | 0|acc |↑ | 0.5833|± |0.0412| +| - college_chemistry | 1|none | 0|acc |↑ | 0.3400|± |0.0476| +| - college_computer_science | 1|none | 0|acc |↑ | 0.4400|± |0.0499| +| - college_mathematics | 1|none | 0|acc |↑ | 0.2900|± |0.0456| +| - college_physics | 1|none | 0|acc |↑ | 0.2745|± |0.0444| +| - computer_security | 1|none | 0|acc |↑ | 0.6600|± |0.0476| +| - conceptual_physics | 1|none | 0|acc |↑ | 0.4128|± |0.0322| +| - electrical_engineering | 1|none | 0|acc |↑ | 0.5448|± |0.0415| +| - elementary_mathematics | 1|none | 0|acc |↑ | 0.3201|± |0.0240| +| - high_school_biology 
| 1|none | 0|acc |↑ | 0.6419|± |0.0273| +| - high_school_chemistry | 1|none | 0|acc |↑ | 0.4286|± |0.0348| +| - high_school_computer_science | 1|none | 0|acc |↑ | 0.5500|± |0.0500| +| - high_school_mathematics | 1|none | 0|acc |↑ | 0.2741|± |0.0272| +| - high_school_physics | 1|none | 0|acc |↑ | 0.3377|± |0.0386| +| - high_school_statistics | 1|none | 0|acc |↑ | 0.3426|± |0.0324| +| - machine_learning | 1|none | 0|acc |↑ | 0.3304|± |0.0446| +|nq_open | 4|remove_whitespace| 0|exact_match|↑ | 0.1030|± |0.0051| +|openbookqa | 1|none | 0|acc |↑ | 0.3520|± |0.0214| +| | |none | 0|acc_norm |↑ | 0.4400|± |0.0222| +|piqa | 1|none | 0|acc |↑ | 0.7780|± |0.0097| +| | |none | 0|acc_norm |↑ | 0.7933|± |0.0094| +|qnli | 1|none | 0|acc |↑ | 0.5438|± |0.0067| +|sciq | 1|none | 0|acc |↑ | 0.9510|± |0.0068| +| | |none | 0|acc_norm |↑ | 0.9050|± |0.0093| +|triviaqa | 3|remove_whitespace| 0|exact_match|↑ | 0.2725|± |0.0033| +|truthfulqa_gen | 3|none | 0|bleu_acc |↑ | 0.4088|± |0.0172| +| | |none | 0|bleu_diff |↑ |-2.0148|± |0.7195| +| | |none | 0|bleu_max |↑ |26.0719|± |0.7783| +| | |none | 0|rouge1_acc |↑ | 0.4235|± |0.0173| +| | |none | 0|rouge1_diff|↑ |-3.1237|± |0.8531| +| | |none | 0|rouge1_max |↑ |51.9853|± |0.8214| +| | |none | 0|rouge2_acc |↑ | 0.3501|± |0.0167| +| | |none | 0|rouge2_diff|↑ |-4.0918|± |0.9904| +| | |none | 0|rouge2_max |↑ |36.4465|± |0.9660| +| | |none | 0|rougeL_acc |↑ | 0.4186|± |0.0173| +| | |none | 0|rougeL_diff|↑ |-3.1432|± |0.8645| +| | |none | 0|rougeL_max |↑ |49.1291|± |0.8443| +|truthfulqa_mc1 | 2|none | 0|acc |↑ | 0.2803|± |0.0157| +|truthfulqa_mc2 | 3|none | 0|acc |↑ | 0.4396|± |0.0157| +|winogrande | 1|none | 0|acc |↑ | 0.7119|± |0.0127| + +| Groups |Version| Filter |n-shot| Metric | |Value | |Stderr| +|------------------|------:|----------|------|-----------|---|-----:|---|-----:| +|bbh | 3|get-answer| |exact_match|↑ |0.4780|± |0.0055| +|mmlu | 2|none | |acc |↑ |0.5313|± |0.0040| +| - humanities | 2|none | |acc |↑ |0.4978|± |0.0068| +| - other | 2|none | |acc |↑ |0.6061|± |0.0084| +| - social sciences| 2|none | |acc |↑ |0.6136|± |0.0085| +| - stem | 2|none | |acc |↑ |0.4272|± |0.0085| + +meta-llama_Llama-2-13b-chat-hf: 17h 9m 0s +✅ Benchmark completed for meta-llama_Llama-2-13b-chat-hf + +🔥 Starting benchmark for meta-llama_Llama-2-13b-hf +Passed argument batch_size = auto:1. Detecting largest batch size +Determined largest batch size: 1 +Passed argument batch_size = auto. 
Detecting largest batch size +Determined Largest batch size: 1 +hf (pretrained=/home/jaymin/Documents/llm/llm_models/meta-llama_Llama-2-13b-hf), gen_kwargs: (None), limit: None, num_fewshot: None, batch_size: auto (1) +| Tasks |Version| Filter |n-shot| Metric | | Value | |Stderr| +|----------------------------------------------------------|------:|-----------------|-----:|-----------|---|-------:|---|-----:| +|anli_r1 | 1|none | 0|acc |↑ | 0.3770|± |0.0153| +|anli_r2 | 1|none | 0|acc |↑ | 0.3900|± |0.0154| +|anli_r3 | 1|none | 0|acc |↑ | 0.3850|± |0.0141| +|arc_challenge | 1|none | 0|acc |↑ | 0.4829|± |0.0146| +| | |none | 0|acc_norm |↑ | 0.4898|± |0.0146| +|bbh | 3|get-answer | |exact_match|↑ | 0.4777|± |0.0054| +| - bbh_cot_fewshot_boolean_expressions | 4|get-answer | 3|exact_match|↑ | 0.7640|± |0.0269| +| - bbh_cot_fewshot_causal_judgement | 4|get-answer | 3|exact_match|↑ | 0.5401|± |0.0365| +| - bbh_cot_fewshot_date_understanding | 4|get-answer | 3|exact_match|↑ | 0.7240|± |0.0283| +| - bbh_cot_fewshot_disambiguation_qa | 4|get-answer | 3|exact_match|↑ | 0.4040|± |0.0311| +| - bbh_cot_fewshot_dyck_languages | 4|get-answer | 3|exact_match|↑ | 0.0440|± |0.0130| +| - bbh_cot_fewshot_formal_fallacies | 4|get-answer | 3|exact_match|↑ | 0.5120|± |0.0317| +| - bbh_cot_fewshot_geometric_shapes | 4|get-answer | 3|exact_match|↑ | 0.4040|± |0.0311| +| - bbh_cot_fewshot_hyperbaton | 4|get-answer | 3|exact_match|↑ | 0.5520|± |0.0315| +| - bbh_cot_fewshot_logical_deduction_five_objects | 4|get-answer | 3|exact_match|↑ | 0.4640|± |0.0316| +| - bbh_cot_fewshot_logical_deduction_seven_objects | 4|get-answer | 3|exact_match|↑ | 0.4160|± |0.0312| +| - bbh_cot_fewshot_logical_deduction_three_objects | 4|get-answer | 3|exact_match|↑ | 0.6520|± |0.0302| +| - bbh_cot_fewshot_movie_recommendation | 4|get-answer | 3|exact_match|↑ | 0.7280|± |0.0282| +| - bbh_cot_fewshot_multistep_arithmetic_two | 4|get-answer | 3|exact_match|↑ | 0.0240|± |0.0097| +| - bbh_cot_fewshot_navigate | 4|get-answer | 3|exact_match|↑ | 0.7400|± |0.0278| +| - bbh_cot_fewshot_object_counting | 4|get-answer | 3|exact_match|↑ | 0.5360|± |0.0316| +| - bbh_cot_fewshot_penguins_in_a_table | 4|get-answer | 3|exact_match|↑ | 0.4932|± |0.0415| +| - bbh_cot_fewshot_reasoning_about_colored_objects | 4|get-answer | 3|exact_match|↑ | 0.6040|± |0.0310| +| - bbh_cot_fewshot_ruin_names | 4|get-answer | 3|exact_match|↑ | 0.4080|± |0.0311| +| - bbh_cot_fewshot_salient_translation_error_detection | 4|get-answer | 3|exact_match|↑ | 0.2760|± |0.0283| +| - bbh_cot_fewshot_snarks | 4|get-answer | 3|exact_match|↑ | 0.5506|± |0.0374| +| - bbh_cot_fewshot_sports_understanding | 4|get-answer | 3|exact_match|↑ | 0.9600|± |0.0124| +| - bbh_cot_fewshot_temporal_sequences | 4|get-answer | 3|exact_match|↑ | 0.2600|± |0.0278| +| - bbh_cot_fewshot_tracking_shuffled_objects_five_objects | 4|get-answer | 3|exact_match|↑ | 0.2160|± |0.0261| +| - bbh_cot_fewshot_tracking_shuffled_objects_seven_objects| 4|get-answer | 3|exact_match|↑ | 0.1680|± |0.0237| +| - bbh_cot_fewshot_tracking_shuffled_objects_three_objects| 4|get-answer | 3|exact_match|↑ | 0.3560|± |0.0303| +| - bbh_cot_fewshot_web_of_lies | 4|get-answer | 3|exact_match|↑ | 0.9480|± |0.0141| +| - bbh_cot_fewshot_word_sorting | 4|get-answer | 3|exact_match|↑ | 0.1960|± |0.0252| +|boolq | 2|none | 0|acc |↑ | 0.8064|± |0.0069| +|drop | 3|none | 0|em |↑ | 0.0033|± |0.0006| +| | |none | 0|f1 |↑ | 0.0301|± |0.0011| +|gpqa_diamond_cot_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.1111|± |0.0224| +| | |strict-match | 
0|exact_match|↑ | 0.0152|± |0.0087| +|gpqa_diamond_cot_zeroshot | 1|flexible-extract | 0|exact_match|↑ | 0.1061|± |0.0219| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_diamond_generative_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.2626|± |0.0314| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_diamond_n_shot | 2|none | 0|acc |↑ | 0.2727|± |0.0317| +| | |none | 0|acc_norm |↑ | 0.2727|± |0.0317| +|gpqa_diamond_zeroshot | 1|none | 0|acc |↑ | 0.2525|± |0.0310| +| | |none | 0|acc_norm |↑ | 0.2525|± |0.0310| +|gpqa_extended_cot_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.1282|± |0.0143| +| | |strict-match | 0|exact_match|↑ | 0.0183|± |0.0057| +|gpqa_extended_cot_zeroshot | 1|flexible-extract | 0|exact_match|↑ | 0.1410|± |0.0149| +| | |strict-match | 0|exact_match|↑ | 0.0055|± |0.0032| +|gpqa_extended_generative_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.2985|± |0.0196| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_extended_n_shot | 2|none | 0|acc |↑ | 0.2912|± |0.0195| +| | |none | 0|acc_norm |↑ | 0.2912|± |0.0195| +|gpqa_extended_zeroshot | 1|none | 0|acc |↑ | 0.2784|± |0.0192| +| | |none | 0|acc_norm |↑ | 0.2784|± |0.0192| +|gpqa_main_cot_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.1362|± |0.0162| +| | |strict-match | 0|exact_match|↑ | 0.0134|± |0.0054| +|gpqa_main_cot_zeroshot | 1|flexible-extract | 0|exact_match|↑ | 0.1094|± |0.0148| +| | |strict-match | 0|exact_match|↑ | 0.0045|± |0.0032| +|gpqa_main_generative_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.2879|± |0.0214| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_main_n_shot | 2|none | 0|acc |↑ | 0.2879|± |0.0214| +| | |none | 0|acc_norm |↑ | 0.2879|± |0.0214| +|gpqa_main_zeroshot | 1|none | 0|acc |↑ | 0.2545|± |0.0206| +| | |none | 0|acc_norm |↑ | 0.2545|± |0.0206| +|gsm8k | 3|flexible-extract | 5|exact_match|↑ | 0.2328|± |0.0116| +| | |strict-match | 5|exact_match|↑ | 0.2297|± |0.0116| +|hellaswag | 1|none | 0|acc |↑ | 0.6005|± |0.0049| +| | |none | 0|acc_norm |↑ | 0.7939|± |0.0040| +|mmlu | 2|none | |acc |↑ | 0.5209|± |0.0040| +| - humanities | 2|none | |acc |↑ | 0.4795|± |0.0069| +| - formal_logic | 1|none | 0|acc |↑ | 0.2857|± |0.0404| +| - high_school_european_history | 1|none | 0|acc |↑ | 0.6182|± |0.0379| +| - high_school_us_history | 1|none | 0|acc |↑ | 0.6716|± |0.0330| +| - high_school_world_history | 1|none | 0|acc |↑ | 0.7089|± |0.0296| +| - international_law | 1|none | 0|acc |↑ | 0.7190|± |0.0410| +| - jurisprudence | 1|none | 0|acc |↑ | 0.6481|± |0.0462| +| - logical_fallacies | 1|none | 0|acc |↑ | 0.6319|± |0.0379| +| - moral_disputes | 1|none | 0|acc |↑ | 0.5318|± |0.0269| +| - moral_scenarios | 1|none | 0|acc |↑ | 0.2469|± |0.0144| +| - philosophy | 1|none | 0|acc |↑ | 0.6431|± |0.0272| +| - prehistory | 1|none | 0|acc |↑ | 0.6080|± |0.0272| +| - professional_law | 1|none | 0|acc |↑ | 0.4048|± |0.0125| +| - world_religions | 1|none | 0|acc |↑ | 0.7602|± |0.0327| +| - other | 2|none | |acc |↑ | 0.5935|± |0.0085| +| - business_ethics | 1|none | 0|acc |↑ | 0.5100|± |0.0502| +| - clinical_knowledge | 1|none | 0|acc |↑ | 0.5887|± |0.0303| +| - college_medicine | 1|none | 0|acc |↑ | 0.4971|± |0.0381| +| - global_facts | 1|none | 0|acc |↑ | 0.3200|± |0.0469| +| - human_aging | 1|none | 0|acc |↑ | 0.5650|± |0.0333| +| - management | 1|none | 0|acc |↑ | 0.7379|± |0.0435| +| - marketing | 1|none | 0|acc |↑ | 0.7564|± |0.0281| +| - medical_genetics | 1|none | 0|acc |↑ | 0.5500|± |0.0500| +| - miscellaneous | 1|none | 0|acc |↑ | 
0.7229|± |0.0160| +| - nutrition | 1|none | 0|acc |↑ | 0.6209|± |0.0278| +| - professional_accounting | 1|none | 0|acc |↑ | 0.4043|± |0.0293| +| - professional_medicine | 1|none | 0|acc |↑ | 0.5257|± |0.0303| +| - virology | 1|none | 0|acc |↑ | 0.4337|± |0.0386| +| - social sciences | 2|none | |acc |↑ | 0.6113|± |0.0085| +| - econometrics | 1|none | 0|acc |↑ | 0.2281|± |0.0395| +| - high_school_geography | 1|none | 0|acc |↑ | 0.6818|± |0.0332| +| - high_school_government_and_politics | 1|none | 0|acc |↑ | 0.7565|± |0.0310| +| - high_school_macroeconomics | 1|none | 0|acc |↑ | 0.4949|± |0.0253| +| - high_school_microeconomics | 1|none | 0|acc |↑ | 0.5378|± |0.0324| +| - high_school_psychology | 1|none | 0|acc |↑ | 0.7083|± |0.0195| +| - human_sexuality | 1|none | 0|acc |↑ | 0.6641|± |0.0414| +| - professional_psychology | 1|none | 0|acc |↑ | 0.5278|± |0.0202| +| - public_relations | 1|none | 0|acc |↑ | 0.6091|± |0.0467| +| - security_studies | 1|none | 0|acc |↑ | 0.6449|± |0.0306| +| - sociology | 1|none | 0|acc |↑ | 0.7512|± |0.0306| +| - us_foreign_policy | 1|none | 0|acc |↑ | 0.8100|± |0.0394| +| - stem | 2|none | |acc |↑ | 0.4231|± |0.0085| +| - abstract_algebra | 1|none | 0|acc |↑ | 0.2700|± |0.0446| +| - anatomy | 1|none | 0|acc |↑ | 0.4815|± |0.0432| +| - astronomy | 1|none | 0|acc |↑ | 0.5724|± |0.0403| +| - college_biology | 1|none | 0|acc |↑ | 0.5208|± |0.0418| +| - college_chemistry | 1|none | 0|acc |↑ | 0.4400|± |0.0499| +| - college_computer_science | 1|none | 0|acc |↑ | 0.3600|± |0.0482| +| - college_mathematics | 1|none | 0|acc |↑ | 0.3200|± |0.0469| +| - college_physics | 1|none | 0|acc |↑ | 0.2451|± |0.0428| +| - computer_security | 1|none | 0|acc |↑ | 0.6500|± |0.0479| +| - conceptual_physics | 1|none | 0|acc |↑ | 0.4043|± |0.0321| +| - electrical_engineering | 1|none | 0|acc |↑ | 0.5172|± |0.0416| +| - elementary_mathematics | 1|none | 0|acc |↑ | 0.3122|± |0.0239| +| - high_school_biology | 1|none | 0|acc |↑ | 0.6516|± |0.0271| +| - high_school_chemistry | 1|none | 0|acc |↑ | 0.4680|± |0.0351| +| - high_school_computer_science | 1|none | 0|acc |↑ | 0.5300|± |0.0502| +| - high_school_mathematics | 1|none | 0|acc |↑ | 0.2556|± |0.0266| +| - high_school_physics | 1|none | 0|acc |↑ | 0.3179|± |0.0380| +| - high_school_statistics | 1|none | 0|acc |↑ | 0.4352|± |0.0338| +| - machine_learning | 1|none | 0|acc |↑ | 0.2589|± |0.0416| +|nq_open | 4|remove_whitespace| 0|exact_match|↑ | 0.2363|± |0.0071| +|openbookqa | 1|none | 0|acc |↑ | 0.3520|± |0.0214| +| | |none | 0|acc_norm |↑ | 0.4520|± |0.0223| +|piqa | 1|none | 0|acc |↑ | 0.7900|± |0.0095| +| | |none | 0|acc_norm |↑ | 0.8052|± |0.0092| +|qnli | 1|none | 0|acc |↑ | 0.4953|± |0.0068| +|sciq | 1|none | 0|acc |↑ | 0.9460|± |0.0072| +| | |none | 0|acc_norm |↑ | 0.9350|± |0.0078| +|triviaqa | 3|remove_whitespace| 0|exact_match|↑ | 0.6088|± |0.0036| +|truthfulqa_gen | 3|none | 0|bleu_acc |↑ | 0.3011|± |0.0161| +| | |none | 0|bleu_diff |↑ |-10.3037|± |0.8896| +| | |none | 0|bleu_max |↑ | 29.5100|± |0.8236| +| | |none | 0|rouge1_acc |↑ | 0.3072|± |0.0162| +| | |none | 0|rouge1_diff|↑ |-12.4090|± |0.8679| +| | |none | 0|rouge1_max |↑ | 55.4793|± |0.8343| +| | |none | 0|rouge2_acc |↑ | 0.2791|± |0.0157| +| | |none | 0|rouge2_diff|↑ |-14.9613|± |1.1075| +| | |none | 0|rouge2_max |↑ | 39.8908|± |1.0021| +| | |none | 0|rougeL_acc |↑ | 0.2950|± |0.0160| +| | |none | 0|rougeL_diff|↑ |-12.8909|± |0.8812| +| | |none | 0|rougeL_max |↑ | 52.5536|± |0.8487| +|truthfulqa_mc1 | 2|none | 0|acc |↑ | 0.2595|± |0.0153| +|truthfulqa_mc2 | 3|none | 
0|acc |↑ | 0.3690|± |0.0136| +|winogrande | 1|none | 0|acc |↑ | 0.7222|± |0.0126| + +| Groups |Version| Filter |n-shot| Metric | |Value | |Stderr| +|------------------|------:|----------|------|-----------|---|-----:|---|-----:| +|bbh | 3|get-answer| |exact_match|↑ |0.4777|± |0.0054| +|mmlu | 2|none | |acc |↑ |0.5209|± |0.0040| +| - humanities | 2|none | |acc |↑ |0.4795|± |0.0069| +| - other | 2|none | |acc |↑ |0.5935|± |0.0085| +| - social sciences| 2|none | |acc |↑ |0.6113|± |0.0085| +| - stem | 2|none | |acc |↑ |0.4231|± |0.0085| + +meta-llama_Llama-2-13b-hf: 19h 21m 36s +✅ Benchmark completed for meta-llama_Llama-2-13b-hf + +🔥 Starting benchmark for Qwen_Qwen2-7B-Instruct +Passed argument batch_size = auto:1. Detecting largest batch size +Determined largest batch size: 1 +Passed argument batch_size = auto. Detecting largest batch size +Determined Largest batch size: 1 +hf (pretrained=/home/jaymin/Documents/llm/llm_models/Qwen_Qwen2-7B-Instruct), gen_kwargs: (None), limit: None, num_fewshot: None, batch_size: auto (1) +| Tasks |Version| Filter |n-shot| Metric | | Value | |Stderr| +|----------------------------------------------------------|------:|-----------------|-----:|-----------|---|------:|---|-----:| +|anli_r1 | 1|none | 0|acc |↑ | 0.5730|± |0.0156| +|anli_r2 | 1|none | 0|acc |↑ | 0.5250|± |0.0158| +|anli_r3 | 1|none | 0|acc |↑ | 0.5225|± |0.0144| +|arc_challenge | 1|none | 0|acc |↑ | 0.5085|± |0.0146| +| | |none | 0|acc_norm |↑ | 0.5401|± |0.0146| +|bbh | 3|get-answer | |exact_match|↑ | 0.5775|± |0.0054| +| - bbh_cot_fewshot_boolean_expressions | 4|get-answer | 3|exact_match|↑ | 0.9360|± |0.0155| +| - bbh_cot_fewshot_causal_judgement | 4|get-answer | 3|exact_match|↑ | 0.4492|± |0.0365| +| - bbh_cot_fewshot_date_understanding | 4|get-answer | 3|exact_match|↑ | 0.5480|± |0.0315| +| - bbh_cot_fewshot_disambiguation_qa | 4|get-answer | 3|exact_match|↑ | 0.6400|± |0.0304| +| - bbh_cot_fewshot_dyck_languages | 4|get-answer | 3|exact_match|↑ | 0.0480|± |0.0135| +| - bbh_cot_fewshot_formal_fallacies | 4|get-answer | 3|exact_match|↑ | 0.5440|± |0.0316| +| - bbh_cot_fewshot_geometric_shapes | 4|get-answer | 3|exact_match|↑ | 0.3320|± |0.0298| +| - bbh_cot_fewshot_hyperbaton | 4|get-answer | 3|exact_match|↑ | 0.9000|± |0.0190| +| - bbh_cot_fewshot_logical_deduction_five_objects | 4|get-answer | 3|exact_match|↑ | 0.4160|± |0.0312| +| - bbh_cot_fewshot_logical_deduction_seven_objects | 4|get-answer | 3|exact_match|↑ | 0.2560|± |0.0277| +| - bbh_cot_fewshot_logical_deduction_three_objects | 4|get-answer | 3|exact_match|↑ | 0.8240|± |0.0241| +| - bbh_cot_fewshot_movie_recommendation | 4|get-answer | 3|exact_match|↑ | 0.5680|± |0.0314| +| - bbh_cot_fewshot_multistep_arithmetic_two | 4|get-answer | 3|exact_match|↑ | 0.8320|± |0.0237| +| - bbh_cot_fewshot_navigate | 4|get-answer | 3|exact_match|↑ | 0.9000|± |0.0190| +| - bbh_cot_fewshot_object_counting | 4|get-answer | 3|exact_match|↑ | 0.6280|± |0.0306| +| - bbh_cot_fewshot_penguins_in_a_table | 4|get-answer | 3|exact_match|↑ | 0.1918|± |0.0327| +| - bbh_cot_fewshot_reasoning_about_colored_objects | 4|get-answer | 3|exact_match|↑ | 0.4800|± |0.0317| +| - bbh_cot_fewshot_ruin_names | 4|get-answer | 3|exact_match|↑ | 0.4720|± |0.0316| +| - bbh_cot_fewshot_salient_translation_error_detection | 4|get-answer | 3|exact_match|↑ | 0.4960|± |0.0317| +| - bbh_cot_fewshot_snarks | 4|get-answer | 3|exact_match|↑ | 0.6124|± |0.0366| +| - bbh_cot_fewshot_sports_understanding | 4|get-answer | 3|exact_match|↑ | 0.9080|± |0.0183| +| - 
bbh_cot_fewshot_temporal_sequences | 4|get-answer | 3|exact_match|↑ | 0.5680|± |0.0314| +| - bbh_cot_fewshot_tracking_shuffled_objects_five_objects | 4|get-answer | 3|exact_match|↑ | 0.5160|± |0.0317| +| - bbh_cot_fewshot_tracking_shuffled_objects_seven_objects| 4|get-answer | 3|exact_match|↑ | 0.4560|± |0.0316| +| - bbh_cot_fewshot_tracking_shuffled_objects_three_objects| 4|get-answer | 3|exact_match|↑ | 0.6840|± |0.0295| +| - bbh_cot_fewshot_web_of_lies | 4|get-answer | 3|exact_match|↑ | 0.9760|± |0.0097| +| - bbh_cot_fewshot_word_sorting | 4|get-answer | 3|exact_match|↑ | 0.2280|± |0.0266| +|boolq | 2|none | 0|acc |↑ | 0.8563|± |0.0061| +|drop | 3|none | 0|em |↑ | 0.0001|± |0.0001| +| | |none | 0|f1 |↑ | 0.0520|± |0.0012| +|gpqa_diamond_cot_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.1364|± |0.0245| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_diamond_cot_zeroshot | 1|flexible-extract | 0|exact_match|↑ | 0.1364|± |0.0245| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_diamond_generative_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.2222|± |0.0296| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_diamond_n_shot | 2|none | 0|acc |↑ | 0.3737|± |0.0345| +| | |none | 0|acc_norm |↑ | 0.3737|± |0.0345| +|gpqa_diamond_zeroshot | 1|none | 0|acc |↑ | 0.3182|± |0.0332| +| | |none | 0|acc_norm |↑ | 0.3182|± |0.0332| +|gpqa_extended_cot_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.1703|± |0.0161| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_extended_cot_zeroshot | 1|flexible-extract | 0|exact_match|↑ | 0.1410|± |0.0149| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_extended_generative_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.2015|± |0.0172| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_extended_n_shot | 2|none | 0|acc |↑ | 0.3480|± |0.0204| +| | |none | 0|acc_norm |↑ | 0.3480|± |0.0204| +|gpqa_extended_zeroshot | 1|none | 0|acc |↑ | 0.3352|± |0.0202| +| | |none | 0|acc_norm |↑ | 0.3352|± |0.0202| +|gpqa_main_cot_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.1429|± |0.0166| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_main_cot_zeroshot | 1|flexible-extract | 0|exact_match|↑ | 0.1384|± |0.0163| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_main_generative_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.1875|± |0.0185| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_main_n_shot | 2|none | 0|acc |↑ | 0.3125|± |0.0219| +| | |none | 0|acc_norm |↑ | 0.3125|± |0.0219| +|gpqa_main_zeroshot | 1|none | 0|acc |↑ | 0.3147|± |0.0220| +| | |none | 0|acc_norm |↑ | 0.3147|± |0.0220| +|gsm8k | 3|flexible-extract | 5|exact_match|↑ | 0.7362|± |0.0121| +| | |strict-match | 5|exact_match|↑ | 0.6467|± |0.0132| +|hellaswag | 1|none | 0|acc |↑ | 0.6118|± |0.0049| +| | |none | 0|acc_norm |↑ | 0.8060|± |0.0039| +|mmlu | 2|none | |acc |↑ | 0.6994|± |0.0037| +| - humanities | 2|none | |acc |↑ | 0.6338|± |0.0066| +| - formal_logic | 1|none | 0|acc |↑ | 0.5079|± |0.0447| +| - high_school_european_history | 1|none | 0|acc |↑ | 0.8061|± |0.0309| +| - high_school_us_history | 1|none | 0|acc |↑ | 0.8725|± |0.0234| +| - high_school_world_history | 1|none | 0|acc |↑ | 0.8397|± |0.0239| +| - international_law | 1|none | 0|acc |↑ | 0.8264|± |0.0346| +| - jurisprudence | 1|none | 0|acc |↑ | 0.8519|± |0.0343| +| - logical_fallacies | 1|none | 0|acc |↑ | 0.8037|± |0.0312| +| - moral_disputes | 1|none | 0|acc |↑ | 0.7717|± |0.0226| +| - moral_scenarios | 
1|none | 0|acc |↑ | 0.4324|± |0.0166| +| - philosophy | 1|none | 0|acc |↑ | 0.7814|± |0.0235| +| - prehistory | 1|none | 0|acc |↑ | 0.7840|± |0.0229| +| - professional_law | 1|none | 0|acc |↑ | 0.5163|± |0.0128| +| - world_religions | 1|none | 0|acc |↑ | 0.8304|± |0.0288| +| - other | 2|none | |acc |↑ | 0.7586|± |0.0074| +| - business_ethics | 1|none | 0|acc |↑ | 0.7800|± |0.0416| +| - clinical_knowledge | 1|none | 0|acc |↑ | 0.7849|± |0.0253| +| - college_medicine | 1|none | 0|acc |↑ | 0.6879|± |0.0353| +| - global_facts | 1|none | 0|acc |↑ | 0.4700|± |0.0502| +| - human_aging | 1|none | 0|acc |↑ | 0.7489|± |0.0291| +| - management | 1|none | 0|acc |↑ | 0.7961|± |0.0399| +| - marketing | 1|none | 0|acc |↑ | 0.9017|± |0.0195| +| - medical_genetics | 1|none | 0|acc |↑ | 0.8300|± |0.0378| +| - miscellaneous | 1|none | 0|acc |↑ | 0.8570|± |0.0125| +| - nutrition | 1|none | 0|acc |↑ | 0.7778|± |0.0238| +| - professional_accounting | 1|none | 0|acc |↑ | 0.5887|± |0.0294| +| - professional_medicine | 1|none | 0|acc |↑ | 0.7353|± |0.0268| +| - virology | 1|none | 0|acc |↑ | 0.5241|± |0.0389| +| - social sciences | 2|none | |acc |↑ | 0.8021|± |0.0071| +| - econometrics | 1|none | 0|acc |↑ | 0.5877|± |0.0463| +| - high_school_geography | 1|none | 0|acc |↑ | 0.8788|± |0.0233| +| - high_school_government_and_politics | 1|none | 0|acc |↑ | 0.9275|± |0.0187| +| - high_school_macroeconomics | 1|none | 0|acc |↑ | 0.7692|± |0.0214| +| - high_school_microeconomics | 1|none | 0|acc |↑ | 0.8319|± |0.0243| +| - high_school_psychology | 1|none | 0|acc |↑ | 0.8642|± |0.0147| +| - human_sexuality | 1|none | 0|acc |↑ | 0.7710|± |0.0369| +| - professional_psychology | 1|none | 0|acc |↑ | 0.7418|± |0.0177| +| - public_relations | 1|none | 0|acc |↑ | 0.7364|± |0.0422| +| - security_studies | 1|none | 0|acc |↑ | 0.7388|± |0.0281| +| - sociology | 1|none | 0|acc |↑ | 0.8756|± |0.0233| +| - us_foreign_policy | 1|none | 0|acc |↑ | 0.8600|± |0.0349| +| - stem | 2|none | |acc |↑ | 0.6388|± |0.0083| +| - abstract_algebra | 1|none | 0|acc |↑ | 0.5200|± |0.0502| +| - anatomy | 1|none | 0|acc |↑ | 0.6000|± |0.0423| +| - astronomy | 1|none | 0|acc |↑ | 0.7697|± |0.0343| +| - college_biology | 1|none | 0|acc |↑ | 0.7917|± |0.0340| +| - college_chemistry | 1|none | 0|acc |↑ | 0.5100|± |0.0502| +| - college_computer_science | 1|none | 0|acc |↑ | 0.6200|± |0.0488| +| - college_mathematics | 1|none | 0|acc |↑ | 0.4000|± |0.0492| +| - college_physics | 1|none | 0|acc |↑ | 0.4020|± |0.0488| +| - computer_security | 1|none | 0|acc |↑ | 0.7200|± |0.0451| +| - conceptual_physics | 1|none | 0|acc |↑ | 0.7064|± |0.0298| +| - electrical_engineering | 1|none | 0|acc |↑ | 0.7103|± |0.0378| +| - elementary_mathematics | 1|none | 0|acc |↑ | 0.6376|± |0.0248| +| - high_school_biology | 1|none | 0|acc |↑ | 0.8387|± |0.0209| +| - high_school_chemistry | 1|none | 0|acc |↑ | 0.6207|± |0.0341| +| - high_school_computer_science | 1|none | 0|acc |↑ | 0.7800|± |0.0416| +| - high_school_mathematics | 1|none | 0|acc |↑ | 0.4963|± |0.0305| +| - high_school_physics | 1|none | 0|acc |↑ | 0.5099|± |0.0408| +| - high_school_statistics | 1|none | 0|acc |↑ | 0.6759|± |0.0319| +| - machine_learning | 1|none | 0|acc |↑ | 0.4732|± |0.0474| +|nq_open | 4|remove_whitespace| 0|exact_match|↑ | 0.0133|± |0.0019| +|openbookqa | 1|none | 0|acc |↑ | 0.3460|± |0.0213| +| | |none | 0|acc_norm |↑ | 0.4620|± |0.0223| +|piqa | 1|none | 0|acc |↑ | 0.7954|± |0.0094| +| | |none | 0|acc_norm |↑ | 0.8058|± |0.0092| +|qnli | 1|none | 0|acc |↑ | 0.5471|± |0.0067| +|sciq | 1|none | 
0|acc |↑ | 0.9540|± |0.0066| +| | |none | 0|acc_norm |↑ | 0.9160|± |0.0088| +|triviaqa | 3|remove_whitespace| 0|exact_match|↑ | 0.0081|± |0.0007| +|truthfulqa_gen | 3|none | 0|bleu_acc |↑ | 0.4774|± |0.0175| +| | |none | 0|bleu_diff |↑ | 4.0052|± |0.6796| +| | |none | 0|bleu_max |↑ |19.4152|± |0.7487| +| | |none | 0|rouge1_acc |↑ | 0.5043|± |0.0175| +| | |none | 0|rouge1_diff|↑ | 5.0515|± |0.9714| +| | |none | 0|rouge1_max |↑ |42.5509|± |0.9066| +| | |none | 0|rouge2_acc |↑ | 0.4186|± |0.0173| +| | |none | 0|rouge2_diff|↑ | 5.1321|± |1.0491| +| | |none | 0|rouge2_max |↑ |29.4151|± |0.9889| +| | |none | 0|rougeL_acc |↑ | 0.4908|± |0.0175| +| | |none | 0|rougeL_diff|↑ | 5.0408|± |0.9758| +| | |none | 0|rougeL_max |↑ |39.6681|± |0.9155| +|truthfulqa_mc1 | 2|none | 0|acc |↑ | 0.4051|± |0.0172| +|truthfulqa_mc2 | 3|none | 0|acc |↑ | 0.5734|± |0.0154| +|winogrande | 1|none | 0|acc |↑ | 0.6985|± |0.0129| + +| Groups |Version| Filter |n-shot| Metric | |Value | |Stderr| +|------------------|------:|----------|------|-----------|---|-----:|---|-----:| +|bbh | 3|get-answer| |exact_match|↑ |0.5775|± |0.0054| +|mmlu | 2|none | |acc |↑ |0.6994|± |0.0037| +| - humanities | 2|none | |acc |↑ |0.6338|± |0.0066| +| - other | 2|none | |acc |↑ |0.7586|± |0.0074| +| - social sciences| 2|none | |acc |↑ |0.8021|± |0.0071| +| - stem | 2|none | |acc |↑ |0.6388|± |0.0083| + +Qwen_Qwen2-7B-Instruct: 11h 30m 41s +✅ Benchmark completed for Qwen_Qwen2-7B-Instruct + +🔥 Starting benchmark for deepseek-ai_DeepSeek-R1-0528-Qwen3-8B +Passed argument batch_size = auto:1. Detecting largest batch size +Determined largest batch size: 1 +Passed argument batch_size = auto. Detecting largest batch size +Determined Largest batch size: 1 +hf (pretrained=/home/jaymin/Documents/llm/llm_models/deepseek-ai_DeepSeek-R1-0528-Qwen3-8B), gen_kwargs: (None), limit: None, num_fewshot: None, batch_size: auto (1) +| Tasks |Version| Filter |n-shot| Metric | | Value | |Stderr| +|----------------------------------------------------------|------:|-----------------|-----:|-----------|---|------:|---|-----:| +|anli_r1 | 1|none | 0|acc |↑ | 0.5110|± |0.0158| +|anli_r2 | 1|none | 0|acc |↑ | 0.4640|± |0.0158| +|anli_r3 | 1|none | 0|acc |↑ | 0.4767|± |0.0144| +|arc_challenge | 1|none | 0|acc |↑ | 0.5137|± |0.0146| +| | |none | 0|acc_norm |↑ | 0.5495|± |0.0145| +|bbh | 3|get-answer | |exact_match|↑ | 0.5841|± |0.0052| +| - bbh_cot_fewshot_boolean_expressions | 4|get-answer | 3|exact_match|↑ | 0.8920|± |0.0197| +| - bbh_cot_fewshot_causal_judgement | 4|get-answer | 3|exact_match|↑ | 0.5348|± |0.0366| +| - bbh_cot_fewshot_date_understanding | 4|get-answer | 3|exact_match|↑ | 0.5000|± |0.0317| +| - bbh_cot_fewshot_disambiguation_qa | 4|get-answer | 3|exact_match|↑ | 0.5520|± |0.0315| +| - bbh_cot_fewshot_dyck_languages | 4|get-answer | 3|exact_match|↑ | 0.1240|± |0.0209| +| - bbh_cot_fewshot_formal_fallacies | 4|get-answer | 3|exact_match|↑ | 0.3280|± |0.0298| +| - bbh_cot_fewshot_geometric_shapes | 4|get-answer | 3|exact_match|↑ | 0.3360|± |0.0299| +| - bbh_cot_fewshot_hyperbaton | 4|get-answer | 3|exact_match|↑ | 0.7440|± |0.0277| +| - bbh_cot_fewshot_logical_deduction_five_objects | 4|get-answer | 3|exact_match|↑ | 0.0560|± |0.0146| +| - bbh_cot_fewshot_logical_deduction_seven_objects | 4|get-answer | 3|exact_match|↑ | 0.0000|± |0.0000| +| - bbh_cot_fewshot_logical_deduction_three_objects | 4|get-answer | 3|exact_match|↑ | 0.7080|± |0.0288| +| - bbh_cot_fewshot_movie_recommendation | 4|get-answer | 3|exact_match|↑ | 0.6200|± |0.0308| +| - 
bbh_cot_fewshot_multistep_arithmetic_two | 4|get-answer | 3|exact_match|↑ | 0.9520|± |0.0135| +| - bbh_cot_fewshot_navigate | 4|get-answer | 3|exact_match|↑ | 0.8240|± |0.0241| +| - bbh_cot_fewshot_object_counting | 4|get-answer | 3|exact_match|↑ | 0.8640|± |0.0217| +| - bbh_cot_fewshot_penguins_in_a_table | 4|get-answer | 3|exact_match|↑ | 0.6918|± |0.0383| +| - bbh_cot_fewshot_reasoning_about_colored_objects | 4|get-answer | 3|exact_match|↑ | 0.6000|± |0.0310| +| - bbh_cot_fewshot_ruin_names | 4|get-answer | 3|exact_match|↑ | 0.4880|± |0.0317| +| - bbh_cot_fewshot_salient_translation_error_detection | 4|get-answer | 3|exact_match|↑ | 0.5840|± |0.0312| +| - bbh_cot_fewshot_snarks | 4|get-answer | 3|exact_match|↑ | 0.5506|± |0.0374| +| - bbh_cot_fewshot_sports_understanding | 4|get-answer | 3|exact_match|↑ | 0.8680|± |0.0215| +| - bbh_cot_fewshot_temporal_sequences | 4|get-answer | 3|exact_match|↑ | 0.4960|± |0.0317| +| - bbh_cot_fewshot_tracking_shuffled_objects_five_objects | 4|get-answer | 3|exact_match|↑ | 0.7600|± |0.0271| +| - bbh_cot_fewshot_tracking_shuffled_objects_seven_objects| 4|get-answer | 3|exact_match|↑ | 0.5240|± |0.0316| +| - bbh_cot_fewshot_tracking_shuffled_objects_three_objects| 4|get-answer | 3|exact_match|↑ | 0.8400|± |0.0232| +| - bbh_cot_fewshot_web_of_lies | 4|get-answer | 3|exact_match|↑ | 0.9920|± |0.0056| +| - bbh_cot_fewshot_word_sorting | 4|get-answer | 3|exact_match|↑ | 0.3640|± |0.0305| +|boolq | 2|none | 0|acc |↑ | 0.8483|± |0.0063| +|drop | 3|none | 0|em |↑ | 0.0018|± |0.0004| +| | |none | 0|f1 |↑ | 0.0533|± |0.0013| +|gpqa_diamond_cot_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.0556|± |0.0163| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_diamond_cot_zeroshot | 1|flexible-extract | 0|exact_match|↑ | 0.0556|± |0.0163| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_diamond_generative_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.1667|± |0.0266| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_diamond_n_shot | 2|none | 0|acc |↑ | 0.3434|± |0.0338| +| | |none | 0|acc_norm |↑ | 0.3434|± |0.0338| +|gpqa_diamond_zeroshot | 1|none | 0|acc |↑ | 0.3283|± |0.0335| +| | |none | 0|acc_norm |↑ | 0.3283|± |0.0335| +|gpqa_extended_cot_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.0806|± |0.0117| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_extended_cot_zeroshot | 1|flexible-extract | 0|exact_match|↑ | 0.0861|± |0.0120| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_extended_generative_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.2106|± |0.0175| +| | |strict-match | 0|exact_match|↑ | 0.0018|± |0.0018| +|gpqa_extended_n_shot | 2|none | 0|acc |↑ | 0.3516|± |0.0205| +| | |none | 0|acc_norm |↑ | 0.3516|± |0.0205| +|gpqa_extended_zeroshot | 1|none | 0|acc |↑ | 0.3755|± |0.0207| +| | |none | 0|acc_norm |↑ | 0.3755|± |0.0207| +|gpqa_main_cot_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.0848|± |0.0132| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_main_cot_zeroshot | 1|flexible-extract | 0|exact_match|↑ | 0.0737|± |0.0124| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_main_generative_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.2299|± |0.0199| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_main_n_shot | 2|none | 0|acc |↑ | 0.3683|± |0.0228| +| | |none | 0|acc_norm |↑ | 0.3683|± |0.0228| +|gpqa_main_zeroshot | 1|none | 0|acc |↑ | 0.3728|± |0.0229| +| | |none | 0|acc_norm |↑ | 0.3728|± |0.0229| +|gsm8k | 
3|flexible-extract | 5|exact_match|↑ | 0.8241|± |0.0105| +| | |strict-match | 5|exact_match|↑ | 0.8127|± |0.0107| +|hellaswag | 1|none | 0|acc |↑ | 0.5781|± |0.0049| +| | |none | 0|acc_norm |↑ | 0.7564|± |0.0043| +|mmlu | 2|none | |acc |↑ | 0.6830|± |0.0037| +| - humanities | 2|none | |acc |↑ | 0.5690|± |0.0066| +| - formal_logic | 1|none | 0|acc |↑ | 0.6349|± |0.0431| +| - high_school_european_history | 1|none | 0|acc |↑ | 0.8121|± |0.0305| +| - high_school_us_history | 1|none | 0|acc |↑ | 0.7941|± |0.0284| +| - high_school_world_history | 1|none | 0|acc |↑ | 0.8354|± |0.0241| +| - international_law | 1|none | 0|acc |↑ | 0.7521|± |0.0394| +| - jurisprudence | 1|none | 0|acc |↑ | 0.7685|± |0.0408| +| - logical_fallacies | 1|none | 0|acc |↑ | 0.7607|± |0.0335| +| - moral_disputes | 1|none | 0|acc |↑ | 0.7139|± |0.0243| +| - moral_scenarios | 1|none | 0|acc |↑ | 0.2793|± |0.0150| +| - philosophy | 1|none | 0|acc |↑ | 0.7267|± |0.0253| +| - prehistory | 1|none | 0|acc |↑ | 0.7747|± |0.0232| +| - professional_law | 1|none | 0|acc |↑ | 0.4531|± |0.0127| +| - world_religions | 1|none | 0|acc |↑ | 0.7953|± |0.0309| +| - other | 2|none | |acc |↑ | 0.7399|± |0.0076| +| - business_ethics | 1|none | 0|acc |↑ | 0.7400|± |0.0441| +| - clinical_knowledge | 1|none | 0|acc |↑ | 0.7509|± |0.0266| +| - college_medicine | 1|none | 0|acc |↑ | 0.7110|± |0.0346| +| - global_facts | 1|none | 0|acc |↑ | 0.4700|± |0.0502| +| - human_aging | 1|none | 0|acc |↑ | 0.7085|± |0.0305| +| - management | 1|none | 0|acc |↑ | 0.8835|± |0.0318| +| - marketing | 1|none | 0|acc |↑ | 0.8632|± |0.0225| +| - medical_genetics | 1|none | 0|acc |↑ | 0.7500|± |0.0435| +| - miscellaneous | 1|none | 0|acc |↑ | 0.8327|± |0.0133| +| - nutrition | 1|none | 0|acc |↑ | 0.7614|± |0.0244| +| - professional_accounting | 1|none | 0|acc |↑ | 0.5567|± |0.0296| +| - professional_medicine | 1|none | 0|acc |↑ | 0.7610|± |0.0259| +| - virology | 1|none | 0|acc |↑ | 0.4880|± |0.0389| +| - social sciences | 2|none | |acc |↑ | 0.7927|± |0.0072| +| - econometrics | 1|none | 0|acc |↑ | 0.5965|± |0.0462| +| - high_school_geography | 1|none | 0|acc |↑ | 0.8384|± |0.0262| +| - high_school_government_and_politics | 1|none | 0|acc |↑ | 0.9016|± |0.0215| +| - high_school_macroeconomics | 1|none | 0|acc |↑ | 0.7513|± |0.0219| +| - high_school_microeconomics | 1|none | 0|acc |↑ | 0.8739|± |0.0216| +| - high_school_psychology | 1|none | 0|acc |↑ | 0.8716|± |0.0143| +| - human_sexuality | 1|none | 0|acc |↑ | 0.8397|± |0.0322| +| - professional_psychology | 1|none | 0|acc |↑ | 0.7059|± |0.0184| +| - public_relations | 1|none | 0|acc |↑ | 0.7182|± |0.0431| +| - security_studies | 1|none | 0|acc |↑ | 0.7551|± |0.0275| +| - sociology | 1|none | 0|acc |↑ | 0.8060|± |0.0280| +| - us_foreign_policy | 1|none | 0|acc |↑ | 0.8700|± |0.0338| +| - stem | 2|none | |acc |↑ | 0.6898|± |0.0079| +| - abstract_algebra | 1|none | 0|acc |↑ | 0.4900|± |0.0502| +| - anatomy | 1|none | 0|acc |↑ | 0.6815|± |0.0402| +| - astronomy | 1|none | 0|acc |↑ | 0.8816|± |0.0263| +| - college_biology | 1|none | 0|acc |↑ | 0.8681|± |0.0283| +| - college_chemistry | 1|none | 0|acc |↑ | 0.5600|± |0.0499| +| - college_computer_science | 1|none | 0|acc |↑ | 0.6100|± |0.0490| +| - college_mathematics | 1|none | 0|acc |↑ | 0.4800|± |0.0502| +| - college_physics | 1|none | 0|acc |↑ | 0.5588|± |0.0494| +| - computer_security | 1|none | 0|acc |↑ | 0.7500|± |0.0435| +| - conceptual_physics | 1|none | 0|acc |↑ | 0.8170|± |0.0253| +| - electrical_engineering | 1|none | 0|acc |↑ | 0.7517|± |0.0360| +| - 
elementary_mathematics | 1|none | 0|acc |↑ | 0.6614|± |0.0244| +| - high_school_biology | 1|none | 0|acc |↑ | 0.9000|± |0.0171| +| - high_school_chemistry | 1|none | 0|acc |↑ | 0.6700|± |0.0331| +| - high_school_computer_science | 1|none | 0|acc |↑ | 0.8200|± |0.0386| +| - high_school_mathematics | 1|none | 0|acc |↑ | 0.4778|± |0.0305| +| - high_school_physics | 1|none | 0|acc |↑ | 0.6424|± |0.0391| +| - high_school_statistics | 1|none | 0|acc |↑ | 0.6852|± |0.0317| +| - machine_learning | 1|none | 0|acc |↑ | 0.5000|± |0.0475| +|nq_open | 4|remove_whitespace| 0|exact_match|↑ | 0.0183|± |0.0022| +|openbookqa | 1|none | 0|acc |↑ | 0.3080|± |0.0207| +| | |none | 0|acc_norm |↑ | 0.4300|± |0.0222| +|piqa | 1|none | 0|acc |↑ | 0.7633|± |0.0099| +| | |none | 0|acc_norm |↑ | 0.7568|± |0.0100| +|qnli | 1|none | 0|acc |↑ | 0.5578|± |0.0067| +|sciq | 1|none | 0|acc |↑ | 0.9600|± |0.0062| +| | |none | 0|acc_norm |↑ | 0.9410|± |0.0075| +|triviaqa | 3|remove_whitespace| 0|exact_match|↑ | 0.0295|± |0.0013| +|truthfulqa_gen | 3|none | 0|bleu_acc |↑ | 0.5398|± |0.0174| +| | |none | 0|bleu_diff |↑ | 5.8931|± |0.7222| +| | |none | 0|bleu_max |↑ |19.7647|± |0.7053| +| | |none | 0|rouge1_acc |↑ | 0.5569|± |0.0174| +| | |none | 0|rouge1_diff|↑ | 9.9292|± |1.0724| +| | |none | 0|rouge1_max |↑ |45.0401|± |0.8645| +| | |none | 0|rouge2_acc |↑ | 0.4627|± |0.0175| +| | |none | 0|rouge2_diff|↑ | 9.8762|± |1.1402| +| | |none | 0|rouge2_max |↑ |30.6518|± |0.9760| +| | |none | 0|rougeL_acc |↑ | 0.5435|± |0.0174| +| | |none | 0|rougeL_diff|↑ | 9.8078|± |1.0753| +| | |none | 0|rougeL_max |↑ |41.9636|± |0.8847| +|truthfulqa_mc1 | 2|none | 0|acc |↑ | 0.3574|± |0.0168| +|truthfulqa_mc2 | 3|none | 0|acc |↑ | 0.5590|± |0.0152| +|winogrande | 1|none | 0|acc |↑ | 0.6756|± |0.0132| + +| Groups |Version| Filter |n-shot| Metric | |Value | |Stderr| +|------------------|------:|----------|------|-----------|---|-----:|---|-----:| +|bbh | 3|get-answer| |exact_match|↑ |0.5841|± |0.0052| +|mmlu | 2|none | |acc |↑ |0.6830|± |0.0037| +| - humanities | 2|none | |acc |↑ |0.5690|± |0.0066| +| - other | 2|none | |acc |↑ |0.7399|± |0.0076| +| - social sciences| 2|none | |acc |↑ |0.7927|± |0.0072| +| - stem | 2|none | |acc |↑ |0.6898|± |0.0079| + +deepseek-ai_DeepSeek-R1-0528-Qwen3-8B: 17h 58m 4s +✅ Benchmark completed for deepseek-ai_DeepSeek-R1-0528-Qwen3-8B + +🔥 Starting benchmark for 01-ai_Yi-1.5-9B-Chat +2025-07-27:12:03:18 INFO [loggers.evaluation_tracker:209] Saving results aggregated +hf (pretrained=/home/jaymin/Documents/llm/llm_models/01-ai_Yi-1.5-9B-Chat), gen_kwargs: (None), limit: None, num_fewshot: None, batch_size: 2 +| Tasks |Version| Filter |n-shot| Metric | | Value | |Stderr| +|----------------------------------------------------------|------:|-----------------|-----:|-----------|---|------:|---|-----:| +|anli_r1 | 1|none | 0|acc |↑ | 0.5350|± |0.0158| +|anli_r2 | 1|none | 0|acc |↑ | 0.5090|± |0.0158| +|anli_r3 | 1|none | 0|acc |↑ | 0.5258|± |0.0144| +|arc_challenge | 1|none | 0|acc |↑ | 0.5572|± |0.0145| +| | |none | 0|acc_norm |↑ | 0.5870|± |0.0144| +|bbh | 3|get-answer | |exact_match|↑ | 0.6107|± |0.0053| +| - bbh_cot_fewshot_boolean_expressions | 4|get-answer | 3|exact_match|↑ | 0.8960|± |0.0193| +| - bbh_cot_fewshot_causal_judgement | 4|get-answer | 3|exact_match|↑ | 0.5508|± |0.0365| +| - bbh_cot_fewshot_date_understanding | 4|get-answer | 3|exact_match|↑ | 0.7760|± |0.0264| +| - bbh_cot_fewshot_disambiguation_qa | 4|get-answer | 3|exact_match|↑ | 0.3040|± |0.0292| +| - bbh_cot_fewshot_dyck_languages | 4|get-answer | 
3|exact_match|↑ | 0.3640|± |0.0305| +| - bbh_cot_fewshot_formal_fallacies | 4|get-answer | 3|exact_match|↑ | 0.0240|± |0.0097| +| - bbh_cot_fewshot_geometric_shapes | 4|get-answer | 3|exact_match|↑ | 0.5880|± |0.0312| +| - bbh_cot_fewshot_hyperbaton | 4|get-answer | 3|exact_match|↑ | 0.6040|± |0.0310| +| - bbh_cot_fewshot_logical_deduction_five_objects | 4|get-answer | 3|exact_match|↑ | 0.5560|± |0.0315| +| - bbh_cot_fewshot_logical_deduction_seven_objects | 4|get-answer | 3|exact_match|↑ | 0.3640|± |0.0305| +| - bbh_cot_fewshot_logical_deduction_three_objects | 4|get-answer | 3|exact_match|↑ | 0.9120|± |0.0180| +| - bbh_cot_fewshot_movie_recommendation | 4|get-answer | 3|exact_match|↑ | 0.7560|± |0.0272| +| - bbh_cot_fewshot_multistep_arithmetic_two | 4|get-answer | 3|exact_match|↑ | 0.7600|± |0.0271| +| - bbh_cot_fewshot_navigate | 4|get-answer | 3|exact_match|↑ | 0.2680|± |0.0281| +| - bbh_cot_fewshot_object_counting | 4|get-answer | 3|exact_match|↑ | 0.8600|± |0.0220| +| - bbh_cot_fewshot_penguins_in_a_table | 4|get-answer | 3|exact_match|↑ | 0.8356|± |0.0308| +| - bbh_cot_fewshot_reasoning_about_colored_objects | 4|get-answer | 3|exact_match|↑ | 0.6640|± |0.0299| +| - bbh_cot_fewshot_ruin_names | 4|get-answer | 3|exact_match|↑ | 0.3760|± |0.0307| +| - bbh_cot_fewshot_salient_translation_error_detection | 4|get-answer | 3|exact_match|↑ | 0.5000|± |0.0317| +| - bbh_cot_fewshot_snarks | 4|get-answer | 3|exact_match|↑ | 0.6854|± |0.0349| +| - bbh_cot_fewshot_sports_understanding | 4|get-answer | 3|exact_match|↑ | 0.9160|± |0.0176| +| - bbh_cot_fewshot_temporal_sequences | 4|get-answer | 3|exact_match|↑ | 0.7160|± |0.0286| +| - bbh_cot_fewshot_tracking_shuffled_objects_five_objects | 4|get-answer | 3|exact_match|↑ | 0.7280|± |0.0282| +| - bbh_cot_fewshot_tracking_shuffled_objects_seven_objects| 4|get-answer | 3|exact_match|↑ | 0.5800|± |0.0313| +| - bbh_cot_fewshot_tracking_shuffled_objects_three_objects| 4|get-answer | 3|exact_match|↑ | 0.6600|± |0.0300| +| - bbh_cot_fewshot_web_of_lies | 4|get-answer | 3|exact_match|↑ | 0.9720|± |0.0105| +| - bbh_cot_fewshot_word_sorting | 4|get-answer | 3|exact_match|↑ | 0.3720|± |0.0306| +|boolq | 2|none | 0|acc |↑ | 0.8682|± |0.0059| +|drop | 3|none | 0|em |↑ | 0.0149|± |0.0012| +| | |none | 0|f1 |↑ | 0.1253|± |0.0021| +|gpqa_diamond_cot_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.1313|± |0.0241| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_diamond_cot_zeroshot | 1|flexible-extract | 0|exact_match|↑ | 0.1414|± |0.0248| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_diamond_generative_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.1667|± |0.0266| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_diamond_n_shot | 2|none | 0|acc |↑ | 0.3384|± |0.0337| +| | |none | 0|acc_norm |↑ | 0.3384|± |0.0337| +|gpqa_diamond_zeroshot | 1|none | 0|acc |↑ | 0.3131|± |0.0330| +| | |none | 0|acc_norm |↑ | 0.3131|± |0.0330| +|gpqa_extended_cot_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.1795|± |0.0164| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_extended_cot_zeroshot | 1|flexible-extract | 0|exact_match|↑ | 0.1777|± |0.0164| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_extended_generative_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.1960|± |0.0170| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_extended_n_shot | 2|none | 0|acc |↑ | 0.3187|± |0.0200| +| | |none | 0|acc_norm |↑ | 0.3187|± |0.0200| +|gpqa_extended_zeroshot | 1|none | 0|acc |↑ | 
0.3315|± |0.0202| +| | |none | 0|acc_norm |↑ | 0.3315|± |0.0202| +|gpqa_main_cot_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.1942|± |0.0187| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_main_cot_zeroshot | 1|flexible-extract | 0|exact_match|↑ | 0.1920|± |0.0186| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_main_generative_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.2098|± |0.0193| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_main_n_shot | 2|none | 0|acc |↑ | 0.3237|± |0.0221| +| | |none | 0|acc_norm |↑ | 0.3237|± |0.0221| +|gpqa_main_zeroshot | 1|none | 0|acc |↑ | 0.3036|± |0.0217| +| | |none | 0|acc_norm |↑ | 0.3036|± |0.0217| +|gsm8k | 3|flexible-extract | 5|exact_match|↑ | 0.6732|± |0.0129| +| | |strict-match | 5|exact_match|↑ | 0.7081|± |0.0125| +|hellaswag | 1|none | 0|acc |↑ | 0.5964|± |0.0049| +| | |none | 0|acc_norm |↑ | 0.7873|± |0.0041| +|mmlu | 2|none | |acc |↑ | 0.6841|± |0.0037| +| - humanities | 2|none | |acc |↑ | 0.6172|± |0.0067| +| - formal_logic | 1|none | 0|acc |↑ | 0.5556|± |0.0444| +| - high_school_european_history | 1|none | 0|acc |↑ | 0.8303|± |0.0293| +| - high_school_us_history | 1|none | 0|acc |↑ | 0.8676|± |0.0238| +| - high_school_world_history | 1|none | 0|acc |↑ | 0.8354|± |0.0241| +| - international_law | 1|none | 0|acc |↑ | 0.8099|± |0.0358| +| - jurisprudence | 1|none | 0|acc |↑ | 0.7778|± |0.0402| +| - logical_fallacies | 1|none | 0|acc |↑ | 0.7975|± |0.0316| +| - moral_disputes | 1|none | 0|acc |↑ | 0.7283|± |0.0239| +| - moral_scenarios | 1|none | 0|acc |↑ | 0.4547|± |0.0167| +| - philosophy | 1|none | 0|acc |↑ | 0.7267|± |0.0253| +| - prehistory | 1|none | 0|acc |↑ | 0.7191|± |0.0250| +| - professional_law | 1|none | 0|acc |↑ | 0.4922|± |0.0128| +| - world_religions | 1|none | 0|acc |↑ | 0.8012|± |0.0306| +| - other | 2|none | |acc |↑ | 0.7300|± |0.0077| +| - business_ethics | 1|none | 0|acc |↑ | 0.7500|± |0.0435| +| - clinical_knowledge | 1|none | 0|acc |↑ | 0.7132|± |0.0278| +| - college_medicine | 1|none | 0|acc |↑ | 0.6705|± |0.0358| +| - global_facts | 1|none | 0|acc |↑ | 0.4200|± |0.0496| +| - human_aging | 1|none | 0|acc |↑ | 0.7085|± |0.0305| +| - management | 1|none | 0|acc |↑ | 0.8447|± |0.0359| +| - marketing | 1|none | 0|acc |↑ | 0.8974|± |0.0199| +| - medical_genetics | 1|none | 0|acc |↑ | 0.7600|± |0.0429| +| - miscellaneous | 1|none | 0|acc |↑ | 0.8199|± |0.0137| +| - nutrition | 1|none | 0|acc |↑ | 0.7418|± |0.0251| +| - professional_accounting | 1|none | 0|acc |↑ | 0.5993|± |0.0292| +| - professional_medicine | 1|none | 0|acc |↑ | 0.6801|± |0.0283| +| - virology | 1|none | 0|acc |↑ | 0.5542|± |0.0387| +| - social sciences | 2|none | |acc |↑ | 0.7813|± |0.0073| +| - econometrics | 1|none | 0|acc |↑ | 0.6316|± |0.0454| +| - high_school_geography | 1|none | 0|acc |↑ | 0.8333|± |0.0266| +| - high_school_government_and_politics | 1|none | 0|acc |↑ | 0.8808|± |0.0234| +| - high_school_macroeconomics | 1|none | 0|acc |↑ | 0.7846|± |0.0208| +| - high_school_microeconomics | 1|none | 0|acc |↑ | 0.8277|± |0.0245| +| - high_school_psychology | 1|none | 0|acc |↑ | 0.8716|± |0.0143| +| - human_sexuality | 1|none | 0|acc |↑ | 0.6870|± |0.0407| +| - professional_psychology | 1|none | 0|acc |↑ | 0.6977|± |0.0186| +| - public_relations | 1|none | 0|acc |↑ | 0.6455|± |0.0458| +| - security_studies | 1|none | 0|acc |↑ | 0.7469|± |0.0278| +| - sociology | 1|none | 0|acc |↑ | 0.7910|± |0.0287| +| - us_foreign_policy | 1|none | 0|acc |↑ | 0.8900|± |0.0314| +| - stem | 2|none | |acc |↑ | 
0.6438|± |0.0082| +| - abstract_algebra | 1|none | 0|acc |↑ | 0.4500|± |0.0500| +| - anatomy | 1|none | 0|acc |↑ | 0.6889|± |0.0400| +| - astronomy | 1|none | 0|acc |↑ | 0.7500|± |0.0352| +| - college_biology | 1|none | 0|acc |↑ | 0.7917|± |0.0340| +| - college_chemistry | 1|none | 0|acc |↑ | 0.5100|± |0.0502| +| - college_computer_science | 1|none | 0|acc |↑ | 0.6100|± |0.0490| +| - college_mathematics | 1|none | 0|acc |↑ | 0.3900|± |0.0490| +| - college_physics | 1|none | 0|acc |↑ | 0.4902|± |0.0497| +| - computer_security | 1|none | 0|acc |↑ | 0.8400|± |0.0368| +| - conceptual_physics | 1|none | 0|acc |↑ | 0.7191|± |0.0294| +| - electrical_engineering | 1|none | 0|acc |↑ | 0.7172|± |0.0375| +| - elementary_mathematics | 1|none | 0|acc |↑ | 0.6270|± |0.0249| +| - high_school_biology | 1|none | 0|acc |↑ | 0.8516|± |0.0202| +| - high_school_chemistry | 1|none | 0|acc |↑ | 0.6305|± |0.0340| +| - high_school_computer_science | 1|none | 0|acc |↑ | 0.8000|± |0.0402| +| - high_school_mathematics | 1|none | 0|acc |↑ | 0.4407|± |0.0303| +| - high_school_physics | 1|none | 0|acc |↑ | 0.4636|± |0.0407| +| - high_school_statistics | 1|none | 0|acc |↑ | 0.6759|± |0.0319| +| - machine_learning | 1|none | 0|acc |↑ | 0.5536|± |0.0472| +|nq_open | 4|remove_whitespace| 0|exact_match|↑ | 0.0094|± |0.0016| +|openbookqa | 1|none | 0|acc |↑ | 0.3200|± |0.0209| +| | |none | 0|acc_norm |↑ | 0.4360|± |0.0222| +|piqa | 1|none | 0|acc |↑ | 0.7965|± |0.0094| +| | |none | 0|acc_norm |↑ | 0.8036|± |0.0093| +|qnli | 1|none | 0|acc |↑ | 0.7877|± |0.0055| +|sciq | 1|none | 0|acc |↑ | 0.9590|± |0.0063| +| | |none | 0|acc_norm |↑ | 0.9540|± |0.0066| +|triviaqa | 3|remove_whitespace| 0|exact_match|↑ | 0.3387|± |0.0035| +|truthfulqa_gen | 3|none | 0|bleu_acc |↑ | 0.4541|± |0.0174| +| | |none | 0|bleu_diff |↑ |-0.7696|± |0.5079| +| | |none | 0|bleu_max |↑ |18.9123|± |0.6279| +| | |none | 0|rouge1_acc |↑ | 0.4602|± |0.0174| +| | |none | 0|rouge1_diff|↑ |-1.1341|± |0.6159| +| | |none | 0|rouge1_max |↑ |44.4829|± |0.7546| +| | |none | 0|rouge2_acc |↑ | 0.4027|± |0.0172| +| | |none | 0|rouge2_diff|↑ |-1.7922|± |0.7369| +| | |none | 0|rouge2_max |↑ |30.3176|± |0.8139| +| | |none | 0|rougeL_acc |↑ | 0.4517|± |0.0174| +| | |none | 0|rougeL_diff|↑ |-1.6275|± |0.6211| +| | |none | 0|rougeL_max |↑ |40.9909|± |0.7553| +|truthfulqa_mc1 | 2|none | 0|acc |↑ | 0.3745|± |0.0169| +|truthfulqa_mc2 | 3|none | 0|acc |↑ | 0.5479|± |0.0159| +|winogrande | 1|none | 0|acc |↑ | 0.7466|± |0.0122| + +| Groups |Version| Filter |n-shot| Metric | |Value | |Stderr| +|------------------|------:|----------|------|-----------|---|-----:|---|-----:| +|bbh | 3|get-answer| |exact_match|↑ |0.6107|± |0.0053| +|mmlu | 2|none | |acc |↑ |0.6841|± |0.0037| +| - humanities | 2|none | |acc |↑ |0.6172|± |0.0067| +| - other | 2|none | |acc |↑ |0.7300|± |0.0077| +| - social sciences| 2|none | |acc |↑ |0.7813|± |0.0073| +| - stem | 2|none | |acc |↑ |0.6438|± |0.0082| + +01-ai_Yi-1.5-9B-Chat: 13h 54m 25s +✅ Benchmark completed for 01-ai_Yi-1.5-9B-Chat + +🔥 Starting benchmark for 01-ai_Yi-1.5-6B-Chat +fatal: not a git repository (or any of the parent directories): .git +2025-07-27:20:07:27 INFO [loggers.evaluation_tracker:209] Saving results aggregated +hf (pretrained=/home/jaymin/Documents/llm/llm_models/01-ai_Yi-1.5-6B-Chat), gen_kwargs: (None), limit: None, num_fewshot: None, batch_size: 2 +| Tasks |Version| Filter |n-shot| Metric | | Value | |Stderr| 
+|----------------------------------------------------------|------:|-----------------|-----:|-----------|---|------:|---|-----:| +|anli_r1 | 1|none | 0|acc |↑ | 0.4770|± |0.0158| +|anli_r2 | 1|none | 0|acc |↑ | 0.4530|± |0.0157| +|anli_r3 | 1|none | 0|acc |↑ | 0.4600|± |0.0144| +|arc_challenge | 1|none | 0|acc |↑ | 0.5077|± |0.0146| +| | |none | 0|acc_norm |↑ | 0.5392|± |0.0146| +|bbh | 3|get-answer | |exact_match|↑ | 0.5478|± |0.0055| +| - bbh_cot_fewshot_boolean_expressions | 4|get-answer | 3|exact_match|↑ | 0.9120|± |0.0180| +| - bbh_cot_fewshot_causal_judgement | 4|get-answer | 3|exact_match|↑ | 0.5080|± |0.0367| +| - bbh_cot_fewshot_date_understanding | 4|get-answer | 3|exact_match|↑ | 0.7040|± |0.0289| +| - bbh_cot_fewshot_disambiguation_qa | 4|get-answer | 3|exact_match|↑ | 0.5160|± |0.0317| +| - bbh_cot_fewshot_dyck_languages | 4|get-answer | 3|exact_match|↑ | 0.1000|± |0.0190| +| - bbh_cot_fewshot_formal_fallacies | 4|get-answer | 3|exact_match|↑ | 0.5760|± |0.0313| +| - bbh_cot_fewshot_geometric_shapes | 4|get-answer | 3|exact_match|↑ | 0.4760|± |0.0316| +| - bbh_cot_fewshot_hyperbaton | 4|get-answer | 3|exact_match|↑ | 0.8040|± |0.0252| +| - bbh_cot_fewshot_logical_deduction_five_objects | 4|get-answer | 3|exact_match|↑ | 0.4040|± |0.0311| +| - bbh_cot_fewshot_logical_deduction_seven_objects | 4|get-answer | 3|exact_match|↑ | 0.3240|± |0.0297| +| - bbh_cot_fewshot_logical_deduction_three_objects | 4|get-answer | 3|exact_match|↑ | 0.7480|± |0.0275| +| - bbh_cot_fewshot_movie_recommendation | 4|get-answer | 3|exact_match|↑ | 0.5680|± |0.0314| +| - bbh_cot_fewshot_multistep_arithmetic_two | 4|get-answer | 3|exact_match|↑ | 0.7120|± |0.0287| +| - bbh_cot_fewshot_navigate | 4|get-answer | 3|exact_match|↑ | 0.8800|± |0.0206| +| - bbh_cot_fewshot_object_counting | 4|get-answer | 3|exact_match|↑ | 0.7280|± |0.0282| +| - bbh_cot_fewshot_penguins_in_a_table | 4|get-answer | 3|exact_match|↑ | 0.6096|± |0.0405| +| - bbh_cot_fewshot_reasoning_about_colored_objects | 4|get-answer | 3|exact_match|↑ | 0.7000|± |0.0290| +| - bbh_cot_fewshot_ruin_names | 4|get-answer | 3|exact_match|↑ | 0.3880|± |0.0309| +| - bbh_cot_fewshot_salient_translation_error_detection | 4|get-answer | 3|exact_match|↑ | 0.4880|± |0.0317| +| - bbh_cot_fewshot_snarks | 4|get-answer | 3|exact_match|↑ | 0.4663|± |0.0375| +| - bbh_cot_fewshot_sports_understanding | 4|get-answer | 3|exact_match|↑ | 0.8880|± |0.0200| +| - bbh_cot_fewshot_temporal_sequences | 4|get-answer | 3|exact_match|↑ | 0.4960|± |0.0317| +| - bbh_cot_fewshot_tracking_shuffled_objects_five_objects | 4|get-answer | 3|exact_match|↑ | 0.3040|± |0.0292| +| - bbh_cot_fewshot_tracking_shuffled_objects_seven_objects| 4|get-answer | 3|exact_match|↑ | 0.3720|± |0.0306| +| - bbh_cot_fewshot_tracking_shuffled_objects_three_objects| 4|get-answer | 3|exact_match|↑ | 0.1680|± |0.0237| +| - bbh_cot_fewshot_web_of_lies | 4|get-answer | 3|exact_match|↑ | 0.7960|± |0.0255| +| - bbh_cot_fewshot_word_sorting | 4|get-answer | 3|exact_match|↑ | 0.1480|± |0.0225| +|boolq | 2|none | 0|acc |↑ | 0.8474|± |0.0063| +|drop | 3|none | 0|em |↑ | 0.0071|± |0.0009| +| | |none | 0|f1 |↑ | 0.1161|± |0.0020| +|gpqa_diamond_cot_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.1465|± |0.0252| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_diamond_cot_zeroshot | 1|flexible-extract | 0|exact_match|↑ | 0.1212|± |0.0233| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_diamond_generative_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.2475|± |0.0307| +| | 
|strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_diamond_n_shot | 2|none | 0|acc |↑ | 0.3333|± |0.0336| +| | |none | 0|acc_norm |↑ | 0.3333|± |0.0336| +|gpqa_diamond_zeroshot | 1|none | 0|acc |↑ | 0.3182|± |0.0332| +| | |none | 0|acc_norm |↑ | 0.3182|± |0.0332| +|gpqa_extended_cot_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.1392|± |0.0148| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_extended_cot_zeroshot | 1|flexible-extract | 0|exact_match|↑ | 0.1520|± |0.0154| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_extended_generative_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.2198|± |0.0177| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_extended_n_shot | 2|none | 0|acc |↑ | 0.3095|± |0.0198| +| | |none | 0|acc_norm |↑ | 0.3095|± |0.0198| +|gpqa_extended_zeroshot | 1|none | 0|acc |↑ | 0.3205|± |0.0200| +| | |none | 0|acc_norm |↑ | 0.3205|± |0.0200| +|gpqa_main_cot_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.1384|± |0.0163| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_main_cot_zeroshot | 1|flexible-extract | 0|exact_match|↑ | 0.1540|± |0.0171| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_main_generative_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.1942|± |0.0187| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_main_n_shot | 2|none | 0|acc |↑ | 0.3125|± |0.0219| +| | |none | 0|acc_norm |↑ | 0.3125|± |0.0219| +|gpqa_main_zeroshot | 1|none | 0|acc |↑ | 0.3571|± |0.0227| +| | |none | 0|acc_norm |↑ | 0.3571|± |0.0227| +|gsm8k | 3|flexible-extract | 5|exact_match|↑ | 0.6785|± |0.0129| +| | |strict-match | 5|exact_match|↑ | 0.6702|± |0.0129| +|hellaswag | 1|none | 0|acc |↑ | 0.5852|± |0.0049| +| | |none | 0|acc_norm |↑ | 0.7675|± |0.0042| +|mmlu | 2|none | |acc |↑ | 0.6179|± |0.0039| +| - humanities | 2|none | |acc |↑ | 0.5392|± |0.0068| +| - formal_logic | 1|none | 0|acc |↑ | 0.4444|± |0.0444| +| - high_school_european_history | 1|none | 0|acc |↑ | 0.7697|± |0.0329| +| - high_school_us_history | 1|none | 0|acc |↑ | 0.7696|± |0.0296| +| - high_school_world_history | 1|none | 0|acc |↑ | 0.7848|± |0.0268| +| - international_law | 1|none | 0|acc |↑ | 0.7355|± |0.0403| +| - jurisprudence | 1|none | 0|acc |↑ | 0.7222|± |0.0433| +| - logical_fallacies | 1|none | 0|acc |↑ | 0.7853|± |0.0323| +| - moral_disputes | 1|none | 0|acc |↑ | 0.6763|± |0.0252| +| - moral_scenarios | 1|none | 0|acc |↑ | 0.2469|± |0.0144| +| - philosophy | 1|none | 0|acc |↑ | 0.6752|± |0.0266| +| - prehistory | 1|none | 0|acc |↑ | 0.6512|± |0.0265| +| - professional_law | 1|none | 0|acc |↑ | 0.4641|± |0.0127| +| - world_religions | 1|none | 0|acc |↑ | 0.7485|± |0.0333| +| - other | 2|none | |acc |↑ | 0.6794|± |0.0081| +| - business_ethics | 1|none | 0|acc |↑ | 0.7400|± |0.0441| +| - clinical_knowledge | 1|none | 0|acc |↑ | 0.7132|± |0.0278| +| - college_medicine | 1|none | 0|acc |↑ | 0.6647|± |0.0360| +| - global_facts | 1|none | 0|acc |↑ | 0.3400|± |0.0476| +| - human_aging | 1|none | 0|acc |↑ | 0.6413|± |0.0322| +| - management | 1|none | 0|acc |↑ | 0.8544|± |0.0349| +| - marketing | 1|none | 0|acc |↑ | 0.8590|± |0.0228| +| - medical_genetics | 1|none | 0|acc |↑ | 0.7500|± |0.0435| +| - miscellaneous | 1|none | 0|acc |↑ | 0.7778|± |0.0149| +| - nutrition | 1|none | 0|acc |↑ | 0.6732|± |0.0269| +| - professional_accounting | 1|none | 0|acc |↑ | 0.4716|± |0.0298| +| - professional_medicine | 1|none | 0|acc |↑ | 0.6066|± |0.0297| +| - virology | 1|none | 0|acc |↑ | 0.4759|± |0.0389| +| - social 
sciences | 2|none | |acc |↑ | 0.7221|± |0.0079| +| - econometrics | 1|none | 0|acc |↑ | 0.5702|± |0.0466| +| - high_school_geography | 1|none | 0|acc |↑ | 0.8232|± |0.0272| +| - high_school_government_and_politics | 1|none | 0|acc |↑ | 0.8238|± |0.0275| +| - high_school_macroeconomics | 1|none | 0|acc |↑ | 0.7308|± |0.0225| +| - high_school_microeconomics | 1|none | 0|acc |↑ | 0.8067|± |0.0256| +| - high_school_psychology | 1|none | 0|acc |↑ | 0.8073|± |0.0169| +| - human_sexuality | 1|none | 0|acc |↑ | 0.6107|± |0.0428| +| - professional_psychology | 1|none | 0|acc |↑ | 0.6046|± |0.0198| +| - public_relations | 1|none | 0|acc |↑ | 0.6182|± |0.0465| +| - security_studies | 1|none | 0|acc |↑ | 0.6449|± |0.0306| +| - sociology | 1|none | 0|acc |↑ | 0.7960|± |0.0285| +| - us_foreign_policy | 1|none | 0|acc |↑ | 0.8200|± |0.0386| +| - stem | 2|none | |acc |↑ | 0.5728|± |0.0086| +| - abstract_algebra | 1|none | 0|acc |↑ | 0.5100|± |0.0502| +| - anatomy | 1|none | 0|acc |↑ | 0.5778|± |0.0427| +| - astronomy | 1|none | 0|acc |↑ | 0.6711|± |0.0382| +| - college_biology | 1|none | 0|acc |↑ | 0.7153|± |0.0377| +| - college_chemistry | 1|none | 0|acc |↑ | 0.4700|± |0.0502| +| - college_computer_science | 1|none | 0|acc |↑ | 0.5100|± |0.0502| +| - college_mathematics | 1|none | 0|acc |↑ | 0.4700|± |0.0502| +| - college_physics | 1|none | 0|acc |↑ | 0.4510|± |0.0495| +| - computer_security | 1|none | 0|acc |↑ | 0.7100|± |0.0456| +| - conceptual_physics | 1|none | 0|acc |↑ | 0.6340|± |0.0315| +| - electrical_engineering | 1|none | 0|acc |↑ | 0.6345|± |0.0401| +| - elementary_mathematics | 1|none | 0|acc |↑ | 0.5212|± |0.0257| +| - high_school_biology | 1|none | 0|acc |↑ | 0.7968|± |0.0229| +| - high_school_chemistry | 1|none | 0|acc |↑ | 0.5468|± |0.0350| +| - high_school_computer_science | 1|none | 0|acc |↑ | 0.7100|± |0.0456| +| - high_school_mathematics | 1|none | 0|acc |↑ | 0.4259|± |0.0301| +| - high_school_physics | 1|none | 0|acc |↑ | 0.3974|± |0.0400| +| - high_school_statistics | 1|none | 0|acc |↑ | 0.5694|± |0.0338| +| - machine_learning | 1|none | 0|acc |↑ | 0.4018|± |0.0465| +|nq_open | 4|remove_whitespace| 0|exact_match|↑ | 0.0271|± |0.0027| +|openbookqa | 1|none | 0|acc |↑ | 0.3240|± |0.0210| +| | |none | 0|acc_norm |↑ | 0.4360|± |0.0222| +|piqa | 1|none | 0|acc |↑ | 0.7835|± |0.0096| +| | |none | 0|acc_norm |↑ | 0.7878|± |0.0095| +|qnli | 1|none | 0|acc |↑ | 0.6795|± |0.0063| +|sciq | 1|none | 0|acc |↑ | 0.9620|± |0.0060| +| | |none | 0|acc_norm |↑ | 0.9340|± |0.0079| +|triviaqa | 3|remove_whitespace| 0|exact_match|↑ | 0.3310|± |0.0035| +|truthfulqa_gen | 3|none | 0|bleu_acc |↑ | 0.4088|± |0.0172| +| | |none | 0|bleu_diff |↑ |-1.4065|± |0.6448| +| | |none | 0|bleu_max |↑ |22.2331|± |0.7119| +| | |none | 0|rouge1_acc |↑ | 0.4125|± |0.0172| +| | |none | 0|rouge1_diff|↑ |-1.8289|± |0.7973| +| | |none | 0|rouge1_max |↑ |48.2058|± |0.7942| +| | |none | 0|rouge2_acc |↑ | 0.3525|± |0.0167| +| | |none | 0|rouge2_diff|↑ |-2.8970|± |0.9086| +| | |none | 0|rouge2_max |↑ |33.2543|± |0.9010| +| | |none | 0|rougeL_acc |↑ | 0.4039|± |0.0172| +| | |none | 0|rougeL_diff|↑ |-1.9691|± |0.8088| +| | |none | 0|rougeL_max |↑ |44.8699|± |0.8098| +|truthfulqa_mc1 | 2|none | 0|acc |↑ | 0.3770|± |0.0170| +|truthfulqa_mc2 | 3|none | 0|acc |↑ | 0.5344|± |0.0159| +|winogrande | 1|none | 0|acc |↑ | 0.7096|± |0.0128| + +| Groups |Version| Filter |n-shot| Metric | |Value | |Stderr| +|------------------|------:|----------|------|-----------|---|-----:|---|-----:| +|bbh | 3|get-answer| |exact_match|↑ |0.5478|± |0.0055| 
+|mmlu | 2|none | |acc |↑ |0.6179|± |0.0039| +| - humanities | 2|none | |acc |↑ |0.5392|± |0.0068| +| - other | 2|none | |acc |↑ |0.6794|± |0.0081| +| - social sciences| 2|none | |acc |↑ |0.7221|± |0.0079| +| - stem | 2|none | |acc |↑ |0.5728|± |0.0086| + +01-ai_Yi-1.5-6B-Chat: 8h 4m 9s +✅ Benchmark completed for 01-ai_Yi-1.5-6B-Chat + +🔥 Starting benchmark for 01-ai_Yi-1.5-9B +fatal: not a git repository (or any of the parent directories): .git +2025-07-28:07:51:08 INFO [loggers.evaluation_tracker:209] Saving results aggregated +hf (pretrained=/home/jaymin/Documents/llm/llm_models/01-ai_Yi-1.5-9B), gen_kwargs: (None), limit: None, num_fewshot: None, batch_size: 2 +| Tasks |Version| Filter |n-shot| Metric | | Value | |Stderr| +|----------------------------------------------------------|------:|-----------------|-----:|-----------|---|------:|---|-----:| +|anli_r1 | 1|none | 0|acc |↑ | 0.5320|± |0.0158| +|anli_r2 | 1|none | 0|acc |↑ | 0.4800|± |0.0158| +|anli_r3 | 1|none | 0|acc |↑ | 0.4392|± |0.0143| +|arc_challenge | 1|none | 0|acc |↑ | 0.5290|± |0.0146| +| | |none | 0|acc_norm |↑ | 0.5469|± |0.0145| +|bbh | 3|get-answer | |exact_match|↑ | 0.7120|± |0.0052| +| - bbh_cot_fewshot_boolean_expressions | 4|get-answer | 3|exact_match|↑ | 0.8800|± |0.0206| +| - bbh_cot_fewshot_causal_judgement | 4|get-answer | 3|exact_match|↑ | 0.5668|± |0.0363| +| - bbh_cot_fewshot_date_understanding | 4|get-answer | 3|exact_match|↑ | 0.8240|± |0.0241| +| - bbh_cot_fewshot_disambiguation_qa | 4|get-answer | 3|exact_match|↑ | 0.7440|± |0.0277| +| - bbh_cot_fewshot_dyck_languages | 4|get-answer | 3|exact_match|↑ | 0.3320|± |0.0298| +| - bbh_cot_fewshot_formal_fallacies | 4|get-answer | 3|exact_match|↑ | 0.5120|± |0.0317| +| - bbh_cot_fewshot_geometric_shapes | 4|get-answer | 3|exact_match|↑ | 0.5440|± |0.0316| +| - bbh_cot_fewshot_hyperbaton | 4|get-answer | 3|exact_match|↑ | 0.8800|± |0.0206| +| - bbh_cot_fewshot_logical_deduction_five_objects | 4|get-answer | 3|exact_match|↑ | 0.5320|± |0.0316| +| - bbh_cot_fewshot_logical_deduction_seven_objects | 4|get-answer | 3|exact_match|↑ | 0.4440|± |0.0315| +| - bbh_cot_fewshot_logical_deduction_three_objects | 4|get-answer | 3|exact_match|↑ | 0.8560|± |0.0222| +| - bbh_cot_fewshot_movie_recommendation | 4|get-answer | 3|exact_match|↑ | 0.8320|± |0.0237| +| - bbh_cot_fewshot_multistep_arithmetic_two | 4|get-answer | 3|exact_match|↑ | 0.7120|± |0.0287| +| - bbh_cot_fewshot_navigate | 4|get-answer | 3|exact_match|↑ | 0.9240|± |0.0168| +| - bbh_cot_fewshot_object_counting | 4|get-answer | 3|exact_match|↑ | 0.8800|± |0.0206| +| - bbh_cot_fewshot_penguins_in_a_table | 4|get-answer | 3|exact_match|↑ | 0.7123|± |0.0376| +| - bbh_cot_fewshot_reasoning_about_colored_objects | 4|get-answer | 3|exact_match|↑ | 0.7280|± |0.0282| +| - bbh_cot_fewshot_ruin_names | 4|get-answer | 3|exact_match|↑ | 0.7560|± |0.0272| +| - bbh_cot_fewshot_salient_translation_error_detection | 4|get-answer | 3|exact_match|↑ | 0.5400|± |0.0316| +| - bbh_cot_fewshot_snarks | 4|get-answer | 3|exact_match|↑ | 0.5618|± |0.0373| +| - bbh_cot_fewshot_sports_understanding | 4|get-answer | 3|exact_match|↑ | 0.9800|± |0.0089| +| - bbh_cot_fewshot_temporal_sequences | 4|get-answer | 3|exact_match|↑ | 0.8880|± |0.0200| +| - bbh_cot_fewshot_tracking_shuffled_objects_five_objects | 4|get-answer | 3|exact_match|↑ | 0.7000|± |0.0290| +| - bbh_cot_fewshot_tracking_shuffled_objects_seven_objects| 4|get-answer | 3|exact_match|↑ | 0.6880|± |0.0294| +| - bbh_cot_fewshot_tracking_shuffled_objects_three_objects| 4|get-answer | 
3|exact_match|↑ | 0.7240|± |0.0283| +| - bbh_cot_fewshot_web_of_lies | 4|get-answer | 3|exact_match|↑ | 1.0000|± |0.0000| +| - bbh_cot_fewshot_word_sorting | 4|get-answer | 3|exact_match|↑ | 0.4040|± |0.0311| +|boolq | 2|none | 0|acc |↑ | 0.8581|± |0.0061| +|drop | 3|none | 0|em |↑ | 0.4148|± |0.0050| +| | |none | 0|f1 |↑ | 0.4457|± |0.0049| +|gpqa_diamond_cot_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.1263|± |0.0237| +| | |strict-match | 0|exact_match|↑ | 0.1010|± |0.0215| +|gpqa_diamond_cot_zeroshot | 1|flexible-extract | 0|exact_match|↑ | 0.1667|± |0.0266| +| | |strict-match | 0|exact_match|↑ | 0.0556|± |0.0163| +|gpqa_diamond_generative_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.3283|± |0.0335| +| | |strict-match | 0|exact_match|↑ | 0.0202|± |0.0100| +|gpqa_diamond_n_shot | 2|none | 0|acc |↑ | 0.3434|± |0.0338| +| | |none | 0|acc_norm |↑ | 0.3434|± |0.0338| +|gpqa_diamond_zeroshot | 1|none | 0|acc |↑ | 0.3384|± |0.0337| +| | |none | 0|acc_norm |↑ | 0.3384|± |0.0337| +|gpqa_extended_cot_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.1667|± |0.0160| +| | |strict-match | 0|exact_match|↑ | 0.0916|± |0.0124| +|gpqa_extended_cot_zeroshot | 1|flexible-extract | 0|exact_match|↑ | 0.1703|± |0.0161| +| | |strict-match | 0|exact_match|↑ | 0.0440|± |0.0088| +|gpqa_extended_generative_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.2637|± |0.0189| +| | |strict-match | 0|exact_match|↑ | 0.0183|± |0.0057| +|gpqa_extended_n_shot | 2|none | 0|acc |↑ | 0.3187|± |0.0200| +| | |none | 0|acc_norm |↑ | 0.3187|± |0.0200| +|gpqa_extended_zeroshot | 1|none | 0|acc |↑ | 0.3535|± |0.0205| +| | |none | 0|acc_norm |↑ | 0.3535|± |0.0205| +|gpqa_main_cot_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.1562|± |0.0172| +| | |strict-match | 0|exact_match|↑ | 0.0893|± |0.0135| +|gpqa_main_cot_zeroshot | 1|flexible-extract | 0|exact_match|↑ | 0.1875|± |0.0185| +| | |strict-match | 0|exact_match|↑ | 0.0402|± |0.0093| +|gpqa_main_generative_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.2790|± |0.0212| +| | |strict-match | 0|exact_match|↑ | 0.0201|± |0.0066| +|gpqa_main_n_shot | 2|none | 0|acc |↑ | 0.3594|± |0.0227| +| | |none | 0|acc_norm |↑ | 0.3594|± |0.0227| +|gpqa_main_zeroshot | 1|none | 0|acc |↑ | 0.2946|± |0.0216| +| | |none | 0|acc_norm |↑ | 0.2946|± |0.0216| +|gsm8k | 3|flexible-extract | 5|exact_match|↑ | 0.6558|± |0.0131| +| | |strict-match | 5|exact_match|↑ | 0.6391|± |0.0132| +|hellaswag | 1|none | 0|acc |↑ | 0.5922|± |0.0049| +| | |none | 0|acc_norm |↑ | 0.7789|± |0.0041| +|mmlu | 2|none | |acc |↑ | 0.6893|± |0.0037| +| - humanities | 2|none | |acc |↑ | 0.6142|± |0.0066| +| - formal_logic | 1|none | 0|acc |↑ | 0.5873|± |0.0440| +| - high_school_european_history | 1|none | 0|acc |↑ | 0.8242|± |0.0297| +| - high_school_us_history | 1|none | 0|acc |↑ | 0.8186|± |0.0270| +| - high_school_world_history | 1|none | 0|acc |↑ | 0.8650|± |0.0222| +| - international_law | 1|none | 0|acc |↑ | 0.8099|± |0.0358| +| - jurisprudence | 1|none | 0|acc |↑ | 0.8056|± |0.0383| +| - logical_fallacies | 1|none | 0|acc |↑ | 0.8221|± |0.0300| +| - moral_disputes | 1|none | 0|acc |↑ | 0.7399|± |0.0236| +| - moral_scenarios | 1|none | 0|acc |↑ | 0.3419|± |0.0159| +| - philosophy | 1|none | 0|acc |↑ | 0.7814|± |0.0235| +| - prehistory | 1|none | 0|acc |↑ | 0.7716|± |0.0234| +| - professional_law | 1|none | 0|acc |↑ | 0.5156|± |0.0128| +| - world_religions | 1|none | 0|acc |↑ | 0.8363|± |0.0284| +| - other | 2|none | |acc |↑ | 0.7451|± |0.0075| +| - business_ethics | 1|none | 0|acc |↑ | 0.7700|± |0.0423| +| - 
clinical_knowledge | 1|none | 0|acc |↑ | 0.7358|± |0.0271| +| - college_medicine | 1|none | 0|acc |↑ | 0.6821|± |0.0355| +| - global_facts | 1|none | 0|acc |↑ | 0.3900|± |0.0490| +| - human_aging | 1|none | 0|acc |↑ | 0.7489|± |0.0291| +| - management | 1|none | 0|acc |↑ | 0.8155|± |0.0384| +| - marketing | 1|none | 0|acc |↑ | 0.9231|± |0.0175| +| - medical_genetics | 1|none | 0|acc |↑ | 0.7900|± |0.0409| +| - miscellaneous | 1|none | 0|acc |↑ | 0.8506|± |0.0127| +| - nutrition | 1|none | 0|acc |↑ | 0.7680|± |0.0242| +| - professional_accounting | 1|none | 0|acc |↑ | 0.5603|± |0.0296| +| - professional_medicine | 1|none | 0|acc |↑ | 0.7243|± |0.0271| +| - virology | 1|none | 0|acc |↑ | 0.5060|± |0.0389| +| - social sciences | 2|none | |acc |↑ | 0.7956|± |0.0071| +| - econometrics | 1|none | 0|acc |↑ | 0.5526|± |0.0468| +| - high_school_geography | 1|none | 0|acc |↑ | 0.8485|± |0.0255| +| - high_school_government_and_politics | 1|none | 0|acc |↑ | 0.9119|± |0.0205| +| - high_school_macroeconomics | 1|none | 0|acc |↑ | 0.7538|± |0.0218| +| - high_school_microeconomics | 1|none | 0|acc |↑ | 0.8529|± |0.0230| +| - high_school_psychology | 1|none | 0|acc |↑ | 0.8716|± |0.0143| +| - human_sexuality | 1|none | 0|acc |↑ | 0.7786|± |0.0364| +| - professional_psychology | 1|none | 0|acc |↑ | 0.7288|± |0.0180| +| - public_relations | 1|none | 0|acc |↑ | 0.6818|± |0.0446| +| - security_studies | 1|none | 0|acc |↑ | 0.7633|± |0.0272| +| - sociology | 1|none | 0|acc |↑ | 0.8308|± |0.0265| +| - us_foreign_policy | 1|none | 0|acc |↑ | 0.9200|± |0.0273| +| - stem | 2|none | |acc |↑ | 0.6426|± |0.0082| +| - abstract_algebra | 1|none | 0|acc |↑ | 0.5000|± |0.0503| +| - anatomy | 1|none | 0|acc |↑ | 0.6889|± |0.0400| +| - astronomy | 1|none | 0|acc |↑ | 0.7632|± |0.0346| +| - college_biology | 1|none | 0|acc |↑ | 0.8264|± |0.0317| +| - college_chemistry | 1|none | 0|acc |↑ | 0.5200|± |0.0502| +| - college_computer_science | 1|none | 0|acc |↑ | 0.6900|± |0.0465| +| - college_mathematics | 1|none | 0|acc |↑ | 0.5500|± |0.0500| +| - college_physics | 1|none | 0|acc |↑ | 0.4706|± |0.0497| +| - computer_security | 1|none | 0|acc |↑ | 0.7900|± |0.0409| +| - conceptual_physics | 1|none | 0|acc |↑ | 0.7447|± |0.0285| +| - electrical_engineering | 1|none | 0|acc |↑ | 0.6552|± |0.0396| +| - elementary_mathematics | 1|none | 0|acc |↑ | 0.6270|± |0.0249| +| - high_school_biology | 1|none | 0|acc |↑ | 0.8355|± |0.0211| +| - high_school_chemistry | 1|none | 0|acc |↑ | 0.6601|± |0.0333| +| - high_school_computer_science | 1|none | 0|acc |↑ | 0.7800|± |0.0416| +| - high_school_mathematics | 1|none | 0|acc |↑ | 0.4185|± |0.0301| +| - high_school_physics | 1|none | 0|acc |↑ | 0.4503|± |0.0406| +| - high_school_statistics | 1|none | 0|acc |↑ | 0.6019|± |0.0334| +| - machine_learning | 1|none | 0|acc |↑ | 0.5000|± |0.0475| +|nq_open | 4|remove_whitespace| 0|exact_match|↑ | 0.1532|± |0.0060| +|openbookqa | 1|none | 0|acc |↑ | 0.3580|± |0.0215| +| | |none | 0|acc_norm |↑ | 0.4560|± |0.0223| +|piqa | 1|none | 0|acc |↑ | 0.7943|± |0.0094| +| | |none | 0|acc_norm |↑ | 0.8063|± |0.0092| +|qnli | 1|none | 0|acc |↑ | 0.5087|± |0.0068| +|sciq | 1|none | 0|acc |↑ | 0.9580|± |0.0063| +| | |none | 0|acc_norm |↑ | 0.9520|± |0.0068| +|triviaqa | 3|remove_whitespace| 0|exact_match|↑ | 0.5438|± |0.0037| +|truthfulqa_gen | 3|none | 0|bleu_acc |↑ | 0.4296|± |0.0173| +| | |none | 0|bleu_diff |↑ | 0.5808|± |0.8548| +| | |none | 0|bleu_max |↑ |27.3910|± |0.8218| +| | |none | 0|rouge1_acc |↑ | 0.4198|± |0.0173| +| | |none | 0|rouge1_diff|↑ | 0.7303|± 
|1.0868| +| | |none | 0|rouge1_max |↑ |52.5810|± |0.9006| +| | |none | 0|rouge2_acc |↑ | 0.3635|± |0.0168| +| | |none | 0|rouge2_diff|↑ |-0.1698|± |1.2219| +| | |none | 0|rouge2_max |↑ |37.0078|± |1.0709| +| | |none | 0|rougeL_acc |↑ | 0.4186|± |0.0173| +| | |none | 0|rougeL_diff|↑ | 0.4753|± |1.0982| +| | |none | 0|rougeL_max |↑ |49.7474|± |0.9202| +|truthfulqa_mc1 | 2|none | 0|acc |↑ | 0.3219|± |0.0164| +|truthfulqa_mc2 | 3|none | 0|acc |↑ | 0.4676|± |0.0149| +|winogrande | 1|none | 0|acc |↑ | 0.7261|± |0.0125| + +| Groups |Version| Filter |n-shot| Metric | |Value | |Stderr| +|------------------|------:|----------|------|-----------|---|-----:|---|-----:| +|bbh | 3|get-answer| |exact_match|↑ |0.7120|± |0.0052| +|mmlu | 2|none | |acc |↑ |0.6893|± |0.0037| +| - humanities | 2|none | |acc |↑ |0.6142|± |0.0066| +| - other | 2|none | |acc |↑ |0.7451|± |0.0075| +| - social sciences| 2|none | |acc |↑ |0.7956|± |0.0071| +| - stem | 2|none | |acc |↑ |0.6426|± |0.0082| + +01-ai_Yi-1.5-9B: 11h 43m 41s +✅ Benchmark completed for 01-ai_Yi-1.5-9B + +🔥 Starting benchmark for 01-ai_Yi-1.5-6B, +Passed argument batch_size = auto:1. Detecting largest batch size +Determined largest batch size: 8 +Passed argument batch_size = auto. Detecting largest batch size +Determined Largest batch size: 9 +hf (pretrained=/mnt/data8tb/Documents/llm/llm_models/01-ai_Yi-1.5-6B,), gen_kwargs: (None), limit: None, num_fewshot: None, batch_size: auto (8) +| Tasks |Version| Filter |n-shot| Metric | | Value | |Stderr| +|----------------------------------------------------------|------:|-----------------|-----:|-----------|---|------:|---|-----:| +|anli_r1 | 1|none | 0|acc |↑ | 0.4480|± |0.0157| +|anli_r2 | 1|none | 0|acc |↑ | 0.4070|± |0.0155| +|anli_r3 | 1|none | 0|acc |↑ | 0.4067|± |0.0142| +|arc_challenge | 1|none | 0|acc |↑ | 0.4667|± |0.0146| +| | |none | 0|acc_norm |↑ | 0.4966|± |0.0146| +|bbh | 3|get-answer | |exact_match|↑ | 0.5755|± |0.0055| +| - bbh_cot_fewshot_boolean_expressions | 4|get-answer | 3|exact_match|↑ | 0.9200|± |0.0172| +| - bbh_cot_fewshot_causal_judgement | 4|get-answer | 3|exact_match|↑ | 0.5936|± |0.0360| +| - bbh_cot_fewshot_date_understanding | 4|get-answer | 3|exact_match|↑ | 0.7320|± |0.0281| +| - bbh_cot_fewshot_disambiguation_qa | 4|get-answer | 3|exact_match|↑ | 0.6040|± |0.0310| +| - bbh_cot_fewshot_dyck_languages | 4|get-answer | 3|exact_match|↑ | 0.1000|± |0.0190| +| - bbh_cot_fewshot_formal_fallacies | 4|get-answer | 3|exact_match|↑ | 0.4880|± |0.0317| +| - bbh_cot_fewshot_geometric_shapes | 4|get-answer | 3|exact_match|↑ | 0.3720|± |0.0306| +| - bbh_cot_fewshot_hyperbaton | 4|get-answer | 3|exact_match|↑ | 0.9040|± |0.0187| +| - bbh_cot_fewshot_logical_deduction_five_objects | 4|get-answer | 3|exact_match|↑ | 0.4520|± |0.0315| +| - bbh_cot_fewshot_logical_deduction_seven_objects | 4|get-answer | 3|exact_match|↑ | 0.3400|± |0.0300| +| - bbh_cot_fewshot_logical_deduction_three_objects | 4|get-answer | 3|exact_match|↑ | 0.7160|± |0.0286| +| - bbh_cot_fewshot_movie_recommendation | 4|get-answer | 3|exact_match|↑ | 0.7600|± |0.0271| +| - bbh_cot_fewshot_multistep_arithmetic_two | 4|get-answer | 3|exact_match|↑ | 0.6280|± |0.0306| +| - bbh_cot_fewshot_navigate | 4|get-answer | 3|exact_match|↑ | 0.8360|± |0.0235| +| - bbh_cot_fewshot_object_counting | 4|get-answer | 3|exact_match|↑ | 0.6800|± |0.0296| +| - bbh_cot_fewshot_penguins_in_a_table | 4|get-answer | 3|exact_match|↑ | 0.5890|± |0.0409| +| - bbh_cot_fewshot_reasoning_about_colored_objects | 4|get-answer | 3|exact_match|↑ | 0.5640|± 
|0.0314| +| - bbh_cot_fewshot_ruin_names | 4|get-answer | 3|exact_match|↑ | 0.4280|± |0.0314| +| - bbh_cot_fewshot_salient_translation_error_detection | 4|get-answer | 3|exact_match|↑ | 0.4400|± |0.0315| +| - bbh_cot_fewshot_snarks | 4|get-answer | 3|exact_match|↑ | 0.6236|± |0.0364| +| - bbh_cot_fewshot_sports_understanding | 4|get-answer | 3|exact_match|↑ | 0.8920|± |0.0197| +| - bbh_cot_fewshot_temporal_sequences | 4|get-answer | 3|exact_match|↑ | 0.5200|± |0.0317| +| - bbh_cot_fewshot_tracking_shuffled_objects_five_objects | 4|get-answer | 3|exact_match|↑ | 0.4320|± |0.0314| +| - bbh_cot_fewshot_tracking_shuffled_objects_seven_objects| 4|get-answer | 3|exact_match|↑ | 0.4040|± |0.0311| +| - bbh_cot_fewshot_tracking_shuffled_objects_three_objects| 4|get-answer | 3|exact_match|↑ | 0.3880|± |0.0309| +| - bbh_cot_fewshot_web_of_lies | 4|get-answer | 3|exact_match|↑ | 0.9080|± |0.0183| +| - bbh_cot_fewshot_word_sorting | 4|get-answer | 3|exact_match|↑ | 0.2480|± |0.0274| +|boolq | 2|none | 0|acc |↑ | 0.8015|± |0.0070| +|drop | 3|none | 0|em |↑ | 0.3668|± |0.0049| +| | |none | 0|f1 |↑ | 0.3995|± |0.0049| +|gpqa_diamond_cot_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.0859|± |0.0200| +| | |strict-match | 0|exact_match|↑ | 0.0404|± |0.0140| +|gpqa_diamond_cot_zeroshot | 1|flexible-extract | 0|exact_match|↑ | 0.1212|± |0.0233| +| | |strict-match | 0|exact_match|↑ | 0.0354|± |0.0132| +|gpqa_diamond_generative_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.2323|± |0.0301| +| | |strict-match | 0|exact_match|↑ | 0.0051|± |0.0051| +|gpqa_diamond_n_shot | 2|none | 0|acc |↑ | 0.3586|± |0.0342| +| | |none | 0|acc_norm |↑ | 0.3586|± |0.0342| +|gpqa_diamond_zeroshot | 1|none | 0|acc |↑ | 0.3232|± |0.0333| +| | |none | 0|acc_norm |↑ | 0.3232|± |0.0333| +|gpqa_extended_cot_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.1886|± |0.0168| +| | |strict-match | 0|exact_match|↑ | 0.0714|± |0.0110| +|gpqa_extended_cot_zeroshot | 1|flexible-extract | 0|exact_match|↑ | 0.2198|± |0.0177| +| | |strict-match | 0|exact_match|↑ | 0.1062|± |0.0132| +|gpqa_extended_generative_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.2601|± |0.0188| +| | |strict-match | 0|exact_match|↑ | 0.0055|± |0.0032| +|gpqa_extended_n_shot | 2|none | 0|acc |↑ | 0.3315|± |0.0202| +| | |none | 0|acc_norm |↑ | 0.3315|± |0.0202| +|gpqa_extended_zeroshot | 1|none | 0|acc |↑ | 0.2985|± |0.0196| +| | |none | 0|acc_norm |↑ | 0.2985|± |0.0196| +|gpqa_main_cot_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.1473|± |0.0168| +| | |strict-match | 0|exact_match|↑ | 0.0714|± |0.0122| +|gpqa_main_cot_zeroshot | 1|flexible-extract | 0|exact_match|↑ | 0.1741|± |0.0179| +| | |strict-match | 0|exact_match|↑ | 0.0871|± |0.0133| +|gpqa_main_generative_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.2277|± |0.0198| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_main_n_shot | 2|none | 0|acc |↑ | 0.3147|± |0.0220| +| | |none | 0|acc_norm |↑ | 0.3147|± |0.0220| +|gpqa_main_zeroshot | 1|none | 0|acc |↑ | 0.2902|± |0.0215| +| | |none | 0|acc_norm |↑ | 0.2902|± |0.0215| +|gsm8k | 3|flexible-extract | 5|exact_match|↑ | 0.5262|± |0.0138| +| | |strict-match | 5|exact_match|↑ | 0.5224|± |0.0138| +|hellaswag | 1|none | 0|acc |↑ | 0.5668|± |0.0049| +| | |none | 0|acc_norm |↑ | 0.7541|± |0.0043| +|mmlu | 2|none | |acc |↑ | 0.6243|± |0.0038| +| - humanities | 2|none | |acc |↑ | 0.5528|± |0.0067| +| - formal_logic | 1|none | 0|acc |↑ | 0.4841|± |0.0447| +| - high_school_european_history | 1|none | 0|acc |↑ | 0.7576|± |0.0335| +| - 
high_school_us_history | 1|none | 0|acc |↑ | 0.7990|± |0.0281| +| - high_school_world_history | 1|none | 0|acc |↑ | 0.7806|± |0.0269| +| - international_law | 1|none | 0|acc |↑ | 0.7851|± |0.0375| +| - jurisprudence | 1|none | 0|acc |↑ | 0.7222|± |0.0433| +| - logical_fallacies | 1|none | 0|acc |↑ | 0.7423|± |0.0344| +| - moral_disputes | 1|none | 0|acc |↑ | 0.6792|± |0.0251| +| - moral_scenarios | 1|none | 0|acc |↑ | 0.2425|± |0.0143| +| - philosophy | 1|none | 0|acc |↑ | 0.7203|± |0.0255| +| - prehistory | 1|none | 0|acc |↑ | 0.6883|± |0.0258| +| - professional_law | 1|none | 0|acc |↑ | 0.4824|± |0.0128| +| - world_religions | 1|none | 0|acc |↑ | 0.7836|± |0.0316| +| - other | 2|none | |acc |↑ | 0.6849|± |0.0081| +| - business_ethics | 1|none | 0|acc |↑ | 0.7000|± |0.0461| +| - clinical_knowledge | 1|none | 0|acc |↑ | 0.6717|± |0.0289| +| - college_medicine | 1|none | 0|acc |↑ | 0.6358|± |0.0367| +| - global_facts | 1|none | 0|acc |↑ | 0.3800|± |0.0488| +| - human_aging | 1|none | 0|acc |↑ | 0.6502|± |0.0320| +| - management | 1|none | 0|acc |↑ | 0.8058|± |0.0392| +| - marketing | 1|none | 0|acc |↑ | 0.8675|± |0.0222| +| - medical_genetics | 1|none | 0|acc |↑ | 0.6600|± |0.0476| +| - miscellaneous | 1|none | 0|acc |↑ | 0.8008|± |0.0143| +| - nutrition | 1|none | 0|acc |↑ | 0.6895|± |0.0265| +| - professional_accounting | 1|none | 0|acc |↑ | 0.4965|± |0.0298| +| - professional_medicine | 1|none | 0|acc |↑ | 0.6287|± |0.0293| +| - virology | 1|none | 0|acc |↑ | 0.5181|± |0.0389| +| - social sciences | 2|none | |acc |↑ | 0.7407|± |0.0077| +| - econometrics | 1|none | 0|acc |↑ | 0.4386|± |0.0467| +| - high_school_geography | 1|none | 0|acc |↑ | 0.7929|± |0.0289| +| - high_school_government_and_politics | 1|none | 0|acc |↑ | 0.8601|± |0.0250| +| - high_school_macroeconomics | 1|none | 0|acc |↑ | 0.7026|± |0.0232| +| - high_school_microeconomics | 1|none | 0|acc |↑ | 0.8025|± |0.0259| +| - high_school_psychology | 1|none | 0|acc |↑ | 0.8367|± |0.0158| +| - human_sexuality | 1|none | 0|acc |↑ | 0.6794|± |0.0409| +| - professional_psychology | 1|none | 0|acc |↑ | 0.6552|± |0.0192| +| - public_relations | 1|none | 0|acc |↑ | 0.6545|± |0.0455| +| - security_studies | 1|none | 0|acc |↑ | 0.7061|± |0.0292| +| - sociology | 1|none | 0|acc |↑ | 0.8159|± |0.0274| +| - us_foreign_policy | 1|none | 0|acc |↑ | 0.8600|± |0.0349| +| - stem | 2|none | |acc |↑ | 0.5576|± |0.0085| +| - abstract_algebra | 1|none | 0|acc |↑ | 0.3500|± |0.0479| +| - anatomy | 1|none | 0|acc |↑ | 0.5852|± |0.0426| +| - astronomy | 1|none | 0|acc |↑ | 0.6382|± |0.0391| +| - college_biology | 1|none | 0|acc |↑ | 0.7222|± |0.0375| +| - college_chemistry | 1|none | 0|acc |↑ | 0.4700|± |0.0502| +| - college_computer_science | 1|none | 0|acc |↑ | 0.5400|± |0.0501| +| - college_mathematics | 1|none | 0|acc |↑ | 0.4500|± |0.0500| +| - college_physics | 1|none | 0|acc |↑ | 0.4020|± |0.0488| +| - computer_security | 1|none | 0|acc |↑ | 0.7700|± |0.0423| +| - conceptual_physics | 1|none | 0|acc |↑ | 0.6596|± |0.0310| +| - electrical_engineering | 1|none | 0|acc |↑ | 0.6414|± |0.0400| +| - elementary_mathematics | 1|none | 0|acc |↑ | 0.5159|± |0.0257| +| - high_school_biology | 1|none | 0|acc |↑ | 0.7935|± |0.0230| +| - high_school_chemistry | 1|none | 0|acc |↑ | 0.5567|± |0.0350| +| - high_school_computer_science | 1|none | 0|acc |↑ | 0.7100|± |0.0456| +| - high_school_mathematics | 1|none | 0|acc |↑ | 0.3481|± |0.0290| +| - high_school_physics | 1|none | 0|acc |↑ | 0.4238|± |0.0403| +| - high_school_statistics | 1|none | 0|acc |↑ | 0.4583|± 
|0.0340| +| - machine_learning | 1|none | 0|acc |↑ | 0.4375|± |0.0471| +|nq_open | 4|remove_whitespace| 0|exact_match|↑ | 0.1781|± |0.0064| +|openbookqa | 1|none | 0|acc |↑ | 0.3200|± |0.0209| +| | |none | 0|acc_norm |↑ | 0.4220|± |0.0221| +|piqa | 1|none | 0|acc |↑ | 0.7856|± |0.0096| +| | |none | 0|acc_norm |↑ | 0.8014|± |0.0093| +|qnli | 1|none | 0|acc |↑ | 0.5986|± |0.0066| +|sciq | 1|none | 0|acc |↑ | 0.9540|± |0.0066| +| | |none | 0|acc_norm |↑ | 0.9410|± |0.0075| +|triviaqa | 3|remove_whitespace| 0|exact_match|↑ | 0.4952|± |0.0037| +|truthfulqa_gen | 3|none | 0|bleu_acc |↑ | 0.5129|± |0.0175| +| | |none | 0|bleu_diff |↑ | 8.5287|± |1.1119| +| | |none | 0|bleu_max |↑ |33.0037|± |0.8799| +| | |none | 0|rouge1_acc |↑ | 0.4835|± |0.0175| +| | |none | 0|rouge1_diff|↑ |12.2235|± |1.5196| +| | |none | 0|rouge1_max |↑ |57.2896|± |1.0008| +| | |none | 0|rouge2_acc |↑ | 0.4370|± |0.0174| +| | |none | 0|rouge2_diff|↑ |12.2809|± |1.6323| +| | |none | 0|rouge2_max |↑ |44.2123|± |1.2026| +| | |none | 0|rougeL_acc |↑ | 0.4663|± |0.0175| +| | |none | 0|rougeL_diff|↑ |12.0600|± |1.5359| +| | |none | 0|rougeL_max |↑ |55.2797|± |1.0310| +|truthfulqa_mc1 | 2|none | 0|acc |↑ | 0.2999|± |0.0160| +|truthfulqa_mc2 | 3|none | 0|acc |↑ | 0.4408|± |0.0148| +|winogrande | 1|none | 0|acc |↑ | 0.7206|± |0.0126| + +| Groups |Version| Filter |n-shot| Metric | |Value | |Stderr| +|------------------|------:|----------|------|-----------|---|-----:|---|-----:| +|bbh | 3|get-answer| |exact_match|↑ |0.5755|± |0.0055| +|mmlu | 2|none | |acc |↑ |0.6243|± |0.0038| +| - humanities | 2|none | |acc |↑ |0.5528|± |0.0067| +| - other | 2|none | |acc |↑ |0.6849|± |0.0081| +| - social sciences| 2|none | |acc |↑ |0.7407|± |0.0077| +| - stem | 2|none | |acc |↑ |0.5576|± |0.0085| + +01-ai_Yi-1.5-6B,: 4h 28m 24s +✅ Benchmark completed for 01-ai_Yi-1.5-6B, + +🔥 Starting benchmark for Qwen_Qwen2.5-7B-Instruct-1M, +Passed argument batch_size = auto:1. Detecting largest batch size +Determined largest batch size: 1 +Passed argument batch_size = auto. 
Detecting largest batch size +Determined Largest batch size: 1 +hf (pretrained=/mnt/data8tb/Documents/llm/llm_models/Qwen_Qwen2.5-7B-Instruct-1M,), gen_kwargs: (None), limit: None, num_fewshot: None, batch_size: auto (1) +| Tasks |Version| Filter |n-shot| Metric | | Value | |Stderr| +|----------------------------------------------------------|------:|-----------------|-----:|-----------|---|------:|---|-----:| +|anli_r1 | 1|none | 0|acc |↑ | 0.5850|± |0.0156| +|anli_r2 | 1|none | 0|acc |↑ | 0.5330|± |0.0158| +|anli_r3 | 1|none | 0|acc |↑ | 0.5567|± |0.0143| +|arc_challenge | 1|none | 0|acc |↑ | 0.5503|± |0.0145| +| | |none | 0|acc_norm |↑ | 0.5853|± |0.0144| +|bbh | 3|get-answer | |exact_match|↑ | 0.2772|± |0.0043| +| - bbh_cot_fewshot_boolean_expressions | 4|get-answer | 3|exact_match|↑ | 0.0000|± |0.0000| +| - bbh_cot_fewshot_causal_judgement | 4|get-answer | 3|exact_match|↑ | 0.5668|± |0.0363| +| - bbh_cot_fewshot_date_understanding | 4|get-answer | 3|exact_match|↑ | 0.7200|± |0.0285| +| - bbh_cot_fewshot_disambiguation_qa | 4|get-answer | 3|exact_match|↑ | 0.0000|± |0.0000| +| - bbh_cot_fewshot_dyck_languages | 4|get-answer | 3|exact_match|↑ | 0.0440|± |0.0130| +| - bbh_cot_fewshot_formal_fallacies | 4|get-answer | 3|exact_match|↑ | 0.0480|± |0.0135| +| - bbh_cot_fewshot_geometric_shapes | 4|get-answer | 3|exact_match|↑ | 0.1480|± |0.0225| +| - bbh_cot_fewshot_hyperbaton | 4|get-answer | 3|exact_match|↑ | 0.0040|± |0.0040| +| - bbh_cot_fewshot_logical_deduction_five_objects | 4|get-answer | 3|exact_match|↑ | 0.4240|± |0.0313| +| - bbh_cot_fewshot_logical_deduction_seven_objects | 4|get-answer | 3|exact_match|↑ | 0.3320|± |0.0298| +| - bbh_cot_fewshot_logical_deduction_three_objects | 4|get-answer | 3|exact_match|↑ | 0.8440|± |0.0230| +| - bbh_cot_fewshot_movie_recommendation | 4|get-answer | 3|exact_match|↑ | 0.5440|± |0.0316| +| - bbh_cot_fewshot_multistep_arithmetic_two | 4|get-answer | 3|exact_match|↑ | 0.0040|± |0.0040| +| - bbh_cot_fewshot_navigate | 4|get-answer | 3|exact_match|↑ | 0.0000|± |0.0000| +| - bbh_cot_fewshot_object_counting | 4|get-answer | 3|exact_match|↑ | 0.3280|± |0.0298| +| - bbh_cot_fewshot_penguins_in_a_table | 4|get-answer | 3|exact_match|↑ | 0.5616|± |0.0412| +| - bbh_cot_fewshot_reasoning_about_colored_objects | 4|get-answer | 3|exact_match|↑ | 0.0120|± |0.0069| +| - bbh_cot_fewshot_ruin_names | 4|get-answer | 3|exact_match|↑ | 0.2920|± |0.0288| +| - bbh_cot_fewshot_salient_translation_error_detection | 4|get-answer | 3|exact_match|↑ | 0.5120|± |0.0317| +| - bbh_cot_fewshot_snarks | 4|get-answer | 3|exact_match|↑ | 0.7247|± |0.0336| +| - bbh_cot_fewshot_sports_understanding | 4|get-answer | 3|exact_match|↑ | 0.8800|± |0.0206| +| - bbh_cot_fewshot_temporal_sequences | 4|get-answer | 3|exact_match|↑ | 0.0000|± |0.0000| +| - bbh_cot_fewshot_tracking_shuffled_objects_five_objects | 4|get-answer | 3|exact_match|↑ | 0.2200|± |0.0263| +| - bbh_cot_fewshot_tracking_shuffled_objects_seven_objects| 4|get-answer | 3|exact_match|↑ | 0.0920|± |0.0183| +| - bbh_cot_fewshot_tracking_shuffled_objects_three_objects| 4|get-answer | 3|exact_match|↑ | 0.3280|± |0.0298| +| - bbh_cot_fewshot_web_of_lies | 4|get-answer | 3|exact_match|↑ | 0.0000|± |0.0000| +| - bbh_cot_fewshot_word_sorting | 4|get-answer | 3|exact_match|↑ | 0.1760|± |0.0241| +|boolq | 2|none | 0|acc |↑ | 0.8526|± |0.0062| +|drop | 3|none | 0|em |↑ | 0.0023|± |0.0005| +| | |none | 0|f1 |↑ | 0.0570|± |0.0014| +|gpqa_diamond_cot_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.1414|± |0.0248| +| | |strict-match | 
0|exact_match|↑ | 0.0101|± |0.0071| +|gpqa_diamond_cot_zeroshot | 1|flexible-extract | 0|exact_match|↑ | 0.1111|± |0.0224| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_diamond_generative_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.2374|± |0.0303| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_diamond_n_shot | 2|none | 0|acc |↑ | 0.3030|± |0.0327| +| | |none | 0|acc_norm |↑ | 0.3030|± |0.0327| +|gpqa_diamond_zeroshot | 1|none | 0|acc |↑ | 0.2980|± |0.0326| +| | |none | 0|acc_norm |↑ | 0.2980|± |0.0326| +|gpqa_extended_cot_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.1758|± |0.0163| +| | |strict-match | 0|exact_match|↑ | 0.0018|± |0.0018| +|gpqa_extended_cot_zeroshot | 1|flexible-extract | 0|exact_match|↑ | 0.1190|± |0.0139| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_extended_generative_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.2143|± |0.0176| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_extended_n_shot | 2|none | 0|acc |↑ | 0.3388|± |0.0203| +| | |none | 0|acc_norm |↑ | 0.3388|± |0.0203| +|gpqa_extended_zeroshot | 1|none | 0|acc |↑ | 0.3388|± |0.0203| +| | |none | 0|acc_norm |↑ | 0.3388|± |0.0203| +|gpqa_main_cot_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.1719|± |0.0178| +| | |strict-match | 0|exact_match|↑ | 0.0022|± |0.0022| +|gpqa_main_cot_zeroshot | 1|flexible-extract | 0|exact_match|↑ | 0.1138|± |0.0150| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_main_generative_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.2299|± |0.0199| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_main_n_shot | 2|none | 0|acc |↑ | 0.3326|± |0.0223| +| | |none | 0|acc_norm |↑ | 0.3326|± |0.0223| +|gpqa_main_zeroshot | 1|none | 0|acc |↑ | 0.3393|± |0.0224| +| | |none | 0|acc_norm |↑ | 0.3393|± |0.0224| +|gsm8k | 3|flexible-extract | 5|exact_match|↑ | 0.8234|± |0.0105| +| | |strict-match | 5|exact_match|↑ | 0.7953|± |0.0111| +|hellaswag | 1|none | 0|acc |↑ | 0.5987|± |0.0049| +| | |none | 0|acc_norm |↑ | 0.7900|± |0.0041| +|mmlu | 2|none | |acc |↑ | 0.7166|± |0.0036| +| - humanities | 2|none | |acc |↑ | 0.6361|± |0.0066| +| - formal_logic | 1|none | 0|acc |↑ | 0.5000|± |0.0447| +| - high_school_european_history | 1|none | 0|acc |↑ | 0.8182|± |0.0301| +| - high_school_us_history | 1|none | 0|acc |↑ | 0.8627|± |0.0242| +| - high_school_world_history | 1|none | 0|acc |↑ | 0.8650|± |0.0222| +| - international_law | 1|none | 0|acc |↑ | 0.8347|± |0.0339| +| - jurisprudence | 1|none | 0|acc |↑ | 0.7778|± |0.0402| +| - logical_fallacies | 1|none | 0|acc |↑ | 0.8344|± |0.0292| +| - moral_disputes | 1|none | 0|acc |↑ | 0.7832|± |0.0222| +| - moral_scenarios | 1|none | 0|acc |↑ | 0.4201|± |0.0165| +| - philosophy | 1|none | 0|acc |↑ | 0.7492|± |0.0246| +| - prehistory | 1|none | 0|acc |↑ | 0.8086|± |0.0219| +| - professional_law | 1|none | 0|acc |↑ | 0.5248|± |0.0128| +| - world_religions | 1|none | 0|acc |↑ | 0.8538|± |0.0271| +| - other | 2|none | |acc |↑ | 0.7634|± |0.0074| +| - business_ethics | 1|none | 0|acc |↑ | 0.7600|± |0.0429| +| - clinical_knowledge | 1|none | 0|acc |↑ | 0.7736|± |0.0258| +| - college_medicine | 1|none | 0|acc |↑ | 0.7052|± |0.0348| +| - global_facts | 1|none | 0|acc |↑ | 0.4700|± |0.0502| +| - human_aging | 1|none | 0|acc |↑ | 0.7220|± |0.0301| +| - management | 1|none | 0|acc |↑ | 0.8641|± |0.0339| +| - marketing | 1|none | 0|acc |↑ | 0.9188|± |0.0179| +| - medical_genetics | 1|none | 0|acc |↑ | 0.8500|± |0.0359| +| - miscellaneous | 1|none | 0|acc |↑ | 
0.8493|± |0.0128| +| - nutrition | 1|none | 0|acc |↑ | 0.7843|± |0.0236| +| - professional_accounting | 1|none | 0|acc |↑ | 0.6064|± |0.0291| +| - professional_medicine | 1|none | 0|acc |↑ | 0.7831|± |0.0250| +| - virology | 1|none | 0|acc |↑ | 0.5000|± |0.0389| +| - social sciences | 2|none | |acc |↑ | 0.8229|± |0.0068| +| - econometrics | 1|none | 0|acc |↑ | 0.6754|± |0.0440| +| - high_school_geography | 1|none | 0|acc |↑ | 0.8636|± |0.0245| +| - high_school_government_and_politics | 1|none | 0|acc |↑ | 0.9223|± |0.0193| +| - high_school_macroeconomics | 1|none | 0|acc |↑ | 0.7821|± |0.0209| +| - high_school_microeconomics | 1|none | 0|acc |↑ | 0.8655|± |0.0222| +| - high_school_psychology | 1|none | 0|acc |↑ | 0.8936|± |0.0132| +| - human_sexuality | 1|none | 0|acc |↑ | 0.8092|± |0.0345| +| - professional_psychology | 1|none | 0|acc |↑ | 0.7598|± |0.0173| +| - public_relations | 1|none | 0|acc |↑ | 0.7091|± |0.0435| +| - security_studies | 1|none | 0|acc |↑ | 0.7796|± |0.0265| +| - sociology | 1|none | 0|acc |↑ | 0.8955|± |0.0216| +| - us_foreign_policy | 1|none | 0|acc |↑ | 0.8800|± |0.0327| +| - stem | 2|none | |acc |↑ | 0.6870|± |0.0080| +| - abstract_algebra | 1|none | 0|acc |↑ | 0.5600|± |0.0499| +| - anatomy | 1|none | 0|acc |↑ | 0.6963|± |0.0397| +| - astronomy | 1|none | 0|acc |↑ | 0.8158|± |0.0315| +| - college_biology | 1|none | 0|acc |↑ | 0.8542|± |0.0295| +| - college_chemistry | 1|none | 0|acc |↑ | 0.5500|± |0.0500| +| - college_computer_science | 1|none | 0|acc |↑ | 0.7200|± |0.0451| +| - college_mathematics | 1|none | 0|acc |↑ | 0.4400|± |0.0499| +| - college_physics | 1|none | 0|acc |↑ | 0.5392|± |0.0496| +| - computer_security | 1|none | 0|acc |↑ | 0.8000|± |0.0402| +| - conceptual_physics | 1|none | 0|acc |↑ | 0.7277|± |0.0291| +| - electrical_engineering | 1|none | 0|acc |↑ | 0.7172|± |0.0375| +| - elementary_mathematics | 1|none | 0|acc |↑ | 0.6534|± |0.0245| +| - high_school_biology | 1|none | 0|acc |↑ | 0.8806|± |0.0184| +| - high_school_chemistry | 1|none | 0|acc |↑ | 0.6601|± |0.0333| +| - high_school_computer_science | 1|none | 0|acc |↑ | 0.8400|± |0.0368| +| - high_school_mathematics | 1|none | 0|acc |↑ | 0.5630|± |0.0302| +| - high_school_physics | 1|none | 0|acc |↑ | 0.5695|± |0.0404| +| - high_school_statistics | 1|none | 0|acc |↑ | 0.6898|± |0.0315| +| - machine_learning | 1|none | 0|acc |↑ | 0.5625|± |0.0471| +|nq_open | 4|remove_whitespace| 0|exact_match|↑ | 0.1576|± |0.0061| +|openbookqa | 1|none | 0|acc |↑ | 0.3580|± |0.0215| +| | |none | 0|acc_norm |↑ | 0.4800|± |0.0224| +|piqa | 1|none | 0|acc |↑ | 0.8009|± |0.0093| +| | |none | 0|acc_norm |↑ | 0.8161|± |0.0090| +|qnli | 1|none | 0|acc |↑ | 0.6782|± |0.0063| +|sciq | 1|none | 0|acc |↑ | 0.9630|± |0.0060| +| | |none | 0|acc_norm |↑ | 0.9500|± |0.0069| +|triviaqa | 3|remove_whitespace| 0|exact_match|↑ | 0.4205|± |0.0037| +|truthfulqa_gen | 3|none | 0|bleu_acc |↑ | 0.4908|± |0.0175| +| | |none | 0|bleu_diff |↑ | 0.0819|± |0.3040| +| | |none | 0|bleu_max |↑ |10.5933|± |0.4922| +| | |none | 0|rouge1_acc |↑ | 0.5067|± |0.0175| +| | |none | 0|rouge1_diff|↑ | 0.1716|± |0.4296| +| | |none | 0|rouge1_max |↑ |31.0117|± |0.6908| +| | |none | 0|rouge2_acc |↑ | 0.3953|± |0.0171| +| | |none | 0|rouge2_diff|↑ |-0.6488|± |0.4941| +| | |none | 0|rouge2_max |↑ |18.4106|± |0.6992| +| | |none | 0|rougeL_acc |↑ | 0.4982|± |0.0175| +| | |none | 0|rougeL_diff|↑ |-0.0253|± |0.4232| +| | |none | 0|rougeL_max |↑ |27.9487|± |0.6871| +|truthfulqa_mc1 | 2|none | 0|acc |↑ | 0.4259|± |0.0173| +|truthfulqa_mc2 | 3|none | 0|acc |↑ | 
0.6001|± |0.0154| +|winogrande | 1|none | 0|acc |↑ | 0.7277|± |0.0125| + +| Groups |Version| Filter |n-shot| Metric | |Value | |Stderr| +|------------------|------:|----------|------|-----------|---|-----:|---|-----:| +|bbh | 3|get-answer| |exact_match|↑ |0.2772|± |0.0043| +|mmlu | 2|none | |acc |↑ |0.7166|± |0.0036| +| - humanities | 2|none | |acc |↑ |0.6361|± |0.0066| +| - other | 2|none | |acc |↑ |0.7634|± |0.0074| +| - social sciences| 2|none | |acc |↑ |0.8229|± |0.0068| +| - stem | 2|none | |acc |↑ |0.6870|± |0.0080| + +Qwen_Qwen2.5-7B-Instruct-1M,: 11h 17m 22s +✅ Benchmark completed for Qwen_Qwen2.5-7B-Instruct-1M, + +🔥 Starting benchmark for Qwen_Qwen3-8B, +Passed argument batch_size = auto:1. Detecting largest batch size +Determined largest batch size: 1 +Passed argument batch_size = auto. Detecting largest batch size +Determined Largest batch size: 1 +hf (pretrained=/mnt/data8tb/Documents/llm/llm_models/Qwen_Qwen3-8B,), gen_kwargs: (None), limit: None, num_fewshot: None, batch_size: auto (1) +| Tasks |Version| Filter |n-shot| Metric | | Value | |Stderr| +|----------------------------------------------------------|------:|-----------------|-----:|-----------|---|------:|---|-----:| +|anli_r1 | 1|none | 0|acc |↑ | 0.6690|± |0.0149| +|anli_r2 | 1|none | 0|acc |↑ | 0.5420|± |0.0158| +|anli_r3 | 1|none | 0|acc |↑ | 0.5558|± |0.0143| +|arc_challenge | 1|none | 0|acc |↑ | 0.5546|± |0.0145| +| | |none | 0|acc_norm |↑ | 0.5623|± |0.0145| +|bbh | 3|get-answer | |exact_match|↑ | 0.7976|± |0.0045| +| - bbh_cot_fewshot_boolean_expressions | 4|get-answer | 3|exact_match|↑ | 0.9800|± |0.0089| +| - bbh_cot_fewshot_causal_judgement | 4|get-answer | 3|exact_match|↑ | 0.5936|± |0.0360| +| - bbh_cot_fewshot_date_understanding | 4|get-answer | 3|exact_match|↑ | 0.8520|± |0.0225| +| - bbh_cot_fewshot_disambiguation_qa | 4|get-answer | 3|exact_match|↑ | 0.5960|± |0.0311| +| - bbh_cot_fewshot_dyck_languages | 4|get-answer | 3|exact_match|↑ | 0.4440|± |0.0315| +| - bbh_cot_fewshot_formal_fallacies | 4|get-answer | 3|exact_match|↑ | 0.6800|± |0.0296| +| - bbh_cot_fewshot_geometric_shapes | 4|get-answer | 3|exact_match|↑ | 0.5920|± |0.0311| +| - bbh_cot_fewshot_hyperbaton | 4|get-answer | 3|exact_match|↑ | 0.9480|± |0.0141| +| - bbh_cot_fewshot_logical_deduction_five_objects | 4|get-answer | 3|exact_match|↑ | 0.7160|± |0.0286| +| - bbh_cot_fewshot_logical_deduction_seven_objects | 4|get-answer | 3|exact_match|↑ | 0.4080|± |0.0311| +| - bbh_cot_fewshot_logical_deduction_three_objects | 4|get-answer | 3|exact_match|↑ | 0.9640|± |0.0118| +| - bbh_cot_fewshot_movie_recommendation | 4|get-answer | 3|exact_match|↑ | 0.7640|± |0.0269| +| - bbh_cot_fewshot_multistep_arithmetic_two | 4|get-answer | 3|exact_match|↑ | 0.9840|± |0.0080| +| - bbh_cot_fewshot_navigate | 4|get-answer | 3|exact_match|↑ | 0.9600|± |0.0124| +| - bbh_cot_fewshot_object_counting | 4|get-answer | 3|exact_match|↑ | 0.9000|± |0.0190| +| - bbh_cot_fewshot_penguins_in_a_table | 4|get-answer | 3|exact_match|↑ | 0.8836|± |0.0266| +| - bbh_cot_fewshot_reasoning_about_colored_objects | 4|get-answer | 3|exact_match|↑ | 0.8640|± |0.0217| +| - bbh_cot_fewshot_ruin_names | 4|get-answer | 3|exact_match|↑ | 0.7960|± |0.0255| +| - bbh_cot_fewshot_salient_translation_error_detection | 4|get-answer | 3|exact_match|↑ | 0.5880|± |0.0312| +| - bbh_cot_fewshot_snarks | 4|get-answer | 3|exact_match|↑ | 0.7079|± |0.0342| +| - bbh_cot_fewshot_sports_understanding | 4|get-answer | 3|exact_match|↑ | 0.8640|± |0.0217| +| - bbh_cot_fewshot_temporal_sequences | 
4|get-answer | 3|exact_match|↑ | 0.9280|± |0.0164| +| - bbh_cot_fewshot_tracking_shuffled_objects_five_objects | 4|get-answer | 3|exact_match|↑ | 0.9640|± |0.0118| +| - bbh_cot_fewshot_tracking_shuffled_objects_seven_objects| 4|get-answer | 3|exact_match|↑ | 0.8960|± |0.0193| +| - bbh_cot_fewshot_tracking_shuffled_objects_three_objects| 4|get-answer | 3|exact_match|↑ | 0.9960|± |0.0040| +| - bbh_cot_fewshot_web_of_lies | 4|get-answer | 3|exact_match|↑ | 1.0000|± |0.0000| +| - bbh_cot_fewshot_word_sorting | 4|get-answer | 3|exact_match|↑ | 0.6240|± |0.0307| +|boolq | 2|none | 0|acc |↑ | 0.8657|± |0.0060| +|drop | 3|none | 0|em |↑ | 0.0034|± |0.0006| +| | |none | 0|f1 |↑ | 0.1099|± |0.0020| +|gpqa_diamond_cot_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.1162|± |0.0228| +| | |strict-match | 0|exact_match|↑ | 0.0051|± |0.0051| +|gpqa_diamond_cot_zeroshot | 1|flexible-extract | 0|exact_match|↑ | 0.0354|± |0.0132| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_diamond_generative_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.2828|± |0.0321| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_diamond_n_shot | 2|none | 0|acc |↑ | 0.3434|± |0.0338| +| | |none | 0|acc_norm |↑ | 0.3434|± |0.0338| +|gpqa_diamond_zeroshot | 1|none | 0|acc |↑ | 0.3434|± |0.0338| +| | |none | 0|acc_norm |↑ | 0.3434|± |0.0338| +|gpqa_extended_cot_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.1227|± |0.0141| +| | |strict-match | 0|exact_match|↑ | 0.0073|± |0.0037| +|gpqa_extended_cot_zeroshot | 1|flexible-extract | 0|exact_match|↑ | 0.0733|± |0.0112| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_extended_generative_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.2692|± |0.0190| +| | |strict-match | 0|exact_match|↑ | 0.0037|± |0.0026| +|gpqa_extended_n_shot | 2|none | 0|acc |↑ | 0.3608|± |0.0206| +| | |none | 0|acc_norm |↑ | 0.3608|± |0.0206| +|gpqa_extended_zeroshot | 1|none | 0|acc |↑ | 0.3828|± |0.0208| +| | |none | 0|acc_norm |↑ | 0.3828|± |0.0208| +|gpqa_main_cot_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.1295|± |0.0159| +| | |strict-match | 0|exact_match|↑ | 0.0134|± |0.0054| +|gpqa_main_cot_zeroshot | 1|flexible-extract | 0|exact_match|↑ | 0.0647|± |0.0116| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_main_generative_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.2634|± |0.0208| +| | |strict-match | 0|exact_match|↑ | 0.0022|± |0.0022| +|gpqa_main_n_shot | 2|none | 0|acc |↑ | 0.3884|± |0.0231| +| | |none | 0|acc_norm |↑ | 0.3884|± |0.0231| +|gpqa_main_zeroshot | 1|none | 0|acc |↑ | 0.3504|± |0.0226| +| | |none | 0|acc_norm |↑ | 0.3504|± |0.0226| +|gsm8k | 3|flexible-extract | 5|exact_match|↑ | 0.8787|± |0.0090| +| | |strict-match | 5|exact_match|↑ | 0.8726|± |0.0092| +|hellaswag | 1|none | 0|acc |↑ | 0.5711|± |0.0049| +| | |none | 0|acc_norm |↑ | 0.7487|± |0.0043| +|mmlu | 2|none | |acc |↑ | 0.7290|± |0.0035| +| - humanities | 2|none | |acc |↑ | 0.6383|± |0.0065| +| - formal_logic | 1|none | 0|acc |↑ | 0.6032|± |0.0438| +| - high_school_european_history | 1|none | 0|acc |↑ | 0.8788|± |0.0255| +| - high_school_us_history | 1|none | 0|acc |↑ | 0.8824|± |0.0226| +| - high_school_world_history | 1|none | 0|acc |↑ | 0.8734|± |0.0216| +| - international_law | 1|none | 0|acc |↑ | 0.8182|± |0.0352| +| - jurisprudence | 1|none | 0|acc |↑ | 0.8056|± |0.0383| +| - logical_fallacies | 1|none | 0|acc |↑ | 0.8405|± |0.0288| +| - moral_disputes | 1|none | 0|acc |↑ | 0.7399|± |0.0236| +| - moral_scenarios | 1|none | 0|acc |↑ | 0.4101|± |0.0164| 
+| - philosophy | 1|none | 0|acc |↑ | 0.7878|± |0.0232| +| - prehistory | 1|none | 0|acc |↑ | 0.8395|± |0.0204| +| - professional_law | 1|none | 0|acc |↑ | 0.5111|± |0.0128| +| - world_religions | 1|none | 0|acc |↑ | 0.8655|± |0.0262| +| - other | 2|none | |acc |↑ | 0.7702|± |0.0072| +| - business_ethics | 1|none | 0|acc |↑ | 0.7500|± |0.0435| +| - clinical_knowledge | 1|none | 0|acc |↑ | 0.7925|± |0.0250| +| - college_medicine | 1|none | 0|acc |↑ | 0.7572|± |0.0327| +| - global_facts | 1|none | 0|acc |↑ | 0.4100|± |0.0494| +| - human_aging | 1|none | 0|acc |↑ | 0.7309|± |0.0298| +| - management | 1|none | 0|acc |↑ | 0.8835|± |0.0318| +| - marketing | 1|none | 0|acc |↑ | 0.9274|± |0.0170| +| - medical_genetics | 1|none | 0|acc |↑ | 0.8100|± |0.0394| +| - miscellaneous | 1|none | 0|acc |↑ | 0.8557|± |0.0126| +| - nutrition | 1|none | 0|acc |↑ | 0.7810|± |0.0237| +| - professional_accounting | 1|none | 0|acc |↑ | 0.5745|± |0.0295| +| - professional_medicine | 1|none | 0|acc |↑ | 0.8199|± |0.0233| +| - virology | 1|none | 0|acc |↑ | 0.5422|± |0.0388| +| - social sciences | 2|none | |acc |↑ | 0.8294|± |0.0067| +| - econometrics | 1|none | 0|acc |↑ | 0.6754|± |0.0440| +| - high_school_geography | 1|none | 0|acc |↑ | 0.8535|± |0.0252| +| - high_school_government_and_politics | 1|none | 0|acc |↑ | 0.9326|± |0.0181| +| - high_school_macroeconomics | 1|none | 0|acc |↑ | 0.7949|± |0.0205| +| - high_school_microeconomics | 1|none | 0|acc |↑ | 0.9160|± |0.0180| +| - high_school_psychology | 1|none | 0|acc |↑ | 0.9064|± |0.0125| +| - human_sexuality | 1|none | 0|acc |↑ | 0.8473|± |0.0315| +| - professional_psychology | 1|none | 0|acc |↑ | 0.7533|± |0.0174| +| - public_relations | 1|none | 0|acc |↑ | 0.7091|± |0.0435| +| - security_studies | 1|none | 0|acc |↑ | 0.7755|± |0.0267| +| - sociology | 1|none | 0|acc |↑ | 0.8856|± |0.0225| +| - us_foreign_policy | 1|none | 0|acc |↑ | 0.8600|± |0.0349| +| - stem | 2|none | |acc |↑ | 0.7257|± |0.0077| +| - abstract_algebra | 1|none | 0|acc |↑ | 0.5700|± |0.0498| +| - anatomy | 1|none | 0|acc |↑ | 0.7037|± |0.0394| +| - astronomy | 1|none | 0|acc |↑ | 0.8684|± |0.0275| +| - college_biology | 1|none | 0|acc |↑ | 0.8542|± |0.0295| +| - college_chemistry | 1|none | 0|acc |↑ | 0.5900|± |0.0494| +| - college_computer_science | 1|none | 0|acc |↑ | 0.7300|± |0.0446| +| - college_mathematics | 1|none | 0|acc |↑ | 0.5900|± |0.0494| +| - college_physics | 1|none | 0|acc |↑ | 0.5686|± |0.0493| +| - computer_security | 1|none | 0|acc |↑ | 0.8200|± |0.0386| +| - conceptual_physics | 1|none | 0|acc |↑ | 0.8255|± |0.0248| +| - electrical_engineering | 1|none | 0|acc |↑ | 0.7310|± |0.0370| +| - elementary_mathematics | 1|none | 0|acc |↑ | 0.7011|± |0.0236| +| - high_school_biology | 1|none | 0|acc |↑ | 0.9129|± |0.0160| +| - high_school_chemistry | 1|none | 0|acc |↑ | 0.7241|± |0.0314| +| - high_school_computer_science | 1|none | 0|acc |↑ | 0.8800|± |0.0327| +| - high_school_mathematics | 1|none | 0|acc |↑ | 0.5074|± |0.0305| +| - high_school_physics | 1|none | 0|acc |↑ | 0.7020|± |0.0373| +| - high_school_statistics | 1|none | 0|acc |↑ | 0.7315|± |0.0302| +| - machine_learning | 1|none | 0|acc |↑ | 0.5893|± |0.0467| +|nq_open | 4|remove_whitespace| 0|exact_match|↑ | 0.0737|± |0.0043| +|openbookqa | 1|none | 0|acc |↑ | 0.3160|± |0.0208| +| | |none | 0|acc_norm |↑ | 0.4180|± |0.0221| +|piqa | 1|none | 0|acc |↑ | 0.7644|± |0.0099| +| | |none | 0|acc_norm |↑ | 0.7753|± |0.0097| +|qnli | 1|none | 0|acc |↑ | 0.7818|± |0.0056| +|sciq | 1|none | 0|acc |↑ | 0.9670|± |0.0057| +| | |none 
| 0|acc_norm |↑ | 0.9580|± |0.0063| +|triviaqa | 3|remove_whitespace| 0|exact_match|↑ | 0.3206|± |0.0035| +|truthfulqa_gen | 3|none | 0|bleu_acc |↑ | 0.6022|± |0.0171| +| | |none | 0|bleu_diff |↑ |16.3978|± |1.0966| +| | |none | 0|bleu_max |↑ |35.5543|± |0.8937| +| | |none | 0|rouge1_acc |↑ | 0.6083|± |0.0171| +| | |none | 0|rouge1_diff|↑ |23.2877|± |1.5315| +| | |none | 0|rouge1_max |↑ |62.0759|± |0.9426| +| | |none | 0|rouge2_acc |↑ | 0.5704|± |0.0173| +| | |none | 0|rouge2_diff|↑ |23.9536|± |1.6740| +| | |none | 0|rouge2_max |↑ |50.5507|± |1.1698| +| | |none | 0|rougeL_acc |↑ | 0.6120|± |0.0171| +| | |none | 0|rougeL_diff|↑ |23.4988|± |1.5413| +| | |none | 0|rougeL_max |↑ |59.9677|± |0.9917| +|truthfulqa_mc1 | 2|none | 0|acc |↑ | 0.3635|± |0.0168| +|truthfulqa_mc2 | 3|none | 0|acc |↑ | 0.5431|± |0.0158| +|winogrande | 1|none | 0|acc |↑ | 0.6803|± |0.0131| + +| Groups |Version| Filter |n-shot| Metric | |Value | |Stderr| +|------------------|------:|----------|------|-----------|---|-----:|---|-----:| +|bbh | 3|get-answer| |exact_match|↑ |0.7976|± |0.0045| +|mmlu | 2|none | |acc |↑ |0.7290|± |0.0035| +| - humanities | 2|none | |acc |↑ |0.6383|± |0.0065| +| - other | 2|none | |acc |↑ |0.7702|± |0.0072| +| - social sciences| 2|none | |acc |↑ |0.8294|± |0.0067| +| - stem | 2|none | |acc |↑ |0.7257|± |0.0077| + +Qwen_Qwen3-8B,: 15h 32m 7s +✅ Benchmark completed for Qwen_Qwen3-8B, + +🔥 Starting benchmark for Qwen_Qwen3-8B-FP8, +Passed argument batch_size = auto:1. Detecting largest batch size +Qwen_Qwen3-8B-FP8,: 0h 5m 29s +✅ Benchmark completed for Qwen_Qwen3-8B-FP8, + +🔥 Starting benchmark for Qwen_Qwen2.5-Math-7B-Instruct, +Passed argument batch_size = auto:1. Detecting largest batch size +Determined largest batch size: 4 +Passed argument batch_size = auto. 
Detecting largest batch size +Determined Largest batch size: 4 +hf (pretrained=/mnt/data8tb/Documents/llm/llm_models/Qwen_Qwen2.5-Math-7B-Instruct,), gen_kwargs: (None), limit: None, num_fewshot: None, batch_size: auto (4) +| Tasks |Version| Filter |n-shot| Metric | | Value | |Stderr| +|----------------------------------------------------------|------:|-----------------|-----:|-----------|---|------:|---|-----:| +|anli_r1 | 1|none | 0|acc |↑ | 0.4310|± |0.0157| +|anli_r2 | 1|none | 0|acc |↑ | 0.4150|± |0.0156| +|anli_r3 | 1|none | 0|acc |↑ | 0.4292|± |0.0143| +|arc_challenge | 1|none | 0|acc |↑ | 0.4061|± |0.0144| +| | |none | 0|acc_norm |↑ | 0.4309|± |0.0145| +|bbh | 3|get-answer | |exact_match|↑ | 0.6140|± |0.0051| +| - bbh_cot_fewshot_boolean_expressions | 4|get-answer | 3|exact_match|↑ | 0.9040|± |0.0187| +| - bbh_cot_fewshot_causal_judgement | 4|get-answer | 3|exact_match|↑ | 0.4545|± |0.0365| +| - bbh_cot_fewshot_date_understanding | 4|get-answer | 3|exact_match|↑ | 0.6680|± |0.0298| +| - bbh_cot_fewshot_disambiguation_qa | 4|get-answer | 3|exact_match|↑ | 0.5800|± |0.0313| +| - bbh_cot_fewshot_dyck_languages | 4|get-answer | 3|exact_match|↑ | 0.0480|± |0.0135| +| - bbh_cot_fewshot_formal_fallacies | 4|get-answer | 3|exact_match|↑ | 0.5840|± |0.0312| +| - bbh_cot_fewshot_geometric_shapes | 4|get-answer | 3|exact_match|↑ | 0.4480|± |0.0315| +| - bbh_cot_fewshot_hyperbaton | 4|get-answer | 3|exact_match|↑ | 0.6520|± |0.0302| +| - bbh_cot_fewshot_logical_deduction_five_objects | 4|get-answer | 3|exact_match|↑ | 0.5440|± |0.0316| +| - bbh_cot_fewshot_logical_deduction_seven_objects | 4|get-answer | 3|exact_match|↑ | 0.3960|± |0.0310| +| - bbh_cot_fewshot_logical_deduction_three_objects | 4|get-answer | 3|exact_match|↑ | 0.9160|± |0.0176| +| - bbh_cot_fewshot_movie_recommendation | 4|get-answer | 3|exact_match|↑ | 0.5360|± |0.0316| +| - bbh_cot_fewshot_multistep_arithmetic_two | 4|get-answer | 3|exact_match|↑ | 0.9480|± |0.0141| +| - bbh_cot_fewshot_navigate | 4|get-answer | 3|exact_match|↑ | 0.9320|± |0.0160| +| - bbh_cot_fewshot_object_counting | 4|get-answer | 3|exact_match|↑ | 0.7960|± |0.0255| +| - bbh_cot_fewshot_penguins_in_a_table | 4|get-answer | 3|exact_match|↑ | 0.7671|± |0.0351| +| - bbh_cot_fewshot_reasoning_about_colored_objects | 4|get-answer | 3|exact_match|↑ | 0.8200|± |0.0243| +| - bbh_cot_fewshot_ruin_names | 4|get-answer | 3|exact_match|↑ | 0.3800|± |0.0308| +| - bbh_cot_fewshot_salient_translation_error_detection | 4|get-answer | 3|exact_match|↑ | 0.3240|± |0.0297| +| - bbh_cot_fewshot_snarks | 4|get-answer | 3|exact_match|↑ | 0.5674|± |0.0372| +| - bbh_cot_fewshot_sports_understanding | 4|get-answer | 3|exact_match|↑ | 0.6360|± |0.0305| +| - bbh_cot_fewshot_temporal_sequences | 4|get-answer | 3|exact_match|↑ | 0.2560|± |0.0277| +| - bbh_cot_fewshot_tracking_shuffled_objects_five_objects | 4|get-answer | 3|exact_match|↑ | 0.7640|± |0.0269| +| - bbh_cot_fewshot_tracking_shuffled_objects_seven_objects| 4|get-answer | 3|exact_match|↑ | 0.6600|± |0.0300| +| - bbh_cot_fewshot_tracking_shuffled_objects_three_objects| 4|get-answer | 3|exact_match|↑ | 0.9280|± |0.0164| +| - bbh_cot_fewshot_web_of_lies | 4|get-answer | 3|exact_match|↑ | 0.9960|± |0.0040| +| - bbh_cot_fewshot_word_sorting | 4|get-answer | 3|exact_match|↑ | 0.0840|± |0.0176| +|boolq | 2|none | 0|acc |↑ | 0.6061|± |0.0085| +|drop | 3|none | 0|em |↑ | 0.0001|± |0.0001| +| | |none | 0|f1 |↑ | 0.0273|± |0.0008| +|gpqa_diamond_cot_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.1010|± |0.0215| +| | |strict-match | 
0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_diamond_cot_zeroshot | 1|flexible-extract | 0|exact_match|↑ | 0.0909|± |0.0205| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_diamond_generative_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.0960|± |0.0210| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_diamond_n_shot | 2|none | 0|acc |↑ | 0.3182|± |0.0332| +| | |none | 0|acc_norm |↑ | 0.3182|± |0.0332| +|gpqa_diamond_zeroshot | 1|none | 0|acc |↑ | 0.3182|± |0.0332| +| | |none | 0|acc_norm |↑ | 0.3182|± |0.0332| +|gpqa_extended_cot_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.1154|± |0.0137| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_extended_cot_zeroshot | 1|flexible-extract | 0|exact_match|↑ | 0.1227|± |0.0141| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_extended_generative_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.1136|± |0.0136| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_extended_n_shot | 2|none | 0|acc |↑ | 0.3278|± |0.0201| +| | |none | 0|acc_norm |↑ | 0.3278|± |0.0201| +|gpqa_extended_zeroshot | 1|none | 0|acc |↑ | 0.3498|± |0.0204| +| | |none | 0|acc_norm |↑ | 0.3498|± |0.0204| +|gpqa_main_cot_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.1116|± |0.0149| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_main_cot_zeroshot | 1|flexible-extract | 0|exact_match|↑ | 0.1205|± |0.0154| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_main_generative_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.1295|± |0.0159| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_main_n_shot | 2|none | 0|acc |↑ | 0.3147|± |0.0220| +| | |none | 0|acc_norm |↑ | 0.3147|± |0.0220| +|gpqa_main_zeroshot | 1|none | 0|acc |↑ | 0.2879|± |0.0214| +| | |none | 0|acc_norm |↑ | 0.2879|± |0.0214| +|gsm8k | 3|flexible-extract | 5|exact_match|↑ | 0.8931|± |0.0085| +| | |strict-match | 5|exact_match|↑ | 0.8901|± |0.0086| +|hellaswag | 1|none | 0|acc |↑ | 0.4395|± |0.0050| +| | |none | 0|acc_norm |↑ | 0.5881|± |0.0049| +|mmlu | 2|none | |acc |↑ | 0.5372|± |0.0041| +| - humanities | 2|none | |acc |↑ | 0.4389|± |0.0070| +| - formal_logic | 1|none | 0|acc |↑ | 0.4921|± |0.0447| +| - high_school_european_history | 1|none | 0|acc |↑ | 0.5818|± |0.0385| +| - high_school_us_history | 1|none | 0|acc |↑ | 0.5343|± |0.0350| +| - high_school_world_history | 1|none | 0|acc |↑ | 0.6160|± |0.0317| +| - international_law | 1|none | 0|acc |↑ | 0.6529|± |0.0435| +| - jurisprudence | 1|none | 0|acc |↑ | 0.5833|± |0.0477| +| - logical_fallacies | 1|none | 0|acc |↑ | 0.6196|± |0.0381| +| - moral_disputes | 1|none | 0|acc |↑ | 0.5318|± |0.0269| +| - moral_scenarios | 1|none | 0|acc |↑ | 0.2771|± |0.0150| +| - philosophy | 1|none | 0|acc |↑ | 0.5563|± |0.0282| +| - prehistory | 1|none | 0|acc |↑ | 0.5031|± |0.0278| +| - professional_law | 1|none | 0|acc |↑ | 0.3677|± |0.0123| +| - world_religions | 1|none | 0|acc |↑ | 0.4503|± |0.0382| +| - other | 2|none | |acc |↑ | 0.5340|± |0.0087| +| - business_ethics | 1|none | 0|acc |↑ | 0.6000|± |0.0492| +| - clinical_knowledge | 1|none | 0|acc |↑ | 0.5132|± |0.0308| +| - college_medicine | 1|none | 0|acc |↑ | 0.4913|± |0.0381| +| - global_facts | 1|none | 0|acc |↑ | 0.2400|± |0.0429| +| - human_aging | 1|none | 0|acc |↑ | 0.5471|± |0.0334| +| - management | 1|none | 0|acc |↑ | 0.7379|± |0.0435| +| - marketing | 1|none | 0|acc |↑ | 0.7607|± |0.0280| +| - medical_genetics | 1|none | 0|acc |↑ | 0.5000|± |0.0503| +| - miscellaneous | 1|none | 0|acc |↑ | 
0.6054|± |0.0175| +| - nutrition | 1|none | 0|acc |↑ | 0.5065|± |0.0286| +| - professional_accounting | 1|none | 0|acc |↑ | 0.4113|± |0.0294| +| - professional_medicine | 1|none | 0|acc |↑ | 0.3934|± |0.0297| +| - virology | 1|none | 0|acc |↑ | 0.4578|± |0.0388| +| - social sciences | 2|none | |acc |↑ | 0.6233|± |0.0086| +| - econometrics | 1|none | 0|acc |↑ | 0.5263|± |0.0470| +| - high_school_geography | 1|none | 0|acc |↑ | 0.6111|± |0.0347| +| - high_school_government_and_politics | 1|none | 0|acc |↑ | 0.6114|± |0.0352| +| - high_school_macroeconomics | 1|none | 0|acc |↑ | 0.6410|± |0.0243| +| - high_school_microeconomics | 1|none | 0|acc |↑ | 0.7479|± |0.0282| +| - high_school_psychology | 1|none | 0|acc |↑ | 0.7193|± |0.0193| +| - human_sexuality | 1|none | 0|acc |↑ | 0.4962|± |0.0439| +| - professional_psychology | 1|none | 0|acc |↑ | 0.5049|± |0.0202| +| - public_relations | 1|none | 0|acc |↑ | 0.5818|± |0.0472| +| - security_studies | 1|none | 0|acc |↑ | 0.6286|± |0.0309| +| - sociology | 1|none | 0|acc |↑ | 0.6866|± |0.0328| +| - us_foreign_policy | 1|none | 0|acc |↑ | 0.6900|± |0.0465| +| - stem | 2|none | |acc |↑ | 0.6032|± |0.0086| +| - abstract_algebra | 1|none | 0|acc |↑ | 0.4800|± |0.0502| +| - anatomy | 1|none | 0|acc |↑ | 0.4519|± |0.0430| +| - astronomy | 1|none | 0|acc |↑ | 0.6382|± |0.0391| +| - college_biology | 1|none | 0|acc |↑ | 0.5000|± |0.0418| +| - college_chemistry | 1|none | 0|acc |↑ | 0.5200|± |0.0502| +| - college_computer_science | 1|none | 0|acc |↑ | 0.5700|± |0.0498| +| - college_mathematics | 1|none | 0|acc |↑ | 0.4900|± |0.0502| +| - college_physics | 1|none | 0|acc |↑ | 0.4412|± |0.0494| +| - computer_security | 1|none | 0|acc |↑ | 0.6500|± |0.0479| +| - conceptual_physics | 1|none | 0|acc |↑ | 0.7234|± |0.0292| +| - electrical_engineering | 1|none | 0|acc |↑ | 0.6276|± |0.0403| +| - elementary_mathematics | 1|none | 0|acc |↑ | 0.6958|± |0.0237| +| - high_school_biology | 1|none | 0|acc |↑ | 0.6903|± |0.0263| +| - high_school_chemistry | 1|none | 0|acc |↑ | 0.6059|± |0.0344| +| - high_school_computer_science | 1|none | 0|acc |↑ | 0.6700|± |0.0473| +| - high_school_mathematics | 1|none | 0|acc |↑ | 0.5444|± |0.0304| +| - high_school_physics | 1|none | 0|acc |↑ | 0.5497|± |0.0406| +| - high_school_statistics | 1|none | 0|acc |↑ | 0.6852|± |0.0317| +| - machine_learning | 1|none | 0|acc |↑ | 0.4464|± |0.0472| +|nq_open | 4|remove_whitespace| 0|exact_match|↑ | 0.0199|± |0.0023| +|openbookqa | 1|none | 0|acc |↑ | 0.2380|± |0.0191| +| | |none | 0|acc_norm |↑ | 0.3340|± |0.0211| +|piqa | 1|none | 0|acc |↑ | 0.6850|± |0.0108| +| | |none | 0|acc_norm |↑ | 0.6855|± |0.0108| +|qnli | 1|none | 0|acc |↑ | 0.6775|± |0.0063| +|sciq | 1|none | 0|acc |↑ | 0.9110|± |0.0090| +| | |none | 0|acc_norm |↑ | 0.8580|± |0.0110| +|triviaqa | 3|remove_whitespace| 0|exact_match|↑ | 0.0075|± |0.0006| +|truthfulqa_gen | 3|none | 0|bleu_acc |↑ | 0.3978|± |0.0171| +| | |none | 0|bleu_diff |↑ | 0.7878|± |0.5709| +| | |none | 0|bleu_max |↑ |17.7316|± |0.6218| +| | |none | 0|rouge1_acc |↑ | 0.4211|± |0.0173| +| | |none | 0|rouge1_diff|↑ | 1.8283|± |0.8791| +| | |none | 0|rouge1_max |↑ |42.6331|± |0.8380| +| | |none | 0|rouge2_acc |↑ | 0.3537|± |0.0167| +| | |none | 0|rouge2_diff|↑ | 1.4478|± |0.9688| +| | |none | 0|rouge2_max |↑ |29.6721|± |0.9118| +| | |none | 0|rougeL_acc |↑ | 0.4039|± |0.0172| +| | |none | 0|rougeL_diff|↑ | 1.5558|± |0.8801| +| | |none | 0|rougeL_max |↑ |40.0785|± |0.8407| +|truthfulqa_mc1 | 2|none | 0|acc |↑ | 0.2987|± |0.0160| +|truthfulqa_mc2 | 3|none | 0|acc |↑ | 
0.4750|± |0.0160| +|winogrande | 1|none | 0|acc |↑ | 0.5793|± |0.0139| + +| Groups |Version| Filter |n-shot| Metric | |Value | |Stderr| +|------------------|------:|----------|------|-----------|---|-----:|---|-----:| +|bbh | 3|get-answer| |exact_match|↑ |0.6140|± |0.0051| +|mmlu | 2|none | |acc |↑ |0.5372|± |0.0041| +| - humanities | 2|none | |acc |↑ |0.4389|± |0.0070| +| - other | 2|none | |acc |↑ |0.5340|± |0.0087| +| - social sciences| 2|none | |acc |↑ |0.6233|± |0.0086| +| - stem | 2|none | |acc |↑ |0.6032|± |0.0086| + +Qwen_Qwen2.5-Math-7B-Instruct,: 5h 37m 20s +✅ Benchmark completed for Qwen_Qwen2.5-Math-7B-Instruct, + +🔥 Starting benchmark for Qwen_Qwen2.5-Math-7B, +Passed argument batch_size = auto:1. Detecting largest batch size +Determined largest batch size: 4 +Passed argument batch_size = auto. Detecting largest batch size +Determined Largest batch size: 4 +hf (pretrained=/mnt/data8tb/Documents/llm/llm_models/Qwen_Qwen2.5-Math-7B,), gen_kwargs: (None), limit: None, num_fewshot: None, batch_size: auto (4) +| Tasks |Version| Filter |n-shot| Metric | | Value | |Stderr| +|----------------------------------------------------------|------:|-----------------|-----:|-----------|---|------:|---|-----:| +|anli_r1 | 1|none | 0|acc |↑ | 0.3870|± |0.0154| +|anli_r2 | 1|none | 0|acc |↑ | 0.4070|± |0.0155| +|anli_r3 | 1|none | 0|acc |↑ | 0.3825|± |0.0140| +|arc_challenge | 1|none | 0|acc |↑ | 0.4855|± |0.0146| +| | |none | 0|acc_norm |↑ | 0.5026|± |0.0146| +|bbh | 3|get-answer | |exact_match|↑ | 0.6724|± |0.0050| +| - bbh_cot_fewshot_boolean_expressions | 4|get-answer | 3|exact_match|↑ | 0.9120|± |0.0180| +| - bbh_cot_fewshot_causal_judgement | 4|get-answer | 3|exact_match|↑ | 0.5027|± |0.0367| +| - bbh_cot_fewshot_date_understanding | 4|get-answer | 3|exact_match|↑ | 0.7640|± |0.0269| +| - bbh_cot_fewshot_disambiguation_qa | 4|get-answer | 3|exact_match|↑ | 0.5080|± |0.0317| +| - bbh_cot_fewshot_dyck_languages | 4|get-answer | 3|exact_match|↑ | 0.1680|± |0.0237| +| - bbh_cot_fewshot_formal_fallacies | 4|get-answer | 3|exact_match|↑ | 0.5520|± |0.0315| +| - bbh_cot_fewshot_geometric_shapes | 4|get-answer | 3|exact_match|↑ | 0.4440|± |0.0315| +| - bbh_cot_fewshot_hyperbaton | 4|get-answer | 3|exact_match|↑ | 0.9080|± |0.0183| +| - bbh_cot_fewshot_logical_deduction_five_objects | 4|get-answer | 3|exact_match|↑ | 0.5640|± |0.0314| +| - bbh_cot_fewshot_logical_deduction_seven_objects | 4|get-answer | 3|exact_match|↑ | 0.3800|± |0.0308| +| - bbh_cot_fewshot_logical_deduction_three_objects | 4|get-answer | 3|exact_match|↑ | 0.9080|± |0.0183| +| - bbh_cot_fewshot_movie_recommendation | 4|get-answer | 3|exact_match|↑ | 0.6160|± |0.0308| +| - bbh_cot_fewshot_multistep_arithmetic_two | 4|get-answer | 3|exact_match|↑ | 0.9480|± |0.0141| +| - bbh_cot_fewshot_navigate | 4|get-answer | 3|exact_match|↑ | 0.9600|± |0.0124| +| - bbh_cot_fewshot_object_counting | 4|get-answer | 3|exact_match|↑ | 0.8680|± |0.0215| +| - bbh_cot_fewshot_penguins_in_a_table | 4|get-answer | 3|exact_match|↑ | 0.7808|± |0.0344| +| - bbh_cot_fewshot_reasoning_about_colored_objects | 4|get-answer | 3|exact_match|↑ | 0.8840|± |0.0203| +| - bbh_cot_fewshot_ruin_names | 4|get-answer | 3|exact_match|↑ | 0.5040|± |0.0317| +| - bbh_cot_fewshot_salient_translation_error_detection | 4|get-answer | 3|exact_match|↑ | 0.3480|± |0.0302| +| - bbh_cot_fewshot_snarks | 4|get-answer | 3|exact_match|↑ | 0.5506|± |0.0374| +| - bbh_cot_fewshot_sports_understanding | 4|get-answer | 3|exact_match|↑ | 0.7400|± |0.0278| +| - 
bbh_cot_fewshot_temporal_sequences | 4|get-answer | 3|exact_match|↑ | 0.3080|± |0.0293| +| - bbh_cot_fewshot_tracking_shuffled_objects_five_objects | 4|get-answer | 3|exact_match|↑ | 0.8960|± |0.0193| +| - bbh_cot_fewshot_tracking_shuffled_objects_seven_objects| 4|get-answer | 3|exact_match|↑ | 0.8520|± |0.0225| +| - bbh_cot_fewshot_tracking_shuffled_objects_three_objects| 4|get-answer | 3|exact_match|↑ | 0.9600|± |0.0124| +| - bbh_cot_fewshot_web_of_lies | 4|get-answer | 3|exact_match|↑ | 1.0000|± |0.0000| +| - bbh_cot_fewshot_word_sorting | 4|get-answer | 3|exact_match|↑ | 0.2960|± |0.0289| +|boolq | 2|none | 0|acc |↑ | 0.7456|± |0.0076| +|drop | 3|none | 0|em |↑ | 0.0012|± |0.0003| +| | |none | 0|f1 |↑ | 0.0432|± |0.0011| +|gpqa_diamond_cot_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.2222|± |0.0296| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_diamond_cot_zeroshot | 1|flexible-extract | 0|exact_match|↑ | 0.2121|± |0.0291| +| | |strict-match | 0|exact_match|↑ | 0.0051|± |0.0051| +|gpqa_diamond_generative_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.2778|± |0.0319| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_diamond_n_shot | 2|none | 0|acc |↑ | 0.3333|± |0.0336| +| | |none | 0|acc_norm |↑ | 0.3333|± |0.0336| +|gpqa_diamond_zeroshot | 1|none | 0|acc |↑ | 0.3182|± |0.0332| +| | |none | 0|acc_norm |↑ | 0.3182|± |0.0332| +|gpqa_extended_cot_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.2161|± |0.0176| +| | |strict-match | 0|exact_match|↑ | 0.0055|± |0.0032| +|gpqa_extended_cot_zeroshot | 1|flexible-extract | 0|exact_match|↑ | 0.2692|± |0.0190| +| | |strict-match | 0|exact_match|↑ | 0.0018|± |0.0018| +|gpqa_extended_generative_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.3168|± |0.0199| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_extended_n_shot | 2|none | 0|acc |↑ | 0.3040|± |0.0197| +| | |none | 0|acc_norm |↑ | 0.3040|± |0.0197| +|gpqa_extended_zeroshot | 1|none | 0|acc |↑ | 0.3297|± |0.0201| +| | |none | 0|acc_norm |↑ | 0.3297|± |0.0201| +|gpqa_main_cot_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.2188|± |0.0196| +| | |strict-match | 0|exact_match|↑ | 0.0022|± |0.0022| +|gpqa_main_cot_zeroshot | 1|flexible-extract | 0|exact_match|↑ | 0.2455|± |0.0204| +| | |strict-match | 0|exact_match|↑ | 0.0022|± |0.0022| +|gpqa_main_generative_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.3237|± |0.0221| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_main_n_shot | 2|none | 0|acc |↑ | 0.3036|± |0.0217| +| | |none | 0|acc_norm |↑ | 0.3036|± |0.0217| +|gpqa_main_zeroshot | 1|none | 0|acc |↑ | 0.3080|± |0.0218| +| | |none | 0|acc_norm |↑ | 0.3080|± |0.0218| +|gsm8k | 3|flexible-extract | 5|exact_match|↑ | 0.8491|± |0.0099| +| | |strict-match | 5|exact_match|↑ | 0.8476|± |0.0099| +|hellaswag | 1|none | 0|acc |↑ | 0.4907|± |0.0050| +| | |none | 0|acc_norm |↑ | 0.6529|± |0.0048| +|mmlu | 2|none | |acc |↑ | 0.5799|± |0.0040| +| - humanities | 2|none | |acc |↑ | 0.4742|± |0.0069| +| - formal_logic | 1|none | 0|acc |↑ | 0.5159|± |0.0447| +| - high_school_european_history | 1|none | 0|acc |↑ | 0.6606|± |0.0370| +| - high_school_us_history | 1|none | 0|acc |↑ | 0.6225|± |0.0340| +| - high_school_world_history | 1|none | 0|acc |↑ | 0.6751|± |0.0305| +| - international_law | 1|none | 0|acc |↑ | 0.6942|± |0.0421| +| - jurisprudence | 1|none | 0|acc |↑ | 0.6296|± |0.0467| +| - logical_fallacies | 1|none | 0|acc |↑ | 0.7055|± |0.0358| +| - moral_disputes | 1|none | 0|acc |↑ | 0.6185|± |0.0262| +| - moral_scenarios | 
1|none | 0|acc |↑ | 0.2425|± |0.0143| +| - philosophy | 1|none | 0|acc |↑ | 0.6206|± |0.0276| +| - prehistory | 1|none | 0|acc |↑ | 0.5216|± |0.0278| +| - professional_law | 1|none | 0|acc |↑ | 0.3963|± |0.0125| +| - world_religions | 1|none | 0|acc |↑ | 0.5965|± |0.0376| +| - other | 2|none | |acc |↑ | 0.5845|± |0.0086| +| - business_ethics | 1|none | 0|acc |↑ | 0.6400|± |0.0482| +| - clinical_knowledge | 1|none | 0|acc |↑ | 0.5811|± |0.0304| +| - college_medicine | 1|none | 0|acc |↑ | 0.5029|± |0.0381| +| - global_facts | 1|none | 0|acc |↑ | 0.3400|± |0.0476| +| - human_aging | 1|none | 0|acc |↑ | 0.6278|± |0.0324| +| - management | 1|none | 0|acc |↑ | 0.6990|± |0.0454| +| - marketing | 1|none | 0|acc |↑ | 0.8248|± |0.0249| +| - medical_genetics | 1|none | 0|acc |↑ | 0.5700|± |0.0498| +| - miscellaneous | 1|none | 0|acc |↑ | 0.6782|± |0.0167| +| - nutrition | 1|none | 0|acc |↑ | 0.5654|± |0.0284| +| - professional_accounting | 1|none | 0|acc |↑ | 0.4681|± |0.0298| +| - professional_medicine | 1|none | 0|acc |↑ | 0.3713|± |0.0293| +| - virology | 1|none | 0|acc |↑ | 0.4699|± |0.0389| +| - social sciences | 2|none | |acc |↑ | 0.6724|± |0.0084| +| - econometrics | 1|none | 0|acc |↑ | 0.5877|± |0.0463| +| - high_school_geography | 1|none | 0|acc |↑ | 0.7020|± |0.0326| +| - high_school_government_and_politics | 1|none | 0|acc |↑ | 0.7254|± |0.0322| +| - high_school_macroeconomics | 1|none | 0|acc |↑ | 0.6744|± |0.0238| +| - high_school_microeconomics | 1|none | 0|acc |↑ | 0.8193|± |0.0250| +| - high_school_psychology | 1|none | 0|acc |↑ | 0.7266|± |0.0191| +| - human_sexuality | 1|none | 0|acc |↑ | 0.5878|± |0.0432| +| - professional_psychology | 1|none | 0|acc |↑ | 0.5621|± |0.0201| +| - public_relations | 1|none | 0|acc |↑ | 0.6182|± |0.0465| +| - security_studies | 1|none | 0|acc |↑ | 0.6367|± |0.0308| +| - sociology | 1|none | 0|acc |↑ | 0.7413|± |0.0310| +| - us_foreign_policy | 1|none | 0|acc |↑ | 0.7500|± |0.0435| +| - stem | 2|none | |acc |↑ | 0.6429|± |0.0084| +| - abstract_algebra | 1|none | 0|acc |↑ | 0.5300|± |0.0502| +| - anatomy | 1|none | 0|acc |↑ | 0.4370|± |0.0428| +| - astronomy | 1|none | 0|acc |↑ | 0.7105|± |0.0369| +| - college_biology | 1|none | 0|acc |↑ | 0.6389|± |0.0402| +| - college_chemistry | 1|none | 0|acc |↑ | 0.5200|± |0.0502| +| - college_computer_science | 1|none | 0|acc |↑ | 0.6900|± |0.0465| +| - college_mathematics | 1|none | 0|acc |↑ | 0.4800|± |0.0502| +| - college_physics | 1|none | 0|acc |↑ | 0.4608|± |0.0496| +| - computer_security | 1|none | 0|acc |↑ | 0.6700|± |0.0473| +| - conceptual_physics | 1|none | 0|acc |↑ | 0.7447|± |0.0285| +| - electrical_engineering | 1|none | 0|acc |↑ | 0.6690|± |0.0392| +| - elementary_mathematics | 1|none | 0|acc |↑ | 0.7169|± |0.0232| +| - high_school_biology | 1|none | 0|acc |↑ | 0.7581|± |0.0244| +| - high_school_chemistry | 1|none | 0|acc |↑ | 0.6207|± |0.0341| +| - high_school_computer_science | 1|none | 0|acc |↑ | 0.7500|± |0.0435| +| - high_school_mathematics | 1|none | 0|acc |↑ | 0.5630|± |0.0302| +| - high_school_physics | 1|none | 0|acc |↑ | 0.6026|± |0.0400| +| - high_school_statistics | 1|none | 0|acc |↑ | 0.7222|± |0.0305| +| - machine_learning | 1|none | 0|acc |↑ | 0.4821|± |0.0474| +|nq_open | 4|remove_whitespace| 0|exact_match|↑ | 0.0510|± |0.0037| +|openbookqa | 1|none | 0|acc |↑ | 0.2720|± |0.0199| +| | |none | 0|acc_norm |↑ | 0.3920|± |0.0219| +|piqa | 1|none | 0|acc |↑ | 0.7285|± |0.0104| +| | |none | 0|acc_norm |↑ | 0.7454|± |0.0102| +|qnli | 1|none | 0|acc |↑ | 0.4981|± |0.0068| +|sciq | 1|none | 
0|acc |↑ | 0.9410|± |0.0075| +| | |none | 0|acc_norm |↑ | 0.9290|± |0.0081| +|triviaqa | 3|remove_whitespace| 0|exact_match|↑ | 0.2183|± |0.0031| +|truthfulqa_gen | 3|none | 0|bleu_acc |↑ | 0.3733|± |0.0169| +| | |none | 0|bleu_diff |↑ |-1.3812|± |0.3689| +| | |none | 0|bleu_max |↑ |13.2847|± |0.4547| +| | |none | 0|rouge1_acc |↑ | 0.3733|± |0.0169| +| | |none | 0|rouge1_diff|↑ |-2.1164|± |0.5749| +| | |none | 0|rouge1_max |↑ |36.6193|± |0.7424| +| | |none | 0|rouge2_acc |↑ | 0.3158|± |0.0163| +| | |none | 0|rouge2_diff|↑ |-2.6990|± |0.6541| +| | |none | 0|rouge2_max |↑ |24.5611|± |0.7453| +| | |none | 0|rougeL_acc |↑ | 0.3733|± |0.0169| +| | |none | 0|rougeL_diff|↑ |-2.1518|± |0.5742| +| | |none | 0|rougeL_max |↑ |34.7410|± |0.7352| +|truthfulqa_mc1 | 2|none | 0|acc |↑ | 0.3207|± |0.0163| +|truthfulqa_mc2 | 3|none | 0|acc |↑ | 0.4832|± |0.0150| +|winogrande | 1|none | 0|acc |↑ | 0.6480|± |0.0134| + +| Groups |Version| Filter |n-shot| Metric | |Value | |Stderr| +|------------------|------:|----------|------|-----------|---|-----:|---|-----:| +|bbh | 3|get-answer| |exact_match|↑ |0.6724|± |0.0050| +|mmlu | 2|none | |acc |↑ |0.5799|± |0.0040| +| - humanities | 2|none | |acc |↑ |0.4742|± |0.0069| +| - other | 2|none | |acc |↑ |0.5845|± |0.0086| +| - social sciences| 2|none | |acc |↑ |0.6724|± |0.0084| +| - stem | 2|none | |acc |↑ |0.6429|± |0.0084| + +Qwen_Qwen2.5-Math-7B,: 27h 22m 7s +✅ Benchmark completed for Qwen_Qwen2.5-Math-7B, + + +______________________ +🔥 Starting benchmark for Qwen_Qwen2.5-7B-Instruct +🔥 Starting benchmark for Qwen_Qwen2.5-7B-Instruct +🔥 Starting benchmark for Qwen_Qwen2.5-7B-Instruct +🔥 Starting benchmark for Qwen_Qwen2.5-7B-Instruct +🔥 Starting benchmark for Qwen_Qwen2.5-7B-Instruct +🔥 Starting benchmark for Qwen_Qwen2.5-7B-Instruct +🔥 Starting benchmark for Qwen_Qwen2.5-7B-Instruct +🔥 Starting benchmark for Qwen_Qwen2.5-7B-Instruct +Passed argument batch_size = auto:5.0. Detecting largest batch size +Determined largest batch size: 1 +Passed argument batch_size = auto:5.0. Detecting largest batch size +Determined largest batch size: 64 +Passed argument batch_size = auto:5.0. Detecting largest batch size +Determined largest batch size: 1 +Passed argument batch_size = auto:5.0. Detecting largest batch size +Determined largest batch size: 64 +Passed argument batch_size = auto. 
Detecting largest batch size +Determined Largest batch size: 1 +hf (pretrained=/mnt/data8tb/Documents/llm/llm_models/Qwen_Qwen2.5-7B-Instruct), gen_kwargs: (None), limit: None, num_fewshot: None, batch_size: auto:5 (1,64,64,64,64,64) +| Tasks |Version| Filter |n-shot| Metric | | Value | |Stderr| +|----------------------------------------------------------|------:|-----------------|-----:|-----------|---|------:|---|-----:| +|anli_r1 | 1|none | 0|acc |↑ | 0.6840|± |0.0147| +|anli_r2 | 1|none | 0|acc |↑ | 0.5440|± |0.0158| +|anli_r3 | 1|none | 0|acc |↑ | 0.5492|± |0.0144| +|arc_challenge | 1|none | 0|acc |↑ | 0.5265|± |0.0146| +| | |none | 0|acc_norm |↑ | 0.5529|± |0.0145| +|bbh | 3|get-answer | |exact_match|↑ | 0.4534|± |0.0051| +| - bbh_cot_fewshot_boolean_expressions | 4|get-answer | 3|exact_match|↑ | 0.8120|± |0.0248| +| - bbh_cot_fewshot_causal_judgement | 4|get-answer | 3|exact_match|↑ | 0.3529|± |0.0350| +| - bbh_cot_fewshot_date_understanding | 4|get-answer | 3|exact_match|↑ | 0.4560|± |0.0316| +| - bbh_cot_fewshot_disambiguation_qa | 4|get-answer | 3|exact_match|↑ | 0.6160|± |0.0308| +| - bbh_cot_fewshot_dyck_languages | 4|get-answer | 3|exact_match|↑ | 0.0080|± |0.0056| +| - bbh_cot_fewshot_formal_fallacies | 4|get-answer | 3|exact_match|↑ | 0.2280|± |0.0266| +| - bbh_cot_fewshot_geometric_shapes | 4|get-answer | 3|exact_match|↑ | 0.1040|± |0.0193| +| - bbh_cot_fewshot_hyperbaton | 4|get-answer | 3|exact_match|↑ | 0.3920|± |0.0309| +| - bbh_cot_fewshot_logical_deduction_five_objects | 4|get-answer | 3|exact_match|↑ | 0.3880|± |0.0309| +| - bbh_cot_fewshot_logical_deduction_seven_objects | 4|get-answer | 3|exact_match|↑ | 0.1800|± |0.0243| +| - bbh_cot_fewshot_logical_deduction_three_objects | 4|get-answer | 3|exact_match|↑ | 0.8480|± |0.0228| +| - bbh_cot_fewshot_movie_recommendation | 4|get-answer | 3|exact_match|↑ | 0.0360|± |0.0118| +| - bbh_cot_fewshot_multistep_arithmetic_two | 4|get-answer | 3|exact_match|↑ | 0.3040|± |0.0292| +| - bbh_cot_fewshot_navigate | 4|get-answer | 3|exact_match|↑ | 0.8800|± |0.0206| +| - bbh_cot_fewshot_object_counting | 4|get-answer | 3|exact_match|↑ | 0.5760|± |0.0313| +| - bbh_cot_fewshot_penguins_in_a_table | 4|get-answer | 3|exact_match|↑ | 0.2055|± |0.0336| +| - bbh_cot_fewshot_reasoning_about_colored_objects | 4|get-answer | 3|exact_match|↑ | 0.5560|± |0.0315| +| - bbh_cot_fewshot_ruin_names | 4|get-answer | 3|exact_match|↑ | 0.0360|± |0.0118| +| - bbh_cot_fewshot_salient_translation_error_detection | 4|get-answer | 3|exact_match|↑ | 0.2680|± |0.0281| +| - bbh_cot_fewshot_snarks | 4|get-answer | 3|exact_match|↑ | 0.5281|± |0.0375| +| - bbh_cot_fewshot_sports_understanding | 4|get-answer | 3|exact_match|↑ | 0.7760|± |0.0264| +| - bbh_cot_fewshot_temporal_sequences | 4|get-answer | 3|exact_match|↑ | 0.7640|± |0.0269| +| - bbh_cot_fewshot_tracking_shuffled_objects_five_objects | 4|get-answer | 3|exact_match|↑ | 0.7600|± |0.0271| +| - bbh_cot_fewshot_tracking_shuffled_objects_seven_objects| 4|get-answer | 3|exact_match|↑ | 0.5840|± |0.0312| +| - bbh_cot_fewshot_tracking_shuffled_objects_three_objects| 4|get-answer | 3|exact_match|↑ | 0.8920|± |0.0197| +| - bbh_cot_fewshot_web_of_lies | 4|get-answer | 3|exact_match|↑ | 0.3200|± |0.0296| +| - bbh_cot_fewshot_word_sorting | 4|get-answer | 3|exact_match|↑ | 0.2640|± |0.0279| +|boolq | 2|none | 0|acc |↑ | 0.8633|± |0.0060| +|drop | 3|none | 0|em |↑ | 0.0028|± |0.0005| +| | |none | 0|f1 |↑ | 0.0713|± |0.0014| +|gpqa_diamond_cot_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.1414|± |0.0248| +| | 
|strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_diamond_cot_zeroshot | 1|flexible-extract | 0|exact_match|↑ | 0.0859|± |0.0200| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_diamond_generative_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.2879|± |0.0323| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_diamond_n_shot | 2|none | 0|acc |↑ | 0.3283|± |0.0335| +| | |none | 0|acc_norm |↑ | 0.3283|± |0.0335| +|gpqa_diamond_zeroshot | 1|none | 0|acc |↑ | 0.3131|± |0.0330| +| | |none | 0|acc_norm |↑ | 0.3131|± |0.0330| +|gpqa_extended_cot_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.1612|± |0.0158| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_extended_cot_zeroshot | 1|flexible-extract | 0|exact_match|↑ | 0.0989|± |0.0128| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_extended_generative_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.2546|± |0.0187| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_extended_n_shot | 2|none | 0|acc |↑ | 0.3242|± |0.0200| +| | |none | 0|acc_norm |↑ | 0.3242|± |0.0200| +|gpqa_extended_zeroshot | 1|none | 0|acc |↑ | 0.3388|± |0.0203| +| | |none | 0|acc_norm |↑ | 0.3388|± |0.0203| +|gpqa_main_cot_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.1540|± |0.0171| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_main_cot_zeroshot | 1|flexible-extract | 0|exact_match|↑ | 0.1094|± |0.0148| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_main_generative_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.2746|± |0.0211| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_main_n_shot | 2|none | 0|acc |↑ | 0.3237|± |0.0221| +| | |none | 0|acc_norm |↑ | 0.3237|± |0.0221| +|gpqa_main_zeroshot | 1|none | 0|acc |↑ | 0.3326|± |0.0223| +| | |none | 0|acc_norm |↑ | 0.3326|± |0.0223| +|gsm8k | 3|flexible-extract | 5|exact_match|↑ | 0.8256|± |0.0105| +| | |strict-match | 5|exact_match|↑ | 0.7582|± |0.0118| +|hellaswag | 1|none | 0|acc |↑ | 0.6198|± |0.0048| +| | |none | 0|acc_norm |↑ | 0.8039|± |0.0040| +|mmlu | 2|none | |acc |↑ | 0.7175|± |0.0036| +| - humanities | 2|none | |acc |↑ | 0.6351|± |0.0066| +| - formal_logic | 1|none | 0|acc |↑ | 0.5635|± |0.0444| +| - high_school_european_history | 1|none | 0|acc |↑ | 0.8485|± |0.0280| +| - high_school_us_history | 1|none | 0|acc |↑ | 0.9020|± |0.0209| +| - high_school_world_history | 1|none | 0|acc |↑ | 0.8734|± |0.0216| +| - international_law | 1|none | 0|acc |↑ | 0.8099|± |0.0358| +| - jurisprudence | 1|none | 0|acc |↑ | 0.7963|± |0.0389| +| - logical_fallacies | 1|none | 0|acc |↑ | 0.8221|± |0.0300| +| - moral_disputes | 1|none | 0|acc |↑ | 0.7659|± |0.0228| +| - moral_scenarios | 1|none | 0|acc |↑ | 0.4190|± |0.0165| +| - philosophy | 1|none | 0|acc |↑ | 0.7363|± |0.0250| +| - prehistory | 1|none | 0|acc |↑ | 0.8241|± |0.0212| +| - professional_law | 1|none | 0|acc |↑ | 0.5156|± |0.0128| +| - world_religions | 1|none | 0|acc |↑ | 0.8246|± |0.0292| +| - other | 2|none | |acc |↑ | 0.7650|± |0.0073| +| - business_ethics | 1|none | 0|acc |↑ | 0.7800|± |0.0416| +| - clinical_knowledge | 1|none | 0|acc |↑ | 0.7774|± |0.0256| +| - college_medicine | 1|none | 0|acc |↑ | 0.6705|± |0.0358| +| - global_facts | 1|none | 0|acc |↑ | 0.4600|± |0.0501| +| - human_aging | 1|none | 0|acc |↑ | 0.7803|± |0.0278| +| - management | 1|none | 0|acc |↑ | 0.8835|± |0.0318| +| - marketing | 1|none | 0|acc |↑ | 0.9231|± |0.0175| +| - medical_genetics | 1|none | 0|acc |↑ | 0.8400|± |0.0368| +| - miscellaneous | 1|none 
| 0|acc |↑ | 0.8531|± |0.0127| +| - nutrition | 1|none | 0|acc |↑ | 0.7876|± |0.0234| +| - professional_accounting | 1|none | 0|acc |↑ | 0.5567|± |0.0296| +| - professional_medicine | 1|none | 0|acc |↑ | 0.7831|± |0.0250| +| - virology | 1|none | 0|acc |↑ | 0.5241|± |0.0389| +| - social sciences | 2|none | |acc |↑ | 0.8274|± |0.0067| +| - econometrics | 1|none | 0|acc |↑ | 0.6754|± |0.0440| +| - high_school_geography | 1|none | 0|acc |↑ | 0.8788|± |0.0233| +| - high_school_government_and_politics | 1|none | 0|acc |↑ | 0.9430|± |0.0167| +| - high_school_macroeconomics | 1|none | 0|acc |↑ | 0.7897|± |0.0207| +| - high_school_microeconomics | 1|none | 0|acc |↑ | 0.8655|± |0.0222| +| - high_school_psychology | 1|none | 0|acc |↑ | 0.9064|± |0.0125| +| - human_sexuality | 1|none | 0|acc |↑ | 0.8092|± |0.0345| +| - professional_psychology | 1|none | 0|acc |↑ | 0.7598|± |0.0173| +| - public_relations | 1|none | 0|acc |↑ | 0.7182|± |0.0431| +| - security_studies | 1|none | 0|acc |↑ | 0.7714|± |0.0269| +| - sociology | 1|none | 0|acc |↑ | 0.8955|± |0.0216| +| - us_foreign_policy | 1|none | 0|acc |↑ | 0.8600|± |0.0349| +| - stem | 2|none | |acc |↑ | 0.6863|± |0.0080| +| - abstract_algebra | 1|none | 0|acc |↑ | 0.5800|± |0.0496| +| - anatomy | 1|none | 0|acc |↑ | 0.7333|± |0.0382| +| - astronomy | 1|none | 0|acc |↑ | 0.8553|± |0.0286| +| - college_biology | 1|none | 0|acc |↑ | 0.8681|± |0.0283| +| - college_chemistry | 1|none | 0|acc |↑ | 0.5300|± |0.0502| +| - college_computer_science | 1|none | 0|acc |↑ | 0.6700|± |0.0473| +| - college_mathematics | 1|none | 0|acc |↑ | 0.4300|± |0.0498| +| - college_physics | 1|none | 0|acc |↑ | 0.5098|± |0.0497| +| - computer_security | 1|none | 0|acc |↑ | 0.7800|± |0.0416| +| - conceptual_physics | 1|none | 0|acc |↑ | 0.7702|± |0.0275| +| - electrical_engineering | 1|none | 0|acc |↑ | 0.7103|± |0.0378| +| - elementary_mathematics | 1|none | 0|acc |↑ | 0.6720|± |0.0242| +| - high_school_biology | 1|none | 0|acc |↑ | 0.8677|± |0.0193| +| - high_school_chemistry | 1|none | 0|acc |↑ | 0.6404|± |0.0338| +| - high_school_computer_science | 1|none | 0|acc |↑ | 0.8100|± |0.0394| +| - high_school_mathematics | 1|none | 0|acc |↑ | 0.5481|± |0.0303| +| - high_school_physics | 1|none | 0|acc |↑ | 0.5762|± |0.0403| +| - high_school_statistics | 1|none | 0|acc |↑ | 0.6806|± |0.0318| +| - machine_learning | 1|none | 0|acc |↑ | 0.5268|± |0.0474| +|nq_open | 4|remove_whitespace| 0|exact_match|↑ | 0.0457|± |0.0035| +|openbookqa | 1|none | 0|acc |↑ | 0.3440|± |0.0213| +| | |none | 0|acc_norm |↑ | 0.4860|± |0.0224| +|piqa | 1|none | 0|acc |↑ | 0.7943|± |0.0094| +| | |none | 0|acc_norm |↑ | 0.8009|± |0.0093| +|qnli | 1|none | 0|acc |↑ | 0.8047|± |0.0054| +|sciq | 1|none | 0|acc |↑ | 0.9560|± |0.0065| +| | |none | 0|acc_norm |↑ | 0.9360|± |0.0077| +|triviaqa | 3|remove_whitespace| 0|exact_match|↑ | 0.3251|± |0.0035| +|truthfulqa_gen | 3|none | 0|bleu_acc |↑ | 0.5129|± |0.0175| +| | |none | 0|bleu_diff |↑ | 0.3578|± |0.2345| +| | |none | 0|bleu_max |↑ | 8.0558|± |0.4296| +| | |none | 0|rouge1_acc |↑ | 0.5373|± |0.0175| +| | |none | 0|rouge1_diff|↑ | 1.0510|± |0.3423| +| | |none | 0|rouge1_max |↑ |25.7407|± |0.6485| +| | |none | 0|rouge2_acc |↑ | 0.4455|± |0.0174| +| | |none | 0|rouge2_diff|↑ | 0.4063|± |0.3684| +| | |none | 0|rouge2_max |↑ |15.2839|± |0.6032| +| | |none | 0|rougeL_acc |↑ | 0.4884|± |0.0175| +| | |none | 0|rougeL_diff|↑ | 0.4719|± |0.3267| +| | |none | 0|rougeL_max |↑ |22.5918|± |0.6236| +|truthfulqa_mc1 | 2|none | 0|acc |↑ | 0.4774|± |0.0175| +|truthfulqa_mc2 | 3|none | 
0|acc |↑ | 0.6485|± |0.0155| +|winogrande | 1|none | 0|acc |↑ | 0.7080|± |0.0128| + +| Groups |Version| Filter |n-shot| Metric | |Value | |Stderr| +|------------------|------:|----------|------|-----------|---|-----:|---|-----:| +|bbh | 3|get-answer| |exact_match|↑ |0.4534|± |0.0051| +|mmlu | 2|none | |acc |↑ |0.7175|± |0.0036| +| - humanities | 2|none | |acc |↑ |0.6351|± |0.0066| +| - other | 2|none | |acc |↑ |0.7650|± |0.0073| +| - social sciences| 2|none | |acc |↑ |0.8274|± |0.0067| +| - stem | 2|none | |acc |↑ |0.6863|± |0.0080| + +Qwen_Qwen2.5-7B-Instruct: 11h 6m 29s +✅ Benchmark completed for Qwen_Qwen2.5-7B-Instruct + +🔥 Starting benchmark for deepseek-ai_DeepSeek-R1-Distill-Llama-8B + +Passed argument batch_size = auto:5.0. Detecting largest batch size +Determined largest batch size: 1 +Passed argument batch_size = auto:5.0. Detecting largest batch size +Determined largest batch size: 64 + +hf (pretrained=/mnt/data8tb/Documents/llm/llm_models/deepseek-ai_DeepSeek-R1-Distill-Llama-8B), gen_kwargs: (None), limit: None, num_fewshot: None, batch_size: auto:5 (1,64,64,64,64,64) +| Tasks |Version| Filter |n-shot| Metric | | Value | |Stderr| +|----------------------------------------------------------|------:|-----------------|-----:|-----------|---|------:|---|-----:| +|anli_r1 | 1|none | 0|acc |↑ | 0.4040|± |0.0155| +|anli_r2 | 1|none | 0|acc |↑ | 0.4100|± |0.0156| +|anli_r3 | 1|none | 0|acc |↑ | 0.3883|± |0.0141| +|arc_challenge | 1|none | 0|acc |↑ | 0.4061|± |0.0144| +| | |none | 0|acc_norm |↑ | 0.4232|± |0.0144| +|bbh | 3|get-answer | |exact_match|↑ | 0.6037|± |0.0050| +| - bbh_cot_fewshot_boolean_expressions | 4|get-answer | 3|exact_match|↑ | 0.9400|± |0.0151| +| - bbh_cot_fewshot_causal_judgement | 4|get-answer | 3|exact_match|↑ | 0.5508|± |0.0365| +| - bbh_cot_fewshot_date_understanding | 4|get-answer | 3|exact_match|↑ | 0.7440|± |0.0277| +| - bbh_cot_fewshot_disambiguation_qa | 4|get-answer | 3|exact_match|↑ | 0.6120|± |0.0309| +| - bbh_cot_fewshot_dyck_languages | 4|get-answer | 3|exact_match|↑ | 0.2360|± |0.0269| +| - bbh_cot_fewshot_formal_fallacies | 4|get-answer | 3|exact_match|↑ | 0.2520|± |0.0275| +| - bbh_cot_fewshot_geometric_shapes | 4|get-answer | 3|exact_match|↑ | 0.3200|± |0.0296| +| - bbh_cot_fewshot_hyperbaton | 4|get-answer | 3|exact_match|↑ | 0.7920|± |0.0257| +| - bbh_cot_fewshot_logical_deduction_five_objects | 4|get-answer | 3|exact_match|↑ | 0.2760|± |0.0283| +| - bbh_cot_fewshot_logical_deduction_seven_objects | 4|get-answer | 3|exact_match|↑ | 0.0600|± |0.0151| +| - bbh_cot_fewshot_logical_deduction_three_objects | 4|get-answer | 3|exact_match|↑ | 0.8520|± |0.0225| +| - bbh_cot_fewshot_movie_recommendation | 4|get-answer | 3|exact_match|↑ | 0.6840|± |0.0295| +| - bbh_cot_fewshot_multistep_arithmetic_two | 4|get-answer | 3|exact_match|↑ | 0.8840|± |0.0203| +| - bbh_cot_fewshot_navigate | 4|get-answer | 3|exact_match|↑ | 0.8560|± |0.0222| +| - bbh_cot_fewshot_object_counting | 4|get-answer 
| 3|exact_match|↑ | 0.8320|± |0.0237| +| - bbh_cot_fewshot_penguins_in_a_table | 4|get-answer | 3|exact_match|↑ | 0.3973|± |0.0406| +| - bbh_cot_fewshot_reasoning_about_colored_objects | 4|get-answer | 3|exact_match|↑ | 0.6360|± |0.0305| +| - bbh_cot_fewshot_ruin_names | 4|get-answer | 3|exact_match|↑ | 0.6200|± |0.0308| +| - bbh_cot_fewshot_salient_translation_error_detection | 4|get-answer | 3|exact_match|↑ | 0.5680|± |0.0314| +| - bbh_cot_fewshot_snarks | 4|get-answer | 3|exact_match|↑ | 0.5506|± |0.0374| +| - bbh_cot_fewshot_sports_understanding | 4|get-answer | 3|exact_match|↑ | 0.8200|± |0.0243| +| - bbh_cot_fewshot_temporal_sequences | 4|get-answer | 3|exact_match|↑ | 0.8640|± |0.0217| +| - bbh_cot_fewshot_tracking_shuffled_objects_five_objects | 4|get-answer | 3|exact_match|↑ | 0.3920|± |0.0309| +| - bbh_cot_fewshot_tracking_shuffled_objects_seven_objects| 4|get-answer | 3|exact_match|↑ | 0.1040|± |0.0193| +| - bbh_cot_fewshot_tracking_shuffled_objects_three_objects| 4|get-answer | 3|exact_match|↑ | 0.9480|± |0.0141| +| - bbh_cot_fewshot_web_of_lies | 4|get-answer | 3|exact_match|↑ | 1.0000|± |0.0000| +| - bbh_cot_fewshot_word_sorting | 4|get-answer | 3|exact_match|↑ | 0.3960|± |0.0310| +|boolq | 2|none | 0|acc |↑ | 0.8287|± |0.0066| +|drop | 3|none | 0|em |↑ | 0.0031|± |0.0006| +| | |none | 0|f1 |↑ | 0.0712|± |0.0014| +|gpqa_diamond_cot_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.0909|± |0.0205| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_diamond_cot_zeroshot | 1|flexible-extract | 0|exact_match|↑ | 0.0707|± |0.0183| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_diamond_generative_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.2626|± |0.0314| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_diamond_n_shot | 2|none | 0|acc |↑ | 0.3283|± |0.0335| +| | |none | 0|acc_norm |↑ | 0.3283|± |0.0335| +|gpqa_diamond_zeroshot | 1|none | 0|acc |↑ | 0.2727|± |0.0317| +| | |none | 0|acc_norm |↑ | 0.2727|± |0.0317| +|gpqa_extended_cot_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.1172|± |0.0138| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_extended_cot_zeroshot | 1|flexible-extract | 0|exact_match|↑ | 0.0952|± |0.0126| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_extended_generative_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.3059|± |0.0197| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_extended_n_shot | 2|none | 0|acc |↑ | 0.2912|± |0.0195| +| | |none | 0|acc_norm |↑ | 0.2912|± |0.0195| +|gpqa_extended_zeroshot | 1|none | 0|acc |↑ | 0.2967|± |0.0196| +| | |none | 0|acc_norm |↑ | 0.2967|± |0.0196| +|gpqa_main_cot_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.0893|± |0.0135| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_main_cot_zeroshot | 1|flexible-extract | 0|exact_match|↑ | 0.1138|± |0.0150| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_main_generative_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.2746|± |0.0211| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_main_n_shot | 2|none | 0|acc |↑ | 0.3125|± |0.0219| +| | |none | 0|acc_norm |↑ | 0.3125|± |0.0219| +|gpqa_main_zeroshot | 1|none | 0|acc |↑ | 0.2746|± |0.0211| +| | |none | 0|acc_norm |↑ | 0.2746|± |0.0211| +|gsm8k | 3|flexible-extract | 5|exact_match|↑ | 0.6452|± |0.0132| +| | |strict-match | 5|exact_match|↑ | 0.6247|± |0.0133| +|hellaswag | 1|none | 0|acc |↑ | 0.5562|± |0.0050| +| | |none | 0|acc_norm |↑ | 0.7430|± |0.0044| +|mmlu | 2|none | |acc |↑ | 
0.5327|± |0.0040| +| - humanities | 2|none | |acc |↑ | 0.4767|± |0.0069| +| - formal_logic | 1|none | 0|acc |↑ | 0.4048|± |0.0439| +| - high_school_european_history | 1|none | 0|acc |↑ | 0.7030|± |0.0357| +| - high_school_us_history | 1|none | 0|acc |↑ | 0.6765|± |0.0328| +| - high_school_world_history | 1|none | 0|acc |↑ | 0.7384|± |0.0286| +| - international_law | 1|none | 0|acc |↑ | 0.7025|± |0.0417| +| - jurisprudence | 1|none | 0|acc |↑ | 0.6019|± |0.0473| +| - logical_fallacies | 1|none | 0|acc |↑ | 0.5706|± |0.0389| +| - moral_disputes | 1|none | 0|acc |↑ | 0.5289|± |0.0269| +| - moral_scenarios | 1|none | 0|acc |↑ | 0.2425|± |0.0143| +| - philosophy | 1|none | 0|acc |↑ | 0.5466|± |0.0283| +| - prehistory | 1|none | 0|acc |↑ | 0.6327|± |0.0268| +| - professional_law | 1|none | 0|acc |↑ | 0.4068|± |0.0125| +| - world_religions | 1|none | 0|acc |↑ | 0.7076|± |0.0349| +| - other | 2|none | |acc |↑ | 0.6041|± |0.0085| +| - business_ethics | 1|none | 0|acc |↑ | 0.6000|± |0.0492| +| - clinical_knowledge | 1|none | 0|acc |↑ | 0.5811|± |0.0304| +| - college_medicine | 1|none | 0|acc |↑ | 0.5318|± |0.0380| +| - global_facts | 1|none | 0|acc |↑ | 0.3800|± |0.0488| +| - human_aging | 1|none | 0|acc |↑ | 0.5291|± |0.0335| +| - management | 1|none | 0|acc |↑ | 0.7282|± |0.0441| +| - marketing | 1|none | 0|acc |↑ | 0.7949|± |0.0265| +| - medical_genetics | 1|none | 0|acc |↑ | 0.6300|± |0.0485| +| - miscellaneous | 1|none | 0|acc |↑ | 0.7152|± |0.0161| +| - nutrition | 1|none | 0|acc |↑ | 0.6144|± |0.0279| +| - professional_accounting | 1|none | 0|acc |↑ | 0.4007|± |0.0292| +| - professional_medicine | 1|none | 0|acc |↑ | 0.5625|± |0.0301| +| - virology | 1|none | 0|acc |↑ | 0.4639|± |0.0388| +| - social sciences | 2|none | |acc |↑ | 0.6074|± |0.0086| +| - econometrics | 1|none | 0|acc |↑ | 0.3158|± |0.0437| +| - high_school_geography | 1|none | 0|acc |↑ | 0.6465|± |0.0341| +| - high_school_government_and_politics | 1|none | 0|acc |↑ | 0.6425|± |0.0346| +| - high_school_macroeconomics | 1|none | 0|acc |↑ | 0.5154|± |0.0253| +| - high_school_microeconomics | 1|none | 0|acc |↑ | 0.4874|± |0.0325| +| - high_school_psychology | 1|none | 0|acc |↑ | 0.7009|± |0.0196| +| - human_sexuality | 1|none | 0|acc |↑ | 0.6336|± |0.0423| +| - professional_psychology | 1|none | 0|acc |↑ | 0.5539|± |0.0201| +| - public_relations | 1|none | 0|acc |↑ | 0.6273|± |0.0463| +| - security_studies | 1|none | 0|acc |↑ | 0.6367|± |0.0308| +| - sociology | 1|none | 0|acc |↑ | 0.7363|± |0.0312| +| - us_foreign_policy | 1|none | 0|acc |↑ | 0.8700|± |0.0338| +| - stem | 2|none | |acc |↑ | 0.4729|± |0.0088| +| - abstract_algebra | 1|none | 0|acc |↑ | 0.3900|± |0.0490| +| - anatomy | 1|none | 0|acc |↑ | 0.5111|± |0.0432| +| - astronomy | 1|none | 0|acc |↑ | 0.5461|± |0.0405| +| - college_biology | 1|none | 0|acc |↑ | 0.5556|± |0.0416| +| - college_chemistry | 1|none | 0|acc |↑ | 0.3800|± |0.0488| +| - college_computer_science | 1|none | 0|acc |↑ | 0.4800|± |0.0502| +| - college_mathematics | 1|none | 0|acc |↑ | 0.4100|± |0.0494| +| - college_physics | 1|none | 0|acc |↑ | 0.3725|± |0.0481| +| - computer_security | 1|none | 0|acc |↑ | 0.6000|± |0.0492| +| - conceptual_physics | 1|none | 0|acc |↑ | 0.4681|± |0.0326| +| - electrical_engineering | 1|none | 0|acc |↑ | 0.5034|± |0.0417| +| - elementary_mathematics | 1|none | 0|acc |↑ | 0.4286|± |0.0255| +| - high_school_biology | 1|none | 0|acc |↑ | 0.6452|± |0.0272| +| - high_school_chemistry | 1|none | 0|acc |↑ | 0.4483|± |0.0350| +| - high_school_computer_science | 1|none | 0|acc |↑ | 
0.5600|± |0.0499| +| - high_school_mathematics | 1|none | 0|acc |↑ | 0.3815|± |0.0296| +| - high_school_physics | 1|none | 0|acc |↑ | 0.3907|± |0.0398| +| - high_school_statistics | 1|none | 0|acc |↑ | 0.4444|± |0.0339| +| - machine_learning | 1|none | 0|acc |↑ | 0.4018|± |0.0465| +|nq_open | 4|remove_whitespace| 0|exact_match|↑ | 0.0584|± |0.0039| +|openbookqa | 1|none | 0|acc |↑ | 0.3160|± |0.0208| +| | |none | 0|acc_norm |↑ | 0.4100|± |0.0220| +|piqa | 1|none | 0|acc |↑ | 0.7595|± |0.0100| +| | |none | 0|acc_norm |↑ | 0.7758|± |0.0097| +|qnli | 1|none | 0|acc |↑ | 0.5147|± |0.0068| +|sciq | 1|none | 0|acc |↑ | 0.9290|± |0.0081| +| | |none | 0|acc_norm |↑ | 0.8990|± |0.0095| +|triviaqa | 3|remove_whitespace| 0|exact_match|↑ | 0.1940|± |0.0030| +|truthfulqa_gen | 3|none | 0|bleu_acc |↑ | 0.4455|± |0.0174| +| | |none | 0|bleu_diff |↑ |-0.9197|± |0.5624| +| | |none | 0|bleu_max |↑ |15.5776|± |0.6556| +| | |none | 0|rouge1_acc |↑ | 0.4517|± |0.0174| +| | |none | 0|rouge1_diff|↑ |-1.1087|± |0.7899| +| | |none | 0|rouge1_max |↑ |37.4816|± |0.7944| +| | |none | 0|rouge2_acc |↑ | 0.2521|± |0.0152| +| | |none | 0|rouge2_diff|↑ |-4.0247|± |0.7868| +| | |none | 0|rouge2_max |↑ |19.9717|± |0.8482| +| | |none | 0|rougeL_acc |↑ | 0.4565|± |0.0174| +| | |none | 0|rougeL_diff|↑ |-1.0779|± |0.7931| +| | |none | 0|rougeL_max |↑ |34.9414|± |0.7851| +|truthfulqa_mc1 | 2|none | 0|acc |↑ | 0.3219|± |0.0164| +|truthfulqa_mc2 | 3|none | 0|acc |↑ | 0.5045|± |0.0154| +|winogrande | 1|none | 0|acc |↑ | 0.6780|± |0.0131| +Passed argument batch_size = auto. Detecting largest batch size +Determined Largest batch size: 1 +hf (pretrained=/mnt/data8tb/Documents/llm/llm_models/deepseek-ai_DeepSeek-R1-Distill-Llama-8B), gen_kwargs: (None), limit: None, num_fewshot: None, batch_size: auto:5 (1,64,64,64,64,64) + +| Groups |Version| Filter |n-shot| Metric | |Value | |Stderr| +|------------------|------:|----------|------|-----------|---|-----:|---|-----:| +|bbh | 3|get-answer| |exact_match|↑ |0.6037|± |0.0050| +|mmlu | 2|none | |acc |↑ |0.5327|± |0.0040| +| - humanities | 2|none | |acc |↑ |0.4767|± |0.0069| +| - other | 2|none | |acc |↑ |0.6041|± |0.0085| +| - social sciences| 2|none | |acc |↑ |0.6074|± |0.0086| +| - stem | 2|none | |acc |↑ |0.4729|± |0.0088| + + +deepseek-ai_DeepSeek-R1-Distill-Llama-8B: 11h 46m 55s +✅ Benchmark completed for deepseek-ai_DeepSeek-R1-Distill-Llama-8B +🔥 Starting benchmark for Qwen_Qwen2.5-7B-Instruct +🔥 Starting benchmark for Qwen_Qwen2.5-7B-Instruct +🔥 Starting benchmark for Qwen_Qwen-7B-Chat +Qwen_Qwen-7B-Chat: 0h 0m 3s +✅ Benchmark completed for Qwen_Qwen-7B-Chat + +🔥 Starting benchmark for Qwen_Qwen-7B +🔥 Starting benchmark for Qwen_Qwen-7B-Chat +Qwen_Qwen-7B-Chat: 0h 0m 4s +✅ Benchmark completed for Qwen_Qwen-7B-Chat + +🔥 Starting benchmark for Qwen_Qwen-7B +🔥 Starting benchmark for Qwen_Qwen-7B-Chat +🔥 Starting benchmark for Qwen_Qwen-7B +🔥 Starting benchmark for meta-llama_Meta-Llama-3-8B-Instruct +🔥 Starting benchmark for mistralai_Mistral-7B-Instruct-v0.3 +🔥 Starting benchmark for Qwen_Qwen2.5-7B-Instruct +🔥 Starting benchmark for Qwen_Qwen2.5-7B-Instruct +🔥 Starting benchmark for Qwen_Qwen2.5-7B-Instruct +🔥 Starting benchmark for Qwen_Qwen2.5-7B-Instruct +hf (pretrained=/mnt/data8tb/Documents/llm/llm_models/Qwen_Qwen2.5-7B-Instruct,trust_remote_code=True), gen_kwargs: (None), limit: None, num_fewshot: None, batch_size: 3 +| Tasks |Version| Filter |n-shot| Metric | | Value | |Stderr| 
+|----------------------------------------------------------|------:|-----------------|-----:|-----------|---|------:|---|-----:| +|anli_r1 | 1|none | 0|acc |↑ | 0.6850|± |0.0147| +|anli_r2 | 1|none | 0|acc |↑ | 0.5490|± |0.0157| +|anli_r3 | 1|none | 0|acc |↑ | 0.5525|± |0.0144| +|arc_challenge | 1|none | 0|acc |↑ | 0.5239|± |0.0146| +| | |none | 0|acc_norm |↑ | 0.5529|± |0.0145| +|bbh | 3|get-answer | |exact_match|↑ | 0.4488|± |0.0051| +| - bbh_cot_fewshot_boolean_expressions | 4|get-answer | 3|exact_match|↑ | 0.8160|± |0.0246| +| - bbh_cot_fewshot_causal_judgement | 4|get-answer | 3|exact_match|↑ | 0.3209|± |0.0342| +| - bbh_cot_fewshot_date_understanding | 4|get-answer | 3|exact_match|↑ | 0.4520|± |0.0315| +| - bbh_cot_fewshot_disambiguation_qa | 4|get-answer | 3|exact_match|↑ | 0.6160|± |0.0308| +| - bbh_cot_fewshot_dyck_languages | 4|get-answer | 3|exact_match|↑ | 0.0080|± |0.0056| +| - bbh_cot_fewshot_formal_fallacies | 4|get-answer | 3|exact_match|↑ | 0.1960|± |0.0252| +| - bbh_cot_fewshot_geometric_shapes | 4|get-answer | 3|exact_match|↑ | 0.0960|± |0.0187| +| - bbh_cot_fewshot_hyperbaton | 4|get-answer | 3|exact_match|↑ | 0.4120|± |0.0312| +| - bbh_cot_fewshot_logical_deduction_five_objects | 4|get-answer | 3|exact_match|↑ | 0.4000|± |0.0310| +| - bbh_cot_fewshot_logical_deduction_seven_objects | 4|get-answer | 3|exact_match|↑ | 0.1600|± |0.0232| +| - bbh_cot_fewshot_logical_deduction_three_objects | 4|get-answer | 3|exact_match|↑ | 0.8400|± |0.0232| +| - bbh_cot_fewshot_movie_recommendation | 4|get-answer | 3|exact_match|↑ | 0.0440|± |0.0130| +| - bbh_cot_fewshot_multistep_arithmetic_two | 4|get-answer | 3|exact_match|↑ | 0.2600|± |0.0278| +| - bbh_cot_fewshot_navigate | 4|get-answer | 3|exact_match|↑ | 0.8880|± |0.0200| +| - bbh_cot_fewshot_object_counting | 4|get-answer | 3|exact_match|↑ | 0.5840|± |0.0312| +| - bbh_cot_fewshot_penguins_in_a_table | 4|get-answer | 3|exact_match|↑ | 0.1918|± |0.0327| +| - bbh_cot_fewshot_reasoning_about_colored_objects | 4|get-answer | 3|exact_match|↑ | 0.5520|± |0.0315| +| - bbh_cot_fewshot_ruin_names | 4|get-answer | 3|exact_match|↑ | 0.0480|± |0.0135| +| - bbh_cot_fewshot_salient_translation_error_detection | 4|get-answer | 3|exact_match|↑ | 0.2080|± |0.0257| +| - bbh_cot_fewshot_snarks | 4|get-answer | 3|exact_match|↑ | 0.5281|± |0.0375| +| - bbh_cot_fewshot_sports_understanding | 4|get-answer | 3|exact_match|↑ | 0.7840|± |0.0261| +| - bbh_cot_fewshot_temporal_sequences | 4|get-answer | 3|exact_match|↑ | 0.7080|± |0.0288| +| - bbh_cot_fewshot_tracking_shuffled_objects_five_objects | 4|get-answer | 3|exact_match|↑ | 0.7480|± |0.0275| +| - bbh_cot_fewshot_tracking_shuffled_objects_seven_objects| 4|get-answer | 3|exact_match|↑ | 0.5480|± |0.0315| +| - bbh_cot_fewshot_tracking_shuffled_objects_three_objects| 4|get-answer | 3|exact_match|↑ | 0.8920|± |0.0197| +| - bbh_cot_fewshot_web_of_lies | 4|get-answer | 3|exact_match|↑ | 0.4280|± |0.0314| +| - bbh_cot_fewshot_word_sorting | 4|get-answer | 3|exact_match|↑ | 0.2720|± |0.0282| +|boolq | 2|none | 0|acc |↑ | 0.8633|± |0.0060| +|drop | 3|none | 0|em |↑ | 0.0025|± |0.0005| +| | |none | 0|f1 |↑ | 0.0711|± |0.0014| +|gpqa_diamond_cot_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.1465|± |0.0252| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_diamond_cot_zeroshot | 1|flexible-extract | 0|exact_match|↑ | 0.0859|± |0.0200| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_diamond_generative_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.2879|± |0.0323| +| | 
|strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_diamond_n_shot | 2|none | 0|acc |↑ | 0.3283|± |0.0335| +| | |none | 0|acc_norm |↑ | 0.3283|± |0.0335| +|gpqa_diamond_zeroshot | 1|none | 0|acc |↑ | 0.3081|± |0.0329| +| | |none | 0|acc_norm |↑ | 0.3081|± |0.0329| +|gpqa_extended_cot_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.1410|± |0.0149| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_extended_cot_zeroshot | 1|flexible-extract | 0|exact_match|↑ | 0.1007|± |0.0129| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_extended_generative_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.2564|± |0.0187| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_extended_n_shot | 2|none | 0|acc |↑ | 0.3242|± |0.0200| +| | |none | 0|acc_norm |↑ | 0.3242|± |0.0200| +|gpqa_extended_zeroshot | 1|none | 0|acc |↑ | 0.3388|± |0.0203| +| | |none | 0|acc_norm |↑ | 0.3388|± |0.0203| +|gpqa_main_cot_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.1429|± |0.0166| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_main_cot_zeroshot | 1|flexible-extract | 0|exact_match|↑ | 0.1027|± |0.0144| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_main_generative_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.2946|± |0.0216| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_main_n_shot | 2|none | 0|acc |↑ | 0.3214|± |0.0221| +| | |none | 0|acc_norm |↑ | 0.3214|± |0.0221| +|gpqa_main_zeroshot | 1|none | 0|acc |↑ | 0.3281|± |0.0222| +| | |none | 0|acc_norm |↑ | 0.3281|± |0.0222| +|gsm8k | 3|flexible-extract | 5|exact_match|↑ | 0.8256|± |0.0105| +| | |strict-match | 5|exact_match|↑ | 0.7627|± |0.0117| +|hellaswag | 1|none | 0|acc |↑ | 0.6194|± |0.0048| +| | |none | 0|acc_norm |↑ | 0.8049|± |0.0040| +|mmlu | 2|none | |acc |↑ | 0.7181|± |0.0036| +| - humanities | 2|none | |acc |↑ | 0.6372|± |0.0066| +| - formal_logic | 1|none | 0|acc |↑ | 0.5714|± |0.0443| +| - high_school_european_history | 1|none | 0|acc |↑ | 0.8485|± |0.0280| +| - high_school_us_history | 1|none | 0|acc |↑ | 0.9020|± |0.0209| +| - high_school_world_history | 1|none | 0|acc |↑ | 0.8734|± |0.0216| +| - international_law | 1|none | 0|acc |↑ | 0.8182|± |0.0352| +| - jurisprudence | 1|none | 0|acc |↑ | 0.8056|± |0.0383| +| - logical_fallacies | 1|none | 0|acc |↑ | 0.8221|± |0.0300| +| - moral_disputes | 1|none | 0|acc |↑ | 0.7659|± |0.0228| +| - moral_scenarios | 1|none | 0|acc |↑ | 0.4235|± |0.0165| +| - philosophy | 1|none | 0|acc |↑ | 0.7395|± |0.0249| +| - prehistory | 1|none | 0|acc |↑ | 0.8272|± |0.0210| +| - professional_law | 1|none | 0|acc |↑ | 0.5156|± |0.0128| +| - world_religions | 1|none | 0|acc |↑ | 0.8304|± |0.0288| +| - other | 2|none | |acc |↑ | 0.7647|± |0.0073| +| - business_ethics | 1|none | 0|acc |↑ | 0.7900|± |0.0409| +| - clinical_knowledge | 1|none | 0|acc |↑ | 0.7774|± |0.0256| +| - college_medicine | 1|none | 0|acc |↑ | 0.6705|± |0.0358| +| - global_facts | 1|none | 0|acc |↑ | 0.4700|± |0.0502| +| - human_aging | 1|none | 0|acc |↑ | 0.7803|± |0.0278| +| - management | 1|none | 0|acc |↑ | 0.8932|± |0.0306| +| - marketing | 1|none | 0|acc |↑ | 0.9188|± |0.0179| +| - medical_genetics | 1|none | 0|acc |↑ | 0.8400|± |0.0368| +| - miscellaneous | 1|none | 0|acc |↑ | 0.8519|± |0.0127| +| - nutrition | 1|none | 0|acc |↑ | 0.7843|± |0.0236| +| - professional_accounting | 1|none | 0|acc |↑ | 0.5567|± |0.0296| +| - professional_medicine | 1|none | 0|acc |↑ | 0.7831|± |0.0250| +| - virology | 1|none | 0|acc |↑ | 0.5181|± |0.0389| +| - social 
sciences | 2|none | |acc |↑ | 0.8284|± |0.0067| +| - econometrics | 1|none | 0|acc |↑ | 0.6579|± |0.0446| +| - high_school_geography | 1|none | 0|acc |↑ | 0.8788|± |0.0233| +| - high_school_government_and_politics | 1|none | 0|acc |↑ | 0.9378|± |0.0174| +| - high_school_macroeconomics | 1|none | 0|acc |↑ | 0.7949|± |0.0205| +| - high_school_microeconomics | 1|none | 0|acc |↑ | 0.8697|± |0.0219| +| - high_school_psychology | 1|none | 0|acc |↑ | 0.9064|± |0.0125| +| - human_sexuality | 1|none | 0|acc |↑ | 0.8092|± |0.0345| +| - professional_psychology | 1|none | 0|acc |↑ | 0.7647|± |0.0172| +| - public_relations | 1|none | 0|acc |↑ | 0.7182|± |0.0431| +| - security_studies | 1|none | 0|acc |↑ | 0.7714|± |0.0269| +| - sociology | 1|none | 0|acc |↑ | 0.8955|± |0.0216| +| - us_foreign_policy | 1|none | 0|acc |↑ | 0.8600|± |0.0349| +| - stem | 2|none | |acc |↑ | 0.6851|± |0.0080| +| - abstract_algebra | 1|none | 0|acc |↑ | 0.5800|± |0.0496| +| - anatomy | 1|none | 0|acc |↑ | 0.7407|± |0.0379| +| - astronomy | 1|none | 0|acc |↑ | 0.8553|± |0.0286| +| - college_biology | 1|none | 0|acc |↑ | 0.8681|± |0.0283| +| - college_chemistry | 1|none | 0|acc |↑ | 0.5300|± |0.0502| +| - college_computer_science | 1|none | 0|acc |↑ | 0.6700|± |0.0473| +| - college_mathematics | 1|none | 0|acc |↑ | 0.4300|± |0.0498| +| - college_physics | 1|none | 0|acc |↑ | 0.5098|± |0.0497| +| - computer_security | 1|none | 0|acc |↑ | 0.7700|± |0.0423| +| - conceptual_physics | 1|none | 0|acc |↑ | 0.7660|± |0.0277| +| - electrical_engineering | 1|none | 0|acc |↑ | 0.7172|± |0.0375| +| - elementary_mathematics | 1|none | 0|acc |↑ | 0.6746|± |0.0241| +| - high_school_biology | 1|none | 0|acc |↑ | 0.8645|± |0.0195| +| - high_school_chemistry | 1|none | 0|acc |↑ | 0.6404|± |0.0338| +| - high_school_computer_science | 1|none | 0|acc |↑ | 0.8100|± |0.0394| +| - high_school_mathematics | 1|none | 0|acc |↑ | 0.5407|± |0.0304| +| - high_school_physics | 1|none | 0|acc |↑ | 0.5695|± |0.0404| +| - high_school_statistics | 1|none | 0|acc |↑ | 0.6759|± |0.0319| +| - machine_learning | 1|none | 0|acc |↑ | 0.5268|± |0.0474| +|nq_open | 4|remove_whitespace| 0|exact_match|↑ | 0.0457|± |0.0035| +|openbookqa | 1|none | 0|acc |↑ | 0.3420|± |0.0212| +| | |none | 0|acc_norm |↑ | 0.4860|± |0.0224| +|piqa | 1|none | 0|acc |↑ | 0.7960|± |0.0094| +| | |none | 0|acc_norm |↑ | 0.8030|± |0.0093| +|qnli | 1|none | 0|acc |↑ | 0.8045|± |0.0054| +|sciq | 1|none | 0|acc |↑ | 0.9560|± |0.0065| +| | |none | 0|acc_norm |↑ | 0.9370|± |0.0077| +|triviaqa | 3|remove_whitespace| 0|exact_match|↑ | 0.3254|± |0.0035| +|truthfulqa_gen | 3|none | 0|bleu_acc |↑ | 0.5006|± |0.0175| +| | |none | 0|bleu_diff |↑ | 0.3019|± |0.2337| +| | |none | 0|bleu_max |↑ | 7.9281|± |0.4300| +| | |none | 0|rouge1_acc |↑ | 0.5386|± |0.0175| +| | |none | 0|rouge1_diff|↑ | 0.9126|± |0.3421| +| | |none | 0|rouge1_max |↑ |25.3456|± |0.6494| +| | |none | 0|rouge2_acc |↑ | 0.4455|± |0.0174| +| | |none | 0|rouge2_diff|↑ | 0.2550|± |0.3623| +| | |none | 0|rouge2_max |↑ |15.0210|± |0.6025| +| | |none | 0|rougeL_acc |↑ | 0.4847|± |0.0175| +| | |none | 0|rougeL_diff|↑ | 0.3002|± |0.3241| +| | |none | 0|rougeL_max |↑ |22.2349|± |0.6237| +|truthfulqa_mc1 | 2|none | 0|acc |↑ | 0.4774|± |0.0175| +|truthfulqa_mc2 | 3|none | 0|acc |↑ | 0.6485|± |0.0155| +|winogrande | 1|none | 0|acc |↑ | 0.7119|± |0.0127| + +| Groups |Version| Filter |n-shot| Metric | |Value | |Stderr| +|------------------|------:|----------|------|-----------|---|-----:|---|-----:| +|bbh | 3|get-answer| |exact_match|↑ |0.4488|± |0.0051| 
+|mmlu | 2|none | |acc |↑ |0.7181|± |0.0036| +| - humanities | 2|none | |acc |↑ |0.6372|± |0.0066| +| - other | 2|none | |acc |↑ |0.7647|± |0.0073| +| - social sciences| 2|none | |acc |↑ |0.8284|± |0.0067| +| - stem | 2|none | |acc |↑ |0.6851|± |0.0080| + +Qwen_Qwen2.5-7B-Instruct: 9h 37m 6s +✅ Benchmark completed for Qwen_Qwen2.5-7B-Instruct + +🔥 Starting benchmark for Qwen_Qwen-7B-Chat +Qwen_Qwen-7B-Chat: 0h 5m 2s +✅ Benchmark completed for Qwen_Qwen-7B-Chat + +🔥 Starting benchmark for Qwen_Qwen-7B +Qwen_Qwen-7B: 0h 5m 1s +✅ Benchmark completed for Qwen_Qwen-7B + +🔥 Starting benchmark for meta-llama_Meta-Llama-3-8B-Instruct +hf (pretrained=/mnt/data8tb/Documents/llm/llm_models/meta-llama_Meta-Llama-3-8B-Instruct,trust_remote_code=True), gen_kwargs: (None), limit: None, num_fewshot: None, batch_size: 3 +| Tasks |Version| Filter |n-shot| Metric | | Value | |Stderr| +|----------------------------------------------------------|------:|-----------------|-----:|-----------|---|------:|---|-----:| +|anli_r1 | 1|none | 0|acc |↑ | 0.4840|± |0.0158| +|anli_r2 | 1|none | 0|acc |↑ | 0.4580|± |0.0158| +|anli_r3 | 1|none | 0|acc |↑ | 0.4483|± |0.0144| +|arc_challenge | 1|none | 0|acc |↑ | 0.5316|± |0.0146| +| | |none | 0|acc_norm |↑ | 0.5640|± |0.0145| +|bbh | 3|get-answer | |exact_match|↑ | 0.6790|± |0.0053| +| - bbh_cot_fewshot_boolean_expressions | 4|get-answer | 3|exact_match|↑ | 0.8960|± |0.0193| +| - bbh_cot_fewshot_causal_judgement | 4|get-answer | 3|exact_match|↑ | 0.5668|± |0.0363| +| - bbh_cot_fewshot_date_understanding | 4|get-answer | 3|exact_match|↑ | 0.8400|± |0.0232| +| - bbh_cot_fewshot_disambiguation_qa | 4|get-answer | 3|exact_match|↑ | 0.6800|± |0.0296| +| - bbh_cot_fewshot_dyck_languages | 4|get-answer | 3|exact_match|↑ | 0.0880|± |0.0180| +| - bbh_cot_fewshot_formal_fallacies | 4|get-answer | 3|exact_match|↑ | 0.5400|± |0.0316| +| - bbh_cot_fewshot_geometric_shapes | 4|get-answer | 3|exact_match|↑ | 0.5160|± |0.0317| +| - bbh_cot_fewshot_hyperbaton | 4|get-answer | 3|exact_match|↑ | 0.7880|± |0.0259| +| - bbh_cot_fewshot_logical_deduction_five_objects | 4|get-answer | 3|exact_match|↑ | 0.5720|± |0.0314| +| - bbh_cot_fewshot_logical_deduction_seven_objects | 4|get-answer | 3|exact_match|↑ | 0.4920|± |0.0317| +| - bbh_cot_fewshot_logical_deduction_three_objects | 4|get-answer | 3|exact_match|↑ | 0.9080|± |0.0183| +| - bbh_cot_fewshot_movie_recommendation | 4|get-answer | 3|exact_match|↑ | 0.6640|± |0.0299| +| - bbh_cot_fewshot_multistep_arithmetic_two | 4|get-answer | 3|exact_match|↑ | 0.4040|± |0.0311| +| - bbh_cot_fewshot_navigate | 4|get-answer | 3|exact_match|↑ | 0.8480|± |0.0228| +| - bbh_cot_fewshot_object_counting | 4|get-answer | 3|exact_match|↑ | 0.8880|± |0.0200| +| - bbh_cot_fewshot_penguins_in_a_table | 4|get-answer | 3|exact_match|↑ | 0.7877|± |0.0340| +| - bbh_cot_fewshot_reasoning_about_colored_objects | 4|get-answer | 3|exact_match|↑ | 0.7320|± |0.0281| +| - bbh_cot_fewshot_ruin_names | 4|get-answer | 3|exact_match|↑ | 0.7400|± |0.0278| +| - bbh_cot_fewshot_salient_translation_error_detection | 4|get-answer | 3|exact_match|↑ | 0.5680|± |0.0314| +| - bbh_cot_fewshot_snarks | 4|get-answer | 3|exact_match|↑ | 0.7135|± |0.0340| +| - bbh_cot_fewshot_sports_understanding | 4|get-answer | 3|exact_match|↑ | 0.9160|± |0.0176| +| - bbh_cot_fewshot_temporal_sequences | 4|get-answer | 3|exact_match|↑ | 0.6720|± |0.0298| +| - bbh_cot_fewshot_tracking_shuffled_objects_five_objects | 4|get-answer | 3|exact_match|↑ | 0.7080|± |0.0288| +| - 
bbh_cot_fewshot_tracking_shuffled_objects_seven_objects| 4|get-answer | 3|exact_match|↑ | 0.5280|± |0.0316| +| - bbh_cot_fewshot_tracking_shuffled_objects_three_objects| 4|get-answer | 3|exact_match|↑ | 0.7680|± |0.0268| +| - bbh_cot_fewshot_web_of_lies | 4|get-answer | 3|exact_match|↑ | 0.9960|± |0.0040| +| - bbh_cot_fewshot_word_sorting | 4|get-answer | 3|exact_match|↑ | 0.5400|± |0.0316| +|boolq | 2|none | 0|acc |↑ | 0.8312|± |0.0066| +|drop | 3|none | 0|em |↑ | 0.0290|± |0.0017| +| | |none | 0|f1 |↑ | 0.1640|± |0.0024| +|gpqa_diamond_cot_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.1515|± |0.0255| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_diamond_cot_zeroshot | 1|flexible-extract | 0|exact_match|↑ | 0.1414|± |0.0248| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_diamond_generative_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.2222|± |0.0296| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_diamond_n_shot | 2|none | 0|acc |↑ | 0.3283|± |0.0335| +| | |none | 0|acc_norm |↑ | 0.3283|± |0.0335| +|gpqa_diamond_zeroshot | 1|none | 0|acc |↑ | 0.2778|± |0.0319| +| | |none | 0|acc_norm |↑ | 0.2778|± |0.0319| +|gpqa_extended_cot_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.1923|± |0.0169| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_extended_cot_zeroshot | 1|flexible-extract | 0|exact_match|↑ | 0.1575|± |0.0156| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_extended_generative_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.2619|± |0.0188| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_extended_n_shot | 2|none | 0|acc |↑ | 0.2967|± |0.0196| +| | |none | 0|acc_norm |↑ | 0.2967|± |0.0196| +|gpqa_extended_zeroshot | 1|none | 0|acc |↑ | 0.3260|± |0.0201| +| | |none | 0|acc_norm |↑ | 0.3260|± |0.0201| +|gpqa_main_cot_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.1830|± |0.0183| +| | |strict-match | 0|exact_match|↑ | 0.0022|± |0.0022| +|gpqa_main_cot_zeroshot | 1|flexible-extract | 0|exact_match|↑ | 0.1429|± |0.0166| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_main_generative_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.2790|± |0.0212| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_main_n_shot | 2|none | 0|acc |↑ | 0.3058|± |0.0218| +| | |none | 0|acc_norm |↑ | 0.3058|± |0.0218| +|gpqa_main_zeroshot | 1|none | 0|acc |↑ | 0.3103|± |0.0219| +| | |none | 0|acc_norm |↑ | 0.3103|± |0.0219| +|gsm8k | 3|flexible-extract | 5|exact_match|↑ | 0.7544|± |0.0119| +| | |strict-match | 5|exact_match|↑ | 0.7566|± |0.0118| +|hellaswag | 1|none | 0|acc |↑ | 0.5764|± |0.0049| +| | |none | 0|acc_norm |↑ | 0.7592|± |0.0043| +|mmlu | 2|none | |acc |↑ | 0.6387|± |0.0038| +| - humanities | 2|none | |acc |↑ | 0.5824|± |0.0068| +| - formal_logic | 1|none | 0|acc |↑ | 0.5000|± |0.0447| +| - high_school_european_history | 1|none | 0|acc |↑ | 0.7455|± |0.0340| +| - high_school_us_history | 1|none | 0|acc |↑ | 0.8186|± |0.0270| +| - high_school_world_history | 1|none | 0|acc |↑ | 0.8354|± |0.0241| +| - international_law | 1|none | 0|acc |↑ | 0.7603|± |0.0390| +| - jurisprudence | 1|none | 0|acc |↑ | 0.7593|± |0.0413| +| - logical_fallacies | 1|none | 0|acc |↑ | 0.7607|± |0.0335| +| - moral_disputes | 1|none | 0|acc |↑ | 0.6994|± |0.0247| +| - moral_scenarios | 1|none | 0|acc |↑ | 0.3441|± |0.0159| +| - philosophy | 1|none | 0|acc |↑ | 0.7170|± |0.0256| +| - prehistory | 1|none | 0|acc |↑ | 0.7253|± |0.0248| +| - professional_law | 1|none | 0|acc |↑ | 0.4896|± 
|0.0128| +| - world_religions | 1|none | 0|acc |↑ | 0.7719|± |0.0322| +| - other | 2|none | |acc |↑ | 0.7187|± |0.0078| +| - business_ethics | 1|none | 0|acc |↑ | 0.6600|± |0.0476| +| - clinical_knowledge | 1|none | 0|acc |↑ | 0.7208|± |0.0276| +| - college_medicine | 1|none | 0|acc |↑ | 0.6243|± |0.0369| +| - global_facts | 1|none | 0|acc |↑ | 0.4200|± |0.0496| +| - human_aging | 1|none | 0|acc |↑ | 0.6726|± |0.0315| +| - management | 1|none | 0|acc |↑ | 0.8350|± |0.0368| +| - marketing | 1|none | 0|acc |↑ | 0.8932|± |0.0202| +| - medical_genetics | 1|none | 0|acc |↑ | 0.8100|± |0.0394| +| - miscellaneous | 1|none | 0|acc |↑ | 0.8059|± |0.0141| +| - nutrition | 1|none | 0|acc |↑ | 0.7320|± |0.0254| +| - professional_accounting | 1|none | 0|acc |↑ | 0.5532|± |0.0297| +| - professional_medicine | 1|none | 0|acc |↑ | 0.7500|± |0.0263| +| - virology | 1|none | 0|acc |↑ | 0.5120|± |0.0389| +| - social sciences | 2|none | |acc |↑ | 0.7413|± |0.0078| +| - econometrics | 1|none | 0|acc |↑ | 0.5526|± |0.0468| +| - high_school_geography | 1|none | 0|acc |↑ | 0.7778|± |0.0296| +| - high_school_government_and_politics | 1|none | 0|acc |↑ | 0.8756|± |0.0238| +| - high_school_macroeconomics | 1|none | 0|acc |↑ | 0.6513|± |0.0242| +| - high_school_microeconomics | 1|none | 0|acc |↑ | 0.7143|± |0.0293| +| - high_school_psychology | 1|none | 0|acc |↑ | 0.8239|± |0.0163| +| - human_sexuality | 1|none | 0|acc |↑ | 0.7557|± |0.0377| +| - professional_psychology | 1|none | 0|acc |↑ | 0.6699|± |0.0190| +| - public_relations | 1|none | 0|acc |↑ | 0.6909|± |0.0443| +| - security_studies | 1|none | 0|acc |↑ | 0.7388|± |0.0281| +| - sociology | 1|none | 0|acc |↑ | 0.8557|± |0.0248| +| - us_foreign_policy | 1|none | 0|acc |↑ | 0.8400|± |0.0368| +| - stem | 2|none | |acc |↑ | 0.5439|± |0.0086| +| - abstract_algebra | 1|none | 0|acc |↑ | 0.3100|± |0.0465| +| - anatomy | 1|none | 0|acc |↑ | 0.6370|± |0.0415| +| - astronomy | 1|none | 0|acc |↑ | 0.6908|± |0.0376| +| - college_biology | 1|none | 0|acc |↑ | 0.7431|± |0.0365| +| - college_chemistry | 1|none | 0|acc |↑ | 0.4100|± |0.0494| +| - college_computer_science | 1|none | 0|acc |↑ | 0.5100|± |0.0502| +| - college_mathematics | 1|none | 0|acc |↑ | 0.3300|± |0.0473| +| - college_physics | 1|none | 0|acc |↑ | 0.4902|± |0.0497| +| - computer_security | 1|none | 0|acc |↑ | 0.7700|± |0.0423| +| - conceptual_physics | 1|none | 0|acc |↑ | 0.5489|± |0.0325| +| - electrical_engineering | 1|none | 0|acc |↑ | 0.6345|± |0.0401| +| - elementary_mathematics | 1|none | 0|acc |↑ | 0.4577|± |0.0257| +| - high_school_biology | 1|none | 0|acc |↑ | 0.7677|± |0.0240| +| - high_school_chemistry | 1|none | 0|acc |↑ | 0.4828|± |0.0352| +| - high_school_computer_science | 1|none | 0|acc |↑ | 0.6800|± |0.0469| +| - high_school_mathematics | 1|none | 0|acc |↑ | 0.3778|± |0.0296| +| - high_school_physics | 1|none | 0|acc |↑ | 0.4437|± |0.0406| +| - high_school_statistics | 1|none | 0|acc |↑ | 0.5185|± |0.0341| +| - machine_learning | 1|none | 0|acc |↑ | 0.4911|± |0.0475| +|nq_open | 4|remove_whitespace| 0|exact_match|↑ | 0.1590|± |0.0061| +|openbookqa | 1|none | 0|acc |↑ | 0.3400|± |0.0212| +| | |none | 0|acc_norm |↑ | 0.4300|± |0.0222| +|piqa | 1|none | 0|acc |↑ | 0.7824|± |0.0096| +| | |none | 0|acc_norm |↑ | 0.7873|± |0.0095| +|qnli | 1|none | 0|acc |↑ | 0.5464|± |0.0067| +|sciq | 1|none | 0|acc |↑ | 0.9630|± |0.0060| +| | |none | 0|acc_norm |↑ | 0.9320|± |0.0080| +|triviaqa | 3|remove_whitespace| 0|exact_match|↑ | 0.5112|± |0.0037| +|truthfulqa_gen | 3|none | 0|bleu_acc |↑ | 0.4761|± 
|0.0175| +| | |none | 0|bleu_diff |↑ |-0.1939|± |0.6341| +| | |none | 0|bleu_max |↑ |20.2147|± |0.7257| +| | |none | 0|rouge1_acc |↑ | 0.4957|± |0.0175| +| | |none | 0|rouge1_diff|↑ |-0.2355|± |0.8648| +| | |none | 0|rouge1_max |↑ |43.2820|± |0.8713| +| | |none | 0|rouge2_acc |↑ | 0.3684|± |0.0169| +| | |none | 0|rouge2_diff|↑ |-1.5024|± |0.9176| +| | |none | 0|rouge2_max |↑ |27.2640|± |0.9552| +| | |none | 0|rougeL_acc |↑ | 0.4798|± |0.0175| +| | |none | 0|rougeL_diff|↑ |-0.6690|± |0.8701| +| | |none | 0|rougeL_max |↑ |40.4168|± |0.8713| +|truthfulqa_mc1 | 2|none | 0|acc |↑ | 0.3635|± |0.0168| +|truthfulqa_mc2 | 3|none | 0|acc |↑ | 0.5171|± |0.0152| +|winogrande | 1|none | 0|acc |↑ | 0.7167|± |0.0127| + +| Groups |Version| Filter |n-shot| Metric | |Value | |Stderr| +|------------------|------:|----------|------|-----------|---|-----:|---|-----:| +|bbh | 3|get-answer| |exact_match|↑ |0.6790|± |0.0053| +|mmlu | 2|none | |acc |↑ |0.6387|± |0.0038| +| - humanities | 2|none | |acc |↑ |0.5824|± |0.0068| +| - other | 2|none | |acc |↑ |0.7187|± |0.0078| +| - social sciences| 2|none | |acc |↑ |0.7413|± |0.0078| +| - stem | 2|none | |acc |↑ |0.5439|± |0.0086| + +meta-llama_Meta-Llama-3-8B-Instruct: 6h 30m 49s +✅ Benchmark completed for meta-llama_Meta-Llama-3-8B-Instruct + +🔥 Starting benchmark for mistralai_Mistral-7B-Instruct-v0.3 +hf (pretrained=/mnt/data8tb/Documents/llm/llm_models/mistralai_Mistral-7B-Instruct-v0.3,trust_remote_code=True), gen_kwargs: (None), limit: None, num_fewshot: None, batch_size: 3 +| Tasks |Version| Filter |n-shot| Metric | | Value | |Stderr| +|----------------------------------------------------------|------:|-----------------|-----:|-----------|---|------:|---|-----:| +|anli_r1 | 1|none | 0|acc |↑ | 0.4760|± |0.0158| +|anli_r2 | 1|none | 0|acc |↑ | 0.4430|± |0.0157| +|anli_r3 | 1|none | 0|acc |↑ | 0.4483|± |0.0144| +|arc_challenge | 1|none | 0|acc |↑ | 0.5742|± |0.0144| +| | |none | 0|acc_norm |↑ | 0.5896|± |0.0144| +|bbh | 3|get-answer | |exact_match|↑ | 0.5626|± |0.0056| +| - bbh_cot_fewshot_boolean_expressions | 4|get-answer | 3|exact_match|↑ | 0.8520|± |0.0225| +| - bbh_cot_fewshot_causal_judgement | 4|get-answer | 3|exact_match|↑ | 0.5508|± |0.0365| +| - bbh_cot_fewshot_date_understanding | 4|get-answer | 3|exact_match|↑ | 0.6400|± |0.0304| +| - bbh_cot_fewshot_disambiguation_qa | 4|get-answer | 3|exact_match|↑ | 0.5960|± |0.0311| +| - bbh_cot_fewshot_dyck_languages | 4|get-answer | 3|exact_match|↑ | 0.0720|± |0.0164| +| - bbh_cot_fewshot_formal_fallacies | 4|get-answer | 3|exact_match|↑ | 0.5400|± |0.0316| +| - bbh_cot_fewshot_geometric_shapes | 4|get-answer | 3|exact_match|↑ | 0.4760|± |0.0316| +| - bbh_cot_fewshot_hyperbaton | 4|get-answer | 3|exact_match|↑ | 0.6160|± |0.0308| +| - bbh_cot_fewshot_logical_deduction_five_objects | 4|get-answer | 3|exact_match|↑ | 0.4800|± |0.0317| +| - bbh_cot_fewshot_logical_deduction_seven_objects | 4|get-answer | 3|exact_match|↑ | 0.4680|± |0.0316| +| - bbh_cot_fewshot_logical_deduction_three_objects | 4|get-answer | 3|exact_match|↑ | 0.8080|± |0.0250| +| - bbh_cot_fewshot_movie_recommendation | 4|get-answer | 3|exact_match|↑ | 0.7240|± |0.0283| +| - bbh_cot_fewshot_multistep_arithmetic_two | 4|get-answer | 3|exact_match|↑ | 0.2960|± |0.0289| +| - bbh_cot_fewshot_navigate | 4|get-answer | 3|exact_match|↑ | 0.6680|± |0.0298| +| - bbh_cot_fewshot_object_counting | 4|get-answer | 3|exact_match|↑ | 0.6160|± |0.0308| +| - bbh_cot_fewshot_penguins_in_a_table | 4|get-answer | 3|exact_match|↑ | 0.5959|± |0.0408| +| - 
bbh_cot_fewshot_reasoning_about_colored_objects | 4|get-answer | 3|exact_match|↑ | 0.6880|± |0.0294| +| - bbh_cot_fewshot_ruin_names | 4|get-answer | 3|exact_match|↑ | 0.4400|± |0.0315| +| - bbh_cot_fewshot_salient_translation_error_detection | 4|get-answer | 3|exact_match|↑ | 0.4280|± |0.0314| +| - bbh_cot_fewshot_snarks | 4|get-answer | 3|exact_match|↑ | 0.7191|± |0.0338| +| - bbh_cot_fewshot_sports_understanding | 4|get-answer | 3|exact_match|↑ | 0.9200|± |0.0172| +| - bbh_cot_fewshot_temporal_sequences | 4|get-answer | 3|exact_match|↑ | 0.5240|± |0.0316| +| - bbh_cot_fewshot_tracking_shuffled_objects_five_objects | 4|get-answer | 3|exact_match|↑ | 0.4240|± |0.0313| +| - bbh_cot_fewshot_tracking_shuffled_objects_seven_objects| 4|get-answer | 3|exact_match|↑ | 0.3000|± |0.0290| +| - bbh_cot_fewshot_tracking_shuffled_objects_three_objects| 4|get-answer | 3|exact_match|↑ | 0.5840|± |0.0312| +| - bbh_cot_fewshot_web_of_lies | 4|get-answer | 3|exact_match|↑ | 1.0000|± |0.0000| +| - bbh_cot_fewshot_word_sorting | 4|get-answer | 3|exact_match|↑ | 0.2200|± |0.0263| +|boolq | 2|none | 0|acc |↑ | 0.8584|± |0.0061| +|drop | 3|none | 0|em |↑ | 0.0094|± |0.0010| +| | |none | 0|f1 |↑ | 0.0900|± |0.0018| +|gpqa_diamond_cot_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.1162|± |0.0228| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_diamond_cot_zeroshot | 1|flexible-extract | 0|exact_match|↑ | 0.1616|± |0.0262| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_diamond_generative_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.2172|± |0.0294| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_diamond_n_shot | 2|none | 0|acc |↑ | 0.3283|± |0.0335| +| | |none | 0|acc_norm |↑ | 0.3283|± |0.0335| +|gpqa_diamond_zeroshot | 1|none | 0|acc |↑ | 0.2677|± |0.0315| +| | |none | 0|acc_norm |↑ | 0.2677|± |0.0315| +|gpqa_extended_cot_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.1777|± |0.0164| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_extended_cot_zeroshot | 1|flexible-extract | 0|exact_match|↑ | 0.1447|± |0.0151| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_extended_generative_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.2582|± |0.0187| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_extended_n_shot | 2|none | 0|acc |↑ | 0.2894|± |0.0194| +| | |none | 0|acc_norm |↑ | 0.2894|± |0.0194| +|gpqa_extended_zeroshot | 1|none | 0|acc |↑ | 0.3480|± |0.0204| +| | |none | 0|acc_norm |↑ | 0.3480|± |0.0204| +|gpqa_main_cot_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.1585|± |0.0173| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_main_cot_zeroshot | 1|flexible-extract | 0|exact_match|↑ | 0.1518|± |0.0170| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_main_generative_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.2790|± |0.0212| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_main_n_shot | 2|none | 0|acc |↑ | 0.3371|± |0.0224| +| | |none | 0|acc_norm |↑ | 0.3371|± |0.0224| +|gpqa_main_zeroshot | 1|none | 0|acc |↑ | 0.2835|± |0.0213| +| | |none | 0|acc_norm |↑ | 0.2835|± |0.0213| +|gsm8k | 3|flexible-extract | 5|exact_match|↑ | 0.4913|± |0.0138| +| | |strict-match | 5|exact_match|↑ | 0.4898|± |0.0138| +|hellaswag | 1|none | 0|acc |↑ | 0.6484|± |0.0048| +| | |none | 0|acc_norm |↑ | 0.8289|± |0.0038| +|mmlu | 2|none | |acc |↑ | 0.5971|± |0.0039| +| - humanities | 2|none | |acc |↑ | 0.5420|± |0.0067| +| - formal_logic | 1|none | 0|acc |↑ | 0.4365|± |0.0444| +| - 
high_school_european_history | 1|none | 0|acc |↑ | 0.7394|± |0.0343| +| - high_school_us_history | 1|none | 0|acc |↑ | 0.8039|± |0.0279| +| - high_school_world_history | 1|none | 0|acc |↑ | 0.7722|± |0.0273| +| - international_law | 1|none | 0|acc |↑ | 0.7686|± |0.0385| +| - jurisprudence | 1|none | 0|acc |↑ | 0.7593|± |0.0413| +| - logical_fallacies | 1|none | 0|acc |↑ | 0.7607|± |0.0335| +| - moral_disputes | 1|none | 0|acc |↑ | 0.6763|± |0.0252| +| - moral_scenarios | 1|none | 0|acc |↑ | 0.2670|± |0.0148| +| - philosophy | 1|none | 0|acc |↑ | 0.6624|± |0.0269| +| - prehistory | 1|none | 0|acc |↑ | 0.6883|± |0.0258| +| - professional_law | 1|none | 0|acc |↑ | 0.4492|± |0.0127| +| - world_religions | 1|none | 0|acc |↑ | 0.7953|± |0.0309| +| - other | 2|none | |acc |↑ | 0.6720|± |0.0081| +| - business_ethics | 1|none | 0|acc |↑ | 0.6000|± |0.0492| +| - clinical_knowledge | 1|none | 0|acc |↑ | 0.6906|± |0.0285| +| - college_medicine | 1|none | 0|acc |↑ | 0.5607|± |0.0378| +| - global_facts | 1|none | 0|acc |↑ | 0.3900|± |0.0490| +| - human_aging | 1|none | 0|acc |↑ | 0.6368|± |0.0323| +| - management | 1|none | 0|acc |↑ | 0.7961|± |0.0399| +| - marketing | 1|none | 0|acc |↑ | 0.8718|± |0.0219| +| - medical_genetics | 1|none | 0|acc |↑ | 0.7000|± |0.0461| +| - miscellaneous | 1|none | 0|acc |↑ | 0.7816|± |0.0148| +| - nutrition | 1|none | 0|acc |↑ | 0.6634|± |0.0271| +| - professional_accounting | 1|none | 0|acc |↑ | 0.4645|± |0.0298| +| - professional_medicine | 1|none | 0|acc |↑ | 0.6654|± |0.0287| +| - virology | 1|none | 0|acc |↑ | 0.5060|± |0.0389| +| - social sciences | 2|none | |acc |↑ | 0.7000|± |0.0080| +| - econometrics | 1|none | 0|acc |↑ | 0.4737|± |0.0470| +| - high_school_geography | 1|none | 0|acc |↑ | 0.7525|± |0.0307| +| - high_school_government_and_politics | 1|none | 0|acc |↑ | 0.8705|± |0.0242| +| - high_school_macroeconomics | 1|none | 0|acc |↑ | 0.5795|± |0.0250| +| - high_school_microeconomics | 1|none | 0|acc |↑ | 0.6050|± |0.0318| +| - high_school_psychology | 1|none | 0|acc |↑ | 0.8092|± |0.0168| +| - human_sexuality | 1|none | 0|acc |↑ | 0.6947|± |0.0404| +| - professional_psychology | 1|none | 0|acc |↑ | 0.6242|± |0.0196| +| - public_relations | 1|none | 0|acc |↑ | 0.6455|± |0.0458| +| - security_studies | 1|none | 0|acc |↑ | 0.7061|± |0.0292| +| - sociology | 1|none | 0|acc |↑ | 0.8458|± |0.0255| +| - us_foreign_policy | 1|none | 0|acc |↑ | 0.8500|± |0.0359| +| - stem | 2|none | |acc |↑ | 0.5052|± |0.0086| +| - abstract_algebra | 1|none | 0|acc |↑ | 0.2700|± |0.0446| +| - anatomy | 1|none | 0|acc |↑ | 0.5926|± |0.0424| +| - astronomy | 1|none | 0|acc |↑ | 0.6382|± |0.0391| +| - college_biology | 1|none | 0|acc |↑ | 0.7292|± |0.0372| +| - college_chemistry | 1|none | 0|acc |↑ | 0.4800|± |0.0502| +| - college_computer_science | 1|none | 0|acc |↑ | 0.5000|± |0.0503| +| - college_mathematics | 1|none | 0|acc |↑ | 0.3500|± |0.0479| +| - college_physics | 1|none | 0|acc |↑ | 0.4608|± |0.0496| +| - computer_security | 1|none | 0|acc |↑ | 0.6600|± |0.0476| +| - conceptual_physics | 1|none | 0|acc |↑ | 0.5234|± |0.0327| +| - electrical_engineering | 1|none | 0|acc |↑ | 0.5655|± |0.0413| +| - elementary_mathematics | 1|none | 0|acc |↑ | 0.3704|± |0.0249| +| - high_school_biology | 1|none | 0|acc |↑ | 0.7323|± |0.0252| +| - high_school_chemistry | 1|none | 0|acc |↑ | 0.5123|± |0.0352| +| - high_school_computer_science | 1|none | 0|acc |↑ | 0.6400|± |0.0482| +| - high_school_mathematics | 1|none | 0|acc |↑ | 0.3370|± |0.0288| +| - high_school_physics | 1|none | 0|acc |↑ | 
0.2980|± |0.0373| +| - high_school_statistics | 1|none | 0|acc |↑ | 0.4676|± |0.0340| +| - machine_learning | 1|none | 0|acc |↑ | 0.5446|± |0.0473| +|nq_open | 4|remove_whitespace| 0|exact_match|↑ | 0.1537|± |0.0060| +|openbookqa | 1|none | 0|acc |↑ | 0.3540|± |0.0214| +| | |none | 0|acc_norm |↑ | 0.4700|± |0.0223| +|piqa | 1|none | 0|acc |↑ | 0.8156|± |0.0090| +| | |none | 0|acc_norm |↑ | 0.8270|± |0.0088| +|qnli | 1|none | 0|acc |↑ | 0.5146|± |0.0068| +|sciq | 1|none | 0|acc |↑ | 0.9600|± |0.0062| +| | |none | 0|acc_norm |↑ | 0.9430|± |0.0073| +|triviaqa | 3|remove_whitespace| 0|exact_match|↑ | 0.5683|± |0.0037| +|truthfulqa_gen | 3|none | 0|bleu_acc |↑ | 0.5643|± |0.0174| +| | |none | 0|bleu_diff |↑ | 8.1688|± |0.8460| +| | |none | 0|bleu_max |↑ |27.6629|± |0.8109| +| | |none | 0|rouge1_acc |↑ | 0.5716|± |0.0173| +| | |none | 0|rouge1_diff|↑ |12.0899|± |1.1860| +| | |none | 0|rouge1_max |↑ |54.8010|± |0.8641| +| | |none | 0|rouge2_acc |↑ | 0.5202|± |0.0175| +| | |none | 0|rouge2_diff|↑ |12.2282|± |1.2604| +| | |none | 0|rouge2_max |↑ |41.2658|± |1.0220| +| | |none | 0|rougeL_acc |↑ | 0.5692|± |0.0173| +| | |none | 0|rougeL_diff|↑ |11.8949|± |1.1929| +| | |none | 0|rougeL_max |↑ |51.6072|± |0.8958| +|truthfulqa_mc1 | 2|none | 0|acc |↑ | 0.4211|± |0.0173| +|truthfulqa_mc2 | 3|none | 0|acc |↑ | 0.5968|± |0.0155| +|winogrande | 1|none | 0|acc |↑ | 0.7403|± |0.0123| + +| Groups |Version| Filter |n-shot| Metric | |Value | |Stderr| +|------------------|------:|----------|------|-----------|---|-----:|---|-----:| +|bbh | 3|get-answer| |exact_match|↑ |0.5626|± |0.0056| +|mmlu | 2|none | |acc |↑ |0.5971|± |0.0039| +| - humanities | 2|none | |acc |↑ |0.5420|± |0.0067| +| - other | 2|none | |acc |↑ |0.6720|± |0.0081| +| - social sciences| 2|none | |acc |↑ |0.7000|± |0.0080| +| - stem | 2|none | |acc |↑ |0.5052|± |0.0086| + +mistralai_Mistral-7B-Instruct-v0.3: 8h 38m 15s +✅ Benchmark completed for mistralai_Mistral-7B-Instruct-v0.3 + +🔥 Starting benchmark for openchat_openchat-3.6-8b-20240522 +🔥 Starting benchmark for internlm_internlm2_5-7b-chat +🔥 Starting benchmark for THUDM_chatglm3-6b +🔥 Starting benchmark for NousResearch_Hermes-2-Pro-Mistral-7B +🔥 Starting benchmark for Qwen_Qwen-1_8B-Chat +Qwen_Qwen-1_8B-Chat: 0h 18m 23s +✅ Benchmark completed for Qwen_Qwen-1_8B-Chat + +🔥 Starting benchmark for Qwen_Qwen-1_8B +Qwen_Qwen-1_8B: 0h 18m 19s +✅ Benchmark completed for Qwen_Qwen-1_8B + +🔥 Starting benchmark for deepseek-ai_DeepSeek-R1-Distill-Qwen-7B +hf (pretrained=/mnt/data8tb/Documents/llm/llm_models/deepseek-ai_DeepSeek-R1-Distill-Qwen-7B,trust_remote_code=True), gen_kwargs: (None), limit: None, num_fewshot: None, batch_size: 3 +| Tasks |Version| Filter |n-shot| Metric | | Value | |Stderr| +|----------------------------------------------------------|------:|-----------------|-----:|-----------|---|------:|---|-----:| +|anli_r1 | 1|none | 0|acc |↑ | 0.4450|± |0.0157| +|anli_r2 | 1|none | 0|acc |↑ | 0.4180|± |0.0156| +|anli_r3 | 1|none | 0|acc |↑ | 0.4100|± |0.0142| +|arc_challenge | 1|none | 0|acc |↑ | 0.4215|± |0.0144| +| | |none | 0|acc_norm |↑ | 0.4377|± |0.0145| +|bbh | 3|get-answer | |exact_match|↑ | 0.5569|± |0.0050| +| - bbh_cot_fewshot_boolean_expressions | 4|get-answer | 3|exact_match|↑ | 0.9600|± |0.0124| +| - bbh_cot_fewshot_causal_judgement | 4|get-answer | 3|exact_match|↑ | 0.5187|± |0.0366| +| - bbh_cot_fewshot_date_understanding | 4|get-answer | 3|exact_match|↑ | 0.6760|± |0.0297| +| - bbh_cot_fewshot_disambiguation_qa | 4|get-answer | 3|exact_match|↑ | 0.5320|± |0.0316| +| - 
bbh_cot_fewshot_dyck_languages | 4|get-answer | 3|exact_match|↑ | 0.2640|± |0.0279| +| - bbh_cot_fewshot_formal_fallacies | 4|get-answer | 3|exact_match|↑ | 0.2720|± |0.0282| +| - bbh_cot_fewshot_geometric_shapes | 4|get-answer | 3|exact_match|↑ | 0.3480|± |0.0302| +| - bbh_cot_fewshot_hyperbaton | 4|get-answer | 3|exact_match|↑ | 0.8400|± |0.0232| +| - bbh_cot_fewshot_logical_deduction_five_objects | 4|get-answer | 3|exact_match|↑ | 0.1800|± |0.0243| +| - bbh_cot_fewshot_logical_deduction_seven_objects | 4|get-answer | 3|exact_match|↑ | 0.0200|± |0.0089| +| - bbh_cot_fewshot_logical_deduction_three_objects | 4|get-answer | 3|exact_match|↑ | 0.8040|± |0.0252| +| - bbh_cot_fewshot_movie_recommendation | 4|get-answer | 3|exact_match|↑ | 0.5000|± |0.0317| +| - bbh_cot_fewshot_multistep_arithmetic_two | 4|get-answer | 3|exact_match|↑ | 0.9520|± |0.0135| +| - bbh_cot_fewshot_navigate | 4|get-answer | 3|exact_match|↑ | 0.9040|± |0.0187| +| - bbh_cot_fewshot_object_counting | 4|get-answer | 3|exact_match|↑ | 0.8320|± |0.0237| +| - bbh_cot_fewshot_penguins_in_a_table | 4|get-answer | 3|exact_match|↑ | 0.7603|± |0.0355| +| - bbh_cot_fewshot_reasoning_about_colored_objects | 4|get-answer | 3|exact_match|↑ | 0.6960|± |0.0292| +| - bbh_cot_fewshot_ruin_names | 4|get-answer | 3|exact_match|↑ | 0.5120|± |0.0317| +| - bbh_cot_fewshot_salient_translation_error_detection | 4|get-answer | 3|exact_match|↑ | 0.3760|± |0.0307| +| - bbh_cot_fewshot_snarks | 4|get-answer | 3|exact_match|↑ | 0.5506|± |0.0374| +| - bbh_cot_fewshot_sports_understanding | 4|get-answer | 3|exact_match|↑ | 0.7480|± |0.0275| +| - bbh_cot_fewshot_temporal_sequences | 4|get-answer | 3|exact_match|↑ | 0.3760|± |0.0307| +| - bbh_cot_fewshot_tracking_shuffled_objects_five_objects | 4|get-answer | 3|exact_match|↑ | 0.4360|± |0.0314| +| - bbh_cot_fewshot_tracking_shuffled_objects_seven_objects| 4|get-answer | 3|exact_match|↑ | 0.0040|± |0.0040| +| - bbh_cot_fewshot_tracking_shuffled_objects_three_objects| 4|get-answer | 3|exact_match|↑ | 0.8080|± |0.0250| +| - bbh_cot_fewshot_web_of_lies | 4|get-answer | 3|exact_match|↑ | 1.0000|± |0.0000| +| - bbh_cot_fewshot_word_sorting | 4|get-answer | 3|exact_match|↑ | 0.2400|± |0.0271| +|boolq | 2|none | 0|acc |↑ | 0.7783|± |0.0073| +|drop | 3|none | 0|em |↑ | 0.0023|± |0.0005| +| | |none | 0|f1 |↑ | 0.0412|± |0.0011| +|gpqa_diamond_cot_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.0758|± |0.0189| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_diamond_cot_zeroshot | 1|flexible-extract | 0|exact_match|↑ | 0.0404|± |0.0140| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_diamond_generative_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.1667|± |0.0266| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_diamond_n_shot | 2|none | 0|acc |↑ | 0.2677|± |0.0315| +| | |none | 0|acc_norm |↑ | 0.2677|± |0.0315| +|gpqa_diamond_zeroshot | 1|none | 0|acc |↑ | 0.2778|± |0.0319| +| | |none | 0|acc_norm |↑ | 0.2778|± |0.0319| +|gpqa_extended_cot_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.1007|± |0.0129| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_extended_cot_zeroshot | 1|flexible-extract | 0|exact_match|↑ | 0.0934|± |0.0125| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_extended_generative_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.1758|± |0.0163| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_extended_n_shot | 2|none | 0|acc |↑ | 0.2839|± |0.0193| +| | |none | 0|acc_norm |↑ | 0.2839|± |0.0193| 
+|gpqa_extended_zeroshot | 1|none | 0|acc |↑ | 0.3480|± |0.0204| +| | |none | 0|acc_norm |↑ | 0.3480|± |0.0204| +|gpqa_main_cot_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.0848|± |0.0132| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_main_cot_zeroshot | 1|flexible-extract | 0|exact_match|↑ | 0.0759|± |0.0125| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_main_generative_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.1473|± |0.0168| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_main_n_shot | 2|none | 0|acc |↑ | 0.2634|± |0.0208| +| | |none | 0|acc_norm |↑ | 0.2634|± |0.0208| +|gpqa_main_zeroshot | 1|none | 0|acc |↑ | 0.3348|± |0.0223| +| | |none | 0|acc_norm |↑ | 0.3348|± |0.0223| +|gsm8k | 3|flexible-extract | 5|exact_match|↑ | 0.7998|± |0.0110| +| | |strict-match | 5|exact_match|↑ | 0.7862|± |0.0113| +|hellaswag | 1|none | 0|acc |↑ | 0.4627|± |0.0050| +| | |none | 0|acc_norm |↑ | 0.6026|± |0.0049| +|mmlu | 2|none | |acc |↑ | 0.5263|± |0.0041| +| - humanities | 2|none | |acc |↑ | 0.4406|± |0.0070| +| - formal_logic | 1|none | 0|acc |↑ | 0.4921|± |0.0447| +| - high_school_european_history | 1|none | 0|acc |↑ | 0.6121|± |0.0380| +| - high_school_us_history | 1|none | 0|acc |↑ | 0.6176|± |0.0341| +| - high_school_world_history | 1|none | 0|acc |↑ | 0.6203|± |0.0316| +| - international_law | 1|none | 0|acc |↑ | 0.6116|± |0.0445| +| - jurisprudence | 1|none | 0|acc |↑ | 0.6481|± |0.0462| +| - logical_fallacies | 1|none | 0|acc |↑ | 0.6319|± |0.0379| +| - moral_disputes | 1|none | 0|acc |↑ | 0.5578|± |0.0267| +| - moral_scenarios | 1|none | 0|acc |↑ | 0.2581|± |0.0146| +| - philosophy | 1|none | 0|acc |↑ | 0.5498|± |0.0283| +| - prehistory | 1|none | 0|acc |↑ | 0.4599|± |0.0277| +| - professional_law | 1|none | 0|acc |↑ | 0.3585|± |0.0122| +| - world_religions | 1|none | 0|acc |↑ | 0.5614|± |0.0381| +| - other | 2|none | |acc |↑ | 0.5391|± |0.0087| +| - business_ethics | 1|none | 0|acc |↑ | 0.5700|± |0.0498| +| - clinical_knowledge | 1|none | 0|acc |↑ | 0.5358|± |0.0307| +| - college_medicine | 1|none | 0|acc |↑ | 0.4971|± |0.0381| +| - global_facts | 1|none | 0|acc |↑ | 0.3400|± |0.0476| +| - human_aging | 1|none | 0|acc |↑ | 0.5426|± |0.0334| +| - management | 1|none | 0|acc |↑ | 0.6796|± |0.0462| +| - marketing | 1|none | 0|acc |↑ | 0.7479|± |0.0284| +| - medical_genetics | 1|none | 0|acc |↑ | 0.5300|± |0.0502| +| - miscellaneous | 1|none | 0|acc |↑ | 0.6309|± |0.0173| +| - nutrition | 1|none | 0|acc |↑ | 0.5359|± |0.0286| +| - professional_accounting | 1|none | 0|acc |↑ | 0.3936|± |0.0291| +| - professional_medicine | 1|none | 0|acc |↑ | 0.3713|± |0.0293| +| - virology | 1|none | 0|acc |↑ | 0.4036|± |0.0382| +| - social sciences | 2|none | |acc |↑ | 0.6123|± |0.0087| +| - econometrics | 1|none | 0|acc |↑ | 0.5702|± |0.0466| +| - high_school_geography | 1|none | 0|acc |↑ | 0.6212|± |0.0346| +| - high_school_government_and_politics | 1|none | 0|acc |↑ | 0.6839|± |0.0336| +| - high_school_macroeconomics | 1|none | 0|acc |↑ | 0.5974|± |0.0249| +| - high_school_microeconomics | 1|none | 0|acc |↑ | 0.7185|± |0.0292| +| - high_school_psychology | 1|none | 0|acc |↑ | 0.6550|± |0.0204| +| - human_sexuality | 1|none | 0|acc |↑ | 0.5420|± |0.0437| +| - professional_psychology | 1|none | 0|acc |↑ | 0.5033|± |0.0202| +| - public_relations | 1|none | 0|acc |↑ | 0.5818|± |0.0472| +| - security_studies | 1|none | 0|acc |↑ | 0.5918|± |0.0315| +| - sociology | 1|none | 0|acc |↑ | 0.7214|± |0.0317| +| - us_foreign_policy | 1|none | 0|acc |↑ | 
0.7000|± |0.0461| +| - stem | 2|none | |acc |↑ | 0.5579|± |0.0087| +| - abstract_algebra | 1|none | 0|acc |↑ | 0.4600|± |0.0501| +| - anatomy | 1|none | 0|acc |↑ | 0.4444|± |0.0429| +| - astronomy | 1|none | 0|acc |↑ | 0.5855|± |0.0401| +| - college_biology | 1|none | 0|acc |↑ | 0.5208|± |0.0418| +| - college_chemistry | 1|none | 0|acc |↑ | 0.4700|± |0.0502| +| - college_computer_science | 1|none | 0|acc |↑ | 0.5800|± |0.0496| +| - college_mathematics | 1|none | 0|acc |↑ | 0.4300|± |0.0498| +| - college_physics | 1|none | 0|acc |↑ | 0.4020|± |0.0488| +| - computer_security | 1|none | 0|acc |↑ | 0.6900|± |0.0465| +| - conceptual_physics | 1|none | 0|acc |↑ | 0.6936|± |0.0301| +| - electrical_engineering | 1|none | 0|acc |↑ | 0.5931|± |0.0409| +| - elementary_mathematics | 1|none | 0|acc |↑ | 0.6058|± |0.0252| +| - high_school_biology | 1|none | 0|acc |↑ | 0.6323|± |0.0274| +| - high_school_chemistry | 1|none | 0|acc |↑ | 0.4975|± |0.0352| +| - high_school_computer_science | 1|none | 0|acc |↑ | 0.7300|± |0.0446| +| - high_school_mathematics | 1|none | 0|acc |↑ | 0.4556|± |0.0304| +| - high_school_physics | 1|none | 0|acc |↑ | 0.4901|± |0.0408| +| - high_school_statistics | 1|none | 0|acc |↑ | 0.6389|± |0.0328| +| - machine_learning | 1|none | 0|acc |↑ | 0.4286|± |0.0470| +|nq_open | 4|remove_whitespace| 0|exact_match|↑ | 0.0321|± |0.0029| +|openbookqa | 1|none | 0|acc |↑ | 0.2620|± |0.0197| +| | |none | 0|acc_norm |↑ | 0.3600|± |0.0215| +|piqa | 1|none | 0|acc |↑ | 0.7067|± |0.0106| +| | |none | 0|acc_norm |↑ | 0.7165|± |0.0105| +|qnli | 1|none | 0|acc |↑ | 0.5210|± |0.0068| +|sciq | 1|none | 0|acc |↑ | 0.9360|± |0.0077| +| | |none | 0|acc_norm |↑ | 0.9180|± |0.0087| +|triviaqa | 3|remove_whitespace| 0|exact_match|↑ | 0.0592|± |0.0018| +|truthfulqa_gen | 3|none | 0|bleu_acc |↑ | 0.3843|± |0.0170| +| | |none | 0|bleu_diff |↑ |-0.3986|± |0.4145| +| | |none | 0|bleu_max |↑ |12.1556|± |0.4381| +| | |none | 0|rouge1_acc |↑ | 0.4076|± |0.0172| +| | |none | 0|rouge1_diff|↑ |-0.5348|± |0.7118| +| | |none | 0|rouge1_max |↑ |34.2455|± |0.7048| +| | |none | 0|rouge2_acc |↑ | 0.2925|± |0.0159| +| | |none | 0|rouge2_diff|↑ |-1.4234|± |0.7703| +| | |none | 0|rouge2_max |↑ |20.4502|± |0.7267| +| | |none | 0|rougeL_acc |↑ | 0.3978|± |0.0171| +| | |none | 0|rougeL_diff|↑ |-0.5833|± |0.7113| +| | |none | 0|rougeL_max |↑ |32.3124|± |0.7036| +|truthfulqa_mc1 | 2|none | 0|acc |↑ | 0.2889|± |0.0159| +|truthfulqa_mc2 | 3|none | 0|acc |↑ | 0.4563|± |0.0154| +|winogrande | 1|none | 0|acc |↑ | 0.5991|± |0.0138| + +| Groups |Version| Filter |n-shot| Metric | |Value | |Stderr| +|------------------|------:|----------|------|-----------|---|-----:|---|-----:| +|bbh | 3|get-answer| |exact_match|↑ |0.5569|± |0.0050| +|mmlu | 2|none | |acc |↑ |0.5263|± |0.0041| +| - humanities | 2|none | |acc |↑ |0.4406|± |0.0070| +| - other | 2|none | |acc |↑ |0.5391|± |0.0087| +| - social sciences| 2|none | |acc |↑ |0.6123|± |0.0087| +| - stem | 2|none | |acc |↑ |0.5579|± |0.0087| + +deepseek-ai_DeepSeek-R1-Distill-Qwen-7B: 6h 28m 41s +✅ Benchmark completed for deepseek-ai_DeepSeek-R1-Distill-Qwen-7B + +🔥 Starting benchmark for deepseek-ai_deepseek-math-7b-rl +hf (pretrained=/mnt/data8tb/Documents/llm/llm_models/deepseek-ai_deepseek-math-7b-rl,trust_remote_code=True), gen_kwargs: (None), limit: None, num_fewshot: None, batch_size: 3 +| Tasks |Version| Filter |n-shot| Metric | | Value | |Stderr| +|----------------------------------------------------------|------:|-----------------|-----:|-----------|---|------:|---|-----:| +|anli_r1 | 
1|none | 0|acc |↑ | 0.3680|± |0.0153| +|anli_r2 | 1|none | 0|acc |↑ | 0.3890|± |0.0154| +|anli_r3 | 1|none | 0|acc |↑ | 0.4050|± |0.0142| +|arc_challenge | 1|none | 0|acc |↑ | 0.4795|± |0.0146| +| | |none | 0|acc_norm |↑ | 0.4898|± |0.0146| +|bbh | 3|get-answer | |exact_match|↑ | 0.5247|± |0.0054| +| - bbh_cot_fewshot_boolean_expressions | 4|get-answer | 3|exact_match|↑ | 0.9040|± |0.0187| +| - bbh_cot_fewshot_causal_judgement | 4|get-answer | 3|exact_match|↑ | 0.4439|± |0.0364| +| - bbh_cot_fewshot_date_understanding | 4|get-answer | 3|exact_match|↑ | 0.5240|± |0.0316| +| - bbh_cot_fewshot_disambiguation_qa | 4|get-answer | 3|exact_match|↑ | 0.5200|± |0.0317| +| - bbh_cot_fewshot_dyck_languages | 4|get-answer | 3|exact_match|↑ | 0.1840|± |0.0246| +| - bbh_cot_fewshot_formal_fallacies | 4|get-answer | 3|exact_match|↑ | 0.1680|± |0.0237| +| - bbh_cot_fewshot_geometric_shapes | 4|get-answer | 3|exact_match|↑ | 0.3200|± |0.0296| +| - bbh_cot_fewshot_hyperbaton | 4|get-answer | 3|exact_match|↑ | 0.7040|± |0.0289| +| - bbh_cot_fewshot_logical_deduction_five_objects | 4|get-answer | 3|exact_match|↑ | 0.1200|± |0.0206| +| - bbh_cot_fewshot_logical_deduction_seven_objects | 4|get-answer | 3|exact_match|↑ | 0.0600|± |0.0151| +| - bbh_cot_fewshot_logical_deduction_three_objects | 4|get-answer | 3|exact_match|↑ | 0.8080|± |0.0250| +| - bbh_cot_fewshot_movie_recommendation | 4|get-answer | 3|exact_match|↑ | 0.8040|± |0.0252| +| - bbh_cot_fewshot_multistep_arithmetic_two | 4|get-answer | 3|exact_match|↑ | 0.7880|± |0.0259| +| - bbh_cot_fewshot_navigate | 4|get-answer | 3|exact_match|↑ | 0.8960|± |0.0193| +| - bbh_cot_fewshot_object_counting | 4|get-answer | 3|exact_match|↑ | 0.6720|± |0.0298| +| - bbh_cot_fewshot_penguins_in_a_table | 4|get-answer | 3|exact_match|↑ | 0.3219|± |0.0388| +| - bbh_cot_fewshot_reasoning_about_colored_objects | 4|get-answer | 3|exact_match|↑ | 0.5000|± |0.0317| +| - bbh_cot_fewshot_ruin_names | 4|get-answer | 3|exact_match|↑ | 0.5360|± |0.0316| +| - bbh_cot_fewshot_salient_translation_error_detection | 4|get-answer | 3|exact_match|↑ | 0.4760|± |0.0316| +| - bbh_cot_fewshot_snarks | 4|get-answer | 3|exact_match|↑ | 0.4888|± |0.0376| +| - bbh_cot_fewshot_sports_understanding | 4|get-answer | 3|exact_match|↑ | 0.7000|± |0.0290| +| - bbh_cot_fewshot_temporal_sequences | 4|get-answer | 3|exact_match|↑ | 0.3040|± |0.0292| +| - bbh_cot_fewshot_tracking_shuffled_objects_five_objects | 4|get-answer | 3|exact_match|↑ | 0.5360|± |0.0316| +| - bbh_cot_fewshot_tracking_shuffled_objects_seven_objects| 4|get-answer | 3|exact_match|↑ | 0.4480|± |0.0315| +| - bbh_cot_fewshot_tracking_shuffled_objects_three_objects| 4|get-answer | 3|exact_match|↑ | 0.7640|± |0.0269| +| - bbh_cot_fewshot_web_of_lies | 4|get-answer | 3|exact_match|↑ | 0.7440|± |0.0277| +| - bbh_cot_fewshot_word_sorting | 4|get-answer | 3|exact_match|↑ | 0.3160|± |0.0295| +|boolq | 2|none | 0|acc |↑ | 0.7560|± |0.0075| +|drop | 3|none | 0|em |↑ | 0.0166|± |0.0013| +| | |none | 0|f1 |↑ | 0.1190|± |0.0021| +|gpqa_diamond_cot_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.2222|± |0.0296| +| | |strict-match | 0|exact_match|↑ | 0.0051|± |0.0051| +|gpqa_diamond_cot_zeroshot | 1|flexible-extract | 0|exact_match|↑ | 0.2222|± |0.0296| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_diamond_generative_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.1667|± |0.0266| +| | |strict-match | 0|exact_match|↑ | 0.0101|± |0.0071| +|gpqa_diamond_n_shot | 2|none | 0|acc |↑ | 0.3131|± |0.0330| +| | |none | 0|acc_norm |↑ | 
0.3131|± |0.0330| +|gpqa_diamond_zeroshot | 1|none | 0|acc |↑ | 0.2879|± |0.0323| +| | |none | 0|acc_norm |↑ | 0.2879|± |0.0323| +|gpqa_extended_cot_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.2234|± |0.0178| +| | |strict-match | 0|exact_match|↑ | 0.0018|± |0.0018| +|gpqa_extended_cot_zeroshot | 1|flexible-extract | 0|exact_match|↑ | 0.1960|± |0.0170| +| | |strict-match | 0|exact_match|↑ | 0.0110|± |0.0045| +|gpqa_extended_generative_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.1502|± |0.0153| +| | |strict-match | 0|exact_match|↑ | 0.0037|± |0.0026| +|gpqa_extended_n_shot | 2|none | 0|acc |↑ | 0.2857|± |0.0194| +| | |none | 0|acc_norm |↑ | 0.2857|± |0.0194| +|gpqa_extended_zeroshot | 1|none | 0|acc |↑ | 0.2949|± |0.0195| +| | |none | 0|acc_norm |↑ | 0.2949|± |0.0195| +|gpqa_main_cot_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.1786|± |0.0181| +| | |strict-match | 0|exact_match|↑ | 0.0022|± |0.0022| +|gpqa_main_cot_zeroshot | 1|flexible-extract | 0|exact_match|↑ | 0.1964|± |0.0188| +| | |strict-match | 0|exact_match|↑ | 0.0045|± |0.0032| +|gpqa_main_generative_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.1897|± |0.0185| +| | |strict-match | 0|exact_match|↑ | 0.0089|± |0.0044| +|gpqa_main_n_shot | 2|none | 0|acc |↑ | 0.3036|± |0.0217| +| | |none | 0|acc_norm |↑ | 0.3036|± |0.0217| +|gpqa_main_zeroshot | 1|none | 0|acc |↑ | 0.2723|± |0.0211| +| | |none | 0|acc_norm |↑ | 0.2723|± |0.0211| +|gsm8k | 3|flexible-extract | 5|exact_match|↑ | 0.1865|± |0.0107| +| | |strict-match | 5|exact_match|↑ | 0.1425|± |0.0096| +|hellaswag | 1|none | 0|acc |↑ | 0.5293|± |0.0050| +| | |none | 0|acc_norm |↑ | 0.6896|± |0.0046| +|mmlu | 2|none | |acc |↑ | 0.5250|± |0.0041| +| - humanities | 2|none | |acc |↑ | 0.4417|± |0.0070| +| - formal_logic | 1|none | 0|acc |↑ | 0.5000|± |0.0447| +| - high_school_european_history | 1|none | 0|acc |↑ | 0.6424|± |0.0374| +| - high_school_us_history | 1|none | 0|acc |↑ | 0.5294|± |0.0350| +| - high_school_world_history | 1|none | 0|acc |↑ | 0.6667|± |0.0307| +| - international_law | 1|none | 0|acc |↑ | 0.6116|± |0.0445| +| - jurisprudence | 1|none | 0|acc |↑ | 0.5463|± |0.0481| +| - logical_fallacies | 1|none | 0|acc |↑ | 0.6442|± |0.0376| +| - moral_disputes | 1|none | 0|acc |↑ | 0.5376|± |0.0268| +| - moral_scenarios | 1|none | 0|acc |↑ | 0.2492|± |0.0145| +| - philosophy | 1|none | 0|acc |↑ | 0.5466|± |0.0283| +| - prehistory | 1|none | 0|acc |↑ | 0.4877|± |0.0278| +| - professional_law | 1|none | 0|acc |↑ | 0.3722|± |0.0123| +| - world_religions | 1|none | 0|acc |↑ | 0.5673|± |0.0380| +| - other | 2|none | |acc |↑ | 0.5552|± |0.0087| +| - business_ethics | 1|none | 0|acc |↑ | 0.5200|± |0.0502| +| - clinical_knowledge | 1|none | 0|acc |↑ | 0.5396|± |0.0307| +| - college_medicine | 1|none | 0|acc |↑ | 0.5549|± |0.0379| +| - global_facts | 1|none | 0|acc |↑ | 0.2900|± |0.0456| +| - human_aging | 1|none | 0|acc |↑ | 0.5471|± |0.0334| +| - management | 1|none | 0|acc |↑ | 0.6990|± |0.0454| +| - marketing | 1|none | 0|acc |↑ | 0.7735|± |0.0274| +| - medical_genetics | 1|none | 0|acc |↑ | 0.5900|± |0.0494| +| - miscellaneous | 1|none | 0|acc |↑ | 0.6424|± |0.0171| +| - nutrition | 1|none | 0|acc |↑ | 0.5490|± |0.0285| +| - professional_accounting | 1|none | 0|acc |↑ | 0.3901|± |0.0291| +| - professional_medicine | 1|none | 0|acc |↑ | 0.4669|± |0.0303| +| - virology | 1|none | 0|acc |↑ | 0.3795|± |0.0378| +| - social sciences | 2|none | |acc |↑ | 0.6107|± |0.0087| +| - econometrics | 1|none | 0|acc |↑ | 0.4825|± |0.0470| +| - high_school_geography | 1|none | 
0|acc |↑ | 0.6616|± |0.0337| +| - high_school_government_and_politics | 1|none | 0|acc |↑ | 0.6632|± |0.0341| +| - high_school_macroeconomics | 1|none | 0|acc |↑ | 0.5718|± |0.0251| +| - high_school_microeconomics | 1|none | 0|acc |↑ | 0.6639|± |0.0307| +| - high_school_psychology | 1|none | 0|acc |↑ | 0.6899|± |0.0198| +| - human_sexuality | 1|none | 0|acc |↑ | 0.5878|± |0.0432| +| - professional_psychology | 1|none | 0|acc |↑ | 0.4902|± |0.0202| +| - public_relations | 1|none | 0|acc |↑ | 0.5909|± |0.0471| +| - security_studies | 1|none | 0|acc |↑ | 0.5918|± |0.0315| +| - sociology | 1|none | 0|acc |↑ | 0.7512|± |0.0306| +| - us_foreign_policy | 1|none | 0|acc |↑ | 0.7000|± |0.0461| +| - stem | 2|none | |acc |↑ | 0.5360|± |0.0088| +| - abstract_algebra | 1|none | 0|acc |↑ | 0.4000|± |0.0492| +| - anatomy | 1|none | 0|acc |↑ | 0.4667|± |0.0431| +| - astronomy | 1|none | 0|acc |↑ | 0.6382|± |0.0391| +| - college_biology | 1|none | 0|acc |↑ | 0.6042|± |0.0409| +| - college_chemistry | 1|none | 0|acc |↑ | 0.4700|± |0.0502| +| - college_computer_science | 1|none | 0|acc |↑ | 0.5100|± |0.0502| +| - college_mathematics | 1|none | 0|acc |↑ | 0.4200|± |0.0496| +| - college_physics | 1|none | 0|acc |↑ | 0.4020|± |0.0488| +| - computer_security | 1|none | 0|acc |↑ | 0.6500|± |0.0479| +| - conceptual_physics | 1|none | 0|acc |↑ | 0.5702|± |0.0324| +| - electrical_engineering | 1|none | 0|acc |↑ | 0.6069|± |0.0407| +| - elementary_mathematics | 1|none | 0|acc |↑ | 0.5608|± |0.0256| +| - high_school_biology | 1|none | 0|acc |↑ | 0.6387|± |0.0273| +| - high_school_chemistry | 1|none | 0|acc |↑ | 0.5172|± |0.0352| +| - high_school_computer_science | 1|none | 0|acc |↑ | 0.6900|± |0.0465| +| - high_school_mathematics | 1|none | 0|acc |↑ | 0.4185|± |0.0301| +| - high_school_physics | 1|none | 0|acc |↑ | 0.3709|± |0.0394| +| - high_school_statistics | 1|none | 0|acc |↑ | 0.5741|± |0.0337| +| - machine_learning | 1|none | 0|acc |↑ | 0.5179|± |0.0474| +|nq_open | 4|remove_whitespace| 0|exact_match|↑ | 0.0393|± |0.0032| +|openbookqa | 1|none | 0|acc |↑ | 0.3280|± |0.0210| +| | |none | 0|acc_norm |↑ | 0.4240|± |0.0221| +|piqa | 1|none | 0|acc |↑ | 0.7410|± |0.0102| +| | |none | 0|acc_norm |↑ | 0.7503|± |0.0101| +|qnli | 1|none | 0|acc |↑ | 0.4990|± |0.0068| +|sciq | 1|none | 0|acc |↑ | 0.9540|± |0.0066| +| | |none | 0|acc_norm |↑ | 0.9280|± |0.0082| +|triviaqa | 3|remove_whitespace| 0|exact_match|↑ | 0.1747|± |0.0028| +|truthfulqa_gen | 3|none | 0|bleu_acc |↑ | 0.3452|± |0.0166| +| | |none | 0|bleu_diff |↑ |-4.1463|± |0.6373| +| | |none | 0|bleu_max |↑ |21.0959|± |0.7171| +| | |none | 0|rouge1_acc |↑ | 0.3256|± |0.0164| +| | |none | 0|rouge1_diff|↑ |-6.5427|± |0.7175| +| | |none | 0|rouge1_max |↑ |44.3011|± |0.8363| +| | |none | 0|rouge2_acc |↑ | 0.2815|± |0.0157| +| | |none | 0|rouge2_diff|↑ |-6.6740|± |0.8391| +| | |none | 0|rouge2_max |↑ |29.9891|± |0.9108| +| | |none | 0|rougeL_acc |↑ | 0.3097|± |0.0162| +| | |none | 0|rougeL_diff|↑ |-6.8840|± |0.7254| +| | |none | 0|rougeL_max |↑ |41.2479|± |0.8408| +|truthfulqa_mc1 | 2|none | 0|acc |↑ | 0.2876|± |0.0158| +|truthfulqa_mc2 | 3|none | 0|acc |↑ | 0.4029|± |0.0153| +|winogrande | 1|none | 0|acc |↑ | 0.6511|± |0.0134| + +| Groups |Version| Filter |n-shot| Metric | |Value | |Stderr| +|------------------|------:|----------|------|-----------|---|-----:|---|-----:| +|bbh | 3|get-answer| |exact_match|↑ |0.5247|± |0.0054| +|mmlu | 2|none | |acc |↑ |0.5250|± |0.0041| +| - humanities | 2|none | |acc |↑ |0.4417|± |0.0070| +| - other | 2|none | |acc |↑ |0.5552|± 
|0.0087| +| - social sciences| 2|none | |acc |↑ |0.6107|± |0.0087| +| - stem | 2|none | |acc |↑ |0.5360|± |0.0088| + +deepseek-ai_deepseek-math-7b-rl: 8h 2m 14s +✅ Benchmark completed for deepseek-ai_deepseek-math-7b-rl + +🔥 Starting benchmark for deepseek-ai_deepseek-llm-7b-chat +hf (pretrained=/mnt/data8tb/Documents/llm/llm_models/deepseek-ai_deepseek-llm-7b-chat,trust_remote_code=True), gen_kwargs: (None), limit: None, num_fewshot: None, batch_size: 3 +| Tasks |Version| Filter |n-shot| Metric | | Value | |Stderr| +|----------------------------------------------------------|------:|-----------------|-----:|-----------|---|------:|---|-----:| +|anli_r1 | 1|none | 0|acc |↑ | 0.4230|± |0.0156| +|anli_r2 | 1|none | 0|acc |↑ | 0.4190|± |0.0156| +|anli_r3 | 1|none | 0|acc |↑ | 0.4208|± |0.0143| +|arc_challenge | 1|none | 0|acc |↑ | 0.4812|± |0.0146| +| | |none | 0|acc_norm |↑ | 0.4966|± |0.0146| +|bbh | 3|get-answer | |exact_match|↑ | 0.4548|± |0.0054| +| - bbh_cot_fewshot_boolean_expressions | 4|get-answer | 3|exact_match|↑ | 0.7320|± |0.0281| +| - bbh_cot_fewshot_causal_judgement | 4|get-answer | 3|exact_match|↑ | 0.5615|± |0.0364| +| - bbh_cot_fewshot_date_understanding | 4|get-answer | 3|exact_match|↑ | 0.5760|± |0.0313| +| - bbh_cot_fewshot_disambiguation_qa | 4|get-answer | 3|exact_match|↑ | 0.5800|± |0.0313| +| - bbh_cot_fewshot_dyck_languages | 4|get-answer | 3|exact_match|↑ | 0.0160|± |0.0080| +| - bbh_cot_fewshot_formal_fallacies | 4|get-answer | 3|exact_match|↑ | 0.5160|± |0.0317| +| - bbh_cot_fewshot_geometric_shapes | 4|get-answer | 3|exact_match|↑ | 0.3520|± |0.0303| +| - bbh_cot_fewshot_hyperbaton | 4|get-answer | 3|exact_match|↑ | 0.7080|± |0.0288| +| - bbh_cot_fewshot_logical_deduction_five_objects | 4|get-answer | 3|exact_match|↑ | 0.2840|± |0.0286| +| - bbh_cot_fewshot_logical_deduction_seven_objects | 4|get-answer | 3|exact_match|↑ | 0.2160|± |0.0261| +| - bbh_cot_fewshot_logical_deduction_three_objects | 4|get-answer | 3|exact_match|↑ | 0.5920|± |0.0311| +| - bbh_cot_fewshot_movie_recommendation | 4|get-answer | 3|exact_match|↑ | 0.7680|± |0.0268| +| - bbh_cot_fewshot_multistep_arithmetic_two | 4|get-answer | 3|exact_match|↑ | 0.1000|± |0.0190| +| - bbh_cot_fewshot_navigate | 4|get-answer | 3|exact_match|↑ | 0.5920|± |0.0311| +| - bbh_cot_fewshot_object_counting | 4|get-answer | 3|exact_match|↑ | 0.5960|± |0.0311| +| - bbh_cot_fewshot_penguins_in_a_table | 4|get-answer | 3|exact_match|↑ | 0.4795|± |0.0415| +| - bbh_cot_fewshot_reasoning_about_colored_objects | 4|get-answer | 3|exact_match|↑ | 0.4840|± |0.0317| +| - bbh_cot_fewshot_ruin_names | 4|get-answer | 3|exact_match|↑ | 0.4560|± |0.0316| +| - bbh_cot_fewshot_salient_translation_error_detection | 4|get-answer | 3|exact_match|↑ | 0.3880|± |0.0309| +| - bbh_cot_fewshot_snarks | 4|get-answer | 3|exact_match|↑ | 0.5225|± |0.0375| +| - bbh_cot_fewshot_sports_understanding | 4|get-answer | 3|exact_match|↑ | 0.9160|± |0.0176| +| - bbh_cot_fewshot_temporal_sequences | 4|get-answer | 3|exact_match|↑ | 0.2960|± |0.0289| +| - bbh_cot_fewshot_tracking_shuffled_objects_five_objects | 4|get-answer | 3|exact_match|↑ | 0.1520|± |0.0228| +| - bbh_cot_fewshot_tracking_shuffled_objects_seven_objects| 4|get-answer | 3|exact_match|↑ | 0.1360|± |0.0217| +| - bbh_cot_fewshot_tracking_shuffled_objects_three_objects| 4|get-answer | 3|exact_match|↑ | 0.3200|± |0.0296| +| - bbh_cot_fewshot_web_of_lies | 4|get-answer | 3|exact_match|↑ | 0.9320|± |0.0160| +| - bbh_cot_fewshot_word_sorting | 4|get-answer | 3|exact_match|↑ | 0.0640|± |0.0155| 
+|boolq | 2|none | 0|acc |↑ | 0.8330|± |0.0065| +|drop | 3|none | 0|em |↑ | 0.0113|± |0.0011| +| | |none | 0|f1 |↑ | 0.1030|± |0.0019| +|gpqa_diamond_cot_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.0808|± |0.0194| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_diamond_cot_zeroshot | 1|flexible-extract | 0|exact_match|↑ | 0.1515|± |0.0255| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_diamond_generative_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.2626|± |0.0314| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_diamond_n_shot | 2|none | 0|acc |↑ | 0.3030|± |0.0327| +| | |none | 0|acc_norm |↑ | 0.3030|± |0.0327| +|gpqa_diamond_zeroshot | 1|none | 0|acc |↑ | 0.3485|± |0.0339| +| | |none | 0|acc_norm |↑ | 0.3485|± |0.0339| +|gpqa_extended_cot_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.1612|± |0.0158| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_extended_cot_zeroshot | 1|flexible-extract | 0|exact_match|↑ | 0.1612|± |0.0158| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_extended_generative_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.2564|± |0.0187| +| | |strict-match | 0|exact_match|↑ | 0.0018|± |0.0018| +|gpqa_extended_n_shot | 2|none | 0|acc |↑ | 0.2711|± |0.0190| +| | |none | 0|acc_norm |↑ | 0.2711|± |0.0190| +|gpqa_extended_zeroshot | 1|none | 0|acc |↑ | 0.2766|± |0.0192| +| | |none | 0|acc_norm |↑ | 0.2766|± |0.0192| +|gpqa_main_cot_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.1607|± |0.0174| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_main_cot_zeroshot | 1|flexible-extract | 0|exact_match|↑ | 0.1496|± |0.0169| +| | |strict-match | 0|exact_match|↑ | 0.0022|± |0.0022| +|gpqa_main_generative_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.2679|± |0.0209| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_main_n_shot | 2|none | 0|acc |↑ | 0.2746|± |0.0211| +| | |none | 0|acc_norm |↑ | 0.2746|± |0.0211| +|gpqa_main_zeroshot | 1|none | 0|acc |↑ | 0.2924|± |0.0215| +| | |none | 0|acc_norm |↑ | 0.2924|± |0.0215| +|gsm8k | 3|flexible-extract | 5|exact_match|↑ | 0.5087|± |0.0138| +| | |strict-match | 5|exact_match|↑ | 0.4640|± |0.0137| +|hellaswag | 1|none | 0|acc |↑ | 0.5914|± |0.0049| +| | |none | 0|acc_norm |↑ | 0.7772|± |0.0042| +|mmlu | 2|none | |acc |↑ | 0.4988|± |0.0040| +| - humanities | 2|none | |acc |↑ | 0.4627|± |0.0069| +| - formal_logic | 1|none | 0|acc |↑ | 0.3254|± |0.0419| +| - high_school_european_history | 1|none | 0|acc |↑ | 0.6485|± |0.0373| +| - high_school_us_history | 1|none | 0|acc |↑ | 0.6912|± |0.0324| +| - high_school_world_history | 1|none | 0|acc |↑ | 0.7215|± |0.0292| +| - international_law | 1|none | 0|acc |↑ | 0.6364|± |0.0439| +| - jurisprudence | 1|none | 0|acc |↑ | 0.6019|± |0.0473| +| - logical_fallacies | 1|none | 0|acc |↑ | 0.6626|± |0.0371| +| - moral_disputes | 1|none | 0|acc |↑ | 0.5607|± |0.0267| +| - moral_scenarios | 1|none | 0|acc |↑ | 0.2391|± |0.0143| +| - philosophy | 1|none | 0|acc |↑ | 0.5627|± |0.0282| +| - prehistory | 1|none | 0|acc |↑ | 0.5586|± |0.0276| +| - professional_law | 1|none | 0|acc |↑ | 0.3761|± |0.0124| +| - world_religions | 1|none | 0|acc |↑ | 0.7368|± |0.0338| +| - other | 2|none | |acc |↑ | 0.5768|± |0.0086| +| - business_ethics | 1|none | 0|acc |↑ | 0.5600|± |0.0499| +| - clinical_knowledge | 1|none | 0|acc |↑ | 0.5585|± |0.0306| +| - college_medicine | 1|none | 0|acc |↑ | 0.4740|± |0.0381| +| - global_facts | 1|none | 0|acc |↑ | 0.4100|± |0.0494| +| - human_aging | 1|none | 0|acc 
|↑ | 0.5291|± |0.0335| +| - management | 1|none | 0|acc |↑ | 0.6893|± |0.0458| +| - marketing | 1|none | 0|acc |↑ | 0.7821|± |0.0270| +| - medical_genetics | 1|none | 0|acc |↑ | 0.5900|± |0.0494| +| - miscellaneous | 1|none | 0|acc |↑ | 0.7241|± |0.0160| +| - nutrition | 1|none | 0|acc |↑ | 0.5490|± |0.0285| +| - professional_accounting | 1|none | 0|acc |↑ | 0.3546|± |0.0285| +| - professional_medicine | 1|none | 0|acc |↑ | 0.4522|± |0.0302| +| - virology | 1|none | 0|acc |↑ | 0.4578|± |0.0388| +| - social sciences | 2|none | |acc |↑ | 0.5635|± |0.0087| +| - econometrics | 1|none | 0|acc |↑ | 0.3246|± |0.0440| +| - high_school_geography | 1|none | 0|acc |↑ | 0.6364|± |0.0343| +| - high_school_government_and_politics | 1|none | 0|acc |↑ | 0.6839|± |0.0336| +| - high_school_macroeconomics | 1|none | 0|acc |↑ | 0.4333|± |0.0251| +| - high_school_microeconomics | 1|none | 0|acc |↑ | 0.4328|± |0.0322| +| - high_school_psychology | 1|none | 0|acc |↑ | 0.6991|± |0.0197| +| - human_sexuality | 1|none | 0|acc |↑ | 0.5420|± |0.0437| +| - professional_psychology | 1|none | 0|acc |↑ | 0.4886|± |0.0202| +| - public_relations | 1|none | 0|acc |↑ | 0.5727|± |0.0474| +| - security_studies | 1|none | 0|acc |↑ | 0.5796|± |0.0316| +| - sociology | 1|none | 0|acc |↑ | 0.7015|± |0.0324| +| - us_foreign_policy | 1|none | 0|acc |↑ | 0.7000|± |0.0461| +| - stem | 2|none | |acc |↑ | 0.4126|± |0.0086| +| - abstract_algebra | 1|none | 0|acc |↑ | 0.2900|± |0.0456| +| - anatomy | 1|none | 0|acc |↑ | 0.4963|± |0.0432| +| - astronomy | 1|none | 0|acc |↑ | 0.5329|± |0.0406| +| - college_biology | 1|none | 0|acc |↑ | 0.5208|± |0.0418| +| - college_chemistry | 1|none | 0|acc |↑ | 0.3700|± |0.0485| +| - college_computer_science | 1|none | 0|acc |↑ | 0.4100|± |0.0494| +| - college_mathematics | 1|none | 0|acc |↑ | 0.3900|± |0.0490| +| - college_physics | 1|none | 0|acc |↑ | 0.3235|± |0.0466| +| - computer_security | 1|none | 0|acc |↑ | 0.6200|± |0.0488| +| - conceptual_physics | 1|none | 0|acc |↑ | 0.3915|± |0.0319| +| - electrical_engineering | 1|none | 0|acc |↑ | 0.4069|± |0.0409| +| - elementary_mathematics | 1|none | 0|acc |↑ | 0.3307|± |0.0242| +| - high_school_biology | 1|none | 0|acc |↑ | 0.5903|± |0.0280| +| - high_school_chemistry | 1|none | 0|acc |↑ | 0.3793|± |0.0341| +| - high_school_computer_science | 1|none | 0|acc |↑ | 0.4600|± |0.0501| +| - high_school_mathematics | 1|none | 0|acc |↑ | 0.2741|± |0.0272| +| - high_school_physics | 1|none | 0|acc |↑ | 0.2914|± |0.0371| +| - high_school_statistics | 1|none | 0|acc |↑ | 0.3750|± |0.0330| +| - machine_learning | 1|none | 0|acc |↑ | 0.5000|± |0.0475| +|nq_open | 4|remove_whitespace| 0|exact_match|↑ | 0.0634|± |0.0041| +|openbookqa | 1|none | 0|acc |↑ | 0.3500|± |0.0214| +| | |none | 0|acc_norm |↑ | 0.4600|± |0.0223| +|piqa | 1|none | 0|acc |↑ | 0.7949|± |0.0094| +| | |none | 0|acc_norm |↑ | 0.8014|± |0.0093| +|qnli | 1|none | 0|acc |↑ | 0.4970|± |0.0068| +|sciq | 1|none | 0|acc |↑ | 0.9250|± |0.0083| +| | |none | 0|acc_norm |↑ | 0.8930|± |0.0098| +|triviaqa | 3|remove_whitespace| 0|exact_match|↑ | 0.3112|± |0.0035| +|truthfulqa_gen | 3|none | 0|bleu_acc |↑ | 0.4321|± |0.0173| +| | |none | 0|bleu_diff |↑ |-1.9106|± |0.5822| +| | |none | 0|bleu_max |↑ |20.8129|± |0.7241| +| | |none | 0|rouge1_acc |↑ | 0.4541|± |0.0174| +| | |none | 0|rouge1_diff|↑ |-2.6308|± |0.6640| +| | |none | 0|rouge1_max |↑ |45.5368|± |0.8107| +| | |none | 0|rouge2_acc |↑ | 0.3415|± |0.0166| +| | |none | 0|rouge2_diff|↑ |-3.7356|± |0.7920| +| | |none | 0|rouge2_max |↑ |31.2711|± |0.9083| +| | 
|none | 0|rougeL_acc |↑ | 0.4272|± |0.0173| +| | |none | 0|rougeL_diff|↑ |-2.8527|± |0.6674| +| | |none | 0|rougeL_max |↑ |42.3939|± |0.8172| +|truthfulqa_mc1 | 2|none | 0|acc |↑ | 0.3488|± |0.0167| +|truthfulqa_mc2 | 3|none | 0|acc |↑ | 0.4789|± |0.0154| +|winogrande | 1|none | 0|acc |↑ | 0.7017|± |0.0129| + +| Groups |Version| Filter |n-shot| Metric | |Value | |Stderr| +|------------------|------:|----------|------|-----------|---|-----:|---|-----:| +|bbh | 3|get-answer| |exact_match|↑ |0.4548|± |0.0054| +|mmlu | 2|none | |acc |↑ |0.4988|± |0.0040| +| - humanities | 2|none | |acc |↑ |0.4627|± |0.0069| +| - other | 2|none | |acc |↑ |0.5768|± |0.0086| +| - social sciences| 2|none | |acc |↑ |0.5635|± |0.0087| +| - stem | 2|none | |acc |↑ |0.4126|± |0.0086| + +deepseek-ai_deepseek-llm-7b-chat: 10h 7m 2s +✅ Benchmark completed for deepseek-ai_deepseek-llm-7b-chat + +🔥 Starting benchmark for deepseek-ai_deepseek-llm-7b-base +hf (pretrained=/mnt/data8tb/Documents/llm/llm_models/deepseek-ai_deepseek-llm-7b-base,trust_remote_code=True), gen_kwargs: (None), limit: None, num_fewshot: None, batch_size: 3 +| Tasks |Version| Filter |n-shot| Metric | | Value | |Stderr| +|----------------------------------------------------------|------:|-----------------|-----:|-----------|---|-------:|---|-----:| +|anli_r1 | 1|none | 0|acc |↑ | 0.3400|± |0.0150| +|anli_r2 | 1|none | 0|acc |↑ | 0.3630|± |0.0152| +|anli_r3 | 1|none | 0|acc |↑ | 0.3775|± |0.0140| +|arc_challenge | 1|none | 0|acc |↑ | 0.4352|± |0.0145| +| | |none | 0|acc_norm |↑ | 0.4454|± |0.0145| +|bbh | 3|get-answer | |exact_match|↑ | 0.4237|± |0.0054| +| - bbh_cot_fewshot_boolean_expressions | 4|get-answer | 3|exact_match|↑ | 0.7320|± |0.0281| +| - bbh_cot_fewshot_causal_judgement | 4|get-answer | 3|exact_match|↑ | 0.4759|± |0.0366| +| - bbh_cot_fewshot_date_understanding | 4|get-answer | 3|exact_match|↑ | 0.6640|± |0.0299| +| - bbh_cot_fewshot_disambiguation_qa | 4|get-answer | 3|exact_match|↑ | 0.4120|± |0.0312| +| - bbh_cot_fewshot_dyck_languages | 4|get-answer | 3|exact_match|↑ | 0.0160|± |0.0080| +| - bbh_cot_fewshot_formal_fallacies | 4|get-answer | 3|exact_match|↑ | 0.5160|± |0.0317| +| - bbh_cot_fewshot_geometric_shapes | 4|get-answer | 3|exact_match|↑ | 0.3720|± |0.0306| +| - bbh_cot_fewshot_hyperbaton | 4|get-answer | 3|exact_match|↑ | 0.6720|± |0.0298| +| - bbh_cot_fewshot_logical_deduction_five_objects | 4|get-answer | 3|exact_match|↑ | 0.2960|± |0.0289| +| - bbh_cot_fewshot_logical_deduction_seven_objects | 4|get-answer | 3|exact_match|↑ | 0.1920|± |0.0250| +| - bbh_cot_fewshot_logical_deduction_three_objects | 4|get-answer | 3|exact_match|↑ | 0.4880|± |0.0317| +| - bbh_cot_fewshot_movie_recommendation | 4|get-answer | 3|exact_match|↑ | 0.7920|± |0.0257| +| - bbh_cot_fewshot_multistep_arithmetic_two | 4|get-answer | 3|exact_match|↑ | 0.0160|± |0.0080| +| - bbh_cot_fewshot_navigate | 4|get-answer | 3|exact_match|↑ | 0.5600|± |0.0315| +| - bbh_cot_fewshot_object_counting | 4|get-answer | 3|exact_match|↑ | 0.4880|± |0.0317| +| - bbh_cot_fewshot_penguins_in_a_table | 4|get-answer | 3|exact_match|↑ | 0.3973|± |0.0406| +| - bbh_cot_fewshot_reasoning_about_colored_objects | 4|get-answer | 3|exact_match|↑ | 0.4680|± |0.0316| +| - bbh_cot_fewshot_ruin_names | 4|get-answer | 3|exact_match|↑ | 0.4560|± |0.0316| +| - bbh_cot_fewshot_salient_translation_error_detection | 4|get-answer | 3|exact_match|↑ | 0.3160|± |0.0295| +| - bbh_cot_fewshot_snarks | 4|get-answer | 3|exact_match|↑ | 0.5056|± |0.0376| +| - bbh_cot_fewshot_sports_understanding | 
4|get-answer | 3|exact_match|↑ | 0.9040|± |0.0187| +| - bbh_cot_fewshot_temporal_sequences | 4|get-answer | 3|exact_match|↑ | 0.2800|± |0.0285| +| - bbh_cot_fewshot_tracking_shuffled_objects_five_objects | 4|get-answer | 3|exact_match|↑ | 0.1920|± |0.0250| +| - bbh_cot_fewshot_tracking_shuffled_objects_seven_objects| 4|get-answer | 3|exact_match|↑ | 0.1040|± |0.0193| +| - bbh_cot_fewshot_tracking_shuffled_objects_three_objects| 4|get-answer | 3|exact_match|↑ | 0.3240|± |0.0297| +| - bbh_cot_fewshot_web_of_lies | 4|get-answer | 3|exact_match|↑ | 0.7640|± |0.0269| +| - bbh_cot_fewshot_word_sorting | 4|get-answer | 3|exact_match|↑ | 0.0640|± |0.0155| +|boolq | 2|none | 0|acc |↑ | 0.7235|± |0.0078| +|drop | 3|none | 0|em |↑ | 0.0168|± |0.0013| +| | |none | 0|f1 |↑ | 0.0422|± |0.0016| +|gpqa_diamond_cot_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.1061|± |0.0219| +| | |strict-match | 0|exact_match|↑ | 0.0303|± |0.0122| +|gpqa_diamond_cot_zeroshot | 1|flexible-extract | 0|exact_match|↑ | 0.0960|± |0.0210| +| | |strict-match | 0|exact_match|↑ | 0.0152|± |0.0087| +|gpqa_diamond_generative_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.2020|± |0.0286| +| | |strict-match | 0|exact_match|↑ | 0.0101|± |0.0071| +|gpqa_diamond_n_shot | 2|none | 0|acc |↑ | 0.2677|± |0.0315| +| | |none | 0|acc_norm |↑ | 0.2677|± |0.0315| +|gpqa_diamond_zeroshot | 1|none | 0|acc |↑ | 0.2677|± |0.0315| +| | |none | 0|acc_norm |↑ | 0.2677|± |0.0315| +|gpqa_extended_cot_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.1026|± |0.0130| +| | |strict-match | 0|exact_match|↑ | 0.0165|± |0.0055| +|gpqa_extended_cot_zeroshot | 1|flexible-extract | 0|exact_match|↑ | 0.1062|± |0.0132| +| | |strict-match | 0|exact_match|↑ | 0.0018|± |0.0018| +|gpqa_extended_generative_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.1960|± |0.0170| +| | |strict-match | 0|exact_match|↑ | 0.0055|± |0.0032| +|gpqa_extended_n_shot | 2|none | 0|acc |↑ | 0.2711|± |0.0190| +| | |none | 0|acc_norm |↑ | 0.2711|± |0.0190| +|gpqa_extended_zeroshot | 1|none | 0|acc |↑ | 0.2747|± |0.0191| +| | |none | 0|acc_norm |↑ | 0.2747|± |0.0191| +|gpqa_main_cot_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.1250|± |0.0156| +| | |strict-match | 0|exact_match|↑ | 0.0223|± |0.0070| +|gpqa_main_cot_zeroshot | 1|flexible-extract | 0|exact_match|↑ | 0.0871|± |0.0133| +| | |strict-match | 0|exact_match|↑ | 0.0045|± |0.0032| +|gpqa_main_generative_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.2344|± |0.0200| +| | |strict-match | 0|exact_match|↑ | 0.0045|± |0.0032| +|gpqa_main_n_shot | 2|none | 0|acc |↑ | 0.2567|± |0.0207| +| | |none | 0|acc_norm |↑ | 0.2567|± |0.0207| +|gpqa_main_zeroshot | 1|none | 0|acc |↑ | 0.2522|± |0.0205| +| | |none | 0|acc_norm |↑ | 0.2522|± |0.0205| +|gsm8k | 3|flexible-extract | 5|exact_match|↑ | 0.1638|± |0.0102| +| | |strict-match | 5|exact_match|↑ | 0.1622|± |0.0102| +|hellaswag | 1|none | 0|acc |↑ | 0.5706|± |0.0049| +| | |none | 0|acc_norm |↑ | 0.7606|± |0.0043| +|mmlu | 2|none | |acc |↑ | 0.4428|± |0.0041| +| - humanities | 2|none | |acc |↑ | 0.4106|± |0.0069| +| - formal_logic | 1|none | 0|acc |↑ | 0.2540|± |0.0389| +| - high_school_european_history | 1|none | 0|acc |↑ | 0.5576|± |0.0388| +| - high_school_us_history | 1|none | 0|acc |↑ | 0.5490|± |0.0349| +| - high_school_world_history | 1|none | 0|acc |↑ | 0.5992|± |0.0319| +| - international_law | 1|none | 0|acc |↑ | 0.5868|± |0.0450| +| - jurisprudence | 1|none | 0|acc |↑ | 0.5926|± |0.0475| +| - logical_fallacies | 1|none | 0|acc |↑ | 0.5951|± |0.0386| +| - moral_disputes | 1|none 
| 0|acc |↑ | 0.4827|± |0.0269| +| - moral_scenarios | 1|none | 0|acc |↑ | 0.2425|± |0.0143| +| - philosophy | 1|none | 0|acc |↑ | 0.5498|± |0.0283| +| - prehistory | 1|none | 0|acc |↑ | 0.5062|± |0.0278| +| - professional_law | 1|none | 0|acc |↑ | 0.3214|± |0.0119| +| - world_religions | 1|none | 0|acc |↑ | 0.6433|± |0.0367| +| - other | 2|none | |acc |↑ | 0.4982|± |0.0088| +| - business_ethics | 1|none | 0|acc |↑ | 0.4200|± |0.0496| +| - clinical_knowledge | 1|none | 0|acc |↑ | 0.4792|± |0.0307| +| - college_medicine | 1|none | 0|acc |↑ | 0.4104|± |0.0375| +| - global_facts | 1|none | 0|acc |↑ | 0.3300|± |0.0473| +| - human_aging | 1|none | 0|acc |↑ | 0.4798|± |0.0335| +| - management | 1|none | 0|acc |↑ | 0.5437|± |0.0493| +| - marketing | 1|none | 0|acc |↑ | 0.6453|± |0.0313| +| - medical_genetics | 1|none | 0|acc |↑ | 0.4600|± |0.0501| +| - miscellaneous | 1|none | 0|acc |↑ | 0.6245|± |0.0173| +| - nutrition | 1|none | 0|acc |↑ | 0.4771|± |0.0286| +| - professional_accounting | 1|none | 0|acc |↑ | 0.3794|± |0.0289| +| - professional_medicine | 1|none | 0|acc |↑ | 0.3897|± |0.0296| +| - virology | 1|none | 0|acc |↑ | 0.4036|± |0.0382| +| - social sciences | 2|none | |acc |↑ | 0.5005|± |0.0089| +| - econometrics | 1|none | 0|acc |↑ | 0.2719|± |0.0419| +| - high_school_geography | 1|none | 0|acc |↑ | 0.4697|± |0.0356| +| - high_school_government_and_politics | 1|none | 0|acc |↑ | 0.5492|± |0.0359| +| - high_school_macroeconomics | 1|none | 0|acc |↑ | 0.4179|± |0.0250| +| - high_school_microeconomics | 1|none | 0|acc |↑ | 0.3992|± |0.0318| +| - high_school_psychology | 1|none | 0|acc |↑ | 0.5817|± |0.0211| +| - human_sexuality | 1|none | 0|acc |↑ | 0.5725|± |0.0434| +| - professional_psychology | 1|none | 0|acc |↑ | 0.4641|± |0.0202| +| - public_relations | 1|none | 0|acc |↑ | 0.5091|± |0.0479| +| - security_studies | 1|none | 0|acc |↑ | 0.5020|± |0.0320| +| - sociology | 1|none | 0|acc |↑ | 0.6368|± |0.0340| +| - us_foreign_policy | 1|none | 0|acc |↑ | 0.6900|± |0.0465| +| - stem | 2|none | |acc |↑ | 0.3800|± |0.0085| +| - abstract_algebra | 1|none | 0|acc |↑ | 0.3000|± |0.0461| +| - anatomy | 1|none | 0|acc |↑ | 0.4815|± |0.0432| +| - astronomy | 1|none | 0|acc |↑ | 0.4934|± |0.0407| +| - college_biology | 1|none | 0|acc |↑ | 0.4653|± |0.0417| +| - college_chemistry | 1|none | 0|acc |↑ | 0.3300|± |0.0473| +| - college_computer_science | 1|none | 0|acc |↑ | 0.3500|± |0.0479| +| - college_mathematics | 1|none | 0|acc |↑ | 0.3200|± |0.0469| +| - college_physics | 1|none | 0|acc |↑ | 0.2745|± |0.0444| +| - computer_security | 1|none | 0|acc |↑ | 0.5200|± |0.0502| +| - conceptual_physics | 1|none | 0|acc |↑ | 0.3957|± |0.0320| +| - electrical_engineering | 1|none | 0|acc |↑ | 0.4621|± |0.0415| +| - elementary_mathematics | 1|none | 0|acc |↑ | 0.2857|± |0.0233| +| - high_school_biology | 1|none | 0|acc |↑ | 0.5032|± |0.0284| +| - high_school_chemistry | 1|none | 0|acc |↑ | 0.3744|± |0.0341| +| - high_school_computer_science | 1|none | 0|acc |↑ | 0.4200|± |0.0496| +| - high_school_mathematics | 1|none | 0|acc |↑ | 0.2815|± |0.0274| +| - high_school_physics | 1|none | 0|acc |↑ | 0.3245|± |0.0382| +| - high_school_statistics | 1|none | 0|acc |↑ | 0.3796|± |0.0331| +| - machine_learning | 1|none | 0|acc |↑ | 0.2857|± |0.0429| +|nq_open | 4|remove_whitespace| 0|exact_match|↑ | 0.1510|± |0.0060| +|openbookqa | 1|none | 0|acc |↑ | 0.3260|± |0.0210| +| | |none | 0|acc_norm |↑ | 0.4340|± |0.0222| +|piqa | 1|none | 0|acc |↑ | 0.7894|± |0.0095| +| | |none | 0|acc_norm |↑ | 0.7976|± |0.0094| +|qnli | 
1|none | 0|acc |↑ | 0.4959|± |0.0068| +|sciq | 1|none | 0|acc |↑ | 0.9400|± |0.0075| +| | |none | 0|acc_norm |↑ | 0.9150|± |0.0088| +|triviaqa | 3|remove_whitespace| 0|exact_match|↑ | 0.5004|± |0.0037| +|truthfulqa_gen | 3|none | 0|bleu_acc |↑ | 0.3097|± |0.0162| +| | |none | 0|bleu_diff |↑ | -8.7271|± |0.7712| +| | |none | 0|bleu_max |↑ | 24.9259|± |0.7566| +| | |none | 0|rouge1_acc |↑ | 0.2974|± |0.0160| +| | |none | 0|rouge1_diff|↑ |-11.0783|± |0.8128| +| | |none | 0|rouge1_max |↑ | 50.8642|± |0.8265| +| | |none | 0|rouge2_acc |↑ | 0.2436|± |0.0150| +| | |none | 0|rouge2_diff|↑ |-13.5478|± |0.9872| +| | |none | 0|rouge2_max |↑ | 34.4263|± |0.9544| +| | |none | 0|rougeL_acc |↑ | 0.2876|± |0.0158| +| | |none | 0|rougeL_diff|↑ |-11.6501|± |0.8204| +| | |none | 0|rougeL_max |↑ | 47.8267|± |0.8414| +|truthfulqa_mc1 | 2|none | 0|acc |↑ | 0.2326|± |0.0148| +|truthfulqa_mc2 | 3|none | 0|acc |↑ | 0.3492|± |0.0137| +|winogrande | 1|none | 0|acc |↑ | 0.6938|± |0.0130| + +| Groups |Version| Filter |n-shot| Metric | |Value | |Stderr| +|------------------|------:|----------|------|-----------|---|-----:|---|-----:| +|bbh | 3|get-answer| |exact_match|↑ |0.4237|± |0.0054| +|mmlu | 2|none | |acc |↑ |0.4428|± |0.0041| +| - humanities | 2|none | |acc |↑ |0.4106|± |0.0069| +| - other | 2|none | |acc |↑ |0.4982|± |0.0088| +| - social sciences| 2|none | |acc |↑ |0.5005|± |0.0089| +| - stem | 2|none | |acc |↑ |0.3800|± |0.0085| + +deepseek-ai_deepseek-llm-7b-base: 7h 11m 27s +✅ Benchmark completed for deepseek-ai_deepseek-llm-7b-base + +🔥 Starting benchmark for openchat_openchat-3.6-8b-20240522 +hf (pretrained=/mnt/data8tb/Documents/llm/llm_models/openchat_openchat-3.6-8b-20240522,trust_remote_code=True), gen_kwargs: (None), limit: None, num_fewshot: None, batch_size: 3 +| Tasks |Version| Filter |n-shot| Metric | | Value | |Stderr| +|----------------------------------------------------------|------:|-----------------|-----:|-----------|---|------:|---|-----:| +|anli_r1 | 1|none | 0|acc |↑ | 0.5560|± |0.0157| +|anli_r2 | 1|none | 0|acc |↑ | 0.5130|± |0.0158| +|anli_r3 | 1|none | 0|acc |↑ | 0.4800|± |0.0144| +|arc_challenge | 1|none | 0|acc |↑ | 0.5640|± |0.0145| +| | |none | 0|acc_norm |↑ | 0.6032|± |0.0143| +|bbh | 3|get-answer | |exact_match|↑ | 0.6179|± |0.0054| +| - bbh_cot_fewshot_boolean_expressions | 4|get-answer | 3|exact_match|↑ | 0.8480|± |0.0228| +| - bbh_cot_fewshot_causal_judgement | 4|get-answer | 3|exact_match|↑ | 0.5294|± |0.0366| +| - bbh_cot_fewshot_date_understanding | 4|get-answer | 3|exact_match|↑ | 0.5840|± |0.0312| +| - bbh_cot_fewshot_disambiguation_qa | 4|get-answer | 3|exact_match|↑ | 0.7800|± |0.0263| +| - bbh_cot_fewshot_dyck_languages | 4|get-answer | 3|exact_match|↑ | 0.0320|± |0.0112| +| - bbh_cot_fewshot_formal_fallacies | 4|get-answer | 3|exact_match|↑ | 0.5480|± |0.0315| +| - bbh_cot_fewshot_geometric_shapes | 4|get-answer | 3|exact_match|↑ | 0.4600|± |0.0316| +| - bbh_cot_fewshot_hyperbaton | 4|get-answer | 3|exact_match|↑ | 0.9440|± |0.0146| +| - bbh_cot_fewshot_logical_deduction_five_objects | 4|get-answer | 3|exact_match|↑ | 0.4200|± |0.0313| +| - bbh_cot_fewshot_logical_deduction_seven_objects | 4|get-answer | 3|exact_match|↑ | 0.3600|± |0.0304| +| - bbh_cot_fewshot_logical_deduction_three_objects | 4|get-answer | 3|exact_match|↑ | 0.7480|± |0.0275| +| - bbh_cot_fewshot_movie_recommendation | 4|get-answer | 3|exact_match|↑ | 0.8720|± |0.0212| +| - bbh_cot_fewshot_multistep_arithmetic_two | 4|get-answer | 3|exact_match|↑ | 0.5200|± |0.0317| +| - bbh_cot_fewshot_navigate | 
4|get-answer | 3|exact_match|↑ | 0.7320|± |0.0281| +| - bbh_cot_fewshot_object_counting | 4|get-answer | 3|exact_match|↑ | 0.8600|± |0.0220| +| - bbh_cot_fewshot_penguins_in_a_table | 4|get-answer | 3|exact_match|↑ | 0.6233|± |0.0402| +| - bbh_cot_fewshot_reasoning_about_colored_objects | 4|get-answer | 3|exact_match|↑ | 0.6200|± |0.0308| +| - bbh_cot_fewshot_ruin_names | 4|get-answer | 3|exact_match|↑ | 0.6200|± |0.0308| +| - bbh_cot_fewshot_salient_translation_error_detection | 4|get-answer | 3|exact_match|↑ | 0.5720|± |0.0314| +| - bbh_cot_fewshot_snarks | 4|get-answer | 3|exact_match|↑ | 0.4551|± |0.0374| +| - bbh_cot_fewshot_sports_understanding | 4|get-answer | 3|exact_match|↑ | 0.9080|± |0.0183| +| - bbh_cot_fewshot_temporal_sequences | 4|get-answer | 3|exact_match|↑ | 0.8760|± |0.0209| +| - bbh_cot_fewshot_tracking_shuffled_objects_five_objects | 4|get-answer | 3|exact_match|↑ | 0.4720|± |0.0316| +| - bbh_cot_fewshot_tracking_shuffled_objects_seven_objects| 4|get-answer | 3|exact_match|↑ | 0.4360|± |0.0314| +| - bbh_cot_fewshot_tracking_shuffled_objects_three_objects| 4|get-answer | 3|exact_match|↑ | 0.5120|± |0.0317| +| - bbh_cot_fewshot_web_of_lies | 4|get-answer | 3|exact_match|↑ | 0.8920|± |0.0197| +| - bbh_cot_fewshot_word_sorting | 4|get-answer | 3|exact_match|↑ | 0.3920|± |0.0309| +|boolq | 2|none | 0|acc |↑ | 0.8728|± |0.0058| +|drop | 3|none | 0|em |↑ | 0.0547|± |0.0023| +| | |none | 0|f1 |↑ | 0.2516|± |0.0032| +|gpqa_diamond_cot_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.2172|± |0.0294| +| | |strict-match | 0|exact_match|↑ | 0.0202|± |0.0100| +|gpqa_diamond_cot_zeroshot | 1|flexible-extract | 0|exact_match|↑ | 0.2020|± |0.0286| +| | |strict-match | 0|exact_match|↑ | 0.0556|± |0.0163| +|gpqa_diamond_generative_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.2980|± |0.0326| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_diamond_n_shot | 2|none | 0|acc |↑ | 0.3283|± |0.0335| +| | |none | 0|acc_norm |↑ | 0.3283|± |0.0335| +|gpqa_diamond_zeroshot | 1|none | 0|acc |↑ | 0.3333|± |0.0336| +| | |none | 0|acc_norm |↑ | 0.3333|± |0.0336| +|gpqa_extended_cot_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.2216|± |0.0178| +| | |strict-match | 0|exact_match|↑ | 0.0147|± |0.0051| +|gpqa_extended_cot_zeroshot | 1|flexible-extract | 0|exact_match|↑ | 0.2546|± |0.0187| +| | |strict-match | 0|exact_match|↑ | 0.0403|± |0.0084| +|gpqa_extended_generative_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.3095|± |0.0198| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_extended_n_shot | 2|none | 0|acc |↑ | 0.3242|± |0.0200| +| | |none | 0|acc_norm |↑ | 0.3242|± |0.0200| +|gpqa_extended_zeroshot | 1|none | 0|acc |↑ | 0.3425|± |0.0203| +| | |none | 0|acc_norm |↑ | 0.3425|± |0.0203| +|gpqa_main_cot_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.2098|± |0.0193| +| | |strict-match | 0|exact_match|↑ | 0.0134|± |0.0054| +|gpqa_main_cot_zeroshot | 1|flexible-extract | 0|exact_match|↑ | 0.2567|± |0.0207| +| | |strict-match | 0|exact_match|↑ | 0.0268|± |0.0076| +|gpqa_main_generative_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.3237|± |0.0221| +| | |strict-match | 0|exact_match|↑ | 0.0022|± |0.0022| +|gpqa_main_n_shot | 2|none | 0|acc |↑ | 0.3438|± |0.0225| +| | |none | 0|acc_norm |↑ | 0.3438|± |0.0225| +|gpqa_main_zeroshot | 1|none | 0|acc |↑ | 0.3326|± |0.0223| +| | |none | 0|acc_norm |↑ | 0.3326|± |0.0223| +|gsm8k | 3|flexible-extract | 5|exact_match|↑ | 0.7521|± |0.0119| +| | |strict-match | 5|exact_match|↑ | 0.7506|± |0.0119| +|hellaswag | 1|none | 
0|acc |↑ | 0.6116|± |0.0049| +| | |none | 0|acc_norm |↑ | 0.7978|± |0.0040| +|mmlu | 2|none | |acc |↑ | 0.6431|± |0.0038| +| - humanities | 2|none | |acc |↑ | 0.5966|± |0.0068| +| - formal_logic | 1|none | 0|acc |↑ | 0.5000|± |0.0447| +| - high_school_european_history | 1|none | 0|acc |↑ | 0.7515|± |0.0337| +| - high_school_us_history | 1|none | 0|acc |↑ | 0.8333|± |0.0262| +| - high_school_world_history | 1|none | 0|acc |↑ | 0.8439|± |0.0236| +| - international_law | 1|none | 0|acc |↑ | 0.7438|± |0.0398| +| - jurisprudence | 1|none | 0|acc |↑ | 0.7870|± |0.0396| +| - logical_fallacies | 1|none | 0|acc |↑ | 0.7423|± |0.0344| +| - moral_disputes | 1|none | 0|acc |↑ | 0.7110|± |0.0244| +| - moral_scenarios | 1|none | 0|acc |↑ | 0.4313|± |0.0166| +| - philosophy | 1|none | 0|acc |↑ | 0.6849|± |0.0264| +| - prehistory | 1|none | 0|acc |↑ | 0.7191|± |0.0250| +| - professional_law | 1|none | 0|acc |↑ | 0.4831|± |0.0128| +| - world_religions | 1|none | 0|acc |↑ | 0.7895|± |0.0313| +| - other | 2|none | |acc |↑ | 0.7071|± |0.0079| +| - business_ethics | 1|none | 0|acc |↑ | 0.6500|± |0.0479| +| - clinical_knowledge | 1|none | 0|acc |↑ | 0.7245|± |0.0275| +| - college_medicine | 1|none | 0|acc |↑ | 0.6358|± |0.0367| +| - global_facts | 1|none | 0|acc |↑ | 0.4200|± |0.0496| +| - human_aging | 1|none | 0|acc |↑ | 0.7175|± |0.0302| +| - management | 1|none | 0|acc |↑ | 0.7961|± |0.0399| +| - marketing | 1|none | 0|acc |↑ | 0.8803|± |0.0213| +| - medical_genetics | 1|none | 0|acc |↑ | 0.7200|± |0.0451| +| - miscellaneous | 1|none | 0|acc |↑ | 0.8238|± |0.0136| +| - nutrition | 1|none | 0|acc |↑ | 0.7124|± |0.0259| +| - professional_accounting | 1|none | 0|acc |↑ | 0.5142|± |0.0298| +| - professional_medicine | 1|none | 0|acc |↑ | 0.6434|± |0.0291| +| - virology | 1|none | 0|acc |↑ | 0.5120|± |0.0389| +| - social sciences | 2|none | |acc |↑ | 0.7452|± |0.0077| +| - econometrics | 1|none | 0|acc |↑ | 0.4561|± |0.0469| +| - high_school_geography | 1|none | 0|acc |↑ | 0.7828|± |0.0294| +| - high_school_government_and_politics | 1|none | 0|acc |↑ | 0.9067|± |0.0210| +| - high_school_macroeconomics | 1|none | 0|acc |↑ | 0.6590|± |0.0240| +| - high_school_microeconomics | 1|none | 0|acc |↑ | 0.7353|± |0.0287| +| - high_school_psychology | 1|none | 0|acc |↑ | 0.8349|± |0.0159| +| - human_sexuality | 1|none | 0|acc |↑ | 0.7710|± |0.0369| +| - professional_psychology | 1|none | 0|acc |↑ | 0.6765|± |0.0189| +| - public_relations | 1|none | 0|acc |↑ | 0.6364|± |0.0461| +| - security_studies | 1|none | 0|acc |↑ | 0.7347|± |0.0283| +| - sociology | 1|none | 0|acc |↑ | 0.8458|± |0.0255| +| - us_foreign_policy | 1|none | 0|acc |↑ | 0.8900|± |0.0314| +| - stem | 2|none | |acc |↑ | 0.5496|± |0.0086| +| - abstract_algebra | 1|none | 0|acc |↑ | 0.3600|± |0.0482| +| - anatomy | 1|none | 0|acc |↑ | 0.6815|± |0.0402| +| - astronomy | 1|none | 0|acc |↑ | 0.7368|± |0.0358| +| - college_biology | 1|none | 0|acc |↑ | 0.7153|± |0.0377| +| - college_chemistry | 1|none | 0|acc |↑ | 0.4100|± |0.0494| +| - college_computer_science | 1|none | 0|acc |↑ | 0.5200|± |0.0502| +| - college_mathematics | 1|none | 0|acc |↑ | 0.3300|± |0.0473| +| - college_physics | 1|none | 0|acc |↑ | 0.4804|± |0.0497| +| - computer_security | 1|none | 0|acc |↑ | 0.7500|± |0.0435| +| - conceptual_physics | 1|none | 0|acc |↑ | 0.5787|± |0.0323| +| - electrical_engineering | 1|none | 0|acc |↑ | 0.5655|± |0.0413| +| - elementary_mathematics | 1|none | 0|acc |↑ | 0.4656|± |0.0257| +| - high_school_biology | 1|none | 0|acc |↑ | 0.7484|± |0.0247| +| - 
high_school_chemistry | 1|none | 0|acc |↑ | 0.5320|± |0.0351| +| - high_school_computer_science | 1|none | 0|acc |↑ | 0.6600|± |0.0476| +| - high_school_mathematics | 1|none | 0|acc |↑ | 0.4000|± |0.0299| +| - high_school_physics | 1|none | 0|acc |↑ | 0.3907|± |0.0398| +| - high_school_statistics | 1|none | 0|acc |↑ | 0.5093|± |0.0341| +| - machine_learning | 1|none | 0|acc |↑ | 0.5625|± |0.0471| +|nq_open | 4|remove_whitespace| 0|exact_match|↑ | 0.1706|± |0.0063| +|openbookqa | 1|none | 0|acc |↑ | 0.3700|± |0.0216| +| | |none | 0|acc_norm |↑ | 0.4620|± |0.0223| +|piqa | 1|none | 0|acc |↑ | 0.8041|± |0.0093| +| | |none | 0|acc_norm |↑ | 0.8183|± |0.0090| +|qnli | 1|none | 0|acc |↑ | 0.7300|± |0.0060| +|sciq | 1|none | 0|acc |↑ | 0.9730|± |0.0051| +| | |none | 0|acc_norm |↑ | 0.9640|± |0.0059| +|triviaqa | 3|remove_whitespace| 0|exact_match|↑ | 0.5659|± |0.0037| +|truthfulqa_gen | 3|none | 0|bleu_acc |↑ | 0.4162|± |0.0173| +| | |none | 0|bleu_diff |↑ |-2.4558|± |0.6495| +| | |none | 0|bleu_max |↑ |22.9231|± |0.7496| +| | |none | 0|rouge1_acc |↑ | 0.4088|± |0.0172| +| | |none | 0|rouge1_diff|↑ |-3.9076|± |0.7660| +| | |none | 0|rouge1_max |↑ |47.4547|± |0.8751| +| | |none | 0|rouge2_acc |↑ | 0.3550|± |0.0168| +| | |none | 0|rouge2_diff|↑ |-4.4347|± |0.8978| +| | |none | 0|rouge2_max |↑ |33.1938|± |0.9499| +| | |none | 0|rougeL_acc |↑ | 0.4051|± |0.0172| +| | |none | 0|rougeL_diff|↑ |-3.9650|± |0.7656| +| | |none | 0|rougeL_max |↑ |44.6201|± |0.8785| +|truthfulqa_mc1 | 2|none | 0|acc |↑ | 0.3525|± |0.0167| +|truthfulqa_mc2 | 3|none | 0|acc |↑ | 0.4976|± |0.0152| +|winogrande | 1|none | 0|acc |↑ | 0.7632|± |0.0119| + +| Groups |Version| Filter |n-shot| Metric | |Value | |Stderr| +|------------------|------:|----------|------|-----------|---|-----:|---|-----:| +|bbh | 3|get-answer| |exact_match|↑ |0.6179|± |0.0054| +|mmlu | 2|none | |acc |↑ |0.6431|± |0.0038| +| - humanities | 2|none | |acc |↑ |0.5966|± |0.0068| +| - other | 2|none | |acc |↑ |0.7071|± |0.0079| +| - social sciences| 2|none | |acc |↑ |0.7452|± |0.0077| +| - stem | 2|none | |acc |↑ |0.5496|± |0.0086| + +openchat_openchat-3.6-8b-20240522: 7h 51m 28s +✅ Benchmark completed for openchat_openchat-3.6-8b-20240522 + +🔥 Starting benchmark for internlm_internlm2_5-7b-chat +internlm_internlm2_5-7b-chat: 0h 4m 58s +✅ Benchmark completed for internlm_internlm2_5-7b-chat + +🔥 Starting benchmark for THUDM_chatglm3-6b +THUDM_chatglm3-6b: 0h 32m 25s +✅ Benchmark completed for THUDM_chatglm3-6b + +🔥 Starting benchmark for NousResearch_Hermes-2-Pro-Mistral-7B +hf (pretrained=/mnt/data8tb/Documents/llm/llm_models/NousResearch_Hermes-2-Pro-Mistral-7B,trust_remote_code=True), gen_kwargs: (None), limit: None, num_fewshot: None, batch_size: 3 +| Tasks |Version| Filter |n-shot| Metric | | Value | |Stderr| +|----------------------------------------------------------|------:|-----------------|-----:|-----------|---|------:|---|-----:| +|anli_r1 | 1|none | 0|acc |↑ | 0.5310|± |0.0158| +|anli_r2 | 1|none | 0|acc |↑ | 0.4960|± |0.0158| +|anli_r3 | 1|none | 0|acc |↑ | 0.5000|± |0.0144| +|arc_challenge | 1|none | 0|acc |↑ | 0.5444|± |0.0146| +| | |none | 0|acc_norm |↑ | 0.5657|± |0.0145| +|bbh | 3|get-answer | |exact_match|↑ | 0.5738|± |0.0055| +| - bbh_cot_fewshot_boolean_expressions | 4|get-answer | 3|exact_match|↑ | 0.8080|± |0.0250| +| - bbh_cot_fewshot_causal_judgement | 4|get-answer | 3|exact_match|↑ | 0.5615|± |0.0364| +| - bbh_cot_fewshot_date_understanding | 4|get-answer | 3|exact_match|↑ | 0.6320|± |0.0306| +| - bbh_cot_fewshot_disambiguation_qa | 
4|get-answer | 3|exact_match|↑ | 0.6200|± |0.0308| +| - bbh_cot_fewshot_dyck_languages | 4|get-answer | 3|exact_match|↑ | 0.0720|± |0.0164| +| - bbh_cot_fewshot_formal_fallacies | 4|get-answer | 3|exact_match|↑ | 0.5360|± |0.0316| +| - bbh_cot_fewshot_geometric_shapes | 4|get-answer | 3|exact_match|↑ | 0.3920|± |0.0309| +| - bbh_cot_fewshot_hyperbaton | 4|get-answer | 3|exact_match|↑ | 0.8000|± |0.0253| +| - bbh_cot_fewshot_logical_deduction_five_objects | 4|get-answer | 3|exact_match|↑ | 0.4640|± |0.0316| +| - bbh_cot_fewshot_logical_deduction_seven_objects | 4|get-answer | 3|exact_match|↑ | 0.2240|± |0.0264| +| - bbh_cot_fewshot_logical_deduction_three_objects | 4|get-answer | 3|exact_match|↑ | 0.8680|± |0.0215| +| - bbh_cot_fewshot_movie_recommendation | 4|get-answer | 3|exact_match|↑ | 0.7880|± |0.0259| +| - bbh_cot_fewshot_multistep_arithmetic_two | 4|get-answer | 3|exact_match|↑ | 0.3800|± |0.0308| +| - bbh_cot_fewshot_navigate | 4|get-answer | 3|exact_match|↑ | 0.6440|± |0.0303| +| - bbh_cot_fewshot_object_counting | 4|get-answer | 3|exact_match|↑ | 0.6840|± |0.0295| +| - bbh_cot_fewshot_penguins_in_a_table | 4|get-answer | 3|exact_match|↑ | 0.5959|± |0.0408| +| - bbh_cot_fewshot_reasoning_about_colored_objects | 4|get-answer | 3|exact_match|↑ | 0.6360|± |0.0305| +| - bbh_cot_fewshot_ruin_names | 4|get-answer | 3|exact_match|↑ | 0.6480|± |0.0303| +| - bbh_cot_fewshot_salient_translation_error_detection | 4|get-answer | 3|exact_match|↑ | 0.5600|± |0.0315| +| - bbh_cot_fewshot_snarks | 4|get-answer | 3|exact_match|↑ | 0.7022|± |0.0344| +| - bbh_cot_fewshot_sports_understanding | 4|get-answer | 3|exact_match|↑ | 0.9400|± |0.0151| +| - bbh_cot_fewshot_temporal_sequences | 4|get-answer | 3|exact_match|↑ | 0.6480|± |0.0303| +| - bbh_cot_fewshot_tracking_shuffled_objects_five_objects | 4|get-answer | 3|exact_match|↑ | 0.3800|± |0.0308| +| - bbh_cot_fewshot_tracking_shuffled_objects_seven_objects| 4|get-answer | 3|exact_match|↑ | 0.2720|± |0.0282| +| - bbh_cot_fewshot_tracking_shuffled_objects_three_objects| 4|get-answer | 3|exact_match|↑ | 0.4840|± |0.0317| +| - bbh_cot_fewshot_web_of_lies | 4|get-answer | 3|exact_match|↑ | 0.9640|± |0.0118| +| - bbh_cot_fewshot_word_sorting | 4|get-answer | 3|exact_match|↑ | 0.2320|± |0.0268| +|boolq | 2|none | 0|acc |↑ | 0.8682|± |0.0059| +|drop | 3|none | 0|em |↑ | 0.0167|± |0.0013| +| | |none | 0|f1 |↑ | 0.1098|± |0.0022| +|gpqa_diamond_cot_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.1162|± |0.0228| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_diamond_cot_zeroshot | 1|flexible-extract | 0|exact_match|↑ | 0.0960|± |0.0210| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_diamond_generative_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.2525|± |0.0310| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_diamond_n_shot | 2|none | 0|acc |↑ | 0.3232|± |0.0333| +| | |none | 0|acc_norm |↑ | 0.3232|± |0.0333| +|gpqa_diamond_zeroshot | 1|none | 0|acc |↑ | 0.2677|± |0.0315| +| | |none | 0|acc_norm |↑ | 0.2677|± |0.0315| +|gpqa_extended_cot_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.1685|± |0.0160| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_extended_cot_zeroshot | 1|flexible-extract | 0|exact_match|↑ | 0.1520|± |0.0154| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_extended_generative_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.2637|± |0.0189| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_extended_n_shot | 2|none | 0|acc |↑ | 0.2875|± 
|0.0194| +| | |none | 0|acc_norm |↑ | 0.2875|± |0.0194| +|gpqa_extended_zeroshot | 1|none | 0|acc |↑ | 0.2949|± |0.0195| +| | |none | 0|acc_norm |↑ | 0.2949|± |0.0195| +|gpqa_main_cot_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.1317|± |0.0160| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_main_cot_zeroshot | 1|flexible-extract | 0|exact_match|↑ | 0.1228|± |0.0155| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_main_generative_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.2366|± |0.0201| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_main_n_shot | 2|none | 0|acc |↑ | 0.2991|± |0.0217| +| | |none | 0|acc_norm |↑ | 0.2991|± |0.0217| +|gpqa_main_zeroshot | 1|none | 0|acc |↑ | 0.2768|± |0.0212| +| | |none | 0|acc_norm |↑ | 0.2768|± |0.0212| +|gsm8k | 3|flexible-extract | 5|exact_match|↑ | 0.6861|± |0.0128| +| | |strict-match | 5|exact_match|↑ | 0.6854|± |0.0128| +|hellaswag | 1|none | 0|acc |↑ | 0.6270|± |0.0048| +| | |none | 0|acc_norm |↑ | 0.8049|± |0.0040| +|mmlu | 2|none | |acc |↑ | 0.6051|± |0.0039| +| - humanities | 2|none | |acc |↑ | 0.5484|± |0.0067| +| - formal_logic | 1|none | 0|acc |↑ | 0.4127|± |0.0440| +| - high_school_european_history | 1|none | 0|acc |↑ | 0.7697|± |0.0329| +| - high_school_us_history | 1|none | 0|acc |↑ | 0.7990|± |0.0281| +| - high_school_world_history | 1|none | 0|acc |↑ | 0.8354|± |0.0241| +| - international_law | 1|none | 0|acc |↑ | 0.7769|± |0.0380| +| - jurisprudence | 1|none | 0|acc |↑ | 0.7130|± |0.0437| +| - logical_fallacies | 1|none | 0|acc |↑ | 0.7178|± |0.0354| +| - moral_disputes | 1|none | 0|acc |↑ | 0.6821|± |0.0251| +| - moral_scenarios | 1|none | 0|acc |↑ | 0.2492|± |0.0145| +| - philosophy | 1|none | 0|acc |↑ | 0.6752|± |0.0266| +| - prehistory | 1|none | 0|acc |↑ | 0.7253|± |0.0248| +| - professional_law | 1|none | 0|acc |↑ | 0.4622|± |0.0127| +| - world_religions | 1|none | 0|acc |↑ | 0.8129|± |0.0299| +| - other | 2|none | |acc |↑ | 0.6807|± |0.0081| +| - business_ethics | 1|none | 0|acc |↑ | 0.5600|± |0.0499| +| - clinical_knowledge | 1|none | 0|acc |↑ | 0.6755|± |0.0288| +| - college_medicine | 1|none | 0|acc |↑ | 0.6301|± |0.0368| +| - global_facts | 1|none | 0|acc |↑ | 0.4000|± |0.0492| +| - human_aging | 1|none | 0|acc |↑ | 0.6771|± |0.0314| +| - management | 1|none | 0|acc |↑ | 0.7864|± |0.0406| +| - marketing | 1|none | 0|acc |↑ | 0.8632|± |0.0225| +| - medical_genetics | 1|none | 0|acc |↑ | 0.6500|± |0.0479| +| - miscellaneous | 1|none | 0|acc |↑ | 0.8020|± |0.0142| +| - nutrition | 1|none | 0|acc |↑ | 0.6928|± |0.0264| +| - professional_accounting | 1|none | 0|acc |↑ | 0.4362|± |0.0296| +| - professional_medicine | 1|none | 0|acc |↑ | 0.6765|± |0.0284| +| - virology | 1|none | 0|acc |↑ | 0.5120|± |0.0389| +| - social sciences | 2|none | |acc |↑ | 0.7082|± |0.0080| +| - econometrics | 1|none | 0|acc |↑ | 0.3860|± |0.0458| +| - high_school_geography | 1|none | 0|acc |↑ | 0.7424|± |0.0312| +| - high_school_government_and_politics | 1|none | 0|acc |↑ | 0.8446|± |0.0261| +| - high_school_macroeconomics | 1|none | 0|acc |↑ | 0.6256|± |0.0245| +| - high_school_microeconomics | 1|none | 0|acc |↑ | 0.6681|± |0.0306| +| - high_school_psychology | 1|none | 0|acc |↑ | 0.8239|± |0.0163| +| - human_sexuality | 1|none | 0|acc |↑ | 0.7328|± |0.0388| +| - professional_psychology | 1|none | 0|acc |↑ | 0.6373|± |0.0195| +| - public_relations | 1|none | 0|acc |↑ | 0.5909|± |0.0471| +| - security_studies | 1|none | 0|acc |↑ | 0.7020|± |0.0293| +| - sociology | 1|none | 0|acc |↑ | 0.8159|± 
|0.0274| +| - us_foreign_policy | 1|none | 0|acc |↑ | 0.8600|± |0.0349| +| - stem | 2|none | |acc |↑ | 0.5147|± |0.0086| +| - abstract_algebra | 1|none | 0|acc |↑ | 0.3200|± |0.0469| +| - anatomy | 1|none | 0|acc |↑ | 0.5704|± |0.0428| +| - astronomy | 1|none | 0|acc |↑ | 0.6382|± |0.0391| +| - college_biology | 1|none | 0|acc |↑ | 0.7083|± |0.0380| +| - college_chemistry | 1|none | 0|acc |↑ | 0.4300|± |0.0498| +| - college_computer_science | 1|none | 0|acc |↑ | 0.5100|± |0.0502| +| - college_mathematics | 1|none | 0|acc |↑ | 0.2800|± |0.0451| +| - college_physics | 1|none | 0|acc |↑ | 0.4216|± |0.0491| +| - computer_security | 1|none | 0|acc |↑ | 0.7500|± |0.0435| +| - conceptual_physics | 1|none | 0|acc |↑ | 0.5617|± |0.0324| +| - electrical_engineering | 1|none | 0|acc |↑ | 0.4828|± |0.0416| +| - elementary_mathematics | 1|none | 0|acc |↑ | 0.4471|± |0.0256| +| - high_school_biology | 1|none | 0|acc |↑ | 0.7387|± |0.0250| +| - high_school_chemistry | 1|none | 0|acc |↑ | 0.4828|± |0.0352| +| - high_school_computer_science | 1|none | 0|acc |↑ | 0.6400|± |0.0482| +| - high_school_mathematics | 1|none | 0|acc |↑ | 0.3704|± |0.0294| +| - high_school_physics | 1|none | 0|acc |↑ | 0.3377|± |0.0386| +| - high_school_statistics | 1|none | 0|acc |↑ | 0.4954|± |0.0341| +| - machine_learning | 1|none | 0|acc |↑ | 0.4911|± |0.0475| +|nq_open | 4|remove_whitespace| 0|exact_match|↑ | 0.0404|± |0.0033| +|openbookqa | 1|none | 0|acc |↑ | 0.3380|± |0.0212| +| | |none | 0|acc_norm |↑ | 0.4340|± |0.0222| +|piqa | 1|none | 0|acc |↑ | 0.7938|± |0.0094| +| | |none | 0|acc_norm |↑ | 0.7987|± |0.0094| +|qnli | 1|none | 0|acc |↑ | 0.5565|± |0.0067| +|sciq | 1|none | 0|acc |↑ | 0.9500|± |0.0069| +| | |none | 0|acc_norm |↑ | 0.9170|± |0.0087| +|triviaqa | 3|remove_whitespace| 0|exact_match|↑ | 0.4711|± |0.0037| +|truthfulqa_gen | 3|none | 0|bleu_acc |↑ | 0.5606|± |0.0174| +| | |none | 0|bleu_diff |↑ | 8.7131|± |0.8710| +| | |none | 0|bleu_max |↑ |27.5391|± |0.8368| +| | |none | 0|rouge1_acc |↑ | 0.5887|± |0.0172| +| | |none | 0|rouge1_diff|↑ |12.4445|± |1.2143| +| | |none | 0|rouge1_max |↑ |54.1696|± |0.8927| +| | |none | 0|rouge2_acc |↑ | 0.4994|± |0.0175| +| | |none | 0|rouge2_diff|↑ |12.4661|± |1.2976| +| | |none | 0|rouge2_max |↑ |40.9144|± |1.0592| +| | |none | 0|rougeL_acc |↑ | 0.5569|± |0.0174| +| | |none | 0|rougeL_diff|↑ |11.9317|± |1.2333| +| | |none | 0|rougeL_max |↑ |50.9560|± |0.9357| +|truthfulqa_mc1 | 2|none | 0|acc |↑ | 0.4137|± |0.0172| +|truthfulqa_mc2 | 3|none | 0|acc |↑ | 0.5912|± |0.0158| +|winogrande | 1|none | 0|acc |↑ | 0.7198|± |0.0126| + +| Groups |Version| Filter |n-shot| Metric | |Value | |Stderr| +|------------------|------:|----------|------|-----------|---|-----:|---|-----:| +|bbh | 3|get-answer| |exact_match|↑ |0.5738|± |0.0055| +|mmlu | 2|none | |acc |↑ |0.6051|± |0.0039| +| - humanities | 2|none | |acc |↑ |0.5484|± |0.0067| +| - other | 2|none | |acc |↑ |0.6807|± |0.0081| +| - social sciences| 2|none | |acc |↑ |0.7082|± |0.0080| +| - stem | 2|none | |acc |↑ |0.5147|± |0.0086| + +NousResearch_Hermes-2-Pro-Mistral-7B: 8h 27m 23s +✅ Benchmark completed for NousResearch_Hermes-2-Pro-Mistral-7B + +🔥 Starting benchmark for deepseek-ai_deepseek-moe-16b-base +deepseek-ai_deepseek-moe-16b-base: 0h 0m 13s +✅ Benchmark completed for deepseek-ai_deepseek-moe-16b-base + +🔥 Starting benchmark for deepseek-ai_deepseek-moe-16b-chat +🔥 Starting benchmark for baichuan-inc_Baichuan-M1-14B-Instruct +baichuan-inc_Baichuan-M1-14B-Instruct: 0h 0m 4s +✅ Benchmark completed for 
baichuan-inc_Baichuan-M1-14B-Instruct + +🔥 Starting benchmark for baichuan-inc_Baichuan2-13B-Chat +🔥 Starting benchmark for baichuan-inc_Baichuan2-13B-Chat +🔥 Starting benchmark for baichuan-inc_Baichuan-M1-14B-Instruct +baichuan-inc_Baichuan-M1-14B-Instruct: 0h 0m 4s +✅ Benchmark completed for baichuan-inc_Baichuan-M1-14B-Instruct + +🔥 Starting benchmark for moonshotai_Moonlight-16B-A3B-Instruct +🔥 Starting benchmark for moonshotai_Moonlight-16B-A3B-Instruct +moonshotai_Moonlight-16B-A3B-Instruct: 0h 0m 3s +✅ Benchmark completed for moonshotai_Moonlight-16B-A3B-Instruct + +🔥 Starting benchmark for moonshotai_Moonlight-16B-A3B +🔥 Starting benchmark for moonshotai_Moonlight-16B-A3B-Instruct +🔥 Starting benchmark for Qwen_Qwen3-14B +🔥 Starting benchmark for Qwen_Qwen2.5-14B-Instruct +🔥 Starting benchmark for Qwen_Qwen-7B-Chat +Qwen_Qwen-7B-Chat: 0h 4m 49s +✅ Benchmark completed for Qwen_Qwen-7B-Chat + +🔥 Starting benchmark for Qwen_Qwen-7B +Qwen_Qwen-7B: 0h 4m 53s +✅ Benchmark completed for Qwen_Qwen-7B + +🔥 Starting benchmark for baichuan-inc_Baichuan2-13B-Chat +baichuan-inc_Baichuan2-13B-Chat: 0h 5m 31s +✅ Benchmark completed for baichuan-inc_Baichuan2-13B-Chat + +🔥 Starting benchmark for moonshotai_Moonlight-16B-A3B-Instruct +moonshotai_Moonlight-16B-A3B-Instruct: 0h 5m 27s +✅ Benchmark completed for moonshotai_Moonlight-16B-A3B-Instruct + +🔥 Starting benchmark for moonshotai_Moonlight-16B-A3B +moonshotai_Moonlight-16B-A3B: 0h 5m 26s +✅ Benchmark completed for moonshotai_Moonlight-16B-A3B + +🔥 Starting benchmark for Qwen_Qwen3-14B +Qwen_Qwen3-14B: 0h 5m 11s +✅ Benchmark completed for Qwen_Qwen3-14B + +🔥 Starting benchmark for Qwen_Qwen2.5-14B-Instruct +🔥 Starting benchmark for openai-community_gpt2 +🔥 Starting benchmark for deepseek-ai_DeepSeek-R1-Distill-Qwen-1.5B +🔥 Starting benchmark for openai-community_gpt2 +🔥 Starting benchmark for openai-community_gpt2 +Passed argument batch_size = auto:4.0. Detecting largest batch size +Determined largest batch size: 32 +Passed argument batch_size = auto:4.0. Detecting largest batch size +Determined largest batch size: 64 +Passed argument batch_size = auto. Detecting largest batch size +Determined Largest batch size: 32 +openai-community_gpt2: 0h 28m 26s +✅ Benchmark completed for openai-community_gpt2 + +🔥 Starting benchmark for openai-community_gpt2-medium +Passed argument batch_size = auto:4.0. Detecting largest batch size +Determined largest batch size: 28 +Passed argument batch_size = auto:4.0. Detecting largest batch size +Determined largest batch size: 64 +Passed argument batch_size = auto. Detecting largest batch size +Determined Largest batch size: 28 +openai-community_gpt2-medium: 0h 51m 53s +✅ Benchmark completed for openai-community_gpt2-medium + +🔥 Starting benchmark for openai-community_gpt2-large +Passed argument batch_size = auto:4.0. Detecting largest batch size +Determined largest batch size: 19 +Passed argument batch_size = auto:4.0. Detecting largest batch size +Determined largest batch size: 64 +Passed argument batch_size = auto. Detecting largest batch size +Determined Largest batch size: 19 +openai-community_gpt2-large: 1h 27m 10s +✅ Benchmark completed for openai-community_gpt2-large + +🔥 Starting benchmark for openai-community_gpt2-xl +Passed argument batch_size = auto:4.0. Detecting largest batch size +Determined largest batch size: 13 +Passed argument batch_size = auto:4.0. Detecting largest batch size +Determined largest batch size: 64 +Passed argument batch_size = auto. 
Detecting largest batch size +Determined Largest batch size: 13 +openai-community_gpt2-xl: 2h 30m 7s +✅ Benchmark completed for openai-community_gpt2-xl + +🔥 Starting benchmark for Qwen_Qwen2.5-Math-1.5B-Instruct +Passed argument batch_size = auto:4.0. Detecting largest batch size +Determined largest batch size: 6 +Passed argument batch_size = auto:4.0. Detecting largest batch size +Determined largest batch size: 64 +Passed argument batch_size = auto. Detecting largest batch size +Determined Largest batch size: 6 +hf (pretrained=/mnt/data8tb/Documents/llm/llm_models/Qwen_Qwen2.5-Math-1.5B-Instruct,trust_remote_code=True), gen_kwargs: (None), limit: None, num_fewshot: None, batch_size: auto:4 (6,64,64,64,64) +| Tasks |Version| Filter |n-shot| Metric | | Value | |Stderr| +|----------------------------------------------------------|------:|-----------------|-----:|-----------|---|------:|---|-----:| +|anli_r1 | 1|none | 0|acc |↑ | 0.3420|± |0.0150| +|anli_r2 | 1|none | 0|acc |↑ | 0.3410|± |0.0150| +|anli_r3 | 1|none | 0|acc |↑ | 0.3533|± |0.0138| +|arc_challenge | 1|none | 0|acc |↑ | 0.3336|± |0.0138| +| | |none | 0|acc_norm |↑ | 0.3652|± |0.0141| +|bbh | 3|get-answer | |exact_match|↑ | 0.4373|± |0.0052| +| - bbh_cot_fewshot_boolean_expressions | 4|get-answer | 3|exact_match|↑ | 0.8760|± |0.0209| +| - bbh_cot_fewshot_causal_judgement | 4|get-answer | 3|exact_match|↑ | 0.4706|± |0.0366| +| - bbh_cot_fewshot_date_understanding | 4|get-answer | 3|exact_match|↑ | 0.5680|± |0.0314| +| - bbh_cot_fewshot_disambiguation_qa | 4|get-answer | 3|exact_match|↑ | 0.5280|± |0.0316| +| - bbh_cot_fewshot_dyck_languages | 4|get-answer | 3|exact_match|↑ | 0.0000|± |0.0000| +| - bbh_cot_fewshot_formal_fallacies | 4|get-answer | 3|exact_match|↑ | 0.4960|± |0.0317| +| - bbh_cot_fewshot_geometric_shapes | 4|get-answer | 3|exact_match|↑ | 0.1440|± |0.0222| +| - bbh_cot_fewshot_hyperbaton | 4|get-answer | 3|exact_match|↑ | 0.5360|± |0.0316| +| - bbh_cot_fewshot_logical_deduction_five_objects | 4|get-answer | 3|exact_match|↑ | 0.2960|± |0.0289| +| - bbh_cot_fewshot_logical_deduction_seven_objects | 4|get-answer | 3|exact_match|↑ | 0.2280|± |0.0266| +| - bbh_cot_fewshot_logical_deduction_three_objects | 4|get-answer | 3|exact_match|↑ | 0.7040|± |0.0289| +| - bbh_cot_fewshot_movie_recommendation | 4|get-answer | 3|exact_match|↑ | 0.2840|± |0.0286| +| - bbh_cot_fewshot_multistep_arithmetic_two | 4|get-answer | 3|exact_match|↑ | 0.8080|± |0.0250| +| - bbh_cot_fewshot_navigate | 4|get-answer | 3|exact_match|↑ | 0.8680|± |0.0215| +| - bbh_cot_fewshot_object_counting | 4|get-answer | 3|exact_match|↑ | 0.6800|± |0.0296| +| - bbh_cot_fewshot_penguins_in_a_table | 4|get-answer | 3|exact_match|↑ | 0.6027|± |0.0406| +| - bbh_cot_fewshot_reasoning_about_colored_objects | 4|get-answer | 3|exact_match|↑ | 0.5760|± |0.0313| +| - bbh_cot_fewshot_ruin_names | 4|get-answer | 3|exact_match|↑ | 0.0480|± |0.0135| +| - bbh_cot_fewshot_salient_translation_error_detection | 4|get-answer | 3|exact_match|↑ | 0.2120|± |0.0259| +| - bbh_cot_fewshot_snarks | 4|get-answer | 3|exact_match|↑ | 0.4775|± |0.0375| +| - bbh_cot_fewshot_sports_understanding | 4|get-answer | 3|exact_match|↑ | 0.4800|± |0.0317| +| - bbh_cot_fewshot_temporal_sequences | 4|get-answer | 3|exact_match|↑ | 0.1840|± |0.0246| +| - bbh_cot_fewshot_tracking_shuffled_objects_five_objects | 4|get-answer | 3|exact_match|↑ | 0.2680|± |0.0281| +| - bbh_cot_fewshot_tracking_shuffled_objects_seven_objects| 4|get-answer | 3|exact_match|↑ | 0.2040|± |0.0255| +| - 
bbh_cot_fewshot_tracking_shuffled_objects_three_objects| 4|get-answer | 3|exact_match|↑ | 0.5200|± |0.0317| +| - bbh_cot_fewshot_web_of_lies | 4|get-answer | 3|exact_match|↑ | 0.8120|± |0.0248| +| - bbh_cot_fewshot_word_sorting | 4|get-answer | 3|exact_match|↑ | 0.0240|± |0.0097| +|boolq | 2|none | 0|acc |↑ | 0.5694|± |0.0087| +|drop | 3|none | 0|em |↑ | 0.0002|± |0.0001| +| | |none | 0|f1 |↑ | 0.0231|± |0.0007| +|gpqa_diamond_cot_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.1162|± |0.0228| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_diamond_cot_zeroshot | 1|flexible-extract | 0|exact_match|↑ | 0.0960|± |0.0210| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_diamond_generative_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.2172|± |0.0294| +| | |strict-match | 0|exact_match|↑ | 0.0051|± |0.0051| +|gpqa_diamond_n_shot | 2|none | 0|acc |↑ | 0.2828|± |0.0321| +| | |none | 0|acc_norm |↑ | 0.2828|± |0.0321| +|gpqa_diamond_zeroshot | 1|none | 0|acc |↑ | 0.3030|± |0.0327| +| | |none | 0|acc_norm |↑ | 0.3030|± |0.0327| +|gpqa_extended_cot_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.0971|± |0.0127| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_extended_cot_zeroshot | 1|flexible-extract | 0|exact_match|↑ | 0.1282|± |0.0143| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_extended_generative_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.2527|± |0.0186| +| | |strict-match | 0|exact_match|↑ | 0.0018|± |0.0018| +|gpqa_extended_n_shot | 2|none | 0|acc |↑ | 0.3168|± |0.0199| +| | |none | 0|acc_norm |↑ | 0.3168|± |0.0199| +|gpqa_extended_zeroshot | 1|none | 0|acc |↑ | 0.2821|± |0.0193| +| | |none | 0|acc_norm |↑ | 0.2821|± |0.0193| +|gpqa_main_cot_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.1004|± |0.0142| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_main_cot_zeroshot | 1|flexible-extract | 0|exact_match|↑ | 0.1094|± |0.0148| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_main_generative_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.2433|± |0.0203| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_main_n_shot | 2|none | 0|acc |↑ | 0.3013|± |0.0217| +| | |none | 0|acc_norm |↑ | 0.3013|± |0.0217| +|gpqa_main_zeroshot | 1|none | 0|acc |↑ | 0.2835|± |0.0213| +| | |none | 0|acc_norm |↑ | 0.2835|± |0.0213| +|gsm8k | 3|flexible-extract | 5|exact_match|↑ | 0.7415|± |0.0121| +| | |strict-match | 5|exact_match|↑ | 0.7369|± |0.0121| +|hellaswag | 1|none | 0|acc |↑ | 0.3530|± |0.0048| +| | |none | 0|acc_norm |↑ | 0.4166|± |0.0049| +|mmlu | 2|none | |acc |↑ | 0.3788|± |0.0041| +| - humanities | 2|none | |acc |↑ | 0.3271|± |0.0068| +| - formal_logic | 1|none | 0|acc |↑ | 0.4762|± |0.0447| +| - high_school_european_history | 1|none | 0|acc |↑ | 0.3758|± |0.0378| +| - high_school_us_history | 1|none | 0|acc |↑ | 0.3873|± |0.0342| +| - high_school_world_history | 1|none | 0|acc |↑ | 0.3755|± |0.0315| +| - international_law | 1|none | 0|acc |↑ | 0.4959|± |0.0456| +| - jurisprudence | 1|none | 0|acc |↑ | 0.3241|± |0.0452| +| - logical_fallacies | 1|none | 0|acc |↑ | 0.3988|± |0.0385| +| - moral_disputes | 1|none | 0|acc |↑ | 0.3699|± |0.0260| +| - moral_scenarios | 1|none | 0|acc |↑ | 0.2469|± |0.0144| +| - philosophy | 1|none | 0|acc |↑ | 0.3826|± |0.0276| +| - prehistory | 1|none | 0|acc |↑ | 0.3457|± |0.0265| +| - professional_law | 1|none | 0|acc |↑ | 0.3025|± |0.0117| +| - world_religions | 1|none | 0|acc |↑ | 0.2632|± |0.0338| +| - other | 2|none | |acc |↑ | 0.3746|± 
|0.0086| +| - business_ethics | 1|none | 0|acc |↑ | 0.2900|± |0.0456| +| - clinical_knowledge | 1|none | 0|acc |↑ | 0.3736|± |0.0298| +| - college_medicine | 1|none | 0|acc |↑ | 0.4046|± |0.0374| +| - global_facts | 1|none | 0|acc |↑ | 0.2600|± |0.0441| +| - human_aging | 1|none | 0|acc |↑ | 0.3677|± |0.0324| +| - management | 1|none | 0|acc |↑ | 0.4951|± |0.0495| +| - marketing | 1|none | 0|acc |↑ | 0.5513|± |0.0326| +| - medical_genetics | 1|none | 0|acc |↑ | 0.4700|± |0.0502| +| - miscellaneous | 1|none | 0|acc |↑ | 0.3729|± |0.0173| +| - nutrition | 1|none | 0|acc |↑ | 0.4281|± |0.0283| +| - professional_accounting | 1|none | 0|acc |↑ | 0.3014|± |0.0274| +| - professional_medicine | 1|none | 0|acc |↑ | 0.2426|± |0.0260| +| - virology | 1|none | 0|acc |↑ | 0.3434|± |0.0370| +| - social sciences | 2|none | |acc |↑ | 0.4127|± |0.0088| +| - econometrics | 1|none | 0|acc |↑ | 0.3596|± |0.0451| +| - high_school_geography | 1|none | 0|acc |↑ | 0.3434|± |0.0338| +| - high_school_government_and_politics | 1|none | 0|acc |↑ | 0.3627|± |0.0347| +| - high_school_macroeconomics | 1|none | 0|acc |↑ | 0.4282|± |0.0251| +| - high_school_microeconomics | 1|none | 0|acc |↑ | 0.4874|± |0.0325| +| - high_school_psychology | 1|none | 0|acc |↑ | 0.4532|± |0.0213| +| - human_sexuality | 1|none | 0|acc |↑ | 0.3893|± |0.0428| +| - professional_psychology | 1|none | 0|acc |↑ | 0.3578|± |0.0194| +| - public_relations | 1|none | 0|acc |↑ | 0.3727|± |0.0463| +| - security_studies | 1|none | 0|acc |↑ | 0.4163|± |0.0316| +| - sociology | 1|none | 0|acc |↑ | 0.4925|± |0.0354| +| - us_foreign_policy | 1|none | 0|acc |↑ | 0.4900|± |0.0502| +| - stem | 2|none | |acc |↑ | 0.4269|± |0.0088| +| - abstract_algebra | 1|none | 0|acc |↑ | 0.4100|± |0.0494| +| - anatomy | 1|none | 0|acc |↑ | 0.3185|± |0.0402| +| - astronomy | 1|none | 0|acc |↑ | 0.3816|± |0.0395| +| - college_biology | 1|none | 0|acc |↑ | 0.3125|± |0.0388| +| - college_chemistry | 1|none | 0|acc |↑ | 0.3500|± |0.0479| +| - college_computer_science | 1|none | 0|acc |↑ | 0.4900|± |0.0502| +| - college_mathematics | 1|none | 0|acc |↑ | 0.4400|± |0.0499| +| - college_physics | 1|none | 0|acc |↑ | 0.3431|± |0.0472| +| - computer_security | 1|none | 0|acc |↑ | 0.4200|± |0.0496| +| - conceptual_physics | 1|none | 0|acc |↑ | 0.4638|± |0.0326| +| - electrical_engineering | 1|none | 0|acc |↑ | 0.4759|± |0.0416| +| - elementary_mathematics | 1|none | 0|acc |↑ | 0.5344|± |0.0257| +| - high_school_biology | 1|none | 0|acc |↑ | 0.4387|± |0.0282| +| - high_school_chemistry | 1|none | 0|acc |↑ | 0.4187|± |0.0347| +| - high_school_computer_science | 1|none | 0|acc |↑ | 0.5500|± |0.0500| +| - high_school_mathematics | 1|none | 0|acc |↑ | 0.3778|± |0.0296| +| - high_school_physics | 1|none | 0|acc |↑ | 0.3510|± |0.0390| +| - high_school_statistics | 1|none | 0|acc |↑ | 0.4583|± |0.0340| +| - machine_learning | 1|none | 0|acc |↑ | 0.3929|± |0.0464| +|nq_open | 4|remove_whitespace| 0|exact_match|↑ | 0.0039|± |0.0010| +|openbookqa | 1|none | 0|acc |↑ | 0.1980|± |0.0178| +| | |none | 0|acc_norm |↑ | 0.2860|± |0.0202| +|piqa | 1|none | 0|acc |↑ | 0.6115|± |0.0114| +| | |none | 0|acc_norm |↑ | 0.6137|± |0.0114| +|qnli | 1|none | 0|acc |↑ | 0.4973|± |0.0068| +|sciq | 1|none | 0|acc |↑ | 0.7550|± |0.0136| +| | |none | 0|acc_norm |↑ | 0.7180|± |0.0142| +|triviaqa | 3|remove_whitespace| 0|exact_match|↑ | 0.0043|± |0.0005| +|truthfulqa_gen | 3|none | 0|bleu_acc |↑ | 0.3672|± |0.0169| +| | |none | 0|bleu_diff |↑ |-0.3104|± |0.3272| +| | |none | 0|bleu_max |↑ |10.0271|± |0.4450| +| | |none | 
0|rouge1_acc |↑ | 0.4162|± |0.0173| +| | |none | 0|rouge1_diff|↑ | 0.4141|± |0.5503| +| | |none | 0|rouge1_max |↑ |30.4103|± |0.7236| +| | |none | 0|rouge2_acc |↑ | 0.2632|± |0.0154| +| | |none | 0|rouge2_diff|↑ |-0.8895|± |0.6038| +| | |none | 0|rouge2_max |↑ |17.4804|± |0.7257| +| | |none | 0|rougeL_acc |↑ | 0.4002|± |0.0172| +| | |none | 0|rougeL_diff|↑ | 0.0440|± |0.5522| +| | |none | 0|rougeL_max |↑ |28.4832|± |0.7201| +|truthfulqa_mc1 | 2|none | 0|acc |↑ | 0.2901|± |0.0159| +|truthfulqa_mc2 | 3|none | 0|acc |↑ | 0.4895|± |0.0159| +|winogrande | 1|none | 0|acc |↑ | 0.5257|± |0.0140| + +| Groups |Version| Filter |n-shot| Metric | |Value | |Stderr| +|------------------|------:|----------|------|-----------|---|-----:|---|-----:| +|bbh | 3|get-answer| |exact_match|↑ |0.4373|± |0.0052| +|mmlu | 2|none | |acc |↑ |0.3788|± |0.0041| +| - humanities | 2|none | |acc |↑ |0.3271|± |0.0068| +| - other | 2|none | |acc |↑ |0.3746|± |0.0086| +| - social sciences| 2|none | |acc |↑ |0.4127|± |0.0088| +| - stem | 2|none | |acc |↑ |0.4269|± |0.0088| + +Qwen_Qwen2.5-Math-1.5B-Instruct: 3h 25m 33s +✅ Benchmark completed for Qwen_Qwen2.5-Math-1.5B-Instruct + +🔥 Starting benchmark for Qwen_Qwen2.5-3B-Instruct +Passed argument batch_size = auto:4.0. Detecting largest batch size +Determined largest batch size: 2 +Passed argument batch_size = auto:4.0. Detecting largest batch size +Determined largest batch size: 64 +Passed argument batch_size = auto. Detecting largest batch size +Determined Largest batch size: 1 +hf (pretrained=/mnt/data8tb/Documents/llm/llm_models/Qwen_Qwen2.5-3B-Instruct,trust_remote_code=True), gen_kwargs: (None), limit: None, num_fewshot: None, batch_size: auto:4 (2,64,64,64,64) +| Tasks |Version| Filter |n-shot| Metric | | Value | |Stderr| +|----------------------------------------------------------|------:|-----------------|-----:|-----------|---|------:|---|-----:| +|anli_r1 | 1|none | 0|acc |↑ | 0.5620|± |0.0157| +|anli_r2 | 1|none | 0|acc |↑ | 0.4660|± |0.0158| +|anli_r3 | 1|none | 0|acc |↑ | 0.4942|± |0.0144| +|arc_challenge | 1|none | 0|acc |↑ | 0.4590|± |0.0146| +| | |none | 0|acc_norm |↑ | 0.4821|± |0.0146| +|bbh | 3|get-answer | |exact_match|↑ | 0.2491|± |0.0041| +| - bbh_cot_fewshot_boolean_expressions | 4|get-answer | 3|exact_match|↑ | 0.2320|± |0.0268| +| - bbh_cot_fewshot_causal_judgement | 4|get-answer | 3|exact_match|↑ | 0.0053|± |0.0053| +| - bbh_cot_fewshot_date_understanding | 4|get-answer | 3|exact_match|↑ | 0.4120|± |0.0312| +| - bbh_cot_fewshot_disambiguation_qa | 4|get-answer | 3|exact_match|↑ | 0.0000|± |0.0000| +| - bbh_cot_fewshot_dyck_languages | 4|get-answer | 3|exact_match|↑ | 0.0000|± |0.0000| +| - bbh_cot_fewshot_formal_fallacies | 4|get-answer | 3|exact_match|↑ | 0.0320|± |0.0112| +| - bbh_cot_fewshot_geometric_shapes | 4|get-answer | 3|exact_match|↑ | 0.2520|± |0.0275| +| - bbh_cot_fewshot_hyperbaton | 4|get-answer | 3|exact_match|↑ | 0.0000|± |0.0000| +| - bbh_cot_fewshot_logical_deduction_five_objects | 4|get-answer | 3|exact_match|↑ | 0.1200|± |0.0206| +| - bbh_cot_fewshot_logical_deduction_seven_objects | 4|get-answer | 3|exact_match|↑ | 0.0920|± |0.0183| +| - bbh_cot_fewshot_logical_deduction_three_objects | 4|get-answer | 3|exact_match|↑ | 0.0520|± |0.0141| +| - bbh_cot_fewshot_movie_recommendation | 4|get-answer | 3|exact_match|↑ | 0.0440|± |0.0130| +| - bbh_cot_fewshot_multistep_arithmetic_two | 4|get-answer | 3|exact_match|↑ | 0.0000|± |0.0000| +| - bbh_cot_fewshot_navigate | 4|get-answer | 3|exact_match|↑ | 0.8480|± |0.0228| +| - 
bbh_cot_fewshot_object_counting | 4|get-answer | 3|exact_match|↑ | 0.5280|± |0.0316| +| - bbh_cot_fewshot_penguins_in_a_table | 4|get-answer | 3|exact_match|↑ | 0.2329|± |0.0351| +| - bbh_cot_fewshot_reasoning_about_colored_objects | 4|get-answer | 3|exact_match|↑ | 0.4040|± |0.0311| +| - bbh_cot_fewshot_ruin_names | 4|get-answer | 3|exact_match|↑ | 0.0000|± |0.0000| +| - bbh_cot_fewshot_salient_translation_error_detection | 4|get-answer | 3|exact_match|↑ | 0.0160|± |0.0080| +| - bbh_cot_fewshot_snarks | 4|get-answer | 3|exact_match|↑ | 0.0000|± |0.0000| +| - bbh_cot_fewshot_sports_understanding | 4|get-answer | 3|exact_match|↑ | 0.6800|± |0.0296| +| - bbh_cot_fewshot_temporal_sequences | 4|get-answer | 3|exact_match|↑ | 0.3720|± |0.0306| +| - bbh_cot_fewshot_tracking_shuffled_objects_five_objects | 4|get-answer | 3|exact_match|↑ | 0.3880|± |0.0309| +| - bbh_cot_fewshot_tracking_shuffled_objects_seven_objects| 4|get-answer | 3|exact_match|↑ | 0.1160|± |0.0203| +| - bbh_cot_fewshot_tracking_shuffled_objects_three_objects| 4|get-answer | 3|exact_match|↑ | 0.6400|± |0.0304| +| - bbh_cot_fewshot_web_of_lies | 4|get-answer | 3|exact_match|↑ | 1.0000|± |0.0000| +| - bbh_cot_fewshot_word_sorting | 4|get-answer | 3|exact_match|↑ | 0.1200|± |0.0206| +|boolq | 2|none | 0|acc |↑ | 0.8012|± |0.0070| +|drop | 3|none | 0|em |↑ | 0.0016|± |0.0004| +| | |none | 0|f1 |↑ | 0.0773|± |0.0014| +|gpqa_diamond_cot_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.1010|± |0.0215| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_diamond_cot_zeroshot | 1|flexible-extract | 0|exact_match|↑ | 0.0909|± |0.0205| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_diamond_generative_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.1667|± |0.0266| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_diamond_n_shot | 2|none | 0|acc |↑ | 0.3232|± |0.0333| +| | |none | 0|acc_norm |↑ | 0.3232|± |0.0333| +|gpqa_diamond_zeroshot | 1|none | 0|acc |↑ | 0.3081|± |0.0329| +| | |none | 0|acc_norm |↑ | 0.3081|± |0.0329| +|gpqa_extended_cot_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.1538|± |0.0155| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_extended_cot_zeroshot | 1|flexible-extract | 0|exact_match|↑ | 0.1081|± |0.0133| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_extended_generative_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.1996|± |0.0171| +| | |strict-match | 0|exact_match|↑ | 0.0018|± |0.0018| +|gpqa_extended_n_shot | 2|none | 0|acc |↑ | 0.2985|± |0.0196| +| | |none | 0|acc_norm |↑ | 0.2985|± |0.0196| +|gpqa_extended_zeroshot | 1|none | 0|acc |↑ | 0.3278|± |0.0201| +| | |none | 0|acc_norm |↑ | 0.3278|± |0.0201| +|gpqa_main_cot_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.1183|± |0.0153| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_main_cot_zeroshot | 1|flexible-extract | 0|exact_match|↑ | 0.1161|± |0.0152| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_main_generative_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.1763|± |0.0180| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_main_n_shot | 2|none | 0|acc |↑ | 0.2812|± |0.0213| +| | |none | 0|acc_norm |↑ | 0.2812|± |0.0213| +|gpqa_main_zeroshot | 1|none | 0|acc |↑ | 0.3214|± |0.0221| +| | |none | 0|acc_norm |↑ | 0.3214|± |0.0221| +|gsm8k | 3|flexible-extract | 5|exact_match|↑ | 0.6384|± |0.0132| +| | |strict-match | 5|exact_match|↑ | 0.1016|± |0.0083| +|hellaswag | 1|none | 0|acc |↑ | 0.5633|± |0.0049| +| | |none | 0|acc_norm |↑ 
| 0.7491|± |0.0043| +|mmlu | 2|none | |acc |↑ | 0.6550|± |0.0038| +| - humanities | 2|none | |acc |↑ | 0.5858|± |0.0067| +| - formal_logic | 1|none | 0|acc |↑ | 0.4603|± |0.0446| +| - high_school_european_history | 1|none | 0|acc |↑ | 0.8061|± |0.0309| +| - high_school_us_history | 1|none | 0|acc |↑ | 0.8333|± |0.0262| +| - high_school_world_history | 1|none | 0|acc |↑ | 0.8523|± |0.0231| +| - international_law | 1|none | 0|acc |↑ | 0.7851|± |0.0375| +| - jurisprudence | 1|none | 0|acc |↑ | 0.7778|± |0.0402| +| - logical_fallacies | 1|none | 0|acc |↑ | 0.7975|± |0.0316| +| - moral_disputes | 1|none | 0|acc |↑ | 0.6763|± |0.0252| +| - moral_scenarios | 1|none | 0|acc |↑ | 0.3374|± |0.0158| +| - philosophy | 1|none | 0|acc |↑ | 0.7074|± |0.0258| +| - prehistory | 1|none | 0|acc |↑ | 0.7315|± |0.0247| +| - professional_law | 1|none | 0|acc |↑ | 0.4896|± |0.0128| +| - world_religions | 1|none | 0|acc |↑ | 0.8187|± |0.0295| +| - other | 2|none | |acc |↑ | 0.7023|± |0.0079| +| - business_ethics | 1|none | 0|acc |↑ | 0.7500|± |0.0435| +| - clinical_knowledge | 1|none | 0|acc |↑ | 0.7094|± |0.0279| +| - college_medicine | 1|none | 0|acc |↑ | 0.6474|± |0.0364| +| - global_facts | 1|none | 0|acc |↑ | 0.3400|± |0.0476| +| - human_aging | 1|none | 0|acc |↑ | 0.7040|± |0.0306| +| - management | 1|none | 0|acc |↑ | 0.7864|± |0.0406| +| - marketing | 1|none | 0|acc |↑ | 0.8846|± |0.0209| +| - medical_genetics | 1|none | 0|acc |↑ | 0.7900|± |0.0409| +| - miscellaneous | 1|none | 0|acc |↑ | 0.7982|± |0.0144| +| - nutrition | 1|none | 0|acc |↑ | 0.7190|± |0.0257| +| - professional_accounting | 1|none | 0|acc |↑ | 0.5355|± |0.0298| +| - professional_medicine | 1|none | 0|acc |↑ | 0.6360|± |0.0292| +| - virology | 1|none | 0|acc |↑ | 0.4819|± |0.0389| +| - social sciences | 2|none | |acc |↑ | 0.7602|± |0.0076| +| - econometrics | 1|none | 0|acc |↑ | 0.4912|± |0.0470| +| - high_school_geography | 1|none | 0|acc |↑ | 0.7828|± |0.0294| +| - high_school_government_and_politics | 1|none | 0|acc |↑ | 0.8653|± |0.0246| +| - high_school_macroeconomics | 1|none | 0|acc |↑ | 0.6846|± |0.0236| +| - high_school_microeconomics | 1|none | 0|acc |↑ | 0.7815|± |0.0268| +| - high_school_psychology | 1|none | 0|acc |↑ | 0.8459|± |0.0155| +| - human_sexuality | 1|none | 0|acc |↑ | 0.7481|± |0.0381| +| - professional_psychology | 1|none | 0|acc |↑ | 0.7190|± |0.0182| +| - public_relations | 1|none | 0|acc |↑ | 0.6818|± |0.0446| +| - security_studies | 1|none | 0|acc |↑ | 0.7510|± |0.0277| +| - sociology | 1|none | 0|acc |↑ | 0.8308|± |0.0265| +| - us_foreign_policy | 1|none | 0|acc |↑ | 0.8300|± |0.0378| +| - stem | 2|none | |acc |↑ | 0.6089|± |0.0085| +| - abstract_algebra | 1|none | 0|acc |↑ | 0.4900|± |0.0502| +| - anatomy | 1|none | 0|acc |↑ | 0.6667|± |0.0407| +| - astronomy | 1|none | 0|acc |↑ | 0.7368|± |0.0358| +| - college_biology | 1|none | 0|acc |↑ | 0.7292|± |0.0372| +| - college_chemistry | 1|none | 0|acc |↑ | 0.5000|± |0.0503| +| - college_computer_science | 1|none | 0|acc |↑ | 0.5900|± |0.0494| +| - college_mathematics | 1|none | 0|acc |↑ | 0.3100|± |0.0465| +| - college_physics | 1|none | 0|acc |↑ | 0.4804|± |0.0497| +| - computer_security | 1|none | 0|acc |↑ | 0.7200|± |0.0451| +| - conceptual_physics | 1|none | 0|acc |↑ | 0.6383|± |0.0314| +| - electrical_engineering | 1|none | 0|acc |↑ | 0.6483|± |0.0398| +| - elementary_mathematics | 1|none | 0|acc |↑ | 0.6032|± |0.0252| +| - high_school_biology | 1|none | 0|acc |↑ | 0.8161|± |0.0220| +| - high_school_chemistry | 1|none | 0|acc |↑ | 0.5813|± |0.0347| +| - 
high_school_computer_science | 1|none | 0|acc |↑ | 0.7500|± |0.0435| +| - high_school_mathematics | 1|none | 0|acc |↑ | 0.5000|± |0.0305| +| - high_school_physics | 1|none | 0|acc |↑ | 0.4503|± |0.0406| +| - high_school_statistics | 1|none | 0|acc |↑ | 0.5880|± |0.0336| +| - machine_learning | 1|none | 0|acc |↑ | 0.4911|± |0.0475| +|nq_open | 4|remove_whitespace| 0|exact_match|↑ | 0.0083|± |0.0015| +|openbookqa | 1|none | 0|acc |↑ | 0.3320|± |0.0211| +| | |none | 0|acc_norm |↑ | 0.4220|± |0.0221| +|piqa | 1|none | 0|acc |↑ | 0.7786|± |0.0097| +| | |none | 0|acc_norm |↑ | 0.7807|± |0.0097| +|qnli | 1|none | 0|acc |↑ | 0.7979|± |0.0054| +|sciq | 1|none | 0|acc |↑ | 0.9460|± |0.0072| +| | |none | 0|acc_norm |↑ | 0.9130|± |0.0089| +|triviaqa | 3|remove_whitespace| 0|exact_match|↑ | 0.3010|± |0.0034| +|truthfulqa_gen | 3|none | 0|bleu_acc |↑ | 0.4835|± |0.0175| +| | |none | 0|bleu_diff |↑ |-0.2750|± |0.2661| +| | |none | 0|bleu_max |↑ | 7.7071|± |0.3537| +| | |none | 0|rouge1_acc |↑ | 0.4847|± |0.0175| +| | |none | 0|rouge1_diff|↑ |-0.1394|± |0.4206| +| | |none | 0|rouge1_max |↑ |26.4054|± |0.5770| +| | |none | 0|rouge2_acc |↑ | 0.3880|± |0.0171| +| | |none | 0|rouge2_diff|↑ |-0.7135|± |0.4378| +| | |none | 0|rouge2_max |↑ |15.0737|± |0.5493| +| | |none | 0|rougeL_acc |↑ | 0.4651|± |0.0175| +| | |none | 0|rougeL_diff|↑ |-0.4360|± |0.4149| +| | |none | 0|rougeL_max |↑ |23.4471|± |0.5573| +|truthfulqa_mc1 | 2|none | 0|acc |↑ | 0.4162|± |0.0173| +|truthfulqa_mc2 | 3|none | 0|acc |↑ | 0.5861|± |0.0157| +|winogrande | 1|none | 0|acc |↑ | 0.6930|± |0.0130| + +| Groups |Version| Filter |n-shot| Metric | |Value | |Stderr| +|------------------|------:|----------|------|-----------|---|-----:|---|-----:| +|bbh | 3|get-answer| |exact_match|↑ |0.2491|± |0.0041| +|mmlu | 2|none | |acc |↑ |0.6550|± |0.0038| +| - humanities | 2|none | |acc |↑ |0.5858|± |0.0067| +| - other | 2|none | |acc |↑ |0.7023|± |0.0079| +| - social sciences| 2|none | |acc |↑ |0.7602|± |0.0076| +| - stem | 2|none | |acc |↑ |0.6089|± |0.0085| + +Qwen_Qwen2.5-3B-Instruct: 7h 48m 19s +✅ Benchmark completed for Qwen_Qwen2.5-3B-Instruct + +🔥 Starting benchmark for Qwen_Qwen2.5-1.5B-Instruct +Passed argument batch_size = auto:4.0. Detecting largest batch size +Determined largest batch size: 1 +Passed argument batch_size = auto:4.0. Detecting largest batch size +Determined largest batch size: 64 +Passed argument batch_size = auto. 
Detecting largest batch size +Determined Largest batch size: 1 +hf (pretrained=/mnt/data8tb/Documents/llm/llm_models/Qwen_Qwen2.5-1.5B-Instruct,trust_remote_code=True), gen_kwargs: (None), limit: None, num_fewshot: None, batch_size: auto:4 (1,64,64,64,64) +| Tasks |Version| Filter |n-shot| Metric | | Value | |Stderr| +|----------------------------------------------------------|------:|-----------------|-----:|-----------|---|------:|---|-----:| +|anli_r1 | 1|none | 0|acc |↑ | 0.4500|± |0.0157| +|anli_r2 | 1|none | 0|acc |↑ | 0.3940|± |0.0155| +|anli_r3 | 1|none | 0|acc |↑ | 0.4325|± |0.0143| +|arc_challenge | 1|none | 0|acc |↑ | 0.4394|± |0.0145| +| | |none | 0|acc_norm |↑ | 0.4659|± |0.0146| +|bbh | 3|get-answer | |exact_match|↑ | 0.3861|± |0.0054| +| - bbh_cot_fewshot_boolean_expressions | 4|get-answer | 3|exact_match|↑ | 0.8480|± |0.0228| +| - bbh_cot_fewshot_causal_judgement | 4|get-answer | 3|exact_match|↑ | 0.5080|± |0.0367| +| - bbh_cot_fewshot_date_understanding | 4|get-answer | 3|exact_match|↑ | 0.4040|± |0.0311| +| - bbh_cot_fewshot_disambiguation_qa | 4|get-answer | 3|exact_match|↑ | 0.3880|± |0.0309| +| - bbh_cot_fewshot_dyck_languages | 4|get-answer | 3|exact_match|↑ | 0.0200|± |0.0089| +| - bbh_cot_fewshot_formal_fallacies | 4|get-answer | 3|exact_match|↑ | 0.2280|± |0.0266| +| - bbh_cot_fewshot_geometric_shapes | 4|get-answer | 3|exact_match|↑ | 0.3120|± |0.0294| +| - bbh_cot_fewshot_hyperbaton | 4|get-answer | 3|exact_match|↑ | 0.5720|± |0.0314| +| - bbh_cot_fewshot_logical_deduction_five_objects | 4|get-answer | 3|exact_match|↑ | 0.3040|± |0.0292| +| - bbh_cot_fewshot_logical_deduction_seven_objects | 4|get-answer | 3|exact_match|↑ | 0.2280|± |0.0266| +| - bbh_cot_fewshot_logical_deduction_three_objects | 4|get-answer | 3|exact_match|↑ | 0.5880|± |0.0312| +| - bbh_cot_fewshot_movie_recommendation | 4|get-answer | 3|exact_match|↑ | 0.5320|± |0.0316| +| - bbh_cot_fewshot_multistep_arithmetic_two | 4|get-answer | 3|exact_match|↑ | 0.5920|± |0.0311| +| - bbh_cot_fewshot_navigate | 4|get-answer | 3|exact_match|↑ | 0.6640|± |0.0299| +| - bbh_cot_fewshot_object_counting | 4|get-answer | 3|exact_match|↑ | 0.5640|± |0.0314| +| - bbh_cot_fewshot_penguins_in_a_table | 4|get-answer | 3|exact_match|↑ | 0.4795|± |0.0415| +| - bbh_cot_fewshot_reasoning_about_colored_objects | 4|get-answer | 3|exact_match|↑ | 0.5200|± |0.0317| +| - bbh_cot_fewshot_ruin_names | 4|get-answer | 3|exact_match|↑ | 0.3480|± |0.0302| +| - bbh_cot_fewshot_salient_translation_error_detection | 4|get-answer | 3|exact_match|↑ | 0.3440|± |0.0301| +| - bbh_cot_fewshot_snarks | 4|get-answer | 3|exact_match|↑ | 0.4831|± |0.0376| +| - bbh_cot_fewshot_sports_understanding | 4|get-answer | 3|exact_match|↑ | 0.6600|± |0.0300| +| - bbh_cot_fewshot_temporal_sequences | 4|get-answer | 3|exact_match|↑ | 0.1640|± |0.0235| +| - bbh_cot_fewshot_tracking_shuffled_objects_five_objects | 4|get-answer | 3|exact_match|↑ | 0.1440|± |0.0222| +| - bbh_cot_fewshot_tracking_shuffled_objects_seven_objects| 4|get-answer | 3|exact_match|↑ | 0.1200|± |0.0206| +| - bbh_cot_fewshot_tracking_shuffled_objects_three_objects| 4|get-answer | 3|exact_match|↑ | 0.4080|± |0.0311| +| - bbh_cot_fewshot_web_of_lies | 4|get-answer | 3|exact_match|↑ | 0.0560|± |0.0146| +| - bbh_cot_fewshot_word_sorting | 4|get-answer | 3|exact_match|↑ | 0.0440|± |0.0130| +|boolq | 2|none | 0|acc |↑ | 0.7810|± |0.0072| +|drop | 3|none | 0|em |↑ | 0.0018|± |0.0004| +| | |none | 0|f1 |↑ | 0.0391|± |0.0011| +|gpqa_diamond_cot_n_shot | 2|flexible-extract | 0|exact_match|↑ | 
0.1869|± |0.0278| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_diamond_cot_zeroshot | 1|flexible-extract | 0|exact_match|↑ | 0.1919|± |0.0281| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_diamond_generative_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.2222|± |0.0296| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_diamond_n_shot | 2|none | 0|acc |↑ | 0.2929|± |0.0324| +| | |none | 0|acc_norm |↑ | 0.2929|± |0.0324| +|gpqa_diamond_zeroshot | 1|none | 0|acc |↑ | 0.2273|± |0.0299| +| | |none | 0|acc_norm |↑ | 0.2273|± |0.0299| +|gpqa_extended_cot_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.1941|± |0.0169| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_extended_cot_zeroshot | 1|flexible-extract | 0|exact_match|↑ | 0.1465|± |0.0151| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_extended_generative_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.2179|± |0.0177| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_extended_n_shot | 2|none | 0|acc |↑ | 0.3077|± |0.0198| +| | |none | 0|acc_norm |↑ | 0.3077|± |0.0198| +|gpqa_extended_zeroshot | 1|none | 0|acc |↑ | 0.2985|± |0.0196| +| | |none | 0|acc_norm |↑ | 0.2985|± |0.0196| +|gpqa_main_cot_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.1674|± |0.0177| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_main_cot_zeroshot | 1|flexible-extract | 0|exact_match|↑ | 0.1496|± |0.0169| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_main_generative_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.2232|± |0.0197| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_main_n_shot | 2|none | 0|acc |↑ | 0.3237|± |0.0221| +| | |none | 0|acc_norm |↑ | 0.3237|± |0.0221| +|gpqa_main_zeroshot | 1|none | 0|acc |↑ | 0.2857|± |0.0214| +| | |none | 0|acc_norm |↑ | 0.2857|± |0.0214| +|gsm8k | 3|flexible-extract | 5|exact_match|↑ | 0.5201|± |0.0138| +| | |strict-match | 5|exact_match|↑ | 0.3025|± |0.0127| +|hellaswag | 1|none | 0|acc |↑ | 0.5087|± |0.0050| +| | |none | 0|acc_norm |↑ | 0.6827|± |0.0046| +|mmlu | 2|none | |acc |↑ | 0.6003|± |0.0039| +| - humanities | 2|none | |acc |↑ | 0.5409|± |0.0068| +| - formal_logic | 1|none | 0|acc |↑ | 0.5000|± |0.0447| +| - high_school_european_history | 1|none | 0|acc |↑ | 0.7576|± |0.0335| +| - high_school_us_history | 1|none | 0|acc |↑ | 0.7255|± |0.0313| +| - high_school_world_history | 1|none | 0|acc |↑ | 0.7722|± |0.0273| +| - international_law | 1|none | 0|acc |↑ | 0.7355|± |0.0403| +| - jurisprudence | 1|none | 0|acc |↑ | 0.7963|± |0.0389| +| - logical_fallacies | 1|none | 0|acc |↑ | 0.7669|± |0.0332| +| - moral_disputes | 1|none | 0|acc |↑ | 0.6474|± |0.0257| +| - moral_scenarios | 1|none | 0|acc |↑ | 0.2972|± |0.0153| +| - philosophy | 1|none | 0|acc |↑ | 0.6656|± |0.0268| +| - prehistory | 1|none | 0|acc |↑ | 0.6728|± |0.0261| +| - professional_law | 1|none | 0|acc |↑ | 0.4394|± |0.0127| +| - world_religions | 1|none | 0|acc |↑ | 0.8012|± |0.0306| +| - other | 2|none | |acc |↑ | 0.6453|± |0.0083| +| - business_ethics | 1|none | 0|acc |↑ | 0.6600|± |0.0476| +| - clinical_knowledge | 1|none | 0|acc |↑ | 0.6642|± |0.0291| +| - college_medicine | 1|none | 0|acc |↑ | 0.6705|± |0.0358| +| - global_facts | 1|none | 0|acc |↑ | 0.2100|± |0.0409| +| - human_aging | 1|none | 0|acc |↑ | 0.6278|± |0.0324| +| - management | 1|none | 0|acc |↑ | 0.8155|± |0.0384| +| - marketing | 1|none | 0|acc |↑ | 0.8376|± |0.0242| +| - medical_genetics | 1|none | 0|acc |↑ | 0.6600|± |0.0476| +| - 
miscellaneous | 1|none | 0|acc |↑ | 0.7241|± |0.0160| +| - nutrition | 1|none | 0|acc |↑ | 0.6732|± |0.0269| +| - professional_accounting | 1|none | 0|acc |↑ | 0.4610|± |0.0297| +| - professional_medicine | 1|none | 0|acc |↑ | 0.6029|± |0.0297| +| - virology | 1|none | 0|acc |↑ | 0.4398|± |0.0386| +| - social sciences | 2|none | |acc |↑ | 0.7085|± |0.0080| +| - econometrics | 1|none | 0|acc |↑ | 0.4474|± |0.0468| +| - high_school_geography | 1|none | 0|acc |↑ | 0.7677|± |0.0301| +| - high_school_government_and_politics | 1|none | 0|acc |↑ | 0.7979|± |0.0290| +| - high_school_macroeconomics | 1|none | 0|acc |↑ | 0.6769|± |0.0237| +| - high_school_microeconomics | 1|none | 0|acc |↑ | 0.7185|± |0.0292| +| - high_school_psychology | 1|none | 0|acc |↑ | 0.8275|± |0.0162| +| - human_sexuality | 1|none | 0|acc |↑ | 0.7328|± |0.0388| +| - professional_psychology | 1|none | 0|acc |↑ | 0.6013|± |0.0198| +| - public_relations | 1|none | 0|acc |↑ | 0.5909|± |0.0471| +| - security_studies | 1|none | 0|acc |↑ | 0.6980|± |0.0294| +| - sociology | 1|none | 0|acc |↑ | 0.8060|± |0.0280| +| - us_foreign_policy | 1|none | 0|acc |↑ | 0.7500|± |0.0435| +| - stem | 2|none | |acc |↑ | 0.5392|± |0.0086| +| - abstract_algebra | 1|none | 0|acc |↑ | 0.3700|± |0.0485| +| - anatomy | 1|none | 0|acc |↑ | 0.5333|± |0.0431| +| - astronomy | 1|none | 0|acc |↑ | 0.7105|± |0.0369| +| - college_biology | 1|none | 0|acc |↑ | 0.6528|± |0.0398| +| - college_chemistry | 1|none | 0|acc |↑ | 0.3300|± |0.0473| +| - college_computer_science | 1|none | 0|acc |↑ | 0.5000|± |0.0503| +| - college_mathematics | 1|none | 0|acc |↑ | 0.4100|± |0.0494| +| - college_physics | 1|none | 0|acc |↑ | 0.4608|± |0.0496| +| - computer_security | 1|none | 0|acc |↑ | 0.7300|± |0.0446| +| - conceptual_physics | 1|none | 0|acc |↑ | 0.5787|± |0.0323| +| - electrical_engineering | 1|none | 0|acc |↑ | 0.6069|± |0.0407| +| - elementary_mathematics | 1|none | 0|acc |↑ | 0.4947|± |0.0257| +| - high_school_biology | 1|none | 0|acc |↑ | 0.7613|± |0.0243| +| - high_school_chemistry | 1|none | 0|acc |↑ | 0.4926|± |0.0352| +| - high_school_computer_science | 1|none | 0|acc |↑ | 0.6700|± |0.0473| +| - high_school_mathematics | 1|none | 0|acc |↑ | 0.4148|± |0.0300| +| - high_school_physics | 1|none | 0|acc |↑ | 0.3709|± |0.0394| +| - high_school_statistics | 1|none | 0|acc |↑ | 0.5463|± |0.0340| +| - machine_learning | 1|none | 0|acc |↑ | 0.4018|± |0.0465| +|nq_open | 4|remove_whitespace| 0|exact_match|↑ | 0.0421|± |0.0033| +|openbookqa | 1|none | 0|acc |↑ | 0.3160|± |0.0208| +| | |none | 0|acc_norm |↑ | 0.4040|± |0.0220| +|piqa | 1|none | 0|acc |↑ | 0.7633|± |0.0099| +| | |none | 0|acc_norm |↑ | 0.7595|± |0.0100| +|qnli | 1|none | 0|acc |↑ | 0.5660|± |0.0067| +|sciq | 1|none | 0|acc |↑ | 0.9490|± |0.0070| +| | |none | 0|acc_norm |↑ | 0.9400|± |0.0075| +|triviaqa | 3|remove_whitespace| 0|exact_match|↑ | 0.2801|± |0.0034| +|truthfulqa_gen | 3|none | 0|bleu_acc |↑ | 0.3966|± |0.0171| +| | |none | 0|bleu_diff |↑ |-1.7922|± |0.4018| +| | |none | 0|bleu_max |↑ |12.2414|± |0.5454| +| | |none | 0|rouge1_acc |↑ | 0.4272|± |0.0173| +| | |none | 0|rouge1_diff|↑ |-3.2519|± |0.6342| +| | |none | 0|rouge1_max |↑ |32.3672|± |0.7715| +| | |none | 0|rouge2_acc |↑ | 0.2742|± |0.0156| +| | |none | 0|rouge2_diff|↑ |-3.5921|± |0.6461| +| | |none | 0|rouge2_max |↑ |17.5177|± |0.7544| +| | |none | 0|rougeL_acc |↑ | 0.4235|± |0.0173| +| | |none | 0|rougeL_diff|↑ |-3.5600|± |0.6371| +| | |none | 0|rougeL_max |↑ |29.5881|± |0.7560| +|truthfulqa_mc1 | 2|none | 0|acc |↑ | 0.3121|± |0.0162| 
+|truthfulqa_mc2 | 3|none | 0|acc |↑ | 0.4657|± |0.0150| +|winogrande | 1|none | 0|acc |↑ | 0.6290|± |0.0136| + +| Groups |Version| Filter |n-shot| Metric | |Value | |Stderr| +|------------------|------:|----------|------|-----------|---|-----:|---|-----:| +|bbh | 3|get-answer| |exact_match|↑ |0.3861|± |0.0054| +|mmlu | 2|none | |acc |↑ |0.6003|± |0.0039| +| - humanities | 2|none | |acc |↑ |0.5409|± |0.0068| +| - other | 2|none | |acc |↑ |0.6453|± |0.0083| +| - social sciences| 2|none | |acc |↑ |0.7085|± |0.0080| +| - stem | 2|none | |acc |↑ |0.5392|± |0.0086| + +Qwen_Qwen2.5-1.5B-Instruct: 5h 38m 25s +✅ Benchmark completed for Qwen_Qwen2.5-1.5B-Instruct + +🔥 Starting benchmark for Qwen_Qwen2.5-0.5B-Instruct +Passed argument batch_size = auto:4.0. Detecting largest batch size +Determined largest batch size: 2 +Passed argument batch_size = auto:4.0. Detecting largest batch size +Determined largest batch size: 64 +🔥 Starting benchmark for deepseek-ai_deepseek-moe-16b-base +🔥 Starting benchmark for deepseek-ai_deepseek-moe-16b-base +deepseek-ai_deepseek-moe-16b-base: 0h 5m 31s +✅ Benchmark completed for deepseek-ai_deepseek-moe-16b-base + +🔥 Starting benchmark for Deepseek-ai_deepseek-moe-16b-chat +Deepseek-ai_deepseek-moe-16b-chat: 0h 0m 4s +✅ Benchmark completed for Deepseek-ai_deepseek-moe-16b-chat + +🔥 Starting benchmark for Qwen_Qwen-7B-Chat +Qwen_Qwen-7B-Chat: 0h 5m 5s +✅ Benchmark completed for Qwen_Qwen-7B-Chat + +🔥 Starting benchmark for Qwen_Qwen-7B +Qwen_Qwen-7B: 0h 5m 8s +✅ Benchmark completed for Qwen_Qwen-7B + +🔥 Starting benchmark for baichuan-inc_Baichuan2-13B-Chat +baichuan-inc_Baichuan2-13B-Chat: 1h 19m 49s +✅ Benchmark completed for baichuan-inc_Baichuan2-13B-Chat + +🔥 Starting benchmark for Qwen_Qwen2.5-14B-Instruct +hf (pretrained=/mnt/data8tb/Documents/llm/llm_models/Qwen_Qwen2.5-14B-Instruct,trust_remote_code=True,load_in_8bit=True,device_map=auto), gen_kwargs: (None), limit: None, num_fewshot: None, batch_size: 1 +| Tasks |Version| Filter |n-shot| Metric | | Value | |Stderr| +|----------------------------------------------------------|------:|-----------------|-----:|-----------|---|------:|---|-----:| +|anli_r1 | 1|none | 0|acc |↑ | 0.7210|± |0.0142| +|anli_r2 | 1|none | 0|acc |↑ | 0.6340|± |0.0152| +|anli_r3 | 1|none | 0|acc |↑ | 0.6175|± |0.0140| +|arc_challenge | 1|none | 0|acc |↑ | 0.6067|± |0.0143| +| | |none | 0|acc_norm |↑ | 0.6152|± |0.0142| +|bbh | 3|get-answer | |exact_match|↑ | 0.1069|± |0.0032| +| - bbh_cot_fewshot_boolean_expressions | 4|get-answer | 3|exact_match|↑ | 0.0000|± |0.0000| +| - bbh_cot_fewshot_causal_judgement | 4|get-answer | 3|exact_match|↑ | 0.2995|± |0.0336| +| - bbh_cot_fewshot_date_understanding | 4|get-answer | 3|exact_match|↑ | 0.3280|± |0.0298| +| - bbh_cot_fewshot_disambiguation_qa | 4|get-answer | 3|exact_match|↑ | 0.0560|± |0.0146| +| - bbh_cot_fewshot_dyck_languages | 4|get-answer | 3|exact_match|↑ | 0.0680|± |0.0160| +| - bbh_cot_fewshot_formal_fallacies | 4|get-answer | 3|exact_match|↑ | 0.0000|± |0.0000| +| - bbh_cot_fewshot_geometric_shapes | 4|get-answer | 3|exact_match|↑ | 0.0120|± |0.0069| +| - bbh_cot_fewshot_hyperbaton | 4|get-answer | 3|exact_match|↑ | 0.1400|± |0.0220| +| - bbh_cot_fewshot_logical_deduction_five_objects | 4|get-answer | 3|exact_match|↑ | 0.0120|± |0.0069| +| - bbh_cot_fewshot_logical_deduction_seven_objects | 4|get-answer | 3|exact_match|↑ | 0.0000|± |0.0000| +| - bbh_cot_fewshot_logical_deduction_three_objects | 4|get-answer | 3|exact_match|↑ | 0.0280|± |0.0105| +| - 
bbh_cot_fewshot_movie_recommendation | 4|get-answer | 3|exact_match|↑ | 0.1480|± |0.0225| +| - bbh_cot_fewshot_multistep_arithmetic_two | 4|get-answer | 3|exact_match|↑ | 0.0000|± |0.0000| +| - bbh_cot_fewshot_navigate | 4|get-answer | 3|exact_match|↑ | 0.0000|± |0.0000| +| - bbh_cot_fewshot_object_counting | 4|get-answer | 3|exact_match|↑ | 0.5400|± |0.0316| +| - bbh_cot_fewshot_penguins_in_a_table | 4|get-answer | 3|exact_match|↑ | 0.1027|± |0.0252| +| - bbh_cot_fewshot_reasoning_about_colored_objects | 4|get-answer | 3|exact_match|↑ | 0.4600|± |0.0316| +| - bbh_cot_fewshot_ruin_names | 4|get-answer | 3|exact_match|↑ | 0.0120|± |0.0069| +| - bbh_cot_fewshot_salient_translation_error_detection | 4|get-answer | 3|exact_match|↑ | 0.6000|± |0.0310| +| - bbh_cot_fewshot_snarks | 4|get-answer | 3|exact_match|↑ | 0.0056|± |0.0056| +| - bbh_cot_fewshot_sports_understanding | 4|get-answer | 3|exact_match|↑ | 0.0320|± |0.0112| +| - bbh_cot_fewshot_temporal_sequences | 4|get-answer | 3|exact_match|↑ | 0.0000|± |0.0000| +| - bbh_cot_fewshot_tracking_shuffled_objects_five_objects | 4|get-answer | 3|exact_match|↑ | 0.0120|± |0.0069| +| - bbh_cot_fewshot_tracking_shuffled_objects_seven_objects| 4|get-answer | 3|exact_match|↑ | 0.0000|± |0.0000| +| - bbh_cot_fewshot_tracking_shuffled_objects_three_objects| 4|get-answer | 3|exact_match|↑ | 0.0040|± |0.0040| +| - bbh_cot_fewshot_web_of_lies | 4|get-answer | 3|exact_match|↑ | 0.0000|± |0.0000| +| - bbh_cot_fewshot_word_sorting | 4|get-answer | 3|exact_match|↑ | 0.0440|± |0.0130| +|boolq | 2|none | 0|acc |↑ | 0.8862|± |0.0056| +|drop | 3|none | 0|em |↑ | 0.0002|± |0.0001| +| | |none | 0|f1 |↑ | 0.0713|± |0.0012| +|gpqa_diamond_cot_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.1414|± |0.0248| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_diamond_cot_zeroshot | 1|flexible-extract | 0|exact_match|↑ | 0.1111|± |0.0224| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_diamond_generative_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.2424|± |0.0305| +| | |strict-match | 0|exact_match|↑ | 0.0101|± |0.0071| +|gpqa_diamond_n_shot | 2|none | 0|acc |↑ | 0.4091|± |0.0350| +| | |none | 0|acc_norm |↑ | 0.4091|± |0.0350| +|gpqa_diamond_zeroshot | 1|none | 0|acc |↑ | 0.3434|± |0.0338| +| | |none | 0|acc_norm |↑ | 0.3434|± |0.0338| +|gpqa_extended_cot_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.1593|± |0.0157| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_extended_cot_zeroshot | 1|flexible-extract | 0|exact_match|↑ | 0.1337|± |0.0146| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_extended_generative_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.2930|± |0.0195| +| | |strict-match | 0|exact_match|↑ | 0.0037|± |0.0026| +|gpqa_extended_n_shot | 2|none | 0|acc |↑ | 0.3755|± |0.0207| +| | |none | 0|acc_norm |↑ | 0.3755|± |0.0207| +|gpqa_extended_zeroshot | 1|none | 0|acc |↑ | 0.3810|± |0.0208| +| | |none | 0|acc_norm |↑ | 0.3810|± |0.0208| +|gpqa_main_cot_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.1295|± |0.0159| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_main_cot_zeroshot | 1|flexible-extract | 0|exact_match|↑ | 0.1161|± |0.0152| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_main_generative_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.2812|± |0.0213| +| | |strict-match | 0|exact_match|↑ | 0.0022|± |0.0022| +|gpqa_main_n_shot | 2|none | 0|acc |↑ | 0.4107|± |0.0233| +| | |none | 0|acc_norm |↑ | 0.4107|± |0.0233| +|gpqa_main_zeroshot | 1|none | 
0|acc |↑ | 0.3549|± |0.0226| +| | |none | 0|acc_norm |↑ | 0.3549|± |0.0226| +|gsm8k | 3|flexible-extract | 5|exact_match|↑ | 0.4390|± |0.0137| +| | |strict-match | 5|exact_match|↑ | 0.7923|± |0.0112| +|hellaswag | 1|none | 0|acc |↑ | 0.6527|± |0.0048| +| | |none | 0|acc_norm |↑ | 0.8420|± |0.0036| +|mmlu | 2|none | |acc |↑ | 0.7831|± |0.0033| +| - humanities | 2|none | |acc |↑ | 0.7214|± |0.0062| +| - formal_logic | 1|none | 0|acc |↑ | 0.6349|± |0.0431| +| - high_school_european_history | 1|none | 0|acc |↑ | 0.8606|± |0.0270| +| - high_school_us_history | 1|none | 0|acc |↑ | 0.9118|± |0.0199| +| - high_school_world_history | 1|none | 0|acc |↑ | 0.9072|± |0.0189| +| - international_law | 1|none | 0|acc |↑ | 0.9008|± |0.0273| +| - jurisprudence | 1|none | 0|acc |↑ | 0.8519|± |0.0343| +| - logical_fallacies | 1|none | 0|acc |↑ | 0.8834|± |0.0252| +| - moral_disputes | 1|none | 0|acc |↑ | 0.7977|± |0.0216| +| - moral_scenarios | 1|none | 0|acc |↑ | 0.6525|± |0.0159| +| - philosophy | 1|none | 0|acc |↑ | 0.8199|± |0.0218| +| - prehistory | 1|none | 0|acc |↑ | 0.8858|± |0.0177| +| - professional_law | 1|none | 0|acc |↑ | 0.5678|± |0.0127| +| - world_religions | 1|none | 0|acc |↑ | 0.8947|± |0.0235| +| - other | 2|none | |acc |↑ | 0.8104|± |0.0068| +| - business_ethics | 1|none | 0|acc |↑ | 0.7900|± |0.0409| +| - clinical_knowledge | 1|none | 0|acc |↑ | 0.8340|± |0.0229| +| - college_medicine | 1|none | 0|acc |↑ | 0.7399|± |0.0335| +| - global_facts | 1|none | 0|acc |↑ | 0.5700|± |0.0498| +| - human_aging | 1|none | 0|acc |↑ | 0.7803|± |0.0278| +| - management | 1|none | 0|acc |↑ | 0.8835|± |0.0318| +| - marketing | 1|none | 0|acc |↑ | 0.9274|± |0.0170| +| - medical_genetics | 1|none | 0|acc |↑ | 0.9000|± |0.0302| +| - miscellaneous | 1|none | 0|acc |↑ | 0.9055|± |0.0105| +| - nutrition | 1|none | 0|acc |↑ | 0.8235|± |0.0218| +| - professional_accounting | 1|none | 0|acc |↑ | 0.6348|± |0.0287| +| - professional_medicine | 1|none | 0|acc |↑ | 0.8456|± |0.0220| +| - virology | 1|none | 0|acc |↑ | 0.5482|± |0.0387| +| - social sciences | 2|none | |acc |↑ | 0.8635|± |0.0061| +| - econometrics | 1|none | 0|acc |↑ | 0.6842|± |0.0437| +| - high_school_geography | 1|none | 0|acc |↑ | 0.9141|± |0.0200| +| - high_school_government_and_politics | 1|none | 0|acc |↑ | 0.9534|± |0.0152| +| - high_school_macroeconomics | 1|none | 0|acc |↑ | 0.8590|± |0.0176| +| - high_school_microeconomics | 1|none | 0|acc |↑ | 0.9034|± |0.0192| +| - high_school_psychology | 1|none | 0|acc |↑ | 0.9046|± |0.0126| +| - human_sexuality | 1|none | 0|acc |↑ | 0.8626|± |0.0302| +| - professional_psychology | 1|none | 0|acc |↑ | 0.8137|± |0.0158| +| - public_relations | 1|none | 0|acc |↑ | 0.7727|± |0.0401| +| - security_studies | 1|none | 0|acc |↑ | 0.8327|± |0.0239| +| - sociology | 1|none | 0|acc |↑ | 0.8955|± |0.0216| +| - us_foreign_policy | 1|none | 0|acc |↑ | 0.9100|± |0.0288| +| - stem | 2|none | |acc |↑ | 0.7697|± |0.0073| +| - abstract_algebra | 1|none | 0|acc |↑ | 0.6300|± |0.0485| +| - anatomy | 1|none | 0|acc |↑ | 0.7630|± |0.0367| +| - astronomy | 1|none | 0|acc |↑ | 0.9079|± |0.0235| +| - college_biology | 1|none | 0|acc |↑ | 0.8958|± |0.0255| +| - college_chemistry | 1|none | 0|acc |↑ | 0.5300|± |0.0502| +| - college_computer_science | 1|none | 0|acc |↑ | 0.6900|± |0.0465| +| - college_mathematics | 1|none | 0|acc |↑ | 0.5900|± |0.0494| +| - college_physics | 1|none | 0|acc |↑ | 0.6373|± |0.0478| +| - computer_security | 1|none | 0|acc |↑ | 0.8000|± |0.0402| +| - conceptual_physics | 1|none | 0|acc |↑ | 0.8340|± 
|0.0243| +| - electrical_engineering | 1|none | 0|acc |↑ | 0.7448|± |0.0363| +| - elementary_mathematics | 1|none | 0|acc |↑ | 0.8624|± |0.0177| +| - high_school_biology | 1|none | 0|acc |↑ | 0.9065|± |0.0166| +| - high_school_chemistry | 1|none | 0|acc |↑ | 0.6897|± |0.0326| +| - high_school_computer_science | 1|none | 0|acc |↑ | 0.9000|± |0.0302| +| - high_school_mathematics | 1|none | 0|acc |↑ | 0.6444|± |0.0292| +| - high_school_physics | 1|none | 0|acc |↑ | 0.7351|± |0.0360| +| - high_school_statistics | 1|none | 0|acc |↑ | 0.7824|± |0.0281| +| - machine_learning | 1|none | 0|acc |↑ | 0.6518|± |0.0452| +|nq_open | 4|remove_whitespace| 0|exact_match|↑ | 0.0615|± |0.0040| +|openbookqa | 1|none | 0|acc |↑ | 0.3700|± |0.0216| +| | |none | 0|acc_norm |↑ | 0.4760|± |0.0224| +|piqa | 1|none | 0|acc |↑ | 0.8058|± |0.0092| +| | |none | 0|acc_norm |↑ | 0.8172|± |0.0090| +|qnli | 1|none | 0|acc |↑ | 0.8539|± |0.0048| +|sciq | 1|none | 0|acc |↑ | 0.9640|± |0.0059| +| | |none | 0|acc_norm |↑ | 0.9290|± |0.0081| +|triviaqa | 3|remove_whitespace| 0|exact_match|↑ | 0.0393|± |0.0015| +|truthfulqa_gen | 3|none | 0|bleu_acc |↑ | 0.5141|± |0.0175| +| | |none | 0|bleu_diff |↑ | 0.9069|± |0.2407| +| | |none | 0|bleu_max |↑ | 7.7993|± |0.4042| +| | |none | 0|rouge1_acc |↑ | 0.5386|± |0.0175| +| | |none | 0|rouge1_diff|↑ | 1.7284|± |0.4077| +| | |none | 0|rouge1_max |↑ |25.5442|± |0.6360| +| | |none | 0|rouge2_acc |↑ | 0.4541|± |0.0174| +| | |none | 0|rouge2_diff|↑ | 1.1688|± |0.4042| +| | |none | 0|rouge2_max |↑ |14.7681|± |0.5874| +| | |none | 0|rougeL_acc |↑ | 0.5239|± |0.0175| +| | |none | 0|rougeL_diff|↑ | 1.4904|± |0.3881| +| | |none | 0|rougeL_max |↑ |22.4469|± |0.6199| +|truthfulqa_mc1 | 2|none | 0|acc |↑ | 0.5104|± |0.0175| +|truthfulqa_mc2 | 3|none | 0|acc |↑ | 0.6830|± |0.0150| +|winogrande | 1|none | 0|acc |↑ | 0.7545|± |0.0121| + +| Groups |Version| Filter |n-shot| Metric | |Value | |Stderr| +|------------------|------:|----------|------|-----------|---|-----:|---|-----:| +|bbh | 3|get-answer| |exact_match|↑ |0.1069|± |0.0032| +|mmlu | 2|none | |acc |↑ |0.7831|± |0.0033| +| - humanities | 2|none | |acc |↑ |0.7214|± |0.0062| +| - other | 2|none | |acc |↑ |0.8104|± |0.0068| +| - social sciences| 2|none | |acc |↑ |0.8635|± |0.0061| +| - stem | 2|none | |acc |↑ |0.7697|± |0.0073| + +Qwen_Qwen2.5-14B-Instruct: 52h 44m 39s +✅ Benchmark completed for Qwen_Qwen2.5-14B-Instruct + +🔥 Starting benchmark for Qwen_Qwen3-14B +hf (pretrained=/mnt/data8tb/Documents/llm/llm_models/Qwen_Qwen3-14B,trust_remote_code=True,load_in_8bit=True,device_map=auto), gen_kwargs: (None), limit: None, num_fewshot: None, batch_size: 1 +| Tasks |Version| Filter |n-shot| Metric | | Value | |Stderr| +|----------------------------------------------------------|------:|-----------------|-----:|-----------|---|------:|---|-----:| +|anli_r1 | 1|none | 0|acc |↑ | 0.6460|± |0.0151| +|anli_r2 | 1|none | 0|acc |↑ | 0.5700|± |0.0157| +|anli_r3 | 1|none | 0|acc |↑ | 0.5567|± |0.0143| +|arc_challenge | 1|none | 0|acc |↑ | 0.5870|± |0.0144| +| | |none | 0|acc_norm |↑ | 0.6007|± |0.0143| +|bbh | 3|get-answer | |exact_match|↑ | 0.4330|± |0.0048| +| - bbh_cot_fewshot_boolean_expressions | 4|get-answer | 3|exact_match|↑ | 0.9760|± |0.0097| +| - bbh_cot_fewshot_causal_judgement | 4|get-answer | 3|exact_match|↑ | 0.0588|± |0.0173| +| - bbh_cot_fewshot_date_understanding | 4|get-answer | 3|exact_match|↑ | 0.4760|± |0.0316| +| - bbh_cot_fewshot_disambiguation_qa | 4|get-answer | 3|exact_match|↑ | 0.0000|± |0.0000| +| - bbh_cot_fewshot_dyck_languages | 
4|get-answer | 3|exact_match|↑ | 0.3720|± |0.0306| +| - bbh_cot_fewshot_formal_fallacies | 4|get-answer | 3|exact_match|↑ | 0.7880|± |0.0259| +| - bbh_cot_fewshot_geometric_shapes | 4|get-answer | 3|exact_match|↑ | 0.2000|± |0.0253| +| - bbh_cot_fewshot_hyperbaton | 4|get-answer | 3|exact_match|↑ | 0.4000|± |0.0310| +| - bbh_cot_fewshot_logical_deduction_five_objects | 4|get-answer | 3|exact_match|↑ | 0.5120|± |0.0317| +| - bbh_cot_fewshot_logical_deduction_seven_objects | 4|get-answer | 3|exact_match|↑ | 0.0920|± |0.0183| +| - bbh_cot_fewshot_logical_deduction_three_objects | 4|get-answer | 3|exact_match|↑ | 0.9240|± |0.0168| +| - bbh_cot_fewshot_movie_recommendation | 4|get-answer | 3|exact_match|↑ | 0.7840|± |0.0261| +| - bbh_cot_fewshot_multistep_arithmetic_two | 4|get-answer | 3|exact_match|↑ | 0.0000|± |0.0000| +| - bbh_cot_fewshot_navigate | 4|get-answer | 3|exact_match|↑ | 0.6760|± |0.0297| +| - bbh_cot_fewshot_object_counting | 4|get-answer | 3|exact_match|↑ | 0.7200|± |0.0285| +| - bbh_cot_fewshot_penguins_in_a_table | 4|get-answer | 3|exact_match|↑ | 0.5068|± |0.0415| +| - bbh_cot_fewshot_reasoning_about_colored_objects | 4|get-answer | 3|exact_match|↑ | 0.2000|± |0.0253| +| - bbh_cot_fewshot_ruin_names | 4|get-answer | 3|exact_match|↑ | 0.3560|± |0.0303| +| - bbh_cot_fewshot_salient_translation_error_detection | 4|get-answer | 3|exact_match|↑ | 0.0080|± |0.0056| +| - bbh_cot_fewshot_snarks | 4|get-answer | 3|exact_match|↑ | 0.2809|± |0.0338| +| - bbh_cot_fewshot_sports_understanding | 4|get-answer | 3|exact_match|↑ | 0.8640|± |0.0217| +| - bbh_cot_fewshot_temporal_sequences | 4|get-answer | 3|exact_match|↑ | 0.8640|± |0.0217| +| - bbh_cot_fewshot_tracking_shuffled_objects_five_objects | 4|get-answer | 3|exact_match|↑ | 0.6720|± |0.0298| +| - bbh_cot_fewshot_tracking_shuffled_objects_seven_objects| 4|get-answer | 3|exact_match|↑ | 0.1120|± |0.0200| +| - bbh_cot_fewshot_tracking_shuffled_objects_three_objects| 4|get-answer | 3|exact_match|↑ | 0.1080|± |0.0197| +| - bbh_cot_fewshot_web_of_lies | 4|get-answer | 3|exact_match|↑ | 0.3760|± |0.0307| +| - bbh_cot_fewshot_word_sorting | 4|get-answer | 3|exact_match|↑ | 0.2560|± |0.0277| +|boolq | 2|none | 0|acc |↑ | 0.8917|± |0.0054| +|drop | 3|none | 0|em |↑ | 0.0035|± |0.0006| +| | |none | 0|f1 |↑ | 0.0904|± |0.0018| +|gpqa_diamond_cot_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.0758|± |0.0189| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_diamond_cot_zeroshot | 1|flexible-extract | 0|exact_match|↑ | 0.0606|± |0.0170| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_diamond_generative_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.3939|± |0.0348| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_diamond_n_shot | 2|none | 0|acc |↑ | 0.3586|± |0.0342| +| | |none | 0|acc_norm |↑ | 0.3586|± |0.0342| +|gpqa_diamond_zeroshot | 1|none | 0|acc |↑ | 0.3838|± |0.0346| +| | |none | 0|acc_norm |↑ | 0.3838|± |0.0346| +|gpqa_extended_cot_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.1062|± |0.0132| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_extended_cot_zeroshot | 1|flexible-extract | 0|exact_match|↑ | 0.0586|± |0.0101| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_extended_generative_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.3590|± |0.0205| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_extended_n_shot | 2|none | 0|acc |↑ | 0.3974|± |0.0210| +| | |none | 0|acc_norm |↑ | 0.3974|± |0.0210| +|gpqa_extended_zeroshot | 1|none | 
0|acc |↑ | 0.3828|± |0.0208| +| | |none | 0|acc_norm |↑ | 0.3828|± |0.0208| +|gpqa_main_cot_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.1205|± |0.0154| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_main_cot_zeroshot | 1|flexible-extract | 0|exact_match|↑ | 0.0781|± |0.0127| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_main_generative_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.3415|± |0.0224| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_main_n_shot | 2|none | 0|acc |↑ | 0.3750|± |0.0229| +| | |none | 0|acc_norm |↑ | 0.3750|± |0.0229| +|gpqa_main_zeroshot | 1|none | 0|acc |↑ | 0.3973|± |0.0231| +| | |none | 0|acc_norm |↑ | 0.3973|± |0.0231| +|gsm8k | 3|flexible-extract | 5|exact_match|↑ | 0.8135|± |0.0107| +| | |strict-match | 5|exact_match|↑ | 0.8984|± |0.0083| +|hellaswag | 1|none | 0|acc |↑ | 0.6084|± |0.0049| +| | |none | 0|acc_norm |↑ | 0.7877|± |0.0041| +|mmlu | 2|none | |acc |↑ | 0.7695|± |0.0034| +| - humanities | 2|none | |acc |↑ | 0.6778|± |0.0065| +| - formal_logic | 1|none | 0|acc |↑ | 0.6667|± |0.0422| +| - high_school_european_history | 1|none | 0|acc |↑ | 0.8303|± |0.0293| +| - high_school_us_history | 1|none | 0|acc |↑ | 0.9069|± |0.0204| +| - high_school_world_history | 1|none | 0|acc |↑ | 0.8565|± |0.0228| +| - international_law | 1|none | 0|acc |↑ | 0.8430|± |0.0332| +| - jurisprudence | 1|none | 0|acc |↑ | 0.8611|± |0.0334| +| - logical_fallacies | 1|none | 0|acc |↑ | 0.8773|± |0.0258| +| - moral_disputes | 1|none | 0|acc |↑ | 0.7775|± |0.0224| +| - moral_scenarios | 1|none | 0|acc |↑ | 0.5665|± |0.0166| +| - philosophy | 1|none | 0|acc |↑ | 0.7556|± |0.0244| +| - prehistory | 1|none | 0|acc |↑ | 0.8333|± |0.0207| +| - professional_law | 1|none | 0|acc |↑ | 0.5293|± |0.0127| +| - world_religions | 1|none | 0|acc |↑ | 0.8713|± |0.0257| +| - other | 2|none | |acc |↑ | 0.8011|± |0.0069| +| - business_ethics | 1|none | 0|acc |↑ | 0.8100|± |0.0394| +| - clinical_knowledge | 1|none | 0|acc |↑ | 0.8151|± |0.0239| +| - college_medicine | 1|none | 0|acc |↑ | 0.7919|± |0.0310| +| - global_facts | 1|none | 0|acc |↑ | 0.5200|± |0.0502| +| - human_aging | 1|none | 0|acc |↑ | 0.7444|± |0.0293| +| - management | 1|none | 0|acc |↑ | 0.8350|± |0.0368| +| - marketing | 1|none | 0|acc |↑ | 0.9231|± |0.0175| +| - medical_genetics | 1|none | 0|acc |↑ | 0.8200|± |0.0386| +| - miscellaneous | 1|none | 0|acc |↑ | 0.8838|± |0.0115| +| - nutrition | 1|none | 0|acc |↑ | 0.8170|± |0.0221| +| - professional_accounting | 1|none | 0|acc |↑ | 0.6454|± |0.0285| +| - professional_medicine | 1|none | 0|acc |↑ | 0.8640|± |0.0208| +| - virology | 1|none | 0|acc |↑ | 0.5663|± |0.0386| +| - social sciences | 2|none | |acc |↑ | 0.8586|± |0.0062| +| - econometrics | 1|none | 0|acc |↑ | 0.6842|± |0.0437| +| - high_school_geography | 1|none | 0|acc |↑ | 0.8889|± |0.0224| +| - high_school_government_and_politics | 1|none | 0|acc |↑ | 0.9275|± |0.0187| +| - high_school_macroeconomics | 1|none | 0|acc |↑ | 0.8615|± |0.0175| +| - high_school_microeconomics | 1|none | 0|acc |↑ | 0.9412|± |0.0153| +| - high_school_psychology | 1|none | 0|acc |↑ | 0.9266|± |0.0112| +| - human_sexuality | 1|none | 0|acc |↑ | 0.8397|± |0.0322| +| - professional_psychology | 1|none | 0|acc |↑ | 0.8072|± |0.0160| +| - public_relations | 1|none | 0|acc |↑ | 0.7455|± |0.0417| +| - security_studies | 1|none | 0|acc |↑ | 0.7918|± |0.0260| +| - sociology | 1|none | 0|acc |↑ | 0.8706|± |0.0237| +| - us_foreign_policy | 1|none | 0|acc |↑ | 0.8900|± |0.0314| +| - stem | 2|none | 
|acc |↑ | 0.7881|± |0.0070| +| - abstract_algebra | 1|none | 0|acc |↑ | 0.6300|± |0.0485| +| - anatomy | 1|none | 0|acc |↑ | 0.8000|± |0.0346| +| - astronomy | 1|none | 0|acc |↑ | 0.8750|± |0.0269| +| - college_biology | 1|none | 0|acc |↑ | 0.9028|± |0.0248| +| - college_chemistry | 1|none | 0|acc |↑ | 0.5800|± |0.0496| +| - college_computer_science | 1|none | 0|acc |↑ | 0.6700|± |0.0473| +| - college_mathematics | 1|none | 0|acc |↑ | 0.6700|± |0.0473| +| - college_physics | 1|none | 0|acc |↑ | 0.6569|± |0.0472| +| - computer_security | 1|none | 0|acc |↑ | 0.8500|± |0.0359| +| - conceptual_physics | 1|none | 0|acc |↑ | 0.8723|± |0.0218| +| - electrical_engineering | 1|none | 0|acc |↑ | 0.8276|± |0.0315| +| - elementary_mathematics | 1|none | 0|acc |↑ | 0.8545|± |0.0182| +| - high_school_biology | 1|none | 0|acc |↑ | 0.9387|± |0.0136| +| - high_school_chemistry | 1|none | 0|acc |↑ | 0.7833|± |0.0290| +| - high_school_computer_science | 1|none | 0|acc |↑ | 0.8900|± |0.0314| +| - high_school_mathematics | 1|none | 0|acc |↑ | 0.5963|± |0.0299| +| - high_school_physics | 1|none | 0|acc |↑ | 0.7550|± |0.0351| +| - high_school_statistics | 1|none | 0|acc |↑ | 0.7917|± |0.0277| +| - machine_learning | 1|none | 0|acc |↑ | 0.6607|± |0.0449| +|nq_open | 4|remove_whitespace| 0|exact_match|↑ | 0.0922|± |0.0048| +|openbookqa | 1|none | 0|acc |↑ | 0.3420|± |0.0212| +| | |none | 0|acc_norm |↑ | 0.4600|± |0.0223| +|piqa | 1|none | 0|acc |↑ | 0.7916|± |0.0095| +| | |none | 0|acc_norm |↑ | 0.7949|± |0.0094| +|qnli | 1|none | 0|acc |↑ | 0.8442|± |0.0049| +|sciq | 1|none | 0|acc |↑ | 0.9770|± |0.0047| +| | |none | 0|acc_norm |↑ | 0.9660|± |0.0057| +|triviaqa | 3|remove_whitespace| 0|exact_match|↑ | 0.4075|± |0.0037| +|truthfulqa_gen | 3|none | 0|bleu_acc |↑ | 0.6744|± |0.0164| +| | |none | 0|bleu_diff |↑ |20.9239|± |1.1232| +| | |none | 0|bleu_max |↑ |38.9157|± |0.8760| +| | |none | 0|rouge1_acc |↑ | 0.6818|± |0.0163| +| | |none | 0|rouge1_diff|↑ |29.7410|± |1.5930| +| | |none | 0|rouge1_max |↑ |65.2284|± |0.9593| +| | |none | 0|rouge2_acc |↑ | 0.6267|± |0.0169| +| | |none | 0|rouge2_diff|↑ |30.9999|± |1.7090| +| | |none | 0|rouge2_max |↑ |54.1666|± |1.1923| +| | |none | 0|rougeL_acc |↑ | 0.6756|± |0.0164| +| | |none | 0|rougeL_diff|↑ |29.8027|± |1.6052| +| | |none | 0|rougeL_max |↑ |63.4625|± |0.9960| +|truthfulqa_mc1 | 2|none | 0|acc |↑ | 0.4064|± |0.0172| +|truthfulqa_mc2 | 3|none | 0|acc |↑ | 0.5894|± |0.0154| +|winogrande | 1|none | 0|acc |↑ | 0.7206|± |0.0126| + +| Groups |Version| Filter |n-shot| Metric | |Value | |Stderr| +|------------------|------:|----------|------|-----------|---|-----:|---|-----:| +|bbh | 3|get-answer| |exact_match|↑ |0.4330|± |0.0048| +|mmlu | 2|none | |acc |↑ |0.7695|± |0.0034| +| - humanities | 2|none | |acc |↑ |0.6778|± |0.0065| +| - other | 2|none | |acc |↑ |0.8011|± |0.0069| +| - social sciences| 2|none | |acc |↑ |0.8586|± |0.0062| +| - stem | 2|none | |acc |↑ |0.7881|± |0.0070| + +Qwen_Qwen3-14B: 29h 46m 1s +✅ Benchmark completed for Qwen_Qwen3-14B + +🔥 Starting benchmark for Qwen_Qwen2.5-1.5B-Instruct +🔥 Starting benchmark for Qwen_Qwen2.5-1.5B-Instruct +Qwen_Qwen2.5-1.5B-Instruct: 0h 5m 18s +✅ Benchmark completed for Qwen_Qwen2.5-1.5B-Instruct + +🔥 Starting benchmark for Qwen_Qwen2.5-0.5B-Instruct +Qwen_Qwen2.5-0.5B-Instruct: 0h 5m 0s +✅ Benchmark completed for Qwen_Qwen2.5-0.5B-Instruct + +🔥 Starting benchmark for deepseek-ai_DeepSeek-R1-Distill-Qwen-1.5B +deepseek-ai_DeepSeek-R1-Distill-Qwen-1.5B: 0h 5m 19s +✅ Benchmark completed for 
deepseek-ai_DeepSeek-R1-Distill-Qwen-1.5B + +🔥 Starting benchmark for Qwen_Qwen3-1.7B +Qwen_Qwen3-1.7B: 0h 5m 18s +✅ Benchmark completed for Qwen_Qwen3-1.7B + +🔥 Starting benchmark for Qwen_Qwen3-0.6B +Qwen_Qwen3-0.6B: 0h 5m 0s +✅ Benchmark completed for Qwen_Qwen3-0.6B + +🔥 Starting benchmark for Qwen_Qwen3-4B +Qwen_Qwen3-4B: 0h 5m 7s +✅ Benchmark completed for Qwen_Qwen3-4B + +🔥 Starting benchmark for Qwen_Qwen2.5-1.5B-Instruct +Qwen_Qwen2.5-1.5B-Instruct: 0h 0m 3s +✅ Benchmark completed for Qwen_Qwen2.5-1.5B-Instruct + +🔥 Starting benchmark for Qwen_Qwen2.5-0.5B-Instruct +Qwen_Qwen2.5-0.5B-Instruct: 0h 0m 3s +✅ Benchmark completed for Qwen_Qwen2.5-0.5B-Instruct + +🔥 Starting benchmark for deepseek-ai_DeepSeek-R1-Distill-Qwen-1.5B +🔥 Starting benchmark for Qwen_Qwen2.5-1.5B-Instruct +Passed argument batch_size = auto:1. Detecting largest batch size +Determined largest batch size: 1 +🔥 Starting benchmark for Qwen_Qwen2.5-1.5B-Instruct +hf (pretrained=/mnt/data8tb/Documents/llm/llm_models/Qwen_Qwen2.5-1.5B-Instruct,trust_remote_code=True,device_map=auto), gen_kwargs: (None), limit: None, num_fewshot: None, batch_size: 6 +| Tasks |Version| Filter |n-shot| Metric | | Value | |Stderr| +|----------------------------------------------------------|------:|-----------------|-----:|-----------|---|------:|---|-----:| +|anli_r1 | 1|none | 0|acc |↑ | 0.4480|± |0.0157| +|anli_r2 | 1|none | 0|acc |↑ | 0.3920|± |0.0154| +|anli_r3 | 1|none | 0|acc |↑ | 0.4317|± |0.0143| +|arc_challenge | 1|none | 0|acc |↑ | 0.4352|± |0.0145| +| | |none | 0|acc_norm |↑ | 0.4684|± |0.0146| +|bbh | 3|get-answer | |exact_match|↑ | 0.3692|± |0.0054| +| - bbh_cot_fewshot_boolean_expressions | 4|get-answer | 3|exact_match|↑ | 0.8600|± |0.0220| +| - bbh_cot_fewshot_causal_judgement | 4|get-answer | 3|exact_match|↑ | 0.4813|± |0.0366| +| - bbh_cot_fewshot_date_understanding | 4|get-answer | 3|exact_match|↑ | 0.2920|± |0.0288| +| - bbh_cot_fewshot_disambiguation_qa | 4|get-answer | 3|exact_match|↑ | 0.4000|± |0.0310| +| - bbh_cot_fewshot_dyck_languages | 4|get-answer | 3|exact_match|↑ | 0.0160|± |0.0080| +| - bbh_cot_fewshot_formal_fallacies | 4|get-answer | 3|exact_match|↑ | 0.1920|± |0.0250| +| - bbh_cot_fewshot_geometric_shapes | 4|get-answer | 3|exact_match|↑ | 0.3160|± |0.0295| +| - bbh_cot_fewshot_hyperbaton | 4|get-answer | 3|exact_match|↑ | 0.5720|± |0.0314| +| - bbh_cot_fewshot_logical_deduction_five_objects | 4|get-answer | 3|exact_match|↑ | 0.2560|± |0.0277| +| - bbh_cot_fewshot_logical_deduction_seven_objects | 4|get-answer | 3|exact_match|↑ | 0.1800|± |0.0243| +| - bbh_cot_fewshot_logical_deduction_three_objects | 4|get-answer | 3|exact_match|↑ | 0.5240|± |0.0316| +| - bbh_cot_fewshot_movie_recommendation | 4|get-answer | 3|exact_match|↑ | 0.4720|± |0.0316| +| - bbh_cot_fewshot_multistep_arithmetic_two | 4|get-answer | 3|exact_match|↑ | 0.6040|± |0.0310| +| - bbh_cot_fewshot_navigate | 4|get-answer | 3|exact_match|↑ | 0.6520|± |0.0302| +| - bbh_cot_fewshot_object_counting | 4|get-answer | 3|exact_match|↑ | 0.4760|± |0.0316| +| - bbh_cot_fewshot_penguins_in_a_table | 4|get-answer | 3|exact_match|↑ | 0.5068|± |0.0415| +| - bbh_cot_fewshot_reasoning_about_colored_objects | 4|get-answer | 3|exact_match|↑ | 0.4600|± |0.0316| +| - bbh_cot_fewshot_ruin_names | 4|get-answer | 3|exact_match|↑ | 0.3320|± |0.0298| +| - bbh_cot_fewshot_salient_translation_error_detection | 4|get-answer | 3|exact_match|↑ | 0.3440|± |0.0301| +| - bbh_cot_fewshot_snarks | 4|get-answer | 3|exact_match|↑ | 0.4438|± |0.0373| +| - 
bbh_cot_fewshot_sports_understanding | 4|get-answer | 3|exact_match|↑ | 0.6360|± |0.0305| +| - bbh_cot_fewshot_temporal_sequences | 4|get-answer | 3|exact_match|↑ | 0.1120|± |0.0200| +| - bbh_cot_fewshot_tracking_shuffled_objects_five_objects | 4|get-answer | 3|exact_match|↑ | 0.1520|± |0.0228| +| - bbh_cot_fewshot_tracking_shuffled_objects_seven_objects| 4|get-answer | 3|exact_match|↑ | 0.1040|± |0.0193| +| - bbh_cot_fewshot_tracking_shuffled_objects_three_objects| 4|get-answer | 3|exact_match|↑ | 0.3920|± |0.0309| +| - bbh_cot_fewshot_web_of_lies | 4|get-answer | 3|exact_match|↑ | 0.2560|± |0.0277| +| - bbh_cot_fewshot_word_sorting | 4|get-answer | 3|exact_match|↑ | 0.0440|± |0.0130| +|boolq | 2|none | 0|acc |↑ | 0.7813|± |0.0072| +|drop | 3|none | 0|em |↑ | 0.0017|± |0.0004| +| | |none | 0|f1 |↑ | 0.0391|± |0.0011| +|gpqa_diamond_cot_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.1566|± |0.0259| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_diamond_cot_zeroshot | 1|flexible-extract | 0|exact_match|↑ | 0.1768|± |0.0272| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_diamond_generative_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.2222|± |0.0296| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_diamond_n_shot | 2|none | 0|acc |↑ | 0.2929|± |0.0324| +| | |none | 0|acc_norm |↑ | 0.2929|± |0.0324| +|gpqa_diamond_zeroshot | 1|none | 0|acc |↑ | 0.2323|± |0.0301| +| | |none | 0|acc_norm |↑ | 0.2323|± |0.0301| +|gpqa_extended_cot_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.1630|± |0.0158| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_extended_cot_zeroshot | 1|flexible-extract | 0|exact_match|↑ | 0.1520|± |0.0154| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_extended_generative_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.2271|± |0.0179| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_extended_n_shot | 2|none | 0|acc |↑ | 0.3114|± |0.0198| +| | |none | 0|acc_norm |↑ | 0.3114|± |0.0198| +|gpqa_extended_zeroshot | 1|none | 0|acc |↑ | 0.2985|± |0.0196| +| | |none | 0|acc_norm |↑ | 0.2985|± |0.0196| +|gpqa_main_cot_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.1987|± |0.0189| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_main_cot_zeroshot | 1|flexible-extract | 0|exact_match|↑ | 0.1228|± |0.0155| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_main_generative_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.2165|± |0.0195| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_main_n_shot | 2|none | 0|acc |↑ | 0.3259|± |0.0222| +| | |none | 0|acc_norm |↑ | 0.3259|± |0.0222| +|gpqa_main_zeroshot | 1|none | 0|acc |↑ | 0.2835|± |0.0213| +| | |none | 0|acc_norm |↑ | 0.2835|± |0.0213| +|gsm8k | 3|flexible-extract | 5|exact_match|↑ | 0.5095|± |0.0138| +| | |strict-match | 5|exact_match|↑ | 0.3192|± |0.0128| +|hellaswag | 1|none | 0|acc |↑ | 0.5080|± |0.0050| +| | |none | 0|acc_norm |↑ | 0.6829|± |0.0046| +|mmlu | 2|none | |acc |↑ | 0.6006|± |0.0039| +| - humanities | 2|none | |acc |↑ | 0.5422|± |0.0068| +| - formal_logic | 1|none | 0|acc |↑ | 0.5238|± |0.0447| +| - high_school_european_history | 1|none | 0|acc |↑ | 0.7576|± |0.0335| +| - high_school_us_history | 1|none | 0|acc |↑ | 0.7255|± |0.0313| +| - high_school_world_history | 1|none | 0|acc |↑ | 0.7722|± |0.0273| +| - international_law | 1|none | 0|acc |↑ | 0.7438|± |0.0398| +| - jurisprudence | 1|none | 0|acc |↑ | 0.7870|± |0.0396| +| - logical_fallacies | 1|none | 0|acc |↑ | 
0.7669|± |0.0332| +| - moral_disputes | 1|none | 0|acc |↑ | 0.6532|± |0.0256| +| - moral_scenarios | 1|none | 0|acc |↑ | 0.3006|± |0.0153| +| - philosophy | 1|none | 0|acc |↑ | 0.6656|± |0.0268| +| - prehistory | 1|none | 0|acc |↑ | 0.6728|± |0.0261| +| - professional_law | 1|none | 0|acc |↑ | 0.4394|± |0.0127| +| - world_religions | 1|none | 0|acc |↑ | 0.7895|± |0.0313| +| - other | 2|none | |acc |↑ | 0.6460|± |0.0083| +| - business_ethics | 1|none | 0|acc |↑ | 0.6600|± |0.0476| +| - clinical_knowledge | 1|none | 0|acc |↑ | 0.6679|± |0.0290| +| - college_medicine | 1|none | 0|acc |↑ | 0.6705|± |0.0358| +| - global_facts | 1|none | 0|acc |↑ | 0.2100|± |0.0409| +| - human_aging | 1|none | 0|acc |↑ | 0.6278|± |0.0324| +| - management | 1|none | 0|acc |↑ | 0.8058|± |0.0392| +| - marketing | 1|none | 0|acc |↑ | 0.8376|± |0.0242| +| - medical_genetics | 1|none | 0|acc |↑ | 0.6700|± |0.0473| +| - miscellaneous | 1|none | 0|acc |↑ | 0.7241|± |0.0160| +| - nutrition | 1|none | 0|acc |↑ | 0.6732|± |0.0269| +| - professional_accounting | 1|none | 0|acc |↑ | 0.4645|± |0.0298| +| - professional_medicine | 1|none | 0|acc |↑ | 0.6029|± |0.0297| +| - virology | 1|none | 0|acc |↑ | 0.4398|± |0.0386| +| - social sciences | 2|none | |acc |↑ | 0.7065|± |0.0080| +| - econometrics | 1|none | 0|acc |↑ | 0.4474|± |0.0468| +| - high_school_geography | 1|none | 0|acc |↑ | 0.7576|± |0.0305| +| - high_school_government_and_politics | 1|none | 0|acc |↑ | 0.8083|± |0.0284| +| - high_school_macroeconomics | 1|none | 0|acc |↑ | 0.6718|± |0.0238| +| - high_school_microeconomics | 1|none | 0|acc |↑ | 0.7143|± |0.0293| +| - high_school_psychology | 1|none | 0|acc |↑ | 0.8220|± |0.0164| +| - human_sexuality | 1|none | 0|acc |↑ | 0.7328|± |0.0388| +| - professional_psychology | 1|none | 0|acc |↑ | 0.5997|± |0.0198| +| - public_relations | 1|none | 0|acc |↑ | 0.5909|± |0.0471| +| - security_studies | 1|none | 0|acc |↑ | 0.6980|± |0.0294| +| - sociology | 1|none | 0|acc |↑ | 0.8109|± |0.0277| +| - us_foreign_policy | 1|none | 0|acc |↑ | 0.7500|± |0.0435| +| - stem | 2|none | |acc |↑ | 0.5395|± |0.0086| +| - abstract_algebra | 1|none | 0|acc |↑ | 0.3600|± |0.0482| +| - anatomy | 1|none | 0|acc |↑ | 0.5333|± |0.0431| +| - astronomy | 1|none | 0|acc |↑ | 0.7105|± |0.0369| +| - college_biology | 1|none | 0|acc |↑ | 0.6389|± |0.0402| +| - college_chemistry | 1|none | 0|acc |↑ | 0.3400|± |0.0476| +| - college_computer_science | 1|none | 0|acc |↑ | 0.5000|± |0.0503| +| - college_mathematics | 1|none | 0|acc |↑ | 0.4100|± |0.0494| +| - college_physics | 1|none | 0|acc |↑ | 0.4608|± |0.0496| +| - computer_security | 1|none | 0|acc |↑ | 0.7300|± |0.0446| +| - conceptual_physics | 1|none | 0|acc |↑ | 0.5787|± |0.0323| +| - electrical_engineering | 1|none | 0|acc |↑ | 0.6069|± |0.0407| +| - elementary_mathematics | 1|none | 0|acc |↑ | 0.4947|± |0.0257| +| - high_school_biology | 1|none | 0|acc |↑ | 0.7613|± |0.0243| +| - high_school_chemistry | 1|none | 0|acc |↑ | 0.4975|± |0.0352| +| - high_school_computer_science | 1|none | 0|acc |↑ | 0.6800|± |0.0469| +| - high_school_mathematics | 1|none | 0|acc |↑ | 0.4222|± |0.0301| +| - high_school_physics | 1|none | 0|acc |↑ | 0.3709|± |0.0394| +| - high_school_statistics | 1|none | 0|acc |↑ | 0.5417|± |0.0340| +| - machine_learning | 1|none | 0|acc |↑ | 0.4018|± |0.0465| +|nq_open | 4|remove_whitespace| 0|exact_match|↑ | 0.0416|± |0.0033| +|openbookqa | 1|none | 0|acc |↑ | 0.3200|± |0.0209| +| | |none | 0|acc_norm |↑ | 0.4060|± |0.0220| +|piqa | 1|none | 0|acc |↑ | 0.7628|± |0.0099| +| | |none | 
0|acc_norm |↑ | 0.7584|± |0.0100| +|qnli | 1|none | 0|acc |↑ | 0.5667|± |0.0067| +|sciq | 1|none | 0|acc |↑ | 0.9490|± |0.0070| +| | |none | 0|acc_norm |↑ | 0.9390|± |0.0076| +|triviaqa | 3|remove_whitespace| 0|exact_match|↑ | 0.2826|± |0.0034| +|truthfulqa_gen | 3|none | 0|bleu_acc |↑ | 0.4027|± |0.0172| +| | |none | 0|bleu_diff |↑ |-1.9015|± |0.4040| +| | |none | 0|bleu_max |↑ |11.9071|± |0.5278| +| | |none | 0|rouge1_acc |↑ | 0.4406|± |0.0174| +| | |none | 0|rouge1_diff|↑ |-3.1237|± |0.6369| +| | |none | 0|rouge1_max |↑ |32.1186|± |0.7534| +| | |none | 0|rouge2_acc |↑ | 0.2815|± |0.0157| +| | |none | 0|rouge2_diff|↑ |-3.7671|± |0.6452| +| | |none | 0|rouge2_max |↑ |16.9153|± |0.7371| +| | |none | 0|rougeL_acc |↑ | 0.4321|± |0.0173| +| | |none | 0|rougeL_diff|↑ |-3.3951|± |0.6399| +| | |none | 0|rougeL_max |↑ |29.3596|± |0.7370| +|truthfulqa_mc1 | 2|none | 0|acc |↑ | 0.3121|± |0.0162| +|truthfulqa_mc2 | 3|none | 0|acc |↑ | 0.4657|± |0.0150| +|winogrande | 1|none | 0|acc |↑ | 0.6275|± |0.0136| + +| Groups |Version| Filter |n-shot| Metric | |Value | |Stderr| +|------------------|------:|----------|------|-----------|---|-----:|---|-----:| +|bbh | 3|get-answer| |exact_match|↑ |0.3692|± |0.0054| +|mmlu | 2|none | |acc |↑ |0.6006|± |0.0039| +| - humanities | 2|none | |acc |↑ |0.5422|± |0.0068| +| - other | 2|none | |acc |↑ |0.6460|± |0.0083| +| - social sciences| 2|none | |acc |↑ |0.7065|± |0.0080| +| - stem | 2|none | |acc |↑ |0.5395|± |0.0086| + +Qwen_Qwen2.5-1.5B-Instruct: 3h 20m 46s +✅ Benchmark completed for Qwen_Qwen2.5-1.5B-Instruct + +🔥 Starting benchmark for Qwen_Qwen2.5-0.5B-Instruct +hf (pretrained=/mnt/data8tb/Documents/llm/llm_models/Qwen_Qwen2.5-0.5B-Instruct,trust_remote_code=True,device_map=auto), gen_kwargs: (None), limit: None, num_fewshot: None, batch_size: 6 +| Tasks |Version| Filter |n-shot| Metric | | Value | |Stderr| +|----------------------------------------------------------|------:|-----------------|-----:|-----------|---|------:|---|-----:| +|anli_r1 | 1|none | 0|acc |↑ | 0.3240|± |0.0148| +|anli_r2 | 1|none | 0|acc |↑ | 0.3420|± |0.0150| +|anli_r3 | 1|none | 0|acc |↑ | 0.3475|± |0.0138| +|arc_challenge | 1|none | 0|acc |↑ | 0.3020|± |0.0134| +| | |none | 0|acc_norm |↑ | 0.3370|± |0.0138| +|bbh | 3|get-answer | |exact_match|↑ | 0.2138|± |0.0046| +| - bbh_cot_fewshot_boolean_expressions | 4|get-answer | 3|exact_match|↑ | 0.6440|± |0.0303| +| - bbh_cot_fewshot_causal_judgement | 4|get-answer | 3|exact_match|↑ | 0.0160|± |0.0092| +| - bbh_cot_fewshot_date_understanding | 4|get-answer | 3|exact_match|↑ | 0.2240|± |0.0264| +| - bbh_cot_fewshot_disambiguation_qa | 4|get-answer | 3|exact_match|↑ | 0.1280|± |0.0212| +| - bbh_cot_fewshot_dyck_languages | 4|get-answer | 3|exact_match|↑ | 0.0000|± |0.0000| +| - bbh_cot_fewshot_formal_fallacies | 4|get-answer | 3|exact_match|↑ | 0.1960|± |0.0252| +| - bbh_cot_fewshot_geometric_shapes | 4|get-answer | 3|exact_match|↑ | 0.0680|± |0.0160| +| - bbh_cot_fewshot_hyperbaton | 4|get-answer | 3|exact_match|↑ | 0.3920|± |0.0309| +| - bbh_cot_fewshot_logical_deduction_five_objects | 4|get-answer | 3|exact_match|↑ | 0.1600|± |0.0232| +| - bbh_cot_fewshot_logical_deduction_seven_objects | 4|get-answer | 3|exact_match|↑ | 0.0720|± |0.0164| +| - bbh_cot_fewshot_logical_deduction_three_objects | 4|get-answer | 3|exact_match|↑ | 0.3120|± |0.0294| +| - bbh_cot_fewshot_movie_recommendation | 4|get-answer | 3|exact_match|↑ | 0.1800|± |0.0243| +| - bbh_cot_fewshot_multistep_arithmetic_two | 4|get-answer | 3|exact_match|↑ | 0.2320|± |0.0268| +| - 
bbh_cot_fewshot_navigate | 4|get-answer | 3|exact_match|↑ | 0.5480|± |0.0315| +| - bbh_cot_fewshot_object_counting | 4|get-answer | 3|exact_match|↑ | 0.4760|± |0.0316| +| - bbh_cot_fewshot_penguins_in_a_table | 4|get-answer | 3|exact_match|↑ | 0.1233|± |0.0273| +| - bbh_cot_fewshot_reasoning_about_colored_objects | 4|get-answer | 3|exact_match|↑ | 0.1000|± |0.0190| +| - bbh_cot_fewshot_ruin_names | 4|get-answer | 3|exact_match|↑ | 0.1720|± |0.0239| +| - bbh_cot_fewshot_salient_translation_error_detection | 4|get-answer | 3|exact_match|↑ | 0.0240|± |0.0097| +| - bbh_cot_fewshot_snarks | 4|get-answer | 3|exact_match|↑ | 0.2528|± |0.0327| +| - bbh_cot_fewshot_sports_understanding | 4|get-answer | 3|exact_match|↑ | 0.4880|± |0.0317| +| - bbh_cot_fewshot_temporal_sequences | 4|get-answer | 3|exact_match|↑ | 0.2640|± |0.0279| +| - bbh_cot_fewshot_tracking_shuffled_objects_five_objects | 4|get-answer | 3|exact_match|↑ | 0.1360|± |0.0217| +| - bbh_cot_fewshot_tracking_shuffled_objects_seven_objects| 4|get-answer | 3|exact_match|↑ | 0.1160|± |0.0203| +| - bbh_cot_fewshot_tracking_shuffled_objects_three_objects| 4|get-answer | 3|exact_match|↑ | 0.3280|± |0.0298| +| - bbh_cot_fewshot_web_of_lies | 4|get-answer | 3|exact_match|↑ | 0.0200|± |0.0089| +| - bbh_cot_fewshot_word_sorting | 4|get-answer | 3|exact_match|↑ | 0.0240|± |0.0097| +|boolq | 2|none | 0|acc |↑ | 0.6768|± |0.0082| +|drop | 3|none | 0|em |↑ | 0.0003|± |0.0002| +| | |none | 0|f1 |↑ | 0.0286|± |0.0008| +|gpqa_diamond_cot_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.1263|± |0.0237| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_diamond_cot_zeroshot | 1|flexible-extract | 0|exact_match|↑ | 0.1768|± |0.0272| +| | |strict-match | 0|exact_match|↑ | 0.0051|± |0.0051| +|gpqa_diamond_generative_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.1970|± |0.0283| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_diamond_n_shot | 2|none | 0|acc |↑ | 0.2323|± |0.0301| +| | |none | 0|acc_norm |↑ | 0.2323|± |0.0301| +|gpqa_diamond_zeroshot | 1|none | 0|acc |↑ | 0.2626|± |0.0314| +| | |none | 0|acc_norm |↑ | 0.2626|± |0.0314| +|gpqa_extended_cot_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.1447|± |0.0151| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_extended_cot_zeroshot | 1|flexible-extract | 0|exact_match|↑ | 0.1868|± |0.0167| +| | |strict-match | 0|exact_match|↑ | 0.0018|± |0.0018| +|gpqa_extended_generative_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.2015|± |0.0172| +| | |strict-match | 0|exact_match|↑ | 0.0037|± |0.0026| +|gpqa_extended_n_shot | 2|none | 0|acc |↑ | 0.2601|± |0.0188| +| | |none | 0|acc_norm |↑ | 0.2601|± |0.0188| +|gpqa_extended_zeroshot | 1|none | 0|acc |↑ | 0.2802|± |0.0192| +| | |none | 0|acc_norm |↑ | 0.2802|± |0.0192| +|gpqa_main_cot_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.1585|± |0.0173| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_main_cot_zeroshot | 1|flexible-extract | 0|exact_match|↑ | 0.1585|± |0.0173| +| | |strict-match | 0|exact_match|↑ | 0.0022|± |0.0022| +|gpqa_main_generative_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.1808|± |0.0182| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_main_n_shot | 2|none | 0|acc |↑ | 0.2701|± |0.0210| +| | |none | 0|acc_norm |↑ | 0.2701|± |0.0210| +|gpqa_main_zeroshot | 1|none | 0|acc |↑ | 0.2679|± |0.0209| +| | |none | 0|acc_norm |↑ | 0.2679|± |0.0209| +|gsm8k | 3|flexible-extract | 5|exact_match|↑ | 0.3169|± |0.0128| +| | |strict-match | 5|exact_match|↑ | 0.2077|± 
|0.0112| +|hellaswag | 1|none | 0|acc |↑ | 0.4049|± |0.0049| +| | |none | 0|acc_norm |↑ | 0.5241|± |0.0050| +|mmlu | 2|none | |acc |↑ | 0.4576|± |0.0041| +| - humanities | 2|none | |acc |↑ | 0.4219|± |0.0069| +| - formal_logic | 1|none | 0|acc |↑ | 0.3175|± |0.0416| +| - high_school_european_history | 1|none | 0|acc |↑ | 0.6000|± |0.0383| +| - high_school_us_history | 1|none | 0|acc |↑ | 0.5294|± |0.0350| +| - high_school_world_history | 1|none | 0|acc |↑ | 0.6076|± |0.0318| +| - international_law | 1|none | 0|acc |↑ | 0.7438|± |0.0398| +| - jurisprudence | 1|none | 0|acc |↑ | 0.6019|± |0.0473| +| - logical_fallacies | 1|none | 0|acc |↑ | 0.4724|± |0.0392| +| - moral_disputes | 1|none | 0|acc |↑ | 0.5318|± |0.0269| +| - moral_scenarios | 1|none | 0|acc |↑ | 0.2380|± |0.0142| +| - philosophy | 1|none | 0|acc |↑ | 0.4759|± |0.0284| +| - prehistory | 1|none | 0|acc |↑ | 0.5432|± |0.0277| +| - professional_law | 1|none | 0|acc |↑ | 0.3520|± |0.0122| +| - world_religions | 1|none | 0|acc |↑ | 0.5906|± |0.0377| +| - other | 2|none | |acc |↑ | 0.5082|± |0.0088| +| - business_ethics | 1|none | 0|acc |↑ | 0.5300|± |0.0502| +| - clinical_knowledge | 1|none | 0|acc |↑ | 0.5094|± |0.0308| +| - college_medicine | 1|none | 0|acc |↑ | 0.4509|± |0.0379| +| - global_facts | 1|none | 0|acc |↑ | 0.3100|± |0.0465| +| - human_aging | 1|none | 0|acc |↑ | 0.5426|± |0.0334| +| - management | 1|none | 0|acc |↑ | 0.5728|± |0.0490| +| - marketing | 1|none | 0|acc |↑ | 0.7393|± |0.0288| +| - medical_genetics | 1|none | 0|acc |↑ | 0.5200|± |0.0502| +| - miscellaneous | 1|none | 0|acc |↑ | 0.5556|± |0.0178| +| - nutrition | 1|none | 0|acc |↑ | 0.5882|± |0.0282| +| - professional_accounting | 1|none | 0|acc |↑ | 0.3191|± |0.0278| +| - professional_medicine | 1|none | 0|acc |↑ | 0.3676|± |0.0293| +| - virology | 1|none | 0|acc |↑ | 0.4337|± |0.0386| +| - social sciences | 2|none | |acc |↑ | 0.5301|± |0.0089| +| - econometrics | 1|none | 0|acc |↑ | 0.2895|± |0.0427| +| - high_school_geography | 1|none | 0|acc |↑ | 0.5657|± |0.0353| +| - high_school_government_and_politics | 1|none | 0|acc |↑ | 0.5544|± |0.0359| +| - high_school_macroeconomics | 1|none | 0|acc |↑ | 0.4410|± |0.0252| +| - high_school_microeconomics | 1|none | 0|acc |↑ | 0.4748|± |0.0324| +| - high_school_psychology | 1|none | 0|acc |↑ | 0.6183|± |0.0208| +| - human_sexuality | 1|none | 0|acc |↑ | 0.5573|± |0.0436| +| - professional_psychology | 1|none | 0|acc |↑ | 0.4608|± |0.0202| +| - public_relations | 1|none | 0|acc |↑ | 0.5273|± |0.0478| +| - security_studies | 1|none | 0|acc |↑ | 0.5633|± |0.0318| +| - sociology | 1|none | 0|acc |↑ | 0.6667|± |0.0333| +| - us_foreign_policy | 1|none | 0|acc |↑ | 0.7200|± |0.0451| +| - stem | 2|none | |acc |↑ | 0.3901|± |0.0085| +| - abstract_algebra | 1|none | 0|acc |↑ | 0.3300|± |0.0473| +| - anatomy | 1|none | 0|acc |↑ | 0.4000|± |0.0423| +| - astronomy | 1|none | 0|acc |↑ | 0.4737|± |0.0406| +| - college_biology | 1|none | 0|acc |↑ | 0.4444|± |0.0416| +| - college_chemistry | 1|none | 0|acc |↑ | 0.2900|± |0.0456| +| - college_computer_science | 1|none | 0|acc |↑ | 0.3400|± |0.0476| +| - college_mathematics | 1|none | 0|acc |↑ | 0.2700|± |0.0446| +| - college_physics | 1|none | 0|acc |↑ | 0.2745|± |0.0444| +| - computer_security | 1|none | 0|acc |↑ | 0.6900|± |0.0465| +| - conceptual_physics | 1|none | 0|acc |↑ | 0.3830|± |0.0318| +| - electrical_engineering | 1|none | 0|acc |↑ | 0.5103|± |0.0417| +| - elementary_mathematics | 1|none | 0|acc |↑ | 0.3307|± |0.0242| +| - high_school_biology | 1|none | 0|acc |↑ | 
0.5355|± |0.0284| +| - high_school_chemistry | 1|none | 0|acc |↑ | 0.4138|± |0.0347| +| - high_school_computer_science | 1|none | 0|acc |↑ | 0.4400|± |0.0499| +| - high_school_mathematics | 1|none | 0|acc |↑ | 0.3000|± |0.0279| +| - high_school_physics | 1|none | 0|acc |↑ | 0.2517|± |0.0354| +| - high_school_statistics | 1|none | 0|acc |↑ | 0.3333|± |0.0321| +| - machine_learning | 1|none | 0|acc |↑ | 0.4107|± |0.0467| +|nq_open | 4|remove_whitespace| 0|exact_match|↑ | 0.0205|± |0.0024| +|openbookqa | 1|none | 0|acc |↑ | 0.2440|± |0.0192| +| | |none | 0|acc_norm |↑ | 0.3460|± |0.0213| +|piqa | 1|none | 0|acc |↑ | 0.7062|± |0.0106| +| | |none | 0|acc_norm |↑ | 0.7040|± |0.0107| +|qnli | 1|none | 0|acc |↑ | 0.5369|± |0.0067| +|sciq | 1|none | 0|acc |↑ | 0.9190|± |0.0086| +| | |none | 0|acc_norm |↑ | 0.8830|± |0.0102| +|triviaqa | 3|remove_whitespace| 0|exact_match|↑ | 0.1342|± |0.0025| +|truthfulqa_gen | 3|none | 0|bleu_acc |↑ | 0.3219|± |0.0164| +| | |none | 0|bleu_diff |↑ |-0.2967|± |0.0886| +| | |none | 0|bleu_max |↑ | 2.9561|± |0.1554| +| | |none | 0|rouge1_acc |↑ | 0.3770|± |0.0170| +| | |none | 0|rouge1_diff|↑ |-0.1350|± |0.2257| +| | |none | 0|rouge1_max |↑ |12.8150|± |0.3731| +| | |none | 0|rouge2_acc |↑ | 0.2154|± |0.0144| +| | |none | 0|rouge2_diff|↑ |-0.5713|± |0.1837| +| | |none | 0|rouge2_max |↑ | 5.3013|± |0.2882| +| | |none | 0|rougeL_acc |↑ | 0.3537|± |0.0167| +| | |none | 0|rougeL_diff|↑ |-0.4059|± |0.2150| +| | |none | 0|rougeL_max |↑ |11.6973|± |0.3485| +|truthfulqa_mc1 | 2|none | 0|acc |↑ | 0.2717|± |0.0156| +|truthfulqa_mc2 | 3|none | 0|acc |↑ | 0.4184|± |0.0146| +|winogrande | 1|none | 0|acc |↑ | 0.5564|± |0.0140| + +| Groups |Version| Filter |n-shot| Metric | |Value | |Stderr| +|------------------|------:|----------|------|-----------|---|-----:|---|-----:| +|bbh | 3|get-answer| |exact_match|↑ |0.2138|± |0.0046| +|mmlu | 2|none | |acc |↑ |0.4576|± |0.0041| +| - humanities | 2|none | |acc |↑ |0.4219|± |0.0069| +| - other | 2|none | |acc |↑ |0.5082|± |0.0088| +| - social sciences| 2|none | |acc |↑ |0.5301|± |0.0089| +| - stem | 2|none | |acc |↑ |0.3901|± |0.0085| + +Qwen_Qwen2.5-0.5B-Instruct: 2h 34m 22s +✅ Benchmark completed for Qwen_Qwen2.5-0.5B-Instruct + +🔥 Starting benchmark for deepseek-ai_DeepSeek-R1-Distill-Qwen-1.5B +hf (pretrained=/mnt/data8tb/Documents/llm/llm_models/deepseek-ai_DeepSeek-R1-Distill-Qwen-1.5B,trust_remote_code=True,device_map=auto), gen_kwargs: (None), limit: None, num_fewshot: None, batch_size: 6 +| Tasks |Version| Filter |n-shot| Metric | | Value | |Stderr| +|----------------------------------------------------------|------:|-----------------|-----:|-----------|---|------:|---|-----:| +|anli_r1 | 1|none | 0|acc |↑ | 0.3560|± |0.0151| +|anli_r2 | 1|none | 0|acc |↑ | 0.3620|± |0.0152| +|anli_r3 | 1|none | 0|acc |↑ | 0.3625|± |0.0139| +|arc_challenge | 1|none | 0|acc |↑ | 0.3422|± |0.0139| +| | |none | 0|acc_norm |↑ | 0.3464|± |0.0139| +|bbh | 3|get-answer | |exact_match|↑ | 0.4059|± |0.0051| +| - bbh_cot_fewshot_boolean_expressions | 4|get-answer | 3|exact_match|↑ | 0.8600|± |0.0220| +| - bbh_cot_fewshot_causal_judgement | 4|get-answer | 3|exact_match|↑ | 0.3850|± |0.0357| +| - bbh_cot_fewshot_date_understanding | 4|get-answer | 3|exact_match|↑ | 0.4520|± |0.0315| +| - bbh_cot_fewshot_disambiguation_qa | 4|get-answer | 3|exact_match|↑ | 0.2960|± |0.0289| +| - bbh_cot_fewshot_dyck_languages | 4|get-answer | 3|exact_match|↑ | 0.0160|± |0.0080| +| - bbh_cot_fewshot_formal_fallacies | 4|get-answer | 3|exact_match|↑ | 0.1600|± |0.0232| +| - 
bbh_cot_fewshot_geometric_shapes | 4|get-answer | 3|exact_match|↑ | 0.2560|± |0.0277| +| - bbh_cot_fewshot_hyperbaton | 4|get-answer | 3|exact_match|↑ | 0.3600|± |0.0304| +| - bbh_cot_fewshot_logical_deduction_five_objects | 4|get-answer | 3|exact_match|↑ | 0.1520|± |0.0228| +| - bbh_cot_fewshot_logical_deduction_seven_objects | 4|get-answer | 3|exact_match|↑ | 0.0080|± |0.0056| +| - bbh_cot_fewshot_logical_deduction_three_objects | 4|get-answer | 3|exact_match|↑ | 0.6280|± |0.0306| +| - bbh_cot_fewshot_movie_recommendation | 4|get-answer | 3|exact_match|↑ | 0.5080|± |0.0317| +| - bbh_cot_fewshot_multistep_arithmetic_two | 4|get-answer | 3|exact_match|↑ | 0.9280|± |0.0164| +| - bbh_cot_fewshot_navigate | 4|get-answer | 3|exact_match|↑ | 0.7840|± |0.0261| +| - bbh_cot_fewshot_object_counting | 4|get-answer | 3|exact_match|↑ | 0.6320|± |0.0306| +| - bbh_cot_fewshot_penguins_in_a_table | 4|get-answer | 3|exact_match|↑ | 0.4932|± |0.0415| +| - bbh_cot_fewshot_reasoning_about_colored_objects | 4|get-answer | 3|exact_match|↑ | 0.5560|± |0.0315| +| - bbh_cot_fewshot_ruin_names | 4|get-answer | 3|exact_match|↑ | 0.2440|± |0.0272| +| - bbh_cot_fewshot_salient_translation_error_detection | 4|get-answer | 3|exact_match|↑ | 0.1280|± |0.0212| +| - bbh_cot_fewshot_snarks | 4|get-answer | 3|exact_match|↑ | 0.1798|± |0.0289| +| - bbh_cot_fewshot_sports_understanding | 4|get-answer | 3|exact_match|↑ | 0.4760|± |0.0316| +| - bbh_cot_fewshot_temporal_sequences | 4|get-answer | 3|exact_match|↑ | 0.1760|± |0.0241| +| - bbh_cot_fewshot_tracking_shuffled_objects_five_objects | 4|get-answer | 3|exact_match|↑ | 0.4240|± |0.0313| +| - bbh_cot_fewshot_tracking_shuffled_objects_seven_objects| 4|get-answer | 3|exact_match|↑ | 0.1800|± |0.0243| +| - bbh_cot_fewshot_tracking_shuffled_objects_three_objects| 4|get-answer | 3|exact_match|↑ | 0.6280|± |0.0306| +| - bbh_cot_fewshot_web_of_lies | 4|get-answer | 3|exact_match|↑ | 0.9200|± |0.0172| +| - bbh_cot_fewshot_word_sorting | 4|get-answer | 3|exact_match|↑ | 0.0960|± |0.0187| +|boolq | 2|none | 0|acc |↑ | 0.6801|± |0.0082| +|drop | 3|none | 0|em |↑ | 0.0008|± |0.0003| +| | |none | 0|f1 |↑ | 0.0507|± |0.0013| +|gpqa_diamond_cot_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.0606|± |0.0170| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_diamond_cot_zeroshot | 1|flexible-extract | 0|exact_match|↑ | 0.0606|± |0.0170| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_diamond_generative_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.1364|± |0.0245| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_diamond_n_shot | 2|none | 0|acc |↑ | 0.2727|± |0.0317| +| | |none | 0|acc_norm |↑ | 0.2727|± |0.0317| +|gpqa_diamond_zeroshot | 1|none | 0|acc |↑ | 0.2727|± |0.0317| +| | |none | 0|acc_norm |↑ | 0.2727|± |0.0317| +|gpqa_extended_cot_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.0788|± |0.0115| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_extended_cot_zeroshot | 1|flexible-extract | 0|exact_match|↑ | 0.0824|± |0.0118| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_extended_generative_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.1740|± |0.0162| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_extended_n_shot | 2|none | 0|acc |↑ | 0.2418|± |0.0183| +| | |none | 0|acc_norm |↑ | 0.2418|± |0.0183| +|gpqa_extended_zeroshot | 1|none | 0|acc |↑ | 0.3205|± |0.0200| +| | |none | 0|acc_norm |↑ | 0.3205|± |0.0200| +|gpqa_main_cot_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.0714|± 
|0.0122| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_main_cot_zeroshot | 1|flexible-extract | 0|exact_match|↑ | 0.0759|± |0.0125| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_main_generative_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.1451|± |0.0167| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_main_n_shot | 2|none | 0|acc |↑ | 0.2612|± |0.0208| +| | |none | 0|acc_norm |↑ | 0.2612|± |0.0208| +|gpqa_main_zeroshot | 1|none | 0|acc |↑ | 0.2723|± |0.0211| +| | |none | 0|acc_norm |↑ | 0.2723|± |0.0211| +|gsm8k | 3|flexible-extract | 5|exact_match|↑ | 0.7074|± |0.0125| +| | |strict-match | 5|exact_match|↑ | 0.7013|± |0.0126| +|hellaswag | 1|none | 0|acc |↑ | 0.3633|± |0.0048| +| | |none | 0|acc_norm |↑ | 0.4467|± |0.0050| +|mmlu | 2|none | |acc |↑ | 0.3606|± |0.0040| +| - humanities | 2|none | |acc |↑ | 0.3135|± |0.0067| +| - formal_logic | 1|none | 0|acc |↑ | 0.3968|± |0.0438| +| - high_school_european_history | 1|none | 0|acc |↑ | 0.3515|± |0.0373| +| - high_school_us_history | 1|none | 0|acc |↑ | 0.3235|± |0.0328| +| - high_school_world_history | 1|none | 0|acc |↑ | 0.4135|± |0.0321| +| - international_law | 1|none | 0|acc |↑ | 0.4463|± |0.0454| +| - jurisprudence | 1|none | 0|acc |↑ | 0.4259|± |0.0478| +| - logical_fallacies | 1|none | 0|acc |↑ | 0.4110|± |0.0387| +| - moral_disputes | 1|none | 0|acc |↑ | 0.3555|± |0.0258| +| - moral_scenarios | 1|none | 0|acc |↑ | 0.2447|± |0.0144| +| - philosophy | 1|none | 0|acc |↑ | 0.4051|± |0.0279| +| - prehistory | 1|none | 0|acc |↑ | 0.3765|± |0.0270| +| - professional_law | 1|none | 0|acc |↑ | 0.2595|± |0.0112| +| - world_religions | 1|none | 0|acc |↑ | 0.2807|± |0.0345| +| - other | 2|none | |acc |↑ | 0.3859|± |0.0087| +| - business_ethics | 1|none | 0|acc |↑ | 0.4300|± |0.0498| +| - clinical_knowledge | 1|none | 0|acc |↑ | 0.3811|± |0.0299| +| - college_medicine | 1|none | 0|acc |↑ | 0.3468|± |0.0363| +| - global_facts | 1|none | 0|acc |↑ | 0.3300|± |0.0473| +| - human_aging | 1|none | 0|acc |↑ | 0.3767|± |0.0325| +| - management | 1|none | 0|acc |↑ | 0.5146|± |0.0495| +| - marketing | 1|none | 0|acc |↑ | 0.5726|± |0.0324| +| - medical_genetics | 1|none | 0|acc |↑ | 0.3900|± |0.0490| +| - miscellaneous | 1|none | 0|acc |↑ | 0.3921|± |0.0175| +| - nutrition | 1|none | 0|acc |↑ | 0.4118|± |0.0282| +| - professional_accounting | 1|none | 0|acc |↑ | 0.2943|± |0.0272| +| - professional_medicine | 1|none | 0|acc |↑ | 0.2721|± |0.0270| +| - virology | 1|none | 0|acc |↑ | 0.3735|± |0.0377| +| - social sciences | 2|none | |acc |↑ | 0.4027|± |0.0088| +| - econometrics | 1|none | 0|acc |↑ | 0.3070|± |0.0434| +| - high_school_geography | 1|none | 0|acc |↑ | 0.3535|± |0.0341| +| - high_school_government_and_politics | 1|none | 0|acc |↑ | 0.3938|± |0.0353| +| - high_school_macroeconomics | 1|none | 0|acc |↑ | 0.3821|± |0.0246| +| - high_school_microeconomics | 1|none | 0|acc |↑ | 0.4580|± |0.0324| +| - high_school_psychology | 1|none | 0|acc |↑ | 0.4624|± |0.0214| +| - human_sexuality | 1|none | 0|acc |↑ | 0.4351|± |0.0435| +| - professional_psychology | 1|none | 0|acc |↑ | 0.3317|± |0.0190| +| - public_relations | 1|none | 0|acc |↑ | 0.3818|± |0.0465| +| - security_studies | 1|none | 0|acc |↑ | 0.4367|± |0.0318| +| - sociology | 1|none | 0|acc |↑ | 0.4527|± |0.0352| +| - us_foreign_policy | 1|none | 0|acc |↑ | 0.4800|± |0.0502| +| - stem | 2|none | |acc |↑ | 0.3650|± |0.0085| +| - abstract_algebra | 1|none | 0|acc |↑ | 0.3000|± |0.0461| +| - anatomy | 1|none | 0|acc |↑ | 0.3037|± |0.0397| +| - 
astronomy | 1|none | 0|acc |↑ | 0.4013|± |0.0399| +| - college_biology | 1|none | 0|acc |↑ | 0.3056|± |0.0385| +| - college_chemistry | 1|none | 0|acc |↑ | 0.3100|± |0.0465| +| - college_computer_science | 1|none | 0|acc |↑ | 0.3900|± |0.0490| +| - college_mathematics | 1|none | 0|acc |↑ | 0.3800|± |0.0488| +| - college_physics | 1|none | 0|acc |↑ | 0.2941|± |0.0453| +| - computer_security | 1|none | 0|acc |↑ | 0.3600|± |0.0482| +| - conceptual_physics | 1|none | 0|acc |↑ | 0.4043|± |0.0321| +| - electrical_engineering | 1|none | 0|acc |↑ | 0.4345|± |0.0413| +| - elementary_mathematics | 1|none | 0|acc |↑ | 0.4471|± |0.0256| +| - high_school_biology | 1|none | 0|acc |↑ | 0.4258|± |0.0281| +| - high_school_chemistry | 1|none | 0|acc |↑ | 0.3202|± |0.0328| +| - high_school_computer_science | 1|none | 0|acc |↑ | 0.3900|± |0.0490| +| - high_school_mathematics | 1|none | 0|acc |↑ | 0.3519|± |0.0291| +| - high_school_physics | 1|none | 0|acc |↑ | 0.1987|± |0.0326| +| - high_school_statistics | 1|none | 0|acc |↑ | 0.3472|± |0.0325| +| - machine_learning | 1|none | 0|acc |↑ | 0.3393|± |0.0449| +|nq_open | 4|remove_whitespace| 0|exact_match|↑ | 0.0064|± |0.0013| +|openbookqa | 1|none | 0|acc |↑ | 0.1980|± |0.0178| +| | |none | 0|acc_norm |↑ | 0.3080|± |0.0207| +|piqa | 1|none | 0|acc |↑ | 0.6513|± |0.0111| +| | |none | 0|acc_norm |↑ | 0.6578|± |0.0111| +|qnli | 1|none | 0|acc |↑ | 0.5054|± |0.0068| +|sciq | 1|none | 0|acc |↑ | 0.8990|± |0.0095| +| | |none | 0|acc_norm |↑ | 0.8450|± |0.0115| +|triviaqa | 3|remove_whitespace| 0|exact_match|↑ | 0.0090|± |0.0007| +|truthfulqa_gen | 3|none | 0|bleu_acc |↑ | 0.3268|± |0.0164| +| | |none | 0|bleu_diff |↑ |-2.4557|± |0.4017| +| | |none | 0|bleu_max |↑ |12.6086|± |0.4908| +| | |none | 0|rouge1_acc |↑ | 0.3476|± |0.0167| +| | |none | 0|rouge1_diff|↑ |-3.9220|± |0.5617| +| | |none | 0|rouge1_max |↑ |34.8664|± |0.7473| +| | |none | 0|rouge2_acc |↑ | 0.2289|± |0.0147| +| | |none | 0|rouge2_diff|↑ |-5.0782|± |0.6278| +| | |none | 0|rouge2_max |↑ |20.7313|± |0.7617| +| | |none | 0|rougeL_acc |↑ | 0.3329|± |0.0165| +| | |none | 0|rougeL_diff|↑ |-4.0907|± |0.5646| +| | |none | 0|rougeL_max |↑ |32.3187|± |0.7355| +|truthfulqa_mc1 | 2|none | 0|acc |↑ | 0.2938|± |0.0159| +|truthfulqa_mc2 | 3|none | 0|acc |↑ | 0.4517|± |0.0155| +|winogrande | 1|none | 0|acc |↑ | 0.5493|± |0.0140| + +| Groups |Version| Filter |n-shot| Metric | |Value | |Stderr| +|------------------|------:|----------|------|-----------|---|-----:|---|-----:| +|bbh | 3|get-answer| |exact_match|↑ |0.4059|± |0.0051| +|mmlu | 2|none | |acc |↑ |0.3606|± |0.0040| +| - humanities | 2|none | |acc |↑ |0.3135|± |0.0067| +| - other | 2|none | |acc |↑ |0.3859|± |0.0087| +| - social sciences| 2|none | |acc |↑ |0.4027|± |0.0088| +| - stem | 2|none | |acc |↑ |0.3650|± |0.0085| + +deepseek-ai_DeepSeek-R1-Distill-Qwen-1.5B: 3h 41m 4s +✅ Benchmark completed for deepseek-ai_DeepSeek-R1-Distill-Qwen-1.5B + +🔥 Starting benchmark for Qwen_Qwen3-1.7B +hf (pretrained=/mnt/data8tb/Documents/llm/llm_models/Qwen_Qwen3-1.7B,trust_remote_code=True,device_map=auto), gen_kwargs: (None), limit: None, num_fewshot: None, batch_size: 6 +| Tasks |Version| Filter |n-shot| Metric | | Value | |Stderr| +|----------------------------------------------------------|------:|-----------------|-----:|-----------|---|------:|---|-----:| +|anli_r1 | 1|none | 0|acc |↑ | 0.4100|± |0.0156| +|anli_r2 | 1|none | 0|acc |↑ | 0.4040|± |0.0155| +|anli_r3 | 1|none | 0|acc |↑ | 0.4342|± |0.0143| +|arc_challenge | 1|none | 0|acc |↑ | 0.3985|± |0.0143| +| | |none 
| 0|acc_norm |↑ | 0.4343|± |0.0145| +|bbh | 3|get-answer | |exact_match|↑ | 0.4826|± |0.0048| +| - bbh_cot_fewshot_boolean_expressions | 4|get-answer | 3|exact_match|↑ | 0.9120|± |0.0180| +| - bbh_cot_fewshot_causal_judgement | 4|get-answer | 3|exact_match|↑ | 0.4920|± |0.0367| +| - bbh_cot_fewshot_date_understanding | 4|get-answer | 3|exact_match|↑ | 0.6280|± |0.0306| +| - bbh_cot_fewshot_disambiguation_qa | 4|get-answer | 3|exact_match|↑ | 0.2840|± |0.0286| +| - bbh_cot_fewshot_dyck_languages | 4|get-answer | 3|exact_match|↑ | 0.0520|± |0.0141| +| - bbh_cot_fewshot_formal_fallacies | 4|get-answer | 3|exact_match|↑ | 0.1800|± |0.0243| +| - bbh_cot_fewshot_geometric_shapes | 4|get-answer | 3|exact_match|↑ | 0.3040|± |0.0292| +| - bbh_cot_fewshot_hyperbaton | 4|get-answer | 3|exact_match|↑ | 0.6840|± |0.0295| +| - bbh_cot_fewshot_logical_deduction_five_objects | 4|get-answer | 3|exact_match|↑ | 0.1200|± |0.0206| +| - bbh_cot_fewshot_logical_deduction_seven_objects | 4|get-answer | 3|exact_match|↑ | 0.0000|± |0.0000| +| - bbh_cot_fewshot_logical_deduction_three_objects | 4|get-answer | 3|exact_match|↑ | 0.7880|± |0.0259| +| - bbh_cot_fewshot_movie_recommendation | 4|get-answer | 3|exact_match|↑ | 0.5720|± |0.0314| +| - bbh_cot_fewshot_multistep_arithmetic_two | 4|get-answer | 3|exact_match|↑ | 0.9640|± |0.0118| +| - bbh_cot_fewshot_navigate | 4|get-answer | 3|exact_match|↑ | 0.8880|± |0.0200| +| - bbh_cot_fewshot_object_counting | 4|get-answer | 3|exact_match|↑ | 0.7560|± |0.0272| +| - bbh_cot_fewshot_penguins_in_a_table | 4|get-answer | 3|exact_match|↑ | 0.5890|± |0.0409| +| - bbh_cot_fewshot_reasoning_about_colored_objects | 4|get-answer | 3|exact_match|↑ | 0.5960|± |0.0311| +| - bbh_cot_fewshot_ruin_names | 4|get-answer | 3|exact_match|↑ | 0.5000|± |0.0317| +| - bbh_cot_fewshot_salient_translation_error_detection | 4|get-answer | 3|exact_match|↑ | 0.0840|± |0.0176| +| - bbh_cot_fewshot_snarks | 4|get-answer | 3|exact_match|↑ | 0.4101|± |0.0370| +| - bbh_cot_fewshot_sports_understanding | 4|get-answer | 3|exact_match|↑ | 0.7600|± |0.0271| +| - bbh_cot_fewshot_temporal_sequences | 4|get-answer | 3|exact_match|↑ | 0.2920|± |0.0288| +| - bbh_cot_fewshot_tracking_shuffled_objects_five_objects | 4|get-answer | 3|exact_match|↑ | 0.2080|± |0.0257| +| - bbh_cot_fewshot_tracking_shuffled_objects_seven_objects| 4|get-answer | 3|exact_match|↑ | 0.0000|± |0.0000| +| - bbh_cot_fewshot_tracking_shuffled_objects_three_objects| 4|get-answer | 3|exact_match|↑ | 0.8120|± |0.0248| +| - bbh_cot_fewshot_web_of_lies | 4|get-answer | 3|exact_match|↑ | 1.0000|± |0.0000| +| - bbh_cot_fewshot_word_sorting | 4|get-answer | 3|exact_match|↑ | 0.1800|± |0.0243| +|boolq | 2|none | 0|acc |↑ | 0.7765|± |0.0073| +|drop | 3|none | 0|em |↑ | 0.0031|± |0.0006| +| | |none | 0|f1 |↑ | 0.0753|± |0.0018| +|gpqa_diamond_cot_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.0758|± |0.0189| +| | |strict-match | 0|exact_match|↑ | 0.0051|± |0.0051| +|gpqa_diamond_cot_zeroshot | 1|flexible-extract | 0|exact_match|↑ | 0.0253|± |0.0112| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_diamond_generative_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.1263|± |0.0237| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_diamond_n_shot | 2|none | 0|acc |↑ | 0.2475|± |0.0307| +| | |none | 0|acc_norm |↑ | 0.2475|± |0.0307| +|gpqa_diamond_zeroshot | 1|none | 0|acc |↑ | 0.3232|± |0.0333| +| | |none | 0|acc_norm |↑ | 0.3232|± |0.0333| +|gpqa_extended_cot_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.0824|± 
|0.0118| +| | |strict-match | 0|exact_match|↑ | 0.0018|± |0.0018| +|gpqa_extended_cot_zeroshot | 1|flexible-extract | 0|exact_match|↑ | 0.0934|± |0.0125| +| | |strict-match | 0|exact_match|↑ | 0.0055|± |0.0032| +|gpqa_extended_generative_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.1722|± |0.0162| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_extended_n_shot | 2|none | 0|acc |↑ | 0.2949|± |0.0195| +| | |none | 0|acc_norm |↑ | 0.2949|± |0.0195| +|gpqa_extended_zeroshot | 1|none | 0|acc |↑ | 0.2967|± |0.0196| +| | |none | 0|acc_norm |↑ | 0.2967|± |0.0196| +|gpqa_main_cot_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.0893|± |0.0135| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_main_cot_zeroshot | 1|flexible-extract | 0|exact_match|↑ | 0.0804|± |0.0129| +| | |strict-match | 0|exact_match|↑ | 0.0045|± |0.0032| +|gpqa_main_generative_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.1473|± |0.0168| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_main_n_shot | 2|none | 0|acc |↑ | 0.2612|± |0.0208| +| | |none | 0|acc_norm |↑ | 0.2612|± |0.0208| +|gpqa_main_zeroshot | 1|none | 0|acc |↑ | 0.2902|± |0.0215| +| | |none | 0|acc_norm |↑ | 0.2902|± |0.0215| +|gsm8k | 3|flexible-extract | 5|exact_match|↑ | 0.6922|± |0.0127| +| | |strict-match | 5|exact_match|↑ | 0.6899|± |0.0127| +|hellaswag | 1|none | 0|acc |↑ | 0.4606|± |0.0050| +| | |none | 0|acc_norm |↑ | 0.6038|± |0.0049| +|mmlu | 2|none | |acc |↑ | 0.5538|± |0.0040| +| - humanities | 2|none | |acc |↑ | 0.4854|± |0.0069| +| - formal_logic | 1|none | 0|acc |↑ | 0.4841|± |0.0447| +| - high_school_european_history | 1|none | 0|acc |↑ | 0.6727|± |0.0366| +| - high_school_us_history | 1|none | 0|acc |↑ | 0.6618|± |0.0332| +| - high_school_world_history | 1|none | 0|acc |↑ | 0.7046|± |0.0297| +| - international_law | 1|none | 0|acc |↑ | 0.6364|± |0.0439| +| - jurisprudence | 1|none | 0|acc |↑ | 0.6852|± |0.0449| +| - logical_fallacies | 1|none | 0|acc |↑ | 0.7055|± |0.0358| +| - moral_disputes | 1|none | 0|acc |↑ | 0.6069|± |0.0263| +| - moral_scenarios | 1|none | 0|acc |↑ | 0.2425|± |0.0143| +| - philosophy | 1|none | 0|acc |↑ | 0.6141|± |0.0276| +| - prehistory | 1|none | 0|acc |↑ | 0.6080|± |0.0272| +| - professional_law | 1|none | 0|acc |↑ | 0.3924|± |0.0125| +| - world_religions | 1|none | 0|acc |↑ | 0.7427|± |0.0335| +| - other | 2|none | |acc |↑ | 0.5993|± |0.0085| +| - business_ethics | 1|none | 0|acc |↑ | 0.5900|± |0.0494| +| - clinical_knowledge | 1|none | 0|acc |↑ | 0.6000|± |0.0302| +| - college_medicine | 1|none | 0|acc |↑ | 0.5896|± |0.0375| +| - global_facts | 1|none | 0|acc |↑ | 0.2300|± |0.0423| +| - human_aging | 1|none | 0|acc |↑ | 0.5695|± |0.0332| +| - management | 1|none | 0|acc |↑ | 0.6699|± |0.0466| +| - marketing | 1|none | 0|acc |↑ | 0.8248|± |0.0249| +| - medical_genetics | 1|none | 0|acc |↑ | 0.6400|± |0.0482| +| - miscellaneous | 1|none | 0|acc |↑ | 0.6871|± |0.0166| +| - nutrition | 1|none | 0|acc |↑ | 0.5850|± |0.0282| +| - professional_accounting | 1|none | 0|acc |↑ | 0.4149|± |0.0294| +| - professional_medicine | 1|none | 0|acc |↑ | 0.5515|± |0.0302| +| - virology | 1|none | 0|acc |↑ | 0.4940|± |0.0389| +| - social sciences | 2|none | |acc |↑ | 0.6341|± |0.0085| +| - econometrics | 1|none | 0|acc |↑ | 0.4825|± |0.0470| +| - high_school_geography | 1|none | 0|acc |↑ | 0.6919|± |0.0329| +| - high_school_government_and_politics | 1|none | 0|acc |↑ | 0.6995|± |0.0331| +| - high_school_macroeconomics | 1|none | 0|acc |↑ | 0.5410|± |0.0253| +| - 
high_school_microeconomics | 1|none | 0|acc |↑ | 0.6345|± |0.0313| +| - high_school_psychology | 1|none | 0|acc |↑ | 0.7725|± |0.0180| +| - human_sexuality | 1|none | 0|acc |↑ | 0.6641|± |0.0414| +| - professional_psychology | 1|none | 0|acc |↑ | 0.5523|± |0.0201| +| - public_relations | 1|none | 0|acc |↑ | 0.5455|± |0.0477| +| - security_studies | 1|none | 0|acc |↑ | 0.5918|± |0.0315| +| - sociology | 1|none | 0|acc |↑ | 0.6816|± |0.0329| +| - us_foreign_policy | 1|none | 0|acc |↑ | 0.7400|± |0.0441| +| - stem | 2|none | |acc |↑ | 0.5325|± |0.0087| +| - abstract_algebra | 1|none | 0|acc |↑ | 0.3900|± |0.0490| +| - anatomy | 1|none | 0|acc |↑ | 0.5259|± |0.0431| +| - astronomy | 1|none | 0|acc |↑ | 0.6382|± |0.0391| +| - college_biology | 1|none | 0|acc |↑ | 0.6806|± |0.0390| +| - college_chemistry | 1|none | 0|acc |↑ | 0.4200|± |0.0496| +| - college_computer_science | 1|none | 0|acc |↑ | 0.4300|± |0.0498| +| - college_mathematics | 1|none | 0|acc |↑ | 0.3900|± |0.0490| +| - college_physics | 1|none | 0|acc |↑ | 0.3333|± |0.0469| +| - computer_security | 1|none | 0|acc |↑ | 0.7100|± |0.0456| +| - conceptual_physics | 1|none | 0|acc |↑ | 0.6511|± |0.0312| +| - electrical_engineering | 1|none | 0|acc |↑ | 0.6000|± |0.0408| +| - elementary_mathematics | 1|none | 0|acc |↑ | 0.5238|± |0.0257| +| - high_school_biology | 1|none | 0|acc |↑ | 0.6806|± |0.0265| +| - high_school_chemistry | 1|none | 0|acc |↑ | 0.5320|± |0.0351| +| - high_school_computer_science | 1|none | 0|acc |↑ | 0.6700|± |0.0473| +| - high_school_mathematics | 1|none | 0|acc |↑ | 0.3481|± |0.0290| +| - high_school_physics | 1|none | 0|acc |↑ | 0.4106|± |0.0402| +| - high_school_statistics | 1|none | 0|acc |↑ | 0.5463|± |0.0340| +| - machine_learning | 1|none | 0|acc |↑ | 0.4196|± |0.0468| +|nq_open | 4|remove_whitespace| 0|exact_match|↑ | 0.0222|± |0.0025| +|openbookqa | 1|none | 0|acc |↑ | 0.2820|± |0.0201| +| | |none | 0|acc_norm |↑ | 0.3760|± |0.0217| +|piqa | 1|none | 0|acc |↑ | 0.7242|± |0.0104| +| | |none | 0|acc_norm |↑ | 0.7203|± |0.0105| +|qnli | 1|none | 0|acc |↑ | 0.5105|± |0.0068| +|sciq | 1|none | 0|acc |↑ | 0.9310|± |0.0080| +| | |none | 0|acc_norm |↑ | 0.9140|± |0.0089| +|triviaqa | 3|remove_whitespace| 0|exact_match|↑ | 0.1350|± |0.0026| +|truthfulqa_gen | 3|none | 0|bleu_acc |↑ | 0.4688|± |0.0175| +| | |none | 0|bleu_diff |↑ | 2.7742|± |0.9353| +| | |none | 0|bleu_max |↑ |26.4045|± |0.8442| +| | |none | 0|rouge1_acc |↑ | 0.4627|± |0.0175| +| | |none | 0|rouge1_diff|↑ | 4.5361|± |1.2612| +| | |none | 0|rouge1_max |↑ |50.0513|± |0.9614| +| | |none | 0|rouge2_acc |↑ | 0.3354|± |0.0165| +| | |none | 0|rouge2_diff|↑ | 2.8338|± |1.3839| +| | |none | 0|rouge2_max |↑ |34.8104|± |1.1528| +| | |none | 0|rougeL_acc |↑ | 0.4529|± |0.0174| +| | |none | 0|rougeL_diff|↑ | 4.3615|± |1.2613| +| | |none | 0|rougeL_max |↑ |47.8592|± |0.9740| +|truthfulqa_mc1 | 2|none | 0|acc |↑ | 0.2950|± |0.0160| +|truthfulqa_mc2 | 3|none | 0|acc |↑ | 0.4588|± |0.0155| +|winogrande | 1|none | 0|acc |↑ | 0.6085|± |0.0137| + +| Groups |Version| Filter |n-shot| Metric | |Value | |Stderr| +|------------------|------:|----------|------|-----------|---|-----:|---|-----:| +|bbh | 3|get-answer| |exact_match|↑ |0.4826|± |0.0048| +|mmlu | 2|none | |acc |↑ |0.5538|± |0.0040| +| - humanities | 2|none | |acc |↑ |0.4854|± |0.0069| +| - other | 2|none | |acc |↑ |0.5993|± |0.0085| +| - social sciences| 2|none | |acc |↑ |0.6341|± |0.0085| +| - stem | 2|none | |acc |↑ |0.5325|± |0.0087| + +Qwen_Qwen3-1.7B: 4h 25m 25s +✅ Benchmark completed for Qwen_Qwen3-1.7B + +🔥 
Starting benchmark for Qwen_Qwen3-0.6B +hf (pretrained=/mnt/data8tb/Documents/llm/llm_models/Qwen_Qwen3-0.6B,trust_remote_code=True,device_map=auto), gen_kwargs: (None), limit: None, num_fewshot: None, batch_size: 6 +| Tasks |Version| Filter |n-shot| Metric | | Value | |Stderr| +|----------------------------------------------------------|------:|-----------------|-----:|-----------|---|------:|---|-----:| +|anli_r1 | 1|none | 0|acc |↑ | 0.3430|± |0.0150| +|anli_r2 | 1|none | 0|acc |↑ | 0.3190|± |0.0147| +|anli_r3 | 1|none | 0|acc |↑ | 0.3442|± |0.0137| +|arc_challenge | 1|none | 0|acc |↑ | 0.3123|± |0.0135| +| | |none | 0|acc_norm |↑ | 0.3422|± |0.0139| +|bbh | 3|get-answer | |exact_match|↑ | 0.4148|± |0.0053| +| - bbh_cot_fewshot_boolean_expressions | 4|get-answer | 3|exact_match|↑ | 0.7560|± |0.0272| +| - bbh_cot_fewshot_causal_judgement | 4|get-answer | 3|exact_match|↑ | 0.3529|± |0.0350| +| - bbh_cot_fewshot_date_understanding | 4|get-answer | 3|exact_match|↑ | 0.5800|± |0.0313| +| - bbh_cot_fewshot_disambiguation_qa | 4|get-answer | 3|exact_match|↑ | 0.3160|± |0.0295| +| - bbh_cot_fewshot_dyck_languages | 4|get-answer | 3|exact_match|↑ | 0.0000|± |0.0000| +| - bbh_cot_fewshot_formal_fallacies | 4|get-answer | 3|exact_match|↑ | 0.3280|± |0.0298| +| - bbh_cot_fewshot_geometric_shapes | 4|get-answer | 3|exact_match|↑ | 0.3960|± |0.0310| +| - bbh_cot_fewshot_hyperbaton | 4|get-answer | 3|exact_match|↑ | 0.6160|± |0.0308| +| - bbh_cot_fewshot_logical_deduction_five_objects | 4|get-answer | 3|exact_match|↑ | 0.2280|± |0.0266| +| - bbh_cot_fewshot_logical_deduction_seven_objects | 4|get-answer | 3|exact_match|↑ | 0.0920|± |0.0183| +| - bbh_cot_fewshot_logical_deduction_three_objects | 4|get-answer | 3|exact_match|↑ | 0.5520|± |0.0315| +| - bbh_cot_fewshot_movie_recommendation | 4|get-answer | 3|exact_match|↑ | 0.4520|± |0.0315| +| - bbh_cot_fewshot_multistep_arithmetic_two | 4|get-answer | 3|exact_match|↑ | 0.8240|± |0.0241| +| - bbh_cot_fewshot_navigate | 4|get-answer | 3|exact_match|↑ | 0.5800|± |0.0313| +| - bbh_cot_fewshot_object_counting | 4|get-answer | 3|exact_match|↑ | 0.6640|± |0.0299| +| - bbh_cot_fewshot_penguins_in_a_table | 4|get-answer | 3|exact_match|↑ | 0.4726|± |0.0415| +| - bbh_cot_fewshot_reasoning_about_colored_objects | 4|get-answer | 3|exact_match|↑ | 0.6000|± |0.0310| +| - bbh_cot_fewshot_ruin_names | 4|get-answer | 3|exact_match|↑ | 0.2120|± |0.0259| +| - bbh_cot_fewshot_salient_translation_error_detection | 4|get-answer | 3|exact_match|↑ | 0.2360|± |0.0269| +| - bbh_cot_fewshot_snarks | 4|get-answer | 3|exact_match|↑ | 0.2921|± |0.0342| +| - bbh_cot_fewshot_sports_understanding | 4|get-answer | 3|exact_match|↑ | 0.4880|± |0.0317| +| - bbh_cot_fewshot_temporal_sequences | 4|get-answer | 3|exact_match|↑ | 0.1280|± |0.0212| +| - bbh_cot_fewshot_tracking_shuffled_objects_five_objects | 4|get-answer | 3|exact_match|↑ | 0.2840|± |0.0286| +| - bbh_cot_fewshot_tracking_shuffled_objects_seven_objects| 4|get-answer | 3|exact_match|↑ | 0.1240|± |0.0209| +| - bbh_cot_fewshot_tracking_shuffled_objects_three_objects| 4|get-answer | 3|exact_match|↑ | 0.5640|± |0.0314| +| - bbh_cot_fewshot_web_of_lies | 4|get-answer | 3|exact_match|↑ | 0.9840|± |0.0080| +| - bbh_cot_fewshot_word_sorting | 4|get-answer | 3|exact_match|↑ | 0.0520|± |0.0141| +|boolq | 2|none | 0|acc |↑ | 0.6391|± |0.0084| +|drop | 3|none | 0|em |↑ | 0.0007|± |0.0003| +| | |none | 0|f1 |↑ | 0.0605|± |0.0013| +|gpqa_diamond_cot_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.1768|± |0.0272| +| | |strict-match | 
0|exact_match|↑ | 0.0101|± |0.0071| +|gpqa_diamond_cot_zeroshot | 1|flexible-extract | 0|exact_match|↑ | 0.1010|± |0.0215| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_diamond_generative_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.2222|± |0.0296| +| | |strict-match | 0|exact_match|↑ | 0.0101|± |0.0071| +|gpqa_diamond_n_shot | 2|none | 0|acc |↑ | 0.2626|± |0.0314| +| | |none | 0|acc_norm |↑ | 0.2626|± |0.0314| +|gpqa_diamond_zeroshot | 1|none | 0|acc |↑ | 0.2677|± |0.0315| +| | |none | 0|acc_norm |↑ | 0.2677|± |0.0315| +|gpqa_extended_cot_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.1996|± |0.0171| +| | |strict-match | 0|exact_match|↑ | 0.0110|± |0.0045| +|gpqa_extended_cot_zeroshot | 1|flexible-extract | 0|exact_match|↑ | 0.1245|± |0.0141| +| | |strict-match | 0|exact_match|↑ | 0.0018|± |0.0018| +|gpqa_extended_generative_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.2674|± |0.0190| +| | |strict-match | 0|exact_match|↑ | 0.0311|± |0.0074| +|gpqa_extended_n_shot | 2|none | 0|acc |↑ | 0.2674|± |0.0190| +| | |none | 0|acc_norm |↑ | 0.2674|± |0.0190| +|gpqa_extended_zeroshot | 1|none | 0|acc |↑ | 0.3022|± |0.0197| +| | |none | 0|acc_norm |↑ | 0.3022|± |0.0197| +|gpqa_main_cot_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.1897|± |0.0185| +| | |strict-match | 0|exact_match|↑ | 0.0067|± |0.0039| +|gpqa_main_cot_zeroshot | 1|flexible-extract | 0|exact_match|↑ | 0.1027|± |0.0144| +| | |strict-match | 0|exact_match|↑ | 0.0045|± |0.0032| +|gpqa_main_generative_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.2277|± |0.0198| +| | |strict-match | 0|exact_match|↑ | 0.0134|± |0.0054| +|gpqa_main_n_shot | 2|none | 0|acc |↑ | 0.2723|± |0.0211| +| | |none | 0|acc_norm |↑ | 0.2723|± |0.0211| +|gpqa_main_zeroshot | 1|none | 0|acc |↑ | 0.2701|± |0.0210| +| | |none | 0|acc_norm |↑ | 0.2701|± |0.0210| +|gsm8k | 3|flexible-extract | 5|exact_match|↑ | 0.4109|± |0.0136| +| | |strict-match | 5|exact_match|↑ | 0.4124|± |0.0136| +|hellaswag | 1|none | 0|acc |↑ | 0.3763|± |0.0048| +| | |none | 0|acc_norm |↑ | 0.4719|± |0.0050| +|mmlu | 2|none | |acc |↑ | 0.4013|± |0.0040| +| - humanities | 2|none | |acc |↑ | 0.3654|± |0.0068| +| - formal_logic | 1|none | 0|acc |↑ | 0.4206|± |0.0442| +| - high_school_european_history | 1|none | 0|acc |↑ | 0.5455|± |0.0389| +| - high_school_us_history | 1|none | 0|acc |↑ | 0.5000|± |0.0351| +| - high_school_world_history | 1|none | 0|acc |↑ | 0.5907|± |0.0320| +| - international_law | 1|none | 0|acc |↑ | 0.5620|± |0.0453| +| - jurisprudence | 1|none | 0|acc |↑ | 0.4167|± |0.0477| +| - logical_fallacies | 1|none | 0|acc |↑ | 0.4724|± |0.0392| +| - moral_disputes | 1|none | 0|acc |↑ | 0.3208|± |0.0251| +| - moral_scenarios | 1|none | 0|acc |↑ | 0.2425|± |0.0143| +| - philosophy | 1|none | 0|acc |↑ | 0.4116|± |0.0280| +| - prehistory | 1|none | 0|acc |↑ | 0.4321|± |0.0276| +| - professional_law | 1|none | 0|acc |↑ | 0.2986|± |0.0117| +| - world_religions | 1|none | 0|acc |↑ | 0.5263|± |0.0383| +| - other | 2|none | |acc |↑ | 0.4245|± |0.0087| +| - business_ethics | 1|none | 0|acc |↑ | 0.4300|± |0.0498| +| - clinical_knowledge | 1|none | 0|acc |↑ | 0.3283|± |0.0289| +| - college_medicine | 1|none | 0|acc |↑ | 0.2890|± |0.0346| +| - global_facts | 1|none | 0|acc |↑ | 0.2400|± |0.0429| +| - human_aging | 1|none | 0|acc |↑ | 0.4664|± |0.0335| +| - management | 1|none | 0|acc |↑ | 0.5340|± |0.0494| +| - marketing | 1|none | 0|acc |↑ | 0.6325|± |0.0316| +| - medical_genetics | 1|none | 0|acc |↑ | 0.3900|± |0.0490| +| - miscellaneous | 1|none | 0|acc |↑ | 
0.4891|± |0.0179| +| - nutrition | 1|none | 0|acc |↑ | 0.4641|± |0.0286| +| - professional_accounting | 1|none | 0|acc |↑ | 0.2908|± |0.0271| +| - professional_medicine | 1|none | 0|acc |↑ | 0.3235|± |0.0284| +| - virology | 1|none | 0|acc |↑ | 0.4458|± |0.0387| +| - social sciences | 2|none | |acc |↑ | 0.4777|± |0.0089| +| - econometrics | 1|none | 0|acc |↑ | 0.2895|± |0.0427| +| - high_school_geography | 1|none | 0|acc |↑ | 0.4697|± |0.0356| +| - high_school_government_and_politics | 1|none | 0|acc |↑ | 0.5233|± |0.0360| +| - high_school_macroeconomics | 1|none | 0|acc |↑ | 0.4128|± |0.0250| +| - high_school_microeconomics | 1|none | 0|acc |↑ | 0.4160|± |0.0320| +| - high_school_psychology | 1|none | 0|acc |↑ | 0.5615|± |0.0213| +| - human_sexuality | 1|none | 0|acc |↑ | 0.5038|± |0.0439| +| - professional_psychology | 1|none | 0|acc |↑ | 0.4085|± |0.0199| +| - public_relations | 1|none | 0|acc |↑ | 0.4545|± |0.0477| +| - security_studies | 1|none | 0|acc |↑ | 0.5102|± |0.0320| +| - sociology | 1|none | 0|acc |↑ | 0.6418|± |0.0339| +| - us_foreign_policy | 1|none | 0|acc |↑ | 0.5700|± |0.0498| +| - stem | 2|none | |acc |↑ | 0.3574|± |0.0084| +| - abstract_algebra | 1|none | 0|acc |↑ | 0.2900|± |0.0456| +| - anatomy | 1|none | 0|acc |↑ | 0.3704|± |0.0417| +| - astronomy | 1|none | 0|acc |↑ | 0.4474|± |0.0405| +| - college_biology | 1|none | 0|acc |↑ | 0.4583|± |0.0417| +| - college_chemistry | 1|none | 0|acc |↑ | 0.3300|± |0.0473| +| - college_computer_science | 1|none | 0|acc |↑ | 0.2600|± |0.0441| +| - college_mathematics | 1|none | 0|acc |↑ | 0.3300|± |0.0473| +| - college_physics | 1|none | 0|acc |↑ | 0.2647|± |0.0439| +| - computer_security | 1|none | 0|acc |↑ | 0.6000|± |0.0492| +| - conceptual_physics | 1|none | 0|acc |↑ | 0.3745|± |0.0316| +| - electrical_engineering | 1|none | 0|acc |↑ | 0.4345|± |0.0413| +| - elementary_mathematics | 1|none | 0|acc |↑ | 0.3571|± |0.0247| +| - high_school_biology | 1|none | 0|acc |↑ | 0.4419|± |0.0283| +| - high_school_chemistry | 1|none | 0|acc |↑ | 0.3300|± |0.0331| +| - high_school_computer_science | 1|none | 0|acc |↑ | 0.4300|± |0.0498| +| - high_school_mathematics | 1|none | 0|acc |↑ | 0.2815|± |0.0274| +| - high_school_physics | 1|none | 0|acc |↑ | 0.2252|± |0.0341| +| - high_school_statistics | 1|none | 0|acc |↑ | 0.2361|± |0.0290| +| - machine_learning | 1|none | 0|acc |↑ | 0.3661|± |0.0457| +|nq_open | 4|remove_whitespace| 0|exact_match|↑ | 0.0205|± |0.0024| +|openbookqa | 1|none | 0|acc |↑ | 0.2160|± |0.0184| +| | |none | 0|acc_norm |↑ | 0.3200|± |0.0209| +|piqa | 1|none | 0|acc |↑ | 0.6752|± |0.0109| +| | |none | 0|acc_norm |↑ | 0.6752|± |0.0109| +|qnli | 1|none | 0|acc |↑ | 0.4961|± |0.0068| +|sciq | 1|none | 0|acc |↑ | 0.8720|± |0.0106| +| | |none | 0|acc_norm |↑ | 0.8330|± |0.0118| +|triviaqa | 3|remove_whitespace| 0|exact_match|↑ | 0.0193|± |0.0010| +|truthfulqa_gen | 3|none | 0|bleu_acc |↑ | 0.3305|± |0.0165| +| | |none | 0|bleu_diff |↑ |-4.2316|± |0.5836| +| | |none | 0|bleu_max |↑ |17.1879|± |0.6583| +| | |none | 0|rouge1_acc |↑ | 0.2987|± |0.0160| +| | |none | 0|rouge1_diff|↑ |-7.1428|± |0.6809| +| | |none | 0|rouge1_max |↑ |37.0632|± |0.8923| +| | |none | 0|rouge2_acc |↑ | 0.2166|± |0.0144| +| | |none | 0|rouge2_diff|↑ |-7.9206|± |0.7858| +| | |none | 0|rouge2_max |↑ |21.7683|± |0.8976| +| | |none | 0|rougeL_acc |↑ | 0.2938|± |0.0159| +| | |none | 0|rougeL_diff|↑ |-7.4867|± |0.6710| +| | |none | 0|rougeL_max |↑ |34.4220|± |0.8733| +|truthfulqa_mc1 | 2|none | 0|acc |↑ | 0.2705|± |0.0156| +|truthfulqa_mc2 | 3|none | 0|acc |↑ | 
0.4277|± |0.0145| +|winogrande | 1|none | 0|acc |↑ | 0.5517|± |0.0140| + +| Groups |Version| Filter |n-shot| Metric | |Value | |Stderr| +|------------------|------:|----------|------|-----------|---|-----:|---|-----:| +|bbh | 3|get-answer| |exact_match|↑ |0.4148|± |0.0053| +|mmlu | 2|none | |acc |↑ |0.4013|± |0.0040| +| - humanities | 2|none | |acc |↑ |0.3654|± |0.0068| +| - other | 2|none | |acc |↑ |0.4245|± |0.0087| +| - social sciences| 2|none | |acc |↑ |0.4777|± |0.0089| +| - stem | 2|none | |acc |↑ |0.3574|± |0.0084| + +Qwen_Qwen3-0.6B: 3h 45m 57s +✅ Benchmark completed for Qwen_Qwen3-0.6B + +🔥 Starting benchmark for Qwen_Qwen3-4B +hf (pretrained=/mnt/data8tb/Documents/llm/llm_models/Qwen_Qwen3-4B,trust_remote_code=True,device_map=auto), gen_kwargs: (None), limit: None, num_fewshot: None, batch_size: 6 +| Tasks |Version| Filter |n-shot| Metric | | Value | |Stderr| +|----------------------------------------------------------|------:|-----------------|-----:|-----------|---|------:|---|-----:| +|anli_r1 | 1|none | 0|acc |↑ | 0.5500|± |0.0157| +|anli_r2 | 1|none | 0|acc |↑ | 0.4610|± |0.0158| +|anli_r3 | 1|none | 0|acc |↑ | 0.5133|± |0.0144| +|arc_challenge | 1|none | 0|acc |↑ | 0.5043|± |0.0146| +| | |none | 0|acc_norm |↑ | 0.5392|± |0.0146| +|bbh | 3|get-answer | |exact_match|↑ | 0.7523|± |0.0047| +| - bbh_cot_fewshot_boolean_expressions | 4|get-answer | 3|exact_match|↑ | 0.9640|± |0.0118| +| - bbh_cot_fewshot_causal_judgement | 4|get-answer | 3|exact_match|↑ | 0.3636|± |0.0353| +| - bbh_cot_fewshot_date_understanding | 4|get-answer | 3|exact_match|↑ | 0.7800|± |0.0263| +| - bbh_cot_fewshot_disambiguation_qa | 4|get-answer | 3|exact_match|↑ | 0.6120|± |0.0309| +| - bbh_cot_fewshot_dyck_languages | 4|get-answer | 3|exact_match|↑ | 0.3800|± |0.0308| +| - bbh_cot_fewshot_formal_fallacies | 4|get-answer | 3|exact_match|↑ | 0.6360|± |0.0305| +| - bbh_cot_fewshot_geometric_shapes | 4|get-answer | 3|exact_match|↑ | 0.5040|± |0.0317| +| - bbh_cot_fewshot_hyperbaton | 4|get-answer | 3|exact_match|↑ | 0.9560|± |0.0130| +| - bbh_cot_fewshot_logical_deduction_five_objects | 4|get-answer | 3|exact_match|↑ | 0.5800|± |0.0313| +| - bbh_cot_fewshot_logical_deduction_seven_objects | 4|get-answer | 3|exact_match|↑ | 0.2920|± |0.0288| +| - bbh_cot_fewshot_logical_deduction_three_objects | 4|get-answer | 3|exact_match|↑ | 0.9080|± |0.0183| +| - bbh_cot_fewshot_movie_recommendation | 4|get-answer | 3|exact_match|↑ | 0.7040|± |0.0289| +| - bbh_cot_fewshot_multistep_arithmetic_two | 4|get-answer | 3|exact_match|↑ | 0.9920|± |0.0056| +| - bbh_cot_fewshot_navigate | 4|get-answer | 3|exact_match|↑ | 0.9200|± |0.0172| +| - bbh_cot_fewshot_object_counting | 4|get-answer | 3|exact_match|↑ | 0.8480|± |0.0228| +| - bbh_cot_fewshot_penguins_in_a_table | 4|get-answer | 3|exact_match|↑ | 0.7740|± |0.0347| +| - bbh_cot_fewshot_reasoning_about_colored_objects | 4|get-answer | 3|exact_match|↑ | 0.8600|± |0.0220| +| - bbh_cot_fewshot_ruin_names | 4|get-answer | 3|exact_match|↑ | 0.7600|± |0.0271| +| - bbh_cot_fewshot_salient_translation_error_detection | 4|get-answer | 3|exact_match|↑ | 0.5880|± |0.0312| +| - bbh_cot_fewshot_snarks | 4|get-answer | 3|exact_match|↑ | 0.6966|± |0.0346| +| - bbh_cot_fewshot_sports_understanding | 4|get-answer | 3|exact_match|↑ | 0.8280|± |0.0239| +| - bbh_cot_fewshot_temporal_sequences | 4|get-answer | 3|exact_match|↑ | 0.8840|± |0.0203| +| - bbh_cot_fewshot_tracking_shuffled_objects_five_objects | 4|get-answer | 3|exact_match|↑ | 0.9800|± |0.0089| +| - 
bbh_cot_fewshot_tracking_shuffled_objects_seven_objects| 4|get-answer | 3|exact_match|↑ | 0.9080|± |0.0183| +| - bbh_cot_fewshot_tracking_shuffled_objects_three_objects| 4|get-answer | 3|exact_match|↑ | 0.9960|± |0.0040| +| - bbh_cot_fewshot_web_of_lies | 4|get-answer | 3|exact_match|↑ | 1.0000|± |0.0000| +| - bbh_cot_fewshot_word_sorting | 4|get-answer | 3|exact_match|↑ | 0.4920|± |0.0317| +|boolq | 2|none | 0|acc |↑ | 0.8505|± |0.0062| +|drop | 3|none | 0|em |↑ | 0.0060|± |0.0008| +| | |none | 0|f1 |↑ | 0.0977|± |0.0020| +|gpqa_diamond_cot_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.1111|± |0.0224| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_diamond_cot_zeroshot | 1|flexible-extract | 0|exact_match|↑ | 0.0859|± |0.0200| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_diamond_generative_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.1818|± |0.0275| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_diamond_n_shot | 2|none | 0|acc |↑ | 0.3939|± |0.0348| +| | |none | 0|acc_norm |↑ | 0.3939|± |0.0348| +|gpqa_diamond_zeroshot | 1|none | 0|acc |↑ | 0.3636|± |0.0343| +| | |none | 0|acc_norm |↑ | 0.3636|± |0.0343| +|gpqa_extended_cot_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.1136|± |0.0136| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_extended_cot_zeroshot | 1|flexible-extract | 0|exact_match|↑ | 0.0879|± |0.0121| +| | |strict-match | 0|exact_match|↑ | 0.0055|± |0.0032| +|gpqa_extended_generative_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.2436|± |0.0184| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_extended_n_shot | 2|none | 0|acc |↑ | 0.3407|± |0.0203| +| | |none | 0|acc_norm |↑ | 0.3407|± |0.0203| +|gpqa_extended_zeroshot | 1|none | 0|acc |↑ | 0.3388|± |0.0203| +| | |none | 0|acc_norm |↑ | 0.3388|± |0.0203| +|gpqa_main_cot_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.0893|± |0.0135| +| | |strict-match | 0|exact_match|↑ | 0.0045|± |0.0032| +|gpqa_main_cot_zeroshot | 1|flexible-extract | 0|exact_match|↑ | 0.0647|± |0.0116| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_main_generative_n_shot | 2|flexible-extract | 0|exact_match|↑ | 0.2455|± |0.0204| +| | |strict-match | 0|exact_match|↑ | 0.0000|± |0.0000| +|gpqa_main_n_shot | 2|none | 0|acc |↑ | 0.3438|± |0.0225| +| | |none | 0|acc_norm |↑ | 0.3438|± |0.0225| +|gpqa_main_zeroshot | 1|none | 0|acc |↑ | 0.3259|± |0.0222| +| | |none | 0|acc_norm |↑ | 0.3259|± |0.0222| +|gsm8k | 3|flexible-extract | 5|exact_match|↑ | 0.8484|± |0.0099| +| | |strict-match | 5|exact_match|↑ | 0.8567|± |0.0097| +|hellaswag | 1|none | 0|acc |↑ | 0.5223|± |0.0050| +| | |none | 0|acc_norm |↑ | 0.6833|± |0.0046| +|mmlu | 2|none | |acc |↑ | 0.6836|± |0.0037| +| - humanities | 2|none | |acc |↑ | 0.5957|± |0.0067| +| - formal_logic | 1|none | 0|acc |↑ | 0.6429|± |0.0429| +| - high_school_european_history | 1|none | 0|acc |↑ | 0.7939|± |0.0316| +| - high_school_us_history | 1|none | 0|acc |↑ | 0.8431|± |0.0255| +| - high_school_world_history | 1|none | 0|acc |↑ | 0.8397|± |0.0239| +| - international_law | 1|none | 0|acc |↑ | 0.7355|± |0.0403| +| - jurisprudence | 1|none | 0|acc |↑ | 0.7407|± |0.0424| +| - logical_fallacies | 1|none | 0|acc |↑ | 0.8098|± |0.0308| +| - moral_disputes | 1|none | 0|acc |↑ | 0.6965|± |0.0248| +| - moral_scenarios | 1|none | 0|acc |↑ | 0.3799|± |0.0162| +| - philosophy | 1|none | 0|acc |↑ | 0.7235|± |0.0254| +| - prehistory | 1|none | 0|acc |↑ | 0.7438|± |0.0243| +| - professional_law | 1|none | 0|acc |↑ | 0.4811|± 
|0.0128| +| - world_religions | 1|none | 0|acc |↑ | 0.7836|± |0.0316| +| - other | 2|none | |acc |↑ | 0.7126|± |0.0079| +| - business_ethics | 1|none | 0|acc |↑ | 0.7100|± |0.0456| +| - clinical_knowledge | 1|none | 0|acc |↑ | 0.7396|± |0.0270| +| - college_medicine | 1|none | 0|acc |↑ | 0.7052|± |0.0348| +| - global_facts | 1|none | 0|acc |↑ | 0.3400|± |0.0476| +| - human_aging | 1|none | 0|acc |↑ | 0.6771|± |0.0314| +| - management | 1|none | 0|acc |↑ | 0.8155|± |0.0384| +| - marketing | 1|none | 0|acc |↑ | 0.8675|± |0.0222| +| - medical_genetics | 1|none | 0|acc |↑ | 0.7600|± |0.0429| +| - miscellaneous | 1|none | 0|acc |↑ | 0.7969|± |0.0144| +| - nutrition | 1|none | 0|acc |↑ | 0.7255|± |0.0256| +| - professional_accounting | 1|none | 0|acc |↑ | 0.5319|± |0.0298| +| - professional_medicine | 1|none | 0|acc |↑ | 0.7243|± |0.0271| +| - virology | 1|none | 0|acc |↑ | 0.5060|± |0.0389| +| - social sciences | 2|none | |acc |↑ | 0.7803|± |0.0074| +| - econometrics | 1|none | 0|acc |↑ | 0.6316|± |0.0454| +| - high_school_geography | 1|none | 0|acc |↑ | 0.8283|± |0.0269| +| - high_school_government_and_politics | 1|none | 0|acc |↑ | 0.8756|± |0.0238| +| - high_school_macroeconomics | 1|none | 0|acc |↑ | 0.7462|± |0.0221| +| - high_school_microeconomics | 1|none | 0|acc |↑ | 0.8151|± |0.0252| +| - high_school_psychology | 1|none | 0|acc |↑ | 0.8716|± |0.0143| +| - human_sexuality | 1|none | 0|acc |↑ | 0.7634|± |0.0373| +| - professional_psychology | 1|none | 0|acc |↑ | 0.7206|± |0.0182| +| - public_relations | 1|none | 0|acc |↑ | 0.6727|± |0.0449| +| - security_studies | 1|none | 0|acc |↑ | 0.7061|± |0.0292| +| - sociology | 1|none | 0|acc |↑ | 0.8308|± |0.0265| +| - us_foreign_policy | 1|none | 0|acc |↑ | 0.8100|± |0.0394| +| - stem | 2|none | |acc |↑ | 0.6917|± |0.0080| +| - abstract_algebra | 1|none | 0|acc |↑ | 0.6000|± |0.0492| +| - anatomy | 1|none | 0|acc |↑ | 0.6148|± |0.0420| +| - astronomy | 1|none | 0|acc |↑ | 0.8026|± |0.0324| +| - college_biology | 1|none | 0|acc |↑ | 0.8194|± |0.0322| +| - college_chemistry | 1|none | 0|acc |↑ | 0.5400|± |0.0501| +| - college_computer_science | 1|none | 0|acc |↑ | 0.6700|± |0.0473| +| - college_mathematics | 1|none | 0|acc |↑ | 0.5400|± |0.0501| +| - college_physics | 1|none | 0|acc |↑ | 0.5882|± |0.0490| +| - computer_security | 1|none | 0|acc |↑ | 0.7900|± |0.0409| +| - conceptual_physics | 1|none | 0|acc |↑ | 0.7830|± |0.0269| +| - electrical_engineering | 1|none | 0|acc |↑ | 0.7310|± |0.0370| +| - elementary_mathematics | 1|none | 0|acc |↑ | 0.6799|± |0.0240| +| - high_school_biology | 1|none | 0|acc |↑ | 0.8645|± |0.0195| +| - high_school_chemistry | 1|none | 0|acc |↑ | 0.7094|± |0.0319| +| - high_school_computer_science | 1|none | 0|acc |↑ | 0.8500|± |0.0359| +| - high_school_mathematics | 1|none | 0|acc |↑ | 0.4815|± |0.0305| +| - high_school_physics | 1|none | 0|acc |↑ | 0.6093|± |0.0398| +| - high_school_statistics | 1|none | 0|acc |↑ | 0.6944|± |0.0314| +| - machine_learning | 1|none | 0|acc |↑ | 0.6071|± |0.0464| +|nq_open | 4|remove_whitespace| 0|exact_match|↑ | 0.0147|± |0.0020| +|openbookqa | 1|none | 0|acc |↑ | 0.2960|± |0.0204| +| | |none | 0|acc_norm |↑ | 0.4020|± |0.0219| +|piqa | 1|none | 0|acc |↑ | 0.7514|± |0.0101| +| | |none | 0|acc_norm |↑ | 0.7514|± |0.0101| +|qnli | 1|none | 0|acc |↑ | 0.8087|± |0.0053| +|sciq | 1|none | 0|acc |↑ | 0.9550|± |0.0066| +| | |none | 0|acc_norm |↑ | 0.9320|± |0.0080| +|triviaqa | 3|remove_whitespace| 0|exact_match|↑ | 0.2250|± |0.0031| +|truthfulqa_gen | 3|none | 0|bleu_acc |↑ | 0.5838|± 
|0.0173| +| | |none | 0|bleu_diff |↑ |12.2904|± |0.9730| +| | |none | 0|bleu_max |↑ |29.1140|± |0.8421| +| | |none | 0|rouge1_acc |↑ | 0.6095|± |0.0171| +| | |none | 0|rouge1_diff|↑ |17.9082|± |1.3731| +| | |none | 0|rouge1_max |↑ |54.7069|± |0.9372| +| | |none | 0|rouge2_acc |↑ | 0.5520|± |0.0174| +| | |none | 0|rouge2_diff|↑ |18.5593|± |1.4928| +| | |none | 0|rouge2_max |↑ |42.6485|± |1.1203| +| | |none | 0|rougeL_acc |↑ | 0.5961|± |0.0172| +| | |none | 0|rougeL_diff|↑ |17.8681|± |1.3823| +| | |none | 0|rougeL_max |↑ |52.3619|± |0.9738| +|truthfulqa_mc1 | 2|none | 0|acc |↑ | 0.3672|± |0.0169| +|truthfulqa_mc2 | 3|none | 0|acc |↑ | 0.5476|± |0.0158| +|winogrande | 1|none | 0|acc |↑ | 0.6582|± |0.0133| + +| Groups |Version| Filter |n-shot| Metric | |Value | |Stderr| +|------------------|------:|----------|------|-----------|---|-----:|---|-----:| +|bbh | 3|get-answer| |exact_match|↑ |0.7523|± |0.0047| +|mmlu | 2|none | |acc |↑ |0.6836|± |0.0037| +| - humanities | 2|none | |acc |↑ |0.5957|± |0.0067| +| - other | 2|none | |acc |↑ |0.7126|± |0.0079| +| - social sciences| 2|none | |acc |↑ |0.7803|± |0.0074| +| - stem | 2|none | |acc |↑ |0.6917|± |0.0080| + +Qwen_Qwen3-4B: 5h 51m 27s +✅ Benchmark completed for Qwen_Qwen3-4B + +🔥 Starting benchmark for openai_gpt-oss-20b +openai_gpt-oss-20b: 0h 0m 6s +✅ Benchmark completed for openai_gpt-oss-20b + +🔥 Starting benchmark for openai_gpt-oss-20b +openai_gpt-oss-20b: 0h 0m 4s +✅ Benchmark completed for openai_gpt-oss-20b + +🔥 Starting benchmark for openai_gpt-oss-20b +openai_gpt-oss-20b: 0h 0m 4s +✅ Benchmark completed for openai_gpt-oss-20b + +🔥 Starting benchmark for openai_gpt-oss-20b +openai_gpt-oss-20b: 0h 0m 4s +✅ Benchmark completed for openai_gpt-oss-20b + +🔥 Starting benchmark for openai_gpt-oss-20b +openai_gpt-oss-20b: 0h 0m 0s +✅ Benchmark completed for openai_gpt-oss-20b + +🔥 Starting benchmark for openai_gpt-oss-20b +openai_gpt-oss-20b: 0h 0m 4s +✅ Benchmark completed for openai_gpt-oss-20b + +🔥 Starting benchmark for openai_gpt-oss-20b +openai_gpt-oss-20b: 0h 0m 17s +✅ Benchmark completed for openai_gpt-oss-20b +
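Note: the runs recorded above are lm-evaluation-harness invocations against local HuggingFace checkpoints, as shown by the `hf (pretrained=..., trust_remote_code=True, device_map=auto), gen_kwargs: (None), limit: None, num_fewshot: None, batch_size: 6` header printed before each results table. For reference, a minimal sketch of reproducing one such run through the harness's Python API follows. The pretrained path, batch size, limit and num_fewshot mirror the Qwen_Qwen3-0.6B header; the task subset, the harness version (0.4+ assumed) and the result handling are assumptions, not taken from this log.

```python
# Minimal sketch, assuming lm-evaluation-harness >= 0.4 is installed as `lm_eval`.
# Pretrained path, batch_size, limit and num_fewshot mirror the Qwen_Qwen3-0.6B
# header above; the task list is an illustrative subset of the logged tasks.
import json

import lm_eval

results = lm_eval.simple_evaluate(
    model="hf",
    model_args=(
        "pretrained=/mnt/data8tb/Documents/llm/llm_models/Qwen_Qwen3-0.6B,"
        "trust_remote_code=True,device_map=auto"
    ),
    tasks=["anli_r1", "hellaswag", "gsm8k", "mmlu"],
    num_fewshot=None,   # per-task defaults, e.g. 5-shot for gsm8k as logged
    batch_size=6,
    limit=None,
)

# simple_evaluate returns a dict; its "results" entry holds the per-task metrics
# that the harness also renders as the markdown tables seen throughout this log.
print(json.dumps(results["results"], indent=2, default=str))
```

The CLI form (`lm_eval --model hf --model_args pretrained=...,trust_remote_code=True,device_map=auto --tasks ... --batch_size 6`) prints the same tables directly in recent harness versions.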
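Relatedly, the Stderr column in these tables is the standard error of the mean of the metric; for a 0/1 metric such as acc it reduces to approximately sqrt(p * (1 - p) / n). A quick check against two Qwen_Qwen3-0.6B rows (anli_r1 0.3430 ± 0.0150 and anli_r3 0.3442 ± 0.0137), using the published ANLI test-split sizes of 1000 and 1200 examples (an assumption on my part, not stated in this log), reproduces the logged values:

```python
import math

def binomial_stderr(p: float, n: int) -> float:
    # Standard error of the mean for a 0/1 metric with observed accuracy p over n examples.
    return math.sqrt(p * (1.0 - p) / n)

print(f"{binomial_stderr(0.3430, 1000):.4f}")  # 0.0150 -> matches anli_r1 for Qwen_Qwen3-0.6B
print(f"{binomial_stderr(0.3442, 1200):.4f}")  # 0.0137 -> matches anli_r3 for Qwen_Qwen3-0.6B
```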