Adding Evaluation Results

This is an automated PR created with https://huggingface.co/spaces/Weyaxi/open-llm-leaderboard-results-pr

The purpose of this PR is to add evaluation results from the Open LLM Leaderboard to your model card.

If you encounter any issues, please report them to https://huggingface.co/spaces/Weyaxi/open-llm-leaderboard-results-pr/discussions

Files changed (1)

README.md +181 -63

README.md CHANGED

@@ -1,79 +1,184 @@
 ---
 library_name: transformers
 tags:
 - medical
 - trl
 - trainer
-license: apache-2.0
-thumbnail: https://huggingface.co/ShieldX/manovyadh-1.1B-v1-chat/blob/main/manovyadh.png
 datasets:
 - ShieldX/manovyadh-3.5k
-language:
-- en
 metrics:
 - accuracy
 pipeline_tag: text-generation
 base_model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
 widget:
-  - text: >
-      ###SYSTEM: You are an AI assistant that helps people cope with stress and improve their mental health. User will tell you about their feelings and challenges. Your task is to listen empathetically and offer helpful suggestions. While responding, think about the user’s needs and goals and show compassion and support
-      ###USER: I don't know how to tell someone how I feel about them. How can I get better at expressing how I feel??
-      ###ASSISTANT:
 model-index:
-  - name: manovyadh-1.1B-v1-chat
-    results:
-      - task:
-          type: text-generation
-        dataset:
-          name: ai2_arc
-          type: arc
-        metrics:
-          - name: pass@1
-            type: pass@1
-            value: 35.92
-        source:
-          name: Open LLM Leaderboard
-          url: https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard
-      - task:
-          type: text-generation
-        dataset:
-          name: hellaswag
-          type: hellaswag
-        metrics:
-          - name: pass@1
-            type: pass@1
-            value: 60.03
-        source:
-          name: Open LLM Leaderboard
-          url: https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard
-      - task:
-          type: text-generation
-        dataset:
-          name: truthful_qa
-          type: truthful_qa
-        metrics:
-          - name: pass@1
-            type: pass@1
-            value: 39.17
-        source:
-          name: Open LLM Leaderboard
-          url: https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard
-      - task:
-          type: text-generation
-        dataset:
-          name: winogrande
-          type: winogrande
-        metrics:
-          - name: pass@1
-            type: pass@1
-            value: 61.09
-        source:
-          name: Open LLM Leaderboard
-          url: https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard
 ---
 # Uploaded  model
@@ -346,4 +451,17 @@ ShieldX a.k.a Rohan Shaw
 # Model Card Contact
-email : [email protected]

 ---
+language:
+- en
+license: apache-2.0
 library_name: transformers
 tags:
 - medical
 - trl
 - trainer
 datasets:
 - ShieldX/manovyadh-3.5k
 metrics:
 - accuracy
+thumbnail: https://huggingface.co/ShieldX/manovyadh-1.1B-v1-chat/blob/main/manovyadh.png
 pipeline_tag: text-generation
 base_model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
 widget:
+- text: '###SYSTEM: You are an AI assistant that helps people cope with stress and
+    improve their mental health. User will tell you about their feelings and challenges.
+    Your task is to listen empathetically and offer helpful suggestions. While responding,
+    think about the user’s needs and goals and show compassion and support
+    ###USER: I don''t know how to tell someone how I feel about them. How can I get
+    better at expressing how I feel??
+    ###ASSISTANT:
+    '
 model-index:
+- name: manovyadh-1.1B-v1-chat
+  results:
+  - task:
+      type: text-generation
+    dataset:
+      name: ai2_arc
+      type: arc
+    metrics:
+    - type: pass@1
+      value: 35.92
+      name: pass@1
+    source:
+      url: https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard
+      name: Open LLM Leaderboard
+  - task:
+      type: text-generation
+    dataset:
+      name: hellaswag
+      type: hellaswag
+    metrics:
+    - type: pass@1
+      value: 60.03
+      name: pass@1
+    source:
+      url: https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard
+      name: Open LLM Leaderboard
+  - task:
+      type: text-generation
+    dataset:
+      name: truthful_qa
+      type: truthful_qa
+    metrics:
+    - type: pass@1
+      value: 39.17
+      name: pass@1
+    source:
+      url: https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard
+      name: Open LLM Leaderboard
+  - task:
+      type: text-generation
+    dataset:
+      name: winogrande
+      type: winogrande
+    metrics:
+    - type: pass@1
+      value: 61.09
+      name: pass@1
+    source:
+      url: https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard
+      name: Open LLM Leaderboard
+  - task:
+      type: text-generation
+      name: Text Generation
+    dataset:
+      name: AI2 Reasoning Challenge (25-Shot)
+      type: ai2_arc
+      config: ARC-Challenge
+      split: test
+      args:
+        num_few_shot: 25
+    metrics:
+    - type: acc_norm
+      value: 35.92
+      name: normalized accuracy
+    source:
+      url: https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard?query=ShieldX/manovyadh-1.1B-v1-chat
+      name: Open LLM Leaderboard
+  - task:
+      type: text-generation
+      name: Text Generation
+    dataset:
+      name: HellaSwag (10-Shot)
+      type: hellaswag
+      split: validation
+      args:
+        num_few_shot: 10
+    metrics:
+    - type: acc_norm
+      value: 60.03
+      name: normalized accuracy
+    source:
+      url: https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard?query=ShieldX/manovyadh-1.1B-v1-chat
+      name: Open LLM Leaderboard
+  - task:
+      type: text-generation
+      name: Text Generation
+    dataset:
+      name: MMLU (5-Shot)
+      type: cais/mmlu
+      config: all
+      split: test
+      args:
+        num_few_shot: 5
+    metrics:
+    - type: acc
+      value: 25.82
+      name: accuracy
+    source:
+      url: https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard?query=ShieldX/manovyadh-1.1B-v1-chat
+      name: Open LLM Leaderboard
+  - task:
+      type: text-generation
+      name: Text Generation
+    dataset:
+      name: TruthfulQA (0-shot)
+      type: truthful_qa
+      config: multiple_choice
+      split: validation
+      args:
+        num_few_shot: 0
+    metrics:
+    - type: mc2
+      value: 39.17
+    source:
+      url: https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard?query=ShieldX/manovyadh-1.1B-v1-chat
+      name: Open LLM Leaderboard
+  - task:
+      type: text-generation
+      name: Text Generation
+    dataset:
+      name: Winogrande (5-shot)
+      type: winogrande
+      config: winogrande_xl
+      split: validation
+      args:
+        num_few_shot: 5
+    metrics:
+    - type: acc
+      value: 61.09
+      name: accuracy
+    source:
+      url: https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard?query=ShieldX/manovyadh-1.1B-v1-chat
+      name: Open LLM Leaderboard
+  - task:
+      type: text-generation
+      name: Text Generation
+    dataset:
+      name: GSM8k (5-shot)
+      type: gsm8k
+      config: main
+      split: test
+      args:
+        num_few_shot: 5
+    metrics:
+    - type: acc
+      value: 1.74
+      name: accuracy
+    source:
+      url: https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard?query=ShieldX/manovyadh-1.1B-v1-chat
+      name: Open LLM Leaderboard
 ---
 # Uploaded  model
 # Model Card Contact
+email : [email protected]
+# [Open LLM Leaderboard Evaluation Results](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard)
+Detailed results can be found [here](https://huggingface.co/datasets/open-llm-leaderboard/details_ShieldX__manovyadh-1.1B-v1-chat)
+|             Metric              |Value|
+|---------------------------------|----:|
+|Avg.                             |37.30|
+|AI2 Reasoning Challenge (25-Shot)|35.92|
+|HellaSwag (10-Shot)              |60.03|
+|MMLU (5-Shot)                    |25.82|
+|TruthfulQA (0-shot)              |39.17|
+|Winogrande (5-shot)              |61.09|
+|GSM8k (5-shot)                   | 1.74|