SEED Benchmark Leaderboard Update
Files changed:
- __pycache__/constants.cpython-38.pyc  +0 -0
- app.py  +23 -14
- constants.py  +6 -2
- file/result.csv  +22 -22
- src/__pycache__/utils_display.cpython-38.pyc  +0 -0
- src/auto_leaderboard/__pycache__/model_metadata_type.cpython-38.pyc  +0 -0
__pycache__/constants.cpython-38.pyc
ADDED
Binary file (7.6 kB).
app.py
CHANGED
@@ -126,6 +126,9 @@ def add_new_eval(
         model_type,
         model_name,
         LLM_name,
+        overall_accuracy,
+        average_accuracy_image,
+        average_accuracy_video,
         each_task_accuracy[1],
         each_task_accuracy[2],
         each_task_accuracy[3],
@@ -135,19 +138,25 @@ def add_new_eval(
         each_task_accuracy[7],
         each_task_accuracy[8],
         each_task_accuracy[9],
-        average_accuracy_image,
         each_task_accuracy[10],
         each_task_accuracy[11],
         each_task_accuracy[12],
-        average_accuracy_video,
-        overall_accuracy]
-    # pdb.set_trace()
+        ]
     csv_data.loc[col] = new_data
     csv_data = csv_data.to_csv(CSV_DIR, index=False)
     return 0
 
 def get_baseline_df():
+    # pdb.set_trace()
+    df = pd.read_csv(CSV_DIR)
+    df = df.sort_values(by="Avg. All", ascending=False)
+    present_columns = MODEL_INFO + checkbox_group.value
+    df = df[present_columns]
+    return df
+
+def get_all_df():
     df = pd.read_csv(CSV_DIR)
+    df = df.sort_values(by="Avg. All", ascending=False)
     return df
 
 block = gr.Blocks()
@@ -173,8 +182,8 @@ with block:
 
     # selection for column part:
     checkbox_group = gr.CheckboxGroup(
-        choices=
-        value=
+        choices=TASK_INFO_v2,
+        value=AVG_INFO,
         label="Select options",
         interactive=True,
     )
@@ -191,9 +200,9 @@ with block:
 
     def on_checkbox_group_change(selected_columns):
         # pdb.set_trace()
-        selected_columns = [item for item in
+        selected_columns = [item for item in TASK_INFO_v2 if item in selected_columns]
         present_columns = MODEL_INFO + selected_columns
-        updated_data =
+        updated_data = get_all_df()[present_columns]
         updated_headers = present_columns
         update_datatype = [DATA_TITILE_TYPE[COLUMN_NAMES.index(x)] for x in updated_headers]
 
@@ -229,10 +238,10 @@ with block:
         with gr.Row():
             with gr.Column():
                 model_name_textbox = gr.Textbox(
-                    label="Model
+                    label="Model name", placeholder="LLaMA-7B"
                 )
                 revision_name_textbox = gr.Textbox(
-                    label="Revision Model Name", placeholder="LLaMA"
+                    label="Revision Model Name", placeholder="LLaMA-7B"
                 )
                 model_type = gr.Dropdown(
                     choices=[
@@ -241,7 +250,7 @@ with block:
                         "VideoLLM",
                         "Other",
                     ],
-                    label="Model
+                    label="Model type",
                     multiselect=False,
                     value="ImageLLM",
                     interactive=True,
@@ -254,18 +263,18 @@ with block:
 
                 LLM_type = gr.Dropdown(
                     choices=["Vicuna-7B", "Flan-T5-XL", "LLaMA-7B", "Other"],
-                    label="LLM
+                    label="LLM type",
                     multiselect=False,
                     value="LLaMA-7B",
                     interactive=True,
                 )
                 LLM_name_textbox = gr.Textbox(
-                    label="LLM
+                    label="LLM model (for Other)",
                     placeholder="LLaMA-13B"
                 )
                 Evaluation_dimension = gr.Dropdown(
                     choices=["All", "Image", "Video"],
-                    label="Evaluation
+                    label="Evaluation dimension",
                     multiselect=False,
                     value="All",
                     interactive=True,
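For orientation, here is a minimal, self-contained sketch of how the pieces added above fit together. This is a hedged reconstruction, not the app's exact wiring: `data_component` is a hypothetical name for the leaderboard's gr.Dataframe, the real on_checkbox_group_change also refreshes headers and datatypes, and the Gradio 3.x API plus the constants from constants.py (below) are assumed.

# Hypothetical wiring sketch (not the app's exact code): the full table is read once,
# sorted by "Avg. All", and the CheckboxGroup decides which task columns are visible.
import gradio as gr
import pandas as pd

from constants import MODEL_INFO, TASK_INFO_v2, AVG_INFO, CSV_DIR

def get_all_df():
    # same behaviour as the helper added in this commit
    return pd.read_csv(CSV_DIR).sort_values(by="Avg. All", ascending=False)

with gr.Blocks() as block:
    checkbox_group = gr.CheckboxGroup(
        choices=TASK_INFO_v2, value=AVG_INFO,
        label="Select options", interactive=True,
    )
    data_component = gr.Dataframe(                  # hypothetical component name
        value=get_all_df()[MODEL_INFO + AVG_INFO],  # default view: averages only
        interactive=False,
    )
    # Re-filter the visible columns whenever the checkboxes change,
    # keeping the canonical TASK_INFO_v2 ordering.
    checkbox_group.change(
        fn=lambda cols: get_all_df()[MODEL_INFO + [c for c in TASK_INFO_v2 if c in cols]],
        inputs=checkbox_group,
        outputs=data_component,
    )

block.launch()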
constants.py
CHANGED
@@ -1,11 +1,15 @@
 # this is .py for store constants
 MODEL_INFO = ["Model Type", "Model", "Language Model"]
 TASK_INFO = ["Scene Understanding", "Instance Identity", "Instance Attributes", "Instance Localization", "Instance Counting", "Spatial Relation", "Instance Interaction", "Visual Reasoning", "Text Recognition", "Avg. Img", "Action Recognition", "Action Prediction", "Procedure Understanding", "Avg. Video", "Avg. All"]
-
+TASK_INFO_v2 = ["Avg. All", "Avg. Img", "Avg. Video", "Scene Understanding", "Instance Identity", "Instance Attributes", "Instance Localization", "Instance Counting", "Spatial Relation", "Instance Interaction", "Visual Reasoning", "Text Recognition", "Action Recognition", "Action Prediction", "Procedure Understanding"]
+
+AVG_INFO = ["Avg. All", "Avg. Img", "Avg. Video"]
 DATA_TITILE_TYPE = ["markdown", "markdown", "markdown", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number", "number"]
 CSV_DIR = "./file/result.csv"
 
-COLUMN_NAMES = MODEL_INFO + TASK_INFO
+# COLUMN_NAMES = MODEL_INFO + TASK_INFO
+COLUMN_NAMES = MODEL_INFO + TASK_INFO_v2
+
 DATA_NUM = [3158, 1831, 4649, 978, 2447, 657, 97, 331, 85, 1740, 2077, 1192]
 
 UNTUNED_MODEL_RESULTS = '''LLM & Flan-T5 & Flan-T5-XL &23.0 &29.0 &32.8 &31.8 &20.5 &31.8 &33.0 &18.2 &19.4 &23.2 &34.9 &25.4 \\
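Since the table rendering looks up each column's datatype by its position in COLUMN_NAMES, a quick consistency check on the new constants catches misalignment early. A hedged sketch, not part of the commit:

# Sanity-check the new constants: every displayed column needs a datatype,
# and the default checkbox selection must be a subset of the available choices.
from constants import MODEL_INFO, TASK_INFO_v2, AVG_INFO, COLUMN_NAMES, DATA_TITILE_TYPE

assert COLUMN_NAMES == MODEL_INFO + TASK_INFO_v2
assert len(COLUMN_NAMES) == len(DATA_TITILE_TYPE)   # 3 model-info + 15 task columns = 18
assert set(AVG_INFO).issubset(set(TASK_INFO_v2))
print("constants look consistent")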
file/result.csv
CHANGED
@@ -1,22 +1,22 @@
-
-LLM,[Flan-T5](https://huggingface.co/google/flan-t5-xl),Flan-T5-XL,23.0,29.0,32.8,31.8,20.5,31.8,33.0,18.2,19.4,
-LLM,[Vicuna](https://huggingface.co/lmsys/vicuna-7b-v1.3),Vicuna-7B,23.4,30.7,29.7,30.9,30.8,28.6,29.8,18.5,13.4,
-LLM,[LLaMA](https://research.facebook.com/publications/llama-open-and-efficient-foundation-language-models/),LLaMA-7B,26.3,27.4,26.2,28.3,25.1,28.8,19.2,37.0,9.0,
-ImageLLM,[BLIP-2](https://github.com/salesforce/LAVIS),Flan-T5-XL,59.1,53.9,49.2,42.3,43.2,36.7,55.7,45.6,25.9,
-ImageLLM,[InstructBLIP](https://github.com/salesforce/LAVIS),Flan-T5-XL,60.3,58.5,63.4,40.6,58.4,38.7,51.6,45.9,25.9,
-ImageLLM,[InstructBLIP-Vicuna](https://github.com/salesforce/LAVIS),Vicuna-7B,60.2,58.9,65.6,43.6,57.2,40.3,52.6,47.7,43.5,
-ImageLLM,[LLaVA](https://github.com/haotian-liu/LLaVA),LLaMA-7B,42.7,34.9,33.5,28.4,41.9,30.8,27.8,46.8,27.7,
-ImageLLM,[MiniGPT-4](https://github.com/Vision-CAIR/MiniGPT-4),Flan-T5-XL,56.3,49.2,45.8,37.9,45.3,32.6,47.4,57.1,11.8,
-ImageLLM,[VPGTrans](https://github.com/VPGTrans/VPGTrans),LLaMA-7B,51.9,44.1,39.9,36.1,33.7,36.4,32.0,53.2,30.6,
-ImageLLM,[MultiModal-GPT](https://github.com/open-mmlab/Multimodal-GPT),LLaMA-7B,43.6,37.9,31.5,30.8,27.3,30.1,29.9,51.4,18.8,
-ImageLLM,[Otter](https://github.com/Luodian/Otter),LLaMA-7B,44.9,38.6,32.2,30.9,26.3,31.8,32.0,51.4,31.8,
-ImageLLM,[Otter](https://github.com/Luodian/Otter),MPT-7B,51.3,43.5,42.3,34.2,38.4,30.9,40.2,55.3,24.7,
-ImageLLM,[OpenFlamingo](https://github.com/mlfoundations/open_flamingo),LLaMA-7B,43.9,38.1,31.3,30.1,27.3,30.6,29.9,50.2,20.0,
-ImageLLM,[OpenFlamingo](https://github.com/mlfoundations/open_flamingo),MPT-7B,53.2,45.3,40.0,31.2,39.3,32.6,36.1,51.4,25.9,42.
-ImageLLM,[LLaMA-AdapterV2](https://github.com/OpenGVLab/LLaMA-Adapter),LLaMA-7B,45.2,38.5,29.3,33.0,29.7,35.5,39.2,52.0,24.7,
-ImageLLM,[GVT](https://github.com/TencentARC/GVT),Vicuna-7B,41.7,35.5,31.8,29.5,36.2,32.0,32.0,51.1,27.1,
-ImageLLM,[mPLUG-Owl](https://github.com/X-PLUG/mPLUG-Owl),LLaMA-7B,49.7,45.3,32.5,36.7,27.3,32.7,44.3,54.7,28.8,
-ImageLLM,[Kosmos-2](https://github.com/microsoft/unilm/tree/master/kosmos-2),Decoder Only 1.3B,63.4,57.1,58.5,44.0,41.4,37.9,55.7,60.7,25.9,
-VideoLLM,[VideoChat](https://github.com/OpenGVLab/Ask-Anything),Vicuna-7B,47.1,43.8,34.9,40.0,32.8,34.6,42.3,50.5,17.7,
-VideoLLM,[Video-ChatGPT](https://github.com/mbzuai-oryx/Video-ChatGPT),LLaMA-7B,37.2,31.4,33.2,28.4,35.5,29.5,23.7,42.3,25.9,
-VideoLLM,[Valley](https://github.com/RupertLuo/Valley),LLaMA-13B,39.3,32.9,31.6,27.9,24.2,30.1,27.8,43.8,11.8,
+Model Type,Model,Language Model,Avg. All,Avg. Img,Avg. Video,Scene Understanding,Instance Identity,Instance Attributes,Instance Localization,Instance Counting,Spatial Relation,Instance Interaction,Visual Reasoning,Text Recognition,Action Recognition,Action Prediction,Procedure Understanding
+LLM,[Flan-T5](https://huggingface.co/google/flan-t5-xl),Flan-T5-XL,27.7,27.3,28.6,23.0,29.0,32.8,31.8,20.5,31.8,33.0,18.2,19.4,23.2,34.9,25.4
+LLM,[Vicuna](https://huggingface.co/lmsys/vicuna-7b-v1.3),Vicuna-7B,28.5,28.2,29.5,23.4,30.7,29.7,30.9,30.8,28.6,29.8,18.5,13.4,27.3,34.5,23.8
+LLM,[LLaMA](https://research.facebook.com/publications/llama-open-and-efficient-foundation-language-models/),LLaMA-7B,26.8,26.6,27.3,26.3,27.4,26.2,28.3,25.1,28.8,19.2,37.0,9.0,33.0,23.1,26.2
+ImageLLM,[BLIP-2](https://github.com/salesforce/LAVIS),Flan-T5-XL,46.4,49.7,36.7,59.1,53.9,49.2,42.3,43.2,36.7,55.7,45.6,25.9,32.6,47.5,24.0
+ImageLLM,[InstructBLIP](https://github.com/salesforce/LAVIS),Flan-T5-XL,52.7,57.8,38.3,60.3,58.5,63.4,40.6,58.4,38.7,51.6,45.9,25.9,33.1,49.1,27.1
+ImageLLM,[InstructBLIP-Vicuna](https://github.com/salesforce/LAVIS),Vicuna-7B,53.4,58.8,38.1,60.2,58.9,65.6,43.6,57.2,40.3,52.6,47.7,43.5,34.5,49.6,23.1
+ImageLLM,[LLaVA](https://github.com/haotian-liu/LLaVA),LLaMA-7B,33.5,37.0,23.8,42.7,34.9,33.5,28.4,41.9,30.8,27.8,46.8,27.7,29.7,21.4,19.1
+ImageLLM,[MiniGPT-4](https://github.com/Vision-CAIR/MiniGPT-4),Flan-T5-XL,42.8,47.4,29.9,56.3,49.2,45.8,37.9,45.3,32.6,47.4,57.1,11.8,38.2,24.5,27.1
+ImageLLM,[VPGTrans](https://github.com/VPGTrans/VPGTrans),LLaMA-7B,39.1,41.8,31.4,51.9,44.1,39.9,36.1,33.7,36.4,32.0,53.2,30.6,39.5,24.3,31.9
+ImageLLM,[MultiModal-GPT](https://github.com/open-mmlab/Multimodal-GPT),LLaMA-7B,33.2,34.5,29.2,43.6,37.9,31.5,30.8,27.3,30.1,29.9,51.4,18.8,36.9,25.8,24.0
+ImageLLM,[Otter](https://github.com/Luodian/Otter),LLaMA-7B,33.9,35.2,30.4,44.9,38.6,32.2,30.9,26.3,31.8,32.0,51.4,31.8,37.9,27.2,24.8
+ImageLLM,[Otter](https://github.com/Luodian/Otter),MPT-7B,39.7,42.9,30.6,51.3,43.5,42.3,34.2,38.4,30.9,40.2,55.3,24.7,36.8,29.2,23.8
+ImageLLM,[OpenFlamingo](https://github.com/mlfoundations/open_flamingo),LLaMA-7B,33.1,34.5,29.3,43.9,38.1,31.3,30.1,27.3,30.6,29.9,50.2,20.0,37.2,25.4,24.2
+ImageLLM,[OpenFlamingo](https://github.com/mlfoundations/open_flamingo),MPT-7B,40.9,42.7,35.7,53.2,45.3,40.0,31.2,39.3,32.6,36.1,51.4,25.9,42.9,34.7,26.9
+ImageLLM,[LLaMA-AdapterV2](https://github.com/OpenGVLab/LLaMA-Adapter),LLaMA-7B,32.7,35.2,25.8,45.2,38.5,29.3,33.0,29.7,35.5,39.2,52.0,24.7,38.6,18.5,19.6
+ImageLLM,[GVT](https://github.com/TencentARC/GVT),Vicuna-7B,33.5,35.5,27.8,41.7,35.5,31.8,29.5,36.2,32.0,32.0,51.1,27.1,33.9,25.4,23.0
+ImageLLM,[mPLUG-Owl](https://github.com/X-PLUG/mPLUG-Owl),LLaMA-7B,34.0,37.9,23.0,49.7,45.3,32.5,36.7,27.3,32.7,44.3,54.7,28.8,26.7,17.9,26.5
+ImageLLM,[Kosmos-2](https://github.com/microsoft/unilm/tree/master/kosmos-2),Decoder Only 1.3B,50.0,54.4,37.5,63.4,57.1,58.5,44.0,41.4,37.9,55.7,60.7,25.9,41.3,40.4,27.0
+VideoLLM,[VideoChat](https://github.com/OpenGVLab/Ask-Anything),Vicuna-7B,37.6,39.0,33.7,47.1,43.8,34.9,40.0,32.8,34.6,42.3,50.5,17.7,34.9,36.4,27.3
+VideoLLM,[Video-ChatGPT](https://github.com/mbzuai-oryx/Video-ChatGPT),LLaMA-7B,31.2,33.9,23.5,37.2,31.4,33.2,28.4,35.5,29.5,23.7,42.3,25.9,27.6,21.3,21.1
+VideoLLM,[Valley](https://github.com/RupertLuo/Valley),LLaMA-13B,30.3,32.0,25.4,39.3,32.9,31.6,27.9,24.2,30.1,27.8,43.8,11.8,31.3,23.2,20.7
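The default leaderboard view that get_baseline_df builds from this file can be reproduced directly with pandas, which is a convenient way to spot-check the new average columns. A minimal sketch; column names are those in the new header row:

# Load the updated result.csv, sort by overall accuracy, and keep only the
# model-identity columns plus the three average columns (the default view).
import pandas as pd

df = pd.read_csv("./file/result.csv")
view = df.sort_values(by="Avg. All", ascending=False)[
    ["Model Type", "Model", "Language Model", "Avg. All", "Avg. Img", "Avg. Video"]
]
print(view.head())  # InstructBLIP-Vicuna (53.4) should come out on top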
src/__pycache__/utils_display.cpython-38.pyc
CHANGED
Binary files a/src/__pycache__/utils_display.cpython-38.pyc and b/src/__pycache__/utils_display.cpython-38.pyc differ
src/auto_leaderboard/__pycache__/model_metadata_type.cpython-38.pyc
CHANGED
Binary files a/src/auto_leaderboard/__pycache__/model_metadata_type.cpython-38.pyc and b/src/auto_leaderboard/__pycache__/model_metadata_type.cpython-38.pyc differ