Synced repo using 'sync_with_huggingface' Github Action
- data_handler.py +22 -3
- model_handler.py +40 -45
data_handler.py
CHANGED

@@ -34,7 +34,23 @@ def upload_test_data(df_state):
 def import_data(file):
     if file is not None:
         try:
-
+            loaded_json = json.load(open(file.name))
+
+            # Handle various common JSON structures
+            if isinstance(loaded_json, list):
+                # Top-level list
+                df = pd.json_normalize(loaded_json, sep=".")
+            elif isinstance(loaded_json, dict):
+                # Dictionary could contain a "data" key or not
+                if "data" in loaded_json and isinstance(loaded_json["data"], list):
+                    df = pd.json_normalize(loaded_json["data"], sep=".")
+                else:
+                    # Flatten the top-level dictionary
+                    df = pd.json_normalize(loaded_json, sep=".")
+            else:
+                raise ValueError("Unsupported JSON structure. Please provide a list or object.")
+
+            df_state.value = df

             return {
                 df_display: gr.update(value=df_state.value, visible=True),
@@ -42,10 +58,13 @@ def upload_test_data(df_state):
                 df_state: df_state,
                 error_display: gr.update(visible=False)  # Hide previous errors
             }
-        except json.JSONDecodeError
+        except json.JSONDecodeError:
             return {
                 df_display: gr.update(visible=False),
-                error_display: gr.update(
+                error_display: gr.update(
+                    value="**Error:** Invalid JSON file. Please upload a valid JSON file.",
+                    visible=True
+                ),
                 import_button: gr.update(visible=True),
                 df_state: None
             }
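Note on the new import logic: `pd.json_normalize` gives all three accepted JSON shapes a uniform tabular form, with nested keys joined by `"."`. A minimal sketch of what each branch produces (the sample data below is made up for illustration):

```python
import pandas as pd

# Top-level list: one row per element; nested keys become "meta.source" etc.
records = [{"id": 1, "meta": {"source": "a"}}, {"id": 2, "meta": {"source": "b"}}]
print(pd.json_normalize(records, sep="."))

# Dict with a "data" list: only the list is normalized; sibling keys
# such as "version" are dropped, matching the branch in the diff.
payload = {"version": 3, "data": records}
print(pd.json_normalize(payload["data"], sep="."))

# Any other dict: flattened into a single-row DataFrame.
print(pd.json_normalize({"id": 1, "meta": {"source": "a"}}, sep="."))
```

One small nit: `json.load(open(file.name))` never closes the file handle; `with open(file.name) as f: loaded_json = json.load(f)` would be tidier.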
model_handler.py
CHANGED

@@ -30,11 +30,9 @@ def select_evaluators(criteria_group, df_state, prompt_state, save_prompt_button
             return {}
         return model_data

-
     model_data = load_model_data()
     model_choices = list(model_data.keys())

-    # Define dropdowns using model choices
     with gr.Row(visible=False) as evaluator_row:
         judge_a_dropdown = gr.Dropdown(
             choices=["Selene"], label="Judge A", value="Selene", interactive=False
@@ -43,26 +41,20 @@ def select_evaluators(criteria_group, df_state, prompt_state, save_prompt_button
             choices=model_choices, label="Judge B", value="Claude 3.5 Sonnet"
         )

-    # A Markdown for "Evaluation in progress..." and final heading
     loading_spinner = gr.Markdown("Evaluation in progress...", visible=False)

-    # NEW: define a Dataframe to show final evaluation results, like in data_handler
     evaluation_result_df = gr.Dataframe(
         visible=False,
         label="Evaluation Results",
         elem_classes=["truncate_cells"]
     )

-    # Define the three-button row AFTER the markdown,
-    # so it appears *below* the "Evaluation Complete" message.
     with gr.Row(visible=False) as evaluation_nav_row:
         back_to_criteria_button = gr.Button("← Back to Criteria", visible=False)
         run_evaluation_button = gr.Button("Run Evaluation", visible=False)
         analyze_results_button = gr.Button("Analyze Results", visible=False)

-    # Show evaluator selection UI
     def show_evaluator_selection(current_df):
-        # Hide Criteria UI and show Evaluator UI
         updates = {
             criteria_group: gr.update(visible=False),
             save_prompt_button: gr.update(visible=False),
@@ -70,7 +62,6 @@ def select_evaluators(criteria_group, df_state, prompt_state, save_prompt_button
             evaluation_nav_row: gr.update(visible=True),
             run_evaluation_button: gr.update(visible=True),
             back_to_criteria_button: gr.update(visible=True),
-            # By default, hide "Analyze Results" and the result dataframe
             analyze_results_button: gr.update(visible=False),
             evaluation_result_df: gr.update(visible=False),
         }
@@ -79,14 +70,12 @@ def select_evaluators(criteria_group, df_state, prompt_state, save_prompt_button
             and hasattr(current_df.value, "attrs")
             and current_df.value.attrs.get("eval_done")
         ):
-            # If a previous evaluation was completed, show the heading + dataframe
             updates[loading_spinner] = gr.update(value="### Evaluation Complete", visible=True)
             updates[evaluation_result_df] = gr.update(value=current_df.value, visible=True)
             updates[analyze_results_button] = gr.update(visible=True)

         return updates

-    # Note that we pass df_state to show_evaluator_selection
     save_prompt_button.click(
         fn=show_evaluator_selection,
         inputs=[df_state],
@@ -103,7 +92,6 @@ def select_evaluators(criteria_group, df_state, prompt_state, save_prompt_button
         ],
     )

-    # Back to Criteria
     def back_to_criteria():
         return {
             save_prompt_button: gr.update(visible=True),
@@ -111,7 +99,6 @@ def select_evaluators(criteria_group, df_state, prompt_state, save_prompt_button
             evaluator_row: gr.update(visible=False),
             evaluation_nav_row: gr.update(visible=False),
             run_evaluation_button: gr.update(visible=False),
-            # Hide the "Evaluation Complete" markdown
             loading_spinner: gr.update(visible=False),
             analyze_results_button: gr.update(visible=False),
             evaluation_result_df: gr.update(visible=False),
@@ -134,37 +121,39 @@ def select_evaluators(criteria_group, df_state, prompt_state, save_prompt_button

     # Run evaluation
     def run_evaluation(judge_a, judge_b):
-        #
-        yield {
-
-
+        # 1) Immediately hide old results and disable navigation while running
+        yield {
+            loading_spinner: gr.update(value="Evaluation in progress...", visible=True),
+            evaluation_result_df: gr.update(visible=False),
+            analyze_results_button: gr.update(visible=False),
+            run_evaluation_button: gr.update(interactive=False),
+            back_to_criteria_button: gr.update(interactive=False),
+        }
+
+        # Perform the actual evaluation
         template_str = prompt_state.value['template']
         mappings = prompt_state.value['mappings']
         evaluation_criteria = mappings.get('evaluation_criteria')
-
-        # Create Jinja template for Judge B only
+
         template = Template(template_str)
-
-        # Submit prompt to chosen models
+
         for index, row in df_state.value.iterrows():
-            # Create a context dictionary for this row
             context = {}
             model_context = None
             expected_output = None
-
+
             for key, column in mappings.items():
                 if key == 'evaluation_criteria':
-                    continue
+                    continue
                 elif column and column != 'None':
                     context[key] = str(row[column])
                     if column == 'model_context':
                         model_context = str(row[column])
                     elif column == 'expected_model_output':
                         expected_output = str(row[column])
-
-            #
+
+            # Render the template for Judge B
             current_prompt = template.render(**context)
-            # For Judge A (Atla Selene), call get_atla_response directly
             response_a = get_atla_response(
                 "atla-selene",
                 model_input=context.get('model_input'),
@@ -174,47 +163,53 @@ def select_evaluators(criteria_group, df_state, prompt_state, save_prompt_button
                 evaluation_criteria=evaluation_criteria
             )
             response_b = get_model_response(
-                judge_b,
-                model_data.get(judge_b),
+                judge_b,
+                model_data.get(judge_b),
                 current_prompt
             )
-
-            # Parse
-            if isinstance(response_a, dict):
+
+            # Parse ATLA response
+            if isinstance(response_a, dict):
                 score_a, critique_a = response_a['score'], response_a['critique']
-            else:
+            else:
                 score_a, critique_a = "Error", response_a
-
+
             score_b, critique_b = parse_model_response(response_b)
-
+
             df_state.value.loc[index, 'score_a'] = score_a
             df_state.value.loc[index, 'critique_a'] = critique_a
             df_state.value.loc[index, 'score_b'] = score_b
             df_state.value.loc[index, 'critique_b'] = critique_b
-
+
         import time
-        time.sleep(2)
-
-        # Hide
+        time.sleep(2)  # simulating time-consuming operations
+
+        # 2) Hide spinner
        yield {loading_spinner: gr.update(visible=False)}
-
-        # Show
+
+        # 3) Show final results and re-enable buttons
        yield {
            loading_spinner: gr.update(value="### Evaluation Complete", visible=True),
            evaluation_result_df: gr.update(value=df_state.value, visible=True),
            analyze_results_button: gr.update(visible=True),
+            run_evaluation_button: gr.update(interactive=True),
+            back_to_criteria_button: gr.update(interactive=True),
        }

-        # Store the "already run evaluation" flag safely in .attrs
        if hasattr(df_state.value, "attrs"):
            df_state.value.attrs["eval_done"] = True

+    # Include back_to_criteria_button & run_evaluation_button in outputs so we can update them
    run_evaluation_button.click(
        fn=run_evaluation,
        inputs=[judge_a_dropdown, judge_b_dropdown],
-        outputs=[
+        outputs=[
+            loading_spinner,
+            evaluation_result_df,
+            analyze_results_button,
+            run_evaluation_button,
+            back_to_criteria_button,
+        ],
    )

-
-
    return model_selection_group, df_state, analyze_results_button
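The reworked `run_evaluation` leans on Gradio's generator-handler pattern: a function registered with `.click()` may `yield` several times, each time a dict mapping components to `gr.update(...)`, and Gradio applies each dict as an intermediate UI state. The catch, which the new `outputs=[...]` list addresses, is that every component updated by any yield must be declared in `outputs`, otherwise the update is rejected at runtime. A minimal self-contained sketch of the same pattern (component names here are illustrative, not taken from this repo):

```python
import time
import gradio as gr

with gr.Blocks() as demo:
    status = gr.Markdown(visible=False)
    results = gr.Dataframe(visible=False)
    run_btn = gr.Button("Run")

    def run():
        # State 1: show a spinner message and lock the button.
        yield {
            status: gr.update(value="Working...", visible=True),
            run_btn: gr.update(interactive=False),
        }
        time.sleep(1)  # stand-in for the real per-row evaluation work
        # State 2: publish results and unlock the button.
        yield {
            status: gr.update(value="### Done", visible=True),
            results: gr.update(value=[[1, 2]], visible=True),
            run_btn: gr.update(interactive=True),
        }

    # Every component touched by any yield above must appear here.
    run_btn.click(fn=run, inputs=[], outputs=[status, results, run_btn])

demo.launch()
```

Toggling `interactive=False` during the first yield, as the commit does for `run_evaluation_button` and `back_to_criteria_button`, prevents a second click from starting an overlapping evaluation while one is in flight.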
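A closing note on the `eval_done` flag: `DataFrame.attrs` is pandas' (still experimental) slot for frame-level metadata, and it is not preserved by every operation that returns a new frame, which is presumably why both sites guard with `hasattr` and `.get` instead of assuming the key exists. A quick illustration:

```python
import pandas as pd

df = pd.DataFrame({"score_a": [5]})

# Set the flag after a successful run...
if hasattr(df, "attrs"):
    df.attrs["eval_done"] = True

# ...and read it back defensively when deciding whether to re-show results.
print(df.attrs.get("eval_done"))         # True
print(df.copy().attrs.get("eval_done"))  # copy() carries attrs along
```

The `hasattr` guard only matters on pandas < 1.0, where `attrs` did not exist; on current pandas it always passes.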