Synced repo using 'sync_with_huggingface' Github Action
- data_handler.py +22 -3
- model_handler.py +40 -45
data_handler.py
CHANGED

@@ -34,7 +34,23 @@ def upload_test_data(df_state):
 def import_data(file):
     if file is not None:
         try:
-
+            loaded_json = json.load(open(file.name))
+
+            # Handle various common JSON structures
+            if isinstance(loaded_json, list):
+                # Top-level list
+                df = pd.json_normalize(loaded_json, sep=".")
+            elif isinstance(loaded_json, dict):
+                # Dictionary could contain a "data" key or not
+                if "data" in loaded_json and isinstance(loaded_json["data"], list):
+                    df = pd.json_normalize(loaded_json["data"], sep=".")
+                else:
+                    # Flatten the top-level dictionary
+                    df = pd.json_normalize(loaded_json, sep=".")
+            else:
+                raise ValueError("Unsupported JSON structure. Please provide a list or object.")
+
+            df_state.value = df

             return {
                 df_display: gr.update(value=df_state.value, visible=True),
@@ -42,10 +58,13 @@ def upload_test_data(df_state):
                 df_state: df_state,
                 error_display: gr.update(visible=False)  # Hide previous errors
             }
-        except json.JSONDecodeError
+        except json.JSONDecodeError:
             return {
                 df_display: gr.update(visible=False),
-                error_display: gr.update(
+                error_display: gr.update(
+                    value="**Error:** Invalid JSON file. Please upload a valid JSON file.",
+                    visible=True
+                ),
                 import_button: gr.update(visible=True),
                 df_state: None
             }
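Note on the new import logic: `pd.json_normalize` gives all three accepted JSON shapes a uniform tabular form, with nested keys joined by `"."`. A minimal sketch of what each branch produces (the sample data below is made up for illustration):

```python
import pandas as pd

# Top-level list: one row per element; nested keys become "meta.source" etc.
records = [{"id": 1, "meta": {"source": "a"}}, {"id": 2, "meta": {"source": "b"}}]
print(pd.json_normalize(records, sep="."))

# Dict with a "data" list: only the list is normalized; sibling keys
# such as "version" are dropped, matching the branch in the diff.
payload = {"version": 3, "data": records}
print(pd.json_normalize(payload["data"], sep="."))

# Any other dict: flattened into a single-row DataFrame.
print(pd.json_normalize({"id": 1, "meta": {"source": "a"}}, sep="."))
```

One small nit: `json.load(open(file.name))` never closes the file handle; `with open(file.name) as f: loaded_json = json.load(f)` would be tidier.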
model_handler.py
CHANGED

@@ -30,11 +30,9 @@ def select_evaluators(criteria_group, df_state, prompt_state, save_prompt_button
             return {}
         return model_data

-
     model_data = load_model_data()
     model_choices = list(model_data.keys())

-    # Define dropdowns using model choices
     with gr.Row(visible=False) as evaluator_row:
         judge_a_dropdown = gr.Dropdown(
             choices=["Selene"], label="Judge A", value="Selene", interactive=False
@@ -43,26 +41,20 @@ def select_evaluators(criteria_group, df_state, prompt_state, save_prompt_button
             choices=model_choices, label="Judge B", value="Claude 3.5 Sonnet"
         )

-    # A Markdown for "Evaluation in progress..." and final heading
     loading_spinner = gr.Markdown("Evaluation in progress...", visible=False)

-    # NEW: define a Dataframe to show final evaluation results, like in data_handler
     evaluation_result_df = gr.Dataframe(
         visible=False,
         label="Evaluation Results",
         elem_classes=["truncate_cells"]
     )

-    # Define the three-button row AFTER the markdown,
-    # so it appears *below* the "Evaluation Complete" message.
     with gr.Row(visible=False) as evaluation_nav_row:
         back_to_criteria_button = gr.Button("← Back to Criteria", visible=False)
         run_evaluation_button = gr.Button("Run Evaluation", visible=False)
         analyze_results_button = gr.Button("Analyze Results", visible=False)

-    # Show evaluator selection UI
     def show_evaluator_selection(current_df):
-        # Hide Criteria UI and show Evaluator UI
         updates = {
             criteria_group: gr.update(visible=False),
             save_prompt_button: gr.update(visible=False),
@@ -70,7 +62,6 @@ def select_evaluators(criteria_group, df_state, prompt_state, save_prompt_button
             evaluation_nav_row: gr.update(visible=True),
             run_evaluation_button: gr.update(visible=True),
             back_to_criteria_button: gr.update(visible=True),
-            # By default, hide "Analyze Results" and the result dataframe
             analyze_results_button: gr.update(visible=False),
             evaluation_result_df: gr.update(visible=False),
         }
@@ -79,14 +70,12 @@ def select_evaluators(criteria_group, df_state, prompt_state, save_prompt_button
             and hasattr(current_df.value, "attrs")
             and current_df.value.attrs.get("eval_done")
         ):
-            # If a previous evaluation was completed, show the heading + dataframe
             updates[loading_spinner] = gr.update(value="### Evaluation Complete", visible=True)
             updates[evaluation_result_df] = gr.update(value=current_df.value, visible=True)
             updates[analyze_results_button] = gr.update(visible=True)

         return updates

-    # Note that we pass df_state to show_evaluator_selection
     save_prompt_button.click(
         fn=show_evaluator_selection,
         inputs=[df_state],
@@ -103,7 +92,6 @@ def select_evaluators(criteria_group, df_state, prompt_state, save_prompt_button
         ],
     )

-    # Back to Criteria
     def back_to_criteria():
         return {
             save_prompt_button: gr.update(visible=True),
@@ -111,7 +99,6 @@ def select_evaluators(criteria_group, df_state, prompt_state, save_prompt_button
             evaluator_row: gr.update(visible=False),
             evaluation_nav_row: gr.update(visible=False),
             run_evaluation_button: gr.update(visible=False),
-            # Hide the "Evaluation Complete" markdown
             loading_spinner: gr.update(visible=False),
             analyze_results_button: gr.update(visible=False),
             evaluation_result_df: gr.update(visible=False),
@@ -134,37 +121,39 @@ def select_evaluators(criteria_group, df_state, prompt_state, save_prompt_button

     # Run evaluation
     def run_evaluation(judge_a, judge_b):
-        #
-        yield {
-
-
+        # 1) Immediately hide old results and disable navigation while running
+        yield {
+            loading_spinner: gr.update(value="Evaluation in progress...", visible=True),
+            evaluation_result_df: gr.update(visible=False),
+            analyze_results_button: gr.update(visible=False),
+            run_evaluation_button: gr.update(interactive=False),
+            back_to_criteria_button: gr.update(interactive=False),
+        }
+
+        # Perform the actual evaluation
         template_str = prompt_state.value['template']
         mappings = prompt_state.value['mappings']
         evaluation_criteria = mappings.get('evaluation_criteria')
-
-        # Create Jinja template for Judge B only
+
         template = Template(template_str)
-
-        # Submit prompt to chosen models
+
         for index, row in df_state.value.iterrows():
-            # Create a context dictionary for this row
             context = {}
             model_context = None
             expected_output = None
-
+
             for key, column in mappings.items():
                 if key == 'evaluation_criteria':
-                    continue
+                    continue
                 elif column and column != 'None':
                     context[key] = str(row[column])
                     if column == 'model_context':
                         model_context = str(row[column])
                     elif column == 'expected_model_output':
                         expected_output = str(row[column])
-
-            #
+
+            # Render the template for Judge B
             current_prompt = template.render(**context)
-            # For Judge A (Atla Selene), call get_atla_response directly
             response_a = get_atla_response(
                 "atla-selene",
                 model_input=context.get('model_input'),
@@ -174,47 +163,53 @@ def select_evaluators(criteria_group, df_state, prompt_state, save_prompt_button
                 evaluation_criteria=evaluation_criteria
             )
             response_b = get_model_response(
-                judge_b,
-                model_data.get(judge_b),
+                judge_b,
+                model_data.get(judge_b),
                 current_prompt
             )
-
-            # Parse
-            if isinstance(response_a, dict):
+
+            # Parse ATLA response
+            if isinstance(response_a, dict):
                 score_a, critique_a = response_a['score'], response_a['critique']
-            else:
+            else:
                 score_a, critique_a = "Error", response_a
-
+
             score_b, critique_b = parse_model_response(response_b)
-
+
             df_state.value.loc[index, 'score_a'] = score_a
             df_state.value.loc[index, 'critique_a'] = critique_a
             df_state.value.loc[index, 'score_b'] = score_b
             df_state.value.loc[index, 'critique_b'] = critique_b
-
+
         import time
-        time.sleep(2)
-
-        # Hide
+        time.sleep(2)  # simulating time-consuming operations
+
+        # 2) Hide spinner
        yield {loading_spinner: gr.update(visible=False)}
-
-        # Show
+
+        # 3) Show final results and re-enable buttons
        yield {
            loading_spinner: gr.update(value="### Evaluation Complete", visible=True),
            evaluation_result_df: gr.update(value=df_state.value, visible=True),
            analyze_results_button: gr.update(visible=True),
+            run_evaluation_button: gr.update(interactive=True),
+            back_to_criteria_button: gr.update(interactive=True),
        }

-        # Store the "already run evaluation" flag safely in .attrs
        if hasattr(df_state.value, "attrs"):
            df_state.value.attrs["eval_done"] = True

+    # Include back_to_criteria_button & run_evaluation_button in outputs so we can update them
    run_evaluation_button.click(
        fn=run_evaluation,
        inputs=[judge_a_dropdown, judge_b_dropdown],
-        outputs=[
+        outputs=[
+            loading_spinner,
+            evaluation_result_df,
+            analyze_results_button,
+            run_evaluation_button,
+            back_to_criteria_button,
+        ],
    )

-
-
    return model_selection_group, df_state, analyze_results_button
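The reworked `run_evaluation` leans on Gradio's generator-handler pattern: a function registered with `.click()` may `yield` several times, each time a dict mapping components to `gr.update(...)`, and Gradio applies each dict as an intermediate UI state. The catch, which the new `outputs=[...]` list addresses, is that every component updated by any yield must be declared in `outputs`, otherwise the update is rejected at runtime. A minimal self-contained sketch of the same pattern (component names here are illustrative, not taken from this repo):

```python
import time
import gradio as gr

with gr.Blocks() as demo:
    status = gr.Markdown(visible=False)
    results = gr.Dataframe(visible=False)
    run_btn = gr.Button("Run")

    def run():
        # State 1: show a spinner message and lock the button.
        yield {
            status: gr.update(value="Working...", visible=True),
            run_btn: gr.update(interactive=False),
        }
        time.sleep(1)  # stand-in for the real per-row evaluation work
        # State 2: publish results and unlock the button.
        yield {
            status: gr.update(value="### Done", visible=True),
            results: gr.update(value=[[1, 2]], visible=True),
            run_btn: gr.update(interactive=True),
        }

    # Every component touched by any yield above must appear here.
    run_btn.click(fn=run, inputs=[], outputs=[status, results, run_btn])

demo.launch()
```

Toggling `interactive=False` during the first yield, as the commit does for `run_evaluation_button` and `back_to_criteria_button`, prevents a second click from starting an overlapping evaluation while one is in flight.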
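A closing note on the `eval_done` flag: `DataFrame.attrs` is pandas' (still experimental) slot for frame-level metadata, and it is not preserved by every operation that returns a new frame, which is presumably why both sites guard with `hasattr` and `.get` instead of assuming the key exists. A quick illustration:

```python
import pandas as pd

df = pd.DataFrame({"score_a": [5]})

# Set the flag after a successful run...
if hasattr(df, "attrs"):
    df.attrs["eval_done"] = True

# ...and read it back defensively when deciding whether to re-show results.
print(df.attrs.get("eval_done"))         # True
print(df.copy().attrs.get("eval_done"))  # copy() carries attrs along
```

The `hasattr` guard only matters on pandas < 1.0, where `attrs` did not exist; on current pandas it always passes.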