Spaces:

bigcode
/

arena

Running

App Files Files Community

terryyz committed on Oct 1

Commit

9f4f2cd

1 Parent(s): 6553e1c

update

Browse files

Files changed (4) hide show

app.py +42 -2
completion.py +2 -3
ranking.py +18 -1
voting.py +45 -1

app.py CHANGED Viewed

@@ -1206,6 +1206,14 @@ def build_ui():
         """
         gr.Markdown("# 🌸 BigCodeArena - Start Your Vibe Coding!", elem_classes="center-text")
         # Main tabs
         with gr.Tabs():
@@ -1489,6 +1497,37 @@ def build_ui():
                 )
             # Ranking Tab
             ranking_table, ranking_last_update, ranking_timer = create_ranking_tab()
         # Event handlers
         # Create state variables for the run buttons
@@ -1949,9 +1988,10 @@ def build_ui():
             # Always show thank you message and clear everything immediately
             gr.Info("Thank you for your vote! 🎉 Your feedback has been recorded and new models have been selected.", duration=5)
             # revval the model names in the info message
-            gr.Info(f"Now you can see model names! 👀 \nModel A: {model_a}, Model B: {model_b}", duration=15)
             # Get new random models for the next session
             model_a, model_b = get_random_models()

         """
         gr.Markdown("# 🌸 BigCodeArena - Start Your Vibe Coding!", elem_classes="center-text")
+        gr.HTML("""
+        <p align="center" style="font-size: 1.2em;">
+            <a href="https://github.com/bigcode-project/bigcodearena">GitHub</a> |
+            <a href="https://huggingface.co/collections/bigcode/bigcodearena-68cd3a196e5147cc45f8ea3d">HF Collection</a> |
+            <a href="https://arxiv.org">ArXiv</a>
+        </p>
+        """)
         # Main tabs
         with gr.Tabs():
                 )
             # Ranking Tab
             ranking_table, ranking_last_update, ranking_timer = create_ranking_tab()
+            # Note Tab
+            with gr.Tab("📝 Note", id="note"):
+                gr.Markdown("## 📋 Important Information")
+                # Privacy Notice Section
+                gr.Markdown("### Privacy Notice")
+                gr.Markdown(
+                    """
+                    Your conversations and certain other personal information will be disclosed to the relevant AI providers
+                    and may otherwise be disclosed publicly to help support our community and advance AI research.
+                    **Do not submit to our Services any personal information or other sensitive information that you would not
+                    want to be shared publicly.** By continuing to use our Services, you acknowledge and direct us to engage
+                    in such sharing.
+                    """
+                )
+                # Citation Section
+                gr.Markdown("### Citation")
+                gr.Markdown(
+                    """
+                    If you use BigCodeArena in your research, please cite our work:
+                    ```bibtex
+                    @article{zhuo2025bigcodearena,
+                      title={BigCodeArena: Unveiling More Reliable Human Preferences in Code Generation via Execution},
+                      year={2025}
+                    }
+                    ```
+                    """
+                )
         # Event handlers
         # Create state variables for the run buttons
             # Always show thank you message and clear everything immediately
             gr.Info("Thank you for your vote! 🎉 Your feedback has been recorded and new models have been selected.", duration=5)
+            gr.Info(f"Model B: {model_b}", duration=20)
+            gr.Info(f"Model A: {model_a}", duration=20)
             # revval the model names in the info message
+            gr.Info(f"Now you can see model names! 👀", duration=20)
             # Get new random models for the next session
             model_a, model_b = get_random_models()

completion.py CHANGED Viewed

@@ -168,10 +168,9 @@ def get_endpoint(endpoint_list):
 # load config args from config yaml files
 def make_config(config_file: str) -> dict:
-    config_kwargs = {}
     with open(config_file, "r") as f:
-        config_kwargs = yaml.load(f, Loader=yaml.SafeLoader)
     return config_kwargs

 # load config args from config yaml files
 def make_config(config_file: str) -> dict:
     with open(config_file, "r") as f:
+        config_kwargs = yaml.safe_load(os.path.expandvars(f.read()))
     return config_kwargs

ranking.py CHANGED Viewed

@@ -50,7 +50,7 @@ def load_ranking_data(hf_token=None, force_reload=False):
                 download_mode="force_redownload",
             )
         else:
-            dataset = load_dataset(HF_DATASET_NAME, split="train", token=token)
         # Convert to pandas DataFrame - handle both Dataset and DatasetDict
         if hasattr(dataset, "to_pandas"):
             df = dataset.to_pandas()
@@ -60,6 +60,17 @@ def load_ranking_data(hf_token=None, force_reload=False):
         if df.empty:
             return pd.DataFrame()
         # Convert vote format for Elo calculation and count votes
         battle_data = []
         vote_counts = defaultdict(int)
@@ -146,6 +157,12 @@ def create_ranking_tab():
     """Create the ranking tab UI component"""
     with gr.Tab("📊 Ranking", id="ranking"):
         gr.Markdown("## 🏆 Model Leaderboard")
         ranking_table = gr.Dataframe(
             headers=[

                 download_mode="force_redownload",
             )
         else:
+            dataset = load_dataset(HF_DATASET_NAME, split="train", token=token, download_mode="force_redownload")
         # Convert to pandas DataFrame - handle both Dataset and DatasetDict
         if hasattr(dataset, "to_pandas"):
             df = dataset.to_pandas()
         if df.empty:
             return pd.DataFrame()
+        # Filter to only include samples where both models have code in their responses
+        # code_a and code_b should be non-empty lists
+        if 'code_a' in df.columns and 'code_b' in df.columns:
+            df = df[
+                df['code_a'].apply(lambda x: isinstance(x, list) and len(x) > 0) &
+                df['code_b'].apply(lambda x: isinstance(x, list) and len(x) > 0)
+            ]
+            if df.empty:
+                return pd.DataFrame()
         # Convert vote format for Elo calculation and count votes
         battle_data = []
         vote_counts = defaultdict(int)
     """Create the ranking tab UI component"""
     with gr.Tab("📊 Ranking", id="ranking"):
         gr.Markdown("## 🏆 Model Leaderboard")
+        gr.Markdown(
+            """
+            > **Note:** This ranking table shows raw results from user votes.
+            > More detailed analysis will be added manually.
+            """
+        )
         ranking_table = gr.Dataframe(
             headers=[

voting.py CHANGED Viewed

@@ -9,6 +9,7 @@ import datetime
 import os
 import threading
 from datasets import Dataset, load_dataset
 # HuggingFace dataset configuration
@@ -42,6 +43,43 @@ def serialize_interactions(interactions):
     return serialized
 def save_vote_to_hf(
     model_a, model_b, prompt, response_a, response_b, vote_result, interactions_a=None, interactions_b=None, conversation_a=None, conversation_b=None, hf_token=None
 ):
@@ -78,6 +116,10 @@ def save_vote_to_hf(
         serialized_action_a = serialize_interactions(action_a)
         serialized_action_b = serialize_interactions(action_b)
         # Create vote data with full conversation history and actions organized by turns
         # Each conversation is a list of messages in format: [{"role": "user"/"assistant", "content": "...", "action": [...]}, ...]
         # Actions are organized as list of lists: [[turn1_interactions], [turn2_interactions], ...]
@@ -90,12 +132,14 @@ def save_vote_to_hf(
             "action_b": serialized_action_b,  # Actions organized by turns for model B
             "conversation_a": serialized_conversation_a,  # Full conversation history for model A
             "conversation_b": serialized_conversation_b,  # Full conversation history for model B
             "vote": vote_result,  # "left", "right", "tie", "both_bad"
         }
         # Try to load existing dataset or create new one
         try:
-            dataset = load_dataset(HF_DATASET_NAME, split="train", token=token)
             # Convert to pandas DataFrame - handle both Dataset and DatasetDict
             if hasattr(dataset, "to_pandas"):
                 df = dataset.to_pandas()

 import os
 import threading
 from datasets import Dataset, load_dataset
+from sandbox.code_analyzer import extract_code_from_markdown
 # HuggingFace dataset configuration
     return serialized
+def extract_code_snippets_from_conversation(conversation):
+    """
+    Extract code snippets and install commands from all assistant messages in a conversation.
+    Args:
+        conversation: List of message dicts with 'role' and 'content' keys
+    Returns:
+        List of dicts containing code snippets and install commands for each turn
+    """
+    if not conversation:
+        return []
+    code_snippets = []
+    for msg in conversation:
+        if msg.get("role") == "assistant":
+            content = msg.get("content", "")
+            if content:
+                # Extract code from markdown in the assistant message
+                extract_result = extract_code_from_markdown(
+                    message=content,
+                    enable_auto_env=True
+                )
+                if extract_result is not None:
+                    code, code_language, env_selection, install_command = extract_result
+                    # Add code snippet info for this turn
+                    code_snippets.append({
+                        "code": code,
+                        "code_language": code_language,
+                        "install_command": install_command,
+                        "environment": str(env_selection) if env_selection else None
+                    })
+    return code_snippets
 def save_vote_to_hf(
     model_a, model_b, prompt, response_a, response_b, vote_result, interactions_a=None, interactions_b=None, conversation_a=None, conversation_b=None, hf_token=None
 ):
         serialized_action_a = serialize_interactions(action_a)
         serialized_action_b = serialize_interactions(action_b)
+        # Extract code snippets and install commands from conversations
+        code_a = extract_code_snippets_from_conversation(conversation_a or [])
+        code_b = extract_code_snippets_from_conversation(conversation_b or [])
         # Create vote data with full conversation history and actions organized by turns
         # Each conversation is a list of messages in format: [{"role": "user"/"assistant", "content": "...", "action": [...]}, ...]
         # Actions are organized as list of lists: [[turn1_interactions], [turn2_interactions], ...]
             "action_b": serialized_action_b,  # Actions organized by turns for model B
             "conversation_a": serialized_conversation_a,  # Full conversation history for model A
             "conversation_b": serialized_conversation_b,  # Full conversation history for model B
+            "code_a": code_a,  # List of code snippets and install commands for model A
+            "code_b": code_b,  # List of code snippets and install commands for model B
             "vote": vote_result,  # "left", "right", "tie", "both_bad"
         }
         # Try to load existing dataset or create new one
         try:
+            dataset = load_dataset(HF_DATASET_NAME, split="train", token=token, download_mode="force_redownload")
             # Convert to pandas DataFrame - handle both Dataset and DatasetDict
             if hasattr(dataset, "to_pandas"):
                 df = dataset.to_pandas()