Spaces:
Running
Running
terryyz
committed on
Commit
·
9f4f2cd
1
Parent(s):
6553e1c
update
Browse files
- app.py +42 -2
- completion.py +2 -3
- ranking.py +18 -1
- voting.py +45 -1
app.py
CHANGED
|
@@ -1206,6 +1206,14 @@ def build_ui():
|
|
| 1206 |
"""
|
| 1207 |
|
| 1208 |
gr.Markdown("# πΈ BigCodeArena - Start Your Vibe Coding!", elem_classes="center-text")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1209 |
|
| 1210 |
# Main tabs
|
| 1211 |
with gr.Tabs():
|
|
@@ -1489,6 +1497,37 @@ def build_ui():
|
|
| 1489 |
)
|
| 1490 |
# Ranking Tab
|
| 1491 |
ranking_table, ranking_last_update, ranking_timer = create_ranking_tab()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1492 |
|
| 1493 |
# Event handlers
|
| 1494 |
# Create state variables for the run buttons
|
|
@@ -1949,9 +1988,10 @@ def build_ui():
|
|
| 1949 |
|
| 1950 |
# Always show thank you message and clear everything immediately
|
| 1951 |
gr.Info("Thank you for your vote! π Your feedback has been recorded and new models have been selected.", duration=5)
|
| 1952 |
-
|
|
|
|
| 1953 |
# reveal the model names in the info message
|
| 1954 |
-
gr.Info(f"Now you can see model names! π
|
| 1955 |
|
| 1956 |
# Get new random models for the next session
|
| 1957 |
model_a, model_b = get_random_models()
|
|
|
|
| 1206 |
"""
|
| 1207 |
|
| 1208 |
gr.Markdown("# πΈ BigCodeArena - Start Your Vibe Coding!", elem_classes="center-text")
|
| 1209 |
+
|
| 1210 |
+
gr.HTML("""
|
| 1211 |
+
<p align="center" style="font-size: 1.2em;">
|
| 1212 |
+
<a href="https://github.com/bigcode-project/bigcodearena">GitHub</a> |
|
| 1213 |
+
<a href="https://huggingface.co/collections/bigcode/bigcodearena-68cd3a196e5147cc45f8ea3d">HF Collection</a> |
|
| 1214 |
+
<a href="https://arxiv.org">ArXiv</a>
|
| 1215 |
+
</p>
|
| 1216 |
+
""")
|
| 1217 |
|
| 1218 |
# Main tabs
|
| 1219 |
with gr.Tabs():
|
|
|
|
| 1497 |
)
|
| 1498 |
# Ranking Tab
|
| 1499 |
ranking_table, ranking_last_update, ranking_timer = create_ranking_tab()
|
| 1500 |
+
|
| 1501 |
+
# Note Tab
|
| 1502 |
+
with gr.Tab("π Note", id="note"):
|
| 1503 |
+
gr.Markdown("## π Important Information")
|
| 1504 |
+
|
| 1505 |
+
# Privacy Notice Section
|
| 1506 |
+
gr.Markdown("### Privacy Notice")
|
| 1507 |
+
gr.Markdown(
|
| 1508 |
+
"""
|
| 1509 |
+
Your conversations and certain other personal information will be disclosed to the relevant AI providers
|
| 1510 |
+
and may otherwise be disclosed publicly to help support our community and advance AI research.
|
| 1511 |
+
|
| 1512 |
+
**Do not submit to our Services any personal information or other sensitive information that you would not
|
| 1513 |
+
want to be shared publicly.** By continuing to use our Services, you acknowledge and direct us to engage
|
| 1514 |
+
in such sharing.
|
| 1515 |
+
"""
|
| 1516 |
+
)
|
| 1517 |
+
|
| 1518 |
+
# Citation Section
|
| 1519 |
+
gr.Markdown("### Citation")
|
| 1520 |
+
gr.Markdown(
|
| 1521 |
+
"""
|
| 1522 |
+
If you use BigCodeArena in your research, please cite our work:
|
| 1523 |
+
```bibtex
|
| 1524 |
+
@article{zhuo2025bigcodearena,
|
| 1525 |
+
title={BigCodeArena: Unveiling More Reliable Human Preferences in Code Generation via Execution},
|
| 1526 |
+
year={2025}
|
| 1527 |
+
}
|
| 1528 |
+
```
|
| 1529 |
+
"""
|
| 1530 |
+
)
|
| 1531 |
|
| 1532 |
# Event handlers
|
| 1533 |
# Create state variables for the run buttons
|
|
|
|
| 1988 |
|
| 1989 |
# Always show thank you message and clear everything immediately
|
| 1990 |
gr.Info("Thank you for your vote! π Your feedback has been recorded and new models have been selected.", duration=5)
|
| 1991 |
+
gr.Info(f"Model B: {model_b}", duration=20)
|
| 1992 |
+
gr.Info(f"Model A: {model_a}", duration=20)
|
| 1993 |
# reveal the model names in the info message
|
| 1994 |
+
gr.Info(f"Now you can see model names! π", duration=20)
|
| 1995 |
|
| 1996 |
# Get new random models for the next session
|
| 1997 |
model_a, model_b = get_random_models()
|
completion.py
CHANGED
|
@@ -168,10 +168,9 @@ def get_endpoint(endpoint_list):
|
|
| 168 |
|
| 169 |
# load config args from config yaml files
|
| 170 |
def make_config(config_file: str) -> dict:
|
| 171 |
-
config_kwargs = {}
|
| 172 |
with open(config_file, "r") as f:
|
| 173 |
-
config_kwargs = yaml.
|
| 174 |
-
|
| 175 |
return config_kwargs
|
| 176 |
|
| 177 |
|
|
|
|
| 168 |
|
| 169 |
# load config args from config yaml files
|
| 170 |
def make_config(config_file: str) -> dict:
    """Load configuration keyword arguments from a YAML file.

    Environment-variable references (``$VAR`` / ``${VAR}``) found in the raw
    file text are expanded with ``os.path.expandvars`` before the YAML is
    parsed.

    Args:
        config_file: Path to the YAML configuration file.

    Returns:
        The parsed configuration. An empty (or comment-only) file yields an
        empty dict instead of ``None``.
    """
    # YAML is UTF-8 by convention; do not depend on the platform default.
    with open(config_file, "r", encoding="utf-8") as f:
        raw_text = f.read()

    # NOTE(review): expansion runs on the raw text, so an environment value
    # containing YAML syntax (":", "#", newlines) can change the parsed
    # structure -- confirm only trusted variables are referenced.
    config_kwargs = yaml.safe_load(os.path.expandvars(raw_text))

    # safe_load returns None for an empty document; keep the dict contract.
    return {} if config_kwargs is None else config_kwargs
|
| 175 |
|
| 176 |
|
ranking.py
CHANGED
|
@@ -50,7 +50,7 @@ def load_ranking_data(hf_token=None, force_reload=False):
|
|
| 50 |
download_mode="force_redownload",
|
| 51 |
)
|
| 52 |
else:
|
| 53 |
-
dataset = load_dataset(HF_DATASET_NAME, split="train", token=token)
|
| 54 |
# Convert to pandas DataFrame - handle both Dataset and DatasetDict
|
| 55 |
if hasattr(dataset, "to_pandas"):
|
| 56 |
df = dataset.to_pandas()
|
|
@@ -60,6 +60,17 @@ def load_ranking_data(hf_token=None, force_reload=False):
|
|
| 60 |
if df.empty:
|
| 61 |
return pd.DataFrame()
|
| 62 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 63 |
# Convert vote format for Elo calculation and count votes
|
| 64 |
battle_data = []
|
| 65 |
vote_counts = defaultdict(int)
|
|
@@ -146,6 +157,12 @@ def create_ranking_tab():
|
|
| 146 |
"""Create the ranking tab UI component"""
|
| 147 |
with gr.Tab("π Ranking", id="ranking"):
|
| 148 |
gr.Markdown("## π Model Leaderboard")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 149 |
|
| 150 |
ranking_table = gr.Dataframe(
|
| 151 |
headers=[
|
|
|
|
| 50 |
download_mode="force_redownload",
|
| 51 |
)
|
| 52 |
else:
|
| 53 |
+
dataset = load_dataset(HF_DATASET_NAME, split="train", token=token, download_mode="force_redownload")
|
| 54 |
# Convert to pandas DataFrame - handle both Dataset and DatasetDict
|
| 55 |
if hasattr(dataset, "to_pandas"):
|
| 56 |
df = dataset.to_pandas()
|
|
|
|
| 60 |
if df.empty:
|
| 61 |
return pd.DataFrame()
|
| 62 |
|
| 63 |
+
# Filter to only include samples where both models have code in their responses
|
| 64 |
+
# code_a and code_b should be non-empty lists
|
| 65 |
+
if 'code_a' in df.columns and 'code_b' in df.columns:
|
| 66 |
+
df = df[
|
| 67 |
+
df['code_a'].apply(lambda x: isinstance(x, list) and len(x) > 0) &
|
| 68 |
+
df['code_b'].apply(lambda x: isinstance(x, list) and len(x) > 0)
|
| 69 |
+
]
|
| 70 |
+
|
| 71 |
+
if df.empty:
|
| 72 |
+
return pd.DataFrame()
|
| 73 |
+
|
| 74 |
# Convert vote format for Elo calculation and count votes
|
| 75 |
battle_data = []
|
| 76 |
vote_counts = defaultdict(int)
|
|
|
|
| 157 |
"""Create the ranking tab UI component"""
|
| 158 |
with gr.Tab("π Ranking", id="ranking"):
|
| 159 |
gr.Markdown("## π Model Leaderboard")
|
| 160 |
+
gr.Markdown(
|
| 161 |
+
"""
|
| 162 |
+
> **Note:** This ranking table shows raw results from user votes.
|
| 163 |
+
> More detailed analysis will be added manually.
|
| 164 |
+
"""
|
| 165 |
+
)
|
| 166 |
|
| 167 |
ranking_table = gr.Dataframe(
|
| 168 |
headers=[
|
voting.py
CHANGED
|
@@ -9,6 +9,7 @@ import datetime
|
|
| 9 |
import os
|
| 10 |
import threading
|
| 11 |
from datasets import Dataset, load_dataset
|
|
|
|
| 12 |
|
| 13 |
|
| 14 |
# HuggingFace dataset configuration
|
|
@@ -42,6 +43,43 @@ def serialize_interactions(interactions):
|
|
| 42 |
return serialized
|
| 43 |
|
| 44 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 45 |
def save_vote_to_hf(
|
| 46 |
model_a, model_b, prompt, response_a, response_b, vote_result, interactions_a=None, interactions_b=None, conversation_a=None, conversation_b=None, hf_token=None
|
| 47 |
):
|
|
@@ -78,6 +116,10 @@ def save_vote_to_hf(
|
|
| 78 |
serialized_action_a = serialize_interactions(action_a)
|
| 79 |
serialized_action_b = serialize_interactions(action_b)
|
| 80 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 81 |
# Create vote data with full conversation history and actions organized by turns
|
| 82 |
# Each conversation is a list of messages in format: [{"role": "user"/"assistant", "content": "...", "action": [...]}, ...]
|
| 83 |
# Actions are organized as list of lists: [[turn1_interactions], [turn2_interactions], ...]
|
|
@@ -90,12 +132,14 @@ def save_vote_to_hf(
|
|
| 90 |
"action_b": serialized_action_b, # Actions organized by turns for model B
|
| 91 |
"conversation_a": serialized_conversation_a, # Full conversation history for model A
|
| 92 |
"conversation_b": serialized_conversation_b, # Full conversation history for model B
|
|
|
|
|
|
|
| 93 |
"vote": vote_result, # "left", "right", "tie", "both_bad"
|
| 94 |
}
|
| 95 |
|
| 96 |
# Try to load existing dataset or create new one
|
| 97 |
try:
|
| 98 |
-
dataset = load_dataset(HF_DATASET_NAME, split="train", token=token)
|
| 99 |
# Convert to pandas DataFrame - handle both Dataset and DatasetDict
|
| 100 |
if hasattr(dataset, "to_pandas"):
|
| 101 |
df = dataset.to_pandas()
|
|
|
|
| 9 |
import os
|
| 10 |
import threading
|
| 11 |
from datasets import Dataset, load_dataset
|
| 12 |
+
from sandbox.code_analyzer import extract_code_from_markdown
|
| 13 |
|
| 14 |
|
| 15 |
# HuggingFace dataset configuration
|
|
|
|
| 43 |
return serialized
|
| 44 |
|
| 45 |
|
| 46 |
+
def extract_code_snippets_from_conversation(conversation):
    """
    Extract code snippets and install commands from all assistant messages in a conversation.

    Args:
        conversation: List of message dicts with 'role' and 'content' keys.
            Entries that do not expose ``.get`` (e.g. ``None``), and assistant
            messages whose content is empty or not a string, are skipped
            instead of raising.

    Returns:
        List of dicts, one per assistant message for which
        ``extract_code_from_markdown`` returned a result, each with the keys
        "code", "code_language", "install_command" and "environment".
    """
    if not conversation:
        return []

    code_snippets = []
    for msg in conversation:
        # A malformed entry must not abort the whole extraction.
        if not hasattr(msg, "get") or msg.get("role") != "assistant":
            continue

        content = msg.get("content")
        # The extractor is called with markdown text; skip anything else
        # (missing/empty content or non-string payloads).
        if not content or not isinstance(content, str):
            continue

        # Extract code from markdown in the assistant message
        extract_result = extract_code_from_markdown(
            message=content,
            enable_auto_env=True,
        )
        if extract_result is None:
            continue

        code, code_language, env_selection, install_command = extract_result
        # Add code snippet info for this turn
        code_snippets.append({
            "code": code,
            "code_language": code_language,
            "install_command": install_command,
            "environment": str(env_selection) if env_selection else None,
        })

    return code_snippets
|
| 81 |
+
|
| 82 |
+
|
| 83 |
def save_vote_to_hf(
|
| 84 |
model_a, model_b, prompt, response_a, response_b, vote_result, interactions_a=None, interactions_b=None, conversation_a=None, conversation_b=None, hf_token=None
|
| 85 |
):
|
|
|
|
| 116 |
serialized_action_a = serialize_interactions(action_a)
|
| 117 |
serialized_action_b = serialize_interactions(action_b)
|
| 118 |
|
| 119 |
+
# Extract code snippets and install commands from conversations
|
| 120 |
+
code_a = extract_code_snippets_from_conversation(conversation_a or [])
|
| 121 |
+
code_b = extract_code_snippets_from_conversation(conversation_b or [])
|
| 122 |
+
|
| 123 |
# Create vote data with full conversation history and actions organized by turns
|
| 124 |
# Each conversation is a list of messages in format: [{"role": "user"/"assistant", "content": "...", "action": [...]}, ...]
|
| 125 |
# Actions are organized as list of lists: [[turn1_interactions], [turn2_interactions], ...]
|
|
|
|
| 132 |
"action_b": serialized_action_b, # Actions organized by turns for model B
|
| 133 |
"conversation_a": serialized_conversation_a, # Full conversation history for model A
|
| 134 |
"conversation_b": serialized_conversation_b, # Full conversation history for model B
|
| 135 |
+
"code_a": code_a, # List of code snippets and install commands for model A
|
| 136 |
+
"code_b": code_b, # List of code snippets and install commands for model B
|
| 137 |
"vote": vote_result, # "left", "right", "tie", "both_bad"
|
| 138 |
}
|
| 139 |
|
| 140 |
# Try to load existing dataset or create new one
|
| 141 |
try:
|
| 142 |
+
dataset = load_dataset(HF_DATASET_NAME, split="train", token=token, download_mode="force_redownload")
|
| 143 |
# Convert to pandas DataFrame - handle both Dataset and DatasetDict
|
| 144 |
if hasattr(dataset, "to_pandas"):
|
| 145 |
df = dataset.to_pandas()
|