terryyz committed
Commit 9f4f2cd · 1 Parent(s): 6553e1c

Files changed (4):
  1. app.py +42 -2
  2. completion.py +2 -3
  3. ranking.py +18 -1
  4. voting.py +45 -1
app.py CHANGED

@@ -1206,6 +1206,14 @@ def build_ui():
     """
 
     gr.Markdown("# 🌸 BigCodeArena - Start Your Vibe Coding!", elem_classes="center-text")
+
+    gr.HTML("""
+    <p align="center" style="font-size: 1.2em;">
+        <a href="https://github.com/bigcode-project/bigcodearena">GitHub</a> |
+        <a href="https://huggingface.co/collections/bigcode/bigcodearena-68cd3a196e5147cc45f8ea3d">HF Collection</a> |
+        <a href="https://arxiv.org">ArXiv</a>
+    </p>
+    """)
 
     # Main tabs
     with gr.Tabs():

@@ -1489,6 +1497,37 @@ def build_ui():
         )
         # Ranking Tab
         ranking_table, ranking_last_update, ranking_timer = create_ranking_tab()
+
+        # Note Tab
+        with gr.Tab("📝 Note", id="note"):
+            gr.Markdown("## 📋 Important Information")
+
+            # Privacy Notice Section
+            gr.Markdown("### Privacy Notice")
+            gr.Markdown(
+                """
+                Your conversations and certain other personal information will be disclosed to the relevant AI providers
+                and may otherwise be disclosed publicly to help support our community and advance AI research.
+
+                **Do not submit to our Services any personal information or other sensitive information that you would not
+                want to be shared publicly.** By continuing to use our Services, you acknowledge and direct us to engage
+                in such sharing.
+                """
+            )
+
+            # Citation Section
+            gr.Markdown("### Citation")
+            gr.Markdown(
+                """
+                If you use BigCodeArena in your research, please cite our work:
+                ```bibtex
+                @article{zhuo2025bigcodearena,
+                    title={BigCodeArena: Unveiling More Reliable Human Preferences in Code Generation via Execution},
+                    year={2025}
+                }
+                ```
+                """
+            )
 
         # Event handlers
         # Create state variables for the run buttons

@@ -1949,9 +1988,10 @@ def build_ui():
 
         # Always show thank you message and clear everything immediately
         gr.Info("Thank you for your vote! 🎉 Your feedback has been recorded and new models have been selected.", duration=5)
-
+        gr.Info(f"Model B: {model_b}", duration=20)
+        gr.Info(f"Model A: {model_a}", duration=20)
         # reveal the model names in the info message
-        gr.Info(f"Now you can see model names! 👀 \nModel A: {model_a}, Model B: {model_b}", duration=15)
+        gr.Info(f"Now you can see model names! 👀", duration=20)
 
         # Get new random models for the next session
         model_a, model_b = get_random_models()
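
After the vote is recorded, the reveal is now split into separate `gr.Info` toasts with a longer display time instead of one combined message. A minimal standalone sketch of that toast pattern, using placeholder model names and a dummy button rather than the arena's real session state:

```python
import gradio as gr

def on_vote():
    # Placeholder names; in app.py these come from the current battle session.
    model_a, model_b = "model-alpha", "model-beta"
    gr.Info("Thank you for your vote! 🎉", duration=5)
    gr.Info(f"Model B: {model_b}", duration=20)
    gr.Info(f"Model A: {model_a}", duration=20)
    gr.Info("Now you can see model names! 👀", duration=20)

with gr.Blocks() as demo:
    vote_btn = gr.Button("Vote")
    vote_btn.click(on_vote)

demo.launch()
```

Each `gr.Info` call produces its own toast, which is why model A and model B are announced as separate messages rather than joined with a newline as before.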
completion.py CHANGED

@@ -168,10 +168,9 @@ def get_endpoint(endpoint_list):
 
 # load config args from config yaml files
 def make_config(config_file: str) -> dict:
-    config_kwargs = {}
     with open(config_file, "r") as f:
-        config_kwargs = yaml.load(f, Loader=yaml.SafeLoader)
-
+        config_kwargs = yaml.safe_load(os.path.expandvars(f.read()))
+
     return config_kwargs
 
 
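The rewritten `make_config` expands environment variables in the raw YAML text before parsing, so config files can reference secrets like `${SOME_API_KEY}` instead of hard-coding them (this assumes completion.py already imports `os`). A minimal sketch of the same pattern with an illustrative config string:

```python
import os
import yaml

# Illustrative config text; the real values live in the repo's YAML config files.
config_text = """
api_key: ${DEMO_API_KEY}
endpoint: https://example.com/v1
"""

os.environ["DEMO_API_KEY"] = "sk-demo-123"

# Expand $VAR / ${VAR} references first, then parse the YAML safely.
config_kwargs = yaml.safe_load(os.path.expandvars(config_text))
print(config_kwargs)  # {'api_key': 'sk-demo-123', 'endpoint': 'https://example.com/v1'}
```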
 
ranking.py CHANGED

@@ -50,7 +50,7 @@ def load_ranking_data(hf_token=None, force_reload=False):
             download_mode="force_redownload",
         )
     else:
-        dataset = load_dataset(HF_DATASET_NAME, split="train", token=token)
+        dataset = load_dataset(HF_DATASET_NAME, split="train", token=token, download_mode="force_redownload")
     # Convert to pandas DataFrame - handle both Dataset and DatasetDict
     if hasattr(dataset, "to_pandas"):
         df = dataset.to_pandas()

@@ -60,6 +60,17 @@ def load_ranking_data(hf_token=None, force_reload=False):
     if df.empty:
         return pd.DataFrame()
 
+    # Filter to only include samples where both models have code in their responses
+    # code_a and code_b should be non-empty lists
+    if 'code_a' in df.columns and 'code_b' in df.columns:
+        df = df[
+            df['code_a'].apply(lambda x: isinstance(x, list) and len(x) > 0) &
+            df['code_b'].apply(lambda x: isinstance(x, list) and len(x) > 0)
+        ]
+
+        if df.empty:
+            return pd.DataFrame()
+
     # Convert vote format for Elo calculation and count votes
     battle_data = []
     vote_counts = defaultdict(int)

@@ -146,6 +157,12 @@ def create_ranking_tab():
     """Create the ranking tab UI component"""
     with gr.Tab("📊 Ranking", id="ranking"):
         gr.Markdown("## 🏆 Model Leaderboard")
+        gr.Markdown(
+            """
+            > **Note:** This ranking table shows raw results from user votes.
+            > More detailed analysis will be added manually.
+            """
+        )
 
         ranking_table = gr.Dataframe(
             headers=[
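
The new filter in `load_ranking_data` drops any battle where either side's response yielded no extracted code, so the leaderboard only reflects code-producing comparisons. A small sketch of the same pandas masking on toy rows (the values are made up; only the `code_a`/`code_b` shape matches the dataset columns):

```python
import pandas as pd

# Toy battle records; code_a / code_b mirror the list-valued columns written by voting.py.
df = pd.DataFrame({
    "model_a": ["m1", "m2", "m3"],
    "model_b": ["m4", "m5", "m6"],
    "code_a": [[{"code": "print(1)"}], [], [{"code": "x = 2"}]],
    "code_b": [[{"code": "print(3)"}], [{"code": "y = 4"}], None],
})

# Keep only rows where both columns hold non-empty lists.
has_code = (
    df["code_a"].apply(lambda x: isinstance(x, list) and len(x) > 0)
    & df["code_b"].apply(lambda x: isinstance(x, list) and len(x) > 0)
)
print(df[has_code])  # only the first row (m1 vs m4) survives
```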
voting.py CHANGED

@@ -9,6 +9,7 @@ import datetime
 import os
 import threading
 from datasets import Dataset, load_dataset
+from sandbox.code_analyzer import extract_code_from_markdown
 
 
 # HuggingFace dataset configuration

@@ -42,6 +43,43 @@ def serialize_interactions(interactions):
     return serialized
 
 
+def extract_code_snippets_from_conversation(conversation):
+    """
+    Extract code snippets and install commands from all assistant messages in a conversation.
+
+    Args:
+        conversation: List of message dicts with 'role' and 'content' keys
+
+    Returns:
+        List of dicts containing code snippets and install commands for each turn
+    """
+    if not conversation:
+        return []
+
+    code_snippets = []
+    for msg in conversation:
+        if msg.get("role") == "assistant":
+            content = msg.get("content", "")
+            if content:
+                # Extract code from markdown in the assistant message
+                extract_result = extract_code_from_markdown(
+                    message=content,
+                    enable_auto_env=True
+                )
+
+                if extract_result is not None:
+                    code, code_language, env_selection, install_command = extract_result
+                    # Add code snippet info for this turn
+                    code_snippets.append({
+                        "code": code,
+                        "code_language": code_language,
+                        "install_command": install_command,
+                        "environment": str(env_selection) if env_selection else None
+                    })
+
+    return code_snippets
+
+
 def save_vote_to_hf(
     model_a, model_b, prompt, response_a, response_b, vote_result, interactions_a=None, interactions_b=None, conversation_a=None, conversation_b=None, hf_token=None
 ):

@@ -78,6 +116,10 @@ def save_vote_to_hf(
     serialized_action_a = serialize_interactions(action_a)
     serialized_action_b = serialize_interactions(action_b)
 
+    # Extract code snippets and install commands from conversations
+    code_a = extract_code_snippets_from_conversation(conversation_a or [])
+    code_b = extract_code_snippets_from_conversation(conversation_b or [])
+
     # Create vote data with full conversation history and actions organized by turns
     # Each conversation is a list of messages in format: [{"role": "user"/"assistant", "content": "...", "action": [...]}, ...]
     # Actions are organized as list of lists: [[turn1_interactions], [turn2_interactions], ...]

@@ -90,12 +132,14 @@ def save_vote_to_hf(
         "action_b": serialized_action_b, # Actions organized by turns for model B
         "conversation_a": serialized_conversation_a, # Full conversation history for model A
         "conversation_b": serialized_conversation_b, # Full conversation history for model B
+        "code_a": code_a, # List of code snippets and install commands for model A
+        "code_b": code_b, # List of code snippets and install commands for model B
         "vote": vote_result, # "left", "right", "tie", "both_bad"
     }
 
     # Try to load existing dataset or create new one
     try:
-        dataset = load_dataset(HF_DATASET_NAME, split="train", token=token)
+        dataset = load_dataset(HF_DATASET_NAME, split="train", token=token, download_mode="force_redownload")
         # Convert to pandas DataFrame - handle both Dataset and DatasetDict
         if hasattr(dataset, "to_pandas"):
             df = dataset.to_pandas()
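
The new `extract_code_snippets_from_conversation` helper relies on the repo's `sandbox.code_analyzer.extract_code_from_markdown`, which is not shown in this commit. As a rough, simplified stand-in (not the project's analyzer), the same per-turn record shape can be illustrated with a plain regex over fenced markdown blocks; `extract_first_code_block`, `FENCE`, and `CODE_BLOCK_RE` below are illustrative names, and install-command/environment detection is omitted:

```python
import re
from typing import Optional

# Triple backtick built programmatically so this example stays renderable.
FENCE = "`" * 3
CODE_BLOCK_RE = re.compile(FENCE + r"(\w+)?\n(.*?)" + FENCE, re.DOTALL)

def extract_first_code_block(message: str) -> Optional[dict]:
    """Simplified stand-in for the repo's analyzer: grab the first fenced block."""
    match = CODE_BLOCK_RE.search(message)
    if match is None:
        return None
    # install_command / environment detection are what the real analyzer adds; omitted here.
    return {
        "code": match.group(2).strip(),
        "code_language": match.group(1) or "unknown",
        "install_command": None,
        "environment": None,
    }

conversation = [
    {"role": "user", "content": "Write hello world in Python."},
    {"role": "assistant", "content": f"Sure!\n{FENCE}python\nprint('hello world')\n{FENCE}"},
]

code_snippets = []
for msg in conversation:
    if msg.get("role") == "assistant":
        record = extract_first_code_block(msg.get("content", ""))
        if record is not None:
            code_snippets.append(record)

print(code_snippets)  # one record: code "print('hello world')", language "python"
```

These per-turn records are what end up in the new `code_a`/`code_b` dataset fields, which ranking.py then uses to keep only code-producing battles.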