Spaces:

bigcode
/

arena

Running

App Files Community

terryyz committed on Sep 5

Commit

275e3c8

1 Parent(s): 1bb2113

update

Browse files

Files changed (5) hide show

app.py +17 -13
sandbox/code_runner.py +37 -8
sandbox/prompts.py +3 -11
sandbox/sandbox_manager.py +2 -0
sandbox/sandbox_telemetry.py +2 -2

app.py CHANGED Viewed

@@ -237,7 +237,7 @@ def extract_and_execute_code(message, sandbox_state):
         return sandbox_state, "", ""
     code, code_language, env_selection, install_command = extract_result
     # Update sandbox state (now a dictionary)
     sandbox_state['code_to_execute'] = code
     sandbox_state['install_command'] = install_command
@@ -366,7 +366,7 @@ def add_text_and_generate(state0, state1, text, temperature, max_tokens, model_a
                 if sandbox_output0:
                     sandbox_view_a += sandbox_output0
                 if sandbox_error0:
-                    sandbox_view_a = f"<details closed><summary><strong>🚨 Errors</strong></summary>\n\n```\n{sandbox_error0}\n```\n\n</details>\n\n" + sandbox_view_a
             # Process results for model B
             if code1.strip():
@@ -385,7 +385,7 @@ def add_text_and_generate(state0, state1, text, temperature, max_tokens, model_a
                 if sandbox_output1:
                     sandbox_view_b += sandbox_output1
                 if sandbox_error1:
-                    sandbox_view_b = f"<details closed><summary><strong>🚨 Errors</strong></summary>\n\n```\n{sandbox_error1}\n```\n\n</details>\n\n" + sandbox_view_b
         except Exception as e:
             # Fallback to sequential processing
@@ -401,7 +401,7 @@ def add_text_and_generate(state0, state1, text, temperature, max_tokens, model_a
                 if sandbox_output0:
                     sandbox_view_a += sandbox_output0
                 if sandbox_error0:
-                    sandbox_view_a = f"<details closed><summary><strong>🚨 Errors</strong></summary>\n\n```\n{sandbox_error0}\n```\n\n</details>\n\n" + sandbox_view_a
             if code1.strip():
                 install_command1 = sandbox_state1.get('install_command', "")
@@ -415,7 +415,7 @@ def add_text_and_generate(state0, state1, text, temperature, max_tokens, model_a
                 if sandbox_output1:
                     sandbox_view_b += f"## Output\n{sandbox_output1}"
                 if sandbox_error1:
-                    sandbox_view_b = f"<details closed><summary><strong>🚨 Errors</strong></summary>\n\n```\n{sandbox_error1}\n```\n\n</details>\n\n" + sandbox_view_b
             sandbox_time = time.time() - sandbox_start_time
         finally:
@@ -592,7 +592,7 @@ def retry_last_message(state0, state1, model_a, model_b):
         result[19] if len(result) > 19 else "",  # sandbox_view_b
         new_state0,  # state0_var
         new_state1,  # state1_var
-        "",  # Clear text input
         f"**Model A:** {model_a}",  # Update model display A
         f"**Model B:** {model_b}",  # Update model display B
         gr.update(visible=show_vote_buttons),  # vote_section
@@ -652,7 +652,7 @@ def send_to_left_only(state0, state1, text, temperature, max_tokens, model_a, mo
         if sandbox_output0:
             sandbox_view_a += f"# Output\n{sandbox_output0}"
         if sandbox_error0:
-            sandbox_view_a = f"<details closed><summary><strong>🚨 Errors</strong></summary>\n\n```\n{sandbox_error0.strip()}\n```\n\n</details>\n\n" + sandbox_view_a
     # Calculate conversation statistics
     turn_count_a = len([msg for msg in state0["messages"] if msg["role"] == "assistant" and msg["content"]])
@@ -685,7 +685,7 @@ def send_to_left_only(state0, state1, text, temperature, max_tokens, model_a, mo
         "",  # sandbox_view_b (empty)
         state0,  # state0_var
         state1,  # state1_var
-        "",  # Clear text input
         f"**Model A:** {model_a}",  # Update model display A
         f"**Model B:** {model_b}",  # Update model display B
         gr.update(visible=show_vote_buttons),  # vote_section
@@ -745,7 +745,7 @@ def send_to_right_only(state0, state1, text, temperature, max_tokens, model_a, m
         if sandbox_output1:
             sandbox_view_b += f"# Output\n{sandbox_output1}"
         if sandbox_error1:
-            sandbox_view_b = f"<details closed><summary><strong>🚨 Errors</strong></summary>\n\n```\n{sandbox_error1.strip()}\n```\n\n</details>\n\n" + sandbox_view_b
     # Calculate conversation statistics
     turn_count_a = len([msg for msg in state0["messages"] if msg["role"] == "assistant" and msg["content"]]) if state0 else 0
@@ -778,7 +778,7 @@ def send_to_right_only(state0, state1, text, temperature, max_tokens, model_a, m
         sandbox_view_b,  # sandbox_view_b
         state0,  # state0_var
         state1,  # state1_var
-        "",  # Clear text input
         f"**Model A:** {model_a}",  # Update model display A
         f"**Model B:** {model_b}",  # Update model display B
         gr.update(visible=show_vote_buttons),  # vote_section
@@ -886,7 +886,7 @@ def run_sandbox_code(sandbox_state: dict, code: str, install_command: str) -> tu
     # Determine environment
     env = sandbox_state.get('auto_selected_sandbox_environment') or sandbox_state.get('sandbox_environment')
     try:
         if env == SandboxEnvironment.HTML:
             sandbox_url, sandbox_id, stderr = run_html_sandbox(code, install_command, sandbox_state.get('sandbox_id'))
@@ -909,6 +909,7 @@ def run_sandbox_code(sandbox_state: dict, code: str, install_command: str) -> tu
             return result['sandbox_url'], "", result['stderr']
         elif env == SandboxEnvironment.GRADIO:
             sandbox_url, sandbox_id, stderr = run_gradio_sandbox(code, install_command, sandbox_state.get('sandbox_id'))
             sandbox_state['sandbox_id'] = sandbox_id
             return sandbox_url, "", stderr
@@ -926,6 +927,7 @@ def run_sandbox_code(sandbox_state: dict, code: str, install_command: str) -> tu
             return sandbox_url, "", stderr
         elif env == SandboxEnvironment.PYTHON_RUNNER:
             output, stderr = run_code_interpreter(code, 'python', install_command)
             return "", output, stderr
@@ -1697,7 +1699,7 @@ def build_ui():
                 result[19] if len(result) > 19 else "",  # sandbox_view_b
                 new_state0,  # state0_var
                 new_state1,  # state1_var
-                "",  # Clear text input
                 f"**Model A:** {model_a}",  # Update model display A
                 f"**Model B:** {model_b}",  # Update model display B
                 gr.update(visible=show_vote_buttons),  # vote_section
@@ -2046,7 +2048,7 @@ def build_ui():
             # Get new random models
             model_a, model_b = get_random_models()
-            # Clear everything and start fresh immediately
             return (
                 "Thank you for your vote! 🎉",  # vote status with thank you message
                 None,  # Clear state0
@@ -2075,6 +2077,7 @@ def build_ui():
                 gr.update(interactive=False),  # Disable vote_right_btn
                 gr.update(interactive=False),  # Disable vote_tie_btn
                 gr.update(interactive=False),  # Disable vote_both_bad_btn
             )
         # Vote button click handlers
@@ -2115,6 +2118,7 @@ def build_ui():
                     vote_right_btn,  # vote_right_btn
                     vote_tie_btn,  # vote_tie_btn
                     vote_both_bad_btn,  # vote_both_bad_btn
                 ],
             )

         return sandbox_state, "", ""
     code, code_language, env_selection, install_command = extract_result
     # Update sandbox state (now a dictionary)
     sandbox_state['code_to_execute'] = code
     sandbox_state['install_command'] = install_command
                 if sandbox_output0:
                     sandbox_view_a += sandbox_output0
                 if sandbox_error0:
+                    sandbox_view_a = f"<details closed><summary><strong>🚨 Errors/Warnings</strong></summary>\n\n```\n{sandbox_error0}\n```\n\n</details>\n\n" + sandbox_view_a
             # Process results for model B
             if code1.strip():
                 if sandbox_output1:
                     sandbox_view_b += sandbox_output1
                 if sandbox_error1:
+                    sandbox_view_b = f"<details closed><summary><strong>🚨 Errors/Warnings</strong></summary>\n\n```\n{sandbox_error1}\n```\n\n</details>\n\n" + sandbox_view_b
         except Exception as e:
             # Fallback to sequential processing
                 if sandbox_output0:
                     sandbox_view_a += sandbox_output0
                 if sandbox_error0:
+                    sandbox_view_a = f"<details closed><summary><strong>🚨 Errors/Warnings</strong></summary>\n\n```\n{sandbox_error0}\n```\n\n</details>\n\n" + sandbox_view_a
             if code1.strip():
                 install_command1 = sandbox_state1.get('install_command', "")
                 if sandbox_output1:
                     sandbox_view_b += f"## Output\n{sandbox_output1}"
                 if sandbox_error1:
+                    sandbox_view_b = f"<details closed><summary><strong>🚨 Errors/Warnings</strong></summary>\n\n```\n{sandbox_error1}\n```\n\n</details>\n\n" + sandbox_view_b
             sandbox_time = time.time() - sandbox_start_time
         finally:
         result[19] if len(result) > 19 else "",  # sandbox_view_b
         new_state0,  # state0_var
         new_state1,  # state1_var
+        last_user_message,  # Keep original text input
         f"**Model A:** {model_a}",  # Update model display A
         f"**Model B:** {model_b}",  # Update model display B
         gr.update(visible=show_vote_buttons),  # vote_section
         if sandbox_output0:
             sandbox_view_a += f"# Output\n{sandbox_output0}"
         if sandbox_error0:
+            sandbox_view_a = f"<details closed><summary><strong>🚨 Errors/Warnings</strong></summary>\n\n```\n{sandbox_error0.strip()}\n```\n\n</details>\n\n" + sandbox_view_a
     # Calculate conversation statistics
     turn_count_a = len([msg for msg in state0["messages"] if msg["role"] == "assistant" and msg["content"]])
         "",  # sandbox_view_b (empty)
         state0,  # state0_var
         state1,  # state1_var
+        text,  # Keep original text input
         f"**Model A:** {model_a}",  # Update model display A
         f"**Model B:** {model_b}",  # Update model display B
         gr.update(visible=show_vote_buttons),  # vote_section
         if sandbox_output1:
             sandbox_view_b += f"# Output\n{sandbox_output1}"
         if sandbox_error1:
+            sandbox_view_b = f"<details closed><summary><strong>🚨 Errors/Warnings</strong></summary>\n\n```\n{sandbox_error1.strip()}\n```\n\n</details>\n\n" + sandbox_view_b
     # Calculate conversation statistics
     turn_count_a = len([msg for msg in state0["messages"] if msg["role"] == "assistant" and msg["content"]]) if state0 else 0
         sandbox_view_b,  # sandbox_view_b
         state0,  # state0_var
         state1,  # state1_var
+        text,  # Keep original text input
         f"**Model A:** {model_a}",  # Update model display A
         f"**Model B:** {model_b}",  # Update model display B
         gr.update(visible=show_vote_buttons),  # vote_section
     # Determine environment
     env = sandbox_state.get('auto_selected_sandbox_environment') or sandbox_state.get('sandbox_environment')
+    print(f"DEBUG: env: {env}")
     try:
         if env == SandboxEnvironment.HTML:
             sandbox_url, sandbox_id, stderr = run_html_sandbox(code, install_command, sandbox_state.get('sandbox_id'))
             return result['sandbox_url'], "", result['stderr']
         elif env == SandboxEnvironment.GRADIO:
+            print(f"DEBUG: running gradio sandbox")
             sandbox_url, sandbox_id, stderr = run_gradio_sandbox(code, install_command, sandbox_state.get('sandbox_id'))
             sandbox_state['sandbox_id'] = sandbox_id
             return sandbox_url, "", stderr
             return sandbox_url, "", stderr
         elif env == SandboxEnvironment.PYTHON_RUNNER:
+            print(f"DEBUG: running python runner")
             output, stderr = run_code_interpreter(code, 'python', install_command)
             return "", output, stderr
                 result[19] if len(result) > 19 else "",  # sandbox_view_b
                 new_state0,  # state0_var
                 new_state1,  # state1_var
+                text,  # Keep original text input
                 f"**Model A:** {model_a}",  # Update model display A
                 f"**Model B:** {model_b}",  # Update model display B
                 gr.update(visible=show_vote_buttons),  # vote_section
             # Get new random models
             model_a, model_b = get_random_models()
+            # Clear everything and start fresh immediately, but preserve examples
             return (
                 "Thank you for your vote! 🎉",  # vote status with thank you message
                 None,  # Clear state0
                 gr.update(interactive=False),  # Disable vote_right_btn
                 gr.update(interactive=False),  # Disable vote_tie_btn
                 gr.update(interactive=False),  # Disable vote_both_bad_btn
+                "",  # Clear text_input to preserve examples
             )
         # Vote button click handlers
                     vote_right_btn,  # vote_right_btn
                     vote_tie_btn,  # vote_tie_btn
                     vote_both_bad_btn,  # vote_both_bad_btn
+                    text_input,  # text_input (to preserve examples)
                 ],
             )

sandbox/code_runner.py CHANGED Viewed

@@ -8,7 +8,7 @@ from typing import Any, Generator, Literal, TypeAlias, TypedDict, Set
 import uuid
 import time
 import gradio as gr
 import base64
 from e2b_code_interpreter import Sandbox as CodeSandbox
 from gradio_sandboxcomponent import SandboxComponent
@@ -444,15 +444,19 @@ def run_html_sandbox(code: str, install_command: str, existing_sandbox_id: str |
     sandbox.files.make_dir(project_root)
     # Run install command if provided
     if install_command.strip():
         is_success, stdout, stderr = run_command_in_sandbox(
             sandbox=sandbox,
             command=install_command,
             timeout=60 * 3,
         )
         if not is_success:
             print(f"Install command failed: {stderr}")
-            return "", sandbox.sandbox_id, '\n'.join(stderr)
     # replace placeholder URLs with SVG data URLs
     code = replace_placeholder_urls(code)
@@ -461,7 +465,7 @@ def run_html_sandbox(code: str, install_command: str, existing_sandbox_id: str |
     sandbox.files.write(file_path, code, "user", 60)
     sandbox_url = get_sandbox_app_url(sandbox, 'html')
-    return (sandbox_url, sandbox.sandbox_id, '')
 def run_react_sandbox(code: str, install_command: str, existing_sandbox_id: str | None = None) -> CodeRunResult:
@@ -661,6 +665,30 @@ def run_gradio_sandbox(code: str, install_command: str, existing_sandbox_id: str
     sandbox = reuse_or_create_sandbox(sandbox_id=existing_sandbox_id)
     file_path = "~/gradio_app/main.py"
     sandbox.files.write(file_path, code, "user", 60)
     stderrs = []
@@ -676,12 +704,12 @@ def run_gradio_sandbox(code: str, install_command: str, existing_sandbox_id: str
             stderrs.extend(stderr)
         if not is_success:
             print(f"Install command failed: {stderr}")
-            return "", sandbox.sandbox_id, '\n'.join(stderr)
     stderr = run_background_command_with_timeout(
         sandbox,
         f"python {file_path}",
-        timeout=10,
     )
     stderrs.append(stderr)
@@ -710,12 +738,13 @@ def run_streamlit_sandbox(code: str, install_command: str, existing_sandbox_id:
             stderrs.extend(stderr)
         if not is_success:
             print(f"Install command failed: {stderr}")
-            return "", sandbox.sandbox_id, '\n'.join(stderr)
     stderr = run_background_command_with_timeout(
         sandbox,
         r"sudo kill -9 $(ss -lptn 'sport = :8501' | grep -oP '(?<=pid=)\d+'); streamlit run ~/mystreamlit/app.py --server.port 8501 --server.headless true",
-        timeout=8,
     )
     stderrs.append(stderr)

 import uuid
 import time
 import gradio as gr
+import re
 import base64
 from e2b_code_interpreter import Sandbox as CodeSandbox
 from gradio_sandboxcomponent import SandboxComponent
     sandbox.files.make_dir(project_root)
     # Run install command if provided
+    stderrs = []
     if install_command.strip():
         is_success, stdout, stderr = run_command_in_sandbox(
             sandbox=sandbox,
             command=install_command,
             timeout=60 * 3,
         )
+        if stderr:
+            stderrs.extend(stderr)
         if not is_success:
             print(f"Install command failed: {stderr}")
+            # Don't return early - continue with HTML setup
+            stderrs.append(f"Install command failed: {' '.join(stderr)}")
     # replace placeholder URLs with SVG data URLs
     code = replace_placeholder_urls(code)
     sandbox.files.write(file_path, code, "user", 60)
     sandbox_url = get_sandbox_app_url(sandbox, 'html')
+    return (sandbox_url, sandbox.sandbox_id, '\n'.join(stderrs))
 def run_react_sandbox(code: str, install_command: str, existing_sandbox_id: str | None = None) -> CodeRunResult:
     sandbox = reuse_or_create_sandbox(sandbox_id=existing_sandbox_id)
     file_path = "~/gradio_app/main.py"
+    # Remove server_port and server_name arguments from demo.launch() calls
+    code = re.sub(r'\.launch\([^)]*server_port\s*=\s*[^,)]+[,\s]*', '.launch(', code)
+    code = re.sub(r'\.launch\([^)]*server_name\s*=\s*[^,)]+[,\s]*', '.launch(', code)
+    # Ensure demo.launch() uses the correct server configuration
+    if 'demo.launch(' in code:
+        # Replace demo.launch() with proper configuration
+        code = re.sub(
+            r'demo\.launch\([^)]*\)',
+            f'demo.launch(server_name="0.0.0.0", server_port=7860, share=False)',
+            code
+        )
+    elif '.launch(' in code:
+        # Handle other patterns like app.launch(), interface.launch(), etc.
+        code = re.sub(
+            r'(\w+)\.launch\([^)]*\)',
+            rf'\1.launch(server_name="0.0.0.0", server_port=7860, share=False)',
+            code
+        )
+    else:
+        # If no launch() call found, add one at the end
+        code += f'\n\n# Auto-added launch configuration\nif __name__ == "__main__":\n    demo.launch(server_name="0.0.0.0", server_port=7860, share=False)'
     sandbox.files.write(file_path, code, "user", 60)
     stderrs = []
             stderrs.extend(stderr)
         if not is_success:
             print(f"Install command failed: {stderr}")
+            # Don't return early - continue with Gradio setup
+            stderrs.append(f"Install command failed: {' '.join(stderr)}")
     stderr = run_background_command_with_timeout(
         sandbox,
         f"python {file_path}",
+        timeout=15,
     )
     stderrs.append(stderr)
             stderrs.extend(stderr)
         if not is_success:
             print(f"Install command failed: {stderr}")
+            # Don't return early - continue with Streamlit setup
+            stderrs.append(f"Install command failed: {' '.join(stderr)}")
     stderr = run_background_command_with_timeout(
         sandbox,
         r"sudo kill -9 $(ss -lptn 'sport = :8501' | grep -oP '(?<=pid=)\d+'); streamlit run ~/mystreamlit/app.py --server.port 8501 --server.headless true",
+        timeout=15,
     )
     stderrs.append(stderr)

sandbox/prompts.py CHANGED Viewed

@@ -28,18 +28,8 @@ The code must be in the markdown format:
 ```
 Before you begin writing any code, you must follow these fundamental rules:
-- You are NOT allowed to start directly with a code block. Before writing code, ALWAYS think carefully step-by-step
-- Your response must contain a clear explanation of the solution you are providing
 - ALWAYS generate complete, self-contained code in a single file
 - You CAN NOT split your program into multiple files or multiple code blocks
-- If you use any external libraries, make sure to specify them for the installation command in either `pip install` or `npm install`
-- You prefer JavaScript over HTML
-- Each code block must be completely independent. If modifications are needed, the entire code block must be rewritten
-- When fetching data, you MUST use external libraries and packages, and avoid using placeholder URLs or URLs that require API keys
-- Make sure the program is functional by creating a state when needed and having no required props
-- Make sure to include all necessary code in one file
-- There are no additional files in the local file system, unless you create them inside the same program
-- Do not touch project dependencies files like package.json, package-lock.json, requirements.txt, etc
 When developing with React or Vue components, follow these specific requirements:
 - Use TypeScript or JavaScript as the language
@@ -54,6 +44,8 @@ When developing with React or Vue components, follow these specific requirements
 For Python development, you must follow these constraints:
 - For any programs that require user inputs, you MUST USE `gradio` or `streamlit`
 - Choose suitable PyPI packages to be imported, e.g., `import pandas`
 - Avoid using libraries that require desktop GUI interfaces, with the exceptions of `pygame`, `gradio`, and `streamlit` which are explicitly supported
 - For PyGame applications, we use pygbag to build the application. You have to write the main function as an async function like:
@@ -115,7 +107,7 @@ FOR NPM INSTALLATIONS:
 FOR PIP INSTALLATIONS:
 - YOU MUST NOT INSTALL ANY DEEP LEARNING DEPENDENCIES. THE ENVIRONMENT IS CPU ONLY.
 - IF THE USER SAYS TO INSTALL A PACKAGE, YOU MUST INSTALL IT.
-- Use `uv pip install --system` to install packages.
 YOU DONT NEED TO INSTALL ANY FOLLOWING DEPENDENCIES:
 - `gradio`, `streamlit`, `pygame`, `mermaid`, `react`, `react-dom`, `vue`

 ```
 Before you begin writing any code, you must follow these fundamental rules:
 - ALWAYS generate complete, self-contained code in a single file
 - You CAN NOT split your program into multiple files or multiple code blocks
 When developing with React or Vue components, follow these specific requirements:
 - Use TypeScript or JavaScript as the language
 For Python development, you must follow these constraints:
 - For any programs that require user inputs, you MUST USE `gradio` or `streamlit`
+- Gradio Apps MUST start at port 7860
+- Streamlit Apps MUST start at port 8501
 - Choose suitable PyPI packages to be imported, e.g., `import pandas`
 - Avoid using libraries that require desktop GUI interfaces, with the exceptions of `pygame`, `gradio`, and `streamlit` which are explicitly supported
 - For PyGame applications, we use pygbag to build the application. You have to write the main function as an async function like:
 FOR PIP INSTALLATIONS:
 - YOU MUST NOT INSTALL ANY DEEP LEARNING DEPENDENCIES. THE ENVIRONMENT IS CPU ONLY.
 - IF THE USER SAYS TO INSTALL A PACKAGE, YOU MUST INSTALL IT.
+- Use `uv pip install` to install packages.
 YOU DONT NEED TO INSTALL ANY FOLLOWING DEPENDENCIES:
 - `gradio`, `streamlit`, `pygame`, `mermaid`, `react`, `react-dom`, `vue`

sandbox/sandbox_manager.py CHANGED Viewed

@@ -75,6 +75,8 @@ def run_command_in_sandbox(
     stderrs: list[str] = []
     try:
         command_result = sandbox.commands.run(
             cmd=command,
             cwd=working_directory,

     stderrs: list[str] = []
     try:
+        if "uv" in command:
+            command = "uv venv;" + command
         command_result = sandbox.commands.run(
             cmd=command,
             cwd=working_directory,

sandbox/sandbox_telemetry.py CHANGED Viewed

@@ -205,8 +205,8 @@ def log_sandbox_telemetry_gradio_fn(
         default=str,
         ensure_ascii=False
     )
-    filename = get_sandbox_log_filename(sandbox_state)
-    upsert_sandbox_log(filename=filename, data=log_data)
     # # Upload to Azure Blob Storage
     # if AZURE_BLOB_STORAGE_CONNECTION_STRING:

         default=str,
         ensure_ascii=False
     )
+    # filename = get_sandbox_log_filename(sandbox_state)
+    # upsert_sandbox_log(filename=filename, data=log_data)
     # # Upload to Azure Blob Storage
     # if AZURE_BLOB_STORAGE_CONNECTION_STRING: