Spaces:
Running
Running
File size: 96,965 Bytes
fe3f5b0 1bb2113 ba99c06 1bb2113 ba99c06 1bb2113 ba99c06 fe3f5b0 1bb2113 fe3f5b0 406df06 fe3f5b0 1bb2113 fe3f5b0 1bb2113 fe3f5b0 ba99c06 fe3f5b0 ba99c06 fe3f5b0 1bb2113 fe3f5b0 1bb2113 fe3f5b0 1bb2113 fe3f5b0 1bb2113 fe3f5b0 1bb2113 275e3c8 fe3f5b0 1bb2113 fe3f5b0 1bb2113 fe3f5b0 1bb2113 fe3f5b0 1bb2113 fe3f5b0 1bb2113 fe3f5b0 1bb2113 fe3f5b0 1bb2113 fe3f5b0 1bb2113 fe3f5b0 1bb2113 fe3f5b0 1bb2113 275e3c8 1bb2113 275e3c8 1bb2113 275e3c8 1bb2113 275e3c8 1bb2113 fe3f5b0 1bb2113 fe3f5b0 1bb2113 48a3a23 fe3f5b0 1bb2113 fe3f5b0 1bb2113 fe3f5b0 ba99c06 1bb2113 48a3a23 1bb2113 ba99c06 1bb2113 ba99c06 1bb2113 ba99c06 1bb2113 ba99c06 1bb2113 ba99c06 1bb2113 ba99c06 1bb2113 ba99c06 1bb2113 ba99c06 1bb2113 ba99c06 1bb2113 275e3c8 1bb2113 ba99c06 1bb2113 ba99c06 1bb2113 ba99c06 1bb2113 ba99c06 1bb2113 ba99c06 1bb2113 ba99c06 1bb2113 ba99c06 1bb2113 275e3c8 1bb2113 ba99c06 1bb2113 ba99c06 1bb2113 ba99c06 1bb2113 ba99c06 1bb2113 ba99c06 1bb2113 48a3a23 275e3c8 1bb2113 ba99c06 1bb2113 ba99c06 1bb2113 ba99c06 1bb2113 ba99c06 1bb2113 ba99c06 1bb2113 ba99c06 1bb2113 ba99c06 1bb2113 275e3c8 1bb2113 ba99c06 1bb2113 ba99c06 1bb2113 ba99c06 1bb2113 ba99c06 1bb2113 ba99c06 1bb2113 48a3a23 275e3c8 1bb2113 48a3a23 1bb2113 fe3f5b0 1bb2113 fe3f5b0 1bb2113 fe3f5b0 1bb2113 fe3f5b0 1bb2113 fe3f5b0 1bb2113 fe3f5b0 1bb2113 fe3f5b0 1bb2113 fe3f5b0 1bb2113 fe3f5b0 1bb2113 fe3f5b0 1bb2113 fe3f5b0 1bb2113 fe3f5b0 1bb2113 fe3f5b0 1bb2113 fe3f5b0 1bb2113 fe3f5b0 1bb2113 fe3f5b0 1bb2113 fe3f5b0 1bb2113 fe3f5b0 1bb2113 fe3f5b0 1bb2113 fe3f5b0 1bb2113 fe3f5b0 1bb2113 fe3f5b0 1bb2113 fe3f5b0 1bb2113 fe3f5b0 1bb2113 fe3f5b0 1bb2113 fe3f5b0 1bb2113 fe3f5b0 1bb2113 fe3f5b0 1bb2113 ba99c06 1bb2113 ba99c06 1bb2113 ba99c06 1bb2113 ba99c06 1bb2113 ba99c06 1bb2113 ba99c06 1bb2113 ba99c06 1bb2113 ba99c06 1bb2113 48a3a23 1bb2113 48a3a23 1bb2113 48a3a23 1bb2113 48a3a23 1bb2113 48a3a23 1bb2113 48a3a23 1bb2113 ba99c06 1bb2113 ba99c06 1bb2113 1568b04 1bb2113 f319157 1bb2113 1568b04 1bb2113 f319157 1bb2113 ba99c06 1bb2113 
fe3f5b0 1bb2113 ba99c06 1bb2113 ba99c06 1bb2113 fe3f5b0 1bb2113 fe3f5b0 ba99c06 1bb2113 ba99c06 fe3f5b0 1bb2113 ba99c06 1bb2113 fe3f5b0 1bb2113 ba99c06 1bb2113 48a3a23 275e3c8 fe3f5b0 1bb2113 fe3f5b0 1bb2113 fe3f5b0 1bb2113 fe3f5b0 1bb2113 fe3f5b0 1bb2113 fe3f5b0 1bb2113 fe3f5b0 1bb2113 fe3f5b0 1bb2113 48a3a23 1bb2113 fe3f5b0 1bb2113 fe3f5b0 1bb2113 fe3f5b0 1bb2113 48a3a23 1bb2113 48a3a23 1bb2113 48a3a23 1bb2113 48a3a23 1bb2113 fe3f5b0 48a3a23 fe3f5b0 1bb2113 fe3f5b0 1bb2113 fe3f5b0 1bb2113 fe3f5b0 1bb2113 fe3f5b0 1bb2113 fe3f5b0 1bb2113 fe3f5b0 1bb2113 fe3f5b0 1bb2113 fe3f5b0 1bb2113 ba99c06 1bb2113 bc55571 1bb2113 bc55571 1bb2113 bc55571 1bb2113 275e3c8 1bb2113 ba99c06 1bb2113 275e3c8 1bb2113 48a3a23 1bb2113 ba99c06 1bb2113 275e3c8 1bb2113 ba99c06 1bb2113 fe3f5b0 1bb2113 fe3f5b0 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 
528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 
1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 
1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 
1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 |
"""
Simple BigCodeArena - A simplified AI coding battle arena
Focuses on core functionality: two models, automatic code extraction, and execution
"""
import gradio as gr
from gradio_sandboxcomponent import SandboxComponent
import pandas as pd
import datetime
import os
import asyncio
import concurrent.futures
import random
import time
import numpy as np
from collections import defaultdict
from datasets import Dataset, load_dataset
# Import Elo calculation utilities
from elo_calculation import (
calculate_elo_with_confidence_intervals,
create_ranking_dataframe,
)
# Import ranking functionality
from ranking import (
load_ranking_data,
update_ranking_display,
force_update_ranking_display,
create_ranking_tab,
setup_ranking_handlers,
)
# Import voting functionality
from voting import (
handle_vote,
save_vote_to_hf,
serialize_interactions,
create_vote_ui,
should_show_vote_buttons,
get_vote_ui_updates,
setup_vote_handlers,
)
# Import completion utilities
from completion import make_config, registered_api_completion
from sandbox.prompts import GENERAL_SANDBOX_INSTRUCTION
# Import code extraction utilities
from sandbox.code_analyzer import (
SandboxEnvironment,
extract_code_from_markdown,
)
# Import sandbox execution functions
from sandbox.code_runner import (
run_html_sandbox,
run_react_sandbox,
run_vue_sandbox,
run_pygame_sandbox,
run_gradio_sandbox,
run_streamlit_sandbox,
run_code_interpreter,
run_c_code,
run_cpp_code,
run_java_code,
run_golang_code,
run_rust_code,
mermaid_to_html,
javascript_to_html
)
# Import sandbox telemetry
from sandbox.sandbox_telemetry import log_sandbox_telemetry_gradio_fn
# Create a proper sandbox state structure
def create_sandbox_state() -> dict:
    """Build the default sandbox state dictionary for one model.

    Returns:
        A freshly constructed dict; nothing is shared between calls.
    """
    # Feature toggle and round counters.
    sandbox_state = dict(
        enable_sandbox=True,
        enabled_round=0,
        sandbox_run_round=0,
        edit_round=0,
    )
    # Environment selection: AUTO by default, with no auto-detected choice yet.
    sandbox_state.update(
        sandbox_environment=SandboxEnvironment.AUTO,
        auto_selected_sandbox_environment=None,
        sandbox_instruction="Run the extracted code in the appropriate sandbox environment",
    )
    # Code pending execution.
    sandbox_state.update(
        code_to_execute="",
        code_dependencies=([], []),
        btn_list_length=5,
    )
    # Identifiers and results, all unset on a new state.
    sandbox_state.update(
        sandbox_id=None,
        chat_session_id=None,
        conv_id=None,
        sandbox_output=None,
        sandbox_error=None,
    )
    return sandbox_state
def reset_sandbox_state(state: dict) -> dict:
    """Reset the per-conversation fields of a sandbox state in place.

    Round counters, the auto-selected environment, pending code, results and
    identifiers are returned to their initial values. Settings such as
    ``enable_sandbox``, ``sandbox_environment``, ``sandbox_instruction`` and
    ``btn_list_length`` are left untouched.

    Args:
        state: Sandbox state dictionary to reset; it is mutated.

    Returns:
        The same ``state`` object, for convenience.
    """
    state.update(
        enabled_round=0,
        sandbox_run_round=0,
        edit_round=0,
        auto_selected_sandbox_environment=None,
        code_to_execute="",
        code_dependencies=([], []),
        sandbox_error=None,
        sandbox_output=None,
        sandbox_id=None,
        conv_id=None,
        chat_session_id=None,
    )
    # 'install_command' is stored next to 'code_to_execute' when code is
    # extracted from a reply; drop it too so a reset state never carries the
    # install command of code that has already been cleared.
    state.pop('install_command', None)
    return state
# Load API configuration
def load_api_config():
    """Load the API configuration from ``api_config.yaml``.

    Returns:
        Whatever ``make_config`` returns for ``api_config.yaml``, or an empty
        dict when loading fails for any reason.
    """
    import logging

    try:
        return make_config("api_config.yaml")
    except Exception:
        # Best-effort: the caller treats an empty config as "no models", so
        # keep the fallback but record why loading failed instead of hiding it.
        logging.getLogger(__name__).exception(
            "Failed to load api_config.yaml; falling back to an empty configuration"
        )
        return {}
# Global variables
api_config = load_api_config()
# An empty (or failed) configuration yields an empty model list.
available_models = [*api_config.keys()] if api_config else []
# HuggingFace dataset configuration, read from the environment
HF_DATASET_NAME = os.environ.get("HF_DATASET_NAME")
HF_TOKEN = os.environ.get("HF_TOKEN")
def get_random_models():
    """Pick the two models for a battle.

    With two or more available models the pair is drawn by
    ``get_battle_pair`` using its weighted sampling. With exactly one model
    that model fills both slots, and with none both slots are ``None``.
    """
    if len(available_models) >= 2:
        # No battle targets, outages, custom weights or boosts are supplied.
        return get_battle_pair(available_models, {}, [], {}, [])
    sole_model = available_models[0] if available_models else None
    return sole_model, sole_model
# Configuration for battle sampling
# get_battle_pair skips a candidate rival when both it and the already chosen
# model appear in this list.
ANON_MODELS = [] # Models that should not battle against each other in anonymous mode
# Maps a model name to a pattern (a substring, or a list of substrings, as
# understood by is_model_match_pattern). get_battle_pair only pairs a model
# listed here with opponents whose name matches its pattern.
BATTLE_STRICT_TARGETS = {} # Strict battle targets for specific models
def get_sample_weight(model, outage_models, sampling_weights, sampling_boost_models=None):
    """Compute the sampling weight of ``model``.

    Args:
        model: Model name.
        outage_models: Models currently in outage; these always weigh 0.
        sampling_weights: Mapping of model name to a multiplier applied on top
            of the configured weight.
        sampling_boost_models: Optional collection of models whose weight is
            doubled.

    Returns:
        0 for a model in outage, otherwise the ``weight`` from the model's
        entry in ``api_config`` (1.0 when absent) times any applicable
        multipliers.
    """
    if model in outage_models:
        return 0

    # Configured weight; unknown models and entries without one count as 1.0.
    weight = api_config.get(model, {}).get('weight', 1.0)

    if model in sampling_weights:
        weight = weight * sampling_weights[model]

    if sampling_boost_models and model in sampling_boost_models:
        weight = weight * 2.0  # Example boost factor

    return weight
def is_model_match_pattern(model, pattern):
    """Tell whether ``model`` matches ``pattern`` by substring containment.

    Args:
        model: Model name to test.
        pattern: Either a single substring (``str``) or a ``list`` of
            substrings, any one of which may match.

    Returns:
        True when a substring is contained in ``model``; False otherwise,
        including for any other pattern type.
    """
    if isinstance(pattern, str):
        substrings = [pattern]
    elif isinstance(pattern, list):
        substrings = pattern
    else:
        return False

    for substring in substrings:
        if substring in model:
            return True
    return False
def get_battle_pair(
    models, battle_targets, outage_models, sampling_weights, sampling_boost_models
):
    """
    Sample a pair of models for battle using weighted sampling.

    The first model is drawn according to ``get_sample_weight``. Its rival is
    then drawn from the remaining models that are allowed to face it (see
    ``ANON_MODELS`` and ``BATTLE_STRICT_TARGETS``), and the order of the two
    is randomized.

    Args:
        models: List of available model names
        battle_targets: Dict mapping models to their preferred battle targets
        outage_models: List of models currently in outage
        sampling_weights: Dict of custom sampling weights per model
        sampling_boost_models: List of models to boost in sampling

    Returns:
        Tuple of (model_a, model_b) for battle

    Raises:
        ValueError: If ``models`` is empty.
    """
    if not models:
        raise ValueError("get_battle_pair requires at least one model")
    if len(models) == 1:
        return models[0], models[0]

    # Calculate weights for all models
    model_weights = np.array(
        [
            get_sample_weight(
                model, outage_models, sampling_weights, sampling_boost_models
            )
            for model in models
        ],
        dtype=float,
    )
    total_weight = model_weights.sum()
    if total_weight == 0:
        # Fallback to uniform sampling if all weights are 0. Unpack so the
        # result is a tuple like every other return path.
        first, second = random.sample(models, 2)
        return first, second

    # Sample first model
    chosen_idx = np.random.choice(len(models), p=model_weights / total_weight)
    chosen_model = models[chosen_idx]

    # Find eligible rival models
    rival_models = []
    rival_weights = []
    for model in models:
        if model == chosen_model:
            continue
        if model in ANON_MODELS and chosen_model in ANON_MODELS:
            continue
        # Strict targets apply in both directions.
        if chosen_model in BATTLE_STRICT_TARGETS and not is_model_match_pattern(
            model, BATTLE_STRICT_TARGETS[chosen_model]
        ):
            continue
        if model in BATTLE_STRICT_TARGETS and not is_model_match_pattern(
            chosen_model, BATTLE_STRICT_TARGETS[model]
        ):
            continue

        weight = get_sample_weight(model, outage_models, sampling_weights)
        if (
            weight != 0
            and chosen_model in battle_targets
            and model in battle_targets[chosen_model]
        ):
            # boost to higher chance for targeted battles
            weight = 0.5 * total_weight / len(battle_targets[chosen_model])
        rival_models.append(model)
        rival_weights.append(weight)

    if not rival_models:
        # Fallback: if no eligible rivals, pick any other model
        other_models = [m for m in models if m != chosen_model]
        rival_model = random.choice(other_models) if other_models else chosen_model
    else:
        rival_total = float(np.sum(rival_weights))
        if rival_total > 0:
            rival_probs = np.array(rival_weights, dtype=float) / rival_total
            rival_idx = np.random.choice(len(rival_models), p=rival_probs)
            rival_model = rival_models[rival_idx]
        else:
            # Every eligible rival has zero weight (e.g. all are in outage).
            # Normalizing would divide by zero and hand NaN probabilities to
            # np.random.choice, so fall back to a uniform pick, mirroring the
            # all-zero fallback used for the first model.
            rival_model = random.choice(rival_models)

    # Randomly swap order
    if np.random.randint(2) == 0:
        return chosen_model, rival_model
    return rival_model, chosen_model
def create_chat_state(model_name: str) -> dict:
    """Build the initial chat state for the model called ``model_name``.

    The state starts with an empty message history, a fresh sandbox state,
    no output, no generation in progress and no recorded interactions.
    """
    chat_state = {"model_name": model_name, "messages": []}
    chat_state["sandbox_state"] = create_sandbox_state()
    chat_state.update(
        has_output=False,
        generating=False,  # True while the model is producing a reply
        interactions=[],  # User interactions recorded for this chat
    )
    return chat_state
def generate_response_with_completion(state, temperature, max_tokens):
    """Generate a reply for ``state`` through the registered completion API.

    The full user/assistant history is sent, preceded by the general sandbox
    instruction as the system message.

    Args:
        state: Chat state dict (``model_name``, ``messages`` and optionally
            ``interactions``), or None.
        temperature: Sampling temperature forwarded to the completion call.
        max_tokens: Token limit forwarded to the completion call.

    Returns:
        ``(state, "")`` when there is nothing to answer (``state`` is None or
        the latest user message is missing or empty). Otherwise
        ``(state, response)`` where ``response`` is a dict with ``content``
        (the answer, or an ``"Error: ..."`` message) and ``interaction``.
    """
    if state is None:
        return state, ""

    # Content of the most recent user message, if any.
    last_user_message = next(
        (msg["content"] for msg in reversed(state["messages"]) if msg["role"] == "user"),
        None,
    )
    if not last_user_message:
        return state, ""

    def as_response(content):
        """Wrap ``content`` in the dict shape used for every reply."""
        return {"content": content, "interaction": state.get("interactions", [])}

    # Prepare messages for API call - include full conversation history
    messages = [{"role": "system", "content": GENERAL_SANDBOX_INSTRUCTION}]
    messages.extend(
        {"role": msg["role"], "content": msg["content"]}
        for msg in state["messages"]
        if msg["role"] in ("user", "assistant") and msg["content"] is not None
    )

    model_name = state["model_name"]
    if model_name not in api_config:
        # Reported in the same dict shape as the other error path below, so
        # callers that read response["content"] do not fail on a bare string.
        return state, as_response(f"Error: Model {model_name} not configured")

    model_config = api_config[model_name]
    api_type = model_config.get("api_type", "openai")
    # retrieve the api completion function from register
    api_completion_func = registered_api_completion[api_type]

    endpoints = model_config.get("endpoints")
    output = api_completion_func(
        # Use the actual model identifier from config, not the display name
        model=model_config.get("model", model_name),
        temperature=temperature,
        max_tokens=max_tokens,
        # Only the first configured endpoint is used.
        api_dict=endpoints[0] if endpoints else None,
        messages=messages,
    )

    if isinstance(output, dict) and "answer" in output:
        return state, as_response(output["answer"])
    return state, as_response(f"Error: Invalid response format from {api_type}")
def generate_response_async(state, temperature, max_tokens):
    """Run ``generate_response_with_completion`` on a worker thread.

    Despite the name this function is synchronous: it blocks until the worker
    finishes and returns its ``(state, response)`` result, re-raising any
    exception the worker raised.

    No event loop is created here. The work is a plain thread-pool call, and
    installing a throwaway loop that is then closed would leave a closed loop
    registered as the calling thread's current event loop.
    """
    with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
        future = executor.submit(
            generate_response_with_completion, state, temperature, max_tokens
        )
        return future.result()
async def generate_responses_parallel(state0, state1, temperature, max_tokens):
    """Generate the replies of both models concurrently.

    Each call to ``generate_response_with_completion`` runs on its own worker
    thread. An exception raised by one model does not affect the other: it is
    converted into an error response for that model.

    Args:
        state0: Chat state of the first model.
        state1: Chat state of the second model.
        temperature: Sampling temperature forwarded to both calls.
        max_tokens: Token limit forwarded to both calls.

    Returns:
        ``(result0, result1)``, each a ``(state, response)`` pair.
    """
    # Inside a coroutine the running loop is the one to schedule work on.
    loop = asyncio.get_running_loop()

    def error_result(state, exc):
        """Build the (state, response) pair reporting ``exc`` for one model."""
        return state, {"content": f"Error: {str(exc)}", "interaction": []}

    # Run both model generations in parallel
    with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
        future0 = loop.run_in_executor(
            executor, generate_response_with_completion, state0, temperature, max_tokens
        )
        future1 = loop.run_in_executor(
            executor, generate_response_with_completion, state1, temperature, max_tokens
        )

        try:
            # return_exceptions=True delivers a failure as that model's result
            # instead of raising out of gather.
            result0, result1 = await asyncio.gather(
                future0, future1, return_exceptions=True
            )
            if isinstance(result0, Exception):
                result0 = error_result(state0, result0)
            if isinstance(result1, Exception):
                result1 = error_result(state1, result1)
        except Exception:
            # Fallback to sequential processing
            result0 = generate_response_with_completion(state0, temperature, max_tokens)
            result1 = generate_response_with_completion(state1, temperature, max_tokens)

    return result0, result1
def extract_and_execute_code(message, sandbox_state):
    """Extract runnable code from a model reply and stage it in the sandbox state.

    Nothing is executed here. When code is found, ``sandbox_state`` is updated
    in place with the code, its install command and the selected environment.

    Args:
        message: Reply text to scan.
        sandbox_state: Sandbox state dict to update.

    Returns:
        ``(sandbox_state, code, environment)`` where ``environment`` is the
        selected environment converted with ``str``. ``code`` and
        ``environment`` are empty strings when ``message`` is empty or holds
        no extractable code.
    """
    nothing_found = (sandbox_state, "", "")
    if not message:
        return nothing_found

    # Extract code using the same logic as code_runner.py
    extracted = extract_code_from_markdown(message=message, enable_auto_env=True)
    if extracted is None:
        return nothing_found

    # The second element (the code language) is not needed here.
    code, _code_language, env_selection, install_command = extracted

    sandbox_state['code_to_execute'] = code
    sandbox_state['install_command'] = install_command
    sandbox_state['auto_selected_sandbox_environment'] = env_selection

    return sandbox_state, code, str(env_selection)
def _normalize_completion_response(response):
    """Return a completion result as a ``{"content", "interaction"}`` dict."""
    if isinstance(response, dict):
        return response
    # Some paths of generate_response_with_completion hand back a bare string.
    return {"content": response or "", "interaction": []}


def _build_sandbox_component_update(sandbox_url, key_prefix):
    """Return the Gradio update that shows ``sandbox_url`` in a sandbox component."""
    return gr.update(
        value=(sandbox_url, True, []),
        visible=True,
        # Unique key to force refresh
        key=f"{key_prefix}_{int(time.time() * 1000)}",
    )


def _render_sandbox_view(sandbox_output, sandbox_error):
    """Build the sandbox view text: collapsed errors (if any) above the output."""
    view = ""
    if sandbox_output:
        view += sandbox_output
    if sandbox_error:
        view = (
            "<details closed><summary><strong>🚨 Errors/Warnings</strong></summary>"
            f"\n\n```\n{sandbox_error}\n```\n\n</details>\n\n"
        ) + view
    return view


def _format_conversation_stats(state):
    """Return the turn/message summary line shown under a chat panel."""
    if not state:
        return "**Conversation:** 0 turns | **Total Messages:** 0"
    messages = state["messages"]
    # A turn is an assistant message with non-empty content.
    turn_count = sum(
        1 for msg in messages if msg["role"] == "assistant" and msg["content"]
    )
    return f"**Conversation:** {turn_count} turns | **Total Messages:** {len(messages)}"


def add_text_and_generate(state0, state1, text, temperature, max_tokens, model_a, model_b):
    """Send ``text`` to both models, then extract and run the code they return.

    Args:
        state0: Chat state of model A, or None to start a new one.
        state1: Chat state of model B, or None to start a new one.
        text: The user's message. Blank input is a no-op.
        temperature: Sampling temperature for both models.
        max_tokens: Token limit for both models.
        model_a: Model name used when ``state0`` has to be created.
        model_b: Model name used when ``state1`` has to be created.

    Returns:
        A 22-tuple: ``state0, state1, chat0, chat1, response0, response1,
        code0, code1, env0, env1, sandbox_state0, sandbox_state1,
        sandbox_output0, sandbox_output1, sandbox_component_update0,
        sandbox_component_update1, chat_stats_a, chat_stats_b,
        sandbox_view_a, sandbox_view_b, install_command0, install_command1``.
    """
    if not text.strip():
        # Same arity as the normal return. The first 14 values are what this
        # path has always returned; the remaining 8 are no-op updates.
        # NOTE(review): assumes the caller wires all 22 values to Gradio
        # outputs - confirm against the event handler registration.
        return (
            state0, state1,
            "", "", "", "", "", "", "", "", "", "", "", "",
            *(gr.update() for _ in range(8)),
        )

    # Initialize states if needed
    if state0 is None:
        state0 = create_chat_state(model_a)
    if state1 is None:
        state1 = create_chat_state(model_b)

    for chat_state in (state0, state1):
        chat_state["messages"].append({"role": "user", "content": text})
        # Mark that generation is starting - this will be used to hide vote buttons
        chat_state["generating"] = True

    # Generate responses in parallel
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    try:
        result0, result1 = loop.run_until_complete(
            generate_responses_parallel(state0, state1, temperature, max_tokens)
        )
        state0, response0 = result0
        state1, response1 = result1
    except Exception:
        # Fallback to sequential processing
        state0, response0 = generate_response_with_completion(state0, temperature, max_tokens)
        state1, response1 = generate_response_with_completion(state1, temperature, max_tokens)
    finally:
        loop.close()

    # A result may be a bare string (e.g. "Error: Model ... not configured");
    # normalize so the ["content"] lookups below cannot raise TypeError.
    response0 = _normalize_completion_response(response0)
    response1 = _normalize_completion_response(response1)

    # Add the assistant responses to the message history
    state0["messages"].append({"role": "assistant", "content": response0["content"]})
    state1["messages"].append({"role": "assistant", "content": response1["content"]})

    # Format chat history for display
    chat0 = format_chat_history(state0["messages"])
    chat1 = format_chat_history(state1["messages"])

    # Extract code from responses for sandbox
    sandbox_state0 = state0.get("sandbox_state", create_sandbox_state())
    sandbox_state1 = state1.get("sandbox_state", create_sandbox_state())
    sandbox_state0, code0, env0 = extract_and_execute_code(response0["content"], sandbox_state0)
    sandbox_state1, code1, env1 = extract_and_execute_code(response1["content"], sandbox_state1)

    # Update sandbox states in the main states and mark generation as complete
    for chat_state, sandbox_state in ((state0, sandbox_state0), (state1, sandbox_state1)):
        chat_state["sandbox_state"] = sandbox_state
        chat_state["has_output"] = True
        chat_state["generating"] = False

    # Start from cleared sandbox outputs, components and views; they stay
    # cleared for a model whose reply contained no code.
    sandbox_output0 = ""
    sandbox_output1 = ""
    sandbox_component_update0 = gr.update(value=("", False, []), visible=False)
    sandbox_component_update1 = gr.update(value=("", False, []), visible=False)
    sandbox_view_a = ""
    sandbox_view_b = ""

    has_code0 = bool(code0.strip())
    has_code1 = bool(code1.strip())
    if has_code0 or has_code1:
        install_command0 = sandbox_state0.get('install_command', "") if has_code0 else ""
        install_command1 = sandbox_state1.get('install_command', "") if has_code1 else ""
        sandbox_url0 = sandbox_error0 = None
        sandbox_url1 = sandbox_error1 = None
        key_tag = ""

        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        try:
            # Run both sandbox executions in parallel
            result0, result1 = loop.run_until_complete(
                run_sandboxes_parallel(
                    sandbox_state0, code0, install_command0,
                    sandbox_state1, code1, install_command1
                )
            )
            if has_code0:
                sandbox_url0, sandbox_output0, sandbox_error0 = result0
            if has_code1:
                sandbox_url1, sandbox_output1, sandbox_error1 = result1
        except Exception:
            # Fallback to sequential processing
            key_tag = "_fallback"
            if has_code0:
                sandbox_url0, sandbox_output0, sandbox_error0 = run_sandbox_code(
                    sandbox_state0, code0, install_command0
                )
            if has_code1:
                sandbox_url1, sandbox_output1, sandbox_error1 = run_sandbox_code(
                    sandbox_state1, code1, install_command1
                )
        finally:
            loop.close()

        # Views are built once, after whichever path succeeded, so both models
        # are formatted identically and a partial parallel failure cannot
        # leave duplicated output behind.
        if has_code0:
            if sandbox_url0:
                sandbox_component_update0 = _build_sandbox_component_update(
                    sandbox_url0, f"sandbox_a{key_tag}"
                )
            sandbox_view_a = _render_sandbox_view(sandbox_output0, sandbox_error0)
        if has_code1:
            if sandbox_url1:
                sandbox_component_update1 = _build_sandbox_component_update(
                    sandbox_url1, f"sandbox_b{key_tag}"
                )
            sandbox_view_b = _render_sandbox_view(sandbox_output1, sandbox_error1)

    # Format conversation statistics
    chat_stats_a = _format_conversation_stats(state0)
    chat_stats_b = _format_conversation_stats(state1)

    # Get install commands from sandbox states
    install_command0 = sandbox_state0.get('install_command', '') if sandbox_state0 else ''
    install_command1 = sandbox_state1.get('install_command', '') if sandbox_state1 else ''

    return (
        state0, state1,
        chat0, chat1,
        response0, response1,
        code0, code1,
        env0, env1,
        sandbox_state0, sandbox_state1,
        sandbox_output0, sandbox_output1,
        sandbox_component_update0, sandbox_component_update1,
        chat_stats_a, chat_stats_b,
        sandbox_view_a, sandbox_view_b,
        install_command0, install_command1,
    )
def format_chat_history(messages):
    """Reduce a message history to the entries shown in the chat display.

    Only user and assistant messages with non-empty content are kept. Each is
    copied into a new dict holding just ``role`` and ``content``, in the
    original order.
    """
    displayed_roles = ("user", "assistant")
    return [
        {"role": entry["role"], "content": entry["content"]}
        for entry in messages
        if entry["role"] in displayed_roles and entry["content"]
    ]
def clear_chat(state0, state1):
    """Reset both conversations and return blank values for the arena UI.

    For each chat state that carries a ``sandbox_state``, the sandbox state is
    reset and the state's ``interactions`` list and ``generating`` flag are
    cleared. A new pair of model names is drawn with ``get_random_models`` and
    printed.

    Returns:
        A 30-item tuple of cleared output values. The order follows the
        segment comments below and presumably mirrors the ``outputs`` list of
        the clear handler -- verify against the wiring.
    """
    for chat_state in (state0, state1):
        if chat_state and "sandbox_state" in chat_state:
            reset_sandbox_state(chat_state["sandbox_state"])
            chat_state["interactions"] = []  # Clear interactions
            chat_state["generating"] = False  # Reset generating flag

    # Draw the model names shown after the reset
    model_a, model_b = get_random_models()
    print(f"Model A: {model_a}, Model B: {model_b}")

    zero_stats = "**Conversation:** 0 turns | **Total Messages:** 0"

    return (
        (None, None)  # state0, state1
        + ("",) * 6  # chatbot_a, chatbot_b, response_a, response_b, code_a, code_b
        + (None, None)  # sandbox_state0, sandbox_state1
        + ("", "")  # sandbox_view_a, sandbox_view_b
        + (
            gr.update(value=("", False, []), visible=False),  # sandbox_component_a
            gr.update(value=("", False, []), visible=False),  # sandbox_component_b
        )
        + (zero_stats, zero_stats)  # chat_stats_a, chat_stats_b
        + ("", "")  # sandbox_view_a, sandbox_view_b (duplicate)
        + ("", "")  # install_command_a, install_command_b
        + (
            f"**Model A:** {model_a}",  # model_display_a
            f"**Model B:** {model_b}",  # model_display_b
        )
        + ("",)  # text_input
        + (
            gr.update(visible=False),  # vote_section
            gr.update(visible=False),  # vote_buttons_row
        )
        + ("",)  # vote_status
        + (
            gr.update(interactive=False),  # vote_left_btn
            gr.update(interactive=False),  # vote_right_btn
            gr.update(interactive=False),  # vote_tie_btn
            gr.update(interactive=False),  # vote_both_bad_btn
        )
    )
def retry_last_message(state0, state1, model_a, model_b):
    """Regenerate both models' answers to the most recent user prompt.

    The prompt is the content of the last ``user`` entry in ``state0``. The
    trailing assistant reply and then the trailing user message are removed
    from both histories (each only if it is actually last), and
    ``add_text_and_generate`` is called again with the same prompt. The
    temperature (0.4) and max tokens (8192) are fixed here rather than taken
    from the caller.

    Returns:
        A tuple of UI output values in the order annotated on the return
        statement. When either state is missing/empty, or no non-empty user
        prompt is found, the states are returned untouched followed by
        empty-string placeholders.
    """
    # NOTE(review): confirm this placeholder tuple has as many items as the
    # handler's outputs; its length is not derived from the full return below.
    nothing_to_retry = (state0, state1, "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "")

    if not (state0 and state1):
        return nothing_to_retry

    # Most recent user prompt, searching from the end of Model A's history
    last_user_message = next(
        (
            entry["content"]
            for entry in reversed(state0["messages"])
            if entry["role"] == "user"
        ),
        "",
    )
    if not last_user_message:
        return nothing_to_retry

    # Drop the last assistant reply, then the last user message, on both sides
    for chat_state in (state0, state1):
        for trailing_role in ("assistant", "user"):
            history = chat_state["messages"]
            if history and history[-1]["role"] == trailing_role:
                history.pop()

    # Generate fresh responses for the same prompt
    result = add_text_and_generate(state0, state1, last_user_message, 0.4, 8192, model_a, model_b)
    new_state0, new_state1 = result[0], result[1]

    # Vote controls are shown only when the helper says both sides are ready
    show_vote_buttons = should_show_vote_buttons(new_state0, new_state1)

    def as_text(value):
        # Responses may arrive as a message dict or as plain text
        return value["content"] if isinstance(value, dict) else value

    def pick(index):
        # Tolerate a shorter result tuple by falling back to an empty string
        return result[index] if len(result) > index else ""

    return (
        new_state0,  # state0
        new_state1,  # state1
        result[2],  # chatbot_a (chat0)
        result[3],  # chatbot_b (chat1)
        as_text(result[4]),  # response_a (response0)
        as_text(result[5]),  # response_b (response1)
        result[6],  # code_a (code0)
        result[7],  # code_b (code1)
        pick(10),  # sandbox_state0
        pick(11),  # sandbox_state1
        pick(12),  # sandbox_output0
        pick(13),  # sandbox_output1
        result[14] if len(result) > 14 else gr.update(visible=False),  # sandbox_component_update0
        result[15] if len(result) > 15 else gr.update(visible=False),  # sandbox_component_update1
        result[16] if len(result) > 16 else "**Conversation:** 0 turns",  # chat_stats_a
        result[17] if len(result) > 17 else "**Conversation:** 0 turns",  # chat_stats_b
        pick(18),  # sandbox_view_a
        pick(19),  # sandbox_view_b
        new_state0,  # state0_var
        new_state1,  # state1_var
        last_user_message,  # Keep original text input
        f"**Model A:** {model_a}",  # Update model display A
        f"**Model B:** {model_b}",  # Update model display B
        gr.update(visible=show_vote_buttons),  # vote_section
        gr.update(visible=show_vote_buttons),  # vote_buttons_row
        gr.update(visible=False),  # vote_status
        gr.update(interactive=show_vote_buttons),  # vote_left_btn
        gr.update(interactive=show_vote_buttons),  # vote_right_btn
        gr.update(interactive=show_vote_buttons),  # vote_tie_btn
        gr.update(interactive=show_vote_buttons),  # vote_both_bad_btn
    )
def send_to_left_only(state0, state1, text, temperature, max_tokens, model_a, model_b):
    """Send ``text`` to the left model (Model A) only and refresh its outputs.

    The prompt and the generated reply are appended to ``state0``; ``state1``
    is created when missing but otherwise left untouched. Code extracted from
    the reply is run with ``run_sandbox_code`` and the resulting URL, output
    and errors become the left sandbox outputs. The right-hand response, code
    and sandbox outputs are returned blank, and the vote controls stay hidden
    because only one model answered.

    Args:
        state0: Chat state for Model A, or None to create one.
        state1: Chat state for Model B, or None to create one.
        text: User prompt; blank text returns placeholder outputs immediately.
        temperature: Passed to ``generate_response_with_completion``.
        max_tokens: Passed to ``generate_response_with_completion``.
        model_a: Model A name (new chat state and display label).
        model_b: Model B name (new chat state and display label).

    Returns:
        A tuple of UI output values in the order annotated on the return
        statement.
    """
    if not text.strip():
        # NOTE(review): confirm this placeholder tuple has as many items as
        # the full return below.
        return state0, state1, "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", ""

    # Initialize states if needed
    if state0 is None:
        state0 = create_chat_state(model_a)
    if state1 is None:
        state1 = create_chat_state(model_b)

    # Add user message to left state only
    state0["messages"].append({"role": "user", "content": text})
    state0["generating"] = True
    try:
        # Generate response for left model only
        state0, response0 = generate_response_with_completion(state0, temperature, max_tokens)
    finally:
        # Never leave the flag stuck on True if generation raises
        state0["generating"] = False
    state0["messages"].append({"role": "assistant", "content": response0["content"]})
    state0["has_output"] = True

    # Format chat history for display
    chat0 = format_chat_history(state0["messages"])
    chat1 = format_chat_history(state1["messages"]) if state1 else []

    # Extract code from response for sandbox
    sandbox_state0 = state0.get("sandbox_state", create_sandbox_state())
    sandbox_state0, code0, _env0 = extract_and_execute_code(response0["content"], sandbox_state0)
    state0["sandbox_state"] = sandbox_state0

    # Clear previous sandbox outputs
    sandbox_output0 = ""
    sandbox_component_update0 = gr.update(value=("", False, []), visible=False)
    sandbox_view_a = ""

    # Run sandbox execution if there's code
    if code0.strip():
        install_command0 = sandbox_state0.get('install_command', "")
        sandbox_url0, sandbox_output0, sandbox_error0 = run_sandbox_code(sandbox_state0, code0, install_command0)
        if sandbox_url0:
            sandbox_component_update0 = gr.update(
                value=(sandbox_url0, True, []),
                visible=True,
                key=f"sandbox_a_{int(time.time() * 1000)}"
            )
        if sandbox_output0:
            sandbox_view_a += f"# Output\n{sandbox_output0}"
        if sandbox_error0:
            sandbox_view_a = f"<details closed><summary><strong>๐จ Errors/Warnings</strong></summary>\n\n```\n{sandbox_error0.strip()}\n```\n\n</details>\n\n" + sandbox_view_a

    # Calculate conversation statistics
    turn_count_a = len([msg for msg in state0["messages"] if msg["role"] == "assistant" and msg["content"]])
    turn_count_b = len([msg for msg in state1["messages"] if msg["role"] == "assistant" and msg["content"]]) if state1 else 0
    chat_stats_a = f"**Conversation:** {turn_count_a} turns | **Total Messages:** {len(state0['messages'])}"
    chat_stats_b = f"**Conversation:** {turn_count_b} turns | **Total Messages:** {len(state1['messages']) if state1 else 0}"

    # The install command is stored in the sandbox state (run_sandbox_code
    # writes it there), so read it from there rather than from the chat state.
    install_command_a = sandbox_state0.get('install_command', '') if sandbox_state0 else ''

    # Don't show vote buttons since only one model responded
    show_vote_buttons = False

    return (
        state0,  # state0
        state1,  # state1
        chat0,  # chatbot_a
        chat1,  # chatbot_b
        (
            response0["content"] if isinstance(response0, dict) else response0
        ),  # response_a
        "",  # response_b (empty)
        code0,  # code_a
        "",  # code_b (empty)
        sandbox_state0,  # sandbox_state0
        (
            state1.get("sandbox_state", create_sandbox_state())
            if state1
            else create_sandbox_state()
        ),  # sandbox_state1
        sandbox_output0,  # sandbox_output0
        "",  # sandbox_output1 (empty)
        sandbox_component_update0,  # sandbox_component_update0
        gr.update(value=("", False, []), visible=False),  # sandbox_component_update1
        chat_stats_a,  # chat_stats_a
        chat_stats_b,  # chat_stats_b
        sandbox_view_a,  # sandbox_view_a
        "",  # sandbox_view_b (empty)
        state0,  # state0_var
        state1,  # state1_var
        install_command_a,  # state0_install_command
        # NOTE(review): the idle side still reads the chat state, which
        # presumably yields '' and matches the blanked right-hand code box.
        state1.get('install_command', ''),  # state1_install_command
        text,  # Keep original text input
        f"**Model A:** {model_a}",  # Update model display A
        f"**Model B:** {model_b}",  # Update model display B
        gr.update(visible=show_vote_buttons),  # vote_section
        gr.update(visible=show_vote_buttons),  # vote_buttons_row
        gr.update(visible=False),  # vote_status
        gr.update(interactive=show_vote_buttons),  # vote_left_btn
        gr.update(interactive=show_vote_buttons),  # vote_right_btn
        gr.update(interactive=show_vote_buttons),  # vote_tie_btn
        gr.update(interactive=show_vote_buttons),  # vote_both_bad_btn
    )
def send_to_right_only(state0, state1, text, temperature, max_tokens, model_a, model_b):
    """Send ``text`` to the right model (Model B) only and refresh its outputs.

    The prompt and the generated reply are appended to ``state1``; ``state0``
    is created when missing but otherwise left untouched. Code extracted from
    the reply is run with ``run_sandbox_code`` and the resulting URL, output
    and errors become the right sandbox outputs. The left-hand response, code
    and sandbox outputs are returned blank, and the vote controls stay hidden
    because only one model answered.

    Args:
        state0: Chat state for Model A, or None to create one.
        state1: Chat state for Model B, or None to create one.
        text: User prompt; blank text returns placeholder outputs immediately.
        temperature: Passed to ``generate_response_with_completion``.
        max_tokens: Passed to ``generate_response_with_completion``.
        model_a: Model A name (new chat state and display label).
        model_b: Model B name (new chat state and display label).

    Returns:
        A tuple of UI output values in the order annotated on the return
        statement.
    """
    if not text.strip():
        # NOTE(review): confirm this placeholder tuple has as many items as
        # the full return below.
        return state0, state1, "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", ""

    # Initialize states if needed
    if state0 is None:
        state0 = create_chat_state(model_a)
    if state1 is None:
        state1 = create_chat_state(model_b)

    # Add user message to right state only
    state1["messages"].append({"role": "user", "content": text})
    state1["generating"] = True
    try:
        # Generate response for right model only
        state1, response1 = generate_response_with_completion(state1, temperature, max_tokens)
    finally:
        # Never leave the flag stuck on True if generation raises
        state1["generating"] = False
    state1["messages"].append({"role": "assistant", "content": response1["content"]})
    state1["has_output"] = True

    # Format chat history for display
    chat0 = format_chat_history(state0["messages"]) if state0 else []
    chat1 = format_chat_history(state1["messages"])

    # Extract code from response for sandbox
    sandbox_state1 = state1.get("sandbox_state", create_sandbox_state())
    sandbox_state1, code1, _env1 = extract_and_execute_code(response1["content"], sandbox_state1)
    state1["sandbox_state"] = sandbox_state1

    # Clear previous sandbox outputs
    sandbox_output1 = ""
    sandbox_component_update1 = gr.update(value=("", False, []), visible=False)
    sandbox_view_b = ""

    # Run sandbox execution if there's code
    if code1.strip():
        install_command1 = sandbox_state1.get('install_command', "")
        sandbox_url1, sandbox_output1, sandbox_error1 = run_sandbox_code(sandbox_state1, code1, install_command1)
        if sandbox_url1:
            sandbox_component_update1 = gr.update(
                value=(sandbox_url1, True, []),
                visible=True,
                key=f"sandbox_b_{int(time.time() * 1000)}"
            )
        if sandbox_output1:
            sandbox_view_b += f"# Output\n{sandbox_output1}"
        if sandbox_error1:
            sandbox_view_b = f"<details closed><summary><strong>๐จ Errors/Warnings</strong></summary>\n\n```\n{sandbox_error1.strip()}\n```\n\n</details>\n\n" + sandbox_view_b

    # Calculate conversation statistics
    turn_count_a = len([msg for msg in state0["messages"] if msg["role"] == "assistant" and msg["content"]]) if state0 else 0
    turn_count_b = len([msg for msg in state1["messages"] if msg["role"] == "assistant" and msg["content"]])
    chat_stats_a = f"**Conversation:** {turn_count_a} turns | **Total Messages:** {len(state0['messages']) if state0 else 0}"
    chat_stats_b = f"**Conversation:** {turn_count_b} turns | **Total Messages:** {len(state1['messages'])}"

    # The install command is stored in the sandbox state (run_sandbox_code
    # writes it there), so read it from there rather than from the chat state.
    install_command_b = sandbox_state1.get('install_command', '') if sandbox_state1 else ''

    # Don't show vote buttons since only one model responded
    show_vote_buttons = False

    return (
        state0,  # state0
        state1,  # state1
        chat0,  # chatbot_a
        chat1,  # chatbot_b
        "",  # response_a (empty)
        (
            response1["content"] if isinstance(response1, dict) else response1
        ),  # response_b
        "",  # code_a (empty)
        code1,  # code_b
        (
            state0.get("sandbox_state", create_sandbox_state())
            if state0
            else create_sandbox_state()
        ),  # sandbox_state0
        sandbox_state1,  # sandbox_state1
        "",  # sandbox_output0 (empty)
        sandbox_output1,  # sandbox_output1
        gr.update(value=("", False, []), visible=False),  # sandbox_component_update0
        sandbox_component_update1,  # sandbox_component_update1
        chat_stats_a,  # chat_stats_a
        chat_stats_b,  # chat_stats_b
        "",  # sandbox_view_a (empty)
        sandbox_view_b,  # sandbox_view_b
        state0,  # state0_var
        state1,  # state1_var
        # NOTE(review): the idle side still reads the chat state, which
        # presumably yields '' and matches the blanked left-hand code box.
        state0.get('install_command', ''),  # state0_install_command
        install_command_b,  # state1_install_command
        text,  # Keep original text input
        f"**Model A:** {model_a}",  # Update model display A
        f"**Model B:** {model_b}",  # Update model display B
        gr.update(visible=show_vote_buttons),  # vote_section
        gr.update(visible=show_vote_buttons),  # vote_buttons_row
        gr.update(visible=False),  # vote_status
        gr.update(interactive=show_vote_buttons),  # vote_left_btn
        gr.update(interactive=show_vote_buttons),  # vote_right_btn
        gr.update(interactive=show_vote_buttons),  # vote_tie_btn
        gr.update(interactive=show_vote_buttons),  # vote_both_bad_btn
    )
def rerun_code_execution(state, current_code: str, current_install_command: str, model_name: str) -> tuple[dict, str, str, str]:
    """Run the code currently shown in the UI again for one model.

    The edited code and install command are written into the chat state's
    ``sandbox_state`` and executed through ``run_sandbox_code``.

    Args:
        state: Chat state expected to hold a ``sandbox_state`` entry.
        current_code: Code taken from the editor; blank code is not run.
        current_install_command: Install command taken from the UI.
        model_name: Lower-cased into the refresh key of the sandbox component.

    Returns:
        ``(state, sandbox_view, sandbox_component_update, sandbox_error)``.
        The third item is ``""`` on the early exits, ``None`` when the run
        produced no URL, and a ``gr.update`` otherwise (so it is not always a
        ``str`` as the annotation suggests).
    """
    sandbox_state = state.get("sandbox_state") if state else None
    if not sandbox_state:
        return state, "", "", ""

    if not current_code.strip():
        return state, "", "", "No code to re-run"

    # Persist what the user edited before executing it
    sandbox_state['code_to_execute'] = current_code
    sandbox_state['install_command'] = current_install_command

    url, run_output, run_error = run_sandbox_code(sandbox_state, current_code, current_install_command)

    # Collapsible error panel goes in front of the plain output
    if run_error:
        error_panel = f"<details closed><summary><strong>๐จ Errors/Warnings</strong></summary>\n\n```\n{run_error}\n```\n\n</details>\n\n"
    else:
        error_panel = ""
    sandbox_view = error_panel + (run_output if run_output else "")

    if not url:
        component_refresh = None
    else:
        component_refresh = gr.update(
            value=(url, True, []),
            visible=True,
            key=f"sandbox_{model_name.lower()}_{int(time.time() * 1000)}"  # Unique key to force refresh
        )

    return state, sandbox_view, component_refresh, run_error
def run_sandbox_code(sandbox_state: dict, code: str, install_command: str) -> tuple[str, str, str]:
    """Execute ``code`` in the sandbox selected by ``sandbox_state``.

    The environment is ``auto_selected_sandbox_environment`` when set,
    otherwise ``sandbox_environment``. ``code`` and ``install_command`` are
    recorded in ``sandbox_state`` before running, and web sandboxes also store
    the returned ``sandbox_id`` there.

    Returns:
        ``(sandbox_url, output, error)``. Web environments fill the URL and
        leave the output empty; runner environments fill the output and leave
        the URL empty. Blank code yields ``("", "", "No code to run")`` and
        any exception raised while running is returned as the error string.
    """
    if not code.strip():
        return "", "", "No code to run"

    # Update sandbox state
    sandbox_state['code_to_execute'] = code
    sandbox_state['install_command'] = install_command

    # Determine environment
    env = sandbox_state.get('auto_selected_sandbox_environment') or sandbox_state.get('sandbox_environment')
    current_id = sandbox_state.get('sandbox_id')

    try:
        # Runners returning (url, sandbox_id, stderr); some need the source
        # converted to HTML first.
        url_tuple_runners = (
            (SandboxEnvironment.HTML, run_html_sandbox, lambda src: src),
            (SandboxEnvironment.GRADIO, run_gradio_sandbox, lambda src: src),
            (SandboxEnvironment.STREAMLIT, run_streamlit_sandbox, lambda src: src),
            (SandboxEnvironment.MERMAID, run_html_sandbox, lambda src: mermaid_to_html(src, theme='light')),
            (SandboxEnvironment.JAVASCRIPT_RUNNER, run_html_sandbox, javascript_to_html),
        )
        for target, runner, prepare in url_tuple_runners:
            if env == target:
                sandbox_url, sandbox_id, stderr = runner(prepare(code), install_command, current_id)
                sandbox_state['sandbox_id'] = sandbox_id
                return sandbox_url, "", stderr

        # Runners returning a dict with sandbox_id / sandbox_url / stderr
        url_dict_runners = (
            (SandboxEnvironment.REACT, run_react_sandbox),
            (SandboxEnvironment.VUE, run_vue_sandbox),
            (SandboxEnvironment.PYGAME, run_pygame_sandbox),
        )
        for target, runner in url_dict_runners:
            if env == target:
                outcome = runner(code, install_command, current_id)
                sandbox_state['sandbox_id'] = outcome['sandbox_id']
                return outcome['sandbox_url'], "", outcome['stderr']

        # Runners returning (output, stderr); they are not given the install command
        output_runners = (
            (SandboxEnvironment.C_RUNNER, run_c_code),
            (SandboxEnvironment.CPP_RUNNER, run_cpp_code),
            (SandboxEnvironment.JAVA_RUNNER, run_java_code),
            (SandboxEnvironment.GOLANG_RUNNER, run_golang_code),
            (SandboxEnvironment.RUST_RUNNER, run_rust_code),
        )
        for target, runner in output_runners:
            if env == target:
                output, stderr = runner(code, current_id)
                return "", output, stderr

        # PYTHON_RUNNER and every unrecognized environment use the Python interpreter
        output, stderr = run_code_interpreter(code, 'python', install_command)
        return "", output, stderr
    except Exception as e:
        return "", "", str(e)
async def run_sandbox_code_async(sandbox_state: dict, code: str, install_command: str) -> tuple[str, str, str]:
    """Await ``run_sandbox_code`` without blocking the event loop.

    The blocking call runs on a worker thread from a thread pool created for
    this call; the coroutine resumes with the same
    ``(sandbox_url, output, error)`` tuple that ``run_sandbox_code`` returns.
    """
    # Inside a coroutine the running loop is the one to schedule on;
    # get_running_loop() is the supported way to obtain it.
    loop = asyncio.get_running_loop()
    with concurrent.futures.ThreadPoolExecutor() as executor:
        return await loop.run_in_executor(
            executor, run_sandbox_code, sandbox_state, code, install_command
        )
async def run_sandboxes_parallel(sandbox_state0, code0, install_command0, sandbox_state1, code1, install_command1):
    """Run both models' sandbox code concurrently and return both results.

    Each ``run_sandbox_code`` call is submitted to the loop's default
    executor. A job that ends with an exception is reported as
    ``("", "", "Sandbox execution error: ...")`` instead of raising.

    Returns:
        ``(result0, result1)``, each a ``(sandbox_url, output, error)`` tuple.
    """
    loop = asyncio.get_running_loop()

    # Create tasks for both sandbox executions
    task0 = loop.run_in_executor(None, run_sandbox_code, sandbox_state0, code0, install_command0)
    task1 = loop.run_in_executor(None, run_sandbox_code, sandbox_state1, code1, install_command1)

    try:
        result0, result1 = await asyncio.gather(task0, task1, return_exceptions=True)
    except Exception:
        # Fallback to sequential processing
        result0 = run_sandbox_code(sandbox_state0, code0, install_command0)
        result1 = run_sandbox_code(sandbox_state1, code1, install_command1)
        return result0, result1

    # With return_exceptions=True a failure comes back as the result itself.
    # Check BaseException so a returned CancelledError (not an Exception
    # subclass on current Python versions) is also converted to an error
    # tuple rather than handed to the caller as if it were a result.
    if isinstance(result0, BaseException):
        result0 = ("", "", f"Sandbox execution error: {str(result0)}")
    if isinstance(result1, BaseException):
        result1 = ("", "", f"Sandbox execution error: {str(result1)}")

    return result0, result1
def instantiate_send_button():
    """Build the icon-only primary button that sends the prompt."""
    options = {
        "size": "lg",
        "scale": 0,
        "min_width": 60,
        "variant": "primary",
        "elem_id": "send-btn",
    }
    return gr.Button("๐", **options)
def instantiate_retry_button():
    """Build the icon-only secondary button that retries the last prompt."""
    options = {
        "size": "lg",
        "scale": 0,
        "min_width": 60,
        "variant": "secondary",
        "elem_id": "retry-btn",
    }
    return gr.Button("๐", **options)
def instantiate_send_left_button():
    """Create the icon-only button that sends the prompt to the left model.

    Returns:
        A secondary ``gr.Button`` with element id ``send-left-btn``.
    """
    return gr.Button(
        # Left-arrow emoji (U+2B05 U+FE0F). The label had been mis-decoded and
        # split over two lines, which left the string literal unterminated.
        "⬅️",
        size="lg",
        scale=0,
        min_width=60,
        variant="secondary",
        elem_id="send-left-btn"
    )
def instantiate_send_right_button():
    """Build the icon-only secondary button that sends the prompt to the right model."""
    options = {
        "size": "lg",
        "scale": 0,
        "min_width": 60,
        "variant": "secondary",
        "elem_id": "send-right-btn",
    }
    return gr.Button("โก๏ธ", **options)
def instantiate_clear_button():
    """Build the small icon-only secondary button that clears the chat."""
    options = {
        "size": "sm",
        "scale": 0,
        "min_width": 40,
        "variant": "secondary",
        "elem_id": "clear-btn",
    }
    return gr.Button("๐๏ธ", **options)
def build_ui():
"""Build a UI for the coding arena with integrated sandbox"""
# Get random models for this session
model_a, model_b = get_random_models()
print(f"Model A: {model_a}, Model B: {model_b}")
with gr.Blocks(title="BigCodeArena", theme=gr.themes.Soft()) as demo:
# Add custom CSS for centering and button styling
demo.css = """
.center-text {
text-align: center !important;
}
.input-row {
display: flex;
align-items: center;
gap: 12px;
}
.input-row .gr-textbox {
flex: 1;
}
.input-row .gr-button {
flex-shrink: 0;
height: 40px;
font-size: 16px;
}
.button-grid {
display: flex;
flex-direction: column;
gap: 8px;
}
.button-grid .gr-row {
display: flex;
gap: 8px;
}
.button-grid .gr-button {
flex: 1;
min-width: 60px;
}
"""
gr.Markdown("# ๐ธ BigCodeArena - Start Your Vibe Coding!", elem_classes="center-text")
# Main tabs
with gr.Tabs():
# Arena Tab
with gr.Tab("๐ฅ Arena", id="arena"):
# Model display (non-interactive)
with gr.Row():
with gr.Column():
model_display_a = gr.Markdown(
f"**Model A:** {model_a}", visible=False
)
with gr.Column():
model_display_b = gr.Markdown(
f"**Model B:** {model_b}", visible=False
)
# Sandbox section with tabs for each model - Collapsible and open by default
with gr.Accordion("๐๏ธ Code Execution & Sandbox", open=True):
with gr.Row():
# Model A Sandbox
with gr.Column():
gr.Markdown("### Model A Sandbox")
with gr.Tabs() as tabs_a:
with gr.Tab("View", id=0):
sandbox_view_a = gr.Markdown(
"**Sandbox output will appear here automatically**"
)
sandbox_component_a = SandboxComponent(
value=("", False, []),
label="Model A Sandbox",
visible=False,
)
with gr.Tab("Code", id=1):
code_a = gr.Code(
label="Extracted Code",
language="python",
lines=8,
interactive=True,
)
install_command_a = gr.Textbox(
label="Install Command",
placeholder="bash command to install dependencies",
interactive=True,
lines=1,
)
rerun_code_a_btn = gr.Button(
"๐ Re-run Code",
)
# Model B Sandbox
with gr.Column():
gr.Markdown("### Model B Sandbox")
with gr.Tabs() as tabs_b:
with gr.Tab("View", id=2):
sandbox_view_b = gr.Markdown(
"**Sandbox output will appear here automatically**"
)
sandbox_component_b = SandboxComponent(
value=("", False, []),
label="Model B Sandbox",
visible=False,
)
with gr.Tab("Code", id=3):
code_b = gr.Code(
label="Extracted Code",
language="python",
lines=8,
interactive=True,
)
install_command_b = gr.Textbox(
label="Install Command",
placeholder="bash command to install dependencies",
interactive=True,
lines=1,
)
rerun_code_b_btn = gr.Button(
"๐ Re-run Code",
)
# Vote UI components
vote_components = create_vote_ui()
vote_section = vote_components["vote_section"]
vote_buttons_row = vote_components["vote_buttons_row"]
vote_left_btn = vote_components["vote_left_btn"]
vote_right_btn = vote_components["vote_right_btn"]
vote_tie_btn = vote_components["vote_tie_btn"]
vote_both_bad_btn = vote_components["vote_both_bad_btn"]
vote_status = vote_components["vote_status"]
# Main chat interface - Collapsible and hidden by default
with gr.Accordion("๐ฌ Chat Interface", open=False):
with gr.Row():
with gr.Column():
gr.Markdown("## Model A")
chatbot_a = gr.Chatbot(
label="Model A",
height=300,
show_copy_button=True,
type="messages",
)
chat_stats_a = gr.Markdown("**Conversation:** 0 turns")
with gr.Column():
gr.Markdown("## Model B")
chatbot_b = gr.Chatbot(
label="Model B",
height=300,
show_copy_button=True,
type="messages",
)
chat_stats_b = gr.Markdown("**Conversation:** 0 turns")
# Input section with 2x2 button grid
with gr.Row(elem_classes="input-row"):
text_input = gr.Textbox(
label="Enter your coding prompt",
placeholder="e.g., 'Write a Python function to calculate fibonacci numbers'",
lines=1,
scale=1
)
with gr.Column(scale=0, min_width=140, elem_classes="button-grid"):
with gr.Row():
send_btn = instantiate_send_button()
retry_btn = instantiate_retry_button()
with gr.Row():
send_left_btn = instantiate_send_left_button()
send_right_btn = instantiate_send_right_button()
# Additional control buttons
with gr.Row():
clear_btn = gr.Button("๐๏ธ Clear Chat", variant="secondary")
refresh_models_btn = gr.Button(
"๐ New Random Models", variant="secondary"
)
# Advanced Settings (Collapsible)
with gr.Accordion("โ๏ธ Advanced Settings", open=False):
with gr.Row():
with gr.Column(scale=1):
temperature = gr.Slider(
minimum=0.0,
maximum=1.0,
value=0.4,
step=0.1,
label="Temperature",
)
with gr.Column(scale=1):
max_tokens = gr.Slider(
minimum=1024,
maximum=32768,
value=8192,
label="Max Tokens",
)
# Examples
gr.Examples(
examples=[
[
"ไฝฟ็จSVG็ปๅถๆฅ่ไธป้ข็ๅจๆๅพๆก๏ผๅ
ๆฌ๏ผ1๏ผไธไธช็บข่ฒ็็ฏ็ฌผ๏ผๅธฆๆ้่ฒ็ๆต่ 2๏ผไธไธช้่ฒ็็ฆๅญ๏ผไฝฟ็จไนฆๆณๅญไฝ 3๏ผ่ๆฏๆทปๅ ไธไบ็่ฑๆๆ 4๏ผๅจ็ฏ็ฌผๅ็ฆๅญๅจๅดๆทปๅ ไธไบ็ฅฅไบๅพๆกใ็กฎไฟๅพๆกๅธๅฑ็พ่ง๏ผ้ข่ฒๆญ้
็ฌฆๅๆฅ่ไผ ็ป้ฃๆ ผใ"
],
[
"SVGใไฝฟ็จใใฆๆฅๆฌใฎไผ็ตฑ็ใชๅๆใใฟใผใณใๆ็ปใใฆใใ ใใใ1๏ผๆณข็ด๏ผใใใชใฟ๏ผๆจกๆง 2๏ผๅธๆพๆจกๆง 3๏ผ้บปใฎ่ๆจกๆง 4๏ผ้ทๆ๏ผใใใใ๏ผๆจกๆงใๅซใใฆใใ ใใใ่ฒใฏไผ็ตฑ็ใชๆฅๆฌใฎ่ฒ๏ผ่่ฒใๆฑ่ฒใ้่ฒใชใฉ๏ผใไฝฟ็จใใใฌใคใขใฆใใฏใใฉใณในใใ้
็ฝฎใใฆใใ ใใใ"
],
[
"Write HTML with P5.js that simulates 25 particles in a vacuum space of a cylindrical container, bouncing within its boundaries. Use different colors for each ball and ensure they leave a trail showing their movement. Add a slow rotation of the container to give better view of what's going on in the scene. Make sure to create proper collision detection and physic rules to ensure particles remain in the container. Add an external spherical container. Add a slow zoom in and zoom out effect to the whole scene."
],
[
"Write a Python script to scrape NVIDIA's stock price for the past month using the yfinance library. Clean the data and create an interactive visualization using Matplotlib. Include: 1) A candlestick chart showing daily price movements 2) A line chart with 7-day and 30-day moving averages. Add hover tooltips showing exact values and date. Make the layout professional with proper titles and axis labels."
],
[
"Write a Python script that uses the Gradio library to create a chatbot that can have conversations with users. The chatbot should maintain conversation history, display messages in a chat interface, and respond to user inputs. Include a text input field for users to type messages and a chatbot component to display the conversation. The bot should give helpful responses and remember the context of the conversation."
],
[
"Write a Todo list app using React.js. The app should allow users to add, delete, and mark tasks as completed. Include features like filtering tasks by status (completed, active), sorting tasks by priority, and displaying the total number of tasks."
],
[
"Write a Python script using the Streamlit library to create a web application for uploading and displaying files. The app should allow users to upload files of type .csv or .txt. If a .csv file is uploaded, display its contents as a table using Streamlit's st.dataframe() method. If a .txt file is uploaded, display its content as plain text."
],
[
"Write a Python function to solve the Trapping Rain Water problem. The function should take a list of non-negative integers representing the height of bars in a histogram and return the total amount of water trapped between the bars after raining. Use an efficient algorithm with a time complexity of O(n)."
],
[
"Create a simple Pygame script for a game where the player controls a bouncing ball that changes direction when it collides with the edges of the window. Add functionality for the player to control a paddle using arrow keys, aiming to keep the ball from touching the bottom of the screen. Include basic collision detection and a scoring system that increases as the ball bounces off the paddle. You need to add clickable buttons to start the game, and reset the game."
],
[
"Create a financial management Dashboard using Vue.js, focusing on local data handling without APIs. Include features like a clean dashboard for tracking income and expenses, dynamic charts for visualizing finances, and a budget planner. Implement functionalities for adding, editing, and deleting transactions, as well as filtering by date or category. Ensure responsive design and smooth user interaction for an intuitive experience."
],
[
"Create a Mermaid diagram to visualize a flowchart of a user login process. Include the following steps: User enters login credentials; Credentials are validated; If valid, the user is directed to the dashboard; If invalid, an error message is shown, and the user can retry or reset the password."
],
[
"Write a Python function to calculate the Fibonacci sequence up to n numbers. Then write test cases to verify the function works correctly for edge cases like negative numbers, zero, and large inputs."
],
[
"Build an HTML page for a Kanban board with three columns with Vue.js: To Do, In Progress, and Done. Each column should allow adding, moving, and deleting tasks. Implement drag-and-drop functionality using Vue Draggable and persist the state using Vuex."
],
[
"Develop a Streamlit app that takes a CSV file as input and provides: 1) Basic statistics about the data 2) Interactive visualizations using Plotly 3) A data cleaning interface with options to handle missing values 4) An option to download the cleaned data."
],
[
"Write an HTML page with embedded JavaScript that creates an interactive periodic table. Each element should display its properties on hover and allow filtering by category (metals, non-metals, etc.). Include a search bar to find elements by name or symbol."
],
[
"Here's a Python function that sorts a list of dictionaries by a specified key:\n\n```python\ndef sort_dicts(data, key):\n return sorted(data, key=lambda x: x[key])\n```\n\nWrite test cases to verify the function works correctly for edge cases like empty lists, missing keys, and different data types. If you use unittest, please use `unittest.main(argv=['first-arg-is-ignored'], exit=False)` to run the tests."
],
[
"Create a React component for a fitness tracker that shows: 1) Daily step count 2) Calories burned 3) Distance walked 4) A progress bar for daily goals."
],
[
"Build a Vue.js dashboard for monitoring server health. Include: 1) Real-time CPU and memory usage graphs 2) Disk space visualization 3) Network activity monitor 4) Alerts for critical thresholds."
],
[
"Write a C program that calculates and prints the first 100 prime numbers in a formatted table with 10 numbers per row. Include a function to check if a number is prime and use it in your solution."
],
[
"Write a C++ program that implements a simple calculator using object-oriented programming. Create a Calculator class with methods for addition, subtraction, multiplication, and division. Include error handling for division by zero."
],
[
"Write a Rust program that generates and prints a Pascal's Triangle with 10 rows. Format the output to center-align the numbers in each row."
],
[
"Write a Java program that simulates a simple bank account system. Create a BankAccount class with methods for deposit, withdrawal, and balance inquiry. Include error handling for insufficient funds and demonstrate its usage with a few transactions."
],
[
"Write a Go program that calculates and prints the Fibonacci sequence up to the 50th number. Format the output in a table with 5 numbers per row and include the index of each Fibonacci number."
],
[
"Write a C program that calculates and prints a histogram of letter frequencies from a predefined string. Use ASCII art to display the histogram vertically."
],
[
"Write a C++ program that implements a simple stack data structure with push, pop, and peek operations. Demonstrate its usage by reversing a predefined string using the stack."
],
[
"Write a Rust program that calculates and prints the first 20 happy numbers. Include a function to check if a number is happy and use it in your solution."
],
[
"Write a Java program that implements a simple binary search algorithm. Create a sorted array of integers and demonstrate searching for different values, including cases where the value is found and not found."
],
[
"Write a Go program that generates and prints a multiplication table from 1 to 12. Format the output in a neat grid with proper alignment."
],
[
"Write a Python script with Gradio that applies a halftone effect to uploaded images. The app should allow users to upload an image file and convert it to a simple halftone pattern using basic image processing. Include a slider to control the dot size and display both the original and processed images. Use only built-in Python libraries and basic image manipulation techniques."
],
],
example_labels=[
"๐ฎ ๆฅ่ไธป้ขๅพๆก",
"๐ ๆฅๆฌใฎไผ็ตฑ็ใชๅๆใใฟใผใณ",
"๐ Particles in a Spherical Container",
"๐น NVIDIA Stock Analysis with Matplotlib",
"๐ฌ Chatbot with Gradio",
"๐ Todo List App with React.js",
"๐ File Upload Web App with Streamlit",
"๐ฆ Solve Trapping Rain Water Problem",
"๐ฎ Pygame Bouncing Ball Game",
"๐ณ Financial Dashboard with Vue.js",
"๐ User Login Process Flowchart",
"๐ข Fibonacci Sequence with Tests",
"๐ Vue Kanban Board",
"๐งน Streamlit Data Cleaning App",
"โ๏ธ Interactive Periodic Table with React",
"๐ Dictionary Sorting Tests in Python",
"๐๏ธโโ๏ธ Fitness Tracker with React",
"๐ฅ๏ธ Vue Server Monitoring",
"๐ข Prime Numbers in C",
"๐งฎ OOP Calculator in C++",
"๐ท Pascal's Triangle in Rust",
"๐๏ธ Bank Account Simulation in Java",
"๐ฐ Fibonacci Sequence in Go",
"๐ Letter Frequency Histogram in C",
"๐ฆ Stack Implementation in C++",
"๐ Happy Numbers in Rust",
"๐ Binary Search in Java",
"โ๏ธ Multiplication Table in Go",
"๐จ๏ธ Halftone Image Effect with Gradio",
],
examples_per_page=100,
label="Example Prompts",
inputs=[text_input],
)
# Ranking Tab
ranking_table, ranking_last_update, ranking_timer = create_ranking_tab()
# Event handlers
# Create state variables for the run buttons
state0_var = gr.State()
state1_var = gr.State()
# Add telemetry logging for user interactions after state variables are created
# We need to create a wrapper function to extract the sandbox state from the main state
def log_telemetry_a(state0, sandbox_ui):
    """Record sandbox telemetry for model A when its sandbox component changes.

    Args:
        state0: Conversation state for model A. Nothing is logged unless it is
            truthy and contains a "sandbox_state" entry.
        sandbox_ui: Current value of the sandbox component. When it holds more
            than two elements, the third one is read as the user interaction
            records.

    Returns:
        The result of log_sandbox_telemetry_gradio_fn, or None when there is
        no sandbox state to log against.
    """
    if not state0 or "sandbox_state" not in state0:
        return None

    # The third element of the component value carries the interaction records.
    records = sandbox_ui[2] if sandbox_ui and len(sandbox_ui) > 2 else None
    if records:
        # Accumulate on the state, creating the list the first time around.
        state0.setdefault("interactions", []).extend(records)

    return log_sandbox_telemetry_gradio_fn(state0["sandbox_state"], sandbox_ui)
def log_telemetry_b(state1, sandbox_ui):
    """Record sandbox telemetry for model B when its sandbox component changes.

    Args:
        state1: Conversation state for model B. Nothing is logged unless it is
            truthy and contains a "sandbox_state" entry.
        sandbox_ui: Current value of the sandbox component. When it holds more
            than two elements, the third one is read as the user interaction
            records.

    Returns:
        The result of log_sandbox_telemetry_gradio_fn, or None when there is
        no sandbox state to log against.
    """
    if not state1 or "sandbox_state" not in state1:
        return None

    # The third element of the component value carries the interaction records.
    records = sandbox_ui[2] if sandbox_ui and len(sandbox_ui) > 2 else None
    if records:
        # Accumulate on the state, creating the list the first time around.
        state1.setdefault("interactions", []).extend(records)

    return log_sandbox_telemetry_gradio_fn(state1["sandbox_state"], sandbox_ui)
sandbox_component_a.change(
fn=log_telemetry_a,
inputs=[state0_var, sandbox_component_a],
)
sandbox_component_b.change(
fn=log_telemetry_b,
inputs=[state1_var, sandbox_component_b],
)
# Create response components (hidden but needed for outputs)
response_a = gr.Markdown("", visible=False)
response_b = gr.Markdown("", visible=False)
# Create a wrapper function that handles both the main execution and state update
def send_and_update_state(state0, state1, text, temp, max_tok, model_a, model_b):
    """Send a prompt to both models and fan the result out to the UI outputs.

    Wraps add_text_and_generate and reshapes its result into the 32 values
    listed as outputs of the send/submit handlers. The vote controls are shown
    and enabled only when should_show_vote_buttons approves the new states.

    Args:
        state0: Conversation state for model A (None before the first turn,
            since the backing gr.State is created without a value).
        state1: Conversation state for model B (same remark).
        text: Prompt entered by the user.
        temp: Value of the temperature control, forwarded unchanged.
        max_tok: Value of the max-tokens control, forwarded unchanged.
        model_a: Name of model A, forwarded and echoed in the model display.
        model_b: Name of model B, forwarded and echoed in the model display.

    Returns:
        A 32-item tuple ordered like the handlers' outputs list.
    """
    result = add_text_and_generate(state0, state1, text, temp, max_tok, model_a, model_b)
    new_state0, new_state1 = result[0], result[1]

    def item(index, default=""):
        # Trailing entries may be missing from a shorter result tuple.
        return result[index] if len(result) > index else default

    def response_text(value):
        # A response may be a message dict; only its "content" is displayed.
        return value["content"] if isinstance(value, dict) else value

    def install_command(state):
        # The state can still be None, so do not call .get() on it blindly.
        return state.get('install_command', '') if state else ''

    # Vote controls become visible/enabled only when this check passes.
    show_vote_buttons = should_show_vote_buttons(new_state0, new_state1)

    # Note: indices 8 and 9 of the result are not forwarded to any output.
    return (
        new_state0,  # state0
        new_state1,  # state1
        result[2],  # chatbot_a (chat0)
        result[3],  # chatbot_b (chat1)
        response_text(result[4]),  # response_a (response0)
        response_text(result[5]),  # response_b (response1)
        result[6],  # code_a (code0)
        result[7],  # code_b (code1)
        item(10),  # sandbox_state0
        item(11),  # sandbox_state1
        item(12),  # sandbox_output0
        item(13),  # sandbox_output1
        item(14, gr.update(visible=False)),  # sandbox_component_update0
        item(15, gr.update(visible=False)),  # sandbox_component_update1
        item(16, "**Conversation:** 0 turns"),  # chat_stats_a
        item(17, "**Conversation:** 0 turns"),  # chat_stats_b
        item(18),  # sandbox_view_a
        item(19),  # sandbox_view_b
        new_state0,  # state0_var
        new_state1,  # state1_var
        install_command(new_state0),  # state0_install_command
        install_command(new_state1),  # state1_install_command
        text,  # text_input: the submitted text is written back unchanged
        f"**Model A:** {model_a}",  # Update model display A
        f"**Model B:** {model_b}",  # Update model display B
        gr.update(visible=show_vote_buttons),  # vote_section
        gr.update(visible=show_vote_buttons),  # vote_buttons_row
        gr.update(visible=False),  # vote_status
        gr.update(interactive=show_vote_buttons),  # vote_left_btn
        gr.update(interactive=show_vote_buttons),  # vote_right_btn
        gr.update(interactive=show_vote_buttons),  # vote_tie_btn
        gr.update(interactive=show_vote_buttons),  # vote_both_bad_btn
    )
send_btn.click(
fn=send_and_update_state,
inputs=[
state0_var, # state0
state1_var, # state1
text_input,
temperature,
max_tokens,
gr.State(model_a), # Use fixed model A
gr.State(model_b), # Use fixed model B
],
outputs=[
state0_var, # state0
state1_var, # state1
chatbot_a,
chatbot_b,
response_a,
response_b,
code_a,
code_b,
gr.State(), # sandbox_state0
gr.State(), # sandbox_state1
sandbox_view_a, # sandbox output for model A
sandbox_view_b, # sandbox output for model B
sandbox_component_a, # sandbox component for model A
sandbox_component_b, # sandbox component for model B
chat_stats_a, # Conversation statistics for model A
chat_stats_b, # Conversation statistics for model B
sandbox_view_a, # Sandbox view for model A
sandbox_view_b, # Sandbox view for model B
state0_var, # Updated state for run button A
state1_var, # Updated state for run button B
install_command_a, # Install command for model A
install_command_b, # Install command for model B
text_input, # Clear the text input after sending
model_display_a, # Update model display A
model_display_b, # Update model display B
vote_section, # Show/hide vote section
vote_buttons_row, # Show/hide vote buttons
vote_status, # Vote status message
vote_left_btn, # vote_left_btn
vote_right_btn, # vote_right_btn
vote_tie_btn, # vote_tie_btn
vote_both_bad_btn, # vote_both_bad_btn
],
)
# Add Enter key submission support to textbox
text_input.submit(
fn=send_and_update_state,
inputs=[
state0_var, # state0
state1_var, # state1
text_input,
temperature,
max_tokens,
gr.State(model_a), # Use fixed model A
gr.State(model_b), # Use fixed model B
],
outputs=[
state0_var, # state0
state1_var, # state1
chatbot_a,
chatbot_b,
response_a,
response_b,
code_a,
code_b,
gr.State(), # sandbox_state0
gr.State(), # sandbox_state1
sandbox_view_a, # sandbox output for model A
sandbox_view_b, # sandbox output for model B
sandbox_component_a, # sandbox component for model A
sandbox_component_b, # sandbox component for model B
chat_stats_a, # Conversation statistics for model A
chat_stats_b, # Conversation statistics for model B
sandbox_view_a, # Sandbox view for model A
sandbox_view_b, # Sandbox view for model B
state0_var, # Updated state for run button A
state1_var, # Updated state for run button B
install_command_a, # Install command for model A
install_command_b, # Install command for model B
text_input, # Clear the text input after sending
model_display_a, # Update model display A
model_display_b, # Update model display B
vote_section, # Show/hide vote section
vote_buttons_row, # Show/hide vote buttons
vote_status, # Vote status message
vote_left_btn, # vote_left_btn
vote_right_btn, # vote_right_btn
vote_tie_btn, # vote_tie_btn
vote_both_bad_btn, # vote_both_bad_btn
],
)
# Retry button handler
retry_btn.click(
fn=retry_last_message,
inputs=[
state0_var, # state0
state1_var, # state1
gr.State(model_a), # Use fixed model A
gr.State(model_b), # Use fixed model B
],
outputs=[
state0_var, # state0
state1_var, # state1
chatbot_a,
chatbot_b,
response_a,
response_b,
code_a,
code_b,
gr.State(), # sandbox_state0
gr.State(), # sandbox_state1
sandbox_view_a, # sandbox output for model A
sandbox_view_b, # sandbox output for model B
sandbox_component_a, # sandbox component for model A
sandbox_component_b, # sandbox component for model B
chat_stats_a, # Conversation statistics for model A
chat_stats_b, # Conversation statistics for model B
sandbox_view_a, # Sandbox view for model A
sandbox_view_b, # Sandbox view for model B
state0_var, # Updated state for run button A
state1_var, # Updated state for run button B
install_command_a, # Install command for model A
install_command_b, # Install command for model B
text_input, # Clear the text input after sending
model_display_a, # Update model display A
model_display_b, # Update model display B
vote_section, # Show/hide vote section
vote_buttons_row, # Show/hide vote buttons
vote_status, # Vote status message
vote_left_btn, # vote_left_btn
vote_right_btn, # vote_right_btn
vote_tie_btn, # vote_tie_btn
vote_both_bad_btn, # vote_both_bad_btn
],
)
# Send left button handler
send_left_btn.click(
fn=send_to_left_only,
inputs=[
state0_var, # state0
state1_var, # state1
text_input,
temperature,
max_tokens,
gr.State(model_a), # Use fixed model A
gr.State(model_b), # Use fixed model B
],
outputs=[
state0_var, # state0
state1_var, # state1
chatbot_a,
chatbot_b,
response_a,
response_b,
code_a,
code_b,
gr.State(), # sandbox_state0
gr.State(), # sandbox_state1
sandbox_view_a, # sandbox output for model A
sandbox_view_b, # sandbox output for model B
sandbox_component_a, # sandbox component for model A
sandbox_component_b, # sandbox component for model B
chat_stats_a, # Conversation statistics for model A
chat_stats_b, # Conversation statistics for model B
sandbox_view_a, # Sandbox view for model A
sandbox_view_b, # Sandbox view for model B
state0_var, # Updated state for run button A
state1_var, # Updated state for run button B
install_command_a, # Install command for model A
install_command_b, # Install command for model B
text_input, # Clear the text input after sending
model_display_a, # Update model display A
model_display_b, # Update model display B
vote_section, # Show/hide vote section
vote_buttons_row, # Show/hide vote buttons
vote_status, # Vote status message
vote_left_btn, # vote_left_btn
vote_right_btn, # vote_right_btn
vote_tie_btn, # vote_tie_btn
vote_both_bad_btn, # vote_both_bad_btn
],
)
# Send right button handler
send_right_btn.click(
fn=send_to_right_only,
inputs=[
state0_var, # state0
state1_var, # state1
text_input,
temperature,
max_tokens,
gr.State(model_a), # Use fixed model A
gr.State(model_b), # Use fixed model B
],
outputs=[
state0_var, # state0
state1_var, # state1
chatbot_a,
chatbot_b,
response_a,
response_b,
code_a,
code_b,
gr.State(), # sandbox_state0
gr.State(), # sandbox_state1
sandbox_view_a, # sandbox output for model A
sandbox_view_b, # sandbox output for model B
sandbox_component_a, # sandbox component for model A
sandbox_component_b, # sandbox component for model B
chat_stats_a, # Conversation statistics for model A
chat_stats_b, # Conversation statistics for model B
sandbox_view_a, # Sandbox view for model A
sandbox_view_b, # Sandbox view for model B
state0_var, # Updated state for run button A
state1_var, # Updated state for run button B
install_command_a, # Install command for model A
install_command_b, # Install command for model B
text_input, # Clear the text input after sending
model_display_a, # Update model display A
model_display_b, # Update model display B
vote_section, # Show/hide vote section
vote_buttons_row, # Show/hide vote buttons
vote_status, # Vote status message
vote_left_btn, # vote_left_btn
vote_right_btn, # vote_right_btn
vote_tie_btn, # vote_tie_btn
vote_both_bad_btn, # vote_both_bad_btn
],
)
clear_btn.click(
fn=clear_chat,
inputs=[state0_var, state1_var],
outputs=[
state0_var, # Reset state0
state1_var, # Reset state1
chatbot_a, # Clear chatbot_a
chatbot_b, # Clear chatbot_b
response_a, # Clear response_a
response_b, # Clear response_b
code_a, # Clear code_a
code_b, # Clear code_b
gr.State(None), # Reset sandbox_state0
gr.State(None), # Reset sandbox_state1
sandbox_view_a, # Clear sandbox_view_a
sandbox_view_b, # Clear sandbox_view_b
sandbox_component_a, # Hide sandbox_component_a
sandbox_component_b, # Hide sandbox_component_b
chat_stats_a, # Reset conversation statistics for model A
chat_stats_b, # Reset conversation statistics for model B
sandbox_view_a, # Reset sandbox view for model A
sandbox_view_b, # Reset sandbox view for model B
state0_var, # Updated state for run button A
state1_var, # Updated state for run button B
install_command_a, # Clear install command for model A
install_command_b, # Clear install command for model B
model_display_a, # Reset model display A
model_display_b, # Reset model display B
text_input, # Clear text input
vote_section, # Hide vote section
vote_buttons_row, # Hide vote buttons
vote_status, # Clear vote status
vote_left_btn, # Disable vote buttons
vote_right_btn, # Disable vote buttons
vote_tie_btn, # Disable vote buttons
vote_both_bad_btn, # Disable vote buttons
]
)
# Refresh models button handler
def refresh_models():
    """Draw a new random model pair and reset the arena to an empty session.

    Returns:
        A 25-item tuple ordered like the outputs of refresh_models_btn.click:
        cleared states, chats, responses, code and sandbox views, hidden
        sandbox components and vote widgets, reset conversation statistics,
        and the two model display strings naming the newly drawn models.
    """
    # NOTE(review): the send/retry handlers pass gr.State(model_a) and
    # gr.State(model_b) created at wiring time; confirm that the pair drawn
    # here actually reaches them and is not only displayed.
    new_model_a, new_model_b = get_random_models()
    empty_stats = "**Conversation:** 0 turns | **Total Messages:** 0"

    return (
        None,  # Reset state0
        None,  # Reset state1
        "",  # Clear chat A
        "",  # Clear chat B
        "",  # Clear response A
        "",  # Clear response B
        "",  # Clear code A
        "",  # Clear code B
        # State outputs take plain values; building a new gr.State component
        # per call would store the component object itself as the value.
        None,  # Reset sandbox state A
        None,  # Reset sandbox state B
        "",  # Clear sandbox view A
        "",  # Clear sandbox view B
        gr.update(visible=False),  # Hide sandbox component A
        gr.update(visible=False),  # Hide sandbox component B
        empty_stats,  # Reset stats A
        empty_stats,  # Reset stats B
        "",  # Clear sandbox view A (listed a second time in the outputs)
        "",  # Clear sandbox view B (listed a second time in the outputs)
        None,  # Reset state0_var
        None,  # Reset state1_var
        f"**Model A:** {new_model_a}",  # Update model display A
        f"**Model B:** {new_model_b}",  # Update model display B
        gr.update(visible=False),  # Hide vote section
        gr.update(visible=False),  # Hide vote buttons
        gr.update(visible=False),  # Hide vote status
    )
refresh_models_btn.click(
fn=refresh_models,
inputs=[],
outputs=[
state0_var,
state1_var,
chatbot_a,
chatbot_b,
response_a,
response_b,
code_a,
code_b,
gr.State(None),
gr.State(None),
sandbox_view_a,
sandbox_view_b,
sandbox_component_a,
sandbox_component_b,
chat_stats_a,
chat_stats_b,
sandbox_view_a,
sandbox_view_b,
state0_var,
state1_var,
model_display_a, # Update model display A
model_display_b, # Update model display B
vote_section, # Hide vote section
vote_buttons_row, # Hide vote buttons
vote_status, # Clear vote status
],
)
# Setup vote handlers
def process_vote(state0, state1, vote_type, current_text):
    """Record a vote, reveal the model names, and reset the voting controls.

    Args:
        state0: Conversation state for model A (may be None).
        state1: Conversation state for model B (may be None).
        vote_type: Vote label passed through to handle_vote; the wiring uses
            "left", "right", "tie" and "both_bad".
        current_text: Current content of the text input; not used here.

    Returns:
        A 28-item tuple ordered like the vote buttons' outputs list: the vote
        status message, no-op updates for the conversation widgets, hidden
        vote section/row, None for both states, no-op updates for the ranking
        widgets, disabled vote buttons, and an empty text input.
    """
    # Only the status message is used; the ranking values are left as they are.
    message, _ranking_update, _last_update = handle_vote(
        state0, state1, vote_type
    )

    # Names of the models that were just judged; tolerate a missing state or
    # a state without a "model_name" entry instead of failing after the vote.
    model_a = state0.get("model_name", "Unknown") if state0 else "Unknown"
    model_b = state1.get("model_name", "Unknown") if state1 else "Unknown"

    gr.Info("Thank you for your vote! ๐ Your feedback has been recorded and new models have been selected.", duration=5)
    # Reveal the model names in the info message
    gr.Info(f"Now you can see model names! ๐ \nModel A: {model_a}, Model B: {model_b}", duration=15)

    # NOTE(review): the pair returned here was never used by this function.
    # The call is kept in case get_random_models has side effects — confirm
    # and either use the result or drop the call.
    get_random_models()

    # NOTE(review): state0_var/state1_var are listed twice in the outputs
    # (once as a no-op update, once as None) — confirm which one Gradio applies.
    return (
        message,  # vote status message
        gr.update(),  # Keep state0 unchanged
        gr.update(),  # Keep state1 unchanged
        gr.update(),  # Keep chatbot_a unchanged
        gr.update(),  # Keep chatbot_b unchanged
        gr.update(),  # Keep response_a unchanged
        gr.update(),  # Keep response_b unchanged
        gr.update(),  # Keep code_a unchanged
        gr.update(),  # Keep code_b unchanged
        gr.update(),  # Keep sandbox_view_a unchanged
        gr.update(),  # Keep sandbox_view_b unchanged
        gr.update(),  # Keep sandbox_component_a unchanged
        gr.update(),  # Keep sandbox_component_b unchanged
        gr.update(),  # Keep chat_stats_a unchanged
        gr.update(),  # Keep chat_stats_b unchanged
        gr.update(),  # Keep model_display_a unchanged
        gr.update(),  # Keep model_display_b unchanged
        gr.update(visible=False),  # Hide vote_section
        gr.update(visible=False),  # Hide vote_buttons_row
        None,  # Reset state0_var
        None,  # Reset state1_var
        gr.update(),  # Keep existing ranking_table (no refresh needed)
        gr.update(),  # Keep existing ranking_last_update (no refresh needed)
        gr.update(interactive=False),  # Disable vote_left_btn
        gr.update(interactive=False),  # Disable vote_right_btn
        gr.update(interactive=False),  # Disable vote_tie_btn
        gr.update(interactive=False),  # Disable vote_both_bad_btn
        "",  # Clear text_input
    )
# Re-run code button handlers
def rerun_code_a(state0, current_code_a, current_install_command_a):
    """Execute Model A's current code again and refresh its sandbox outputs.

    Args:
        state0: Conversation state for model A.
        current_code_a: Code currently shown for model A.
        current_install_command_a: Install command currently shown for model A.

    Returns:
        A 3-tuple of (updated state, sandbox view, sandbox component update);
        the last item is gr.skip() when no component update was produced.
    """
    execution = rerun_code_execution(state0, current_code_a, current_install_command_a, "A")
    # The fourth value (an error, per the helper's return order) is not used.
    new_state, view, component_update, _error = execution
    return (
        new_state,  # state0_var
        view,  # sandbox_view_a
        component_update or gr.skip(),  # sandbox_component_a
    )
def rerun_code_b(state1, current_code_b, current_install_command_b):
    """Execute Model B's current code again and refresh its sandbox outputs.

    Args:
        state1: Conversation state for model B.
        current_code_b: Code currently shown for model B.
        current_install_command_b: Install command currently shown for model B.

    Returns:
        A 3-tuple of (updated state, sandbox view, sandbox component update);
        the last item is gr.skip() when no component update was produced.
    """
    execution = rerun_code_execution(state1, current_code_b, current_install_command_b, "B")
    # The fourth value (an error, per the helper's return order) is not used.
    new_state, view, component_update, _error = execution
    return (
        new_state,  # state1_var
        view,  # sandbox_view_b
        component_update or gr.skip(),  # sandbox_component_b
    )
def change_to_view_a():
    """Return a Tabs update selecting tab 0; runs before re-running A's code."""
    target_tab = 0
    return gr.Tabs(selected=target_tab)
def change_to_view_b():
    """Return a Tabs update selecting tab 2; runs before re-running B's code."""
    # NOTE(review): the model A counterpart selects 0 while this selects 2 —
    # confirm the tab ids of tabs_b really differ from those of tabs_a.
    target_tab = 2
    return gr.Tabs(selected=target_tab)
rerun_code_a_btn.click(
fn=change_to_view_a,
inputs=[],
outputs=[tabs_a]
).then(
fn=rerun_code_a,
inputs=[state0_var, code_a, install_command_a],
outputs=[
state0_var, # state0_var
sandbox_view_a, # sandbox_view_a
sandbox_component_a, # sandbox_component_a
]
)
rerun_code_b_btn.click(
fn=change_to_view_b,
inputs=[],
outputs=[tabs_b]
).then(
fn=rerun_code_b,
inputs=[state1_var, code_b, install_command_b],
outputs=[
state1_var, # state1_var
sandbox_view_b, # sandbox_view_b
sandbox_component_b, # sandbox_component_b
]
)
# Vote button click handlers
for vote_btn, vote_type in [
(vote_left_btn, "left"),
(vote_right_btn, "right"),
(vote_tie_btn, "tie"),
(vote_both_bad_btn, "both_bad"),
]:
vote_btn.click(
fn=process_vote,
inputs=[state0_var, state1_var, gr.State(vote_type), text_input],
outputs=[
vote_status, # vote status message
state0_var, # state0
state1_var, # state1
chatbot_a, # chatbot_a
chatbot_b, # chatbot_b
response_a, # response_a
response_b, # response_b
code_a, # code_a
code_b, # code_b
sandbox_view_a, # sandbox_view_a
sandbox_view_b, # sandbox_view_b
sandbox_component_a, # sandbox_component_a
sandbox_component_b, # sandbox_component_b
chat_stats_a, # chat_stats_a
chat_stats_b, # chat_stats_b
model_display_a, # model_display_a
model_display_b, # model_display_b
vote_section, # vote_section
vote_buttons_row, # vote_buttons_row
state0_var, # state0_var (duplicate for state management)
state1_var, # state1_var (duplicate for state management)
ranking_table, # ranking_table
ranking_last_update, # ranking_last_update
vote_left_btn, # vote_left_btn
vote_right_btn, # vote_right_btn
vote_tie_btn, # vote_tie_btn
vote_both_bad_btn, # vote_both_bad_btn
text_input, # text_input (to preserve examples)
],
)
# Setup ranking handlers
setup_ranking_handlers(demo, ranking_table, ranking_last_update, ranking_timer)
return demo
def main():
    """Entry point: build the Simple BigCodeArena UI and serve it.

    The server binds to 0.0.0.0 on port 7860 with sharing disabled and debug
    mode enabled.
    """
    # Draw a model pair for this session (the pair is not used further here).
    model_a, model_b = get_random_models()

    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": False,
        "debug": True,
    }

    app = build_ui()
    app.launch(**launch_options)


if __name__ == "__main__":
    main()
|