"""
Minimalist Issue Metadata Mining Script

Mines issue metadata from GitHub Archive via BigQuery and saves it to a HuggingFace dataset.
"""

import json
import os
import tempfile
from collections import defaultdict
from datetime import datetime, timedelta, timezone

import backoff
from dotenv import load_dotenv
from google.cloud import bigquery
from huggingface_hub import HfApi, hf_hub_download
from huggingface_hub.errors import HfHubHTTPError

load_dotenv()

# HuggingFace dataset repositories
AGENTS_REPO = "SWE-Arena/bot_metadata"                # agent metadata, one JSON file per agent
ISSUE_METADATA_REPO = "SWE-Arena/issue_metadata"      # mined issue metadata, one folder per agent
LEADERBOARD_REPO = "SWE-Arena/leaderboard_metadata"   # aggregated leaderboard + monthly metrics

# Rolling time window (in days) covered by the leaderboard
LEADERBOARD_TIME_FRAME_DAYS = 180

def is_rate_limit_error(e):
    """Check if the exception is a rate limit error (429)."""
    return isinstance(e, HfHubHTTPError) and e.response.status_code == 429


# All HuggingFace Hub calls share the same retry policy: exponential backoff on
# HTTP 429 responses, up to HF_MAX_TRIES attempts, with waits capped at one hour.
HF_MAX_TRIES = 8


def _print_backoff(details):
    print(f" ⏳ Rate limited. Retrying in {details['wait']/60:.1f} minutes "
          f"({details['wait']:.0f}s) - attempt {details['tries']}/{HF_MAX_TRIES}...")


hf_rate_limit_retry = backoff.on_exception(
    backoff.expo,
    HfHubHTTPError,
    giveup=lambda e: not is_rate_limit_error(e),
    max_tries=HF_MAX_TRIES,
    base=300,
    max_value=3600,
    jitter=backoff.full_jitter,
    on_backoff=_print_backoff,
)


@hf_rate_limit_retry
def list_repo_files_with_backoff(api, **kwargs):
    """List repo files with exponential backoff on rate limit errors."""
    return api.list_repo_files(**kwargs)


@hf_rate_limit_retry
def hf_hub_download_with_backoff(**kwargs):
    """Download from HF Hub with exponential backoff on rate limit errors."""
    return hf_hub_download(**kwargs)


@hf_rate_limit_retry
def upload_file_with_backoff(api, **kwargs):
    """Upload file with exponential backoff on rate limit errors."""
    return api.upload_file(**kwargs)


@hf_rate_limit_retry
def upload_folder_with_backoff(api, **kwargs):
    """Upload folder with exponential backoff on rate limit errors."""
    return api.upload_folder(**kwargs)

def load_jsonl(filename):
    """Load JSONL file and return list of dictionaries."""
    if not os.path.exists(filename):
        return []

    data = []
    with open(filename, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if line:
                try:
                    data.append(json.loads(line))
                except json.JSONDecodeError as e:
                    print(f"Warning: Skipping invalid JSON line: {e}")
    return data


def save_jsonl(filename, data):
    """Save list of dictionaries to JSONL file."""
    with open(filename, 'w', encoding='utf-8') as f:
        for item in data:
            f.write(json.dumps(item) + '\n')

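# Illustrative sketch only (not used by the mining pipeline): a round trip through
# save_jsonl/load_jsonl with a hypothetical record, mirroring the per-day files
# written later by save_issue_metadata_to_hf. The path and record values are made up.
def _example_jsonl_round_trip(path="example.2025.01.05.jsonl"):
    record = {
        'url': 'https://github.com/example/repo/issues/1',  # hypothetical issue URL
        'created_at': '2025-01-05T12:00:00Z',
        'closed_at': None,
        'state_reason': None,
    }
    save_jsonl(path, [record])
    return load_jsonl(path)  # -> [record]
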
def get_hf_token():
    """Get HuggingFace token from environment variables."""
    token = os.getenv('HF_TOKEN')
    if not token:
        print("Warning: HF_TOKEN not found in environment variables")
    return token

def get_bigquery_client():
    """
    Initialize BigQuery client using credentials from environment variable.

    Expects GOOGLE_APPLICATION_CREDENTIALS_JSON environment variable containing
    the service account JSON credentials as a string.
    """
    creds_json = os.environ.get('GOOGLE_APPLICATION_CREDENTIALS_JSON')
    if not creds_json:
        raise ValueError("GOOGLE_APPLICATION_CREDENTIALS_JSON not found in environment")

    # Write the credentials to a temporary file so the standard
    # GOOGLE_APPLICATION_CREDENTIALS discovery mechanism can pick them up.
    with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.json') as temp_file:
        temp_file.write(creds_json)
        temp_path = temp_file.name

    os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = temp_path

    try:
        # The client loads the credentials at construction time, so the
        # temporary file can be removed immediately afterwards.
        client = bigquery.Client()
    finally:
        if os.path.exists(temp_path):
            os.unlink(temp_path)

    return client

def generate_table_union_statements(start_date, end_date):
    """
    Generate UNION ALL statements for githubarchive.month tables in date range.

    Args:
        start_date: Start datetime
        end_date: End datetime

    Returns:
        String with UNION ALL SELECT statements for all monthly tables in range
    """
    table_names = []

    current_date = start_date.replace(day=1)
    end_month = end_date.replace(day=1)

    while current_date <= end_month:
        table_name = f"`githubarchive.month.{current_date.strftime('%Y%m')}`"
        table_names.append(table_name)

        if current_date.month == 12:
            current_date = current_date.replace(year=current_date.year + 1, month=1)
        else:
            current_date = current_date.replace(month=current_date.month + 1)

    union_parts = [f"SELECT * FROM {table}" for table in table_names]
    return " UNION ALL ".join(union_parts)

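# Illustrative sketch only (not used by the pipeline): what the generated UNION looks
# like for a hypothetical January-March 2025 window.
def _example_table_union():
    start = datetime(2025, 1, 15, tzinfo=timezone.utc)
    end = datetime(2025, 3, 10, tzinfo=timezone.utc)
    # Returns: "SELECT * FROM `githubarchive.month.202501` UNION ALL "
    #          "SELECT * FROM `githubarchive.month.202502` UNION ALL "
    #          "SELECT * FROM `githubarchive.month.202503`"
    return generate_table_union_statements(start, end)
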
def fetch_issue_metadata_batched(client, identifiers, start_date, end_date, batch_size=100, upload_immediately=True):
    """
    Fetch issue metadata for ALL agents using BATCHED BigQuery queries.

    Splits agents into smaller batches to avoid performance issues with large UNNEST arrays
    and correlated subqueries. Each batch query runs much faster than one massive query.

    Args:
        client: BigQuery client instance
        identifiers: List of GitHub usernames/bot identifiers
        start_date: Start datetime (timezone-aware)
        end_date: End datetime (timezone-aware)
        batch_size: Number of agents per batch (default: 100)
        upload_immediately: Upload results to HuggingFace immediately after each batch (default: True)

    Returns:
        Dictionary mapping agent identifier to list of issue metadata
    """
    print(f"\n🔍 Querying BigQuery for {len(identifiers)} agents using BATCHED approach")
    print(f"   Batch size: {batch_size} agents per query")
    print(f"   Upload mode: {'Immediate (after each batch)' if upload_immediately else 'Deferred (after all batches)'}")

    batches = [identifiers[i:i + batch_size] for i in range(0, len(identifiers), batch_size)]
    print(f"   Total batches: {len(batches)}")

    all_metadata = {}

    for batch_num, batch_identifiers in enumerate(batches, 1):
        print(f"\n{'─'*80}")
        print(f"📦 Processing Batch {batch_num}/{len(batches)} ({len(batch_identifiers)} agents)")
        print(f"{'─'*80}")

        try:
            batch_results = fetch_all_issue_metadata_single_query(
                client, batch_identifiers, start_date, end_date
            )

            for identifier, metadata_list in batch_results.items():
                if identifier in all_metadata:
                    all_metadata[identifier].extend(metadata_list)
                else:
                    all_metadata[identifier] = metadata_list

            print(f"   ✓ Batch {batch_num} completed: {len(batch_results)} agents with data")

            if upload_immediately and batch_results:
                print(f"\n   🤗 Uploading batch {batch_num}/{len(batches)} results to HuggingFace...")
                upload_success = 0
                upload_errors = 0

                for identifier, metadata_list in batch_results.items():
                    if metadata_list:
                        if save_issue_metadata_to_hf(metadata_list, identifier):
                            upload_success += 1
                        else:
                            upload_errors += 1

                print(f"   ✓ Batch {batch_num}/{len(batches)} upload complete ({upload_success} agents uploaded, {upload_errors} errors)")

        except Exception as e:
            print(f"   ✗ Batch {batch_num} failed: {str(e)}")
            print("   Continuing with remaining batches...")
            import traceback
            traceback.print_exc()
            continue

    print(f"\n{'='*80}")
    print("✅ All batches completed!")
    print(f"   Total agents with data: {len(all_metadata)}")
    total_issues = sum(len(issues) for issues in all_metadata.values())
    print(f"   Total issues found: {total_issues}")
    print(f"{'='*80}\n")

    return all_metadata

def fetch_all_issue_metadata_single_query(client, identifiers, start_date, end_date):
    """
    Fetch issue metadata for a batch of agents using ONE comprehensive BigQuery query.

    This query fetches IssuesEvent and IssueCommentEvent from GitHub Archive and
    deduplicates to get the latest state of each issue. Filters by issue author,
    commenter, or assignee.

    NOTE: This function is designed for smaller batches (~100 agents). For large
    numbers of agents, use fetch_issue_metadata_batched() instead.

    Args:
        client: BigQuery client instance
        identifiers: List of GitHub usernames/bot identifiers (recommended: <100)
        start_date: Start datetime (timezone-aware)
        end_date: End datetime (timezone-aware)

    Returns:
        Dictionary mapping agent identifier to list of issue metadata:
        {
            'agent-identifier': [
                {
                    'url': Issue URL,
                    'created_at': Issue creation timestamp,
                    'closed_at': Close timestamp (if closed, else None),
                    'state_reason': Reason for closure (completed/not_planned/etc.)
                },
                ...
            ],
            ...
        }
    """
    print(f"\n🔍 Querying BigQuery for {len(identifiers)} agents in SINGLE QUERY")
    print(f"   Time range: {start_date.strftime('%Y-%m-%d')} to {end_date.strftime('%Y-%m-%d')}")

    issue_tables = generate_table_union_statements(start_date, end_date)

    # Include both the '[bot]' and bare variants of each identifier, since GitHub
    # Archive events may record either form.
    identifier_set = set()
    for identifier in identifiers:
        identifier_set.add(identifier)
        stripped = identifier.replace('[bot]', '')
        if stripped != identifier:
            identifier_set.add(stripped)

    identifier_array = '[' + ', '.join([f'"{ident}"' for ident in identifier_set]) + ']'

    print(f"   Total identifiers (including bot/non-bot variants): {len(identifier_set)}")

    query = f"""
    WITH agent_identifiers AS (
        -- Create a table from the identifier array to avoid massive IN clauses
        SELECT identifier
        FROM UNNEST({identifier_array}) AS identifier
    ),

    issue_events AS (
        -- Get all issue events and comment events for ALL agents
        SELECT
            JSON_EXTRACT_SCALAR(payload, '$.issue.html_url') as url,
            JSON_EXTRACT_SCALAR(payload, '$.issue.created_at') as created_at,
            JSON_EXTRACT_SCALAR(payload, '$.issue.closed_at') as closed_at,
            JSON_EXTRACT_SCALAR(payload, '$.issue.state_reason') as state_reason,
            JSON_EXTRACT_SCALAR(payload, '$.issue.user.login') as author,
            JSON_EXTRACT_SCALAR(payload, '$.issue.assignee.login') as assignee,
            JSON_EXTRACT_SCALAR(payload, '$.comment.user.login') as commenter,
            JSON_EXTRACT_SCALAR(payload, '$.issue.number') as issue_number,
            repo.name as repo_name,
            created_at as event_time
        FROM (
            {issue_tables}
        )
        WHERE
            type IN ('IssuesEvent', 'IssueCommentEvent')
            -- Exclude pull requests (they have pull_request field)
            AND JSON_EXTRACT(payload, '$.issue.pull_request') IS NULL
            AND JSON_EXTRACT_SCALAR(payload, '$.issue.html_url') IS NOT NULL
            -- Filter by author OR commenter OR assignee
            AND (
                JSON_EXTRACT_SCALAR(payload, '$.issue.user.login') IN (SELECT identifier FROM agent_identifiers)
                OR JSON_EXTRACT_SCALAR(payload, '$.comment.user.login') IN (SELECT identifier FROM agent_identifiers)
                OR JSON_EXTRACT_SCALAR(payload, '$.issue.assignee.login') IN (SELECT identifier FROM agent_identifiers)
            )
    ),

    latest_states AS (
        -- Deduplicate to get latest state for each issue
        SELECT
            url,
            created_at,
            closed_at,
            state_reason,
            author,
            assignee,
            commenter
        FROM issue_events
        QUALIFY ROW_NUMBER() OVER (
            PARTITION BY repo_name, issue_number
            ORDER BY event_time DESC
        ) = 1
    ),

    agent_issues AS (
        -- Map each issue to its relevant agent(s)
        SELECT DISTINCT
            CASE
                WHEN author IN (SELECT identifier FROM agent_identifiers) THEN author
                WHEN commenter IN (SELECT identifier FROM agent_identifiers) THEN commenter
                WHEN assignee IN (SELECT identifier FROM agent_identifiers) THEN assignee
                ELSE NULL
            END as agent_identifier,
            url,
            created_at,
            closed_at,
            state_reason
        FROM latest_states
        WHERE
            author IN (SELECT identifier FROM agent_identifiers)
            OR commenter IN (SELECT identifier FROM agent_identifiers)
            OR assignee IN (SELECT identifier FROM agent_identifiers)
    )

    SELECT
        agent_identifier,
        url,
        created_at,
        closed_at,
        state_reason
    FROM agent_issues
    WHERE agent_identifier IS NOT NULL
    ORDER BY agent_identifier, created_at DESC
    """

    query_days = (end_date - start_date).days

    print(f"   Querying {query_days} days for issue and comment events...")
    print(f"   Agents: {', '.join(identifiers[:5])}{'...' if len(identifiers) > 5 else ''}")

    try:
        query_job = client.query(query)
        results = list(query_job.result())

        print(f"   ✓ Found {len(results)} total issue records across all agents")

        metadata_by_agent = defaultdict(list)

        for row in results:
            agent_id = row.agent_identifier

            created_at = row.created_at
            if hasattr(created_at, 'isoformat'):
                created_at = created_at.isoformat()

            closed_at = row.closed_at
            if hasattr(closed_at, 'isoformat'):
                closed_at = closed_at.isoformat()

            metadata_by_agent[agent_id].append({
                'url': row.url,
                'created_at': created_at,
                'closed_at': closed_at,
                'state_reason': row.state_reason,
            })

        # Combine the '[bot]' and bare variants back under each requested identifier
        final_metadata = {}
        for identifier in identifiers:
            combined = list(metadata_by_agent.get(identifier, []))
            stripped = identifier.replace('[bot]', '')
            if stripped != identifier and stripped in metadata_by_agent:
                combined.extend(metadata_by_agent[stripped])

            if combined:
                final_metadata[identifier] = combined

        print("\n   📊 Results breakdown by agent:")
        for identifier in identifiers:
            agent_metadata = final_metadata.get(identifier, [])
            count = len(agent_metadata)
            if count > 0:
                completed_count = sum(1 for m in agent_metadata if m['state_reason'] == 'completed')
                closed_count = sum(1 for m in agent_metadata if m['closed_at'] is not None)
                open_count = count - closed_count
                print(f"   {identifier}: {count} issues ({completed_count} completed, {closed_count} closed, {open_count} open)")

        return final_metadata

    except Exception as e:
        print(f"   ✗ BigQuery error: {str(e)}")
        import traceback
        traceback.print_exc()
        return {}

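# Illustrative sketch only (not used by the pipeline): the QUALIFY ROW_NUMBER()
# deduplication above keeps, for every (repo, issue number) pair, the event with the
# most recent event_time. The same idea in plain Python, over hypothetical rows:
def _example_latest_state_dedup():
    events = [
        {'repo_name': 'org/repo', 'issue_number': '7', 'event_time': '2025-01-01T00:00:00Z', 'closed_at': None},
        {'repo_name': 'org/repo', 'issue_number': '7', 'event_time': '2025-01-03T00:00:00Z', 'closed_at': '2025-01-03T00:00:00Z'},
    ]
    latest = {}
    for event in events:
        key = (event['repo_name'], event['issue_number'])
        if key not in latest or event['event_time'] > latest[key]['event_time']:
            latest[key] = event
    # -> the second event wins, so the issue is reported as closed
    return list(latest.values())
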
def group_metadata_by_date(metadata_list):
    """
    Group issue metadata by exact date (year.month.day) for efficient daily storage.

    Returns dict: {(year, month, day): [metadata_list]}
    """
    grouped = defaultdict(list)

    for issue_meta in metadata_list:
        created_at = issue_meta.get('created_at')
        if not created_at:
            continue

        try:
            dt = datetime.fromisoformat(created_at.replace('Z', '+00:00'))
            key = (dt.year, dt.month, dt.day)
            grouped[key].append(issue_meta)
        except Exception as e:
            print(f"Warning: Could not parse date '{created_at}': {e}")

    return dict(grouped)

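# Illustrative sketch only (not used by the pipeline): two hypothetical issues created
# on different days end up under separate (year, month, day) keys, which later become
# separate YYYY.MM.DD.jsonl files.
def _example_group_by_date():
    metadata = [
        {'url': 'https://github.com/example/repo/issues/1', 'created_at': '2025-01-05T12:00:00Z'},
        {'url': 'https://github.com/example/repo/issues/2', 'created_at': '2025-01-06T08:30:00Z'},
    ]
    # -> {(2025, 1, 5): [issue 1], (2025, 1, 6): [issue 2]}
    return group_metadata_by_date(metadata)
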
def save_issue_metadata_to_hf(metadata_list, agent_identifier):
    """
    Save issue metadata to HuggingFace dataset, organized as [agent_identifier]/YYYY.MM.DD.jsonl.
    Each file is stored in the agent's folder and named YYYY.MM.DD.jsonl for that day's issues.

    This function OVERWRITES existing files completely with fresh data from BigQuery.
    Uses upload_folder for single-commit batch uploads (avoids rate limit issues).

    Args:
        metadata_list: List of issue metadata dictionaries
        agent_identifier: GitHub identifier of the agent (used as folder name)
    """
    import shutil

    try:
        token = get_hf_token()
        if not token:
            raise Exception("No HuggingFace token found")

        api = HfApi(token=token)

        grouped = group_metadata_by_date(metadata_list)

        if not grouped:
            print(f"   No valid metadata to save for {agent_identifier}")
            return False

        temp_dir = tempfile.mkdtemp()
        agent_folder = os.path.join(temp_dir, agent_identifier)
        os.makedirs(agent_folder, exist_ok=True)

        try:
            print(f"   📦 Preparing batch upload for {len(grouped)} daily files...")

            for (issue_year, month, day), day_metadata in grouped.items():
                filename = f"{agent_identifier}/{issue_year}.{month:02d}.{day:02d}.jsonl"
                local_filename = os.path.join(agent_folder, f"{issue_year}.{month:02d}.{day:02d}.jsonl")

                # Newest issues first within each daily file
                day_metadata.sort(key=lambda x: x.get('created_at', ''), reverse=True)

                save_jsonl(local_filename, day_metadata)
                print(f"   Prepared {len(day_metadata)} issues for {filename}")

            print(f"   🤗 Uploading {len(grouped)} files ({len(metadata_list)} total issues)...")
            upload_folder_with_backoff(
                api,
                folder_path=temp_dir,
                repo_id=ISSUE_METADATA_REPO,
                repo_type="dataset",
                commit_message=f"Update issue metadata for {agent_identifier} - {datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S')} UTC"
            )
            print(f"   ✓ Batch upload complete for {agent_identifier}")

            return True

        finally:
            if os.path.exists(temp_dir):
                shutil.rmtree(temp_dir)

    except Exception as e:
        print(f"   ✗ Error saving issue metadata: {str(e)}")
        import traceback
        traceback.print_exc()
        return False

def load_agents_from_hf():
    """
    Load all agent metadata JSON files from HuggingFace dataset.

    The github_identifier is extracted from the filename (e.g., 'agent-name[bot].json' -> 'agent-name[bot]')
    """
    try:
        api = HfApi()
        agents = []

        files = list_repo_files_with_backoff(api, repo_id=AGENTS_REPO, repo_type="dataset")

        json_files = [f for f in files if f.endswith('.json')]

        print(f"Found {len(json_files)} agent files in {AGENTS_REPO}")

        for json_file in json_files:
            try:
                file_path = hf_hub_download_with_backoff(
                    repo_id=AGENTS_REPO,
                    filename=json_file,
                    repo_type="dataset"
                )

                with open(file_path, 'r') as f:
                    agent_data = json.load(f)

                if agent_data.get('status') != 'public':
                    continue

                github_identifier = json_file.replace('.json', '')
                agent_data['github_identifier'] = github_identifier

                agents.append(agent_data)

            except Exception as e:
                print(f"Warning: Could not load {json_file}: {str(e)}")
                continue

        print(f"✓ Loaded {len(agents)} agents from HuggingFace")
        return agents

    except Exception as e:
        print(f"Could not load agents from HuggingFace: {str(e)}")
        return []

def calculate_issue_stats_from_metadata(metadata_list):
    """
    Calculate statistics from a list of issue metadata.

    Returns:
        dict: Issue statistics including total, closed, resolved counts and rate
    """
    total_issues = len(metadata_list)

    closed_issues = sum(1 for issue_meta in metadata_list
                        if issue_meta.get('closed_at') is not None)

    completed = sum(1 for issue_meta in metadata_list
                    if issue_meta.get('state_reason') == 'completed')

    # Resolved rate is measured against closed issues, not all issues
    resolved_rate = (completed / closed_issues * 100) if closed_issues > 0 else 0

    return {
        'total_issues': total_issues,
        'closed_issues': closed_issues,
        'resolved_issues': completed,
        'resolved_rate': round(resolved_rate, 2),
    }

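# Illustrative sketch only (not used by the pipeline): with hypothetical metadata for
# three issues (two closed, one of them 'completed', and one still open), the stats
# come out as 3 total, 2 closed, 1 resolved, and a resolved_rate of 1 / 2 * 100 = 50.0.
def _example_issue_stats():
    metadata = [
        {'closed_at': '2025-01-06T09:00:00Z', 'state_reason': 'completed'},
        {'closed_at': '2025-01-07T10:00:00Z', 'state_reason': 'not_planned'},
        {'closed_at': None, 'state_reason': None},
    ]
    # -> {'total_issues': 3, 'closed_issues': 2, 'resolved_issues': 1, 'resolved_rate': 50.0}
    return calculate_issue_stats_from_metadata(metadata)
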
def calculate_monthly_metrics(all_metadata, agents):
    """
    Calculate monthly metrics for all agents for visualization.

    Args:
        all_metadata: Dictionary mapping agent_identifier to list of issue metadata
        agents: List of agent dictionaries with metadata

    Returns:
        dict: {
            'agents': list of agent names,
            'months': list of month labels (e.g., '2025-01'),
            'data': {
                agent_name: {
                    'resolved_rates': list of resolved rates by month,
                    'total_issues': list of issue counts by month,
                    'resolved_issues': list of resolved issue counts by month
                }
            }
        }
    """
    # Map GitHub identifiers to display names, falling back to the identifier itself
    identifier_to_name = {
        agent.get('github_identifier'): agent.get('name', agent.get('github_identifier'))
        for agent in agents if agent.get('github_identifier')
    }

    # Bucket each issue under its agent's display name and creation month
    agent_month_data = defaultdict(lambda: defaultdict(list))

    for identifier, metadata_list in all_metadata.items():
        agent_name = identifier_to_name.get(identifier, identifier)

        for issue_meta in metadata_list:
            created_at = issue_meta.get('created_at')
            if not created_at:
                continue

            try:
                dt = datetime.fromisoformat(created_at.replace('Z', '+00:00'))
                month_key = f"{dt.year}-{dt.month:02d}"
                agent_month_data[agent_name][month_key].append(issue_meta)
            except Exception as e:
                print(f"Warning: Could not parse date '{created_at}': {e}")
                continue

    # Collect the sorted union of all months observed across agents
    all_months = set()
    for agent_data in agent_month_data.values():
        all_months.update(agent_data.keys())
    months = sorted(all_months)

    # Build the per-agent monthly series, aligned to the shared month list
    result_data = {}
    for agent_name, month_dict in agent_month_data.items():
        resolved_rates = []
        total_issues_list = []
        resolved_issues_list = []

        for month in months:
            issues_in_month = month_dict.get(month, [])

            completed_count = sum(1 for issue in issues_in_month if issue.get('state_reason') == 'completed')
            closed_count = sum(1 for issue in issues_in_month if issue.get('closed_at') is not None)
            total_count = len(issues_in_month)

            # No resolved rate for months in which nothing was closed
            resolved_rate = (completed_count / closed_count * 100) if closed_count > 0 else None

            resolved_rates.append(resolved_rate)
            total_issues_list.append(total_count)
            resolved_issues_list.append(completed_count)

        result_data[agent_name] = {
            'resolved_rates': resolved_rates,
            'total_issues': total_issues_list,
            'resolved_issues': resolved_issues_list
        }

    agents_list = sorted(agent_month_data.keys())

    return {
        'agents': agents_list,
        'months': months,
        'data': result_data
    }

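# Illustrative sketch only (not used by the pipeline): one hypothetical agent with two
# issues created in January 2025, one of which was closed as 'completed'.
def _example_monthly_metrics():
    all_metadata = {
        'example-agent[bot]': [
            {'created_at': '2025-01-05T12:00:00Z', 'closed_at': '2025-01-06T09:00:00Z', 'state_reason': 'completed'},
            {'created_at': '2025-01-20T12:00:00Z', 'closed_at': None, 'state_reason': None},
        ]
    }
    agents = [{'github_identifier': 'example-agent[bot]', 'name': 'Example Agent'}]
    # -> {'agents': ['Example Agent'], 'months': ['2025-01'],
    #     'data': {'Example Agent': {'resolved_rates': [100.0], 'total_issues': [2], 'resolved_issues': [1]}}}
    return calculate_monthly_metrics(all_metadata, agents)
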
def save_leaderboard_and_metrics_to_hf(all_metadata, agents):
    """
    Creates a comprehensive JSON file with both leaderboard stats and monthly metrics.
    If the file exists, it will be overwritten.

    Args:
        all_metadata: Dictionary mapping agent_identifier to list of issue metadata
        agents: List of agent dictionaries with metadata

    Returns:
        bool: True if successful, False otherwise
    """
    import io

    try:
        token = get_hf_token()
        if not token:
            raise Exception("No HuggingFace token found")

        api = HfApi(token=token)

        print(f"\n{'='*80}")
        print("📊 Preparing leaderboard and metrics data for upload...")
        print(f"{'='*80}\n")

        print("   Constructing leaderboard data...")
        leaderboard_data = {}

        for agent in agents:
            identifier = agent.get('github_identifier')
            agent_name = agent.get('name', 'Unknown')

            if not identifier:
                continue

            metadata = all_metadata.get(identifier, [])
            stats = calculate_issue_stats_from_metadata(metadata)

            leaderboard_data[identifier] = {
                'name': agent_name,
                'website': agent.get('website', 'N/A'),
                'github_identifier': identifier,
                **stats
            }

        print("   Calculating monthly metrics...")
        monthly_metrics = calculate_monthly_metrics(all_metadata, agents)

        combined_data = {
            "leaderboard": leaderboard_data,
            "monthly_metrics": monthly_metrics,
            "metadata": {
                "last_updated": datetime.now(timezone.utc).isoformat(),
                "time_frame_days": LEADERBOARD_TIME_FRAME_DAYS,
                "total_agents": len(leaderboard_data)
            }
        }

        print(f"   Leaderboard entries: {len(leaderboard_data)}")
        print(f"   Monthly metrics for: {len(monthly_metrics['agents'])} agents")
        print(f"   Time frame: {LEADERBOARD_TIME_FRAME_DAYS} days")

        json_content = json.dumps(combined_data, indent=2)
        file_like_object = io.BytesIO(json_content.encode('utf-8'))

        print(f"\n🤗 Uploading to {LEADERBOARD_REPO}...")
        upload_file_with_backoff(
            api,
            path_or_fileobj=file_like_object,
            path_in_repo="swe-issue.json",
            repo_id=LEADERBOARD_REPO,
            repo_type="dataset",
            token=token,
            commit_message=f"Update leaderboard data - {datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S')} UTC"
        )

        print("   ✓ Successfully uploaded swe-issue.json")
        print(f"{'='*80}\n")

        return True

    except Exception as e:
        print(f"✗ Error saving leaderboard and metrics: {str(e)}")
        import traceback
        traceback.print_exc()
        return False

def mine_all_agents():
    """
    Mine issue metadata for all agents within LEADERBOARD_TIME_FRAME_DAYS and save to HuggingFace.

    Runs batched BigQuery queries across all agents (see fetch_issue_metadata_batched).
    """
    agents = load_agents_from_hf()
    if not agents:
        print("No agents found in HuggingFace dataset")
        return

    identifiers = [agent['github_identifier'] for agent in agents if agent.get('github_identifier')]
    if not identifiers:
        print("No valid agent identifiers found")
        return

    print(f"\n{'='*80}")
    print(f"Starting issue metadata mining for {len(identifiers)} agents")
    print(f"Time frame: Last {LEADERBOARD_TIME_FRAME_DAYS} days")
    print("Data source: BigQuery + GitHub Archive (BATCHED QUERIES)")
    print(f"{'='*80}\n")

    try:
        client = get_bigquery_client()
    except Exception as e:
        print(f"✗ Failed to initialize BigQuery client: {str(e)}")
        return

    # Query window: the last LEADERBOARD_TIME_FRAME_DAYS full days, ending at today's midnight UTC
    current_time = datetime.now(timezone.utc)
    end_date = current_time.replace(hour=0, minute=0, second=0, microsecond=0)
    start_date = end_date - timedelta(days=LEADERBOARD_TIME_FRAME_DAYS)

    try:
        all_metadata = fetch_issue_metadata_batched(
            client, identifiers, start_date, end_date, batch_size=100, upload_immediately=True
        )

        total_issues = sum(len(metadata_list) for metadata_list in all_metadata.values())
        agents_with_data = sum(1 for metadata_list in all_metadata.values() if metadata_list)

        print(f"\n{'='*80}")
        print("✅ BigQuery mining and upload complete!")
        print(f"   Total agents: {len(agents)}")
        print(f"   Agents with data: {agents_with_data}")
        print(f"   Total issues found: {total_issues}")
        print(f"{'='*80}\n")

    except Exception as e:
        print(f"✗ Error during BigQuery fetch: {str(e)}")
        import traceback
        traceback.print_exc()
        return

    print("📤 Uploading leaderboard and metrics data...")
    if save_leaderboard_and_metrics_to_hf(all_metadata, agents):
        print(f"✓ Leaderboard and metrics successfully uploaded to {LEADERBOARD_REPO}")
    else:
        print("⚠️ Failed to upload leaderboard and metrics data")


if __name__ == "__main__":
    mine_all_agents()