"""
Minimalist Issue Metadata Mining Script
Mines issue metadata from GitHub Archive via BigQuery, saves per-agent daily JSONL
files to a HuggingFace dataset, and publishes aggregated leaderboard metrics.
"""
import json
import os
import tempfile
from datetime import datetime, timezone, timedelta
from collections import defaultdict
from huggingface_hub import HfApi, hf_hub_download
from huggingface_hub.errors import HfHubHTTPError
from dotenv import load_dotenv
from google.cloud import bigquery
import backoff
# Load environment variables
load_dotenv()
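
# Required secrets (read later in this script): HF_TOKEN for HuggingFace uploads and
# GOOGLE_APPLICATION_CREDENTIALS_JSON containing the BigQuery service-account JSON.
# They can live in the environment or in a local .env file picked up by load_dotenv().
# Illustrative .env sketch (placeholder values, not real credentials):
#   HF_TOKEN=hf_xxxxxxxxxxxxxxxx
#   GOOGLE_APPLICATION_CREDENTIALS_JSON={"type": "service_account", ...}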
# =============================================================================
# CONFIGURATION
# =============================================================================
AGENTS_REPO = "SWE-Arena/bot_metadata"
ISSUE_METADATA_REPO = "SWE-Arena/issue_metadata"
LEADERBOARD_REPO = "SWE-Arena/leaderboard_metadata"
LEADERBOARD_TIME_FRAME_DAYS = 180 # Time frame for leaderboard
# =============================================================================
# HUGGINGFACE API WRAPPERS WITH BACKOFF
# =============================================================================
def is_rate_limit_error(e):
"""Check if the exception is a rate limit error (429)."""
    return isinstance(e, HfHubHTTPError) and e.response is not None and e.response.status_code == 429
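
# All HuggingFace wrappers below share one retry policy: exponential backoff
# (backoff.expo with base=300, capped at max_value=3600 seconds, at most 8 tries,
# full jitter), giving up immediately on any HfHubHTTPError other than a 429.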
@backoff.on_exception(
backoff.expo,
HfHubHTTPError,
giveup=lambda e: not is_rate_limit_error(e),
max_tries=8,
base=300,
max_value=3600,
jitter=backoff.full_jitter,
on_backoff=lambda details: print(f" ⏳ Rate limited. Retrying in {details['wait']/60:.1f} minutes ({details['wait']:.0f}s) - attempt {details['tries']}/{8}...")
)
def list_repo_files_with_backoff(api, **kwargs):
"""List repo files with exponential backoff on rate limit errors."""
return api.list_repo_files(**kwargs)
@backoff.on_exception(
backoff.expo,
HfHubHTTPError,
giveup=lambda e: not is_rate_limit_error(e),
max_tries=8,
base=300,
max_value=3600,
jitter=backoff.full_jitter,
on_backoff=lambda details: print(f" ⏳ Rate limited. Retrying in {details['wait']/60:.1f} minutes ({details['wait']:.0f}s) - attempt {details['tries']}/{8}...")
)
def hf_hub_download_with_backoff(**kwargs):
"""Download from HF Hub with exponential backoff on rate limit errors."""
return hf_hub_download(**kwargs)
@backoff.on_exception(
backoff.expo,
HfHubHTTPError,
giveup=lambda e: not is_rate_limit_error(e),
max_tries=8,
base=300,
max_value=3600,
jitter=backoff.full_jitter,
on_backoff=lambda details: print(f" ⏳ Rate limited. Retrying in {details['wait']/60:.1f} minutes ({details['wait']:.0f}s) - attempt {details['tries']}/{8}...")
)
def upload_file_with_backoff(api, **kwargs):
"""Upload file with exponential backoff on rate limit errors."""
return api.upload_file(**kwargs)
@backoff.on_exception(
backoff.expo,
HfHubHTTPError,
giveup=lambda e: not is_rate_limit_error(e),
max_tries=8,
base=300,
max_value=3600,
jitter=backoff.full_jitter,
on_backoff=lambda details: print(f" ⏳ Rate limited. Retrying in {details['wait']/60:.1f} minutes ({details['wait']:.0f}s) - attempt {details['tries']}/{8}...")
)
def upload_folder_with_backoff(api, **kwargs):
"""Upload folder with exponential backoff on rate limit errors."""
return api.upload_folder(**kwargs)
# =============================================================================
# UTILITY FUNCTIONS
# =============================================================================
def load_jsonl(filename):
"""Load JSONL file and return list of dictionaries."""
if not os.path.exists(filename):
return []
data = []
with open(filename, 'r', encoding='utf-8') as f:
for line in f:
line = line.strip()
if line:
try:
data.append(json.loads(line))
except json.JSONDecodeError as e:
print(f"Warning: Skipping invalid JSON line: {e}")
return data
def save_jsonl(filename, data):
"""Save list of dictionaries to JSONL file."""
with open(filename, 'w', encoding='utf-8') as f:
for item in data:
f.write(json.dumps(item) + '\n')
def get_hf_token():
"""Get HuggingFace token from environment variables."""
token = os.getenv('HF_TOKEN')
if not token:
print("Warning: HF_TOKEN not found in environment variables")
return token
def get_bigquery_client():
"""
Initialize BigQuery client using credentials from environment variable.
Expects GOOGLE_APPLICATION_CREDENTIALS_JSON environment variable containing
the service account JSON credentials as a string.
"""
# Get the JSON content from environment variable
creds_json = os.environ.get('GOOGLE_APPLICATION_CREDENTIALS_JSON')
if creds_json:
# Create a temporary file to store credentials
with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.json') as temp_file:
temp_file.write(creds_json)
temp_path = temp_file.name
# Set environment variable to point to temp file
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = temp_path
        # Initialize BigQuery client, then always clean up the temp credentials file
        try:
            client = bigquery.Client()
        finally:
            os.unlink(temp_path)
        return client
else:
raise ValueError("GOOGLE_APPLICATION_CREDENTIALS_JSON not found in environment")
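
# Illustrative way to supply the credentials when running locally (assumes a
# service-account key file named service_account.json; adjust to your setup):
#   export GOOGLE_APPLICATION_CREDENTIALS_JSON="$(cat service_account.json)"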
def generate_table_union_statements(start_date, end_date):
"""
Generate UNION ALL statements for githubarchive.month tables in date range.
Args:
start_date: Start datetime
end_date: End datetime
Returns:
String with UNION ALL SELECT statements for all monthly tables in range
"""
table_names = []
# Start from the beginning of start_date's month
current_date = start_date.replace(day=1)
end_month = end_date.replace(day=1)
while current_date <= end_month:
table_name = f"`githubarchive.month.{current_date.strftime('%Y%m')}`"
table_names.append(table_name)
# Move to next month
if current_date.month == 12:
current_date = current_date.replace(year=current_date.year + 1, month=1)
else:
current_date = current_date.replace(month=current_date.month + 1)
# Create UNION ALL chain
union_parts = [f"SELECT * FROM {table}" for table in table_names]
return " UNION ALL ".join(union_parts)
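
# For example, a range from 2025-01-15 to 2025-03-02 produces (line breaks added here
# for readability):
#   SELECT * FROM `githubarchive.month.202501`
#   UNION ALL SELECT * FROM `githubarchive.month.202502`
#   UNION ALL SELECT * FROM `githubarchive.month.202503`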
# =============================================================================
# BIGQUERY FUNCTIONS
# =============================================================================
def fetch_issue_metadata_batched(client, identifiers, start_date, end_date, batch_size=100, upload_immediately=True):
"""
Fetch issue metadata for ALL agents using BATCHED BigQuery queries.
Splits agents into smaller batches to avoid performance issues with large UNNEST arrays
and correlated subqueries. Each batch query runs much faster than one massive query.
Args:
client: BigQuery client instance
identifiers: List of GitHub usernames/bot identifiers
start_date: Start datetime (timezone-aware)
end_date: End datetime (timezone-aware)
batch_size: Number of agents per batch (default: 100)
upload_immediately: Upload results to HuggingFace immediately after each batch (default: True)
Returns:
Dictionary mapping agent identifier to list of issue metadata
"""
print(f"\n🔍 Querying BigQuery for {len(identifiers)} agents using BATCHED approach")
print(f" Batch size: {batch_size} agents per query")
print(f" Upload mode: {'Immediate (after each batch)' if upload_immediately else 'Deferred (after all batches)'}")
# Split identifiers into batches
batches = [identifiers[i:i + batch_size] for i in range(0, len(identifiers), batch_size)]
print(f" Total batches: {len(batches)}")
# Collect results from all batches
all_metadata = {}
for batch_num, batch_identifiers in enumerate(batches, 1):
print(f"\n{'─'*80}")
print(f"📦 Processing Batch {batch_num}/{len(batches)} ({len(batch_identifiers)} agents)")
print(f"{'─'*80}")
try:
batch_results = fetch_all_issue_metadata_single_query(
client, batch_identifiers, start_date, end_date
)
# Merge results
for identifier, metadata_list in batch_results.items():
if identifier in all_metadata:
all_metadata[identifier].extend(metadata_list)
else:
all_metadata[identifier] = metadata_list
print(f" ✓ Batch {batch_num} completed: {len(batch_results)} agents with data")
# Upload immediately after this batch if enabled
if upload_immediately and batch_results:
print(f"\n 🤗 Uploading batch {batch_num}/{len(batches)} results to HuggingFace...")
upload_success = 0
upload_errors = 0
for identifier, metadata_list in batch_results.items():
if metadata_list:
if save_issue_metadata_to_hf(metadata_list, identifier):
upload_success += 1
else:
upload_errors += 1
print(f" ✓ Batch {batch_num}/{len(batches)} upload complete ({upload_success} agents uploaded, {upload_errors} errors)")
except Exception as e:
print(f" ✗ Batch {batch_num} failed: {str(e)}")
print(f" Continuing with remaining batches...")
import traceback
traceback.print_exc()
continue
print(f"\n{'='*80}")
print(f"✅ All batches completed!")
print(f" Total agents with data: {len(all_metadata)}")
total_issues = sum(len(issues) for issues in all_metadata.values())
print(f" Total issues found: {total_issues}")
print(f"{'='*80}\n")
return all_metadata
def fetch_all_issue_metadata_single_query(client, identifiers, start_date, end_date):
"""
Fetch issue metadata for a batch of agents using ONE comprehensive BigQuery query.
This query fetches IssuesEvent and IssueCommentEvent from GitHub Archive and
deduplicates to get the latest state of each issue. Filters by issue author,
commenter, or assignee.
NOTE: This function is designed for smaller batches (~100 agents). For large
numbers of agents, use fetch_issue_metadata_batched() instead.
Args:
client: BigQuery client instance
identifiers: List of GitHub usernames/bot identifiers (recommended: <100)
start_date: Start datetime (timezone-aware)
end_date: End datetime (timezone-aware)
Returns:
Dictionary mapping agent identifier to list of issue metadata:
{
'agent-identifier': [
{
'url': Issue URL,
'created_at': Issue creation timestamp,
'closed_at': Close timestamp (if closed, else None),
'state_reason': Reason for closure (completed/not_planned/etc.)
},
...
],
...
}
"""
print(f"\n🔍 Querying BigQuery for {len(identifiers)} agents in SINGLE QUERY")
print(f" Time range: {start_date.strftime('%Y-%m-%d')} to {end_date.strftime('%Y-%m-%d')}")
# Generate table UNION statements for issue events
issue_tables = generate_table_union_statements(start_date, end_date)
# Build identifier list (handle both bot and non-bot versions)
identifier_set = set()
for id in identifiers:
identifier_set.add(id)
# Also add stripped version without [bot] suffix
stripped = id.replace('[bot]', '')
if stripped != id:
identifier_set.add(stripped)
# Convert to array literal for UNNEST (avoids query size limits from large IN clauses)
identifier_array = '[' + ', '.join([f'"{id}"' for id in identifier_set]) + ']'
print(f" Total identifiers (including bot/non-bot variants): {len(identifier_set)}")
# Build comprehensive query with CTEs using UNNEST instead of large IN clauses
query = f"""
WITH agent_identifiers AS (
-- Create a table from the identifier array to avoid massive IN clauses
SELECT identifier
FROM UNNEST({identifier_array}) AS identifier
),
issue_events AS (
-- Get all issue events and comment events for ALL agents
SELECT
JSON_EXTRACT_SCALAR(payload, '$.issue.html_url') as url,
JSON_EXTRACT_SCALAR(payload, '$.issue.created_at') as created_at,
JSON_EXTRACT_SCALAR(payload, '$.issue.closed_at') as closed_at,
JSON_EXTRACT_SCALAR(payload, '$.issue.state_reason') as state_reason,
JSON_EXTRACT_SCALAR(payload, '$.issue.user.login') as author,
JSON_EXTRACT_SCALAR(payload, '$.issue.assignee.login') as assignee,
JSON_EXTRACT_SCALAR(payload, '$.comment.user.login') as commenter,
JSON_EXTRACT_SCALAR(payload, '$.issue.number') as issue_number,
repo.name as repo_name,
created_at as event_time
FROM (
{issue_tables}
)
WHERE
type IN ('IssuesEvent', 'IssueCommentEvent')
-- Exclude pull requests (they have pull_request field)
AND JSON_EXTRACT(payload, '$.issue.pull_request') IS NULL
AND JSON_EXTRACT_SCALAR(payload, '$.issue.html_url') IS NOT NULL
        -- Filter by author OR commenter OR assignee via IN subqueries against the small identifier CTE
AND (
JSON_EXTRACT_SCALAR(payload, '$.issue.user.login') IN (SELECT identifier FROM agent_identifiers)
OR JSON_EXTRACT_SCALAR(payload, '$.comment.user.login') IN (SELECT identifier FROM agent_identifiers)
OR JSON_EXTRACT_SCALAR(payload, '$.issue.assignee.login') IN (SELECT identifier FROM agent_identifiers)
)
),
latest_states AS (
-- Deduplicate to get latest state for each issue
SELECT
url,
created_at,
closed_at,
state_reason,
author,
assignee,
commenter
FROM issue_events
QUALIFY ROW_NUMBER() OVER (
PARTITION BY repo_name, issue_number
ORDER BY event_time DESC
) = 1
),
agent_issues AS (
-- Map each issue to its relevant agent(s)
SELECT DISTINCT
CASE
WHEN author IN (SELECT identifier FROM agent_identifiers) THEN author
WHEN commenter IN (SELECT identifier FROM agent_identifiers) THEN commenter
WHEN assignee IN (SELECT identifier FROM agent_identifiers) THEN assignee
ELSE NULL
END as agent_identifier,
url,
created_at,
closed_at,
state_reason
FROM latest_states
WHERE
author IN (SELECT identifier FROM agent_identifiers)
OR commenter IN (SELECT identifier FROM agent_identifiers)
OR assignee IN (SELECT identifier FROM agent_identifiers)
)
SELECT
agent_identifier,
url,
created_at,
closed_at,
state_reason
FROM agent_issues
WHERE agent_identifier IS NOT NULL
ORDER BY agent_identifier, created_at DESC
"""
# Calculate number of days for reporting
query_days = (end_date - start_date).days
print(f" Querying {query_days} days for issue and comment events...")
print(f" Agents: {', '.join(identifiers[:5])}{'...' if len(identifiers) > 5 else ''}")
try:
query_job = client.query(query)
results = list(query_job.result())
print(f" ✓ Found {len(results)} total issue records across all agents")
# Group results by agent
metadata_by_agent = defaultdict(list)
for row in results:
agent_id = row.agent_identifier
# Convert datetime objects to ISO strings
created_at = row.created_at
if hasattr(created_at, 'isoformat'):
created_at = created_at.isoformat()
closed_at = row.closed_at
if hasattr(closed_at, 'isoformat'):
closed_at = closed_at.isoformat()
metadata_by_agent[agent_id].append({
'url': row.url,
'created_at': created_at,
'closed_at': closed_at,
'state_reason': row.state_reason,
})
# Print breakdown by agent
print(f"\n 📊 Results breakdown by agent:")
for identifier in identifiers:
# Check both original and stripped versions
count = len(metadata_by_agent.get(identifier, []))
stripped = identifier.replace('[bot]', '')
if stripped != identifier:
count += len(metadata_by_agent.get(stripped, []))
if count > 0:
# Merge both versions if needed
all_metadata = metadata_by_agent.get(identifier, []) + metadata_by_agent.get(stripped, [])
completed_count = sum(1 for m in all_metadata if m['state_reason'] == 'completed')
closed_count = sum(1 for m in all_metadata if m['closed_at'] is not None)
open_count = count - closed_count
print(f" {identifier}: {count} issues ({completed_count} completed, {closed_count} closed, {open_count} open)")
# Convert defaultdict to regular dict and merge bot/non-bot versions
final_metadata = {}
for identifier in identifiers:
combined = metadata_by_agent.get(identifier, [])
stripped = identifier.replace('[bot]', '')
if stripped != identifier and stripped in metadata_by_agent:
combined.extend(metadata_by_agent[stripped])
if combined:
final_metadata[identifier] = combined
return final_metadata
except Exception as e:
print(f" ✗ BigQuery error: {str(e)}")
import traceback
traceback.print_exc()
return {}
# =============================================================================
# HUGGINGFACE STORAGE FUNCTIONS
# =============================================================================
def group_metadata_by_date(metadata_list):
"""
Group issue metadata by exact date (year.month.day) for efficient daily storage.
Returns dict: {(year, month, day): [metadata_list]}
"""
grouped = defaultdict(list)
for issue_meta in metadata_list:
created_at = issue_meta.get('created_at')
if not created_at:
continue
try:
dt = datetime.fromisoformat(created_at.replace('Z', '+00:00'))
key = (dt.year, dt.month, dt.day)
grouped[key].append(issue_meta)
except Exception as e:
print(f"Warning: Could not parse date '{created_at}': {e}")
return dict(grouped)
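
# For example, an item with created_at '2025-06-03T12:34:56Z' lands under the key
# (2025, 6, 3), which save_issue_metadata_to_hf below writes out as 2025.06.03.jsonl.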
def save_issue_metadata_to_hf(metadata_list, agent_identifier):
"""
Save issue metadata to HuggingFace dataset, organized by [agent_identifier]/YYYY.MM.DD.jsonl.
Each file is stored in the agent's folder and named YYYY.MM.DD.jsonl for that day's issues.
This function OVERWRITES existing files completely with fresh data from BigQuery.
Uses upload_folder for single-commit batch uploads (avoids rate limit issues).
Args:
metadata_list: List of issue metadata dictionaries
agent_identifier: GitHub identifier of the agent (used as folder name)
"""
import shutil
try:
token = get_hf_token()
if not token:
raise Exception("No HuggingFace token found")
api = HfApi(token=token)
# Group by date (year, month, day)
grouped = group_metadata_by_date(metadata_list)
if not grouped:
print(f" No valid metadata to save for {agent_identifier}")
return False
# Create a temporary directory for batch upload
temp_dir = tempfile.mkdtemp()
agent_folder = os.path.join(temp_dir, agent_identifier)
os.makedirs(agent_folder, exist_ok=True)
try:
print(f" 📦 Preparing batch upload for {len(grouped)} daily files...")
# Process each daily file
for (issue_year, month, day), day_metadata in grouped.items():
filename = f"{agent_identifier}/{issue_year}.{month:02d}.{day:02d}.jsonl"
local_filename = os.path.join(agent_folder, f"{issue_year}.{month:02d}.{day:02d}.jsonl")
# Sort by created_at for better organization
day_metadata.sort(key=lambda x: x.get('created_at', ''), reverse=True)
# Save to temp directory (complete overwrite, no merging)
save_jsonl(local_filename, day_metadata)
print(f" Prepared {len(day_metadata)} issues for {filename}")
# Upload entire folder using upload_folder (single commit per agent)
print(f" 🤗 Uploading {len(grouped)} files ({len(metadata_list)} total issues)...")
upload_folder_with_backoff(
api,
folder_path=temp_dir,
repo_id=ISSUE_METADATA_REPO,
repo_type="dataset",
commit_message=f"Update issue metadata for {agent_identifier} - {datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S')} UTC"
)
print(f" ✓ Batch upload complete for {agent_identifier}")
return True
finally:
# Always clean up temp directory
if os.path.exists(temp_dir):
shutil.rmtree(temp_dir)
except Exception as e:
print(f" ✗ Error saving issue metadata: {str(e)}")
import traceback
traceback.print_exc()
return False
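
# Resulting layout in ISSUE_METADATA_REPO (illustrative agent name):
#   some-agent[bot]/2025.06.03.jsonl
#   some-agent[bot]/2025.06.04.jsonl
# where each line of a daily file is one issue record of the form
#   {"url": ..., "created_at": ..., "closed_at": ..., "state_reason": ...}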
def load_agents_from_hf():
"""
Load all agent metadata JSON files from HuggingFace dataset.
The github_identifier is extracted from the filename (e.g., 'agent-name[bot].json' -> 'agent-name[bot]')
"""
try:
api = HfApi()
agents = []
# List all files in the repository
files = list_repo_files_with_backoff(api, repo_id=AGENTS_REPO, repo_type="dataset")
# Filter for JSON files only
json_files = [f for f in files if f.endswith('.json')]
print(f"Found {len(json_files)} agent files in {AGENTS_REPO}")
# Download and parse each JSON file
for json_file in json_files:
try:
file_path = hf_hub_download_with_backoff(
repo_id=AGENTS_REPO,
filename=json_file,
repo_type="dataset"
)
with open(file_path, 'r') as f:
agent_data = json.load(f)
# Only process agents with status == "public"
if agent_data.get('status') != 'public':
continue
# Extract github_identifier from filename (remove .json extension)
github_identifier = json_file.replace('.json', '')
agent_data['github_identifier'] = github_identifier
agents.append(agent_data)
except Exception as e:
print(f"Warning: Could not load {json_file}: {str(e)}")
continue
print(f"✓ Loaded {len(agents)} agents from HuggingFace")
return agents
except Exception as e:
print(f"Could not load agents from HuggingFace: {str(e)}")
return []
# =============================================================================
# LEADERBOARD CALCULATION FUNCTIONS
# =============================================================================
def calculate_issue_stats_from_metadata(metadata_list):
"""
Calculate statistics from a list of issue metadata.
Returns:
dict: Issue statistics including total, closed, resolved counts and rate
"""
total_issues = len(metadata_list)
# Count closed issues (those with closed_at timestamp)
closed_issues = sum(1 for issue_meta in metadata_list
if issue_meta.get('closed_at') is not None)
# Count completed issues (subset of closed issues with state_reason="completed")
completed = sum(1 for issue_meta in metadata_list
if issue_meta.get('state_reason') == 'completed')
# Calculate resolved rate as: completed / closed (not completed / total)
resolved_rate = (completed / closed_issues * 100) if closed_issues > 0 else 0
return {
'total_issues': total_issues,
'closed_issues': closed_issues,
'resolved_issues': completed,
'resolved_rate': round(resolved_rate, 2),
}
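
# Worked example of the rate defined above: with 10 issues, 6 of them closed and 4 of
# those closed as "completed", resolved_rate = 4 / 6 * 100 ≈ 66.67; i.e. the rate is
# relative to closed issues, not to all issues.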
def calculate_monthly_metrics(all_metadata, agents):
"""
Calculate monthly metrics for all agents for visualization.
Args:
all_metadata: Dictionary mapping agent_identifier to list of issue metadata
agents: List of agent dictionaries with metadata
Returns:
dict: {
'agents': list of agent names,
'months': list of month labels (e.g., '2025-01'),
'data': {
agent_name: {
'resolved_rates': list of resolved rates by month,
'total_issues': list of issue counts by month,
'resolved_issues': list of resolved issue counts by month
}
}
}
"""
# Create mapping from agent_identifier to agent_name
identifier_to_name = {
        agent.get('github_identifier'): agent.get('name', agent.get('github_identifier'))
for agent in agents if agent.get('github_identifier')
}
# Group by agent and month
agent_month_data = defaultdict(lambda: defaultdict(list))
for identifier, metadata_list in all_metadata.items():
agent_name = identifier_to_name.get(identifier, identifier)
for issue_meta in metadata_list:
created_at = issue_meta.get('created_at')
if not created_at:
continue
try:
dt = datetime.fromisoformat(created_at.replace('Z', '+00:00'))
month_key = f"{dt.year}-{dt.month:02d}"
agent_month_data[agent_name][month_key].append(issue_meta)
except Exception as e:
print(f"Warning: Could not parse date '{created_at}': {e}")
continue
# Get all unique months and sort them
all_months = set()
for agent_data in agent_month_data.values():
all_months.update(agent_data.keys())
months = sorted(list(all_months))
# Calculate metrics for each agent and month
result_data = {}
for agent_name, month_dict in agent_month_data.items():
resolved_rates = []
total_issues_list = []
resolved_issues_list = []
for month in months:
issues_in_month = month_dict.get(month, [])
# Count completed issues (those with state_reason="completed")
completed_count = sum(1 for issue in issues_in_month if issue.get('state_reason') == 'completed')
# Count closed issues (those with closed_at timestamp)
closed_count = sum(1 for issue in issues_in_month if issue.get('closed_at') is not None)
# Total issues created in this month
total_count = len(issues_in_month)
# Calculate resolved rate as: completed / closed (not completed / total)
resolved_rate = (completed_count / closed_count * 100) if closed_count > 0 else None
resolved_rates.append(resolved_rate)
total_issues_list.append(total_count)
resolved_issues_list.append(completed_count)
result_data[agent_name] = {
'resolved_rates': resolved_rates,
'total_issues': total_issues_list,
'resolved_issues': resolved_issues_list
}
agents_list = sorted(list(agent_month_data.keys()))
return {
'agents': agents_list,
'months': months,
'data': result_data
}
def save_leaderboard_and_metrics_to_hf(all_metadata, agents):
"""
Creates a comprehensive JSON file with both leaderboard stats and monthly metrics.
If the file exists, it will be overwritten.
Args:
all_metadata: Dictionary mapping agent_identifier to list of issue metadata
agents: List of agent dictionaries with metadata
Returns:
bool: True if successful, False otherwise
"""
import io
try:
token = get_hf_token()
if not token:
raise Exception("No HuggingFace token found")
api = HfApi(token=token)
print(f"\n{'='*80}")
print(f"📊 Preparing leaderboard and metrics data for upload...")
print(f"{'='*80}\n")
# Build leaderboard data
print(" Constructing leaderboard data...")
leaderboard_data = {}
for agent in agents:
identifier = agent.get('github_identifier')
agent_name = agent.get('name', 'Unknown')
if not identifier:
continue
metadata = all_metadata.get(identifier, [])
stats = calculate_issue_stats_from_metadata(metadata)
leaderboard_data[identifier] = {
'name': agent_name,
'website': agent.get('website', 'N/A'),
'github_identifier': identifier,
**stats
}
# Get monthly metrics data
print(" Calculating monthly metrics...")
monthly_metrics = calculate_monthly_metrics(all_metadata, agents)
# Combine into a single structure
combined_data = {
"leaderboard": leaderboard_data,
"monthly_metrics": monthly_metrics,
"metadata": {
"last_updated": datetime.now(timezone.utc).isoformat(),
"time_frame_days": LEADERBOARD_TIME_FRAME_DAYS,
"total_agents": len(leaderboard_data)
}
}
print(f" Leaderboard entries: {len(leaderboard_data)}")
print(f" Monthly metrics for: {len(monthly_metrics['agents'])} agents")
print(f" Time frame: {LEADERBOARD_TIME_FRAME_DAYS} days")
# Convert to JSON and create file-like object
json_content = json.dumps(combined_data, indent=2)
file_like_object = io.BytesIO(json_content.encode('utf-8'))
# Upload to HuggingFace (will overwrite if exists)
print(f"\n🤗 Uploading to {LEADERBOARD_REPO}...")
upload_file_with_backoff(
api,
path_or_fileobj=file_like_object,
path_in_repo="swe-issue.json",
repo_id=LEADERBOARD_REPO,
repo_type="dataset",
token=token,
commit_message=f"Update leaderboard data - {datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S')} UTC"
)
print(f" ✓ Successfully uploaded swe-issue.json")
print(f"{'='*80}\n")
return True
except Exception as e:
print(f"✗ Error saving leaderboard and metrics: {str(e)}")
import traceback
traceback.print_exc()
return False
# =============================================================================
# MAIN MINING FUNCTION
# =============================================================================
def mine_all_agents():
"""
Mine issue metadata for all agents within LEADERBOARD_TIME_FRAME_DAYS and save to HuggingFace.
Uses ONE BigQuery query for ALL agents (most efficient approach).
"""
# Load agent metadata from HuggingFace
agents = load_agents_from_hf()
if not agents:
print("No agents found in HuggingFace dataset")
return
# Extract all identifiers
identifiers = [agent['github_identifier'] for agent in agents if agent.get('github_identifier')]
if not identifiers:
print("No valid agent identifiers found")
return
print(f"\n{'='*80}")
print(f"Starting issue metadata mining for {len(identifiers)} agents")
print(f"Time frame: Last {LEADERBOARD_TIME_FRAME_DAYS} days")
print(f"Data source: BigQuery + GitHub Archive (BATCHED QUERIES)")
print(f"{'='*80}\n")
# Initialize BigQuery client
try:
client = get_bigquery_client()
except Exception as e:
print(f"✗ Failed to initialize BigQuery client: {str(e)}")
return
# Define time range: past LEADERBOARD_TIME_FRAME_DAYS (excluding today)
current_time = datetime.now(timezone.utc)
end_date = current_time.replace(hour=0, minute=0, second=0, microsecond=0)
start_date = end_date - timedelta(days=LEADERBOARD_TIME_FRAME_DAYS)
try:
# Use batched approach for better performance
# upload_immediately=True means each batch uploads to HuggingFace right after BigQuery completes
all_metadata = fetch_issue_metadata_batched(
client, identifiers, start_date, end_date, batch_size=100, upload_immediately=True
)
# Calculate summary statistics
        total_issues = sum(len(metadata_list) for metadata_list in all_metadata.values())
agents_with_data = sum(1 for metadata_list in all_metadata.values() if metadata_list)
print(f"\n{'='*80}")
print(f"✅ BigQuery mining and upload complete!")
print(f" Total agents: {len(agents)}")
print(f" Agents with data: {agents_with_data}")
        print(f" Total issues found: {total_issues}")
print(f"{'='*80}\n")
except Exception as e:
print(f"✗ Error during BigQuery fetch: {str(e)}")
import traceback
traceback.print_exc()
return
# After mining is complete, save leaderboard and metrics to HuggingFace
print(f"📤 Uploading leaderboard and metrics data...")
if save_leaderboard_and_metrics_to_hf(all_metadata, agents):
print(f"✓ Leaderboard and metrics successfully uploaded to {LEADERBOARD_REPO}")
else:
print(f"⚠️ Failed to upload leaderboard and metrics data")
# =============================================================================
# ENTRY POINT
# =============================================================================
if __name__ == "__main__":
mine_all_agents()