zhiminy committed on
Commit
3d31827
·
1 Parent(s): cafc0c6

refine mining mechanism

Files changed (3)
  1. app.py +360 -57
  2. msr.py +437 -745
  3. requirements.txt +2 -0
app.py CHANGED
@@ -3,6 +3,7 @@ from gradio_leaderboard import Leaderboard
3
  import json
4
  import os
5
  import time
 
6
  import requests
7
  from datetime import datetime, timezone, timedelta
8
  from collections import defaultdict
@@ -17,6 +18,7 @@ import plotly.graph_objects as go
17
  from plotly.subplots import make_subplots
18
  from apscheduler.schedulers.background import BackgroundScheduler
19
  from apscheduler.triggers.cron import CronTrigger
 
20
 
21
  # Load environment variables
22
  load_dotenv()
@@ -121,6 +123,284 @@ def normalize_date_format(date_string):
121
  return date_string
122
 
123
 
124
  # =============================================================================
125
  # GITHUB API OPERATIONS
126
  # =============================================================================
@@ -1788,19 +2068,24 @@ def create_monthly_metrics_plot():
1788
  # Create figure with secondary y-axis
1789
  fig = make_subplots(specs=[[{"secondary_y": True}]])
1790
 
1791
- # Define colors for agents (using a color palette)
1792
- colors = [
1793
- '#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd',
1794
- '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf'
1795
- ]
 
 
1796
 
1797
  agents = metrics['agents']
1798
  months = metrics['months']
1799
  data = metrics['data']
1800
 
 
 
 
1801
  # Add traces for each agent
1802
  for idx, agent_name in enumerate(agents):
1803
- color = colors[idx % len(colors)]
1804
  agent_data = data[agent_name]
1805
 
1806
  # Add line trace for acceptance rate (left y-axis)
@@ -1817,10 +2102,11 @@ def create_monthly_metrics_plot():
1817
  name=agent_name,
1818
  mode='lines+markers',
1819
  line=dict(color=color, width=2),
1820
- marker=dict(size=6),
1821
  legendgroup=agent_name,
1822
- showlegend=True,
1823
- hovertemplate='<b>%{fullData.name}</b><br>' +
 
1824
  'Acceptance Rate: %{y:.2f}%<br>' +
1825
  '<extra></extra>'
1826
  ),
@@ -1841,11 +2127,12 @@ def create_monthly_metrics_plot():
1841
  go.Bar(
1842
  x=x_bars,
1843
  y=y_bars,
1844
- name=f"{agent_name} (Reviews)",
1845
  marker=dict(color=color, opacity=0.6),
1846
  legendgroup=agent_name,
1847
- showlegend=False, # Don't show in legend (already shown for line)
1848
- hovertemplate='<b>%{fullData.name}</b><br>' +
 
1849
  'Total Reviews: %{y}<br>' +
1850
  '<extra></extra>',
1851
  offsetgroup=agent_name # Group bars by agent for proper spacing
@@ -1861,17 +2148,11 @@ def create_monthly_metrics_plot():
1861
  # Update layout
1862
  fig.update_layout(
1863
  title=None,
1864
- hovermode='closest',
1865
  barmode='group',
1866
  height=600,
1867
- legend=dict(
1868
- orientation="h",
1869
- yanchor="bottom",
1870
- y=1.02,
1871
- xanchor="right",
1872
- x=1
1873
- ),
1874
- margin=dict(l=50, r=50, t=100, b=50)
1875
  )
1876
 
1877
  return fig
@@ -1978,17 +2259,21 @@ def submit_agent(identifier, agent_name, organization, description, website):
1978
 
1979
  def fetch_and_update_daily_reviews():
1980
  """
1981
- Fetch and update reviews with comprehensive status checking.
1982
 
1983
  Strategy:
1984
  1. For each agent:
1985
  - Examine ALL open reviews from last LEADERBOARD_TIME_FRAME_DAYS - 1 for their closed_at status
1986
- - Update PR status for all existing metadata (last LEADERBOARD_TIME_FRAME_DAYS - 1)
1987
- - Fetch new reviews from yesterday 12am to today 12am
1988
  - Save all updated/new metadata back to HuggingFace
1989
  """
1990
- tokens = get_github_tokens()
1991
- token_pool = TokenPool(tokens)
 
 
 
 
1992
 
1993
  # Load all agents
1994
  agents = load_agents_from_hf()
@@ -2041,44 +2326,62 @@ def fetch_and_update_daily_reviews():
2041
 
2042
  print(f" βœ“ Loaded {len(recent_metadata)} existing reviews from timeframe")
2043
 
2044
- # Step 2: Examine ALL open reviews for their closed_at status
2045
- # This ensures we capture any reviews that may have been closed/merged since last check
2046
  if recent_metadata:
2047
- print(f"πŸ” Examining {len(recent_metadata)} open reviews for status updates (checking closed_at)...")
2048
- recent_metadata = update_pr_status(recent_metadata, token_pool)
2049
- print(f" βœ“ Updated PR status for existing reviews")
2050
-
2051
- # Step 3: Fetch NEW reviews from yesterday 12am to today 12am
2052
- print(f"πŸ” Fetching new reviews from {yesterday_midnight.isoformat()} to {today_midnight.isoformat()}...")
2053
-
2054
- base_query = f'is:pr review:approved author:{identifier} -is:draft'
2055
- prs_by_url = {}
2056
-
2057
- fetch_reviews_with_time_partition(
2058
- base_query,
2059
- yesterday_midnight,
2060
- today_midnight,
2061
- token_pool,
2062
- prs_by_url,
2063
- debug_limit=None
2064
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
2065
 
2066
  # Extract metadata for new reviews
2067
  yesterday_metadata = []
2068
- for pr_url, pr in prs_by_url.items():
2069
- metadata = extract_review_metadata(pr)
2070
- if metadata:
2071
- metadata['agent_identifier'] = identifier
2072
- yesterday_metadata.append(metadata)
 
 
 
 
 
 
 
2073
 
2074
- print(f" βœ“ Found {len(yesterday_metadata)} new reviews in 24-hour window")
 
 
2075
 
2076
- # Step 4: Update PR status for new reviews
2077
- if yesterday_metadata:
2078
- print(f" Updating PR status for {len(yesterday_metadata)} new reviews...")
2079
- yesterday_metadata = update_pr_status(yesterday_metadata, token_pool)
2080
 
2081
- # Step 5: Combine and save all metadata
2082
  all_updated_metadata = recent_metadata + yesterday_metadata
2083
 
2084
  if all_updated_metadata:
 
3
  import json
4
  import os
5
  import time
6
+ import tempfile
7
  import requests
8
  from datetime import datetime, timezone, timedelta
9
  from collections import defaultdict
 
18
  from plotly.subplots import make_subplots
19
  from apscheduler.schedulers.background import BackgroundScheduler
20
  from apscheduler.triggers.cron import CronTrigger
21
+ from google.cloud import bigquery
22
 
23
  # Load environment variables
24
  load_dotenv()
 
123
  return date_string
124
 
125
 
126
+ # =============================================================================
127
+ # BIGQUERY FUNCTIONS
128
+ # =============================================================================
129
+
130
+ def get_bigquery_client():
131
+ """
132
+ Initialize BigQuery client using credentials from environment variable.
133
+
134
+ Expects GOOGLE_APPLICATION_CREDENTIALS_JSON environment variable containing
135
+ the service account JSON credentials as a string.
136
+ """
137
+ # Get the JSON content from environment variable
138
+ creds_json = os.environ.get('GOOGLE_APPLICATION_CREDENTIALS_JSON')
139
+
140
+ if creds_json:
141
+ # Create a temporary file to store credentials
142
+ with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.json') as temp_file:
143
+ temp_file.write(creds_json)
144
+ temp_path = temp_file.name
145
+
146
+ # Set environment variable to point to temp file
147
+ os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = temp_path
148
+
149
+ # Initialize BigQuery client
150
+ client = bigquery.Client()
151
+
152
+ # Clean up temp file
153
+ os.unlink(temp_path)
154
+
155
+ return client
156
+ else:
157
+ raise ValueError("GOOGLE_APPLICATION_CREDENTIALS_JSON not found in environment")
158
+
159
+
160
+ def fetch_reviews_from_bigquery(client, identifier, start_date, end_date):
161
+ """
162
+ Fetch PR review events from GitHub Archive for a specific agent.
163
+
164
+ Queries githubarchive.day.YYYYMMDD tables for PullRequestReviewEvent where
165
+ actor.login matches the agent identifier.
166
+
167
+ Args:
168
+ client: BigQuery client instance
169
+ identifier: GitHub username or bot identifier (e.g., 'amazon-inspector-beta[bot]')
170
+ start_date: Start datetime (timezone-aware)
171
+ end_date: End datetime (timezone-aware)
172
+
173
+ Returns:
174
+ List of review event rows with PR information
175
+ """
176
+ print(f"\nπŸ” Querying BigQuery for reviews by {identifier}")
177
+ print(f" Time range: {start_date.strftime('%Y-%m-%d')} to {end_date.strftime('%Y-%m-%d')}")
178
+
179
+ # Generate list of table names for each day in the range
180
+ table_refs = []
181
+ current_date = start_date
182
+ while current_date < end_date:
183
+ table_name = f"githubarchive.day.{current_date.strftime('%Y%m%d')}"
184
+ table_refs.append(table_name)
185
+ current_date += timedelta(days=1)
186
+
187
+ # Build UNION ALL query for all daily tables
188
+ union_parts = []
189
+ for table_name in table_refs:
190
+ union_parts.append(f"""
191
+ SELECT
192
+ repo.name as repo_name,
193
+ actor.login as actor_login,
194
+ JSON_EXTRACT_SCALAR(payload, '$.pull_request.html_url') as pr_url,
195
+ CAST(JSON_EXTRACT_SCALAR(payload, '$.pull_request.number') AS INT64) as pr_number,
196
+ JSON_EXTRACT_SCALAR(payload, '$.review.submitted_at') as reviewed_at,
197
+ created_at
198
+ FROM `{table_name}`
199
+ WHERE type = 'PullRequestReviewEvent'
200
+ AND actor.login = @identifier
201
+ """)
202
+
203
+ query = " UNION ALL ".join(union_parts)
204
+
205
+ job_config = bigquery.QueryJobConfig(
206
+ query_parameters=[
207
+ bigquery.ScalarQueryParameter("identifier", "STRING", identifier)
208
+ ]
209
+ )
210
+
211
+ print(f" Querying {len(table_refs)} daily tables...")
212
+
213
+ try:
214
+ query_job = client.query(query, job_config=job_config)
215
+ results = list(query_job.result())
216
+
217
+ print(f" βœ“ Found {len(results)} review events")
218
+ return results
219
+
220
+ except Exception as e:
221
+ print(f" βœ— BigQuery error: {str(e)}")
222
+ return []
223
+
224
+
225
+ def fetch_pr_status_from_bigquery(client, pr_urls, start_date, end_date):
226
+ """
227
+ Fetch PR status (merged/closed) from GitHub Archive PullRequestEvent.
228
+
229
+ For each PR URL, looks for PullRequestEvent with action='closed' to determine
230
+ if the PR was merged or just closed.
231
+
232
+ Args:
233
+ client: BigQuery client instance
234
+ pr_urls: List of PR URLs to check status for
235
+ start_date: Start datetime (should cover review period and after)
236
+ end_date: End datetime (should be recent/current)
237
+
238
+ Returns:
239
+ Dictionary mapping PR URL to status dict:
240
+ {
241
+ 'pr_url': {
242
+ 'status': 'merged'|'closed'|'open',
243
+ 'merged': bool,
244
+ 'closed_at': timestamp or None
245
+ }
246
+ }
247
+ """
248
+ if not pr_urls:
249
+ return {}
250
+
251
+ print(f"\nπŸ” Querying BigQuery for PR status ({len(pr_urls)} PRs)...")
252
+
253
+ # Extract repo and PR number from URLs
254
+ # URL format: https://github.com/owner/repo/pull/123
255
+ pr_info = []
256
+ for url in pr_urls:
257
+ try:
258
+ parts = url.replace('https://github.com/', '').split('/')
259
+ if len(parts) >= 4:
260
+ owner = parts[0]
261
+ repo = parts[1]
262
+ pr_number = int(parts[3])
263
+ repo_name = f"{owner}/{repo}"
264
+ pr_info.append({
265
+ 'url': url,
266
+ 'repo': repo_name,
267
+ 'number': pr_number
268
+ })
269
+ except Exception as e:
270
+ print(f" Warning: Could not parse PR URL {url}: {e}")
271
+ continue
272
+
273
+ if not pr_info:
274
+ return {}
275
+
276
+ # Build repo filter condition for WHERE clause
277
+ # Group PRs by repo to create efficient filters
278
+ repos_to_prs = defaultdict(list)
279
+ for pr in pr_info:
280
+ repos_to_prs[pr['repo']].append(pr['number'])
281
+
282
+ # Generate list of table names for date range
283
+ # Look back 1 full year from end_date to catch PR close events that may have occurred before reviews
284
+ pr_status_start = end_date - timedelta(days=365)
285
+ table_refs = []
286
+ current_date = pr_status_start
287
+ while current_date < end_date:
288
+ table_name = f"githubarchive.day.{current_date.strftime('%Y%m%d')}"
289
+ table_refs.append(table_name)
290
+ current_date += timedelta(days=1)
291
+
292
+ # Build WHERE clause to filter by specific repos and PR numbers
293
+ # Format: (repo='owner/repo1' AND pr_number IN (1,2,3)) OR (repo='owner/repo2' AND pr_number IN (4,5))
294
+ filter_conditions = []
295
+ for repo, pr_numbers in repos_to_prs.items():
296
+ pr_list = ','.join(map(str, pr_numbers))
297
+ filter_conditions.append(f"(repo.name = '{repo}' AND CAST(JSON_EXTRACT_SCALAR(payload, '$.pull_request.number') AS INT64) IN ({pr_list}))")
298
+
299
+ pr_filter = " OR ".join(filter_conditions)
300
+
301
+ # Build query to find close/merge events for specific PRs
302
+ union_parts = []
303
+ for table_name in table_refs:
304
+ union_parts.append(f"""
305
+ SELECT
306
+ repo.name as repo_name,
307
+ CAST(JSON_EXTRACT_SCALAR(payload, '$.pull_request.number') AS INT64) as pr_number,
308
+ JSON_EXTRACT_SCALAR(payload, '$.pull_request.html_url') as pr_url,
309
+ JSON_EXTRACT_SCALAR(payload, '$.action') as action,
310
+ CAST(JSON_EXTRACT_SCALAR(payload, '$.pull_request.merged') AS BOOL) as merged,
311
+ JSON_EXTRACT_SCALAR(payload, '$.pull_request.closed_at') as closed_at,
312
+ JSON_EXTRACT_SCALAR(payload, '$.pull_request.merged_at') as merged_at,
313
+ created_at
314
+ FROM `{table_name}`
315
+ WHERE type = 'PullRequestEvent'
316
+ AND JSON_EXTRACT_SCALAR(payload, '$.action') = 'closed'
317
+ AND ({pr_filter})
318
+ """)
319
+
320
+ query = " UNION ALL ".join(union_parts)
321
+
322
+ print(f" Querying {len(table_refs)} daily tables for PR status (1-year lookback: {pr_status_start.strftime('%Y-%m-%d')} to {end_date.strftime('%Y-%m-%d')})...")
323
+ print(f" Filtering for {len(pr_info)} specific PRs across {len(repos_to_prs)} repos")
324
+
325
+ try:
326
+ query_job = client.query(query)
327
+ results = list(query_job.result())
328
+
329
+ print(f" βœ“ Found {len(results)} PR close events")
330
+
331
+ # Build status map by PR URL
332
+ status_map = {}
333
+ for row in results:
334
+ pr_url = row.pr_url
335
+
336
+ merged = row.merged if row.merged is not None else False
337
+ closed_at = row.closed_at or row.merged_at
338
+
339
+ # Convert to ISO format if datetime
340
+ if hasattr(closed_at, 'isoformat'):
341
+ closed_at = closed_at.isoformat()
342
+
343
+ status = 'merged' if merged else 'closed'
344
+
345
+ status_map[pr_url] = {
346
+ 'status': status,
347
+ 'merged': merged,
348
+ 'closed_at': closed_at
349
+ }
350
+
351
+ # Mark remaining PRs as open
352
+ for url in pr_urls:
353
+ if url not in status_map:
354
+ status_map[url] = {
355
+ 'status': 'open',
356
+ 'merged': False,
357
+ 'closed_at': None
358
+ }
359
+
360
+ merged_count = sum(1 for s in status_map.values() if s['merged'])
361
+ closed_count = sum(1 for s in status_map.values() if s['status'] == 'closed')
362
+ open_count = sum(1 for s in status_map.values() if s['status'] == 'open')
363
+
364
+ print(f" Status breakdown: {merged_count} merged, {closed_count} closed, {open_count} open")
365
+
366
+ return status_map
367
+
368
+ except Exception as e:
369
+ print(f" βœ— BigQuery error: {str(e)}")
370
+ # Return all as open on error
371
+ return {url: {'status': 'open', 'merged': False, 'closed_at': None} for url in pr_urls}
372
+
373
+
374
+ def extract_review_metadata_from_bigquery(review_row, status_info):
375
+ """
376
+ Extract minimal PR review metadata from BigQuery row and status info.
377
+
378
+ Args:
379
+ review_row: BigQuery row from PullRequestReviewEvent query
380
+ status_info: Status dictionary from fetch_pr_status_from_bigquery
381
+
382
+ Returns:
383
+ Dictionary with review metadata
384
+ """
385
+ pr_url = review_row.pr_url
386
+ pr_number = review_row.pr_number
387
+ reviewed_at = review_row.reviewed_at or review_row.created_at
388
+
389
+ # Convert to ISO format if datetime
390
+ if hasattr(reviewed_at, 'isoformat'):
391
+ reviewed_at = reviewed_at.isoformat()
392
+
393
+ return {
394
+ 'html_url': pr_url,
395
+ 'reviewed_at': reviewed_at,
396
+ 'pr_status': status_info['status'],
397
+ 'pr_merged': status_info['merged'],
398
+ 'pr_closed_at': status_info['closed_at'],
399
+ 'pr_url': pr_url,
400
+ 'review_id': f"pr_{pr_number}"
401
+ }
402
+
403
+
404
  # =============================================================================
405
  # GITHUB API OPERATIONS
406
  # =============================================================================
 
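The fetch_reviews_from_bigquery helper above builds one SELECT per githubarchive.day.YYYYMMDD table and joins them with UNION ALL. Not part of this commit, but the same scan can be written against BigQuery's wildcard tables with the _TABLE_SUFFIX pseudo-column; a minimal sketch, assuming the query window stays inside a single calendar year (fetch_reviews_wildcard is a hypothetical name):

```python
# Hedged alternative to the per-day UNION ALL: a single wildcard scan over the
# githubarchive daily tables, restricted by _TABLE_SUFFIX (MMDD). Suffix bounds
# are inlined because they come from trusted datetime objects, not user input.
from datetime import timedelta
from google.cloud import bigquery

def fetch_reviews_wildcard(client, identifier, start_date, end_date):
    year = start_date.strftime('%Y')
    last_day = end_date - timedelta(days=1)  # the original loop excludes end_date
    query = f"""
        SELECT
          repo.name AS repo_name,
          actor.login AS actor_login,
          JSON_EXTRACT_SCALAR(payload, '$.pull_request.html_url') AS pr_url,
          CAST(JSON_EXTRACT_SCALAR(payload, '$.pull_request.number') AS INT64) AS pr_number,
          JSON_EXTRACT_SCALAR(payload, '$.review.submitted_at') AS reviewed_at,
          created_at
        FROM `githubarchive.day.{year}*`
        WHERE _TABLE_SUFFIX BETWEEN '{start_date.strftime('%m%d')}' AND '{last_day.strftime('%m%d')}'
          AND type = 'PullRequestReviewEvent'
          AND actor.login = @identifier
    """
    job_config = bigquery.QueryJobConfig(
        query_parameters=[bigquery.ScalarQueryParameter("identifier", "STRING", identifier)]
    )
    return list(client.query(query, job_config=job_config).result())
```

Whether this is cheaper than the explicit UNION ALL depends on wildcard pruning; the commit's per-day table list has the advantage of scanning exactly the requested days.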
2068
  # Create figure with secondary y-axis
2069
  fig = make_subplots(specs=[[{"secondary_y": True}]])
2070
 
2071
+ # Generate unique colors for many agents using HSL color space
2072
+ def generate_color(index, total):
2073
+ """Generate distinct colors using HSL color space for better distribution"""
2074
+ hue = (index * 360 / total) % 360
2075
+ saturation = 70 + (index % 3) * 10 # Vary saturation slightly
2076
+ lightness = 45 + (index % 2) * 10 # Vary lightness slightly
2077
+ return f'hsl({hue}, {saturation}%, {lightness}%)'
2078
 
2079
  agents = metrics['agents']
2080
  months = metrics['months']
2081
  data = metrics['data']
2082
 
2083
+ # Generate colors for all agents
2084
+ agent_colors = {agent: generate_color(idx, len(agents)) for idx, agent in enumerate(agents)}
2085
+
2086
  # Add traces for each agent
2087
  for idx, agent_name in enumerate(agents):
2088
+ color = agent_colors[agent_name]
2089
  agent_data = data[agent_name]
2090
 
2091
  # Add line trace for acceptance rate (left y-axis)
 
2102
  name=agent_name,
2103
  mode='lines+markers',
2104
  line=dict(color=color, width=2),
2105
+ marker=dict(size=8),
2106
  legendgroup=agent_name,
2107
+ showlegend=False, # Hide legend for 70+ agents
2108
+ hovertemplate='<b>Agent: %{fullData.name}</b><br>' +
2109
+ 'Month: %{x}<br>' +
2110
  'Acceptance Rate: %{y:.2f}%<br>' +
2111
  '<extra></extra>'
2112
  ),
 
2127
  go.Bar(
2128
  x=x_bars,
2129
  y=y_bars,
2130
+ name=agent_name,
2131
  marker=dict(color=color, opacity=0.6),
2132
  legendgroup=agent_name,
2133
+ showlegend=False, # Hide legend for 70+ agents
2134
+ hovertemplate='<b>Agent: %{fullData.name}</b><br>' +
2135
+ 'Month: %{x}<br>' +
2136
  'Total Reviews: %{y}<br>' +
2137
  '<extra></extra>',
2138
  offsetgroup=agent_name # Group bars by agent for proper spacing
 
2148
  # Update layout
2149
  fig.update_layout(
2150
  title=None,
2151
+ hovermode='closest', # Show individual agent info on hover
2152
  barmode='group',
2153
  height=600,
2154
+ showlegend=False, # Hide legend for 70+ agents
2155
+ margin=dict(l=50, r=50, t=50, b=50) # Reduced top margin since no legend
 
 
 
 
 
 
2156
  )
2157
 
2158
  return fig
 
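For reference, the generate_color helper introduced in the plotting changes above can be exercised on its own; the agent names below are placeholders, and Plotly accepts the resulting CSS hsl() strings directly as trace colors.

```python
# Minimal illustration of the HSL palette used for the monthly metrics plot:
# hues are spread evenly around the color wheel, with small saturation and
# lightness offsets so adjacent series remain distinguishable.
def generate_color(index, total):
    hue = (index * 360 / total) % 360
    saturation = 70 + (index % 3) * 10
    lightness = 45 + (index % 2) * 10
    return f'hsl({hue}, {saturation}%, {lightness}%)'

agents = ['agent-a', 'agent-b', 'agent-c', 'agent-d']  # placeholder names
agent_colors = {a: generate_color(i, len(agents)) for i, a in enumerate(agents)}
print(agent_colors['agent-b'])  # hsl(90.0, 80%, 55%)
```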
2259
 
2260
  def fetch_and_update_daily_reviews():
2261
  """
2262
+ Fetch and update reviews with comprehensive status checking using BigQuery.
2263
 
2264
  Strategy:
2265
  1. For each agent:
2266
  - Examine ALL open reviews from last LEADERBOARD_TIME_FRAME_DAYS - 1 for their closed_at status
2267
+ - Update PR status for all existing metadata using BigQuery (last LEADERBOARD_TIME_FRAME_DAYS - 1)
2268
+ - Fetch new reviews from yesterday 12am to today 12am using BigQuery
2269
  - Save all updated/new metadata back to HuggingFace
2270
  """
2271
+ # Initialize BigQuery client
2272
+ try:
2273
+ client = get_bigquery_client()
2274
+ except Exception as e:
2275
+ print(f"βœ— Failed to initialize BigQuery client: {str(e)}")
2276
+ return
2277
 
2278
  # Load all agents
2279
  agents = load_agents_from_hf()
 
2326
 
2327
  print(f" βœ“ Loaded {len(recent_metadata)} existing reviews from timeframe")
2328
 
2329
+ # Step 2: Update PR status for existing reviews using BigQuery
 
2330
  if recent_metadata:
2331
+ print(f"πŸ” Updating PR status for {len(recent_metadata)} existing reviews using BigQuery...")
2332
+ # Extract PR URLs from existing metadata
2333
+ pr_urls = [r.get('pr_url') for r in recent_metadata if r.get('pr_url')]
2334
+ if pr_urls:
2335
+ # Fetch status from BigQuery
2336
+ extended_end_date = today_utc
2337
+ status_map = fetch_pr_status_from_bigquery(client, pr_urls, cutoff_date, extended_end_date)
2338
+
2339
+ # Update metadata with new status
2340
+ for review in recent_metadata:
2341
+ pr_url = review.get('pr_url')
2342
+ if pr_url and pr_url in status_map:
2343
+ status_info = status_map[pr_url]
2344
+ review['pr_status'] = status_info['status']
2345
+ review['pr_merged'] = status_info['merged']
2346
+ review['pr_closed_at'] = status_info['closed_at']
2347
+
2348
+ print(f" βœ“ Updated PR status for existing reviews")
2349
+
2350
+ # Step 3: Fetch NEW reviews from yesterday 12am to today 12am using BigQuery
2351
+ print(f"πŸ” Fetching new reviews from {yesterday_midnight.isoformat()} to {today_midnight.isoformat()} using BigQuery...")
2352
+
2353
+ review_rows = fetch_reviews_from_bigquery(client, identifier, yesterday_midnight, today_midnight)
2354
+
2355
+ # Extract unique PR URLs and fetch status
2356
+ pr_urls = list(set([row.pr_url for row in review_rows if row.pr_url]))
2357
+ print(f" Found {len(review_rows)} review events across {len(pr_urls)} unique PRs")
2358
+
2359
+ # Fetch PR status for new reviews
2360
+ extended_end_date = today_utc
2361
+ status_map = fetch_pr_status_from_bigquery(client, pr_urls, yesterday_midnight, extended_end_date)
2362
 
2363
  # Extract metadata for new reviews
2364
  yesterday_metadata = []
2365
+ seen_prs = set()
2366
+ for row in review_rows:
2367
+ pr_url = row.pr_url
2368
+ if pr_url in seen_prs:
2369
+ continue
2370
+ seen_prs.add(pr_url)
2371
+
2372
+ status_info = status_map.get(pr_url, {
2373
+ 'status': 'open',
2374
+ 'merged': False,
2375
+ 'closed_at': None
2376
+ })
2377
 
2378
+ metadata = extract_review_metadata_from_bigquery(row, status_info)
2379
+ metadata['agent_identifier'] = identifier
2380
+ yesterday_metadata.append(metadata)
2381
 
2382
+ print(f" βœ“ Found {len(yesterday_metadata)} unique PRs in 24-hour window")
 
 
 
2383
 
2384
+ # Step 4: Combine and save all metadata
2385
  all_updated_metadata = recent_metadata + yesterday_metadata
2386
 
2387
  if all_updated_metadata:
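Taken together, the app.py changes swap the GitHub Search API mining for BigQuery queries over GitHub Archive. A condensed sketch of the resulting daily job, using the helper names from this diff; loading the existing metadata and the final save step are passed in as stand-ins because they fall outside the excerpt:

```python
# Sketch only: get_bigquery_client, fetch_pr_status_from_bigquery,
# fetch_reviews_from_bigquery and extract_review_metadata_from_bigquery are the
# functions added in app.py above; recent_metadata/save_fn stand in for the
# HuggingFace load/save steps not shown here.
from datetime import datetime, timedelta, timezone

def daily_update_sketch(identifier, recent_metadata, save_fn):
    client = get_bigquery_client()  # credentials taken from the env JSON

    now = datetime.now(timezone.utc)
    today_midnight = now.replace(hour=0, minute=0, second=0, microsecond=0)
    yesterday_midnight = today_midnight - timedelta(days=1)

    # Step 1: refresh merged/closed/open status of already-known reviews.
    known_urls = [r['pr_url'] for r in recent_metadata if r.get('pr_url')]
    status_map = fetch_pr_status_from_bigquery(client, known_urls, yesterday_midnight, now)
    for review in recent_metadata:
        info = status_map.get(review.get('pr_url'))
        if info:
            review['pr_status'] = info['status']
            review['pr_merged'] = info['merged']
            review['pr_closed_at'] = info['closed_at']

    # Step 2: pull yesterday's review events and keep one record per PR.
    rows = fetch_reviews_from_bigquery(client, identifier, yesterday_midnight, today_midnight)
    new_urls = list({row.pr_url for row in rows if row.pr_url})
    new_status = fetch_pr_status_from_bigquery(client, new_urls, yesterday_midnight, now)

    seen, new_metadata = set(), []
    for row in rows:
        if not row.pr_url or row.pr_url in seen:
            continue
        seen.add(row.pr_url)
        info = new_status.get(row.pr_url, {'status': 'open', 'merged': False, 'closed_at': None})
        meta = extract_review_metadata_from_bigquery(row, info)
        meta['agent_identifier'] = identifier
        new_metadata.append(meta)

    # Step 3: persist combined metadata (e.g. via save_review_metadata_to_hf).
    save_fn(recent_metadata + new_metadata, identifier)
```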
msr.py CHANGED
@@ -1,17 +1,16 @@
1
  """
2
  Minimalist Review Metadata Mining Script
3
- Mines PR review metadata from GitHub and saves to HuggingFace dataset.
4
  """
5
 
6
  import json
7
  import os
8
- import time
9
- import requests
10
  from datetime import datetime, timezone, timedelta
11
  from collections import defaultdict
12
  from huggingface_hub import HfApi, hf_hub_download
13
  from dotenv import load_dotenv
14
- import random
15
 
16
  # Load environment variables
17
  load_dotenv()
@@ -52,792 +51,501 @@ def save_jsonl(filename, data):
52
  f.write(json.dumps(item) + '\n')
53
 
54
 
55
- def get_github_tokens():
56
- """Get all GitHub tokens from environment variables (all vars starting with GITHUB_TOKEN)."""
57
- tokens = []
58
- for key, value in os.environ.items():
59
- if key.startswith('GITHUB_TOKEN') and value:
60
- tokens.append(value)
61
-
62
- if not tokens:
63
- print("Warning: No GITHUB_TOKEN found. API rate limits: 60/hour (authenticated: 5000/hour)")
64
- else:
65
- print(f"βœ“ Loaded {len(tokens)} GitHub token(s) for rotation")
66
-
67
- return tokens
68
 
69
 
70
- class TokenPool:
71
  """
72
- Hybrid token pool with parallel execution and round-robin fallback.
73
 
74
- Splits tokens into two pools:
75
- - Parallel pool (50%): For concurrent API calls to maximize throughput
76
- - Round-robin pool (50%): Backup pool for rate limit fallback
77
-
78
- Features:
79
- - Automatic fallback when parallel tokens hit rate limits
80
- - Rate limit tracking with timestamp-based recovery
81
- - Thread-safe token management
82
- - Real-time statistics monitoring
83
  """
84
- def __init__(self, tokens):
85
- import threading
86
-
87
- self.all_tokens = tokens if tokens else [None]
88
- self.lock = threading.Lock()
89
-
90
- # Split tokens into parallel and round-robin pools (50/50)
91
- total_tokens = len(self.all_tokens)
92
- split_point = max(1, total_tokens // 2)
93
-
94
- self.parallel_tokens = self.all_tokens[:split_point]
95
- self.roundrobin_tokens = self.all_tokens[split_point:] if total_tokens > 1 else self.all_tokens
96
-
97
- # Round-robin index for fallback pool
98
- self.roundrobin_index = 0
99
-
100
- # Rate limit tracking: {token: reset_timestamp}
101
- self.parallel_rate_limited = set()
102
- self.roundrobin_rate_limited = set()
103
- self.rate_limit_resets = {}
104
-
105
- # Statistics
106
- self.stats = {
107
- 'parallel_calls': 0,
108
- 'roundrobin_calls': 0,
109
- 'fallback_triggers': 0
110
- }
111
-
112
- print(f"πŸ“Š Token Pool Initialized:")
113
- print(f" Total tokens: {total_tokens}")
114
- print(f" Parallel pool: {len(self.parallel_tokens)} tokens")
115
- print(f" Round-robin pool: {len(self.roundrobin_tokens)} tokens")
116
-
117
- def _cleanup_expired_rate_limits(self):
118
- """Remove tokens from rate-limited sets if their reset time has passed."""
119
- current_time = time.time()
120
- expired_tokens = [
121
- token for token, reset_time in self.rate_limit_resets.items()
122
- if current_time >= reset_time
123
- ]
124
-
125
- for token in expired_tokens:
126
- self.parallel_rate_limited.discard(token)
127
- self.roundrobin_rate_limited.discard(token)
128
- del self.rate_limit_resets[token]
129
- if expired_tokens:
130
- print(f" βœ“ Recovered {len(expired_tokens)} token(s) from rate limit")
131
-
132
- def get_parallel_token(self):
133
- """Get an available token from the parallel pool."""
134
- with self.lock:
135
- self._cleanup_expired_rate_limits()
136
-
137
- # Find first non-rate-limited parallel token
138
- for token in self.parallel_tokens:
139
- if token not in self.parallel_rate_limited:
140
- self.stats['parallel_calls'] += 1
141
- return token
142
-
143
- return None
144
-
145
- def get_roundrobin_token(self):
146
- """Get the next available token from round-robin pool."""
147
- with self.lock:
148
- self._cleanup_expired_rate_limits()
149
-
150
- # Try all tokens in round-robin order
151
- attempts = 0
152
- while attempts < len(self.roundrobin_tokens):
153
- token = self.roundrobin_tokens[self.roundrobin_index]
154
- self.roundrobin_index = (self.roundrobin_index + 1) % len(self.roundrobin_tokens)
155
-
156
- if token not in self.roundrobin_rate_limited:
157
- self.stats['roundrobin_calls'] += 1
158
- return token
159
-
160
- attempts += 1
161
-
162
- return None
163
-
164
- def get_next_token(self):
165
- """
166
- Get next available token, trying parallel pool first, then falling back to round-robin.
167
-
168
- Returns:
169
- Token string or None if all tokens are rate-limited
170
- """
171
- # Try parallel pool first
172
- token = self.get_parallel_token()
173
- if token:
174
- return token
175
-
176
- # Fallback to round-robin pool
177
- with self.lock:
178
- self.stats['fallback_triggers'] += 1
179
-
180
- token = self.get_roundrobin_token()
181
- if not token:
182
- print(" ⚠️ All tokens are rate-limited, waiting...")
183
 
184
- return token
 
 
 
 
185
 
186
- def get_headers(self):
187
- """Get headers with the next available token."""
188
- token = self.get_next_token()
189
- return {'Authorization': f'token {token}'} if token else {}
190
 
191
- def mark_rate_limited(self, token, reset_timestamp=None):
192
- """
193
- Mark a token as rate-limited with optional reset timestamp.
194
 
195
- Args:
196
- token: The token to mark as rate-limited
197
- reset_timestamp: Unix timestamp when rate limit resets (optional)
198
- """
199
- if not token:
200
- return
201
-
202
- with self.lock:
203
- # Determine which pool the token belongs to
204
- if token in self.parallel_tokens:
205
- self.parallel_rate_limited.add(token)
206
- if token in self.roundrobin_tokens:
207
- self.roundrobin_rate_limited.add(token)
208
-
209
- # Store reset timestamp if provided
210
- if reset_timestamp:
211
- self.rate_limit_resets[token] = reset_timestamp
212
- from datetime import datetime, timezone
213
- reset_time = datetime.fromtimestamp(reset_timestamp, tz=timezone.utc)
214
- print(f" ⏰ Token rate-limited until {reset_time.strftime('%H:%M:%S')} UTC")
215
-
216
- def get_available_parallel_tokens(self):
217
- """Get list of all available (non-rate-limited) parallel tokens."""
218
- with self.lock:
219
- self._cleanup_expired_rate_limits()
220
- return [t for t in self.parallel_tokens if t not in self.parallel_rate_limited]
221
-
222
- def get_stats(self):
223
- """Get token pool usage statistics."""
224
- with self.lock:
225
- return {
226
- 'parallel_calls': self.stats['parallel_calls'],
227
- 'roundrobin_calls': self.stats['roundrobin_calls'],
228
- 'fallback_triggers': self.stats['fallback_triggers'],
229
- 'parallel_rate_limited': len(self.parallel_rate_limited),
230
- 'roundrobin_rate_limited': len(self.roundrobin_rate_limited)
231
- }
232
 
233
- def print_stats(self):
234
- """Print token pool usage statistics."""
235
- stats = self.get_stats()
236
- total_calls = stats['parallel_calls'] + stats['roundrobin_calls']
237
-
238
- print(f"\nπŸ“Š Token Pool Statistics:")
239
- print(f" Total API calls: {total_calls}")
240
- if total_calls > 0:
241
- print(f" Parallel calls: {stats['parallel_calls']} ({stats['parallel_calls']/total_calls*100:.1f}%)")
242
- print(f" Round-robin calls: {stats['roundrobin_calls']} ({stats['roundrobin_calls']/total_calls*100:.1f}%)")
243
- print(f" Fallback triggers: {stats['fallback_triggers']}")
244
- print(f" Currently rate-limited: {stats['parallel_rate_limited']} parallel, {stats['roundrobin_rate_limited']} round-robin")
245
-
246
-
247
- def get_hf_token():
248
- """Get HuggingFace token from environment variables."""
249
- token = os.getenv('HF_TOKEN')
250
- if not token:
251
- print("Warning: HF_TOKEN not found in environment variables")
252
- return token
253
 
254
 
255
  # =============================================================================
256
- # GITHUB API FUNCTIONS
257
  # =============================================================================
258
 
259
- def request_with_backoff(method, url, *, headers=None, params=None, json_body=None, data=None, max_retries=10, timeout=30, token_pool=None, token=None):
260
  """
261
- Perform an HTTP request with exponential backoff and jitter for GitHub API.
262
- Retries on 403/429 (rate limits), 5xx server errors, and transient network exceptions.
 
 
263
 
264
  Args:
265
- token_pool: Optional TokenPool instance for rate limit tracking
266
- token: Optional token string to mark as rate-limited if 403/429 occurs
 
 
267
 
268
- Returns the final requests.Response on success or non-retryable status, or None after exhausting retries.
 
269
  """
270
- delay = 1.0
271
- for attempt in range(max_retries):
272
- try:
273
- resp = requests.request(
274
- method,
275
- url,
276
- headers=headers or {},
277
- params=params,
278
- json=json_body,
279
- data=data,
280
- timeout=timeout
281
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
282
 
283
- status = resp.status_code
284
-
285
- # Success
286
- if 200 <= status < 300:
287
- return resp
288
-
289
- # Rate limits or server errors -> retry with backoff
290
- if status in (403, 429) or 500 <= status < 600:
291
- wait = None
292
- reset_timestamp = None
293
-
294
- # Prefer Retry-After when present
295
- retry_after = resp.headers.get('Retry-After') or resp.headers.get('retry-after')
296
- if retry_after:
297
- try:
298
- wait = float(retry_after)
299
- except Exception:
300
- wait = None
301
-
302
- # Fallback to X-RateLimit-Reset when 403/429
303
- if wait is None and status in (403, 429):
304
- reset_hdr = resp.headers.get('X-RateLimit-Reset') or resp.headers.get('x-ratelimit-reset')
305
- if reset_hdr:
306
- try:
307
- reset_timestamp = int(float(reset_hdr))
308
- wait = max(reset_timestamp - time.time() + 2, 1)
309
- except Exception:
310
- wait = None
311
-
312
- # Mark token as rate-limited if we have token pool and token
313
- if status in (403, 429) and token_pool and token:
314
- token_pool.mark_rate_limited(token, reset_timestamp)
315
-
316
- # Final fallback: exponential backoff with jitter
317
- if wait is None:
318
- wait = delay + random.uniform(0, 0.5)
319
-
320
- # Cap individual wait to avoid extreme sleeps
321
- wait = max(1.0, min(wait, 120.0))
322
- print(f"GitHub API {status}. Backing off {wait:.1f}s (attempt {attempt + 1}/{max_retries})...")
323
- time.sleep(wait)
324
- delay = min(delay * 2, 60.0)
325
- continue
326
 
327
- # Non-retryable error; return response for caller to handle
328
- return resp
 
329
 
330
- except requests.RequestException as e:
331
- # Network error -> retry with backoff
332
- wait = delay + random.uniform(0, 0.5)
333
- wait = max(1.0, min(wait, 60.0))
334
- print(f"Request error: {e}. Retrying in {wait:.1f}s (attempt {attempt + 1}/{max_retries})...")
335
- time.sleep(wait)
336
- delay = min(delay * 2, 60.0)
337
 
338
- print(f"Exceeded max retries for {url}")
339
- return None
 
340
 
341
 
342
- def fetch_reviews_with_time_partition(base_query, start_date, end_date, token_pool, prs_by_url, depth=0):
343
  """
344
- Fetch reviews within a specific time range using time-based partitioning.
345
- Recursively splits the time range if hitting the 1000-result limit.
346
- Supports splitting by day, hour, minute, and second as needed.
347
 
348
- Returns the number of reviews found in this time partition.
349
- """
350
- # Calculate time difference
351
- time_diff = end_date - start_date
352
- total_seconds = time_diff.total_seconds()
353
-
354
- # Determine granularity and format dates accordingly
355
- if total_seconds >= 86400: # >= 1 day
356
- # Use day granularity (YYYY-MM-DD)
357
- start_str = start_date.strftime('%Y-%m-%d')
358
- end_str = end_date.strftime('%Y-%m-%d')
359
- elif total_seconds >= 3600: # >= 1 hour but < 1 day
360
- # Use hour granularity (YYYY-MM-DDTHH:MM:SSZ)
361
- start_str = start_date.strftime('%Y-%m-%dT%H:00:00Z')
362
- end_str = end_date.strftime('%Y-%m-%dT%H:59:59Z')
363
- elif total_seconds >= 60: # >= 1 minute but < 1 hour
364
- # Use minute granularity (YYYY-MM-DDTHH:MM:SSZ)
365
- start_str = start_date.strftime('%Y-%m-%dT%H:%M:00Z')
366
- end_str = end_date.strftime('%Y-%m-%dT%H:%M:59Z')
367
- else: # < 1 minute
368
- # Use second granularity (YYYY-MM-DDTHH:MM:SSZ)
369
- start_str = start_date.strftime('%Y-%m-%dT%H:%M:%SZ')
370
- end_str = end_date.strftime('%Y-%m-%dT%H:%M:%SZ')
371
-
372
- # Add date range to query (use created for PR search)
373
- query = f'{base_query} created:{start_str}..{end_str}'
374
-
375
- indent = " " + " " * depth
376
- print(f"{indent}Searching range {start_str} to {end_str}...")
377
-
378
- page = 1
379
- per_page = 100
380
- total_in_partition = 0
381
-
382
- while True:
383
- url = 'https://api.github.com/search/issues' # Use issues endpoint for PR search
384
- params = {
385
- 'q': query,
386
- 'per_page': per_page,
387
- 'page': page,
388
- 'sort': 'created',
389
- 'order': 'asc'
390
  }
391
- token = token_pool.get_next_token()
392
- headers = {'Authorization': f'token {token}'} if token else {}
 
393
 
394
- try:
395
- response = request_with_backoff('GET', url, headers=headers, params=params, token_pool=token_pool, token=token)
396
- if response is None:
397
- print(f"{indent} Error: retries exhausted for range {start_str} to {end_str}")
398
- return total_in_partition
399
-
400
- if response.status_code != 200:
401
- print(f"{indent} Error: HTTP {response.status_code} for range {start_str} to {end_str}")
402
- return total_in_partition
403
-
404
- data = response.json()
405
- total_count = data.get('total_count', 0)
406
- items = data.get('items', [])
407
-
408
- if not items:
409
- break
410
-
411
- # Add PR reviews to global dict (keyed by PR URL)
412
- for pr in items:
413
- pr_url = pr.get('html_url')
414
- if pr_url and pr_url not in prs_by_url:
415
- prs_by_url[pr_url] = pr
416
- total_in_partition += 1
417
-
418
- # Check if we hit the 1000-result limit
419
- if total_count > 1000 and page == 10:
420
- print(f"{indent} ⚠️ Hit 1000-result limit ({total_count} total). Splitting time range...")
421
-
422
- # Determine how to split based on time range duration
423
- if total_seconds < 2: # Less than 2 seconds - can't split further
424
- print(f"{indent} ⚠️ Cannot split further (range < 2 seconds). Some results may be missing.")
425
- break
426
-
427
- elif total_seconds < 120: # Less than 2 minutes - split by seconds
428
- num_splits = min(4, max(2, int(total_seconds / 30)))
429
- split_duration = time_diff / num_splits
430
- split_dates = [start_date + split_duration * i for i in range(num_splits + 1)]
431
-
432
- total_from_splits = 0
433
- for i in range(num_splits):
434
- split_start = split_dates[i]
435
- split_end = split_dates[i + 1]
436
- if i > 0:
437
- split_start = split_start + timedelta(seconds=1)
438
-
439
- count = fetch_reviews_with_time_partition(
440
- base_query, split_start, split_end, token_pool, prs_by_url, depth + 1
441
- )
442
- total_from_splits += count
443
-
444
- return total_from_splits
445
-
446
- elif total_seconds < 7200: # Less than 2 hours - split by minutes
447
- num_splits = min(4, max(2, int(total_seconds / 1800)))
448
- split_duration = time_diff / num_splits
449
- split_dates = [start_date + split_duration * i for i in range(num_splits + 1)]
450
-
451
- total_from_splits = 0
452
- for i in range(num_splits):
453
- split_start = split_dates[i]
454
- split_end = split_dates[i + 1]
455
- if i > 0:
456
- split_start = split_start + timedelta(minutes=1)
457
-
458
- count = fetch_reviews_with_time_partition(
459
- base_query, split_start, split_end, token_pool, prs_by_url, depth + 1
460
- )
461
- total_from_splits += count
462
-
463
- return total_from_splits
464
-
465
- elif total_seconds < 172800: # Less than 2 days - split by hours
466
- num_splits = min(4, max(2, int(total_seconds / 43200)))
467
- split_duration = time_diff / num_splits
468
- split_dates = [start_date + split_duration * i for i in range(num_splits + 1)]
469
-
470
- total_from_splits = 0
471
- for i in range(num_splits):
472
- split_start = split_dates[i]
473
- split_end = split_dates[i + 1]
474
- if i > 0:
475
- split_start = split_start + timedelta(hours=1)
476
-
477
- count = fetch_reviews_with_time_partition(
478
- base_query, split_start, split_end, token_pool, prs_by_url, depth + 1
479
- )
480
- total_from_splits += count
481
-
482
- return total_from_splits
483
-
484
- else: # 2+ days - split by days
485
- days_diff = time_diff.days
486
-
487
- # Use aggressive splitting for large ranges or deep recursion
488
- if days_diff > 30 or depth > 5:
489
- # Split into 4 parts for more aggressive partitioning
490
- quarter_diff = time_diff / 4
491
- split_dates = [
492
- start_date,
493
- start_date + quarter_diff,
494
- start_date + quarter_diff * 2,
495
- start_date + quarter_diff * 3,
496
- end_date
497
- ]
498
-
499
- total_from_splits = 0
500
- for i in range(4):
501
- split_start = split_dates[i]
502
- split_end = split_dates[i + 1]
503
- if i > 0:
504
- split_start = split_start + timedelta(days=1)
505
-
506
- count = fetch_reviews_with_time_partition(
507
- base_query, split_start, split_end, token_pool, prs_by_url, depth + 1
508
- )
509
- total_from_splits += count
510
-
511
- return total_from_splits
512
- else:
513
- # Binary split for smaller ranges
514
- mid_date = start_date + time_diff / 2
515
-
516
- count1 = fetch_reviews_with_time_partition(
517
- base_query, start_date, mid_date, token_pool, prs_by_url, depth + 1
518
- )
519
- count2 = fetch_reviews_with_time_partition(
520
- base_query, mid_date + timedelta(days=1), end_date, token_pool, prs_by_url, depth + 1
521
- )
522
-
523
- return count1 + count2
524
-
525
- # Normal pagination: check if there are more pages
526
- if len(items) < per_page or page >= 10:
527
- break
528
-
529
- page += 1
530
- time.sleep(0.5) # Courtesy delay between pages
531
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
532
  except Exception as e:
533
- print(f"{indent} Error fetching range {start_str} to {end_str}: {str(e)}")
534
- return total_in_partition
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
535
 
536
- if total_in_partition > 0:
537
- print(f"{indent} βœ“ Found {total_in_partition} reviews in range {start_str} to {end_str}")
 
538
 
539
- return total_in_partition
540
 
 
 
 
 
541
 
542
- def fetch_reviews_parallel(query_patterns, start_date, end_date, token_pool, prs_by_url):
543
- """
544
- Fetch reviews for multiple query patterns in parallel using available parallel tokens.
545
 
546
- This function uses ThreadPoolExecutor to execute multiple query patterns concurrently,
547
- with each pattern using a dedicated token from the parallel pool. Falls back to
548
- sequential execution if insufficient parallel tokens are available.
549
 
550
- Args:
551
- query_patterns: List of query pattern strings (e.g., ['is:pr author:bot1', 'is:pr reviewed-by:bot1'])
552
- start_date: Start datetime for time range
553
- end_date: End datetime for time range
554
- token_pool: TokenPool instance for token management
555
- prs_by_url: Dictionary to collect PRs by URL (shared across patterns)
556
 
557
- Returns:
558
- Total number of PRs found across all patterns
559
- """
560
- from concurrent.futures import ThreadPoolExecutor, as_completed
561
- import threading
562
-
563
- # Check how many parallel tokens are available
564
- available_tokens = token_pool.get_available_parallel_tokens()
565
-
566
- if len(available_tokens) < 2 or len(query_patterns) < 2:
567
- # Not enough tokens or patterns for parallelization, use sequential
568
- print(f" ⚠️ Sequential execution: {len(available_tokens)} parallel tokens available for {len(query_patterns)} patterns")
569
- total_found = 0
570
- for pattern in query_patterns:
571
- pattern_prs = {}
572
- count = fetch_reviews_with_time_partition(
573
- pattern, start_date, end_date, token_pool, pattern_prs, depth=0
574
- )
575
- # Merge pattern results into global dict
576
- lock = threading.Lock()
577
- with lock:
578
- for url, pr in pattern_prs.items():
579
- if url not in prs_by_url:
580
- prs_by_url[url] = pr
581
- total_found += count
582
- return total_found
583
-
584
- # Use parallel execution
585
- print(f" πŸš€ Parallel execution: {len(available_tokens)} parallel tokens for {len(query_patterns)} patterns")
586
-
587
- # Thread-safe lock for updating prs_by_url
588
- lock = threading.Lock()
589
-
590
- def fetch_pattern(pattern):
591
- """Fetch reviews for a single pattern (runs in parallel)."""
592
- pattern_prs = {}
593
- try:
594
- count = fetch_reviews_with_time_partition(
595
- pattern, start_date, end_date, token_pool, pattern_prs, depth=0
596
- )
597
- return pattern, pattern_prs, count
598
- except Exception as e:
599
- print(f" Error fetching pattern '{pattern}': {str(e)}")
600
- return pattern, {}, 0
601
-
602
- # Execute patterns in parallel
603
- max_workers = min(len(query_patterns), len(available_tokens))
604
- total_found = 0
605
-
606
- with ThreadPoolExecutor(max_workers=max_workers) as executor:
607
- # Submit all patterns
608
- future_to_pattern = {
609
- executor.submit(fetch_pattern, pattern): pattern
610
- for pattern in query_patterns
611
- }
612
 
613
- # Collect results as they complete
614
- for future in as_completed(future_to_pattern):
615
- pattern = future_to_pattern[future]
616
- try:
617
- _, pattern_prs, count = future.result()
 
 
 
618
 
619
- # Merge results into global dict (thread-safe)
620
- with lock:
621
- for url, pr in pattern_prs.items():
622
- if url not in prs_by_url:
623
- prs_by_url[url] = pr
624
 
625
- total_found += count
626
- print(f" βœ“ Pattern '{pattern}' completed: {count} PRs found")
627
 
628
- except Exception as e:
629
- print(f" βœ— Pattern '{pattern}' failed: {str(e)}")
630
 
631
- return total_found
 
 
 
632
 
633
 
634
- def extract_review_metadata(pr):
635
  """
636
- Extract minimal PR review metadata for efficient storage.
637
- Only keeps essential fields: html_url, reviewed_at, pr_status, pr_merged, pr_closed_at.
638
 
639
- PR status:
640
- - pr_status: 'open', 'merged', or 'closed'
641
- - pr_merged: True if PR was merged, False otherwise
642
- - pr_closed_at: Date when PR was closed/merged (if applicable)
 
 
643
  """
644
- pr_url = pr.get('html_url')
645
- pr_number = pr.get('number')
646
- created_at = pr.get('created_at')
647
- closed_at = pr.get('closed_at')
648
- state = pr.get('state', 'open') # open or closed
649
-
650
- # Check if PR has pull_request field (indicates it's a PR, not an issue)
651
- pull_request_data = pr.get('pull_request', {})
652
- pr_merged = pull_request_data.get('merged_at') is not None if pull_request_data else False
653
-
654
- # Determine initial status
655
- if pr_merged:
656
- status = 'merged'
657
- elif state == 'closed':
658
- status = 'closed'
659
- else:
660
- status = 'open'
661
 
662
  return {
663
  'html_url': pr_url,
664
- 'reviewed_at': created_at, # When the PR was created (agent reviewed it)
665
- 'pr_status': status,
666
- 'pr_merged': pr_merged,
667
- 'pr_closed_at': closed_at,
668
- 'pr_url': pr_url, # Store PR URL for tracking
669
- 'review_id': f"pr_{pr_number}" # Use PR number for deduplication
670
  }
671
 
672
 
673
- def update_pr_status(metadata_list, token_pool):
674
  """
675
- Update PR status for reviews to get current merged/closed state.
676
-
677
- For each PR associated with a review, fetch current status from GitHub API.
678
- Updates metadata_list in-place with PR status information.
679
 
680
  Args:
681
- metadata_list: List of review metadata dictionaries
682
- token_pool: TokenPool instance for rotating tokens
683
 
684
  Returns:
685
- Updated metadata_list with current PR status
686
  """
687
- if not metadata_list:
688
- return metadata_list
 
 
 
 
689
 
690
- # Track unique PRs to avoid duplicate API calls
691
- pr_url_to_status = {}
692
- updated_count = 0
 
693
 
694
- for metadata in metadata_list:
695
- pr_url = metadata.get('pr_url')
696
- if not pr_url:
697
- continue
698
 
699
- # Skip if already fetched for this PR
700
- if pr_url in pr_url_to_status:
701
- status_info = pr_url_to_status[pr_url]
702
- metadata['pr_status'] = status_info['status']
703
- metadata['pr_merged'] = status_info['merged']
704
- metadata['pr_closed_at'] = status_info['closed_at']
705
- continue
706
 
707
- try:
708
- # Convert HTML URL to API URL
709
- # https://github.com/owner/repo/pull/123 -> https://api.github.com/repos/owner/repo/pulls/123
710
- parts = pr_url.replace('https://github.com/', '').split('/')
711
- if len(parts) >= 4:
712
- owner, repo, pull_word, pr_number = parts[0], parts[1], parts[2], parts[3]
713
- api_url = f'https://api.github.com/repos/{owner}/{repo}/pulls/{pr_number}'
714
-
715
- token = token_pool.get_next_token()
716
- headers = {'Authorization': f'token {token}'} if token else {}
717
- response = request_with_backoff('GET', api_url, headers=headers, max_retries=3, token_pool=token_pool, token=token)
718
-
719
- if response and response.status_code == 200:
720
- pr_data = response.json()
721
- state = pr_data.get('state', 'open')
722
- merged = pr_data.get('merged', False)
723
- closed_at = pr_data.get('closed_at')
724
- merged_at = pr_data.get('merged_at')
725
-
726
- # Determine final status
727
- if merged:
728
- status = 'merged'
729
- elif state == 'closed':
730
- status = 'closed'
731
- else:
732
- status = 'open'
733
-
734
- status_info = {
735
- 'status': status,
736
- 'merged': merged,
737
- 'closed_at': closed_at or merged_at
738
- }
739
-
740
- # Cache and update
741
- pr_url_to_status[pr_url] = status_info
742
- metadata['pr_status'] = status
743
- metadata['pr_merged'] = merged
744
- metadata['pr_closed_at'] = closed_at or merged_at
745
- updated_count += 1
746
-
747
- # Small delay to avoid rate limiting
748
- time.sleep(0.1)
749
 
750
- except Exception as e:
751
- print(f" Warning: Could not check PR status for {pr_url}: {e}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
752
  continue
 
753
 
754
- if updated_count > 0:
755
- print(f" βœ“ Updated status for {updated_count} unique PRs")
 
 
 
756
 
757
- return metadata_list
 
758
 
 
759
 
760
- def fetch_all_reviews_metadata(identifier, agent_name, token_pool):
761
- """
762
- Fetch PR reviews associated with a GitHub user or bot for the past LEADERBOARD_TIME_FRAME_DAYS.
763
- Returns lightweight metadata instead of full review objects.
764
 
765
- This function employs time-based partitioning to navigate GitHub's 1000-result limit per query.
766
- It searches using the query pattern:
767
- - reviewed-by:{identifier} (PR reviews by the agent)
768
 
769
- After fetching reviews, it updates PR status to determine if PRs were merged or closed.
 
 
 
770
 
771
  Args:
772
- identifier: GitHub username or bot identifier
773
- agent_name: Human-readable name of the agent for metadata purposes
774
- token_pool: TokenPool instance for rotating tokens
775
 
776
  Returns:
777
- List of dictionaries containing minimal PR review metadata with PR status
 
 
 
 
778
  """
 
 
779
 
780
- # Define query pattern for PR reviews
781
- query_patterns = [f'is:pr reviewed-by:{identifier}']
782
-
783
- # Use a dict to deduplicate PRs by URL
784
- prs_by_url = {}
 
785
 
786
  # Define time range: past LEADERBOARD_TIME_FRAME_DAYS (excluding today)
787
  current_time = datetime.now(timezone.utc)
788
- end_date = current_time.replace(hour=0, minute=0, second=0, microsecond=0) # 12:00 AM UTC today
789
  start_date = end_date - timedelta(days=LEADERBOARD_TIME_FRAME_DAYS)
790
 
791
- print(f"\nπŸ” Searching for PRs reviewed by {identifier}")
792
- print(f" Time range: {start_date.strftime('%Y-%m-%d')} to {end_date.strftime('%Y-%m-%d')} (today excluded)")
793
- print(f" Query patterns: {len(query_patterns)}")
794
-
795
- overall_start_time = time.time()
796
-
797
- # Use parallel execution if multiple patterns and sufficient tokens
798
- if len(query_patterns) > 1:
799
- reviews_found = fetch_reviews_parallel(
800
- query_patterns,
801
- start_date,
802
- end_date,
803
- token_pool,
804
- prs_by_url
805
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
806
  else:
807
- # Single pattern, use sequential
808
- reviews_found = fetch_reviews_with_time_partition(
809
- query_patterns[0],
810
- start_date,
811
- end_date,
812
- token_pool,
813
- prs_by_url
814
- )
815
 
816
- overall_duration = time.time() - overall_start_time
817
- print(f" βœ“ All patterns complete: {len(prs_by_url)} unique PRs found")
818
- print(f" ⏱️ Total time: {overall_duration:.1f} seconds")
 
 
819
 
820
- all_prs = list(prs_by_url.values())
 
 
 
 
821
 
822
- print(f"\nβœ… COMPLETE: Found {len(all_prs)} unique PRs reviewed by {identifier}")
823
- print(f"πŸ“¦ Extracting minimal metadata and updating PR status...")
824
 
825
- # Extract metadata for each PR review
826
- metadata_list = [extract_review_metadata(pr) for pr in all_prs]
 
 
 
 
 
 
827
 
828
- # Update PR status to get current merged/closed state
829
- print(f"πŸ” Updating PR status for reviewed PRs...")
830
- metadata_list = update_pr_status(metadata_list, token_pool)
 
 
831
 
832
- # Calculate memory savings
833
- import sys
834
- original_size = sys.getsizeof(str(all_prs))
835
- metadata_size = sys.getsizeof(str(metadata_list))
836
- savings_pct = ((original_size - metadata_size) / original_size * 100) if original_size > 0 else 0
837
 
838
- print(f"πŸ’Ύ Memory efficiency: {original_size // 1024}KB β†’ {metadata_size // 1024}KB (saved {savings_pct:.1f}%)")
 
839
 
840
- return metadata_list
841
 
842
 
843
  # =============================================================================
@@ -866,37 +574,6 @@ def group_metadata_by_date(metadata_list):
866
  return dict(grouped)
867
 
868
 
869
- def upload_with_retry(api, path_or_fileobj, path_in_repo, repo_id, repo_type, token, max_retries=5):
870
- """
871
- Upload file to HuggingFace with exponential backoff retry logic.
872
- """
873
- delay = 2.0
874
-
875
- for attempt in range(max_retries):
876
- try:
877
- api.upload_file(
878
- path_or_fileobj=path_or_fileobj,
879
- path_in_repo=path_in_repo,
880
- repo_id=repo_id,
881
- repo_type=repo_type,
882
- token=token
883
- )
884
- if attempt > 0:
885
- print(f" βœ“ Upload succeeded on attempt {attempt + 1}/{max_retries}")
886
- return True
887
-
888
- except Exception as e:
889
- if attempt < max_retries - 1:
890
- wait_time = delay + random.uniform(0, 1.0)
891
- print(f" ⚠️ Upload failed (attempt {attempt + 1}/{max_retries}): {str(e)}")
892
- print(f" ⏳ Retrying in {wait_time:.1f} seconds...")
893
- time.sleep(wait_time)
894
- delay = min(delay * 2, 60.0)
895
- else:
896
- print(f" βœ— Upload failed after {max_retries} attempts: {str(e)}")
897
- raise
898
-
899
-
900
  def save_review_metadata_to_hf(metadata_list, agent_identifier):
901
  """
902
  Save review metadata to HuggingFace dataset, organized by [agent_identifier]/YYYY.MM.DD.jsonl.
@@ -909,7 +586,6 @@ def save_review_metadata_to_hf(metadata_list, agent_identifier):
909
  metadata_list: List of review metadata dictionaries
910
  agent_identifier: GitHub identifier of the agent (used as folder name)
911
  """
912
- import tempfile
913
  import shutil
914
 
915
  try:
@@ -987,7 +663,11 @@ def save_review_metadata_to_hf(metadata_list, agent_identifier):
987
 
988
 
989
  def load_agents_from_hf():
990
- """Load all agent metadata JSON files from HuggingFace dataset."""
 
 
 
 
991
  try:
992
  api = HfApi()
993
  agents = []
@@ -1011,6 +691,11 @@ def load_agents_from_hf():
1011
 
1012
  with open(file_path, 'r') as f:
1013
  agent_data = json.load(f)
 
 
 
 
 
1014
  agents.append(agent_data)
1015
 
1016
  except Exception as e:
@@ -1032,10 +717,8 @@ def load_agents_from_hf():
1032
  def mine_all_agents():
1033
  """
1034
  Mine review metadata for all agents within LEADERBOARD_TIME_FRAME_DAYS and save to HuggingFace.
 
1035
  """
1036
- tokens = get_github_tokens()
1037
- token_pool = TokenPool(tokens)
1038
-
1039
  # Load agent metadata from HuggingFace
1040
  agents = load_agents_from_hf()
1041
  if not agents:
@@ -1045,34 +728,43 @@ def mine_all_agents():
1045
  print(f"\n{'='*80}")
1046
  print(f"Starting review metadata mining for {len(agents)} agents")
1047
  print(f"Time frame: Last {LEADERBOARD_TIME_FRAME_DAYS} days")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1048
  print(f"{'='*80}\n")
1049
 
1050
- # Mine each agent
1051
  for agent in agents:
1052
  identifier = agent.get('github_identifier')
1053
- agent_name = agent.get('agent_name', 'Unknown')
1054
 
1055
  if not identifier:
1056
  print(f"Warning: Skipping agent without identifier: {agent}")
1057
  continue
1058
 
1059
- try:
1060
- print(f"\n{'='*80}")
1061
- print(f"Processing: {agent_name} ({identifier})")
1062
- print(f"{'='*80}")
1063
-
1064
- # Fetch review metadata
1065
- metadata = fetch_all_reviews_metadata(identifier, agent_name, token_pool)
1066
 
 
1067
  if metadata:
1068
- print(f"πŸ’Ύ Saving {len(metadata)} review records...")
1069
  save_review_metadata_to_hf(metadata, identifier)
1070
- print(f"βœ“ Successfully processed {agent_name}")
1071
  else:
1072
  print(f" No reviews found for {agent_name}")
1073
 
1074
  except Exception as e:
1075
- print(f"βœ— Error processing {identifier}: {str(e)}")
1076
  import traceback
1077
  traceback.print_exc()
1078
  continue
 
1
  """
2
  Minimalist Review Metadata Mining Script
3
+ Mines PR review metadata from GitHub Archive via BigQuery and saves to HuggingFace dataset.
4
  """
5
 
6
  import json
7
  import os
8
+ import tempfile
 
9
  from datetime import datetime, timezone, timedelta
10
  from collections import defaultdict
11
  from huggingface_hub import HfApi, hf_hub_download
12
  from dotenv import load_dotenv
13
+ from google.cloud import bigquery
14
 
15
  # Load environment variables
16
  load_dotenv()
 
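The get_bigquery_client helper added below (and duplicated in app.py above) writes the service-account JSON to a temporary file and points GOOGLE_APPLICATION_CREDENTIALS at it. This is not what the commit does, but the same client can be built without touching disk; a minimal sketch using google-auth's service_account module (get_bigquery_client_in_memory is a hypothetical name):

```python
# Hedged alternative to the temp-file credential handoff: construct the
# credentials object directly from the JSON string kept in the environment.
import json
import os

from google.cloud import bigquery
from google.oauth2 import service_account

def get_bigquery_client_in_memory():
    creds_json = os.environ.get('GOOGLE_APPLICATION_CREDENTIALS_JSON')
    if not creds_json:
        raise ValueError("GOOGLE_APPLICATION_CREDENTIALS_JSON not found in environment")
    info = json.loads(creds_json)
    credentials = service_account.Credentials.from_service_account_info(info)
    return bigquery.Client(credentials=credentials, project=info.get('project_id'))
```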
51
  f.write(json.dumps(item) + '\n')
52
 
53
 
54
+ def get_hf_token():
55
+ """Get HuggingFace token from environment variables."""
56
+ token = os.getenv('HF_TOKEN')
57
+ if not token:
58
+ print("Warning: HF_TOKEN not found in environment variables")
59
+ return token
 
 
 
 
 
 
 
60
 
61
 
62
+ def get_bigquery_client():
63
  """
64
+ Initialize BigQuery client using credentials from environment variable.
65
 
66
+ Expects GOOGLE_APPLICATION_CREDENTIALS_JSON environment variable containing
67
+ the service account JSON credentials as a string.
 
 
 
 
 
 
 
68
  """
69
+ # Get the JSON content from environment variable
70
+ creds_json = os.environ.get('GOOGLE_APPLICATION_CREDENTIALS_JSON')
 
71
 
72
+ if creds_json:
73
+ # Create a temporary file to store credentials
74
+ with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.json') as temp_file:
75
+ temp_file.write(creds_json)
76
+ temp_path = temp_file.name
77
 
78
+ # Set environment variable to point to temp file
79
+ os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = temp_path
 
 
80
 
81
+ # Initialize BigQuery client
82
+ client = bigquery.Client()
 
83
 
84
+ # Remove the temp credentials file; the client has already read it
85
+ os.unlink(temp_path)
86
 
87
+ return client
88
+ else:
89
+ raise ValueError("GOOGLE_APPLICATION_CREDENTIALS_JSON not found in environment")
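Since the githubarchive day tables are large, it can help to sanity-check the credentials and estimate scan cost before mining. A minimal sketch using BigQuery's dry-run mode (the table date below is only an example):

# Hedged sketch: validate credentials and estimate bytes scanned without running the query.
from google.cloud import bigquery

def estimate_scan_bytes(client, sql):
    job_config = bigquery.QueryJobConfig(dry_run=True, use_query_cache=False)
    job = client.query(sql, job_config=job_config)
    return job.total_bytes_processed  # bytes the query would process

# client = get_bigquery_client()
# sql = "SELECT type FROM `githubarchive.day.20240101` WHERE type = 'PullRequestReviewEvent'"
# print(f"Would scan ~{estimate_scan_bytes(client, sql) / 1e9:.2f} GB")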
90
 
91
 
92
  # =============================================================================
93
+ # BIGQUERY FUNCTIONS
94
  # =============================================================================
95
 
96
+ def fetch_reviews_from_bigquery(client, identifier, start_date, end_date):
97
  """
98
+ Fetch PR review events from GitHub Archive for a specific agent.
99
+
100
+ Queries githubarchive.day.YYYYMMDD tables for PullRequestReviewEvent where
101
+ actor.login matches the agent identifier.
102
 
103
  Args:
104
+ client: BigQuery client instance
105
+ identifier: GitHub username or bot identifier (e.g., 'amazon-inspector-beta[bot]')
106
+ start_date: Start datetime (timezone-aware)
107
+ end_date: End datetime (timezone-aware)
108
 
109
+ Returns:
110
+ List of review event rows with PR information
111
  """
112
+ print(f"\nπŸ” Querying BigQuery for reviews by {identifier}")
113
+ print(f" Time range: {start_date.strftime('%Y-%m-%d')} to {end_date.strftime('%Y-%m-%d')}")
114
+
115
+ # Generate list of table names for each day in the range
116
+ table_refs = []
117
+ current_date = start_date
118
+ while current_date < end_date:
119
+ table_name = f"githubarchive.day.{current_date.strftime('%Y%m%d')}"
120
+ table_refs.append(table_name)
121
+ current_date += timedelta(days=1)
122
+
123
+ # Build UNION ALL query for all daily tables
124
+ union_parts = []
125
+ for table_name in table_refs:
126
+ union_parts.append(f"""
127
+ SELECT
128
+ repo.name as repo_name,
129
+ actor.login as actor_login,
130
+ JSON_EXTRACT_SCALAR(payload, '$.pull_request.html_url') as pr_url,
131
+ CAST(JSON_EXTRACT_SCALAR(payload, '$.pull_request.number') AS INT64) as pr_number,
132
+ JSON_EXTRACT_SCALAR(payload, '$.review.submitted_at') as reviewed_at,
133
+ created_at
134
+ FROM `{table_name}`
135
+ WHERE type = 'PullRequestReviewEvent'
136
+ AND actor.login = @identifier
137
+ """)
138
+
139
+ query = " UNION ALL ".join(union_parts)
140
+
141
+ job_config = bigquery.QueryJobConfig(
142
+ query_parameters=[
143
+ bigquery.ScalarQueryParameter("identifier", "STRING", identifier)
144
+ ]
145
+ )
146
 
147
+ print(f" Querying {len(table_refs)} daily tables...")
148
 
149
+ try:
150
+ query_job = client.query(query, job_config=job_config)
151
+ results = list(query_job.result())
152
 
153
+ print(f" βœ“ Found {len(results)} review events")
154
+ return results
155
 
156
+ except Exception as e:
157
+ print(f" βœ— BigQuery error: {str(e)}")
158
+ return []
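The per-day UNION ALL above is straightforward, but GitHub Archive's daily tables also work with BigQuery table wildcards, which keeps the statement short for long time frames. A hedged alternative sketch (not the script's actual query; assumes the module's datetime imports):

# Hedged sketch: same filters expressed with a table wildcard instead of UNION ALL.
def build_wildcard_review_query(start_date, end_date):
    # githubarchive.day tables are named YYYYMMDD, so _TABLE_SUFFIX can bound the range
    start_suffix = start_date.strftime('%Y%m%d')
    end_suffix = (end_date - timedelta(days=1)).strftime('%Y%m%d')
    return f"""
    SELECT
      repo.name AS repo_name,
      actor.login AS actor_login,
      JSON_EXTRACT_SCALAR(payload, '$.pull_request.html_url') AS pr_url,
      CAST(JSON_EXTRACT_SCALAR(payload, '$.pull_request.number') AS INT64) AS pr_number,
      JSON_EXTRACT_SCALAR(payload, '$.review.submitted_at') AS reviewed_at,
      created_at
    FROM `githubarchive.day.*`
    WHERE _TABLE_SUFFIX BETWEEN '{start_suffix}' AND '{end_suffix}'
      AND type = 'PullRequestReviewEvent'
      AND actor.login = @identifier
    """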
159
 
160
 
161
+ def fetch_pr_status_from_bigquery(client, pr_urls, start_date, end_date):
162
  """
163
+ Fetch PR status (merged/closed) from GitHub Archive PullRequestEvent.
 
 
164
 
165
+ For each PR URL, looks for PullRequestEvent with action='closed' to determine
166
+ if the PR was merged or just closed.
167
+
168
+ Args:
169
+ client: BigQuery client instance
170
+ pr_urls: List of PR URLs to check status for
171
+ start_date: Start datetime (should cover review period and after)
172
+ end_date: End datetime (should be recent/current)
173
+
174
+ Returns:
175
+ Dictionary mapping PR URL to status dict:
176
+ {
177
+ 'pr_url': {
178
+ 'status': 'merged'|'closed'|'open',
179
+ 'merged': bool,
180
+ 'closed_at': timestamp or None
181
+ }
182
  }
183
+ """
184
+ if not pr_urls:
185
+ return {}
186
 
187
+ print(f"\nπŸ” Querying BigQuery for PR status ({len(pr_urls)} PRs)...")
188
 
189
+ # Extract repo and PR number from URLs
190
+ # URL format: https://github.com/owner/repo/pull/123
191
+ pr_info = []
192
+ for url in pr_urls:
193
+ try:
194
+ parts = url.replace('https://github.com/', '').split('/')
195
+ if len(parts) >= 4:
196
+ owner = parts[0]
197
+ repo = parts[1]
198
+ pr_number = int(parts[3])
199
+ repo_name = f"{owner}/{repo}"
200
+ pr_info.append({
201
+ 'url': url,
202
+ 'repo': repo_name,
203
+ 'number': pr_number
204
+ })
205
  except Exception as e:
206
+ print(f" Warning: Could not parse PR URL {url}: {e}")
207
+ continue
208
+
209
+ if not pr_info:
210
+ return {}
211
+
212
+ # Build repo filter condition for WHERE clause
213
+ # Group PRs by repo to create efficient filters
214
+ repos_to_prs = defaultdict(list)
215
+ for pr in pr_info:
216
+ repos_to_prs[pr['repo']].append(pr['number'])
217
+
218
+ # Generate list of table names for date range
219
+ # Look back a full year from end_date so close events on reviewed PRs are caught even if they fall outside the review window
220
+ pr_status_start = end_date - timedelta(days=365)
221
+ table_refs = []
222
+ current_date = pr_status_start
223
+ while current_date < end_date:
224
+ table_name = f"githubarchive.day.{current_date.strftime('%Y%m%d')}"
225
+ table_refs.append(table_name)
226
+ current_date += timedelta(days=1)
227
+
228
+ # Build WHERE clause to filter by specific repos and PR numbers
229
+ # Format: (repo='owner/repo1' AND pr_number IN (1,2,3)) OR (repo='owner/repo2' AND pr_number IN (4,5))
230
+ filter_conditions = []
231
+ for repo, pr_numbers in repos_to_prs.items():
232
+ pr_list = ','.join(map(str, pr_numbers))
233
+ filter_conditions.append(f"(repo.name = '{repo}' AND CAST(JSON_EXTRACT_SCALAR(payload, '$.pull_request.number') AS INT64) IN ({pr_list}))")
234
+
235
+ pr_filter = " OR ".join(filter_conditions)
236
+
237
+ # Build query to find close/merge events for specific PRs
238
+ union_parts = []
239
+ for table_name in table_refs:
240
+ union_parts.append(f"""
241
+ SELECT
242
+ repo.name as repo_name,
243
+ CAST(JSON_EXTRACT_SCALAR(payload, '$.pull_request.number') AS INT64) as pr_number,
244
+ JSON_EXTRACT_SCALAR(payload, '$.pull_request.html_url') as pr_url,
245
+ JSON_EXTRACT_SCALAR(payload, '$.action') as action,
246
+ CAST(JSON_EXTRACT_SCALAR(payload, '$.pull_request.merged') AS BOOL) as merged,
247
+ JSON_EXTRACT_SCALAR(payload, '$.pull_request.closed_at') as closed_at,
248
+ JSON_EXTRACT_SCALAR(payload, '$.pull_request.merged_at') as merged_at,
249
+ created_at
250
+ FROM `{table_name}`
251
+ WHERE type = 'PullRequestEvent'
252
+ AND JSON_EXTRACT_SCALAR(payload, '$.action') = 'closed'
253
+ AND ({pr_filter})
254
+ """)
255
+
256
+ query = " UNION ALL ".join(union_parts)
257
+
258
+ print(f" Querying {len(table_refs)} daily tables for PR status (1-year lookback: {pr_status_start.strftime('%Y-%m-%d')} to {end_date.strftime('%Y-%m-%d')})...")
259
+ print(f" Filtering for {len(pr_info)} specific PRs across {len(repos_to_prs)} repos")
260
 
261
+ try:
262
+ query_job = client.query(query)
263
+ results = list(query_job.result())
264
 
265
+ print(f" βœ“ Found {len(results)} PR close events")
266
 
267
+ # Build status map by PR URL
268
+ status_map = {}
269
+ for row in results:
270
+ pr_url = row.pr_url
271
 
272
+ merged = row.merged if row.merged is not None else False
273
+ closed_at = row.closed_at or row.merged_at
 
274
 
275
+ # Convert to ISO format if datetime
276
+ if hasattr(closed_at, 'isoformat'):
277
+ closed_at = closed_at.isoformat()
278
 
279
+ status = 'merged' if merged else 'closed'
280
 
281
+ status_map[pr_url] = {
282
+ 'status': status,
283
+ 'merged': merged,
284
+ 'closed_at': closed_at
285
+ }
286
 
287
+ # Mark remaining PRs as open
288
+ for url in pr_urls:
289
+ if url not in status_map:
290
+ status_map[url] = {
291
+ 'status': 'open',
292
+ 'merged': False,
293
+ 'closed_at': None
294
+ }
295
 
296
+ merged_count = sum(1 for s in status_map.values() if s['merged'])
297
+ closed_count = sum(1 for s in status_map.values() if s['status'] == 'closed')
298
+ open_count = sum(1 for s in status_map.values() if s['status'] == 'open')
 
 
299
 
300
+ print(f" Status breakdown: {merged_count} merged, {closed_count} closed, {open_count} open")
 
301
 
302
+ return status_map
 
303
 
304
+ except Exception as e:
305
+ print(f" βœ— BigQuery error: {str(e)}")
306
+ # Return all as open on error
307
+ return {url: {'status': 'open', 'merged': False, 'closed_at': None} for url in pr_urls}
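One subtlety the status map glosses over: a PR can be closed, reopened, and closed again, so several 'closed' events may match and the winner depends on row order. If that matters, keeping only the most recent event per URL makes the result deterministic; a minimal hedged sketch:

# Hedged sketch: keep only the latest close event per PR URL before building status_map.
def latest_close_events(rows):
    latest = {}
    for row in rows:
        prev = latest.get(row.pr_url)
        # created_at is the GitHub Archive event timestamp
        if prev is None or row.created_at > prev.created_at:
            latest[row.pr_url] = row
    return list(latest.values())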
308
 
309
 
310
+ def extract_review_metadata(review_row, status_info):
311
  """
312
+ Extract minimal PR review metadata from BigQuery row and status info.
 
313
 
314
+ Args:
315
+ review_row: BigQuery row from PullRequestReviewEvent query
316
+ status_info: Status dictionary from fetch_pr_status_from_bigquery
317
+
318
+ Returns:
319
+ Dictionary with review metadata
320
  """
321
+ pr_url = review_row.pr_url
322
+ pr_number = review_row.pr_number
323
+ reviewed_at = review_row.reviewed_at or review_row.created_at
324
+
325
+ # Convert to ISO format if datetime
326
+ if hasattr(reviewed_at, 'isoformat'):
327
+ reviewed_at = reviewed_at.isoformat()
328
 
329
  return {
330
  'html_url': pr_url,
331
+ 'reviewed_at': reviewed_at,
332
+ 'pr_status': status_info['status'],
333
+ 'pr_merged': status_info['merged'],
334
+ 'pr_closed_at': status_info['closed_at'],
335
+ 'pr_url': pr_url,
336
+ 'review_id': f"pr_{pr_number}"
337
  }
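Because the function only reads attributes off the row, it can be exercised without BigQuery by standing in a simple namespace object; a hedged sketch with hypothetical values:

# Hedged sketch: a fake row (hypothetical PR) is enough to test extract_review_metadata.
from types import SimpleNamespace

fake_row = SimpleNamespace(
    pr_url='https://github.com/octo-org/octo-repo/pull/42',
    pr_number=42,
    reviewed_at='2025-01-15T10:00:00Z',
    created_at='2025-01-15T10:00:00Z',
)
record = extract_review_metadata(fake_row, {'status': 'merged', 'merged': True, 'closed_at': None})
assert record['review_id'] == 'pr_42' and record['pr_status'] == 'merged'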
338
 
339
 
340
+ def fetch_all_reviews_metadata(identifier, agent_name):
341
  """
342
+ Fetch PR reviews associated with a GitHub user or bot for the past LEADERBOARD_TIME_FRAME_DAYS.
343
+ Uses BigQuery to query GitHub Archive instead of the GitHub API.
 
 
344
 
345
  Args:
346
+ identifier: GitHub username or bot identifier (for BigQuery queries)
347
+ agent_name: Human-readable name of the agent (for display only)
348
 
349
  Returns:
350
+ List of dictionaries containing minimal PR review metadata with PR status
351
  """
352
+ # Initialize BigQuery client
353
+ try:
354
+ client = get_bigquery_client()
355
+ except Exception as e:
356
+ print(f"βœ— Failed to initialize BigQuery client: {str(e)}")
357
+ return []
358
 
359
+ # Define time range: past LEADERBOARD_TIME_FRAME_DAYS (excluding today)
360
+ current_time = datetime.now(timezone.utc)
361
+ end_date = current_time.replace(hour=0, minute=0, second=0, microsecond=0) # 12:00 AM UTC today
362
+ start_date = end_date - timedelta(days=LEADERBOARD_TIME_FRAME_DAYS)
363
 
364
+ print(f"\n{'='*80}")
365
+ print(f"Fetching reviews for: {agent_name} ({identifier})")
366
+ print(f"{'='*80}")
 
367
 
368
+ # Fetch review events from BigQuery
369
+ review_rows = fetch_reviews_from_bigquery(client, identifier, start_date, end_date)
370
 
371
+ if not review_rows:
372
+ print(f" No reviews found for {identifier}")
373
+ return []
374
 
375
+ # Extract unique PR URLs
376
+ pr_urls = list(set([row.pr_url for row in review_rows if row.pr_url]))
377
+ print(f"\nπŸ“Š Found {len(review_rows)} review events across {len(pr_urls)} unique PRs")
378
+
379
+ # Fetch PR status from BigQuery
380
+ # Use extended end date to catch recent merges/closes
381
+ extended_end_date = current_time
382
+ status_map = fetch_pr_status_from_bigquery(client, pr_urls, start_date, extended_end_date)
383
+
384
+ # Extract metadata for each review
385
+ print(f"\nπŸ“¦ Extracting metadata...")
386
+ metadata_list = []
387
+
388
+ # Deduplicate by PR URL (multiple reviews on same PR)
389
+ seen_prs = set()
390
+ for row in review_rows:
391
+ pr_url = row.pr_url
392
+ if pr_url in seen_prs:
393
  continue
394
+ seen_prs.add(pr_url)
395
 
396
+ status_info = status_map.get(pr_url, {
397
+ 'status': 'open',
398
+ 'merged': False,
399
+ 'closed_at': None
400
+ })
401
 
402
+ metadata = extract_review_metadata(row, status_info)
403
+ metadata_list.append(metadata)
404
 
405
+ print(f" βœ“ Extracted {len(metadata_list)} unique PR review records")
406
 
407
+ return metadata_list
408
 
409
 
410
+ def fetch_all_reviews_metadata_batch(agents):
411
+ """
412
+ Fetch PR reviews for ALL agents in a single batch operation.
413
+ Uses only 2 BigQuery queries total (instead of 2*N queries for N agents).
414
 
415
  Args:
416
+ agents: List of agent dictionaries with 'github_identifier' and 'name' fields
 
 
417
 
418
  Returns:
419
+ Dictionary mapping agent identifier to list of review metadata:
420
+ {
421
+ 'agent-identifier': [metadata_list],
422
+ ...
423
+ }
424
  """
425
+ if not agents:
426
+ return {}
427
 
428
+ # Initialize BigQuery client
429
+ try:
430
+ client = get_bigquery_client()
431
+ except Exception as e:
432
+ print(f"βœ— Failed to initialize BigQuery client: {str(e)}")
433
+ return {}
434
 
435
  # Define time range: past LEADERBOARD_TIME_FRAME_DAYS (excluding today)
436
  current_time = datetime.now(timezone.utc)
437
+ end_date = current_time.replace(hour=0, minute=0, second=0, microsecond=0)
438
  start_date = end_date - timedelta(days=LEADERBOARD_TIME_FRAME_DAYS)
439
 
440
+ # Extract all identifiers
441
+ identifiers = [agent['github_identifier'] for agent in agents if agent.get('github_identifier')]
442
+ if not identifiers:
443
+ return {}
444
+
445
+ print(f"\nπŸš€ BATCH MODE: Fetching reviews for {len(identifiers)} agents in 2 queries")
446
+ print(f" Time range: {start_date.strftime('%Y-%m-%d')} to {end_date.strftime('%Y-%m-%d')}")
447
+
448
+ # =========================================================================
449
+ # QUERY 1: Fetch ALL review events for ALL agents in one query
450
+ # =========================================================================
451
+ print(f"\nπŸ” Query 1/2: Fetching ALL review events...")
452
+
453
+ # Generate list of table names
454
+ table_refs = []
455
+ current_date = start_date
456
+ while current_date < end_date:
457
+ table_name = f"githubarchive.day.{current_date.strftime('%Y%m%d')}"
458
+ table_refs.append(table_name)
459
+ current_date += timedelta(days=1)
460
+
461
+ # Build IN clause for all identifiers
462
+ identifier_list = ', '.join([f"'{id}'" for id in identifiers])
463
+
464
+ # Build UNION ALL query for all daily tables
465
+ union_parts = []
466
+ for table_name in table_refs:
467
+ union_parts.append(f"""
468
+ SELECT
469
+ repo.name as repo_name,
470
+ actor.login as actor_login,
471
+ JSON_EXTRACT_SCALAR(payload, '$.pull_request.html_url') as pr_url,
472
+ CAST(JSON_EXTRACT_SCALAR(payload, '$.pull_request.number') AS INT64) as pr_number,
473
+ JSON_EXTRACT_SCALAR(payload, '$.review.submitted_at') as reviewed_at,
474
+ created_at
475
+ FROM `{table_name}`
476
+ WHERE type = 'PullRequestReviewEvent'
477
+ AND actor.login IN ({identifier_list})
478
+ """)
479
+
480
+ query = " UNION ALL ".join(union_parts)
481
+
482
+ print(f" Querying {len(table_refs)} daily tables...")
483
+
484
+ try:
485
+ query_job = client.query(query)
486
+ all_review_rows = list(query_job.result())
487
+ print(f" βœ“ Found {len(all_review_rows)} total review events")
488
+ except Exception as e:
489
+ print(f" βœ— BigQuery error: {str(e)}")
490
+ return {}
491
+
492
+ # Group reviews by agent
493
+ reviews_by_agent = defaultdict(list)
494
+ all_pr_urls = set()
495
+ for row in all_review_rows:
496
+ reviews_by_agent[row.actor_login].append(row)
497
+ if row.pr_url:
498
+ all_pr_urls.add(row.pr_url)
499
+
500
+ print(f" πŸ“Š Reviews found for {len(reviews_by_agent)} agents")
501
+ print(f" πŸ“Š {len(all_pr_urls)} unique PRs to check status for")
502
+
503
+ # =========================================================================
504
+ # QUERY 2: Fetch ALL PR statuses in one query
505
+ # =========================================================================
506
+ if all_pr_urls:
507
+ print(f"\nπŸ” Query 2/2: Fetching ALL PR statuses...")
508
+ extended_end_date = current_time
509
+ status_map = fetch_pr_status_from_bigquery(client, list(all_pr_urls), start_date, extended_end_date)
510
  else:
511
+ status_map = {}
512
 
513
+ # =========================================================================
514
+ # Post-process: Build metadata for each agent
515
+ # =========================================================================
516
+ print(f"\nπŸ“¦ Processing metadata for each agent...")
517
+ results = {}
518
 
519
+ for agent in agents:
520
+ identifier = agent.get('github_identifier')
521
+ if not identifier or identifier not in reviews_by_agent:
522
+ results[identifier] = []
523
+ continue
524
 
525
+ review_rows = reviews_by_agent[identifier]
 
526
 
527
+ # Deduplicate by PR URL
528
+ metadata_list = []
529
+ seen_prs = set()
530
+ for row in review_rows:
531
+ pr_url = row.pr_url
532
+ if pr_url in seen_prs:
533
+ continue
534
+ seen_prs.add(pr_url)
535
 
536
+ status_info = status_map.get(pr_url, {
537
+ 'status': 'open',
538
+ 'merged': False,
539
+ 'closed_at': None
540
+ })
541
 
542
+ metadata = extract_review_metadata(row, status_info)
543
+ metadata_list.append(metadata)
544
 
545
+ results[identifier] = metadata_list
546
+ print(f" βœ“ {agent.get('name', identifier)}: {len(metadata_list)} unique PRs")
547
 
548
+ return results
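The IN list above is interpolated straight into the SQL; if an identifier ever contained a quote the query would break. A hedged sketch of the parameterized form BigQuery supports (array parameter plus UNNEST):

# Hedged sketch: pass identifiers as an array query parameter instead of inlining them.
job_config = bigquery.QueryJobConfig(
    query_parameters=[
        bigquery.ArrayQueryParameter("identifiers", "STRING", identifiers)
    ]
)
# ...with the SQL filter written as: AND actor.login IN UNNEST(@identifiers)
# query_job = client.query(query, job_config=job_config)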
549
 
550
 
551
  # =============================================================================
 
574
  return dict(grouped)
575
 
576
 
577
  def save_review_metadata_to_hf(metadata_list, agent_identifier):
578
  """
579
  Save review metadata to HuggingFace dataset, organized by [agent_identifier]/YYYY.MM.DD.jsonl.
 
586
  metadata_list: List of review metadata dictionaries
587
  agent_identifier: GitHub identifier of the agent (used as folder name)
588
  """
 
589
  import shutil
590
 
591
  try:
 
663
 
664
 
665
  def load_agents_from_hf():
666
+ """
667
+ Load all agent metadata JSON files from HuggingFace dataset.
668
+
669
+ The github_identifier is extracted from the filename (e.g., 'agent-name[bot].json' -> 'agent-name[bot]')
670
+ """
671
  try:
672
  api = HfApi()
673
  agents = []
 
691
 
692
  with open(file_path, 'r') as f:
693
  agent_data = json.load(f)
694
+
695
+ # Extract github_identifier from filename (remove .json extension)
696
+ github_identifier = json_file.replace('.json', '')
697
+ agent_data['github_identifier'] = github_identifier
698
+
699
  agents.append(agent_data)
700
 
701
  except Exception as e:
 
717
  def mine_all_agents():
718
  """
719
  Mine review metadata for all agents within LEADERBOARD_TIME_FRAME_DAYS and save to HuggingFace.
720
+ Uses BigQuery to query GitHub Archive with batch processing (only 2 queries for all agents).
721
  """
722
  # Load agent metadata from HuggingFace
723
  agents = load_agents_from_hf()
724
  if not agents:
 
728
  print(f"\n{'='*80}")
729
  print(f"Starting review metadata mining for {len(agents)} agents")
730
  print(f"Time frame: Last {LEADERBOARD_TIME_FRAME_DAYS} days")
731
+ print(f"Data source: BigQuery + GitHub Archive (BATCH MODE)")
732
+ print(f"{'='*80}\n")
733
+
734
+ # Fetch ALL reviews for ALL agents in batch (only 2 BigQuery queries total!)
735
+ try:
736
+ all_metadata = fetch_all_reviews_metadata_batch(agents)
737
+ except Exception as e:
738
+ print(f"βœ— Error during batch fetch: {str(e)}")
739
+ import traceback
740
+ traceback.print_exc()
741
+ return
742
+
743
+ # Save results for each agent
744
+ print(f"\n{'='*80}")
745
+ print(f"πŸ’Ύ Saving results to HuggingFace...")
746
  print(f"{'='*80}\n")
747
 
 
748
  for agent in agents:
749
  identifier = agent.get('github_identifier')
750
+ agent_name = agent.get('name', agent.get('agent_name', 'Unknown'))
751
 
752
  if not identifier:
753
  print(f"Warning: Skipping agent without identifier: {agent}")
754
  continue
755
 
756
+ metadata = all_metadata.get(identifier, [])
757
 
758
+ try:
759
  if metadata:
760
+ print(f"πŸ’Ύ {agent_name}: Saving {len(metadata)} review records...")
761
  save_review_metadata_to_hf(metadata, identifier)
762
+ print(f" βœ“ Successfully saved")
763
  else:
764
  print(f" No reviews found for {agent_name}")
765
 
766
  except Exception as e:
767
+ print(f"βœ— Error saving {identifier}: {str(e)}")
768
  import traceback
769
  traceback.print_exc()
770
  continue
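If the mining should also run on a schedule (APScheduler is already in requirements.txt), a minimal hedged sketch of a daily job; the 02:00 UTC time is arbitrary:

# Hedged sketch: run mine_all_agents once a day (time chosen arbitrarily).
from apscheduler.schedulers.blocking import BlockingScheduler
from apscheduler.triggers.cron import CronTrigger

scheduler = BlockingScheduler(timezone='UTC')
scheduler.add_job(mine_all_agents, CronTrigger(hour=2, minute=0))
scheduler.start()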
requirements.txt CHANGED
@@ -1,5 +1,7 @@
1
  APScheduler
2
  datasets
 
 
3
  gradio
4
  gradio_leaderboard
5
  huggingface_hub
 
1
  APScheduler
2
  datasets
3
+ db-dtypes
4
+ google-cloud-bigquery
5
  gradio
6
  gradio_leaderboard
7
  huggingface_hub