zhiminy committed on
Commit
313696f
·
1 Parent(s): eb2f380
Files changed (2) hide show
  1. app.py +73 -1
  2. msr.py +76 -5
app.py CHANGED
@@ -139,9 +139,81 @@ def get_bigquery_client():
139
  raise ValueError("GOOGLE_APPLICATION_CREDENTIALS_JSON not found in environment")
140
 
141
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
142
  def fetch_reviews_from_bigquery(client, identifier, start_date, end_date):
143
  """
144
- Fetch PR review events from GitHub Archive for a specific agent.
 
 
 
145
 
146
  Queries githubarchive.day.YYYYMMDD tables for PullRequestReviewEvent where
147
  actor.login matches the agent identifier.
 
139
  raise ValueError("GOOGLE_APPLICATION_CREDENTIALS_JSON not found in environment")
140
 
141
 
142
def fetch_all_pr_metadata_batched(client, identifiers, start_date, end_date, batch_size=100):
    """
    Fetch PR review metadata for ALL agents using BATCHED BigQuery queries.

    Splits agents into smaller batches to avoid performance issues with large
    queries. Each agent is queried individually via fetch_reviews_from_bigquery.
    Failures are isolated per agent, so one bad identifier no longer aborts the
    remaining agents in its batch (the old batch-level try/except dropped them
    while still keeping the partial results already collected).

    Args:
        client: BigQuery client instance
        identifiers: List of GitHub usernames/bot identifiers
        start_date: Start datetime (timezone-aware)
        end_date: End datetime (timezone-aware)
        batch_size: Number of agents to process per batch (default: 100)

    Returns:
        Dictionary mapping agent identifier to list of PR metadata. Agents
        that produced no metadata are omitted.
    """
    print(f"\n🔍 Using BATCHED approach: {len(identifiers)} agents in batches of {batch_size}")

    # Split identifiers into fixed-size batches.
    batches = [identifiers[i:i + batch_size] for i in range(0, len(identifiers), batch_size)]
    total_batches = len(batches)

    print(f"   Total batches: {total_batches}")

    # Collect results from all batches.
    all_metadata = {}
    successful_batches = 0
    failed_batches = 0

    for batch_num, batch_identifiers in enumerate(batches, 1):
        print(f"\n📦 Processing batch {batch_num}/{total_batches} ({len(batch_identifiers)} agents)...")

        batch_had_error = False
        for identifier in batch_identifiers:
            # Per-agent try/except: a single failing agent is logged and
            # skipped instead of aborting the rest of the batch.
            try:
                review_rows = fetch_reviews_from_bigquery(client, identifier, start_date, end_date)

                # Extract metadata, de-duplicating PRs by URL (the same PR may
                # appear in multiple review events for one agent).
                metadata_list = []
                seen_prs = set()
                for row in review_rows:
                    url = row.url
                    if url in seen_prs:
                        continue
                    seen_prs.add(url)
                    metadata_list.append(extract_review_metadata_from_bigquery(row))

                if metadata_list:
                    all_metadata[identifier] = metadata_list
            except Exception as e:
                batch_had_error = True
                print(f"   ✗ Agent '{identifier}' failed in batch {batch_num}/{total_batches}: {str(e)}")
                print(f"   Continuing with remaining agents...")

        # A batch counts as failed only if at least one of its agents failed;
        # successfully-fetched agents are kept either way.
        if batch_had_error:
            failed_batches += 1
        else:
            successful_batches += 1
            print(f"   ✓ Batch {batch_num}/{total_batches} complete: {len(batch_identifiers)} agents processed")

    print(f"\n📊 Batching Summary:")
    print(f"   Total batches: {total_batches}")
    print(f"   Successful: {successful_batches}")
    print(f"   Failed: {failed_batches}")
    print(f"   Total agents with data: {len(all_metadata)}")

    return all_metadata
209
+
210
+
211
  def fetch_reviews_from_bigquery(client, identifier, start_date, end_date):
212
  """
213
+ Fetch PR review events from GitHub Archive for a SINGLE agent.
214
+
215
+ NOTE: This function is designed for querying a single agent at a time.
216
+ For querying multiple agents efficiently, use fetch_all_pr_metadata_batched() instead.
217
 
218
  Queries githubarchive.day.YYYYMMDD tables for PullRequestReviewEvent where
219
  actor.login matches the agent identifier.
msr.py CHANGED
@@ -153,9 +153,74 @@ def generate_table_union_statements(start_date, end_date):
153
  # BIGQUERY FUNCTIONS
154
  # =============================================================================
155
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
156
  def fetch_all_pr_metadata_single_query(client, identifiers, start_date, end_date):
157
  """
158
- Fetch PR review metadata for ALL agents using ONE comprehensive BigQuery query.
 
 
 
159
 
160
  This query combines:
161
  1. Review events (PullRequestReviewEvent) for all agents
@@ -851,7 +916,7 @@ def mine_all_agents():
851
  print(f"\n{'='*80}")
852
  print(f"Starting review metadata mining for {len(identifiers)} agents")
853
  print(f"Time frame: Last {LEADERBOARD_TIME_FRAME_DAYS} days")
854
- print(f"Data source: BigQuery + GitHub Archive (ONE QUERY FOR ALL AGENTS)")
855
  print(f"{'='*80}\n")
856
 
857
  # Initialize BigQuery client
@@ -867,8 +932,9 @@ def mine_all_agents():
867
  start_date = end_date - timedelta(days=LEADERBOARD_TIME_FRAME_DAYS)
868
 
869
  try:
870
- all_metadata = fetch_all_pr_metadata_single_query(
871
- client, identifiers, start_date, end_date
 
872
  )
873
  except Exception as e:
874
  print(f"βœ— Error during BigQuery fetch: {str(e)}")
@@ -916,13 +982,18 @@ def mine_all_agents():
916
  error_count += 1
917
  continue
918
 
 
 
 
 
 
919
  print(f"\n{'='*80}")
920
  print(f"βœ… Mining complete!")
921
  print(f" Total agents: {len(agents)}")
922
  print(f" Successfully saved: {success_count}")
923
  print(f" No data (skipped): {no_data_count}")
924
  print(f" Errors: {error_count}")
925
- print(f" BigQuery queries executed: 1")
926
  print(f"{'='*80}\n")
927
 
928
  # Construct and save leaderboard data
 
153
  # BIGQUERY FUNCTIONS
154
  # =============================================================================
155
 
156
def fetch_all_pr_metadata_batched(client, identifiers, start_date, end_date, batch_size=100):
    """
    Fetch PR review metadata for ALL agents using BATCHED BigQuery queries.

    The identifier list is chunked into groups of at most ``batch_size``, and
    each chunk is delegated to fetch_all_pr_metadata_single_query(). The
    per-chunk dictionaries are merged into one result; a chunk that raises is
    logged and skipped so the remaining chunks still run.

    Args:
        client: BigQuery client instance
        identifiers: List of GitHub usernames/bot identifiers
        start_date: Start datetime (timezone-aware)
        end_date: End datetime (timezone-aware)
        batch_size: Number of agents to process per batch (default: 100)

    Returns:
        Dictionary mapping agent identifier to list of PR metadata (same
        format as the single-query function).
    """
    print(f"\n🔍 Using BATCHED approach: {len(identifiers)} agents in batches of {batch_size}")

    # Chunk the identifier list into slices of at most batch_size.
    chunks = [identifiers[start:start + batch_size] for start in range(0, len(identifiers), batch_size)]
    chunk_count = len(chunks)

    print(f"   Total batches: {chunk_count}")

    # Accumulators for the merged result and batch outcome counters.
    merged = {}
    ok_count = 0
    error_count = 0

    for index, chunk in enumerate(chunks, 1):
        print(f"\n📦 Processing batch {index}/{chunk_count} ({len(chunk)} agents)...")

        try:
            # Run one comprehensive query for this chunk of agents.
            chunk_results = fetch_all_pr_metadata_single_query(
                client, chunk, start_date, end_date
            )

            # Fold this chunk's rows into the combined dictionary.
            for agent, rows in chunk_results.items():
                existing = merged.get(agent)
                if existing is None:
                    merged[agent] = rows
                else:
                    existing.extend(rows)

            ok_count += 1
            print(f"   ✓ Batch {index}/{chunk_count} complete: {len(chunk_results)} agents processed")

        except Exception as e:
            # Log and move on — one bad chunk must not sink the whole run.
            error_count += 1
            print(f"   ✗ Batch {index}/{chunk_count} failed: {str(e)}")
            print(f"   Continuing with remaining batches...")
            continue

    print(f"\n📊 Batching Summary:")
    print(f"   Total batches: {chunk_count}")
    print(f"   Successful: {ok_count}")
    print(f"   Failed: {error_count}")
    print(f"   Total agents with data: {len(merged)}")

    return merged
216
+
217
+
218
  def fetch_all_pr_metadata_single_query(client, identifiers, start_date, end_date):
219
  """
220
+ Fetch PR review metadata for a BATCH of agents using ONE comprehensive BigQuery query.
221
+
222
+ NOTE: This function is designed for smaller batches (~100 agents).
223
+ For large numbers of agents, use fetch_all_pr_metadata_batched() instead.
224
 
225
  This query combines:
226
  1. Review events (PullRequestReviewEvent) for all agents
 
916
  print(f"\n{'='*80}")
917
  print(f"Starting review metadata mining for {len(identifiers)} agents")
918
  print(f"Time frame: Last {LEADERBOARD_TIME_FRAME_DAYS} days")
919
+ print(f"Data source: BigQuery + GitHub Archive (BATCHED QUERIES)")
920
  print(f"{'='*80}\n")
921
 
922
  # Initialize BigQuery client
 
932
  start_date = end_date - timedelta(days=LEADERBOARD_TIME_FRAME_DAYS)
933
 
934
  try:
935
+ # Use batched approach for better performance
936
+ all_metadata = fetch_all_pr_metadata_batched(
937
+ client, identifiers, start_date, end_date, batch_size=100
938
  )
939
  except Exception as e:
940
  print(f"βœ— Error during BigQuery fetch: {str(e)}")
 
982
  error_count += 1
983
  continue
984
 
985
+ # Calculate number of batches
986
+ total_identifiers = len(identifiers)
987
+ batch_size = 100
988
+ num_batches = (total_identifiers + batch_size - 1) // batch_size # Ceiling division
989
+
990
  print(f"\n{'='*80}")
991
  print(f"βœ… Mining complete!")
992
  print(f" Total agents: {len(agents)}")
993
  print(f" Successfully saved: {success_count}")
994
  print(f" No data (skipped): {no_data_count}")
995
  print(f" Errors: {error_count}")
996
+ print(f" BigQuery batches executed: {num_batches} (batch size: {batch_size})")
997
  print(f"{'='*80}\n")
998
 
999
  # Construct and save leaderboard data