app.py CHANGED

@@ -139,9 +139,81 @@ def get_bigquery_client():
         raise ValueError("GOOGLE_APPLICATION_CREDENTIALS_JSON not found in environment")
 
 
+def fetch_all_pr_metadata_batched(client, identifiers, start_date, end_date, batch_size=100):
+    """
+    Fetch PR review metadata for ALL agents using BATCHED BigQuery queries.
+    Splits agents into smaller batches to avoid performance issues with large queries.
+
+    Args:
+        client: BigQuery client instance
+        identifiers: List of GitHub usernames/bot identifiers
+        start_date: Start datetime (timezone-aware)
+        end_date: End datetime (timezone-aware)
+        batch_size: Number of agents to process per batch (default: 100)
+
+    Returns:
+        Dictionary mapping agent identifier to list of PR metadata
+    """
+    print(f"\n🔀 Using BATCHED approach: {len(identifiers)} agents in batches of {batch_size}")
+
+    # Split identifiers into batches
+    batches = [identifiers[i:i + batch_size] for i in range(0, len(identifiers), batch_size)]
+    total_batches = len(batches)
+
+    print(f"   Total batches: {total_batches}")
+
+    # Collect results from all batches
+    all_metadata = {}
+    successful_batches = 0
+    failed_batches = 0
+
+    for batch_num, batch_identifiers in enumerate(batches, 1):
+        print(f"\n📦 Processing batch {batch_num}/{total_batches} ({len(batch_identifiers)} agents)...")
+
+        try:
+            # Query this batch - process each agent in the batch
+            for identifier in batch_identifiers:
+                review_rows = fetch_reviews_from_bigquery(client, identifier, start_date, end_date)
+
+                # Extract metadata
+                metadata_list = []
+                seen_prs = set()
+                for row in review_rows:
+                    url = row.url
+                    if url in seen_prs:
+                        continue
+                    seen_prs.add(url)
+
+                    metadata = extract_review_metadata_from_bigquery(row)
+                    metadata_list.append(metadata)
+
+                if metadata_list:
+                    all_metadata[identifier] = metadata_list
+
+            successful_batches += 1
+            print(f"   ✓ Batch {batch_num}/{total_batches} complete: {len(batch_identifiers)} agents processed")
+
+        except Exception as e:
+            failed_batches += 1
+            print(f"   ✗ Batch {batch_num}/{total_batches} failed: {str(e)}")
+            print(f"   Continuing with remaining batches...")
+            continue
+
+    print(f"\n📊 Batching Summary:")
+    print(f"   Total batches: {total_batches}")
+    print(f"   Successful: {successful_batches}")
+    print(f"   Failed: {failed_batches}")
+    print(f"   Total agents with data: {len(all_metadata)}")
+
+    return all_metadata
+
+
 def fetch_reviews_from_bigquery(client, identifier, start_date, end_date):
     """
-    Fetch PR review events from GitHub Archive for a
+    Fetch PR review events from GitHub Archive for a SINGLE agent.
+
+    NOTE: This function is designed for querying a single agent at a time.
+    For querying multiple agents efficiently, use fetch_all_pr_metadata_batched() instead.
 
     Queries githubarchive.day.YYYYMMDD tables for PullRequestReviewEvent where
     actor.login matches the agent identifier.
msr.py CHANGED

@@ -153,9 +153,74 @@ def generate_table_union_statements(start_date, end_date):
 # BIGQUERY FUNCTIONS
 # =============================================================================
 
+def fetch_all_pr_metadata_batched(client, identifiers, start_date, end_date, batch_size=100):
+    """
+    Fetch PR review metadata for ALL agents using BATCHED BigQuery queries.
+    Splits agents into smaller batches to avoid performance issues with large queries.
+
+    Args:
+        client: BigQuery client instance
+        identifiers: List of GitHub usernames/bot identifiers
+        start_date: Start datetime (timezone-aware)
+        end_date: End datetime (timezone-aware)
+        batch_size: Number of agents to process per batch (default: 100)
+
+    Returns:
+        Dictionary mapping agent identifier to list of PR metadata (same format as single query)
+    """
+    print(f"\n🔀 Using BATCHED approach: {len(identifiers)} agents in batches of {batch_size}")
+
+    # Split identifiers into batches
+    batches = [identifiers[i:i + batch_size] for i in range(0, len(identifiers), batch_size)]
+    total_batches = len(batches)
+
+    print(f"   Total batches: {total_batches}")
+
+    # Collect results from all batches
+    all_metadata = {}
+    successful_batches = 0
+    failed_batches = 0
+
+    for batch_num, batch_identifiers in enumerate(batches, 1):
+        print(f"\n📦 Processing batch {batch_num}/{total_batches} ({len(batch_identifiers)} agents)...")
+
+        try:
+            # Query this batch
+            batch_results = fetch_all_pr_metadata_single_query(
+                client, batch_identifiers, start_date, end_date
+            )
+
+            # Merge results
+            for identifier, metadata_list in batch_results.items():
+                if identifier in all_metadata:
+                    all_metadata[identifier].extend(metadata_list)
+                else:
+                    all_metadata[identifier] = metadata_list
+
+            successful_batches += 1
+            print(f"   ✓ Batch {batch_num}/{total_batches} complete: {len(batch_results)} agents processed")
+
+        except Exception as e:
+            failed_batches += 1
+            print(f"   ✗ Batch {batch_num}/{total_batches} failed: {str(e)}")
+            print(f"   Continuing with remaining batches...")
+            continue
+
+    print(f"\n📊 Batching Summary:")
+    print(f"   Total batches: {total_batches}")
+    print(f"   Successful: {successful_batches}")
+    print(f"   Failed: {failed_batches}")
+    print(f"   Total agents with data: {len(all_metadata)}")
+
+    return all_metadata
+
+
 def fetch_all_pr_metadata_single_query(client, identifiers, start_date, end_date):
     """
-    Fetch PR review metadata for
+    Fetch PR review metadata for a BATCH of agents using ONE comprehensive BigQuery query.
+
+    NOTE: This function is designed for smaller batches (~100 agents).
+    For large numbers of agents, use fetch_all_pr_metadata_batched() instead.
 
     This query combines:
     1. Review events (PullRequestReviewEvent) for all agents
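
A note on the merge step in the hunk above: each batch's results are folded into `all_metadata` with extend-or-assign, so an identifier that somehow appeared in more than one batch would accumulate entries rather than be overwritten (with disjoint batches this branch is mostly defensive). A small sketch of that pattern with hypothetical data:

    # Extend-or-assign merge, as in fetch_all_pr_metadata_batched (hypothetical data).
    all_metadata = {"bot-a": [{"url": "https://example.com/pr/1"}]}
    batch_results = {
        "bot-a": [{"url": "https://example.com/pr/2"}],
        "bot-b": [{"url": "https://example.com/pr/3"}],
    }
    for identifier, metadata_list in batch_results.items():
        if identifier in all_metadata:
            all_metadata[identifier].extend(metadata_list)  # merge into existing list
        else:
            all_metadata[identifier] = metadata_list        # first occurrence
    assert len(all_metadata["bot-a"]) == 2 and len(all_metadata["bot-b"]) == 1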
@@ -851,7 +916,7 @@ def mine_all_agents():
     print(f"\n{'='*80}")
     print(f"Starting review metadata mining for {len(identifiers)} agents")
     print(f"Time frame: Last {LEADERBOARD_TIME_FRAME_DAYS} days")
-    print(f"Data source: BigQuery + GitHub Archive (
+    print(f"Data source: BigQuery + GitHub Archive (BATCHED QUERIES)")
     print(f"{'='*80}\n")
 
     # Initialize BigQuery client

@@ -867,8 +932,9 @@ def mine_all_agents():
     start_date = end_date - timedelta(days=LEADERBOARD_TIME_FRAME_DAYS)
 
     try:
-        all_metadata = fetch_all_pr_metadata_single_query(
-            client, identifiers, start_date, end_date
+        # Use batched approach for better performance
+        all_metadata = fetch_all_pr_metadata_batched(
+            client, identifiers, start_date, end_date, batch_size=100
         )
     except Exception as e:
         print(f"✗ Error during BigQuery fetch: {str(e)}")

@@ -916,13 +982,18 @@ def mine_all_agents():
             error_count += 1
             continue
 
+    # Calculate number of batches
+    total_identifiers = len(identifiers)
+    batch_size = 100
+    num_batches = (total_identifiers + batch_size - 1) // batch_size  # Ceiling division
+
     print(f"\n{'='*80}")
     print(f"✅ Mining complete!")
     print(f"   Total agents: {len(agents)}")
     print(f"   Successfully saved: {success_count}")
     print(f"   No data (skipped): {no_data_count}")
     print(f"   Errors: {error_count}")
-    print(f"   BigQuery
+    print(f"   BigQuery batches executed: {num_batches} (batch size: {batch_size})")
     print(f"{'='*80}\n")
 
     # Construct and save leaderboard data