zhiminy committed on
Commit
313696f
·
1 Parent(s): eb2f380
Files changed (2) hide show
  1. app.py +73 -1
  2. msr.py +76 -5
app.py CHANGED
@@ -139,9 +139,81 @@ def get_bigquery_client():
139
  raise ValueError("GOOGLE_APPLICATION_CREDENTIALS_JSON not found in environment")
140
 
141
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
142
  def fetch_reviews_from_bigquery(client, identifier, start_date, end_date):
143
  """
144
- Fetch PR review events from GitHub Archive for a specific agent.
 
 
 
145
 
146
  Queries githubarchive.day.YYYYMMDD tables for PullRequestReviewEvent where
147
  actor.login matches the agent identifier.
 
139
  raise ValueError("GOOGLE_APPLICATION_CREDENTIALS_JSON not found in environment")
140
 
141
 
142
def fetch_all_pr_metadata_batched(client, identifiers, start_date, end_date, batch_size=100):
    """
    Fetch PR review metadata for ALL agents using BATCHED BigQuery queries.

    Splits agents into smaller batches to avoid performance issues with large
    queries. Each agent is queried individually via fetch_reviews_from_bigquery.
    Failures are isolated per agent, so one bad identifier no longer aborts the
    remaining agents in its batch (the old batch-level try/except dropped them
    while still keeping the partial results already collected).

    Args:
        client: BigQuery client instance
        identifiers: List of GitHub usernames/bot identifiers
        start_date: Start datetime (timezone-aware)
        end_date: End datetime (timezone-aware)
        batch_size: Number of agents to process per batch (default: 100)

    Returns:
        Dictionary mapping agent identifier to list of PR metadata. Agents
        that produced no metadata are omitted.
    """
    print(f"\n🔍 Using BATCHED approach: {len(identifiers)} agents in batches of {batch_size}")

    # Split identifiers into fixed-size batches.
    batches = [identifiers[i:i + batch_size] for i in range(0, len(identifiers), batch_size)]
    total_batches = len(batches)

    print(f"   Total batches: {total_batches}")

    # Collect results from all batches.
    all_metadata = {}
    successful_batches = 0
    failed_batches = 0

    for batch_num, batch_identifiers in enumerate(batches, 1):
        print(f"\n📦 Processing batch {batch_num}/{total_batches} ({len(batch_identifiers)} agents)...")

        batch_had_error = False
        for identifier in batch_identifiers:
            # Per-agent try/except: a single failing agent is logged and
            # skipped instead of aborting the rest of the batch.
            try:
                review_rows = fetch_reviews_from_bigquery(client, identifier, start_date, end_date)

                # Extract metadata, de-duplicating PRs by URL (the same PR may
                # appear in multiple review events for one agent).
                metadata_list = []
                seen_prs = set()
                for row in review_rows:
                    url = row.url
                    if url in seen_prs:
                        continue
                    seen_prs.add(url)
                    metadata_list.append(extract_review_metadata_from_bigquery(row))

                if metadata_list:
                    all_metadata[identifier] = metadata_list
            except Exception as e:
                batch_had_error = True
                print(f"   ✗ Agent '{identifier}' failed in batch {batch_num}/{total_batches}: {str(e)}")
                print(f"   Continuing with remaining agents...")

        # A batch counts as failed only if at least one of its agents failed;
        # successfully-fetched agents are kept either way.
        if batch_had_error:
            failed_batches += 1
        else:
            successful_batches += 1
            print(f"   ✓ Batch {batch_num}/{total_batches} complete: {len(batch_identifiers)} agents processed")

    print(f"\n📊 Batching Summary:")
    print(f"   Total batches: {total_batches}")
    print(f"   Successful: {successful_batches}")
    print(f"   Failed: {failed_batches}")
    print(f"   Total agents with data: {len(all_metadata)}")

    return all_metadata
209
+
210
+
211
  def fetch_reviews_from_bigquery(client, identifier, start_date, end_date):
212
  """
213
+ Fetch PR review events from GitHub Archive for a SINGLE agent.
214
+
215
+ NOTE: This function is designed for querying a single agent at a time.
216
+ For querying multiple agents efficiently, use fetch_all_pr_metadata_batched() instead.
217
 
218
  Queries githubarchive.day.YYYYMMDD tables for PullRequestReviewEvent where
219
  actor.login matches the agent identifier.
msr.py CHANGED
@@ -153,9 +153,74 @@ def generate_table_union_statements(start_date, end_date):
153
  # BIGQUERY FUNCTIONS
154
  # =============================================================================
155
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
156
  def fetch_all_pr_metadata_single_query(client, identifiers, start_date, end_date):
157
  """
158
- Fetch PR review metadata for ALL agents using ONE comprehensive BigQuery query.
 
 
 
159
 
160
  This query combines:
161
  1. Review events (PullRequestReviewEvent) for all agents
@@ -851,7 +916,7 @@ def mine_all_agents():
851
  print(f"\n{'='*80}")
852
  print(f"Starting review metadata mining for {len(identifiers)} agents")
853
  print(f"Time frame: Last {LEADERBOARD_TIME_FRAME_DAYS} days")
854
- print(f"Data source: BigQuery + GitHub Archive (ONE QUERY FOR ALL AGENTS)")
855
  print(f"{'='*80}\n")
856
 
857
  # Initialize BigQuery client
@@ -867,8 +932,9 @@ def mine_all_agents():
867
  start_date = end_date - timedelta(days=LEADERBOARD_TIME_FRAME_DAYS)
868
 
869
  try:
870
- all_metadata = fetch_all_pr_metadata_single_query(
871
- client, identifiers, start_date, end_date
 
872
  )
873
  except Exception as e:
874
  print(f"βœ— Error during BigQuery fetch: {str(e)}")
@@ -916,13 +982,18 @@ def mine_all_agents():
916
  error_count += 1
917
  continue
918
 
 
 
 
 
 
919
  print(f"\n{'='*80}")
920
  print(f"βœ… Mining complete!")
921
  print(f" Total agents: {len(agents)}")
922
  print(f" Successfully saved: {success_count}")
923
  print(f" No data (skipped): {no_data_count}")
924
  print(f" Errors: {error_count}")
925
- print(f" BigQuery queries executed: 1")
926
  print(f"{'='*80}\n")
927
 
928
  # Construct and save leaderboard data
 
153
  # BIGQUERY FUNCTIONS
154
  # =============================================================================
155
 
156
def fetch_all_pr_metadata_batched(client, identifiers, start_date, end_date, batch_size=100):
    """
    Fetch PR review metadata for ALL agents using BATCHED BigQuery queries.

    The identifier list is chunked into groups of at most ``batch_size``, and
    each chunk is delegated to fetch_all_pr_metadata_single_query(). The
    per-chunk dictionaries are merged into one result; a chunk that raises is
    logged and skipped so the remaining chunks still run.

    Args:
        client: BigQuery client instance
        identifiers: List of GitHub usernames/bot identifiers
        start_date: Start datetime (timezone-aware)
        end_date: End datetime (timezone-aware)
        batch_size: Number of agents to process per batch (default: 100)

    Returns:
        Dictionary mapping agent identifier to list of PR metadata (same
        format as the single-query function).
    """
    print(f"\n🔍 Using BATCHED approach: {len(identifiers)} agents in batches of {batch_size}")

    # Chunk the identifier list into slices of at most batch_size.
    chunks = [identifiers[start:start + batch_size] for start in range(0, len(identifiers), batch_size)]
    chunk_count = len(chunks)

    print(f"   Total batches: {chunk_count}")

    # Accumulators for the merged result and batch outcome counters.
    merged = {}
    ok_count = 0
    error_count = 0

    for index, chunk in enumerate(chunks, 1):
        print(f"\n📦 Processing batch {index}/{chunk_count} ({len(chunk)} agents)...")

        try:
            # Run one comprehensive query for this chunk of agents.
            chunk_results = fetch_all_pr_metadata_single_query(
                client, chunk, start_date, end_date
            )

            # Fold this chunk's rows into the combined dictionary.
            for agent, rows in chunk_results.items():
                existing = merged.get(agent)
                if existing is None:
                    merged[agent] = rows
                else:
                    existing.extend(rows)

            ok_count += 1
            print(f"   ✓ Batch {index}/{chunk_count} complete: {len(chunk_results)} agents processed")

        except Exception as e:
            # Log and move on — one bad chunk must not sink the whole run.
            error_count += 1
            print(f"   ✗ Batch {index}/{chunk_count} failed: {str(e)}")
            print(f"   Continuing with remaining batches...")
            continue

    print(f"\n📊 Batching Summary:")
    print(f"   Total batches: {chunk_count}")
    print(f"   Successful: {ok_count}")
    print(f"   Failed: {error_count}")
    print(f"   Total agents with data: {len(merged)}")

    return merged
216
+
217
+
218
  def fetch_all_pr_metadata_single_query(client, identifiers, start_date, end_date):
219
  """
220
+ Fetch PR review metadata for a BATCH of agents using ONE comprehensive BigQuery query.
221
+
222
+ NOTE: This function is designed for smaller batches (~100 agents).
223
+ For large numbers of agents, use fetch_all_pr_metadata_batched() instead.
224
 
225
  This query combines:
226
  1. Review events (PullRequestReviewEvent) for all agents
 
916
  print(f"\n{'='*80}")
917
  print(f"Starting review metadata mining for {len(identifiers)} agents")
918
  print(f"Time frame: Last {LEADERBOARD_TIME_FRAME_DAYS} days")
919
+ print(f"Data source: BigQuery + GitHub Archive (BATCHED QUERIES)")
920
  print(f"{'='*80}\n")
921
 
922
  # Initialize BigQuery client
 
932
  start_date = end_date - timedelta(days=LEADERBOARD_TIME_FRAME_DAYS)
933
 
934
  try:
935
+ # Use batched approach for better performance
936
+ all_metadata = fetch_all_pr_metadata_batched(
937
+ client, identifiers, start_date, end_date, batch_size=100
938
  )
939
  except Exception as e:
940
  print(f"βœ— Error during BigQuery fetch: {str(e)}")
 
982
  error_count += 1
983
  continue
984
 
985
+ # Calculate number of batches
986
+ total_identifiers = len(identifiers)
987
+ batch_size = 100
988
+ num_batches = (total_identifiers + batch_size - 1) // batch_size # Ceiling division
989
+
990
  print(f"\n{'='*80}")
991
  print(f"βœ… Mining complete!")
992
  print(f" Total agents: {len(agents)}")
993
  print(f" Successfully saved: {success_count}")
994
  print(f" No data (skipped): {no_data_count}")
995
  print(f" Errors: {error_count}")
996
+ print(f" BigQuery batches executed: {num_batches} (batch size: {batch_size})")
997
  print(f"{'='*80}\n")
998
 
999
  # Construct and save leaderboard data