zhiminy committed
Commit 3dcd27a · 1 Parent(s): 080822b
Files changed (1)
  1. msr.py +781 -0
msr.py ADDED
@@ -0,0 +1,781 @@
1
+ """
2
+ Minimalist Review Metadata Mining Script
3
+ Mines PR review metadata from GitHub and saves it to a HuggingFace dataset.
4
+ """
5
+
6
+ import json
7
+ import os
8
+ import time
9
+ import requests
10
+ from datetime import datetime, timezone, timedelta
11
+ from collections import defaultdict
12
+ from huggingface_hub import HfApi, hf_hub_download
13
+ from dotenv import load_dotenv
14
+ import random
15
+
16
+ # Load environment variables
17
+ load_dotenv()
18
+
19
+ # =============================================================================
20
+ # CONFIGURATION
21
+ # =============================================================================
22
+
23
+ AGENTS_REPO = "SWE-Arena/swe_agents"
24
+ REVIEW_METADATA_REPO = "SWE-Arena/review_metadata"
25
+ LEADERBOARD_TIME_FRAME_DAYS = 180 # 6 months
26
+
27
+ # =============================================================================
28
+ # UTILITY FUNCTIONS
29
+ # =============================================================================
30
+
31
+ def load_jsonl(filename):
32
+ """Load JSONL file and return list of dictionaries."""
33
+ if not os.path.exists(filename):
34
+ return []
35
+
36
+ data = []
37
+ with open(filename, 'r', encoding='utf-8') as f:
38
+ for line in f:
39
+ line = line.strip()
40
+ if line:
41
+ try:
42
+ data.append(json.loads(line))
43
+ except json.JSONDecodeError as e:
44
+ print(f"Warning: Skipping invalid JSON line: {e}")
45
+ return data
46
+
47
+
48
+ def save_jsonl(filename, data):
49
+ """Save list of dictionaries to JSONL file."""
50
+ with open(filename, 'w', encoding='utf-8') as f:
51
+ for item in data:
52
+ f.write(json.dumps(item) + '\n')
53
+
54
+
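+ # Illustrative sketch (hypothetical filename and records, not used by the script):
+ # a quick round-trip through the two JSONL helpers above.
+ def _example_jsonl_roundtrip():
+     records = [{'review_id': 'pr_1'}, {'review_id': 'pr_2'}]
+     save_jsonl('example_reviews.jsonl', records)
+     assert load_jsonl('example_reviews.jsonl') == records
+     os.remove('example_reviews.jsonl')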
55
+ def get_github_token():
56
+ """Get GitHub token from environment variables."""
57
+ token = os.getenv('GITHUB_TOKEN')
58
+ if not token:
59
+ print("Warning: GITHUB_TOKEN not found. API rate limits: 60/hour (authenticated: 5000/hour)")
60
+ return token
61
+
62
+
63
+ def get_hf_token():
64
+ """Get HuggingFace token from environment variables."""
65
+ token = os.getenv('HF_TOKEN')
66
+ if not token:
67
+ print("Warning: HF_TOKEN not found in environment variables")
68
+ return token
69
+
70
+
71
+ # =============================================================================
72
+ # GITHUB API FUNCTIONS
73
+ # =============================================================================
74
+
75
+ def request_with_backoff(method, url, *, headers=None, params=None, json_body=None, data=None, max_retries=10, timeout=30):
76
+ """
77
+ Perform an HTTP request with exponential backoff and jitter for GitHub API.
78
+ Retries on 403/429 (rate limits), 5xx server errors, and transient network exceptions.
79
+ """
80
+ delay = 1.0
81
+ for attempt in range(max_retries):
82
+ try:
83
+ resp = requests.request(
84
+ method,
85
+ url,
86
+ headers=headers or {},
87
+ params=params,
88
+ json=json_body,
89
+ data=data,
90
+ timeout=timeout
91
+ )
92
+
93
+ status = resp.status_code
94
+
95
+ # Success
96
+ if 200 <= status < 300:
97
+ return resp
98
+
99
+ # Rate limits or server errors -> retry with backoff
100
+ if status in (403, 429) or 500 <= status < 600:
101
+ wait = None
102
+
103
+ # Prefer Retry-After when present
104
+ retry_after = resp.headers.get('Retry-After') or resp.headers.get('retry-after')
105
+ if retry_after:
106
+ try:
107
+ wait = float(retry_after)
108
+ except Exception:
109
+ wait = None
110
+
111
+ # Fallback to X-RateLimit-Reset when 403/429
112
+ if wait is None and status in (403, 429):
113
+ reset_hdr = resp.headers.get('X-RateLimit-Reset') or resp.headers.get('x-ratelimit-reset')
114
+ if reset_hdr:
115
+ try:
116
+ reset_ts = int(float(reset_hdr))
117
+ wait = max(reset_ts - time.time() + 2, 1)
118
+ except Exception:
119
+ wait = None
120
+
121
+ # Final fallback: exponential backoff with jitter
122
+ if wait is None:
123
+ wait = delay + random.uniform(0, 0.5)
124
+
125
+ # Cap individual wait to avoid extreme sleeps
126
+ wait = max(1.0, min(wait, 120.0))
127
+ print(f"GitHub API {status}. Backing off {wait:.1f}s (attempt {attempt + 1}/{max_retries})...")
128
+ time.sleep(wait)
129
+ delay = min(delay * 2, 60.0)
130
+ continue
131
+
132
+ # Non-retryable error; return response for caller to handle
133
+ return resp
134
+
135
+ except requests.RequestException as e:
136
+ # Network error -> retry with backoff
137
+ wait = delay + random.uniform(0, 0.5)
138
+ wait = max(1.0, min(wait, 60.0))
139
+ print(f"Request error: {e}. Retrying in {wait:.1f}s (attempt {attempt + 1}/{max_retries})...")
140
+ time.sleep(wait)
141
+ delay = min(delay * 2, 60.0)
142
+
143
+ print(f"Exceeded max retries for {url}")
144
+ return None
145
+
146
+
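+ # Illustrative sketch (not called anywhere in the script): one way request_with_backoff
+ # might be exercised against GitHub's /rate_limit endpoint; the header construction
+ # mirrors what fetch_all_reviews_metadata() does below.
+ def _example_backoff_usage():
+     token = os.getenv('GITHUB_TOKEN')
+     headers = {'Authorization': f'token {token}'} if token else {}
+     resp = request_with_backoff('GET', 'https://api.github.com/rate_limit', headers=headers)
+     if resp is not None and resp.status_code == 200:
+         print(resp.json().get('resources', {}).get('core', {}))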
147
+ def fetch_reviews_with_time_partition(base_query, start_date, end_date, headers, prs_by_url, depth=0):
148
+ """
149
+ Fetch reviews within a specific time range using time-based partitioning.
150
+ Recursively splits the time range if hitting the 1000-result limit.
151
+ Supports splitting by day, hour, minute, and second as needed.
152
+
153
+ Returns the number of reviews found in this time partition.
154
+ """
155
+ # Calculate time difference
156
+ time_diff = end_date - start_date
157
+ total_seconds = time_diff.total_seconds()
158
+
159
+ # Determine granularity and format dates accordingly
160
+ if total_seconds >= 86400: # >= 1 day
161
+ # Use day granularity (YYYY-MM-DD)
162
+ start_str = start_date.strftime('%Y-%m-%d')
163
+ end_str = end_date.strftime('%Y-%m-%d')
164
+ elif total_seconds >= 3600: # >= 1 hour but < 1 day
165
+ # Use hour granularity (YYYY-MM-DDTHH:MM:SSZ)
166
+ start_str = start_date.strftime('%Y-%m-%dT%H:00:00Z')
167
+ end_str = end_date.strftime('%Y-%m-%dT%H:59:59Z')
168
+ elif total_seconds >= 60: # >= 1 minute but < 1 hour
169
+ # Use minute granularity (YYYY-MM-DDTHH:MM:SSZ)
170
+ start_str = start_date.strftime('%Y-%m-%dT%H:%M:00Z')
171
+ end_str = end_date.strftime('%Y-%m-%dT%H:%M:59Z')
172
+ else: # < 1 minute
173
+ # Use second granularity (YYYY-MM-DDTHH:MM:SSZ)
174
+ start_str = start_date.strftime('%Y-%m-%dT%H:%M:%SZ')
175
+ end_str = end_date.strftime('%Y-%m-%dT%H:%M:%SZ')
176
+
177
+ # Add date range to query (use created for PR search)
178
+ query = f'{base_query} created:{start_str}..{end_str}'
179
+
180
+ indent = " " + " " * depth
181
+ print(f"{indent}Searching range {start_str} to {end_str}...")
182
+
183
+ page = 1
184
+ per_page = 100
185
+ total_in_partition = 0
186
+
187
+ while True:
188
+ url = 'https://api.github.com/search/issues' # Use issues endpoint for PR search
189
+ params = {
190
+ 'q': query,
191
+ 'per_page': per_page,
192
+ 'page': page,
193
+ 'sort': 'created',
194
+ 'order': 'asc'
195
+ }
196
+ headers_with_accept = headers.copy() if headers else {}
197
+
198
+ try:
199
+ response = request_with_backoff('GET', url, headers=headers_with_accept, params=params)
200
+ if response is None:
201
+ print(f"{indent} Error: retries exhausted for range {start_str} to {end_str}")
202
+ return total_in_partition
203
+
204
+ if response.status_code != 200:
205
+ print(f"{indent} Error: HTTP {response.status_code} for range {start_str} to {end_str}")
206
+ return total_in_partition
207
+
208
+ data = response.json()
209
+ total_count = data.get('total_count', 0)
210
+ items = data.get('items', [])
211
+
212
+ if not items:
213
+ break
214
+
215
+ # Add PR reviews to global dict (keyed by PR URL)
216
+ for pr in items:
217
+ pr_url = pr.get('html_url')
218
+ if pr_url and pr_url not in prs_by_url:
219
+ prs_by_url[pr_url] = pr
220
+ total_in_partition += 1
221
+
222
+ # Check if we hit the 1000-result limit
223
+ if total_count > 1000 and page == 10:
224
+ print(f"{indent} ⚠️ Hit 1000-result limit ({total_count} total). Splitting time range...")
225
+
226
+ # Determine how to split based on time range duration
227
+ if total_seconds < 2: # Less than 2 seconds - can't split further
228
+ print(f"{indent} ⚠️ Cannot split further (range < 2 seconds). Some results may be missing.")
229
+ break
230
+
231
+ elif total_seconds < 120: # Less than 2 minutes - split by seconds
232
+ num_splits = min(4, max(2, int(total_seconds / 30)))
233
+ split_duration = time_diff / num_splits
234
+ split_dates = [start_date + split_duration * i for i in range(num_splits + 1)]
235
+
236
+ total_from_splits = 0
237
+ for i in range(num_splits):
238
+ split_start = split_dates[i]
239
+ split_end = split_dates[i + 1]
240
+ if i > 0:
241
+ split_start = split_start + timedelta(seconds=1)
242
+
243
+ count = fetch_reviews_with_time_partition(
244
+ base_query, split_start, split_end, headers, prs_by_url, depth + 1
245
+ )
246
+ total_from_splits += count
247
+
248
+ return total_from_splits
249
+
250
+ elif total_seconds < 7200: # Less than 2 hours - split by minutes
251
+ num_splits = min(4, max(2, int(total_seconds / 1800)))
252
+ split_duration = time_diff / num_splits
253
+ split_dates = [start_date + split_duration * i for i in range(num_splits + 1)]
254
+
255
+ total_from_splits = 0
256
+ for i in range(num_splits):
257
+ split_start = split_dates[i]
258
+ split_end = split_dates[i + 1]
259
+ if i > 0:
260
+ split_start = split_start + timedelta(minutes=1)
261
+
262
+ count = fetch_reviews_with_time_partition(
263
+ base_query, split_start, split_end, headers, prs_by_url, depth + 1
264
+ )
265
+ total_from_splits += count
266
+
267
+ return total_from_splits
268
+
269
+ elif total_seconds < 172800: # Less than 2 days - split by hours
270
+ num_splits = min(4, max(2, int(total_seconds / 43200)))
271
+ split_duration = time_diff / num_splits
272
+ split_dates = [start_date + split_duration * i for i in range(num_splits + 1)]
273
+
274
+ total_from_splits = 0
275
+ for i in range(num_splits):
276
+ split_start = split_dates[i]
277
+ split_end = split_dates[i + 1]
278
+ if i > 0:
279
+ split_start = split_start + timedelta(hours=1)
280
+
281
+ count = fetch_reviews_with_time_partition(
282
+ base_query, split_start, split_end, headers, prs_by_url, depth + 1
283
+ )
284
+ total_from_splits += count
285
+
286
+ return total_from_splits
287
+
288
+ else: # 2+ days - split by days
289
+ days_diff = time_diff.days
290
+
291
+ # Use aggressive splitting for large ranges or deep recursion
292
+ if days_diff > 30 or depth > 5:
293
+ # Split into 4 parts for more aggressive partitioning
294
+ quarter_diff = time_diff / 4
295
+ split_dates = [
296
+ start_date,
297
+ start_date + quarter_diff,
298
+ start_date + quarter_diff * 2,
299
+ start_date + quarter_diff * 3,
300
+ end_date
301
+ ]
302
+
303
+ total_from_splits = 0
304
+ for i in range(4):
305
+ split_start = split_dates[i]
306
+ split_end = split_dates[i + 1]
307
+ if i > 0:
308
+ split_start = split_start + timedelta(days=1)
309
+
310
+ count = fetch_reviews_with_time_partition(
311
+ base_query, split_start, split_end, headers, prs_by_url, depth + 1
312
+ )
313
+ total_from_splits += count
314
+
315
+ return total_from_splits
316
+ else:
317
+ # Binary split for smaller ranges
318
+ mid_date = start_date + time_diff / 2
319
+
320
+ count1 = fetch_reviews_with_time_partition(
321
+ base_query, start_date, mid_date, headers, prs_by_url, depth + 1
322
+ )
323
+ count2 = fetch_reviews_with_time_partition(
324
+ base_query, mid_date + timedelta(days=1), end_date, headers, prs_by_url, depth + 1
325
+ )
326
+
327
+ return count1 + count2
328
+
329
+ # Normal pagination: check if there are more pages
330
+ if len(items) < per_page or page >= 10:
331
+ break
332
+
333
+ page += 1
334
+ time.sleep(0.5) # Courtesy delay between pages
335
+
336
+ except Exception as e:
337
+ print(f"{indent} Error fetching range {start_str} to {end_str}: {str(e)}")
338
+ return total_in_partition
339
+
340
+ if total_in_partition > 0:
341
+ print(f"{indent} βœ“ Found {total_in_partition} reviews in range {start_str} to {end_str}")
342
+
343
+ return total_in_partition
344
+
345
+
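+ # Illustrative sketch (hypothetical bot name and date, not called by the script):
+ # collect PRs reviewed by an agent over a one-day window into prs_by_url, the same
+ # way the full mining loop drives this function.
+ def _example_partition_search():
+     token = os.getenv('GITHUB_TOKEN')
+     headers = {'Authorization': f'token {token}'} if token else {}
+     prs_by_url = {}
+     day_start = datetime(2024, 1, 1, tzinfo=timezone.utc)
+     found = fetch_reviews_with_time_partition(
+         'is:pr reviewed-by:example-bot', day_start, day_start + timedelta(days=1),
+         headers, prs_by_url
+     )
+     print(f"Found {found} PRs; {len(prs_by_url)} unique URLs collected")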
346
+ def extract_review_metadata(pr):
347
+ """
348
+ Extract minimal PR review metadata for efficient storage.
349
+ Keeps only the essential fields: html_url, reviewed_at, pr_status, pr_merged, pr_closed_at, pr_url, and review_id.
350
+
351
+ PR status:
352
+ - pr_status: 'open', 'merged', or 'closed'
353
+ - pr_merged: True if PR was merged, False otherwise
354
+ - pr_closed_at: Date when PR was closed/merged (if applicable)
355
+ """
356
+ pr_url = pr.get('html_url')
357
+ pr_number = pr.get('number')
358
+ created_at = pr.get('created_at')
359
+ closed_at = pr.get('closed_at')
360
+ state = pr.get('state', 'open') # open or closed
361
+
362
+ # Check if PR has pull_request field (indicates it's a PR, not an issue)
363
+ pull_request_data = pr.get('pull_request', {})
364
+ pr_merged = pull_request_data.get('merged_at') is not None if pull_request_data else False
365
+
366
+ # Determine initial status
367
+ if pr_merged:
368
+ status = 'merged'
369
+ elif state == 'closed':
370
+ status = 'closed'
371
+ else:
372
+ status = 'open'
373
+
374
+ return {
375
+ 'html_url': pr_url,
376
+ 'reviewed_at': created_at, # PR creation date, used as a proxy for when the agent reviewed it
377
+ 'pr_status': status,
378
+ 'pr_merged': pr_merged,
379
+ 'pr_closed_at': closed_at,
380
+ 'pr_url': pr_url, # Store PR URL for tracking
381
+ 'review_id': f"pr_{pr_number}" # Use PR number for deduplication
382
+ }
383
+
384
+
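+ # Illustrative sketch (hypothetical search item, not called by the script): the minimal
+ # record extract_review_metadata() produces from one GitHub search result.
+ def _example_metadata_record():
+     sample_pr = {
+         'html_url': 'https://github.com/octocat/hello-world/pull/42',
+         'number': 42,
+         'created_at': '2024-01-01T12:00:00Z',
+         'closed_at': None,
+         'state': 'open',
+         'pull_request': {'merged_at': None},
+     }
+     print(extract_review_metadata(sample_pr))
+     # -> {'html_url': ..., 'reviewed_at': '2024-01-01T12:00:00Z', 'pr_status': 'open',
+     #     'pr_merged': False, 'pr_closed_at': None, 'pr_url': ..., 'review_id': 'pr_42'}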
385
+ def update_pr_status(metadata_list, headers, token):
386
+ """
387
+ Update PR status for reviews to get current merged/closed state.
388
+
389
+ For each PR associated with a review, fetch current status from GitHub API.
390
+ Updates metadata_list in-place with PR status information.
391
+
392
+ Args:
393
+ metadata_list: List of review metadata dictionaries
394
+ headers: HTTP headers for GitHub API
395
+ token: GitHub API token (currently unused; authentication is supplied via headers)
396
+
397
+ Returns:
398
+ Updated metadata_list with current PR status
399
+ """
400
+ if not metadata_list:
401
+ return metadata_list
402
+
403
+ # Track unique PRs to avoid duplicate API calls
404
+ pr_url_to_status = {}
405
+ updated_count = 0
406
+
407
+ for metadata in metadata_list:
408
+ pr_url = metadata.get('pr_url')
409
+ if not pr_url:
410
+ continue
411
+
412
+ # Skip if already fetched for this PR
413
+ if pr_url in pr_url_to_status:
414
+ status_info = pr_url_to_status[pr_url]
415
+ metadata['pr_status'] = status_info['status']
416
+ metadata['pr_merged'] = status_info['merged']
417
+ metadata['pr_closed_at'] = status_info['closed_at']
418
+ continue
419
+
420
+ try:
421
+ # Convert HTML URL to API URL
422
+ # https://github.com/owner/repo/pull/123 -> https://api.github.com/repos/owner/repo/pulls/123
423
+ parts = pr_url.replace('https://github.com/', '').split('/')
424
+ if len(parts) >= 4:
425
+ owner, repo, pull_word, pr_number = parts[0], parts[1], parts[2], parts[3]
426
+ api_url = f'https://api.github.com/repos/{owner}/{repo}/pulls/{pr_number}'
427
+
428
+ response = request_with_backoff('GET', api_url, headers=headers, max_retries=3)
429
+
430
+ if response and response.status_code == 200:
431
+ pr_data = response.json()
432
+ state = pr_data.get('state', 'open')
433
+ merged = pr_data.get('merged', False)
434
+ closed_at = pr_data.get('closed_at')
435
+ merged_at = pr_data.get('merged_at')
436
+
437
+ # Determine final status
438
+ if merged:
439
+ status = 'merged'
440
+ elif state == 'closed':
441
+ status = 'closed'
442
+ else:
443
+ status = 'open'
444
+
445
+ status_info = {
446
+ 'status': status,
447
+ 'merged': merged,
448
+ 'closed_at': closed_at or merged_at
449
+ }
450
+
451
+ # Cache and update
452
+ pr_url_to_status[pr_url] = status_info
453
+ metadata['pr_status'] = status
454
+ metadata['pr_merged'] = merged
455
+ metadata['pr_closed_at'] = closed_at or merged_at
456
+ updated_count += 1
457
+
458
+ # Small delay to avoid rate limiting
459
+ time.sleep(0.1)
460
+
461
+ except Exception as e:
462
+ print(f" Warning: Could not check PR status for {pr_url}: {e}")
463
+ continue
464
+
465
+ if updated_count > 0:
466
+ print(f" βœ“ Updated status for {updated_count} unique PRs")
467
+
468
+ return metadata_list
469
+
470
+
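+ # Illustrative sketch (hypothetical PR URL, not called by the script): the HTML-to-API
+ # URL conversion performed inside update_pr_status().
+ def _example_pr_api_url():
+     pr_url = 'https://github.com/octocat/hello-world/pull/42'
+     owner, repo, _, pr_number = pr_url.replace('https://github.com/', '').split('/')
+     return f'https://api.github.com/repos/{owner}/{repo}/pulls/{pr_number}'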
471
+ def fetch_all_reviews_metadata(identifier, agent_name, token=None):
472
+ """
473
+ Fetch PR reviews associated with a GitHub user or bot over the past LEADERBOARD_TIME_FRAME_DAYS days.
474
+ Returns lightweight metadata instead of full review objects.
475
+
476
+ This function employs time-based partitioning to navigate GitHub's 1000-result limit per query.
477
+ It searches using the query pattern:
478
+ - reviewed-by:{identifier} (PR reviews by the agent)
479
+
480
+ After fetching reviews, it updates PR status to determine if PRs were merged or closed.
481
+
482
+ Args:
483
+ identifier: GitHub username or bot identifier
484
+ agent_name: Human-readable name of the agent (informational only; not used in the search)
485
+ token: GitHub API token for authentication
486
+
487
+ Returns:
488
+ List of dictionaries containing minimal PR review metadata with PR status
489
+ """
490
+ headers = {'Authorization': f'token {token}'} if token else {}
491
+
492
+ # Define query pattern for PR reviews
493
+ query_patterns = [f'is:pr reviewed-by:{identifier}']
494
+
495
+ # Use a dict to deduplicate PRs by URL
496
+ prs_by_url = {}
497
+
498
+ # Define time range: past LEADERBOARD_TIME_FRAME_DAYS
499
+ current_time = datetime.now(timezone.utc)
500
+ start_date = current_time - timedelta(days=LEADERBOARD_TIME_FRAME_DAYS)
501
+ end_date = current_time
502
+
503
+ for query_pattern in query_patterns:
504
+ print(f"\nπŸ” Searching with query: {query_pattern}")
505
+ print(f" Time range: {start_date.strftime('%Y-%m-%d')} to {end_date.strftime('%Y-%m-%d')}")
506
+
507
+ pattern_start_time = time.time()
508
+ initial_count = len(prs_by_url)
509
+
510
+ # Fetch with time partitioning
511
+ reviews_found = fetch_reviews_with_time_partition(
512
+ query_pattern,
513
+ start_date,
514
+ end_date,
515
+ headers,
516
+ prs_by_url
517
+ )
518
+
519
+ pattern_duration = time.time() - pattern_start_time
520
+ new_reviews = len(prs_by_url) - initial_count
521
+
522
+ print(f" βœ“ Pattern complete: {new_reviews} new PRs found ({reviews_found} total fetched)")
523
+ print(f" ⏱️ Time taken: {pattern_duration:.1f} seconds")
524
+
525
+ time.sleep(1.0)
526
+
527
+ all_prs = list(prs_by_url.values())
528
+
529
+ print(f"\nβœ… COMPLETE: Found {len(all_prs)} unique PRs reviewed by {identifier}")
530
+ print(f"πŸ“¦ Extracting minimal metadata and updating PR status...")
531
+
532
+ # Extract metadata for each PR review
533
+ metadata_list = [extract_review_metadata(pr) for pr in all_prs]
534
+
535
+ # Update PR status to get current merged/closed state
536
+ print(f"πŸ” Updating PR status for reviewed PRs...")
537
+ metadata_list = update_pr_status(metadata_list, headers, token)
538
+
539
+ # Calculate memory savings
540
+ import sys
541
+ original_size = sys.getsizeof(str(all_prs))
542
+ metadata_size = sys.getsizeof(str(metadata_list))
543
+ savings_pct = ((original_size - metadata_size) / original_size * 100) if original_size > 0 else 0
544
+
545
+ print(f"πŸ’Ύ Memory efficiency: {original_size // 1024}KB β†’ {metadata_size // 1024}KB (saved {savings_pct:.1f}%)")
546
+
547
+ return metadata_list
548
+
549
+
550
+ # =============================================================================
551
+ # HUGGINGFACE STORAGE FUNCTIONS
552
+ # =============================================================================
553
+
554
+ def group_metadata_by_date(metadata_list):
555
+ """
556
+ Group review metadata by exact date (year.month.day) for efficient daily storage.
557
+ Returns dict: {(year, month, day): [metadata_list]}
558
+ """
559
+ grouped = defaultdict(list)
560
+
561
+ for review_meta in metadata_list:
562
+ reviewed_at = review_meta.get('reviewed_at')
563
+ if not reviewed_at:
564
+ continue
565
+
566
+ try:
567
+ dt = datetime.fromisoformat(reviewed_at.replace('Z', '+00:00'))
568
+ key = (dt.year, dt.month, dt.day)
569
+ grouped[key].append(review_meta)
570
+ except Exception as e:
571
+ print(f"Warning: Could not parse date '{reviewed_at}': {e}")
572
+
573
+ return dict(grouped)
574
+
575
+
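+ # Illustrative sketch (hypothetical records, not called by the script): two same-day
+ # reviews land in a single (year, month, day) bucket.
+ def _example_grouping():
+     records = [
+         {'review_id': 'pr_1', 'reviewed_at': '2024-01-01T09:00:00Z'},
+         {'review_id': 'pr_2', 'reviewed_at': '2024-01-01T17:30:00Z'},
+     ]
+     print(group_metadata_by_date(records))  # -> {(2024, 1, 1): [both records]}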
576
+ def upload_with_retry(api, path_or_fileobj, path_in_repo, repo_id, repo_type, token, max_retries=5):
577
+ """
578
+ Upload file to HuggingFace with exponential backoff retry logic.
579
+ """
580
+ delay = 2.0
581
+
582
+ for attempt in range(max_retries):
583
+ try:
584
+ api.upload_file(
585
+ path_or_fileobj=path_or_fileobj,
586
+ path_in_repo=path_in_repo,
587
+ repo_id=repo_id,
588
+ repo_type=repo_type,
589
+ token=token
590
+ )
591
+ if attempt > 0:
592
+ print(f" βœ“ Upload succeeded on attempt {attempt + 1}/{max_retries}")
593
+ return True
594
+
595
+ except Exception as e:
596
+ if attempt < max_retries - 1:
597
+ wait_time = delay + random.uniform(0, 1.0)
598
+ print(f" ⚠️ Upload failed (attempt {attempt + 1}/{max_retries}): {str(e)}")
599
+ print(f" ⏳ Retrying in {wait_time:.1f} seconds...")
600
+ time.sleep(wait_time)
601
+ delay = min(delay * 2, 60.0)
602
+ else:
603
+ print(f" βœ— Upload failed after {max_retries} attempts: {str(e)}")
604
+ raise
605
+
606
+
607
+ def save_review_metadata_to_hf(metadata_list, agent_identifier):
608
+ """
609
+ Save review metadata to HuggingFace dataset, organized by [agent_identifier]/YYYY.MM.DD.jsonl.
610
+ Each file is stored in the agent's folder and named YYYY.MM.DD.jsonl for that day's reviews.
611
+
612
+ This function APPENDS new metadata and DEDUPLICATES by review_id.
613
+
614
+ Args:
615
+ metadata_list: List of review metadata dictionaries
616
+ agent_identifier: GitHub identifier of the agent (used as folder name)
617
+ """
618
+ try:
619
+ token = get_hf_token()
620
+ if not token:
621
+ raise Exception("No HuggingFace token found")
622
+
623
+ api = HfApi()
624
+
625
+ # Group by exact date (year, month, day)
626
+ grouped = group_metadata_by_date(metadata_list)
627
+
628
+ for (review_year, month, day), day_metadata in grouped.items():
629
+ filename = f"{agent_identifier}/{review_year}.{month:02d}.{day:02d}.jsonl"
630
+ local_filename = f"{review_year}.{month:02d}.{day:02d}.jsonl"
631
+ print(f"πŸ“€ Uploading {len(day_metadata)} reviews to {filename}...")
632
+
633
+ # Download existing file if it exists
634
+ existing_metadata = []
635
+ try:
636
+ file_path = hf_hub_download(
637
+ repo_id=REVIEW_METADATA_REPO,
638
+ filename=filename,
639
+ repo_type="dataset",
640
+ token=token
641
+ )
642
+ existing_metadata = load_jsonl(file_path)
643
+ print(f" Found {len(existing_metadata)} existing reviews in {filename}")
644
+ except Exception:
645
+ print(f" No existing file found for {filename}, creating new")
646
+
647
+ # Merge and deduplicate by review_id
648
+ existing_by_id = {meta['review_id']: meta for meta in existing_metadata if meta.get('review_id')}
649
+ new_by_id = {meta['review_id']: meta for meta in day_metadata if meta.get('review_id')}
650
+
651
+ # Update with new data (new data overwrites old)
652
+ existing_by_id.update(new_by_id)
653
+ merged_metadata = list(existing_by_id.values())
654
+
655
+ # Save locally
656
+ save_jsonl(local_filename, merged_metadata)
657
+
658
+ try:
659
+ # Upload to HuggingFace with folder path
660
+ upload_with_retry(
661
+ api=api,
662
+ path_or_fileobj=local_filename,
663
+ path_in_repo=filename,
664
+ repo_id=REVIEW_METADATA_REPO,
665
+ repo_type="dataset",
666
+ token=token
667
+ )
668
+ print(f" βœ“ Saved {len(merged_metadata)} total reviews to {filename}")
669
+ finally:
670
+ # Always clean up local file, even if upload fails
671
+ if os.path.exists(local_filename):
672
+ os.remove(local_filename)
673
+
674
+ return True
675
+
676
+ except Exception as e:
677
+ print(f"βœ— Error saving review metadata: {str(e)}")
678
+ return False
679
+
680
+
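+ # Illustrative sketch (hypothetical agent and date, not called by the script): the
+ # per-agent, per-day path layout written by save_review_metadata_to_hf().
+ def _example_daily_path():
+     agent_identifier, year, month, day = 'example-bot', 2024, 1, 1
+     return f"{agent_identifier}/{year}.{month:02d}.{day:02d}.jsonl"  # 'example-bot/2024.01.01.jsonl'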
681
+ def load_agents_from_hf():
682
+ """Load all agent metadata JSON files from HuggingFace dataset."""
683
+ try:
684
+ api = HfApi()
685
+ agents = []
686
+
687
+ # List all files in the repository
688
+ files = api.list_repo_files(repo_id=AGENTS_REPO, repo_type="dataset")
689
+
690
+ # Filter for JSON files only
691
+ json_files = [f for f in files if f.endswith('.json')]
692
+
693
+ print(f"Found {len(json_files)} agent files in {AGENTS_REPO}")
694
+
695
+ # Download and parse each JSON file
696
+ for json_file in json_files:
697
+ try:
698
+ file_path = hf_hub_download(
699
+ repo_id=AGENTS_REPO,
700
+ filename=json_file,
701
+ repo_type="dataset"
702
+ )
703
+
704
+ with open(file_path, 'r') as f:
705
+ agent_data = json.load(f)
706
+ agents.append(agent_data)
707
+
708
+ except Exception as e:
709
+ print(f"Warning: Could not load {json_file}: {str(e)}")
710
+ continue
711
+
712
+ print(f"βœ“ Loaded {len(agents)} agents from HuggingFace")
713
+ return agents
714
+
715
+ except Exception as e:
716
+ print(f"Could not load agents from HuggingFace: {str(e)}")
717
+ return []
718
+
719
+
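+ # Illustrative sketch (hypothetical values, not called by the script): the minimal agent
+ # JSON fields that mine_all_agents() reads from each file in AGENTS_REPO.
+ def _example_agent_record():
+     return {'github_identifier': 'example-bot', 'agent_name': 'Example Bot'}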
720
+ # =============================================================================
721
+ # MAIN MINING FUNCTION
722
+ # =============================================================================
723
+
724
+ def mine_all_agents():
725
+ """
726
+ Mine review metadata for all agents over the past LEADERBOARD_TIME_FRAME_DAYS days and save it to HuggingFace.
727
+ """
728
+ token = get_github_token()
729
+
730
+ # Load agent metadata from HuggingFace
731
+ agents = load_agents_from_hf()
732
+ if not agents:
733
+ print("No agents found in HuggingFace dataset")
734
+ return
735
+
736
+ print(f"\n{'='*80}")
737
+ print(f"Starting review metadata mining for {len(agents)} agents")
738
+ print(f"Time frame: Last {LEADERBOARD_TIME_FRAME_DAYS} days")
739
+ print(f"{'='*80}\n")
740
+
741
+ # Mine each agent
742
+ for agent in agents:
743
+ identifier = agent.get('github_identifier')
744
+ agent_name = agent.get('agent_name', 'Unknown')
745
+
746
+ if not identifier:
747
+ print(f"Warning: Skipping agent without identifier: {agent}")
748
+ continue
749
+
750
+ try:
751
+ print(f"\n{'='*80}")
752
+ print(f"Processing: {agent_name} ({identifier})")
753
+ print(f"{'='*80}")
754
+
755
+ # Fetch review metadata
756
+ metadata = fetch_all_reviews_metadata(identifier, agent_name, token)
757
+
758
+ if metadata:
759
+ print(f"πŸ’Ύ Saving {len(metadata)} review records...")
760
+ save_review_metadata_to_hf(metadata, identifier)
761
+ print(f"βœ“ Successfully processed {agent_name}")
762
+ else:
763
+ print(f" No reviews found for {agent_name}")
764
+
765
+ except Exception as e:
766
+ print(f"βœ— Error processing {identifier}: {str(e)}")
767
+ import traceback
768
+ traceback.print_exc()
769
+ continue
770
+
771
+ print(f"\n{'='*80}")
772
+ print(f"βœ… Mining complete for all agents")
773
+ print(f"{'='*80}\n")
774
+
775
+
776
+ # =============================================================================
777
+ # ENTRY POINT
778
+ # =============================================================================
779
+
780
+ if __name__ == "__main__":
781
+ mine_all_agents()
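+ # Usage sketch (assumes GITHUB_TOKEN and HF_TOKEN are provided, e.g. via a local .env
+ # file picked up by load_dotenv() above):
+ #   python msr.py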