refine msr
app.py CHANGED

@@ -198,12 +198,48 @@ def request_with_backoff(method, url, *, headers=None, params=None, json_body=None
     print(f"Exceeded max retries for {url}")
     return None

+def get_github_tokens():
+    """Get all GitHub tokens from environment variables (all vars starting with GITHUB_TOKEN)."""
+    tokens = []
+    for key, value in os.environ.items():
+        if key.startswith('GITHUB_TOKEN') and value:
+            tokens.append(value)
+
+    if not tokens:
+        print("Warning: No GITHUB_TOKEN found. API rate limits: 60/hour (authenticated: 5000/hour)")
+    else:
+        print(f"✓ Loaded {len(tokens)} GitHub token(s) for rotation")
+
+    return tokens
+
+
 def get_github_token():
-    """Get GitHub token from environment variables."""
-
-    if
-
-
+    """Get first GitHub token from environment variables (backward compatibility)."""
+    tokens = get_github_tokens()
+    return tokens[0] if tokens else None
+
+
+class TokenPool:
+    """
+    Manages a pool of GitHub tokens for load balancing across rate limits.
+    Rotates through tokens in round-robin fashion to distribute API calls.
+    """
+    def __init__(self, tokens):
+        self.tokens = tokens if tokens else [None]
+        self.current_index = 0
+
+    def get_next_token(self):
+        """Get the next token in round-robin order."""
+        if not self.tokens:
+            return None
+        token = self.tokens[self.current_index]
+        self.current_index = (self.current_index + 1) % len(self.tokens)
+        return token
+
+    def get_headers(self):
+        """Get headers with the next token in rotation."""
+        token = self.get_next_token()
+        return {'Authorization': f'token {token}'} if token else {}


 def validate_github_username(identifier):
@@ -225,7 +261,7 @@ def validate_github_username(identifier):
         return False, f"Validation error: {str(e)}"


-def fetch_reviews_with_time_partition(base_query, start_date, end_date, headers,
+def fetch_reviews_with_time_partition(base_query, start_date, end_date, token_pool, prs_by_url, debug_limit=None, depth=0):
     """
     Fetch reviews within a specific time range using time-based partitioning.
     Recursively splits the time range if hitting the 1000-result limit.
@@ -282,10 +318,10 @@ def fetch_reviews_with_time_partition(base_query, start_date, end_date, headers,
         'sort': 'created',
         'order': 'asc'
     }
-
+    headers = token_pool.get_headers()

     try:
-        response = request_with_backoff('GET', url, headers=
+        response = request_with_backoff('GET', url, headers=headers, params=params)
         if response is None:
             print(f"{indent} Error: retries exhausted for range {start_str} to {end_str}")
             return total_in_partition
@@ -334,7 +370,7 @@ def fetch_reviews_with_time_partition(base_query, start_date, end_date, headers,
                 split_start = split_start + timedelta(seconds=1)

             count = fetch_reviews_with_time_partition(
-                base_query, split_start, split_end,
+                base_query, split_start, split_end, token_pool, prs_by_url, debug_limit, depth + 1
             )
             total_from_splits += count

@@ -355,7 +391,7 @@ def fetch_reviews_with_time_partition(base_query, start_date, end_date, headers,
                 split_start = split_start + timedelta(minutes=1)

             count = fetch_reviews_with_time_partition(
-                base_query, split_start, split_end,
+                base_query, split_start, split_end, token_pool, prs_by_url, debug_limit, depth + 1
             )
             total_from_splits += count

@@ -376,7 +412,7 @@ def fetch_reviews_with_time_partition(base_query, start_date, end_date, headers,
                 split_start = split_start + timedelta(hours=1)

             count = fetch_reviews_with_time_partition(
-                base_query, split_start, split_end,
+                base_query, split_start, split_end, token_pool, prs_by_url, debug_limit, depth + 1
             )
             total_from_splits += count

@@ -407,7 +443,7 @@ def fetch_reviews_with_time_partition(base_query, start_date, end_date, headers,
                 split_start = split_start + timedelta(days=1)

             count = fetch_reviews_with_time_partition(
-                base_query, split_start, split_end,
+                base_query, split_start, split_end, token_pool, prs_by_url, debug_limit, depth + 1
             )
             total_from_splits += count

@@ -418,10 +454,10 @@ def fetch_reviews_with_time_partition(base_query, start_date, end_date, headers,

         # Recursively fetch both halves
         count1 = fetch_reviews_with_time_partition(
-            base_query, start_date, mid_date,
+            base_query, start_date, mid_date, token_pool, prs_by_url, debug_limit, depth + 1
         )
         count2 = fetch_reviews_with_time_partition(
-            base_query, mid_date + timedelta(days=1), end_date,
+            base_query, mid_date + timedelta(days=1), end_date, token_pool, prs_by_url, debug_limit, depth + 1
         )

         return count1 + count2
@@ -491,7 +527,7 @@ def extract_review_metadata(pr):
     }


-def update_pr_status(metadata_list, headers, token):
+def update_pr_status(metadata_list, token_pool):
     """
     Update PR status for reviews to get current merged/closed state.
@@ -502,8 +538,7 @@ def update_pr_status(metadata_list, headers, token):

     Args:
         metadata_list: List of review metadata dictionaries
-
-        token: GitHub API token
+        token_pool: TokenPool instance for rotating tokens

     Returns:
         Updated metadata_list with current PR status
@@ -541,6 +576,7 @@ def update_pr_status(metadata_list, headers, token):
         owner, repo, pull_word, pr_number = parts[0], parts[1], parts[2], parts[3]
         api_url = f'https://api.github.com/repos/{owner}/{repo}/pulls/{pr_number}'

+        headers = token_pool.get_headers()
         response = request_with_backoff('GET', api_url, headers=headers, max_retries=3)

         if response and response.status_code == 200:
@@ -1683,8 +1719,8 @@ def fetch_and_update_daily_reviews():
     - Fetch new reviews from yesterday 12am to today 12am
     - Save all updated/new metadata back to HuggingFace
     """
-
-
+    tokens = get_github_tokens()
+    token_pool = TokenPool(tokens)

     # Load all agents
     agents = load_agents_from_hf()
@@ -1741,12 +1777,12 @@ def fetch_and_update_daily_reviews():
     # This ensures we capture any reviews that may have been closed/merged since last check
     if recent_metadata:
         print(f"🔍 Examining {len(recent_metadata)} open reviews for status updates (checking closed_at)...")
-        recent_metadata = update_pr_status(recent_metadata,
+        recent_metadata = update_pr_status(recent_metadata, token_pool)
         print(f" ✓ Updated PR status for existing reviews")

     # Step 3: Fetch NEW reviews from yesterday 12am to today 12am
     print(f"🔍 Fetching new reviews from {yesterday_midnight.isoformat()} to {today_midnight.isoformat()}...")
-
+
     base_query = f'is:pr review:approved author:{identifier} -is:draft'
     prs_by_url = {}

@@ -1754,7 +1790,7 @@ def fetch_and_update_daily_reviews():
         base_query,
         yesterday_midnight,
         today_midnight,
-
+        token_pool,
         prs_by_url,
         debug_limit=None
     )
@@ -1772,7 +1808,7 @@ def fetch_and_update_daily_reviews():
     # Step 4: Update PR status for new reviews
     if yesterday_metadata:
         print(f" Updating PR status for {len(yesterday_metadata)} new reviews...")
-        yesterday_metadata = update_pr_status(yesterday_metadata,
+        yesterday_metadata = update_pr_status(yesterday_metadata, token_pool)

     # Step 5: Combine and save all metadata
     all_updated_metadata = recent_metadata + yesterday_metadata
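The TokenPool introduced above is the core of this change: every API call asks the pool for headers, and the pool advances an index modulo the number of tokens, so consecutive requests are spread round-robin across credentials and each token sees roughly 1/N of the traffic. A minimal standalone sketch of that rotation (docstrings trimmed; the token strings are dummies, not real credentials):

    # Condensed copy of the TokenPool added in the diff, for illustration.
    class TokenPool:
        def __init__(self, tokens):
            self.tokens = tokens if tokens else [None]  # [None] keeps get_headers() usable without auth
            self.current_index = 0

        def get_next_token(self):
            token = self.tokens[self.current_index]
            self.current_index = (self.current_index + 1) % len(self.tokens)
            return token

        def get_headers(self):
            token = self.get_next_token()
            return {'Authorization': f'token {token}'} if token else {}

    pool = TokenPool(['tok_a', 'tok_b', 'tok_c'])     # dummy tokens
    print([pool.get_next_token() for _ in range(5)])  # ['tok_a', 'tok_b', 'tok_c', 'tok_a', 'tok_b']
    print(pool.get_headers())                         # {'Authorization': 'token tok_c'}

Rotation happens per request rather than per rate-limit exhaustion, so a token that is already throttled is still handed out on its turn; recovery is left to request_with_backoff's retry logic.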
msr.py CHANGED

@@ -52,12 +52,42 @@ def save_jsonl(filename, data):
         f.write(json.dumps(item) + '\n')


-def
-    """Get GitHub
-
-
-
-
+def get_github_tokens():
+    """Get all GitHub tokens from environment variables (all vars starting with GITHUB_TOKEN)."""
+    tokens = []
+    for key, value in os.environ.items():
+        if key.startswith('GITHUB_TOKEN') and value:
+            tokens.append(value)
+
+    if not tokens:
+        print("Warning: No GITHUB_TOKEN found. API rate limits: 60/hour (authenticated: 5000/hour)")
+    else:
+        print(f"✓ Loaded {len(tokens)} GitHub token(s) for rotation")
+
+    return tokens
+
+
+class TokenPool:
+    """
+    Manages a pool of GitHub tokens for load balancing across rate limits.
+    Rotates through tokens in round-robin fashion to distribute API calls.
+    """
+    def __init__(self, tokens):
+        self.tokens = tokens if tokens else [None]
+        self.current_index = 0
+
+    def get_next_token(self):
+        """Get the next token in round-robin order."""
+        if not self.tokens:
+            return None
+        token = self.tokens[self.current_index]
+        self.current_index = (self.current_index + 1) % len(self.tokens)
+        return token
+
+    def get_headers(self):
+        """Get headers with the next token in rotation."""
+        token = self.get_next_token()
+        return {'Authorization': f'token {token}'} if token else {}


 def get_hf_token():
@@ -144,7 +174,7 @@ def request_with_backoff(method, url, *, headers=None, params=None, json_body=None
     return None


-def fetch_reviews_with_time_partition(base_query, start_date, end_date, headers,
+def fetch_reviews_with_time_partition(base_query, start_date, end_date, token_pool, prs_by_url, depth=0):
     """
     Fetch reviews within a specific time range using time-based partitioning.
     Recursively splits the time range if hitting the 1000-result limit.
@@ -193,10 +223,10 @@ def fetch_reviews_with_time_partition(base_query, start_date, end_date, headers,
         'sort': 'created',
         'order': 'asc'
     }
-
+    headers = token_pool.get_headers()

     try:
-        response = request_with_backoff('GET', url, headers=
+        response = request_with_backoff('GET', url, headers=headers, params=params)
         if response is None:
             print(f"{indent} Error: retries exhausted for range {start_str} to {end_str}")
             return total_in_partition
@@ -241,7 +271,7 @@ def fetch_reviews_with_time_partition(base_query, start_date, end_date, headers,
                 split_start = split_start + timedelta(seconds=1)

             count = fetch_reviews_with_time_partition(
-                base_query, split_start, split_end,
+                base_query, split_start, split_end, token_pool, prs_by_url, depth + 1
             )
             total_from_splits += count

@@ -260,7 +290,7 @@ def fetch_reviews_with_time_partition(base_query, start_date, end_date, headers,
                 split_start = split_start + timedelta(minutes=1)

             count = fetch_reviews_with_time_partition(
-                base_query, split_start, split_end,
+                base_query, split_start, split_end, token_pool, prs_by_url, depth + 1
             )
             total_from_splits += count

@@ -279,7 +309,7 @@ def fetch_reviews_with_time_partition(base_query, start_date, end_date, headers,
                 split_start = split_start + timedelta(hours=1)

             count = fetch_reviews_with_time_partition(
-                base_query, split_start, split_end,
+                base_query, split_start, split_end, token_pool, prs_by_url, depth + 1
             )
             total_from_splits += count

@@ -308,7 +338,7 @@ def fetch_reviews_with_time_partition(base_query, start_date, end_date, headers,
                 split_start = split_start + timedelta(days=1)

             count = fetch_reviews_with_time_partition(
-                base_query, split_start, split_end,
+                base_query, split_start, split_end, token_pool, prs_by_url, depth + 1
             )
             total_from_splits += count

@@ -318,10 +348,10 @@ def fetch_reviews_with_time_partition(base_query, start_date, end_date, headers,
         mid_date = start_date + time_diff / 2

         count1 = fetch_reviews_with_time_partition(
-            base_query, start_date, mid_date,
+            base_query, start_date, mid_date, token_pool, prs_by_url, depth + 1
         )
         count2 = fetch_reviews_with_time_partition(
-            base_query, mid_date + timedelta(days=1), end_date,
+            base_query, mid_date + timedelta(days=1), end_date, token_pool, prs_by_url, depth + 1
         )

         return count1 + count2
@@ -382,7 +412,7 @@ def extract_review_metadata(pr):
     }


-def update_pr_status(metadata_list, headers, token):
+def update_pr_status(metadata_list, token_pool):
     """
     Update PR status for reviews to get current merged/closed state.
@@ -391,8 +421,7 @@ def update_pr_status(metadata_list, headers, token):

     Args:
         metadata_list: List of review metadata dictionaries
-
-        token: GitHub API token
+        token_pool: TokenPool instance for rotating tokens

     Returns:
         Updated metadata_list with current PR status
@@ -425,6 +454,7 @@ def update_pr_status(metadata_list, headers, token):
         owner, repo, pull_word, pr_number = parts[0], parts[1], parts[2], parts[3]
         api_url = f'https://api.github.com/repos/{owner}/{repo}/pulls/{pr_number}'

+        headers = token_pool.get_headers()
         response = request_with_backoff('GET', api_url, headers=headers, max_retries=3)

         if response and response.status_code == 200:
@@ -468,7 +498,7 @@ def update_pr_status(metadata_list, headers, token):
     return metadata_list


-def fetch_all_reviews_metadata(identifier, agent_name, token=None):
+def fetch_all_reviews_metadata(identifier, agent_name, token_pool):
     """
     Fetch PR reviews associated with a GitHub user or bot for the past LEADERBOARD_TIME_FRAME_DAYS.
     Returns lightweight metadata instead of full review objects.
@@ -482,12 +512,11 @@ def fetch_all_reviews_metadata(identifier, agent_name, token=None):
     Args:
         identifier: GitHub username or bot identifier
         agent_name: Human-readable name of the agent for metadata purposes
-
+        token_pool: TokenPool instance for rotating tokens

     Returns:
         List of dictionaries containing minimal PR review metadata with PR status
     """
-    headers = {'Authorization': f'token {token}'} if token else {}

     # Define query pattern for PR reviews
     query_patterns = [f'is:pr reviewed-by:{identifier}']
@@ -512,7 +541,7 @@ def fetch_all_reviews_metadata(identifier, agent_name, token=None):
             query_pattern,
             start_date,
             end_date,
-
+            token_pool,
             prs_by_url
         )

@@ -534,7 +563,7 @@ def fetch_all_reviews_metadata(identifier, agent_name, token=None):

     # Update PR status to get current merged/closed state
     print(f"🔄 Updating PR status for reviewed PRs...")
-    metadata_list = update_pr_status(metadata_list,
+    metadata_list = update_pr_status(metadata_list, token_pool)

     # Calculate memory savings
     import sys
@@ -725,7 +754,8 @@ def mine_all_agents():
     """
     Mine review metadata for all agents within LEADERBOARD_TIME_FRAME_DAYS and save to HuggingFace.
     """
-
+    tokens = get_github_tokens()
+    token_pool = TokenPool(tokens)

     # Load agent metadata from HuggingFace
     agents = load_agents_from_hf()
@@ -753,7 +783,7 @@ def mine_all_agents():
         print(f"{'='*80}")

         # Fetch review metadata
-        metadata = fetch_all_reviews_metadata(identifier, agent_name,
+        metadata = fetch_all_reviews_metadata(identifier, agent_name, token_pool)

         if metadata:
             print(f"💾 Saving {len(metadata)} review records...")