zhiminy committed
Commit 17571af · 1 Parent(s): 72c9e3c
Files changed (3):
  1. .gitignore +1 -0
  2. app.py +119 -33
  3. msr.py +174 -16
.gitignore CHANGED
@@ -1,3 +1,4 @@
+*.claude
 *.env
 *.venv
 *.ipynb
app.py CHANGED
@@ -228,6 +228,7 @@ def fetch_issues_with_time_partition(base_query, start_date, end_date, headers,
     """
     Fetch issues within a specific time range using time-based partitioning.
     Recursively splits the time range if hitting the 1000-result limit.
+    Supports splitting by day, hour, minute, and second as needed.
 
     Args:
         debug_limit: If set, stops fetching after this many issues (for testing)
@@ -235,9 +236,27 @@ def fetch_issues_with_time_partition(base_query, start_date, end_date, headers,
 
     Returns the number of issues found in this time partition.
     """
-    # Format dates for GitHub search (YYYY-MM-DD)
-    start_str = start_date.strftime('%Y-%m-%d')
-    end_str = end_date.strftime('%Y-%m-%d')
+    # Calculate time difference
+    time_diff = end_date - start_date
+    total_seconds = time_diff.total_seconds()
+
+    # Determine granularity and format dates accordingly
+    if total_seconds >= 86400:  # >= 1 day
+        # Use day granularity (YYYY-MM-DD)
+        start_str = start_date.strftime('%Y-%m-%d')
+        end_str = end_date.strftime('%Y-%m-%d')
+    elif total_seconds >= 3600:  # >= 1 hour but < 1 day
+        # Use hour granularity (YYYY-MM-DDTHH:MM:SSZ)
+        start_str = start_date.strftime('%Y-%m-%dT%H:00:00Z')
+        end_str = end_date.strftime('%Y-%m-%dT%H:59:59Z')
+    elif total_seconds >= 60:  # >= 1 minute but < 1 hour
+        # Use minute granularity (YYYY-MM-DDTHH:MM:SSZ)
+        start_str = start_date.strftime('%Y-%m-%dT%H:%M:00Z')
+        end_str = end_date.strftime('%Y-%m-%dT%H:%M:59Z')
+    else:  # < 1 minute
+        # Use second granularity (YYYY-MM-DDTHH:MM:SSZ)
+        start_str = start_date.strftime('%Y-%m-%dT%H:%M:%SZ')
+        end_str = end_date.strftime('%Y-%m-%dT%H:%M:%SZ')
 
     # Add date range to query
     query = f'{base_query} created:{start_str}..{end_str}'
@@ -291,30 +310,24 @@ def fetch_issues_with_time_partition(base_query, start_date, end_date, headers,
             if total_count > 1000 and page == 10:
                 print(f"{indent}    ⚠️  Hit 1000-result limit ({total_count} total). Splitting time range...")
 
-                # Calculate time range in days
-                time_diff = end_date - start_date
-                days_diff = time_diff.days
-
-                # Use aggressive splitting for large ranges or deep recursion
-                # Split into 4 parts if range is > 30 days, otherwise split in half
-                if days_diff > 30 or depth > 5:
-                    # Split into 4 parts for more aggressive partitioning
-                    quarter_diff = time_diff / 4
-                    split_dates = [
-                        start_date,
-                        start_date + quarter_diff,
-                        start_date + quarter_diff * 2,
-                        start_date + quarter_diff * 3,
-                        end_date
-                    ]
+                # Determine how to split based on time range duration
+                if total_seconds < 2:  # Less than 2 seconds - can't split further
+                    print(f"{indent}    ⚠️  Cannot split further (range < 2 seconds). Some results may be missing.")
+                    break
+
+                elif total_seconds < 120:  # Less than 2 minutes - split by seconds
+                    # Split into 2-4 parts depending on range
+                    num_splits = min(4, max(2, int(total_seconds / 30)))
+                    split_duration = time_diff / num_splits
+                    split_dates = [start_date + split_duration * i for i in range(num_splits + 1)]
 
                     total_from_splits = 0
-                    for i in range(4):
+                    for i in range(num_splits):
                         split_start = split_dates[i]
                         split_end = split_dates[i + 1]
-                        # Avoid overlapping ranges
+                        # Avoid overlapping ranges (add 1 second to start)
                         if i > 0:
-                            split_start = split_start + timedelta(days=1)
+                            split_start = split_start + timedelta(seconds=1)
 
                         count = fetch_issues_with_time_partition(
                             base_query, split_start, split_end, headers, issues_by_id, debug_limit, depth + 1
@@ -322,19 +335,92 @@ def fetch_issues_with_time_partition(base_query, start_date, end_date, headers,
                         total_from_splits += count
 
                     return total_from_splits
-                else:
-                    # Binary split for smaller ranges
-                    mid_date = start_date + time_diff / 2
 
-                    # Recursively fetch both halves
-                    count1 = fetch_issues_with_time_partition(
-                        base_query, start_date, mid_date, headers, issues_by_id, debug_limit, depth + 1
-                    )
-                    count2 = fetch_issues_with_time_partition(
-                        base_query, mid_date + timedelta(days=1), end_date, headers, issues_by_id, debug_limit, depth + 1
-                    )
+                elif total_seconds < 7200:  # Less than 2 hours - split by minutes
+                    # Split into 2-4 parts
+                    num_splits = min(4, max(2, int(total_seconds / 1800)))
+                    split_duration = time_diff / num_splits
+                    split_dates = [start_date + split_duration * i for i in range(num_splits + 1)]
+
+                    total_from_splits = 0
+                    for i in range(num_splits):
+                        split_start = split_dates[i]
+                        split_end = split_dates[i + 1]
+                        # Avoid overlapping ranges (add 1 minute to start)
+                        if i > 0:
+                            split_start = split_start + timedelta(minutes=1)
+
+                        count = fetch_issues_with_time_partition(
+                            base_query, split_start, split_end, headers, issues_by_id, debug_limit, depth + 1
+                        )
+                        total_from_splits += count
+
+                    return total_from_splits
+
+                elif total_seconds < 172800:  # Less than 2 days - split by hours
+                    # Split into 2-4 parts
+                    num_splits = min(4, max(2, int(total_seconds / 43200)))
+                    split_duration = time_diff / num_splits
+                    split_dates = [start_date + split_duration * i for i in range(num_splits + 1)]
+
+                    total_from_splits = 0
+                    for i in range(num_splits):
+                        split_start = split_dates[i]
+                        split_end = split_dates[i + 1]
+                        # Avoid overlapping ranges (add 1 hour to start)
+                        if i > 0:
+                            split_start = split_start + timedelta(hours=1)
+
+                        count = fetch_issues_with_time_partition(
+                            base_query, split_start, split_end, headers, issues_by_id, debug_limit, depth + 1
+                        )
+                        total_from_splits += count
+
+                    return total_from_splits
+
+                else:  # 2+ days - split by days
+                    days_diff = time_diff.days
+
+                    # Use aggressive splitting for large ranges or deep recursion
+                    # Split into 4 parts if range is > 30 days, otherwise split in half
+                    if days_diff > 30 or depth > 5:
+                        # Split into 4 parts for more aggressive partitioning
+                        quarter_diff = time_diff / 4
+                        split_dates = [
+                            start_date,
+                            start_date + quarter_diff,
+                            start_date + quarter_diff * 2,
+                            start_date + quarter_diff * 3,
+                            end_date
+                        ]
+
+                        total_from_splits = 0
+                        for i in range(4):
+                            split_start = split_dates[i]
+                            split_end = split_dates[i + 1]
+                            # Avoid overlapping ranges
+                            if i > 0:
+                                split_start = split_start + timedelta(days=1)
+
+                            count = fetch_issues_with_time_partition(
+                                base_query, split_start, split_end, headers, issues_by_id, debug_limit, depth + 1
+                            )
+                            total_from_splits += count
+
+                        return total_from_splits
+                    else:
+                        # Binary split for smaller ranges
+                        mid_date = start_date + time_diff / 2
+
+                        # Recursively fetch both halves
+                        count1 = fetch_issues_with_time_partition(
+                            base_query, start_date, mid_date, headers, issues_by_id, debug_limit, depth + 1
+                        )
+                        count2 = fetch_issues_with_time_partition(
+                            base_query, mid_date + timedelta(days=1), end_date, headers, issues_by_id, debug_limit, depth + 1
+                        )
 
-                    return count1 + count2
+                        return count1 + count2
 
             # Normal pagination: check if there are more pages
             if len(items) < per_page or page >= 10:
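
Note on the new date formatting: GitHub's issue-search syntax accepts both plain dates (YYYY-MM-DD) and ISO 8601 timestamps in created: range qualifiers, which is what lets the partitioner recurse below one day. A minimal sketch of the qualifier strings the new branches produce (the format_range helper and the sample dates are illustrative, not part of the commit):

    from datetime import datetime, timedelta

    def format_range(start, end):
        """Illustrative restatement of the commit's granularity selection."""
        total_seconds = (end - start).total_seconds()
        if total_seconds >= 86400:    # day granularity
            return start.strftime('%Y-%m-%d'), end.strftime('%Y-%m-%d')
        elif total_seconds >= 3600:   # hour granularity
            return start.strftime('%Y-%m-%dT%H:00:00Z'), end.strftime('%Y-%m-%dT%H:59:59Z')
        elif total_seconds >= 60:     # minute granularity
            return start.strftime('%Y-%m-%dT%H:%M:00Z'), end.strftime('%Y-%m-%dT%H:%M:59Z')
        else:                         # second granularity
            return start.strftime('%Y-%m-%dT%H:%M:%SZ'), end.strftime('%Y-%m-%dT%H:%M:%SZ')

    start = datetime(2024, 6, 1, 9, 15, 0)
    for span in (timedelta(days=3), timedelta(hours=5), timedelta(minutes=10), timedelta(seconds=30)):
        s, e = format_range(start, start + span)
        print(f'created:{s}..{e}')
    # created:2024-06-01..2024-06-04
    # created:2024-06-01T09:00:00Z..2024-06-01T14:59:59Z
    # created:2024-06-01T09:15:00Z..2024-06-01T09:25:59Z
    # created:2024-06-01T09:15:00Z..2024-06-01T09:15:30Z

Widening the end timestamp out to the :59 second or minute keeps boundary issues inside a partition; any resulting overlap between adjacent partitions is harmless because issues_by_id deduplicates by issue id.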
msr.py CHANGED
@@ -224,17 +224,54 @@ def request_with_backoff(method, url, *, headers=None, params=None, json_body=No
     return None
 
 
-def fetch_issues_with_time_partition(base_query, start_date, end_date, headers, issues_by_id, debug_limit=None):
-    start_str = start_date.strftime('%Y-%m-%d')
-    end_str = end_date.strftime('%Y-%m-%d')
+def fetch_issues_with_time_partition(base_query, start_date, end_date, headers, issues_by_id, debug_limit=None, depth=0):
+    """
+    Fetch issues within a specific time range using time-based partitioning.
+    Recursively splits the time range if hitting the 1000-result limit.
+    Supports splitting by day, hour, minute, and second as needed.
+
+    Args:
+        debug_limit: If set, stops fetching after this many issues (for testing)
+        depth: Current recursion depth (for tracking)
+
+    Returns the number of issues found in this time partition.
+    """
+    # Calculate time difference
+    time_diff = end_date - start_date
+    total_seconds = time_diff.total_seconds()
+
+    # Determine granularity and format dates accordingly
+    if total_seconds >= 86400:  # >= 1 day
+        # Use day granularity (YYYY-MM-DD)
+        start_str = start_date.strftime('%Y-%m-%d')
+        end_str = end_date.strftime('%Y-%m-%d')
+    elif total_seconds >= 3600:  # >= 1 hour but < 1 day
+        # Use hour granularity (YYYY-MM-DDTHH:MM:SSZ)
+        start_str = start_date.strftime('%Y-%m-%dT%H:00:00Z')
+        end_str = end_date.strftime('%Y-%m-%dT%H:59:59Z')
+    elif total_seconds >= 60:  # >= 1 minute but < 1 hour
+        # Use minute granularity (YYYY-MM-DDTHH:MM:SSZ)
+        start_str = start_date.strftime('%Y-%m-%dT%H:%M:00Z')
+        end_str = end_date.strftime('%Y-%m-%dT%H:%M:59Z')
+    else:  # < 1 minute
+        # Use second granularity (YYYY-MM-DDTHH:MM:SSZ)
+        start_str = start_date.strftime('%Y-%m-%dT%H:%M:%SZ')
+        end_str = end_date.strftime('%Y-%m-%dT%H:%M:%SZ')
+
+    # Add date range to query
     query = f'{base_query} created:{start_str}..{end_str}'
-    print(f"    Searching range {start_str} to {end_str}...")
+
+    indent = "  " + "  " * depth
+    print(f"{indent}Searching range {start_str} to {end_str}...")
+
     page = 1
     per_page = 100
     total_in_partition = 0
+
     while True:
+        # Check debug limit
         if debug_limit is not None and total_in_partition >= debug_limit:
-            print(f"    🐛 DEBUG MODE: Reached limit of {debug_limit} issues, stopping...")
+            print(f"{indent}    🐛 DEBUG MODE: Reached limit of {debug_limit} issues, stopping...")
             return total_in_partition
         url = 'https://api.github.com/search/issues'
         params = {
@@ -244,40 +281,161 @@ def fetch_issues_with_time_partition(base_query, start_date, end_date, headers,
             'sort': 'created',
             'order': 'asc'
         }
+
         try:
             response = request_with_backoff('GET', url, headers=headers, params=params)
             if response is None:
-                print(f"    Error: retries exhausted for range {start_str} to {end_str}")
+                print(f"{indent}    Error: retries exhausted for range {start_str} to {end_str}")
                 return total_in_partition
+
             if response.status_code != 200:
-                print(f"    Error: HTTP {response.status_code} for range {start_str} to {end_str}")
+                print(f"{indent}    Error: HTTP {response.status_code} for range {start_str} to {end_str}")
                 return total_in_partition
+
             data = response.json()
             total_count = data.get('total_count', 0)
             items = data.get('items', [])
+
             if not items:
                 break
+
+            # Add issues to global dict
             for issue in items:
                 issue_id = issue.get('id')
                 if issue_id and issue_id not in issues_by_id:
                     issues_by_id[issue_id] = issue
                     total_in_partition += 1
+
+            # Check if we hit the 1000-result limit
             if total_count > 1000 and page == 10:
-                print(f"    ⚠️  Hit 1000-result limit ({total_count} total). Splitting time range...")
-                time_diff = end_date - start_date
-                mid_date = start_date + time_diff / 2
-                count1 = fetch_issues_with_time_partition(base_query, start_date, mid_date, headers, issues_by_id, debug_limit)
-                count2 = fetch_issues_with_time_partition(base_query, mid_date + timedelta(days=1), end_date, headers, issues_by_id, debug_limit)
-                return count1 + count2
+                print(f"{indent}    ⚠️  Hit 1000-result limit ({total_count} total). Splitting time range...")
+
+                # Determine how to split based on time range duration
+                if total_seconds < 2:  # Less than 2 seconds - can't split further
+                    print(f"{indent}    ⚠️  Cannot split further (range < 2 seconds). Some results may be missing.")
+                    break
+
+                elif total_seconds < 120:  # Less than 2 minutes - split by seconds
+                    # Split into 2-4 parts depending on range
+                    num_splits = min(4, max(2, int(total_seconds / 30)))
+                    split_duration = time_diff / num_splits
+                    split_dates = [start_date + split_duration * i for i in range(num_splits + 1)]
+
+                    total_from_splits = 0
+                    for i in range(num_splits):
+                        split_start = split_dates[i]
+                        split_end = split_dates[i + 1]
+                        # Avoid overlapping ranges (add 1 second to start)
+                        if i > 0:
+                            split_start = split_start + timedelta(seconds=1)
+
+                        count = fetch_issues_with_time_partition(
+                            base_query, split_start, split_end, headers, issues_by_id, debug_limit, depth + 1
+                        )
+                        total_from_splits += count
+
+                    return total_from_splits
+
+                elif total_seconds < 7200:  # Less than 2 hours - split by minutes
+                    # Split into 2-4 parts
+                    num_splits = min(4, max(2, int(total_seconds / 1800)))
+                    split_duration = time_diff / num_splits
+                    split_dates = [start_date + split_duration * i for i in range(num_splits + 1)]
+
+                    total_from_splits = 0
+                    for i in range(num_splits):
+                        split_start = split_dates[i]
+                        split_end = split_dates[i + 1]
+                        # Avoid overlapping ranges (add 1 minute to start)
+                        if i > 0:
+                            split_start = split_start + timedelta(minutes=1)
+
+                        count = fetch_issues_with_time_partition(
+                            base_query, split_start, split_end, headers, issues_by_id, debug_limit, depth + 1
+                        )
+                        total_from_splits += count
+
+                    return total_from_splits
+
+                elif total_seconds < 172800:  # Less than 2 days - split by hours
+                    # Split into 2-4 parts
+                    num_splits = min(4, max(2, int(total_seconds / 43200)))
+                    split_duration = time_diff / num_splits
+                    split_dates = [start_date + split_duration * i for i in range(num_splits + 1)]
+
+                    total_from_splits = 0
+                    for i in range(num_splits):
+                        split_start = split_dates[i]
+                        split_end = split_dates[i + 1]
+                        # Avoid overlapping ranges (add 1 hour to start)
+                        if i > 0:
+                            split_start = split_start + timedelta(hours=1)
+
+                        count = fetch_issues_with_time_partition(
+                            base_query, split_start, split_end, headers, issues_by_id, debug_limit, depth + 1
+                        )
+                        total_from_splits += count
+
+                    return total_from_splits
+
+                else:  # 2+ days - split by days
+                    days_diff = time_diff.days
+
+                    # Use aggressive splitting for large ranges or deep recursion
+                    # Split into 4 parts if range is > 30 days, otherwise split in half
+                    if days_diff > 30 or depth > 5:
+                        # Split into 4 parts for more aggressive partitioning
+                        quarter_diff = time_diff / 4
+                        split_dates = [
+                            start_date,
+                            start_date + quarter_diff,
+                            start_date + quarter_diff * 2,
+                            start_date + quarter_diff * 3,
+                            end_date
+                        ]
+
+                        total_from_splits = 0
+                        for i in range(4):
+                            split_start = split_dates[i]
+                            split_end = split_dates[i + 1]
+                            # Avoid overlapping ranges
+                            if i > 0:
+                                split_start = split_start + timedelta(days=1)
+
+                            count = fetch_issues_with_time_partition(
+                                base_query, split_start, split_end, headers, issues_by_id, debug_limit, depth + 1
+                            )
+                            total_from_splits += count
+
+                        return total_from_splits
+                    else:
+                        # Binary split for smaller ranges
+                        mid_date = start_date + time_diff / 2
+
+                        # Recursively fetch both halves
+                        count1 = fetch_issues_with_time_partition(
+                            base_query, start_date, mid_date, headers, issues_by_id, debug_limit, depth + 1
+                        )
+                        count2 = fetch_issues_with_time_partition(
+                            base_query, mid_date + timedelta(days=1), end_date, headers, issues_by_id, debug_limit, depth + 1
+                        )
+
+                        return count1 + count2
+
+            # Normal pagination: check if there are more pages
             if len(items) < per_page or page >= 10:
                 break
+
             page += 1
-            time.sleep(0.5)
+            time.sleep(0.5)  # Courtesy delay between pages
+
         except Exception as e:
-            print(f"    Error fetching range {start_str} to {end_str}: {str(e)}")
+            print(f"{indent}    Error fetching range {start_str} to {end_str}: {str(e)}")
             return total_in_partition
+
     if total_in_partition > 0:
-        print(f"    ✓ Found {total_in_partition} issues in range {start_str} to {end_str}")
+        print(f"{indent}    ✓ Found {total_in_partition} issues in range {start_str} to {end_str}")
+
     return total_in_partition
 
 
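
The diff does not show how the rewritten fetcher is invoked. A minimal usage sketch under assumptions: a token in the GITHUB_TOKEN environment variable, an illustrative base query, and one top-level call over the whole date range. Everything here besides fetch_issues_with_time_partition itself is hypothetical:

    import os
    from datetime import datetime

    # Hypothetical driver, not part of the commit; assumes msr.py's
    # fetch_issues_with_time_partition is in scope and GITHUB_TOKEN is set.
    headers = {
        'Authorization': f'token {os.environ["GITHUB_TOKEN"]}',
        'Accept': 'application/vnd.github+json',
    }
    issues_by_id = {}  # shared across recursive calls; deduplicates issues by id
    total = fetch_issues_with_time_partition(
        'is:issue label:bug',     # illustrative base query
        datetime(2020, 1, 1),     # span >= 1 day, so day granularity is used first
        datetime(2024, 12, 31),
        headers,
        issues_by_id,
        debug_limit=None,
    )
    print(f'{total} issues found in this partition tree; {len(issues_by_id)} unique overall')

Because each recursive split passes the same issues_by_id dict down, the per-partition counts can overlap at boundaries while the dict stays duplicate-free.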