zhiminy committed on
Commit
dcdb282
·
1 Parent(s): eed1a0a
Files changed (2) hide show
  1. app.py +6 -1
  2. msr.py +43 -5
app.py CHANGED
@@ -223,13 +223,18 @@ def extract_review_metadata_from_bigquery(review_row):
223
  merged_at = getattr(review_row, 'merged_at', None)
224
  closed_at = getattr(review_row, 'closed_at', None)
225
 
226
- # Convert to ISO format if datetime
227
  if hasattr(reviewed_at, 'isoformat'):
228
  reviewed_at = reviewed_at.isoformat()
 
 
229
  if merged_at and hasattr(merged_at, 'isoformat'):
230
  merged_at = merged_at.isoformat()
 
 
231
  if closed_at and hasattr(closed_at, 'isoformat'):
232
  closed_at = closed_at.isoformat()
 
233
 
234
  return {
235
  'url': url,
 
223
  merged_at = getattr(review_row, 'merged_at', None)
224
  closed_at = getattr(review_row, 'closed_at', None)
225
 
226
+ # Convert to ISO format if datetime and normalize
227
  if hasattr(reviewed_at, 'isoformat'):
228
  reviewed_at = reviewed_at.isoformat()
229
+ reviewed_at = normalize_date_format(reviewed_at) if reviewed_at else None
230
+
231
  if merged_at and hasattr(merged_at, 'isoformat'):
232
  merged_at = merged_at.isoformat()
233
+ merged_at = normalize_date_format(merged_at) if merged_at else None
234
+
235
  if closed_at and hasattr(closed_at, 'isoformat'):
236
  closed_at = closed_at.isoformat()
237
+ closed_at = normalize_date_format(closed_at) if closed_at else None
238
 
239
  return {
240
  'url': url,
msr.py CHANGED
@@ -51,6 +51,41 @@ def save_jsonl(filename, data):
51
  f.write(json.dumps(item) + '\n')
52
 
53
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
54
  def get_hf_token():
55
  """Get HuggingFace token from environment variables."""
56
  token = os.getenv('HF_TOKEN')
@@ -267,20 +302,23 @@ def fetch_all_pr_metadata_single_query(client, identifiers, start_date, end_date
267
 
268
  for row in results:
269
  reviewer = row.reviewer
270
-
271
- # Convert datetime objects to ISO strings
272
  reviewed_at = row.reviewed_at
273
  if hasattr(reviewed_at, 'isoformat'):
274
  reviewed_at = reviewed_at.isoformat()
275
-
 
276
  merged_at = row.merged_at
277
  if hasattr(merged_at, 'isoformat'):
278
  merged_at = merged_at.isoformat()
279
-
 
280
  closed_at = row.closed_at
281
  if hasattr(closed_at, 'isoformat'):
282
  closed_at = closed_at.isoformat()
283
-
 
284
  metadata_by_agent[reviewer].append({
285
  'url': row.url,
286
  'reviewed_at': reviewed_at,
 
51
  f.write(json.dumps(item) + '\n')
52
 
53
 
54
def normalize_date_format(date_string):
    """
    Convert a date string to standardized ISO 8601 UTC format with Z suffix.

    Handles both 'T' and space-separated datetime formats (including
    newlines or other whitespace between the date and time parts), an
    hour-only UTC offset such as '+00', and a trailing 'Z'.

    Timezone-aware inputs are converted to UTC before formatting, so the
    'Z' suffix is always truthful. Naive inputs (no offset) are assumed to
    already be in UTC and are formatted as-is. Sub-second precision is
    dropped.

    Examples:
    - 2025-10-15T23:23:47.983068 -> 2025-10-15T23:23:47Z
    - 2025-06-17 21:21:07+00 -> 2025-06-17T21:21:07Z
    - 2025-06-17T21:21:07+05:30 -> 2025-06-17T15:51:07Z

    Args:
        date_string: The date/time string to normalize.

    Returns:
        The normalized string, 'N/A' when the input is empty or already
        'N/A', or the original input unchanged (after printing a warning)
        when it cannot be parsed.
    """
    if not date_string or date_string == 'N/A':
        return 'N/A'

    # Local imports keep this helper self-contained regardless of what the
    # surrounding module imports at the top of the file.
    import re
    from datetime import timezone

    try:
        # Collapse any run of whitespace (spaces, newlines, tabs) into one
        # separator, then use 'T' so the string is ISO 8601 compatible.
        cleaned = re.sub(r'\s+', ' ', date_string.strip()).replace(' ', 'T')

        # Pad an hour-only offset (+00 / -05 -> +00:00 / -05:00). The sign
        # must directly follow a time-of-day so that the '-DD' tail of a
        # date-only string such as '2025-06-17' is not mistaken for one.
        if re.search(r'T\d{2}:\d{2}(?::\d{2}(?:\.\d+)?)?[+-]\d{2}$', cleaned):
            cleaned += ':00'

        # Parse the date string (handles both with and without microseconds)
        dt = datetime.fromisoformat(cleaned.replace('Z', '+00:00'))

        # Shift aware values to UTC so the 'Z' suffix below is correct.
        if dt.tzinfo is not None:
            dt = dt.astimezone(timezone.utc)

        # Convert to standardized format
        return dt.strftime('%Y-%m-%dT%H:%M:%SZ')
    except (AttributeError, TypeError, ValueError) as e:
        # Best effort: report the problem and hand back the caller's
        # original value rather than a half-transformed string.
        print(f"Warning: Could not parse date '{date_string}': {e}")
        return date_string
87
+
88
+
89
  def get_hf_token():
90
  """Get HuggingFace token from environment variables."""
91
  token = os.getenv('HF_TOKEN')
 
302
 
303
  for row in results:
304
  reviewer = row.reviewer
305
+
306
+ # Convert datetime objects to ISO strings and normalize
307
  reviewed_at = row.reviewed_at
308
  if hasattr(reviewed_at, 'isoformat'):
309
  reviewed_at = reviewed_at.isoformat()
310
+ reviewed_at = normalize_date_format(reviewed_at) if reviewed_at else None
311
+
312
  merged_at = row.merged_at
313
  if hasattr(merged_at, 'isoformat'):
314
  merged_at = merged_at.isoformat()
315
+ merged_at = normalize_date_format(merged_at) if merged_at else None
316
+
317
  closed_at = row.closed_at
318
  if hasattr(closed_at, 'isoformat'):
319
  closed_at = closed_at.isoformat()
320
+ closed_at = normalize_date_format(closed_at) if closed_at else None
321
+
322
  metadata_by_agent[reviewer].append({
323
  'url': row.url,
324
  'reviewed_at': reviewed_at,