refactor: streamline mine_all_agents execution order with batched queries
- Refactored mine_all_agents() to follow cleaner execution order:
1. Load agents from HuggingFace
2. Extract identifiers
3. Initialize BigQuery client
4. Define time range
5. Fetch all metadata using BATCHED queries (now properly using fetch_all_pr_metadata_batched)
6. Save results for each agent
7. Construct leaderboard and metrics
8. Upload to HuggingFace
- Key improvements:
* Now uses fetch_all_pr_metadata_batched() which was previously unused
* Eliminates individual per-agent BigQuery calls (previously one query per agent)
* Processes all agents in batches of 50 for better performance (see the sketch after this note)
* Clearer step-by-step structure matching example code
* Better progress tracking with formatted output
* More detailed summary statistics
- Execution flow is now:
Query Phase: All agents batched together (efficient)
Save Phase: Individual agent saves (atomic per-agent)
Build Phase: Leaderboard + metrics from saved data
Upload Phase: Single final upload to HuggingFace
- Maintains backward compatibility with all existing functions
- All date handling and metadata structure unchanged
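
Below is a minimal, illustrative sketch of the batching pattern this change relies on. The helper name fetch_in_batches and the run_batch_query callable are hypothetical stand-ins; the real logic lives in fetch_all_pr_metadata_batched() in this Space's code.

    # Hypothetical sketch only -- not the Space's actual implementation.
    from collections import defaultdict
    from typing import Callable, Dict, List

    def fetch_in_batches(
        identifiers: List[str],
        run_batch_query: Callable[[List[str]], List[dict]],  # assumed: one BigQuery round-trip per call
        batch_size: int = 50,
    ) -> Dict[str, List[dict]]:
        """Chunk identifiers into groups of batch_size, query each group once,
        and fan the returned rows back out into a per-agent dict."""
        results: Dict[str, List[dict]] = defaultdict(list)
        for i in range(0, len(identifiers), batch_size):
            chunk = identifiers[i:i + batch_size]
            for row in run_batch_query(chunk):
                results[row["agent_identifier"]].append(row)
        return dict(results)

With a batch size of 50, 500 agents cost 10 queries instead of 500.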
@@ -2051,139 +2051,153 @@ def save_leaderboard_and_metrics_to_hf():

 def mine_all_agents():
     """
-    Scheduled task for
-    1.
-    2.
-    3.
-    4.
-    5.
     """
     print(f"\n{'='*80}")
-    print(f"π
-    print(f"{'='*80}")

     try:
-        # Load all agents
         agents = load_agents_from_hf()
         if not agents:
-            print("No agents found in HuggingFace dataset")
             return

-        #
-        print(f"
-        print(f"   Update period start (12am UTC): {update_start_midnight.isoformat()}")
-        print(f"   Today 12am UTC: {today_midnight.isoformat()}")
-        print(f"   Cutoff for existing reviews: {cutoff_date.isoformat()}")
-        print(f"   Examining reviews from: {cutoff_date.date()} to {today_midnight.date()}")

-            # Step 1: Load all existing metadata within timeframe
-            print(f"π Loading existing metadata from last {LEADERBOARD_TIME_FRAME_DAYS - UPDATE_TIME_FRAME_DAYS} days...")
-            all_metadata = load_review_metadata()
-            agent_metadata = [r for r in all_metadata if r.get("agent_identifier") == identifier]

-            # Filter to last (LEADERBOARD_TIME_FRAME_DAYS - UPDATE_TIME_FRAME_DAYS) days (from cutoff to today)
-            recent_metadata = []
-            for review in agent_metadata:
-                reviewed_at = review.get('reviewed_at', '')
-                if reviewed_at:
-                    try:
-                        review_date = datetime.fromisoformat(reviewed_at.replace('Z', '+00:00'))
-                        if cutoff_date <= review_date < today_midnight:
-                            recent_metadata.append(review)
-                    except Exception as e:
-                        print(f"   Warning: Could not parse date '{reviewed_at}': {e}")
-                        continue

-            for row in review_rows:
-                url = row.url
-                if url in seen_prs:
-                    continue
-                seen_prs.add(url)

-            all_updated_metadata = recent_metadata + review_metadata

         else:
-            print(f"

         except Exception as e:
-            print(f"β Error
-            traceback.print_exc()
             continue

-        #
-        print(f"\n

         if save_leaderboard_and_metrics_to_hf():
-            print(f"β Leaderboard and metrics successfully uploaded to {LEADERBOARD_REPO}")
         else:
-            print(f"β οΈ Failed to upload leaderboard and metrics data")

-        #
-        agent_count = len(saved_data['leaderboard'])

-        print(f"\n{'='*80}")
-        print(f"π Update Summary:")
-        print(f"   β Updated existing review statuses")
-        print(f"   β Fetched new reviews from last {UPDATE_TIME_FRAME_DAYS} days")
-        print(f"   β Leaderboard constructed with {agent_count} agents")
-        print(f"   β Monthly metrics calculated")
-        print(f"   β Data saved to {LEADERBOARD_REPO}")
         print(f"{'='*80}")

-        print(f"

     except Exception as e:
-        print(f"
         import traceback
         traceback.print_exc()

 def mine_all_agents():
     """
+    Scheduled task for review metadata mining and statistics update.
+
+    Execution order:
+    1. Load all agents from HuggingFace
+    2. Extract all identifiers
+    3. Initialize BigQuery client
+    4. Define time range
+    5. Fetch ALL review metadata using BATCHED BigQuery queries (efficient)
+    6. Save results for each agent
+    7. Construct leaderboard and monthly metrics
+    8. Save to HuggingFace
+
+    Uses batched approach for better performance with large numbers of agents.
     """
     print(f"\n{'='*80}")
+    print(f"π Review Metadata Mining Task started at {datetime.now(timezone.utc).isoformat()}")
+    print(f"{'='*80}\n")

     try:
+        # Step 1: Load all agents from HuggingFace
+        print("π Loading agents from HuggingFace...")
         agents = load_agents_from_hf()
         if not agents:
+            print("β No agents found in HuggingFace dataset")
             return

+        # Step 2: Extract all identifiers
+        identifiers = [agent.get('github_identifier') for agent in agents if agent.get('github_identifier')]
+        if not identifiers:
+            print("β No valid agent identifiers found")
+            return

+        print(f"β Loaded {len(agents)} agents ({len(identifiers)} with valid identifiers)\n")

+        # Step 3: Initialize BigQuery client
+        print("π Initializing BigQuery client...")
+        try:
+            client = get_bigquery_client()
+            print("β BigQuery client initialized\n")
+        except Exception as e:
+            print(f"β Failed to initialize BigQuery client: {str(e)}")
+            return

+        # Step 4: Define time range
+        current_time = datetime.now(timezone.utc)
+        end_date = current_time.replace(hour=0, minute=0, second=0, microsecond=0)
+        start_date = end_date - timedelta(days=UPDATE_TIME_FRAME_DAYS)

+        print(f"π Time Range Configuration:")
+        print(f"   Mining period: {start_date.strftime('%Y-%m-%d')} to {end_date.strftime('%Y-%m-%d')}")
+        print(f"   Time frame: Last {UPDATE_TIME_FRAME_DAYS} days")
+        print(f"   Data source: BigQuery + GitHub Archive (BATCHED QUERIES)\n")

+        # Step 5: Fetch ALL review metadata using BATCHED approach
+        print(f"{'='*80}")
+        print(f"π Fetching review metadata using BATCHED queries...")
+        print(f"{'='*80}\n")

+        all_metadata = fetch_all_pr_metadata_batched(
+            client, identifiers, start_date, end_date, batch_size=50
+        )

+        # Step 6: Save results for each agent
+        print(f"\n{'='*80}")
+        print(f"πΎ Saving results to HuggingFace for each agent...")
+        print(f"{'='*80}\n")

+        success_count = 0
+        error_count = 0
+        no_data_count = 0

+        for i, agent in enumerate(agents, 1):
+            identifier = agent.get('github_identifier')
+            agent_name = agent.get('name', 'Unknown')

+            if not identifier:
+                print(f"[{i}/{len(agents)}] β οΈ Skipping agent without identifier")
+                error_count += 1
+                continue

+            metadata = all_metadata.get(identifier, [])

+            print(f"[{i}/{len(agents)}] {agent_name} ({identifier}):")

+            try:
+                if metadata:
+                    print(f"   πΎ Saving {len(metadata)} review records...")
+                    if save_review_metadata_to_hf(metadata, identifier):
+                        success_count += 1
+                        print(f"   β Successfully saved")
+                    else:
+                        error_count += 1
+                        print(f"   β Failed to save")
                 else:
+                    print(f"   β No reviews found")
+                    no_data_count += 1

             except Exception as e:
+                print(f"   β Error saving {identifier}: {str(e)}")
+                error_count += 1
                 continue

+        # Step 7: Construct leaderboard and monthly metrics
+        print(f"\n{'='*80}")
+        print(f"π Building leaderboard and metrics...")
+        print(f"{'='*80}\n")
+
+        print("   Constructing leaderboard data from review metadata...")
+        leaderboard_dict = construct_leaderboard_from_metadata()
+
+        print("   Calculating monthly metrics for all agents...")
+        monthly_metrics = calculate_monthly_metrics_by_agent(top_n=None)
+
+        # Step 8: Save to HuggingFace
+        print(f"\n{'='*80}")
+        print(f"π€ Uploading leaderboard and metrics to HuggingFace...")
+        print(f"{'='*80}\n")
+
         if save_leaderboard_and_metrics_to_hf():
+            print(f"β Leaderboard and metrics successfully uploaded to {LEADERBOARD_REPO}\n")
         else:
+            print(f"β οΈ Failed to upload leaderboard and metrics data\n")

+        # Print final summary
+        batch_size = 50
+        total_batches = (len(identifiers) + batch_size - 1) // batch_size
+        total_reviews = sum(len(metadata) for metadata in all_metadata.values())

         print(f"{'='*80}")
+        print(f"β Mining Task Complete!")
+        print(f"{'='*80}")
+        print(f"π Summary:")
+        print(f"   Total agents: {len(agents)}")
+        print(f"   Agents with valid identifiers: {len(identifiers)}")
+        print(f"   Successfully saved: {success_count}")
+        print(f"   No data (skipped): {no_data_count}")
+        print(f"   Errors: {error_count}")
+        print(f"   Total reviews fetched: {total_reviews}")
+        print(f"   BigQuery batches executed: {total_batches} (batch size: {batch_size})")
+        print(f"   Leaderboard entries: {len(leaderboard_dict)}")
+        print(f"   Monthly metrics agents: {len(monthly_metrics.get('agents', []))}")
+        print(f"{'='*80}\n")

+        print(f"β Mining Task completed at {datetime.now(timezone.utc).isoformat()}\n")

     except Exception as e:
+        print(f"β Mining task failed: {str(e)}")
         import traceback
         traceback.print_exc()
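
The docstring describes mine_all_agents() as a scheduled task. One common way to wire up such a job is sketched below, assuming an APScheduler-based runner; the Space may trigger the task through some other mechanism (background thread, external cron), so treat this purely as an illustration.

    # Assumption: APScheduler is available; the repo may schedule this differently.
    from apscheduler.schedulers.background import BackgroundScheduler

    scheduler = BackgroundScheduler(timezone="UTC")
    # Run shortly after midnight UTC, matching the midnight-based time range
    # the function computes internally.
    scheduler.add_job(mine_all_agents, "cron", hour=0, minute=15, id="mine_all_agents")
    scheduler.start()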