# skill-gap-backend / precompute_insights.py
# Hosting-page metadata: author aaronjosephd, commit 567b16c
# ("Initial backend upload"), file size 6.84 kB.
import pandas as pd
import ast
import re
import itertools
from collections import Counter
import json
import os
# --- Configuration ---
# The input CSV and the output JSON both live in the directory that holds
# this script, so every path below is anchored on the script's own location.
INPUT_FILE = 'market_data_with_entities.csv'
OUTPUT_FILE = 'market_insights.json'

OUTPUT_DIR = os.path.dirname(os.path.abspath(__file__))
INPUT_PATH, OUTPUT_PATH = (
    os.path.join(OUTPUT_DIR, file_name)
    for file_name in (INPUT_FILE, OUTPUT_FILE)
)
def safe_literal_eval(s):
    """Parse a stringified Python list, returning ``[]`` when that fails.

    Args:
        s: Value to parse. Only strings that, once surrounding whitespace is
            removed, start with ``[`` and end with ``]`` are evaluated.

    Returns:
        The parsed list, or an empty list when ``s`` is not a string, does
        not look like a list literal, is malformed, or evaluates to something
        other than a list.
    """
    if not isinstance(s, str):
        return []

    text = s.strip()
    if not (text.startswith('[') and text.endswith(']')):
        return []

    try:
        parsed = ast.literal_eval(text)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # literal_eval may raise any of these on malformed input; for
        # example "[{[]}]" raises TypeError because a list is unhashable.
        return []

    # "[1], [2]" passes the bracket check but evaluates to a tuple.
    return parsed if isinstance(parsed, list) else []
def get_top_items(series):
    """Build a frequency table of the items held in a series of lists.

    Each list is exploded into one row per item, missing values are dropped,
    and the items are lower-cased and stripped before being counted.

    Returns:
        A DataFrame with the columns ``item`` and ``count``, in the order
        produced by ``value_counts``.
    """
    cleaned = series.explode().dropna().str.lower().str.strip()
    tally = cleaned.value_counts().reset_index()
    tally.columns = ['item', 'count']
    return tally
def get_co_occurrence(df, column, top_n=100):
    """Count how often two items appear together in the same row.

    Items are lower-cased, stripped and de-duplicated per row, then every
    unordered pair of the sorted items is counted once for that row.

    Args:
        df: DataFrame whose ``column`` holds a list of strings per row.
        column: Name of the column to analyse.
        top_n: Number of most frequent pairs to keep.

    Returns:
        A DataFrame with the columns ``item1``, ``item2`` and ``count``,
        most frequent pair first. It is empty (but still has those three
        columns) when no row contains at least two distinct items.
    """
    pair_counts = Counter()
    for items in df[column]:
        # Cells that are not collections (e.g. NaN) cannot contribute pairs.
        if not isinstance(items, (list, tuple, set)):
            continue
        normalized = sorted(
            {item.lower().strip() for item in items if isinstance(item, str)}
        )
        # Fewer than two distinct items yields no combinations, so no
        # separate pre-filter on row size is needed.
        pair_counts.update(itertools.combinations(normalized, 2))

    # Building the frame from flat rows keeps the three columns present even
    # when there are no pairs; splitting a 'pair' column afterwards raises
    # "Columns must be same length as key" on an empty result.
    rows = [
        (first, second, count)
        for (first, second), count in pair_counts.most_common(top_n)
    ]
    return pd.DataFrame(rows, columns=['item1', 'item2', 'count'])
def parse_experience(exp_list):
    """Extract a years-of-experience figure from a list of experience strings.

    The first entry that contains a number decides the result, and within it
    the first number is used (so "3-5 years" yields 3.0). The value is
    converted from months to years when the first unit word following that
    number is "month"; if no unit word follows the number, the value counts
    as months when "month" appears anywhere in the entry.

    Args:
        exp_list: List of experience strings.

    Returns:
        The experience in years as a float, or ``None`` when ``exp_list`` is
        not a non-empty list or none of its entries contains a number.
    """
    if not isinstance(exp_list, list) or not exp_list:
        return None

    for entry in exp_list:
        text = str(entry).lower()
        number = re.search(r'\d+\.?\d*', text)
        if number is None:
            continue

        value = float(number.group())
        # Decide the unit from what follows the number, so that
        # "1 year 6 months" is read as 1 year rather than 1 month.
        unit = re.search(r'month|year|yr', text[number.end():])
        if unit is not None:
            in_months = unit.group() == 'month'
        else:
            in_months = 'month' in text
        return value / 12.0 if in_months else value

    return None
def _summarize_experience(frame):
    """Return ``(average_years, distribution_records)`` for ``frame``.

    Rows without a ``min_years`` value are ignored. The average is taken over
    all remaining rows; the distribution only buckets rows with at least one
    year, truncated to whole years. When no row has a value the result is
    ``(None, [])`` so the output never contains a NaN, which ``json.dump``
    would write as the non-standard token ``NaN``.
    """
    with_years = frame.dropna(subset=['min_years'])
    if with_years.empty:
        return None, []

    at_least_one_year = with_years[with_years['min_years'] >= 1]
    distribution = (
        at_least_one_year['min_years']
        .astype(int)
        .value_counts()
        .sort_index()
        .reset_index()
    )
    distribution.columns = ['year', 'count']
    average = float(with_years['min_years'].mean())
    return average, distribution.to_dict(orient='records')


def main():
    """Aggregate market insights from the input CSV into a JSON file.

    Reads ``INPUT_PATH``, converts the stringified list columns into lists,
    and computes top skills/tools, skill/tool co-occurrence and experience
    statistics for the whole data set and for each ``cmo_role_match`` value.
    The result is written to ``OUTPUT_PATH``. If the input file does not
    exist, an error is printed and nothing is written.
    """
    print("--- Starting Market Insight Pre-computation ---")

    # --- Load and Prepare Data ---
    print(f"Loading data from {INPUT_PATH}...")
    if not os.path.exists(INPUT_PATH):
        print(f"ERROR: Input file not found at {INPUT_PATH}")
        return

    df = pd.read_csv(INPUT_PATH)
    print(f"Data loaded. Found {len(df)} records.")

    print("Converting stringified lists to actual lists...")
    for col in ['extracted_skills', 'extracted_tools', 'extracted_experience']:
        df[col] = df[col].apply(safe_literal_eval)

    # --- Master Data Structure ---
    insights = {
        "overall_market": {},
        "by_role": {}
    }

    # --- Overall Market Analysis ---
    print("Analyzing overall market...")
    overall = insights["overall_market"]

    # Skills
    overall_skills = get_top_items(df['extracted_skills'])
    overall["top_skills"] = overall_skills.rename(columns={'item': 'skill'}).to_dict(orient='records')

    # Tools
    overall_tools = get_top_items(df['extracted_tools'])
    overall["top_tools"] = overall_tools.rename(columns={'item': 'tool'}).to_dict(orient='records')

    # Skill Co-occurrence
    overall_skill_co = get_co_occurrence(df, 'extracted_skills')
    overall["skill_co_occurrence"] = overall_skill_co.rename(
        columns={'item1': 'skill_A', 'item2': 'skill_B'}
    ).to_dict(orient='records')

    # Tool Co-occurrence
    overall_tool_co = get_co_occurrence(df, 'extracted_tools')
    overall["tool_co_occurrence"] = overall_tool_co.rename(
        columns={'item1': 'tool_A', 'item2': 'tool_B'}
    ).to_dict(orient='records')

    # Experience
    df['min_years'] = df['extracted_experience'].apply(parse_experience)
    overall_average, overall_distribution = _summarize_experience(df)
    overall["experience_distribution"] = overall_distribution
    overall["average_experience"] = overall_average

    # Job Role Distribution
    role_counts = df['cmo_role_match'].value_counts().reset_index()
    role_counts.columns = ['cmo_role_match', 'count']
    insights["job_role_distribution"] = role_counts.to_dict(orient='records')

    # --- Per Role Analysis ---
    print("Analyzing data for each role...")
    # value_counts() above already leaves missing roles out; dropping them
    # here too avoids an entry keyed by NaN that would match no rows.
    roles = df['cmo_role_match'].dropna().unique()
    for role in roles:
        print(f"- Processing {role}...")
        role_df = df[df['cmo_role_match'] == role].copy()
        role_insights = {}
        insights["by_role"][role] = role_insights

        # Skills
        role_skills = get_top_items(role_df['extracted_skills'])
        if not role_skills.empty:
            role_skills['cmo_role_match'] = role
            role_insights["top_skills"] = role_skills.rename(columns={'item': 'skill'}).to_dict(orient='records')

        # Tools
        role_tools = get_top_items(role_df['extracted_tools'])
        if not role_tools.empty:
            role_tools['cmo_role_match'] = role
            role_insights["top_tools"] = role_tools.rename(columns={'item': 'tool'}).to_dict(orient='records')

        # Skill Co-occurrence
        role_skill_co = get_co_occurrence(role_df, 'extracted_skills')
        role_insights["skill_co_occurrence"] = role_skill_co.rename(
            columns={'item1': 'skill_A', 'item2': 'skill_B'}
        ).to_dict(orient='records')

        # Tool Co-occurrence
        role_tool_co = get_co_occurrence(role_df, 'extracted_tools')
        role_insights["tool_co_occurrence"] = role_tool_co.rename(
            columns={'item1': 'tool_A', 'item2': 'tool_B'}
        ).to_dict(orient='records')

        # Experience
        role_average, role_distribution = _summarize_experience(role_df)
        role_insights["average_experience"] = role_average
        role_insights["experience_distribution"] = role_distribution

    # --- Save to JSON ---
    print(f"Saving aggregated insights to {OUTPUT_PATH}...")
    with open(OUTPUT_PATH, 'w') as f:
        json.dump(insights, f, indent=4)

    print("--- Pre-computation Finished Successfully! ---")


if __name__ == "__main__":
    main()