import ast
import itertools
import json
import os
import re
from collections import Counter

import pandas as pd

# --- Configuration ---
INPUT_FILE = 'market_data_with_entities.csv'
OUTPUT_FILE = 'market_insights.json'
OUTPUT_DIR = os.path.dirname(os.path.abspath(__file__))
INPUT_PATH = os.path.join(OUTPUT_DIR, INPUT_FILE)
OUTPUT_PATH = os.path.join(OUTPUT_DIR, OUTPUT_FILE)


def safe_literal_eval(s):
    """Parse a stringified Python list (e.g. "['a', 'b']") into a real list.

    Returns [] for anything that is not a well-formed list literal (NaN,
    malformed strings, non-strings), so downstream explode/co-occurrence
    code can rely on every cell being a list.
    """
    try:
        if isinstance(s, str) and s.startswith('[') and s.endswith(']'):
            return ast.literal_eval(s)
    except (ValueError, SyntaxError):
        pass
    return []


def get_top_items(series):
    """Return a ['item', 'count'] frequency table for a list-valued Series.

    Items are lower-cased and stripped so 'SQL' and ' sql ' count as one.
    """
    all_items = series.explode().dropna()
    all_items = all_items.str.lower().str.strip()
    counts = all_items.value_counts().reset_index()
    counts.columns = ['item', 'count']
    return counts


def get_co_occurrence(df, column, top_n=100):
    """Count the top_n most frequent unordered item pairs within `column`.

    Only rows with at least two distinct items can contribute. Items are
    normalized (lowercase, stripped) and sorted so (a, b) and (b, a)
    collapse to one pair. Returns a ['item1', 'item2', 'count'] DataFrame.
    """
    eligible = df[df[column].apply(lambda x: len(set(x))) >= 2].copy()
    norm_col = f'{column}_normalized'
    eligible[norm_col] = eligible[column].apply(
        lambda items: sorted(set(i.lower().strip() for i in items))
    )
    pairs = eligible[norm_col].apply(lambda x: list(itertools.combinations(x, 2)))
    pair_counts = Counter(pairs.explode().dropna())
    most_common_pairs = pair_counts.most_common(top_n)
    # BUGFIX: when no pairs exist (e.g. a small per-role slice), the original
    # two-column assignment from an empty frame raised
    # "Columns must be same length as key". Return a well-shaped empty frame.
    if not most_common_pairs:
        return pd.DataFrame(columns=['item1', 'item2', 'count'])
    results = pd.DataFrame(most_common_pairs, columns=['pair', 'count'])
    results[['item1', 'item2']] = pd.DataFrame(
        results['pair'].tolist(), index=results.index
    )
    return results[['item1', 'item2', 'count']]


def parse_experience(exp_list):
    """Extract a minimum years-of-experience value from extracted phrases.

    Scans the list for the first string containing a number; the first number
    in that string is taken as the value. Phrases mentioning 'month' are
    converted to years. Returns None when nothing numeric is found or the
    input is not a non-empty list.
    """
    if not isinstance(exp_list, list) or not exp_list:
        return None
    for exp_string in exp_list:
        exp_string = str(exp_string).lower()
        numbers = re.findall(r'\d+\.?\d*', exp_string)
        if not numbers:
            continue
        val = float(numbers[0])
        return val / 12.0 if 'month' in exp_string else val
    return None


def main():
    """Aggregate market insights from the entity-extraction CSV into JSON.

    Reads INPUT_PATH, computes overall and per-role skill/tool frequencies,
    co-occurrence pairs, and experience statistics, then writes the nested
    result dict to OUTPUT_PATH.
    """
    print("--- Starting Market Insight Pre-computation ---")

    # --- Load and Prepare Data ---
    print(f"Loading data from {INPUT_PATH}...")
    if not os.path.exists(INPUT_PATH):
        print(f"ERROR: Input file not found at {INPUT_PATH}")
        return
    df = pd.read_csv(INPUT_PATH)
    print(f"Data loaded. Found {len(df)} records.")

    print("Converting stringified lists to actual lists...")
    for col in ['extracted_skills', 'extracted_tools', 'extracted_experience']:
        df[col] = df[col].apply(safe_literal_eval)

    # --- Master Data Structure ---
    insights = {
        "overall_market": {},
        "by_role": {}
    }

    # --- Overall Market Analysis ---
    print("Analyzing overall market...")
    # Skills
    overall_skills = get_top_items(df['extracted_skills'])
    insights["overall_market"]["top_skills"] = overall_skills.rename(
        columns={'item': 'skill'}).to_dict(orient='records')

    # Tools
    overall_tools = get_top_items(df['extracted_tools'])
    insights["overall_market"]["top_tools"] = overall_tools.rename(
        columns={'item': 'tool'}).to_dict(orient='records')

    # Skill Co-occurrence
    overall_skill_co = get_co_occurrence(df, 'extracted_skills')
    insights["overall_market"]["skill_co_occurrence"] = overall_skill_co.rename(
        columns={'item1': 'skill_A', 'item2': 'skill_B'}).to_dict(orient='records')

    # Tool Co-occurrence
    overall_tool_co = get_co_occurrence(df, 'extracted_tools')
    insights["overall_market"]["tool_co_occurrence"] = overall_tool_co.rename(
        columns={'item1': 'tool_A', 'item2': 'tool_B'}).to_dict(orient='records')

    # Experience: the distribution counts only postings asking for >= 1 year,
    # but the average includes every value that parsed to a number.
    df['min_years'] = df['extracted_experience'].apply(parse_experience)
    exp_df = df.dropna(subset=['min_years'])
    exp_df_filtered = exp_df[exp_df['min_years'] >= 1]
    exp_dist = exp_df_filtered['min_years'].astype(int).value_counts().sort_index().reset_index()
    exp_dist.columns = ['year', 'count']
    insights["overall_market"]["experience_distribution"] = exp_dist.to_dict(orient='records')
    # BUGFIX: mean() is NaN when nothing parsed; NaN is not valid JSON.
    # Emit None instead, and cast to a plain float otherwise.
    avg_exp = exp_df['min_years'].mean()
    insights["overall_market"]["average_experience"] = (
        None if pd.isna(avg_exp) else float(avg_exp)
    )

    # Job Role Distribution
    role_counts = df['cmo_role_match'].value_counts().reset_index()
    role_counts.columns = ['cmo_role_match', 'count']
    insights["job_role_distribution"] = role_counts.to_dict(orient='records')

    # --- Per Role Analysis ---
    print("Analyzing data for each role...")
    # BUGFIX: skip NaN role labels — NaN never equals itself, so the slice
    # below would always be empty, and a float NaN makes a poor JSON key.
    roles = df['cmo_role_match'].dropna().unique()
    for role in roles:
        print(f"- Processing {role}...")
        role_df = df[df['cmo_role_match'] == role].copy()
        insights["by_role"][role] = {}

        # Skills
        role_skills = get_top_items(role_df['extracted_skills'])
        if not role_skills.empty:
            role_skills['cmo_role_match'] = role
        insights["by_role"][role]["top_skills"] = role_skills.rename(
            columns={'item': 'skill'}).to_dict(orient='records')

        # Tools
        role_tools = get_top_items(role_df['extracted_tools'])
        if not role_tools.empty:
            role_tools['cmo_role_match'] = role
        insights["by_role"][role]["top_tools"] = role_tools.rename(
            columns={'item': 'tool'}).to_dict(orient='records')

        # Skill Co-occurrence
        role_skill_co = get_co_occurrence(role_df, 'extracted_skills')
        insights["by_role"][role]["skill_co_occurrence"] = role_skill_co.rename(
            columns={'item1': 'skill_A', 'item2': 'skill_B'}).to_dict(orient='records')

        # Tool Co-occurrence
        role_tool_co = get_co_occurrence(role_df, 'extracted_tools')
        insights["by_role"][role]["tool_co_occurrence"] = role_tool_co.rename(
            columns={'item1': 'tool_A', 'item2': 'tool_B'}).to_dict(orient='records')

        # Experience
        role_exp_df = role_df.dropna(subset=['min_years'])
        role_exp_df_filtered = role_exp_df[role_exp_df['min_years'] >= 1]
        if not role_exp_df.empty:
            insights["by_role"][role]["average_experience"] = float(
                role_exp_df['min_years'].mean()
            )
            role_exp_dist = role_exp_df_filtered['min_years'].astype(int).value_counts().sort_index().reset_index()
            role_exp_dist.columns = ['year', 'count']
            insights["by_role"][role]["experience_distribution"] = role_exp_dist.to_dict(orient='records')
        else:
            insights["by_role"][role]["average_experience"] = None
            insights["by_role"][role]["experience_distribution"] = []

    # --- Save to JSON ---
    print(f"Saving aggregated insights to {OUTPUT_PATH}...")
    with open(OUTPUT_PATH, 'w') as f:
        json.dump(insights, f, indent=4)
    print("--- Pre-computation Finished Successfully! ---")


if __name__ == "__main__":
    main()