import pandas as pd
import ast
import re
import itertools
from collections import Counter
import json
import os

# --- Configuration ---
INPUT_FILE = 'market_data_with_entities.csv'
OUTPUT_FILE = 'market_insights.json'
OUTPUT_DIR = os.path.dirname(os.path.abspath(__file__))
INPUT_PATH = os.path.join(OUTPUT_DIR, INPUT_FILE)
OUTPUT_PATH = os.path.join(OUTPUT_DIR, OUTPUT_FILE)
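
# Note: paths are resolved relative to this script's directory (not the current
# working directory), so the job behaves the same wherever it is launched from.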

def safe_literal_eval(s):
    """Safely parses a stringified list; returns [] for anything malformed."""
    try:
        if isinstance(s, str) and s.startswith('[') and s.endswith(']'):
            return ast.literal_eval(s)
    except (ValueError, SyntaxError):
        pass
    return []
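
# Illustrative note (assumed input shape): the entity columns in the CSV hold
# stringified lists, e.g. safe_literal_eval("['sql', 'tableau']") -> ['sql', 'tableau'],
# while NaN or free text falls through to [] so downstream .explode() calls never raise.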

def get_top_items(series):
    """Calculates value counts for an exploded series."""
    all_items = series.explode().dropna()
    all_items = all_items.str.lower().str.strip()
    counts = all_items.value_counts().reset_index()
    counts.columns = ['item', 'count']
    return counts
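
# Illustrative example: get_top_items(pd.Series([['SQL', 'Excel'], ['sql']]))
# lower/strips the exploded values and returns a DataFrame with rows
# ('sql', 2) and ('excel', 1) under the columns ['item', 'count'].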

def get_co_occurrence(df, column, top_n=100):
    """Calculates co-occurrence for a given column."""
    # Only rows that mention at least two distinct items can contribute a pair.
    co_occurrence_df = df[df[column].apply(lambda x: len(set(x))) >= 2].copy()
    co_occurrence_df[f'{column}_normalized'] = co_occurrence_df[column].apply(
        lambda items: sorted(list(set([i.lower().strip() for i in items])))
    )
    pairs = co_occurrence_df[f'{column}_normalized'].apply(lambda x: list(itertools.combinations(x, 2)))
    pair_counts = Counter(pairs.explode().dropna())
    most_common_pairs = pair_counts.most_common(top_n)
    results = pd.DataFrame(most_common_pairs, columns=['pair', 'count'])
    # Guard against the empty case (no qualifying rows), otherwise the column
    # split below raises a length-mismatch error.
    if results.empty:
        return pd.DataFrame(columns=['item1', 'item2', 'count'])
    results[['item1', 'item2']] = pd.DataFrame(results['pair'].tolist(), index=results.index)
    return results[['item1', 'item2', 'count']]
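
# Illustrative example: a row with ['SQL', 'Excel', 'sql'] normalizes to
# ['excel', 'sql'] and contributes the single pair ('excel', 'sql'); because
# each row is sorted first, ('sql', 'excel') and ('excel', 'sql') are counted
# as the same pair.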

def parse_experience(exp_list):
    """Parses experience strings to find years."""
    if not isinstance(exp_list, list) or not exp_list:
        return None
    for exp_string in exp_list:
        exp_string = str(exp_string).lower()
        numbers = re.findall(r'\d+\.?\d*', exp_string)
        if not numbers:
            continue
        val = float(numbers[0])
        return val / 12.0 if 'month' in exp_string else val
    return None
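
# Illustrative examples: ['5+ years of experience'] -> 5.0, ['18 months'] -> 1.5,
# ['Senior role'] -> None. Only the first number in a string is used, so a range
# like '3-5 years' resolves to its lower bound (3.0).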

def main():
    print("--- Starting Market Insight Pre-computation ---")

    # --- Load and Prepare Data ---
    print(f"Loading data from {INPUT_PATH}...")
    if not os.path.exists(INPUT_PATH):
        print(f"ERROR: Input file not found at {INPUT_PATH}")
        return

    df = pd.read_csv(INPUT_PATH)
    print(f"Data loaded. Found {len(df)} records.")

    print("Converting stringified lists to actual lists...")
    for col in ['extracted_skills', 'extracted_tools', 'extracted_experience']:
        df[col] = df[col].apply(safe_literal_eval)

    # --- Master Data Structure ---
    insights = {
        "overall_market": {},
        "by_role": {}
    }

    # --- Overall Market Analysis ---
    print("Analyzing overall market...")

    # Skills
    overall_skills = get_top_items(df['extracted_skills'])
    insights["overall_market"]["top_skills"] = overall_skills.rename(columns={'item': 'skill'}).to_dict(orient='records')

    # Tools
    overall_tools = get_top_items(df['extracted_tools'])
    insights["overall_market"]["top_tools"] = overall_tools.rename(columns={'item': 'tool'}).to_dict(orient='records')

    # Skill Co-occurrence
    overall_skill_co = get_co_occurrence(df, 'extracted_skills')
    insights["overall_market"]["skill_co_occurrence"] = overall_skill_co.rename(columns={'item1': 'skill_A', 'item2': 'skill_B'}).to_dict(orient='records')

    # Tool Co-occurrence
    overall_tool_co = get_co_occurrence(df, 'extracted_tools')
    insights["overall_market"]["tool_co_occurrence"] = overall_tool_co.rename(columns={'item1': 'tool_A', 'item2': 'tool_B'}).to_dict(orient='records')

    # Experience
    df['min_years'] = df['extracted_experience'].apply(parse_experience)
    exp_df = df.dropna(subset=['min_years'])
    exp_df_filtered = exp_df[exp_df['min_years'] >= 1]
    exp_dist = exp_df_filtered['min_years'].astype(int).value_counts().sort_index().reset_index()
    exp_dist.columns = ['year', 'count']
    insights["overall_market"]["experience_distribution"] = exp_dist.to_dict(orient='records')
    # Cast to a native float: Series.mean() returns numpy.float64, which json.dump cannot serialize.
    insights["overall_market"]["average_experience"] = float(exp_df['min_years'].mean()) if not exp_df.empty else None

    # Job Role Distribution
    role_counts = df['cmo_role_match'].value_counts().reset_index()
    role_counts.columns = ['cmo_role_match', 'count']
    insights["job_role_distribution"] = role_counts.to_dict(orient='records')

    # --- Per Role Analysis ---
    print("Analyzing data for each role...")
    roles = df['cmo_role_match'].unique()
    for role in roles:
        print(f"- Processing {role}...")
        role_df = df[df['cmo_role_match'] == role].copy()
        insights["by_role"][role] = {}

        # Skills
        role_skills = get_top_items(role_df['extracted_skills'])
        if not role_skills.empty:
            role_skills['cmo_role_match'] = role
        insights["by_role"][role]["top_skills"] = role_skills.rename(columns={'item': 'skill'}).to_dict(orient='records')

        # Tools
        role_tools = get_top_items(role_df['extracted_tools'])
        if not role_tools.empty:
            role_tools['cmo_role_match'] = role
        insights["by_role"][role]["top_tools"] = role_tools.rename(columns={'item': 'tool'}).to_dict(orient='records')

        # Skill Co-occurrence
        role_skill_co = get_co_occurrence(role_df, 'extracted_skills')
        insights["by_role"][role]["skill_co_occurrence"] = role_skill_co.rename(columns={'item1': 'skill_A', 'item2': 'skill_B'}).to_dict(orient='records')

        # Tool Co-occurrence
        role_tool_co = get_co_occurrence(role_df, 'extracted_tools')
        insights["by_role"][role]["tool_co_occurrence"] = role_tool_co.rename(columns={'item1': 'tool_A', 'item2': 'tool_B'}).to_dict(orient='records')

        # Experience
        role_exp_df = role_df.dropna(subset=['min_years'])
        role_exp_df_filtered = role_exp_df[role_exp_df['min_years'] >= 1]
        if not role_exp_df.empty:
            # Cast to a native float for JSON serialization (see the overall average above).
            insights["by_role"][role]["average_experience"] = float(role_exp_df['min_years'].mean())
            role_exp_dist = role_exp_df_filtered['min_years'].astype(int).value_counts().sort_index().reset_index()
            role_exp_dist.columns = ['year', 'count']
            insights["by_role"][role]["experience_distribution"] = role_exp_dist.to_dict(orient='records')
        else:
            insights["by_role"][role]["average_experience"] = None
            insights["by_role"][role]["experience_distribution"] = []

    # --- Save to JSON ---
    print(f"Saving aggregated insights to {OUTPUT_PATH}...")
    with open(OUTPUT_PATH, 'w') as f:
        json.dump(insights, f, indent=4)

    print("--- Pre-computation Finished Successfully! ---")


if __name__ == "__main__":
    main()