# NOTE: removed non-code scrape artifacts (file-viewer status lines, file
# size, commit hash, and line-number gutter) that preceded this script.
import pandas as pd
import ast
import re
import itertools
from collections import Counter
import json
import os
# --- Configuration ---
# All paths are anchored to this script's directory so the job can be
# launched from any working directory.
_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))

INPUT_FILE = 'market_data_with_entities.csv'
OUTPUT_FILE = 'market_insights.json'
OUTPUT_DIR = _SCRIPT_DIR
INPUT_PATH = os.path.join(OUTPUT_DIR, INPUT_FILE)
OUTPUT_PATH = os.path.join(OUTPUT_DIR, OUTPUT_FILE)
def safe_literal_eval(s):
    """Parse a stringified Python list, returning [] for anything unparsable.

    Only strings that look like list literals ('[...]') are attempted;
    every other value (NaN, None, plain text, malformed literals) yields [].
    """
    looks_like_list = (
        isinstance(s, str) and s.startswith('[') and s.endswith(']')
    )
    if not looks_like_list:
        return []
    try:
        return ast.literal_eval(s)
    except (ValueError, SyntaxError):
        return []
def get_top_items(series):
    """Count normalized (lowercased, stripped) items across a series of lists.

    Returns a DataFrame with columns ['item', 'count'], most frequent first.
    """
    normalized = series.explode().dropna().str.lower().str.strip()
    return (
        normalized.value_counts()
        .reset_index()
        .set_axis(['item', 'count'], axis=1)
    )
def get_co_occurrence(df, column, top_n=100):
    """Count co-occurring item pairs within the lists of ``column``.

    Each row's list is normalized (lowercased, stripped, de-duplicated,
    sorted) and every unordered pair within a row counts once.  Rows with
    fewer than two distinct items are ignored.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``column``.
    column : str
        Name of a column whose values are lists of strings.
    top_n : int, optional
        Number of most frequent pairs to return (default 100).

    Returns
    -------
    pandas.DataFrame
        Columns ['item1', 'item2', 'count'], most frequent first.  Empty
        (but correctly shaped) when no row yields a pair.
    """
    eligible = df[df[column].apply(lambda x: len(set(x)) >= 2)]
    normalized = eligible[column].apply(
        # Sorting guarantees each unordered pair appears in one canonical
        # orientation, so combinations() never double-counts (b, a) vs (a, b).
        lambda items: sorted({i.lower().strip() for i in items})
    )
    pair_counts = Counter()
    for items in normalized:
        pair_counts.update(itertools.combinations(items, 2))
    most_common = pair_counts.most_common(top_n)
    # BUG FIX: the original crashed when no pairs existed — assigning the
    # split columns from an empty DataFrame raises a length-mismatch error.
    if not most_common:
        return pd.DataFrame(columns=['item1', 'item2', 'count'])
    return pd.DataFrame(
        [(a, b, count) for (a, b), count in most_common],
        columns=['item1', 'item2', 'count'],
    )
def parse_experience(exp_list):
    """Extract a years-of-experience figure from the first parsable string.

    The first number found in the first string that contains one is used;
    strings mentioning 'month' are converted to fractional years.  Returns
    None for non-list input or when no string contains a number.
    """
    if not isinstance(exp_list, list):
        return None
    for raw in exp_list:
        text = str(raw).lower()
        match = re.search(r'\d+\.?\d*', text)
        if match is None:
            continue
        years = float(match.group())
        if 'month' in text:
            years /= 12.0
        return years
    return None
def main():
    """Aggregate job-market entity data into a single JSON insights file.

    Reads INPUT_PATH (a CSV whose entity columns hold stringified lists),
    computes overall and per-role top skills/tools, co-occurrence pairs and
    experience statistics, and writes everything to OUTPUT_PATH as JSON.
    """
    print("--- Starting Market Insight Pre-computation ---")

    # --- Load and Prepare Data ---
    print(f"Loading data from {INPUT_PATH}...")
    if not os.path.exists(INPUT_PATH):
        print(f"ERROR: Input file not found at {INPUT_PATH}")
        return
    df = pd.read_csv(INPUT_PATH)
    print(f"Data loaded. Found {len(df)} records.")

    # CSV round-tripping stores list columns as strings; restore real lists
    # (unparsable/missing cells become [] via safe_literal_eval).
    print("Converting stringified lists to actual lists...")
    for col in ['extracted_skills', 'extracted_tools', 'extracted_experience']:
        df[col] = df[col].apply(safe_literal_eval)

    # --- Master Data Structure ---
    # "job_role_distribution" is added as a third top-level key further down.
    insights = {
        "overall_market": {},
        "by_role": {}
    }

    # --- Overall Market Analysis ---
    print("Analyzing overall market...")
    # Skills
    overall_skills = get_top_items(df['extracted_skills'])
    insights["overall_market"]["top_skills"] = overall_skills.rename(columns={'item': 'skill'}).to_dict(orient='records')
    # Tools
    overall_tools = get_top_items(df['extracted_tools'])
    insights["overall_market"]["top_tools"] = overall_tools.rename(columns={'item': 'tool'}).to_dict(orient='records')
    # Skill Co-occurrence
    overall_skill_co = get_co_occurrence(df, 'extracted_skills')
    insights["overall_market"]["skill_co_occurrence"] = overall_skill_co.rename(columns={'item1': 'skill_A', 'item2': 'skill_B'}).to_dict(orient='records')
    # Tool Co-occurrence
    overall_tool_co = get_co_occurrence(df, 'extracted_tools')
    insights["overall_market"]["tool_co_occurrence"] = overall_tool_co.rename(columns={'item1': 'tool_A', 'item2': 'tool_B'}).to_dict(orient='records')

    # Experience: min_years is derived once here and reused by the per-role
    # loop below (role_df slices inherit the column).
    df['min_years'] = df['extracted_experience'].apply(parse_experience)
    exp_df = df.dropna(subset=['min_years'])
    # The distribution excludes sub-1-year values (presumably to drop a noisy
    # 0-year bucket — TODO confirm), but the average below includes them.
    exp_df_filtered = exp_df[exp_df['min_years'] >= 1]
    exp_dist = exp_df_filtered['min_years'].astype(int).value_counts().sort_index().reset_index()
    exp_dist.columns = ['year', 'count']
    insights["overall_market"]["experience_distribution"] = exp_dist.to_dict(orient='records')
    # NOTE(review): mean() is NaN when exp_df is empty, and json.dump emits a
    # bare NaN token (not strict JSON) — confirm downstream parsers accept it.
    insights["overall_market"]["average_experience"] = exp_df['min_years'].mean()

    # Job Role Distribution
    role_counts = df['cmo_role_match'].value_counts().reset_index()
    role_counts.columns = ['cmo_role_match', 'count']
    insights["job_role_distribution"] = role_counts.to_dict(orient='records')

    # --- Per Role Analysis ---
    # Same metrics as the overall section, computed per unique role value.
    print("Analyzing data for each role...")
    roles = df['cmo_role_match'].unique()
    for role in roles:
        print(f"- Processing {role}...")
        role_df = df[df['cmo_role_match'] == role].copy()
        insights["by_role"][role] = {}
        # Skills (role column is attached to the frame, though only the
        # renamed records are stored)
        role_skills = get_top_items(role_df['extracted_skills'])
        if not role_skills.empty:
            role_skills['cmo_role_match'] = role
        insights["by_role"][role]["top_skills"] = role_skills.rename(columns={'item': 'skill'}).to_dict(orient='records')
        # Tools
        role_tools = get_top_items(role_df['extracted_tools'])
        if not role_tools.empty:
            role_tools['cmo_role_match'] = role
        insights["by_role"][role]["top_tools"] = role_tools.rename(columns={'item': 'tool'}).to_dict(orient='records')
        # Skill Co-occurrence
        role_skill_co = get_co_occurrence(role_df, 'extracted_skills')
        insights["by_role"][role]["skill_co_occurrence"] = role_skill_co.rename(columns={'item1': 'skill_A', 'item2': 'skill_B'}).to_dict(orient='records')
        # Tool Co-occurrence
        role_tool_co = get_co_occurrence(role_df, 'extracted_tools')
        insights["by_role"][role]["tool_co_occurrence"] = role_tool_co.rename(columns={'item1': 'tool_A', 'item2': 'tool_B'}).to_dict(orient='records')
        # Experience: mirrors the overall logic — average over all parsed
        # values, distribution over the >= 1 year subset only.
        role_exp_df = role_df.dropna(subset=['min_years'])
        role_exp_df_filtered = role_exp_df[role_exp_df['min_years'] >= 1]
        if not role_exp_df.empty:
            insights["by_role"][role]["average_experience"] = role_exp_df['min_years'].mean()
            role_exp_dist = role_exp_df_filtered['min_years'].astype(int).value_counts().sort_index().reset_index()
            role_exp_dist.columns = ['year', 'count']
            insights["by_role"][role]["experience_distribution"] = role_exp_dist.to_dict(orient='records')
        else:
            insights["by_role"][role]["average_experience"] = None
            insights["by_role"][role]["experience_distribution"] = []

    # --- Save to JSON ---
    print(f"Saving aggregated insights to {OUTPUT_PATH}...")
    with open(OUTPUT_PATH, 'w') as f:
        json.dump(insights, f, indent=4)
    print("--- Pre-computation Finished Successfully! ---")
# Run the pre-computation only when executed as a script (not on import).
if __name__ == "__main__":
    main()