File size: 6,835 Bytes
567b16c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167

import pandas as pd
import ast
import re
import itertools
from collections import Counter
import json
import os

# --- Configuration ---
# Input: CSV produced by an upstream entity-extraction step (columns include
# stringified lists 'extracted_skills'/'extracted_tools'/'extracted_experience'
# and a 'cmo_role_match' label — see main()).
INPUT_FILE = 'market_data_with_entities.csv'
# Output: aggregated insights consumed downstream as JSON.
OUTPUT_FILE = 'market_insights.json'
# Resolve both paths relative to this script's directory so the script can be
# run from any working directory.
OUTPUT_DIR = os.path.dirname(os.path.abspath(__file__))
INPUT_PATH = os.path.join(OUTPUT_DIR, INPUT_FILE)
OUTPUT_PATH = os.path.join(OUTPUT_DIR, OUTPUT_FILE)

def safe_literal_eval(s):
    """Parse a stringified Python list, returning [] on any failure.

    Only values that are strings shaped like list literals ('[...]') are
    parsed at all; anything else (NaN floats from pandas, malformed text,
    unparseable literals) falls through to an empty list.
    """
    looks_like_list = (
        isinstance(s, str) and s.startswith('[') and s.endswith(']')
    )
    if not looks_like_list:
        return []
    try:
        return ast.literal_eval(s)
    except (ValueError, SyntaxError):
        return []

def get_top_items(series):
    """Return frequency counts for a list-valued series.

    Each list is exploded to one row per item, items are normalized
    (stripped, lowercased), and the result is a DataFrame with columns
    ['item', 'count'] sorted by descending frequency.
    """
    exploded = series.explode().dropna()
    normalized = exploded.str.lower().str.strip()
    return (
        normalized.value_counts()
        .rename_axis('item')
        .reset_index(name='count')
    )

def get_co_occurrence(df, column, top_n=100):
    """Count how often normalized item pairs appear together in a list column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding a list-of-strings column.
    column : str
        Name of the list-valued column to analyze.
    top_n : int, optional
        Maximum number of most frequent pairs to return (default 100).

    Returns
    -------
    pandas.DataFrame
        Columns ['item1', 'item2', 'count']; may be empty.
    """
    # Only rows with at least two distinct raw items can contribute a pair.
    eligible = df[df[column].apply(lambda x: len(set(x))) >= 2]
    # Normalize (lowercase, strip) and sort so ('a','b') and ('b','a')
    # collapse onto the same pair key.
    normalized = eligible[column].apply(
        lambda items: sorted({i.lower().strip() for i in items})
    )
    pairs = normalized.apply(lambda x: list(itertools.combinations(x, 2)))
    pair_counts = Counter(pairs.explode().dropna())
    most_common_pairs = pair_counts.most_common(top_n)

    # BUG FIX: with no pairs at all (empty input, or every row collapsing to
    # <2 normalized items), splitting the 'pair' column below raised
    # ValueError ("Columns must be same length as key"). Return an empty
    # frame with the documented columns instead.
    if not most_common_pairs:
        return pd.DataFrame(columns=['item1', 'item2', 'count'])

    results = pd.DataFrame(most_common_pairs, columns=['pair', 'count'])
    results[['item1', 'item2']] = pd.DataFrame(
        results['pair'].tolist(), index=results.index
    )
    return results[['item1', 'item2', 'count']]

def parse_experience(exp_list):
    """Extract the first numeric experience value, in years, from a list of strings.

    Scans each string in order and returns the first number found; values in
    strings mentioning 'month' are divided by 12. Returns None when the input
    is not a non-empty list or no string contains a number.
    """
    if not isinstance(exp_list, list) or len(exp_list) == 0:
        return None
    for raw in exp_list:
        text = str(raw).lower()
        match = re.search(r'\d+\.?\d*', text)
        if match is None:
            continue
        years = float(match.group(0))
        # Heuristic: "18 months" style strings are stored in months.
        if 'month' in text:
            years /= 12.0
        return years
    return None

def main():
    """Aggregate market insights from an entity-extracted CSV into one JSON file.

    Reads INPUT_PATH, computes overall and per-role frequency/co-occurrence/
    experience statistics, and writes the nested result to OUTPUT_PATH.
    Returns early (after printing an error) if the input file is missing.
    """
    print("--- Starting Market Insight Pre-computation ---")

    # --- Load and Prepare Data ---
    print(f"Loading data from {INPUT_PATH}...")
    if not os.path.exists(INPUT_PATH):
        print(f"ERROR: Input file not found at {INPUT_PATH}")
        return
        
    df = pd.read_csv(INPUT_PATH)
    print(f"Data loaded. Found {len(df)} records.")

    # The extracted_* columns arrive as stringified Python lists (CSV cannot
    # hold real lists); convert them so explode()/apply() work on them.
    print("Converting stringified lists to actual lists...")
    for col in ['extracted_skills', 'extracted_tools', 'extracted_experience']:
        df[col] = df[col].apply(safe_literal_eval)

    # --- Master Data Structure ---
    # Everything computed below is accumulated here and dumped as JSON.
    insights = {
        "overall_market": {},
        "by_role": {}
    }

    # --- Overall Market Analysis ---
    print("Analyzing overall market...")
    # Skills
    overall_skills = get_top_items(df['extracted_skills'])
    insights["overall_market"]["top_skills"] = overall_skills.rename(columns={'item': 'skill'}).to_dict(orient='records')
    
    # Tools
    overall_tools = get_top_items(df['extracted_tools'])
    insights["overall_market"]["top_tools"] = overall_tools.rename(columns={'item': 'tool'}).to_dict(orient='records')

    # Skill Co-occurrence
    overall_skill_co = get_co_occurrence(df, 'extracted_skills')
    insights["overall_market"]["skill_co_occurrence"] = overall_skill_co.rename(columns={'item1': 'skill_A', 'item2': 'skill_B'}).to_dict(orient='records')

    # Tool Co-occurrence
    overall_tool_co = get_co_occurrence(df, 'extracted_tools')
    insights["overall_market"]["tool_co_occurrence"] = overall_tool_co.rename(columns={'item1': 'tool_A', 'item2': 'tool_B'}).to_dict(orient='records')

    # Experience
    # NOTE: 'min_years' is added to df here and reused by the per-role loop
    # below (role_df inherits the column) — keep this ordering.
    df['min_years'] = df['extracted_experience'].apply(parse_experience)
    exp_df = df.dropna(subset=['min_years'])
    # Distribution excludes sub-1-year values, but the average below is taken
    # over ALL parsed values — an intentional-looking asymmetry; confirm.
    exp_df_filtered = exp_df[exp_df['min_years'] >= 1]
    
    exp_dist = exp_df_filtered['min_years'].astype(int).value_counts().sort_index().reset_index()
    exp_dist.columns = ['year', 'count']
    insights["overall_market"]["experience_distribution"] = exp_dist.to_dict(orient='records')
    insights["overall_market"]["average_experience"] = exp_df['min_years'].mean()

    # Job Role Distribution
    # NOTE(review): stored at the top level, not under "overall_market" —
    # presumably deliberate for the consumer; verify.
    role_counts = df['cmo_role_match'].value_counts().reset_index()
    role_counts.columns = ['cmo_role_match', 'count']
    insights["job_role_distribution"] = role_counts.to_dict(orient='records')

    # --- Per Role Analysis ---
    # Repeats the overall analysis restricted to each role's rows.
    print("Analyzing data for each role...")
    roles = df['cmo_role_match'].unique()
    for role in roles:
        print(f"- Processing {role}...")
        role_df = df[df['cmo_role_match'] == role].copy()
        insights["by_role"][role] = {}

        # Skills
        role_skills = get_top_items(role_df['extracted_skills'])
        # Tag each record with its role so the flat per-role lists stay
        # self-describing once serialized.
        if not role_skills.empty:
            role_skills['cmo_role_match'] = role
        insights["by_role"][role]["top_skills"] = role_skills.rename(columns={'item': 'skill'}).to_dict(orient='records')

        # Tools
        role_tools = get_top_items(role_df['extracted_tools'])
        if not role_tools.empty:
            role_tools['cmo_role_match'] = role
        insights["by_role"][role]["top_tools"] = role_tools.rename(columns={'item': 'tool'}).to_dict(orient='records')

        # Skill Co-occurrence
        role_skill_co = get_co_occurrence(role_df, 'extracted_skills')
        insights["by_role"][role]["skill_co_occurrence"] = role_skill_co.rename(columns={'item1': 'skill_A', 'item2': 'skill_B'}).to_dict(orient='records')

        # Tool Co-occurrence
        role_tool_co = get_co_occurrence(role_df, 'extracted_tools')
        insights["by_role"][role]["tool_co_occurrence"] = role_tool_co.rename(columns={'item1': 'tool_A', 'item2': 'tool_B'}).to_dict(orient='records')
        
        # Experience
        # Same >=1-year filter for the distribution, unfiltered mean, as in
        # the overall section above.
        role_exp_df = role_df.dropna(subset=['min_years'])
        role_exp_df_filtered = role_exp_df[role_exp_df['min_years'] >= 1]
        
        if not role_exp_df.empty:
            insights["by_role"][role]["average_experience"] = role_exp_df['min_years'].mean()
            
            role_exp_dist = role_exp_df_filtered['min_years'].astype(int).value_counts().sort_index().reset_index()
            role_exp_dist.columns = ['year', 'count']
            insights["by_role"][role]["experience_distribution"] = role_exp_dist.to_dict(orient='records')
        else:
            # No parseable experience strings for this role.
            insights["by_role"][role]["average_experience"] = None
            insights["by_role"][role]["experience_distribution"] = []


    # --- Save to JSON ---
    print(f"Saving aggregated insights to {OUTPUT_PATH}...")
    with open(OUTPUT_PATH, 'w') as f:
        json.dump(insights, f, indent=4)

    print("--- Pre-computation Finished Successfully! ---")

if __name__ == "__main__":
    main()