# NOTE: removed non-code scrape artifacts (file-viewer status lines, file
# size, commit hash, and line-number gutter) that preceded this script.
import pandas as pd
import ast
import re
import itertools
from collections import Counter
import json
import os
# --- Configuration ---
# All paths are anchored to this script's directory so the job can be
# launched from any working directory.
_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))

INPUT_FILE = 'market_data_with_entities.csv'
OUTPUT_FILE = 'market_insights.json'
OUTPUT_DIR = _SCRIPT_DIR
INPUT_PATH = os.path.join(OUTPUT_DIR, INPUT_FILE)
OUTPUT_PATH = os.path.join(OUTPUT_DIR, OUTPUT_FILE)
def safe_literal_eval(s):
    """Parse a stringified Python list, returning [] for anything unparsable.

    Only strings that look like list literals ('[...]') are attempted;
    every other value (NaN, None, plain text, malformed literals) yields [].
    """
    looks_like_list = (
        isinstance(s, str) and s.startswith('[') and s.endswith(']')
    )
    if not looks_like_list:
        return []
    try:
        return ast.literal_eval(s)
    except (ValueError, SyntaxError):
        return []
def get_top_items(series):
    """Count normalized (lowercased, stripped) items across a series of lists.

    Returns a DataFrame with columns ['item', 'count'], most frequent first.
    """
    normalized = series.explode().dropna().str.lower().str.strip()
    return (
        normalized.value_counts()
        .reset_index()
        .set_axis(['item', 'count'], axis=1)
    )
def get_co_occurrence(df, column, top_n=100):
    """Count co-occurring item pairs within the lists of ``column``.

    Each row's list is normalized (lowercased, stripped, de-duplicated,
    sorted) and every unordered pair within a row counts once.  Rows with
    fewer than two distinct items are ignored.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``column``.
    column : str
        Name of a column whose values are lists of strings.
    top_n : int, optional
        Number of most frequent pairs to return (default 100).

    Returns
    -------
    pandas.DataFrame
        Columns ['item1', 'item2', 'count'], most frequent first.  Empty
        (but correctly shaped) when no row yields a pair.
    """
    eligible = df[df[column].apply(lambda x: len(set(x)) >= 2)]
    normalized = eligible[column].apply(
        # Sorting guarantees each unordered pair appears in one canonical
        # orientation, so combinations() never double-counts (b, a) vs (a, b).
        lambda items: sorted({i.lower().strip() for i in items})
    )
    pair_counts = Counter()
    for items in normalized:
        pair_counts.update(itertools.combinations(items, 2))
    most_common = pair_counts.most_common(top_n)
    # BUG FIX: the original crashed when no pairs existed — assigning the
    # split columns from an empty DataFrame raises a length-mismatch error.
    if not most_common:
        return pd.DataFrame(columns=['item1', 'item2', 'count'])
    return pd.DataFrame(
        [(a, b, count) for (a, b), count in most_common],
        columns=['item1', 'item2', 'count'],
    )
def parse_experience(exp_list):
    """Extract a years-of-experience figure from the first parsable string.

    The first number found in the first string that contains one is used;
    strings mentioning 'month' are converted to fractional years.  Returns
    None for non-list input or when no string contains a number.
    """
    if not isinstance(exp_list, list):
        return None
    for raw in exp_list:
        text = str(raw).lower()
        match = re.search(r'\d+\.?\d*', text)
        if match is None:
            continue
        years = float(match.group())
        if 'month' in text:
            years /= 12.0
        return years
    return None
def main():
    """Aggregate job-market entity data into a single JSON insights file.

    Reads INPUT_PATH (a CSV whose entity columns hold stringified lists),
    computes overall and per-role top skills/tools, co-occurrence pairs and
    experience statistics, and writes everything to OUTPUT_PATH as JSON.
    """
    print("--- Starting Market Insight Pre-computation ---")

    # --- Load and Prepare Data ---
    print(f"Loading data from {INPUT_PATH}...")
    if not os.path.exists(INPUT_PATH):
        print(f"ERROR: Input file not found at {INPUT_PATH}")
        return
    df = pd.read_csv(INPUT_PATH)
    print(f"Data loaded. Found {len(df)} records.")

    # CSV round-tripping stores list columns as strings; restore real lists
    # (unparsable/missing cells become [] via safe_literal_eval).
    print("Converting stringified lists to actual lists...")
    for col in ['extracted_skills', 'extracted_tools', 'extracted_experience']:
        df[col] = df[col].apply(safe_literal_eval)

    # --- Master Data Structure ---
    # "job_role_distribution" is added as a third top-level key further down.
    insights = {
        "overall_market": {},
        "by_role": {}
    }

    # --- Overall Market Analysis ---
    print("Analyzing overall market...")
    # Skills
    overall_skills = get_top_items(df['extracted_skills'])
    insights["overall_market"]["top_skills"] = overall_skills.rename(columns={'item': 'skill'}).to_dict(orient='records')
    # Tools
    overall_tools = get_top_items(df['extracted_tools'])
    insights["overall_market"]["top_tools"] = overall_tools.rename(columns={'item': 'tool'}).to_dict(orient='records')
    # Skill Co-occurrence
    overall_skill_co = get_co_occurrence(df, 'extracted_skills')
    insights["overall_market"]["skill_co_occurrence"] = overall_skill_co.rename(columns={'item1': 'skill_A', 'item2': 'skill_B'}).to_dict(orient='records')
    # Tool Co-occurrence
    overall_tool_co = get_co_occurrence(df, 'extracted_tools')
    insights["overall_market"]["tool_co_occurrence"] = overall_tool_co.rename(columns={'item1': 'tool_A', 'item2': 'tool_B'}).to_dict(orient='records')

    # Experience: min_years is derived once here and reused by the per-role
    # loop below (role_df slices inherit the column).
    df['min_years'] = df['extracted_experience'].apply(parse_experience)
    exp_df = df.dropna(subset=['min_years'])
    # The distribution excludes sub-1-year values (presumably to drop a noisy
    # 0-year bucket — TODO confirm), but the average below includes them.
    exp_df_filtered = exp_df[exp_df['min_years'] >= 1]
    exp_dist = exp_df_filtered['min_years'].astype(int).value_counts().sort_index().reset_index()
    exp_dist.columns = ['year', 'count']
    insights["overall_market"]["experience_distribution"] = exp_dist.to_dict(orient='records')
    # NOTE(review): mean() is NaN when exp_df is empty, and json.dump emits a
    # bare NaN token (not strict JSON) — confirm downstream parsers accept it.
    insights["overall_market"]["average_experience"] = exp_df['min_years'].mean()

    # Job Role Distribution
    role_counts = df['cmo_role_match'].value_counts().reset_index()
    role_counts.columns = ['cmo_role_match', 'count']
    insights["job_role_distribution"] = role_counts.to_dict(orient='records')

    # --- Per Role Analysis ---
    # Same metrics as the overall section, computed per unique role value.
    print("Analyzing data for each role...")
    roles = df['cmo_role_match'].unique()
    for role in roles:
        print(f"- Processing {role}...")
        role_df = df[df['cmo_role_match'] == role].copy()
        insights["by_role"][role] = {}
        # Skills (role column is attached to the frame, though only the
        # renamed records are stored)
        role_skills = get_top_items(role_df['extracted_skills'])
        if not role_skills.empty:
            role_skills['cmo_role_match'] = role
        insights["by_role"][role]["top_skills"] = role_skills.rename(columns={'item': 'skill'}).to_dict(orient='records')
        # Tools
        role_tools = get_top_items(role_df['extracted_tools'])
        if not role_tools.empty:
            role_tools['cmo_role_match'] = role
        insights["by_role"][role]["top_tools"] = role_tools.rename(columns={'item': 'tool'}).to_dict(orient='records')
        # Skill Co-occurrence
        role_skill_co = get_co_occurrence(role_df, 'extracted_skills')
        insights["by_role"][role]["skill_co_occurrence"] = role_skill_co.rename(columns={'item1': 'skill_A', 'item2': 'skill_B'}).to_dict(orient='records')
        # Tool Co-occurrence
        role_tool_co = get_co_occurrence(role_df, 'extracted_tools')
        insights["by_role"][role]["tool_co_occurrence"] = role_tool_co.rename(columns={'item1': 'tool_A', 'item2': 'tool_B'}).to_dict(orient='records')
        # Experience: mirrors the overall logic — average over all parsed
        # values, distribution over the >= 1 year subset only.
        role_exp_df = role_df.dropna(subset=['min_years'])
        role_exp_df_filtered = role_exp_df[role_exp_df['min_years'] >= 1]
        if not role_exp_df.empty:
            insights["by_role"][role]["average_experience"] = role_exp_df['min_years'].mean()
            role_exp_dist = role_exp_df_filtered['min_years'].astype(int).value_counts().sort_index().reset_index()
            role_exp_dist.columns = ['year', 'count']
            insights["by_role"][role]["experience_distribution"] = role_exp_dist.to_dict(orient='records')
        else:
            insights["by_role"][role]["average_experience"] = None
            insights["by_role"][role]["experience_distribution"] = []

    # --- Save to JSON ---
    print(f"Saving aggregated insights to {OUTPUT_PATH}...")
    with open(OUTPUT_PATH, 'w') as f:
        json.dump(insights, f, indent=4)
    print("--- Pre-computation Finished Successfully! ---")
# Run the pre-computation only when executed as a script (not on import).
if __name__ == "__main__":
    main()