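"""
Similarity worker: embeds resume text with JobBERT-v2 and ranks job
postings by cosine similarity against precomputed posting embeddings.
Reads the resume from stdin and prints a JSON result object to stdout.
"""
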
import sys
import json
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer, util
import argparse
import pathlib
import re
from bs4 import BeautifulSoup
import emoji
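
# Third-party requirements (for reference; install names assumed from the
# imports above): pandas, torch, sentence-transformers, beautifulsoup4, emoji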

# --- Text Cleaning Function (copied from main.py) ---
def new_refined_text_cleaning(text: str) -> str:
    """The NEW, improved cleaning function. Keeps technical symbols."""
    if not isinstance(text, str):
        return ""
    # Strip HTML tags, keeping only the visible text
    text = BeautifulSoup(text, "html.parser").get_text()
    # Remove URLs (scheme-optional pattern that also matches bare domains)
    url_pattern = r'(?:(?:https?|ftp)://)?(?:\S+(?::\S*)?@)?(?:(?!(?:10|127)(?:\.\d{1,3}){3})(?!(?:169\.254|192\.168)(?:\.\d{1,3}){2})(?!172\.(?:1[6-9]|2\d|3[0-1])(?:\.\d{1,3}){2})(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))|(?:(?:[a-z\u00a1-\uffff0-9]-*)*[a-z\u00a1-\uffff0-9]+)(?:\.(?:[a-z\u00a1-\uffff0-9]-*)*[a-z\u00a1-\uffff0-9]+)*(?:\.(?:[a-z\u00a1-\uffff]{2,})))(?::\d{2,5})?(?:/\S*)?'
    text = re.sub(url_pattern, '', text)
    # Remove email addresses
    text = re.sub(r'\S+@\S+\s?', '', text)
    # Turn emojis into :name: aliases, then strip the aliases entirely
    text = emoji.demojize(text)
    text = re.sub(r':[a-zA-Z_]+:', '', text)
    # Drop backslashes, bullet characters, and templating blocks like {...}
    text = text.replace('\\', ' ')
    text = re.sub(r'[*•]', ' ', text)
    text = re.sub(r'\{.*?\}', ' ', text)
    # Keep letters, digits, whitespace, and common technical symbols
    # (#, +, parentheses, slashes); everything else becomes a space
    text = re.sub(r'[^a-zA-Z0-9_#+()/\s.,!?-]', ' ', text)
    # Collapse runs of whitespace and remove stray spaces before punctuation
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'\s([,.!?-])', r'\1', text)
    return text.strip().lower()
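
# Illustrative example (not from the original source): tags, the URL, the
# emoji, and "&" are stripped, while technical tokens like "C++" survive.
#   new_refined_text_cleaning("<b>Python</b> & C++ dev 🚀 see https://example.com/jobs")
#   -> "python c++ dev see"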

def main():
    """
    Main function to perform similarity search.
    Reads resume text from stdin and target role from args.
    Prints a JSON list of similar jobs to stdout.
    """
    try:
        # 1. Setup paths
        backend_dir = pathlib.Path(__file__).parent.resolve()
        
        # 2. Parse arguments
        parser = argparse.ArgumentParser()
        parser.add_argument("--target_role", type=str, required=True)
        parser.add_argument("--limit", type=int, default=10)
        args = parser.parse_args()
        target_role = args.target_role

        # 3. Read resume text from stdin
        resume_text = sys.stdin.read()
        if not resume_text.strip():
            # Keep the output shape consistent with the success path
            print(json.dumps({"total_jobs": 0, "similar_jobs": []}))
            return

        # 4. Load models and data
        model = SentenceTransformer(
            'TechWolf/JobBERT-v2',
            cache_folder=str(backend_dir / "cached_models"),
            device="cpu"
        )
        job_embeddings = torch.load(backend_dir / "job_embeddings.pt", map_location="cpu")
        market_data = pd.read_csv(backend_dir / "final_prototype_postings.csv")
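        # Assumption: job_embeddings.pt holds one embedding per CSV row, in
        # the same order, so market_data row i pairs with embedding row i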

        # 5. Filter data based on target_role; fall back to the full market
        #    when the role is "Overall Market" or has no matching postings
        filtered_market_data = market_data
        embeddings_tensor = job_embeddings
        if target_role != "Overall Market":
            role_specific_data = market_data[market_data["cmo_role_match"] == target_role]
            if not role_specific_data.empty:
                role_indices = role_specific_data.index.tolist()
                embeddings_tensor = job_embeddings[role_indices]
                filtered_market_data = role_specific_data

        # 6. Perform similarity search
        cleaned_resume_text = new_refined_text_cleaning(resume_text)
        resume_embedding = model.encode(
            cleaned_resume_text, convert_to_tensor=True, device="cpu"
        )
        
        cosine_scores = util.cos_sim(resume_embedding, embeddings_tensor)[0]
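        # Clamp k so topk never requests more rows than the filtered set has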
        top_results = torch.topk(cosine_scores, k=min(args.limit, len(filtered_market_data)))

        # 7. Prepare and print results
        similar_jobs = []
        for score, idx in zip(top_results.values, top_results.indices):
            # idx is a position in embeddings_tensor, which matches the row
            # order of filtered_market_data, so .iloc is the right accessor
            job = filtered_market_data.iloc[idx.item()]
            similar_jobs.append({
                "job_title": job["title"],
                "cmo_role_match": job["cmo_role_match"],
                "url": job["job_url"],
                "similarity_score": score.item(),
            })

        output = {
            "total_jobs": len(filtered_market_data),
            "similar_jobs": similar_jobs
        }
        print(json.dumps(output))

    except Exception as e:
        # Log any errors to stderr to be captured by the main process
        print(f"Similarity worker error: {e}", file=sys.stderr)
        # Emit an empty result object so downstream JSON parsing still succeeds
        print(json.dumps({"total_jobs": 0, "similar_jobs": []}))
        sys.exit(1)

if __name__ == "__main__":
    main()
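
# Example invocation (illustrative; the script name and role values depend
# on your setup):
#   cat resume.txt | python similarity_worker.py --target_role "Overall Market" --limit 5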