"""Similarity-search worker.

Reads resume text from stdin and a target role from CLI args, embeds the
resume with JobBERT, and prints a JSON object
``{"total_jobs": int, "similar_jobs": [...]}`` to stdout.  All failure
paths emit the same (empty) object shape so downstream JSON parsing never
sees an unexpected type.
"""

import argparse
import json
import pathlib
import re
import sys

import emoji
import pandas as pd
import torch
from bs4 import BeautifulSoup
from sentence_transformers import SentenceTransformer, util

# --- Text Cleaning (kept in sync with main.py) ---

# Patterns are compiled once at import time so repeated cleaning calls do
# not pay the recompilation / cache-lookup cost.

# Matches http(s)/ftp URLs, bare domains, and public IPv4 addresses
# (private ranges such as 10.x, 127.x, 192.168.x are deliberately excluded
# by the negative lookaheads).
_URL_RE = re.compile(r'(?:(?:https?|ftp)://)?(?:\S+(?::\S*)?@)?(?:(?!(?:10|127)(?:\.\d{1,3}){3})(?!(?:169\.254|192\.168)(?:\.\d{1,3}){2})(?!172\.(?:1[6-9]|2\d|3[0-1])(?:\.\d{1,3}){2})(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))|(?:(?:[a-z\u00a1-\uffff0-9]-*)*[a-z\u00a1-\uffff0-9]+)(?:\.(?:[a-z\u00a1-\uffff0-9]-*)*[a-z\u00a1-\uffff0-9]+)*(?:\.(?:[a-z\u00a1-\uffff]{2,})))(?::\d{2,5})?(?:/\S*)?')
_EMAIL_RE = re.compile(r'\S+@\S+\s?')
_EMOJI_NAME_RE = re.compile(r':[a-zA-Z_]+:')       # leftover :emoji_name: tokens
_BULLET_RE = re.compile(r'[*•]')
_BRACED_RE = re.compile(r'\{.*?\}')                # templating / JSON-ish fragments
# Keep letters, digits, and technical symbols (_, #, +, parens, slash),
# plus whitespace and basic punctuation.  BUG FIX: the original pattern
# wrote `\\s` inside a raw string, which is a literal backslash followed
# by the letter "s", not the whitespace class; `\s` is what was intended.
# (Output is unchanged because the `\s+` collapse below normalises
# whitespace anyway, and backslashes are replaced earlier.)
_DISALLOWED_RE = re.compile(r'[^a-zA-Z0-9_#+()/\s.,!?-]')
_WS_RE = re.compile(r'\s+')
_SPACE_PUNCT_RE = re.compile(r'\s([,.!?-])')       # drop space before punctuation

# Canonical empty result: identical shape to the success payload so that
# consumers can always index "total_jobs" / "similar_jobs".
_EMPTY_RESULT = {"total_jobs": 0, "similar_jobs": []}


def new_refined_text_cleaning(text: str) -> str:
    """Clean *text* for embedding while keeping technical symbols.

    Strips HTML, URLs, e-mail addresses, emoji, bullets and braced
    fragments, then normalises whitespace/punctuation and lower-cases.
    Non-string input yields "".
    """
    if not isinstance(text, str):
        return ""
    text = BeautifulSoup(text, "html.parser").get_text()
    text = _URL_RE.sub('', text)
    text = _EMAIL_RE.sub('', text)
    # demojize turns emoji into :names:, which the next pass removes.
    text = emoji.demojize(text)
    text = _EMOJI_NAME_RE.sub('', text)
    text = text.replace('\\', ' ')
    text = _BULLET_RE.sub(' ', text)
    text = _BRACED_RE.sub(' ', text)
    text = _DISALLOWED_RE.sub(' ', text)
    text = _WS_RE.sub(' ', text)
    text = _SPACE_PUNCT_RE.sub(r'\1', text)
    return text.strip().lower()


def _select_role_subset(market_data, job_embeddings, target_role):
    """Return (embeddings, frame) restricted to *target_role*.

    Falls back to the full market when the role is "Overall Market" or has
    no matching rows.  NOTE(review): assumes row i of the embeddings tensor
    corresponds to row i of the CSV — verify against the embedding builder.
    """
    if target_role != "Overall Market":
        subset = market_data[market_data["cmo_role_match"] == target_role]
        if not subset.empty:
            return job_embeddings[subset.index.tolist()], subset
    return job_embeddings, market_data


def main():
    """Perform the similarity search.

    Reads resume text from stdin and the target role from CLI arguments,
    then prints a JSON object {"total_jobs", "similar_jobs"} to stdout.
    On any error, logs to stderr, prints the empty result object, and
    exits with status 1.
    """
    try:
        # 1. Resolve paths relative to this script so cwd doesn't matter.
        backend_dir = pathlib.Path(__file__).parent.resolve()

        # 2. Parse arguments.
        parser = argparse.ArgumentParser()
        parser.add_argument("--target_role", type=str, required=True)
        parser.add_argument("--limit", type=int, default=10)
        args = parser.parse_args()
        target_role = args.target_role

        # 3. Read resume text from stdin; nothing to do without it.
        resume_text = sys.stdin.read()
        if not resume_text:
            print(json.dumps(_EMPTY_RESULT))
            return

        # 4. Load model, precomputed job embeddings and posting metadata.
        model = SentenceTransformer(
            'TechWolf/JobBERT-v2',
            cache_folder=str(backend_dir / "cached_models"),
            device="cpu",
        )
        job_embeddings = torch.load(
            backend_dir / "job_embeddings.pt", map_location="cpu"
        )
        market_data = pd.read_csv(backend_dir / "final_prototype_postings.csv")

        # 5. Restrict to the requested role (or keep the full market).
        embeddings_tensor, filtered_market_data = _select_role_subset(
            market_data, job_embeddings, target_role
        )

        # 6. Embed the cleaned resume and rank postings by cosine similarity.
        cleaned_resume_text = new_refined_text_cleaning(resume_text)
        resume_embedding = model.encode(
            cleaned_resume_text, convert_to_tensor=True, device="cpu"
        )
        cosine_scores = util.cos_sim(resume_embedding, embeddings_tensor)[0]
        top_results = torch.topk(
            cosine_scores, k=min(args.limit, len(filtered_market_data))
        )

        # 7. Emit the top matches as JSON.
        similar_jobs = []
        for score, idx in zip(top_results.values, top_results.indices):
            job = filtered_market_data.iloc[idx.item()]
            similar_jobs.append({
                "job_title": job["title"],
                "cmo_role_match": job["cmo_role_match"],
                "url": job["job_url"],
                "similarity_score": score.item(),
            })
        output = {
            "total_jobs": len(filtered_market_data),
            "similar_jobs": similar_jobs,
        }
        print(json.dumps(output))

    except Exception as e:
        # Log to stderr for the parent process; emit the same (empty)
        # object shape as the success path so downstream JSON handling
        # never sees a type mismatch.
        print(f"Similarity worker error: {e}", file=sys.stderr)
        print(json.dumps(_EMPTY_RESULT))
        sys.exit(1)


if __name__ == "__main__":
    main()