skill-gap-backend / similarity_worker.py
import sys
import json
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer, util
import argparse
import pathlib
import re
from bs4 import BeautifulSoup
import emoji
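
# Example invocation (file name and role label are illustrative; --target_role
# must match a value in the CSV's cmo_role_match column, or "Overall Market"):
#   cat resume.txt | python similarity_worker.py --target_role "Data Analyst" --limit 5
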
# --- Text Cleaning Function (copied from main.py) ---
def new_refined_text_cleaning(text: str) -> str:
    """The NEW, improved cleaning function. Keeps technical symbols."""
    if not isinstance(text, str):
        return ""
    # Strip HTML tags, keeping only the visible text.
    text = BeautifulSoup(text, "html.parser").get_text()
    url_pattern = r'(?:(?:https?|ftp)://)?(?:\S+(?::\S*)?@)?(?:(?!(?:10|127)(?:\.\d{1,3}){3})(?!(?:169\.254|192\.168)(?:\.\d{1,3}){2})(?!172\.(?:1[6-9]|2\d|3[0-1])(?:\.\d{1,3}){2})(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))|(?:(?:[a-z\u00a1-\uffff0-9]-*)*[a-z\u00a1-\uffff0-9]+)(?:\.(?:[a-z\u00a1-\uffff0-9]-*)*[a-z\u00a1-\uffff0-9]+)*(?:\.(?:[a-z\u00a1-\uffff]{2,})))(?::\d{2,5})?(?:/\S*)?'
    text = re.sub(url_pattern, '', text)
    # Remove email addresses.
    text = re.sub(r'\S+@\S+\s?', '', text)
    # Convert emoji to :name: tokens, then drop those tokens.
    text = emoji.demojize(text)
    text = re.sub(r':[a-zA-Z_]+:', '', text)
    text = text.replace('\\', ' ')
    text = re.sub(r'[*•]', ' ', text)
    # Drop template fragments such as {placeholder} blocks.
    text = re.sub(r'\{.*?\}', ' ', text)
    # Keep alphanumerics plus technical symbols. Note \s (whitespace) here:
    # the original pattern's \\s matched a literal backslash and the letter s.
    text = re.sub(r'[^a-zA-Z0-9_#+()/\s.,!?-]', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    # Re-attach punctuation left dangling after whitespace.
    text = re.sub(r'\s([,.!?-])', r'\1', text)
    text = text.strip()
    text = text.lower()
    return text
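
# Illustrative behaviour of the cleaner (example input/output, not from the repo):
#   new_refined_text_cleaning("<p>Built ETL pipelines in C++ 🚀</p>")
#   -> "built etl pipelines in c++"
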
def main():
"""
Main function to perform similarity search.
Reads resume text from stdin and target role from args.
Prints a JSON list of similar jobs to stdout.
"""
try:
# 1. Setup paths
backend_dir = pathlib.Path(__file__).parent.resolve()
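        # The worker assumes its data artifacts (job_embeddings.pt,
        # final_prototype_postings.csv, cached_models/) live alongside this file.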
        # 2. Parse arguments
        parser = argparse.ArgumentParser()
        parser.add_argument("--target_role", type=str, required=True)
        parser.add_argument("--limit", type=int, default=10)
        args = parser.parse_args()
        target_role = args.target_role

        # 3. Read resume text from stdin
        resume_text = sys.stdin.read()
        if not resume_text.strip():
            # Keep the output shape consistent with the success path.
            print(json.dumps({"total_jobs": 0, "similar_jobs": []}))
            return
        # 4. Load models and data
        model = SentenceTransformer(
            'TechWolf/JobBERT-v2',
            cache_folder=str(backend_dir / "cached_models"),
            device="cpu"
        )
        job_embeddings = torch.load(backend_dir / "job_embeddings.pt", map_location="cpu")
        market_data = pd.read_csv(backend_dir / "final_prototype_postings.csv")
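        # The positional filtering below relies on row i of job_embeddings.pt
        # having been encoded from row i of the CSV.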
        # 5. Filter data based on target_role
        if target_role != "Overall Market":
            role_specific_data = market_data[market_data["cmo_role_match"] == target_role]
            if not role_specific_data.empty:
                # Use the surviving DataFrame index labels to pick the matching
                # embedding rows; both keep the same relative order.
                role_indices = role_specific_data.index.tolist()
                embeddings_tensor = job_embeddings[role_indices]
                filtered_market_data = role_specific_data
            else:
                # Unrecognised role: fall back to the whole market.
                embeddings_tensor = job_embeddings
                filtered_market_data = market_data
        else:
            embeddings_tensor = job_embeddings
            filtered_market_data = market_data
        # 6. Perform similarity search
        cleaned_resume_text = new_refined_text_cleaning(resume_text)
        resume_embedding = model.encode(
            cleaned_resume_text, convert_to_tensor=True, device="cpu"
        )
        cosine_scores = util.cos_sim(resume_embedding, embeddings_tensor)[0]
        top_results = torch.topk(cosine_scores, k=min(args.limit, len(filtered_market_data)))
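        # topk's indices are positions within embeddings_tensor, which line up
        # row-for-row with filtered_market_data, so .iloc below resolves the
        # correct posting.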
        # 7. Prepare and print results
        similar_jobs = []
        for score, idx in zip(top_results.values, top_results.indices):
            job = filtered_market_data.iloc[idx.item()]
            similar_jobs.append({
                "job_title": job["title"],
                "cmo_role_match": job["cmo_role_match"],
                "url": job["job_url"],
                "similarity_score": score.item(),
            })
        output = {
            "total_jobs": len(filtered_market_data),
            "similar_jobs": similar_jobs,
        }
        print(json.dumps(output))
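        # Illustrative output shape (values made up):
        #   {"total_jobs": 120, "similar_jobs": [{"job_title": "Data Analyst",
        #    "cmo_role_match": "Data Analyst", "url": "https://...",
        #    "similarity_score": 0.71}, ...]}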
    except Exception as e:
        # Log any errors to stderr to be captured by the main process.
        print(f"Similarity worker error: {e}", file=sys.stderr)
        # Emit an empty result in the same shape as the success path to
        # prevent downstream JSON errors.
        print(json.dumps({"total_jobs": 0, "similar_jobs": []}))
        sys.exit(1)
if __name__ == "__main__":
    main()