import sys
import json
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer, util
import argparse
import pathlib
import re
from bs4 import BeautifulSoup
import emoji
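
# Third-party dependencies (pandas, torch, sentence-transformers,
# beautifulsoup4, emoji) must be installed in the worker's environment.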


# --- Text Cleaning Function (copied from main.py) ---
def new_refined_text_cleaning(text: str) -> str:
    """The NEW, improved cleaning function. Keeps technical symbols."""
    if not isinstance(text, str):
        return ""
    # Strip HTML tags, keeping only the visible text.
    text = BeautifulSoup(text, "html.parser").get_text()
    url_pattern = r'(?:(?:https?|ftp)://)?(?:\S+(?::\S*)?@)?(?:(?!(?:10|127)(?:\.\d{1,3}){3})(?!(?:169\.254|192\.168)(?:\.\d{1,3}){2})(?!172\.(?:1[6-9]|2\d|3[0-1])(?:\.\d{1,3}){2})(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))|(?:(?:[a-z\u00a1-\uffff0-9]-*)*[a-z\u00a1-\uffff0-9]+)(?:\.(?:[a-z\u00a1-\uffff0-9]-*)*[a-z\u00a1-\uffff0-9]+)*(?:\.(?:[a-z\u00a1-\uffff]{2,})))(?::\d{2,5})?(?:/\S*)?'
    text = re.sub(url_pattern, '', text)
    # Remove email addresses.
    text = re.sub(r'\S+@\S+\s?', '', text)
    # Convert emoji to ":name:" tokens, then drop those tokens entirely.
    text = emoji.demojize(text)
    text = re.sub(r':[a-zA-Z_]+:', '', text)
    text = text.replace('\\', ' ')
    text = re.sub(r'[*•]', ' ', text)
    text = re.sub(r'\{.*?\}', ' ', text)
    # Keep letters, digits, whitespace, and common technical symbols
    # (_ # + ( ) / . , ! ? -); everything else becomes a space.
    text = re.sub(r'[^a-zA-Z0-9_#+()/\s.,!?-]', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'\s([,.!?-])', r'\1', text)
    text = text.strip()
    text = text.lower()
    return text
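
# Illustrative example (assumed input, not from the original source):
#   new_refined_text_cleaning("<p>Built CI/CD pipelines 🚀 at https://example.com</p>")
#   returns "built ci/cd pipelines at": the HTML tags, URL, and emoji are
#   removed, while the technical "/" in "CI/CD" survives.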


def main():
    """
    Main function to perform the similarity search.
    Reads resume text from stdin and the target role from args.
    Prints a JSON object with the matching jobs to stdout.
    """
    try:
        # 1. Set up paths
        backend_dir = pathlib.Path(__file__).parent.resolve()

        # 2. Parse arguments
        parser = argparse.ArgumentParser()
        parser.add_argument("--target_role", type=str, required=True)
        parser.add_argument("--limit", type=int, default=10)
        args = parser.parse_args()
        target_role = args.target_role

        # 3. Read resume text from stdin; emit an empty result in the same
        # schema as the success path if there is nothing to process.
        resume_text = sys.stdin.read()
        if not resume_text:
            print(json.dumps({"total_jobs": 0, "similar_jobs": []}))
            return

        # 4. Load models and data
        model = SentenceTransformer(
            'TechWolf/JobBERT-v2',
            cache_folder=str(backend_dir / "cached_models"),
            device="cpu"
        )
        job_embeddings = torch.load(backend_dir / "job_embeddings.pt", map_location="cpu")
        market_data = pd.read_csv(backend_dir / "final_prototype_postings.csv")
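        # SentenceTransformer fetches TechWolf/JobBERT-v2 into cached_models/
        # on the first run and loads it from that local cache afterwards.
        # This code assumes job_embeddings.pt was generated row-for-row from
        # final_prototype_postings.csv, so embedding i corresponds to row i
        # of market_data; the role filtering below relies on that alignment.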

        # 5. Filter data based on target_role
        if target_role != "Overall Market":
            role_specific_data = market_data[market_data["cmo_role_match"] == target_role]
            if not role_specific_data.empty:
                role_indices = role_specific_data.index.tolist()
                embeddings_tensor = job_embeddings[role_indices]
                filtered_market_data = role_specific_data
            else:
                embeddings_tensor = job_embeddings
                filtered_market_data = market_data
        else:
            embeddings_tensor = job_embeddings
            filtered_market_data = market_data
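        # Position i in embeddings_tensor lines up with positional row i of
        # filtered_market_data (assuming market_data keeps the default
        # RangeIndex from read_csv), so the .iloc lookup in step 7 stays valid.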

        # 6. Perform similarity search
        cleaned_resume_text = new_refined_text_cleaning(resume_text)
        resume_embedding = model.encode(
            cleaned_resume_text, convert_to_tensor=True, device="cpu"
        )
        cosine_scores = util.cos_sim(resume_embedding, embeddings_tensor)[0]
        top_results = torch.topk(cosine_scores, k=min(args.limit, len(filtered_market_data)))
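        # top_results[0] holds the k best cosine scores and top_results[1]
        # their indices into embeddings_tensor, with k capped at the number
        # of available postings.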

        # 7. Prepare and print results
        similar_jobs = []
        for score, idx in zip(top_results[0], top_results[1]):
            job = filtered_market_data.iloc[idx.item()]
            similar_jobs.append({
                "job_title": job["title"],
                "cmo_role_match": job["cmo_role_match"],
                "url": job["job_url"],
                "similarity_score": score.item(),
            })
        output = {
            "total_jobs": len(filtered_market_data),
            "similar_jobs": similar_jobs
        }
        print(json.dumps(output))
    except Exception as e:
        # Log the error to stderr so the parent process can capture it.
        print(f"Similarity worker error: {e}", file=sys.stderr)
        # Emit an empty result object in the same schema as the success path
        # so downstream JSON parsing still succeeds.
        print(json.dumps({"total_jobs": 0, "similar_jobs": []}))
        sys.exit(1)


if __name__ == "__main__":
    main()
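
# Example invocation (illustrative; the script and resume file names here are
# assumptions based on this worker's CLI contract, not shown in this repo):
#   cat resume.txt | python similarity_worker.py --target_role "Overall Market" --limit 5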