import streamlit as st
import os
import tempfile
from pathlib import Path
import time
from typing import List, Dict, Tuple
import pandas as pd
from streamlit.runtime.uploaded_file_manager import UploadedFile
from anthropic import Anthropic
import pymongo
from dotenv import load_dotenv
import fitz  # PyMuPDF
import voyageai
from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec
from pinecone import Index

# Load environment variables
load_dotenv()

# Initialize VoyageAI constants
VOYAGEAI_BATCH_SIZE = 128

# Initialize Pinecone
PINECONE_ID = "intratalent-v2"

# Initialize MongoDB client
MONGO_URI = os.getenv('MONGO_URI')
mongo_client = pymongo.MongoClient(MONGO_URI)
db = mongo_client['intratalent']
resume_collection = db['resumes']

# Initialize Anthropic client
anthropic = Anthropic(api_key=os.getenv('ANTHROPIC_API_KEY'))

# Initialize Streamlit app
st.set_page_config(
    page_title="IntraTalent Resume Processor",
    page_icon="🎯",
    layout="wide"
)


def extract_text_from_pdf(pdf_content: bytes) -> str:
    """Extract text from PDF content."""
    try:
        # Create a temporary file to store the PDF content
        with tempfile.NamedTemporaryFile(mode='w+b', suffix='.pdf', delete=False) as temp_file:
            temp_file.write(pdf_content)
            temp_file_path = temp_file.name

        # Extract text from PDF
        doc = fitz.open(temp_file_path)
        text = ""
        for page_num in range(doc.page_count):
            page = doc.load_page(page_num)
            text += page.get_text() + "\n"
        doc.close()

        # Clean up temporary file
        os.unlink(temp_file_path)

        return text
    except Exception as e:
        st.error(f"Error extracting text from PDF: {e}")
        return ""


def extract_info_with_claude(resume_text: str) -> str:
    """Extract information from resume text using Claude."""
    st.write("🤖 Sending request to Claude API...")

    prompt = """
    Extract the following information from the given resume:
    1. Full Name
    2. List of all experiences with their descriptions (copy exactly from resume)

    Please format the output as follows:
    Name: [Full Name]
    Projects:
    1. [Experience/Project Name]: [Experience/Project Description]
    2. [Experience/Project Name]: [Experience/Project Description]
    ...

    Extract all experiences, including projects, leadership, work experience, research, etc. Don't include hyphens and put the entire description on one line.

    Here's the resume text:
    {resume_text}
    """.format(resume_text=resume_text)

    try:
        message = anthropic.messages.create(
            model="claude-3-haiku-20240307",
            max_tokens=4096,
            system="You are a helpful assistant that extracts information from resumes.",
            messages=[{
                "role": "user",
                "content": prompt
            }]
        )
        extracted_info = message.content[0].text
        st.write("✅ Received response from Claude:")
        st.code(extracted_info, language="text")
    except Exception as e:
        extracted_info = f"An error occurred: {e}"
        st.error(f"❌ API Error: {e}")

    return extracted_info


def get_pinecone_index(database_id: str) -> Index:
    """Return the Pinecone index for project embeddings, creating it if needed."""
    # Initialize connection to Pinecone
    pc = Pinecone(api_key=os.getenv('PINECONE_API_KEY'))

    # If the index does not exist, create it
    if not pc.has_index(database_id):
        pc.create_index(
            database_id,
            dimension=512,  # voyage-3-lite produces 512-dimensional embeddings
            spec=ServerlessSpec(
                cloud='aws',
                region='us-east-1'
            ),
            metric='cosine'
        )

    # Connect to the index and return it
    return pc.Index(database_id)


def add_to_voyage(person_name: str, person_projects: List[Dict]) -> None:
    """Embed each project description with VoyageAI and upsert it into Pinecone."""
    embeds = []
    metas = []
    ids = []
    index = get_pinecone_index(PINECONE_ID)
    vo = voyageai.Client(api_key=os.getenv('VOYAGEAI_API_KEY'))

    for i, project in enumerate(person_projects):
        # Embed the project description
        embed = vo.embed(
            texts=[project["description"]],
            model='voyage-3-lite',
            truncation=False
        ).embeddings[0]
        embeds.append(embed)

        # Create metadata from the person's name and the project name
        meta = {"person": person_name, "project": project["name"]}
        metas.append(meta)

        # Give it a unique id (Pinecone ids must be strings)
        ids.append(f"{person_name}-{i}")

    # Create the list of (id, vector, metadata) tuples to be upserted
    to_upsert = list(zip(ids, embeds, metas))

    # Upsert in batches
    for i in range(0, len(to_upsert), VOYAGEAI_BATCH_SIZE):
        i_end = min(i + VOYAGEAI_BATCH_SIZE, len(to_upsert))
        index.upsert(vectors=to_upsert[i:i_end])

    # View the index statistics
    st.write(index.describe_index_stats())


def parse_resume(uploaded_file: UploadedFile) -> Tuple[str, List[Dict]]:
    """Parse a resume file and return name and projects."""
    try:
        st.write(f"📄 Processing resume: {uploaded_file.name}")
        resume_content = uploaded_file.getvalue()

        st.write("📄 Extracting text from PDF...")
        resume_text = extract_text_from_pdf(resume_content)
        st.write("📄 Extracted text from PDF:")
        st.code(resume_text)

        extracted_info = extract_info_with_claude(resume_text)

        st.write("🔍 Parsing extracted information...")
        # Parse the extracted information
        lines = extracted_info.split('\n')
        name = lines[0].split(': ')[1] if len(lines) > 0 and ': ' in lines[0] else "Unknown"
        st.write(f"👤 Extracted name: {name}")

        projects = []
        project_started = False
        for line in lines:
            if line.strip() == "Projects:":
                project_started = True
                continue
            if project_started and line.strip():
                project_parts = line.split(': ', 1)
                if len(project_parts) == 2:
                    project_name = project_parts[0].split('. ', 1)[-1]  # Remove the number
                    project_description = project_parts[1]
                    projects.append({"name": project_name, "description": project_description})

        st.write("📋 Extracted projects:")
        st.json(projects)

        # Store in MongoDB
        resume_data = {
            "name": name,
            "projects": projects,
            "full_content": resume_text
        }
        resume_collection.insert_one(resume_data)

        # Embed and index the projects
        add_to_voyage(name, projects)
        st.write("💾 Stored data in VoyageAI")

        return name, projects
    except Exception as e:
        st.error(f"❌ Error processing resume: {e}")
        return "Unknown", []


def process_resumes(uploaded_files: List[UploadedFile]) -> Dict:
    """Process multiple resumes and return results."""
    results = {}
    progress_bar = st.progress(0)

    for idx, file in enumerate(uploaded_files):
        st.write(f"\n---\n### Processing file {idx + 1} of {len(uploaded_files)}")

        if file.type != "application/pdf":
            st.warning(f"⚠️ Skipping {file.name}: Not a PDF file")
            continue

        try:
            name, projects = parse_resume(file)
            results[file.name] = {
                "name": name,
                "projects": projects
            }
            # Update progress
            progress_bar.progress((idx + 1) / len(uploaded_files))
            st.write(f"✅ Successfully processed {file.name}")
        except Exception as e:
            st.error(f"❌ Error processing {file.name}: {e}")

    return results


def display_results(results: Dict):
    """Display processed resume results in an organized manner."""
    if not results:
        return

    st.subheader("📋 Processed Resumes")

    for filename, data in results.items():
        with st.expander(f"📄 {data['name']} ({filename})"):
            st.write("🏷️ File details:")
            st.json({
                "filename": filename,
                "name": data['name'],
                "number_of_projects": len(data['projects'])
            })

            if data['projects']:
                st.write("📋 Projects:")
                df = pd.DataFrame(data['projects'])
                st.dataframe(
                    df,
                    column_config={
                        "name": "Project Name",
                        "description": "Description"
                    },
                    hide_index=True
                )
            else:
                st.info("ℹ️ No projects found in this resume")


def main():
    st.title("🎯 IntraTalent Resume Processor")

    # File uploader section
    st.header("📤 Upload Resumes")
    uploaded_files = st.file_uploader(
        "Upload up to 10 resumes (PDF only)",
        type=['pdf'],
        accept_multiple_files=True,
        key="resume_uploader"
    )

    # Validate number of files
    if uploaded_files and len(uploaded_files) > 10:
        st.error("⚠️ Maximum 10 files allowed. Please remove some files.")
        return

    # Process button
    if uploaded_files and st.button("🚀 Process Resumes"):
        with st.spinner("Processing resumes..."):
            st.write("🚀 Starting resume processing...")
            results = process_resumes(uploaded_files)
            st.session_state['processed_results'] = results
            st.write("✨ Processing complete!")
            display_results(results)

    # Query section
    st.header("🔍 Search Projects")
    query = st.text_area(
        "Enter your project requirements",
        placeholder="Example: Looking for team members with experience in machine learning and computer vision...",
        height=100
    )

    if query and st.button("🔍 Search"):
        if 'processed_results' not in st.session_state:
            st.warning("⚠️ Please process some resumes first!")
            return

        with st.spinner("Searching for matches..."):
            st.write("🔍 Preparing to search...")
            # Here you would implement the embedding and similarity search
            # using the code from your original script; a sketch follows below.
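            #
            # Illustrative sketch (not enabled yet), assuming the query is embedded with
            # the same voyage-3-lite model used at indexing time and searched against the
            # index built by get_pinecone_index(); top_k=5 and the metadata keys
            # ("person", "project") are assumptions.
            #
            # vo = voyageai.Client(api_key=os.getenv('VOYAGEAI_API_KEY'))
            # query_embedding = vo.embed(texts=[query], model='voyage-3-lite').embeddings[0]
            # index = get_pinecone_index(PINECONE_ID)
            # response = index.query(vector=query_embedding, top_k=5, include_metadata=True)
            # for match in response.matches:
            #     st.write(f"{match.metadata['person']}: {match.metadata['project']} "
            #              f"(score: {match.score:.3f})")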
            st.success("✅ Search completed!")

            # Display results in a nice format
            st.subheader("🎯 Top Matches")
            # Placeholder for search results
            st.info("🔜 Feature coming soon: Will display matching projects and candidates based on similarity search")


if __name__ == "__main__":
    main()