Spaces:
Sleeping
Sleeping
| import time | |
| import pandas as pd | |
| import torch | |
| import torch.nn.functional as F | |
| from tqdm import tqdm | |
| from transformers import AutoModel, AutoTokenizer | |
class Embedder:
    """Produce L2-normalised sentence embeddings from a Hugging Face encoder.

    The tokenizer and model are loaded once in ``__init__`` and kept on
    ``self.device`` (CUDA when available, otherwise CPU).
    """

    def __init__(self, path):
        """Load the tokenizer and model identified by *path*.

        Args:
            path: Model name or local path passed to ``from_pretrained``.
        """
        self.model_name_or_path = path
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name_or_path)
        # NOTE(review): trust_remote_code=True lets the checkpoint's own Python
        # code run at load time; only point this at repositories you trust.
        self.model = AutoModel.from_pretrained(
            self.model_name_or_path, trust_remote_code=True
        )
        self.model.to(self.device)

    def generate_embedding(self, text, dimension=768):
        """Embed *text* and return unit-length vector(s) as a NumPy array.

        The embedding is the hidden state at token position 0, truncated to
        its first *dimension* features and then L2-normalised per row.

        Args:
            text: Input forwarded unchanged to the tokenizer (presumably a
                string or a list of strings -- confirm against callers).
            dimension: Number of leading embedding features to keep. Values
                larger than the model's hidden size keep every feature.

        Returns:
            ``numpy.ndarray`` on the CPU. All size-1 axes are squeezed away,
            so a single input yields a 1-D vector and a batch of several
            inputs yields a 2-D ``(batch, features)`` array.

        Raises:
            ValueError: If *dimension* is not a positive number.
        """
        if dimension < 1:
            raise ValueError(f"dimension must be a positive integer, got {dimension!r}")

        inputs = self.tokenizer(
            text, max_length=8192, padding=True, truncation=True, return_tensors="pt"
        )
        inputs = {key: value.to(self.device) for key, value in inputs.items()}

        # Inference only: skip autograd bookkeeping.
        with torch.no_grad():
            outputs = self.model(**inputs)

        # assumes last_hidden_state is (batch, seq, hidden) -- TODO confirm.
        # Slice the feature axis; the previous `[:, 0][:dimension]` sliced the
        # batch axis instead, so features were never truncated and batches
        # longer than `dimension` silently lost rows.
        embeddings = outputs.last_hidden_state[:, 0, :dimension]

        # Normalise after truncation so the returned vectors have unit length.
        normalized_embeddings = F.normalize(embeddings, p=2, dim=1)
        return normalized_embeddings.squeeze().cpu().numpy()