import io
from typing import List, Optional, Union

import numpy as np
import torch
from PIL import Image
from transformers import AutoModel


class JinaClipEmbeddingService:
    """
    Jina CLIP v2 embedding service with Vietnamese language support.
    Uses AutoModel with trust_remote_code, since Jina CLIP v2 ships its
    own modeling code.
    """

    def __init__(self, model_path: str = "jinaai/jina-clip-v2"):
        """
        Initialize the Jina CLIP v2 model.

        Args:
            model_path: Local path or HuggingFace model name
        """
        print(f"Loading Jina CLIP v2 model from {model_path}...")

        # Load model with trust_remote_code (required for Jina's custom modeling code)
        self.model = AutoModel.from_pretrained(model_path, trust_remote_code=True)

        # Switch to eval mode (disables dropout etc.)
        self.model.eval()

        # Use GPU if available
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model.to(self.device)

        print(f"✓ Loaded Jina CLIP v2 model on: {self.device}")

    def encode_text(
        self,
        text: Union[str, List[str]],
        truncate_dim: Optional[int] = None,
        normalize: bool = True,
    ) -> np.ndarray:
        """
        Encode text into vector embeddings (Vietnamese supported).

        Args:
            text: Text or list of texts (Vietnamese supported)
            truncate_dim: Matryoshka dimension (64-1024, None = full 1024)
            normalize: Whether to L2-normalize the embeddings

        Returns:
            numpy array of embeddings, shape (batch, dim)
        """
        if isinstance(text, str):
            text = [text]

        # Jina CLIP v2's encode_text handles tokenization internally
        embeddings = self.model.encode_text(
            text,
            truncate_dim=truncate_dim,  # Optional: 64, 128, 256, 512, 1024
        )

        # Convert to numpy if a tensor was returned
        if isinstance(embeddings, torch.Tensor):
            embeddings = embeddings.detach().cpu().numpy()

        # L2-normalize if requested
        if normalize:
            embeddings = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)

        return embeddings

    def encode_image(
        self,
        image: Union[Image.Image, bytes, str, List],
        truncate_dim: Optional[int] = None,
        normalize: bool = True,
    ) -> np.ndarray:
        """
        Encode images into vector embeddings.

        Args:
            image: PIL Image, raw bytes, file path / URL string, or a list of these
            truncate_dim: Matryoshka dimension (64-1024, None = full 1024)
            normalize: Whether to L2-normalize the embeddings

        Returns:
            numpy array of embeddings, shape (batch, dim)
        """
        # Normalize the input into a list so the output is always 2D
        if isinstance(image, bytes):
            image = [Image.open(io.BytesIO(image)).convert("RGB")]
        elif isinstance(image, (Image.Image, str)):
            # Single PIL Image, file path, or URL; Jina CLIP handles paths/URLs directly
            image = [image]
        elif isinstance(image, list):
            processed_images = []
            for img in image:
                if isinstance(img, bytes):
                    processed_images.append(Image.open(io.BytesIO(img)).convert("RGB"))
                else:
                    # PIL Image, file path, or URL string: pass through unchanged
                    processed_images.append(img)
            image = processed_images

        # Jina CLIP v2's encode_image accepts PIL Images, file paths, or URLs
        embeddings = self.model.encode_image(
            image,
            truncate_dim=truncate_dim,  # Optional: 64, 128, 256, 512, 1024
        )

        # Convert to numpy if a tensor was returned
        if isinstance(embeddings, torch.Tensor):
            embeddings = embeddings.detach().cpu().numpy()

        # L2-normalize if requested
        if normalize:
            embeddings = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)

        return embeddings

    def encode_multimodal(
        self,
        text: Union[str, List[str], None] = None,
        image: Union[Image.Image, bytes, str, List, None] = None,
        truncate_dim: Optional[int] = None,
        normalize: bool = True,
    ) -> np.ndarray:
        """
        Encode text and/or image and return a combined embedding.

        When both are given, the text and image embeddings are averaged
        element-wise, so the two batch sizes must match.

        Args:
            text: Text or list of texts (Vietnamese supported)
            image: PIL Image, bytes, path/URL string, or list of these
            truncate_dim: Matryoshka dimension (64-1024, None = full 1024)
            normalize: Whether to L2-normalize the combined embeddings

        Returns:
            numpy array of embeddings, shape (batch, dim)
        """
        embeddings = []

        if text is not None:
            text_emb = self.encode_text(text, truncate_dim=truncate_dim, normalize=False)
            embeddings.append(text_emb)

        if image is not None:
            image_emb = self.encode_image(image, truncate_dim=truncate_dim, normalize=False)
            embeddings.append(image_emb)

        # Combine embeddings (element-wise average of text and image)
        if len(embeddings) == 2:
            combined = np.mean(embeddings, axis=0)
        elif len(embeddings) == 1:
            combined = embeddings[0]
        else:
            raise ValueError("At least one of text or image must be provided")

        # L2-normalize if requested
        if normalize:
            combined = combined / np.linalg.norm(combined, axis=1, keepdims=True)

        return combined

    def get_embedding_dimension(self) -> int:
        """
        Return the embedding dimension (1024 for Jina CLIP v2, unless truncated).
        """
        return 1024
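

# --- Illustrative usage sketch (not part of the service itself) ---
# Scores a Vietnamese text query against an image with cosine similarity.
# Because the service L2-normalizes embeddings by default, cosine similarity
# reduces to a plain dot product. "photo.jpg" is a hypothetical placeholder
# path; a URL or PIL Image would work as well.
if __name__ == "__main__":
    service = JinaClipEmbeddingService()

    # Vietnamese text works out of the box ("a cat sitting on a table")
    text_emb = service.encode_text("một con mèo đang ngồi trên bàn", truncate_dim=512)

    # Encode a local image file (placeholder path)
    image_emb = service.encode_image("photo.jpg", truncate_dim=512)

    # Cosine similarity via dot product on normalized (1, 512) vectors
    similarity = (text_emb @ image_emb.T).item()
    print(f"Text-image similarity: {similarity:.4f}")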