Spaces:

minhvtt
/

EBD_Fest

Sleeping

App Files Community

minhvtt committed on Oct 6

Commit

f056202

verified ·

1 Parent(s): 36cbe5f

Update embedding_service.py

Browse files

Files changed (1) hide show

embedding_service.py +61 -88

embedding_service.py CHANGED Viewed

@@ -1,8 +1,7 @@
 import torch
 import numpy as np
 from PIL import Image
-from transformers import AutoTokenizer, AutoModel
-import onnxruntime as ort
 from typing import Union, List
 import io
@@ -10,7 +9,7 @@ import io
 class JinaClipEmbeddingService:
     """
     Jina CLIP v2 Embedding Service với hỗ trợ tiếng Việt
-    Sử dụng ONNX model để tăng tốc độ inference
     """
     def __init__(self, model_path: str = "jinaai/jina-clip-v2"):
@@ -22,40 +21,30 @@ class JinaClipEmbeddingService:
         """
         print(f"Loading Jina CLIP v2 model from {model_path}...")
-        # Load tokenizer và processor cho text (hỗ trợ tiếng Việt)
-        self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
-        # Load ONNX model cho vision encoder
-        self.onnx_model_path = f"{model_path}/onnx/model_fp16.onnx"
-        try:
-            # Thử load ONNX model nếu có
-            self.vision_session = ort.InferenceSession(
-                self.onnx_model_path,
-                providers=['CUDAExecutionProvider', 'CPUExecutionProvider']
-            )
-            self.use_onnx = True
-            print("✓ Loaded ONNX model for vision encoder")
-        except:
-            # Fallback sang PyTorch model
-            self.model = AutoModel.from_pretrained(model_path, trust_remote_code=True)
-            self.use_onnx = False
-            print("✓ Loaded PyTorch model (ONNX not available)")
-            # Chuyển sang eval mode
-            self.model.eval()
-            # Sử dụng GPU nếu có
-            self.device = "cuda" if torch.cuda.is_available() else "cpu"
-            self.model.to(self.device)
-            print(f"✓ Model running on: {self.device}")
-    def encode_text(self, text: Union[str, List[str]], normalize: bool = True) -> np.ndarray:
         """
         Encode text thành vector embeddings (hỗ trợ tiếng Việt)
         Args:
             text: Text hoặc list of texts (tiếng Việt)
             normalize: Có normalize embeddings không
         Returns:
@@ -64,28 +53,16 @@ class JinaClipEmbeddingService:
         if isinstance(text, str):
             text = [text]
-        # Tokenize text với max length cho Jina CLIP v2
-        inputs = self.tokenizer(
             text,
-            padding=True,
-            truncation=True,
-            max_length=512,
-            return_tensors="pt"
         )
-        if not self.use_onnx:
-            inputs = {k: v.to(self.device) for k, v in inputs.items()}
-        # Generate embeddings
-        with torch.no_grad():
-            if self.use_onnx:
-                # ONNX inference
-                onnx_inputs = {k: v.numpy() for k, v in inputs.items()}
-                embeddings = self.vision_session.run(None, onnx_inputs)[0]
-            else:
-                # PyTorch inference
-                outputs = self.model.encode_text(**inputs)
-                embeddings = outputs.cpu().numpy()
         # Normalize nếu cần
         if normalize:
@@ -93,12 +70,18 @@ class JinaClipEmbeddingService:
         return embeddings
-    def encode_image(self, image: Union[Image.Image, bytes, List], normalize: bool = True) -> np.ndarray:
         """
         Encode image thành vector embeddings
         Args:
-            image: PIL Image, bytes, hoặc list of images
             normalize: Có normalize embeddings không
         Returns:
@@ -112,40 +95,26 @@ class JinaClipEmbeddingService:
             for img in image:
                 if isinstance(img, bytes):
                     processed_images.append(Image.open(io.BytesIO(img)).convert('RGB'))
                 else:
                     processed_images.append(img)
             image = processed_images
-        else:
-            if not isinstance(image, list):
-                image = [image]
-        # Process images
-        if self.use_onnx:
-            # Preprocessing cho ONNX model
-            # Resize to 512x512 (Jina CLIP v2 high resolution)
-            from torchvision import transforms
-            preprocess = transforms.Compose([
-                transforms.Resize((512, 512)),
-                transforms.ToTensor(),
-                transforms.Normalize(
-                    mean=[0.48145466, 0.4578275, 0.40821073],
-                    std=[0.26862954, 0.26130258, 0.27577711]
-                )
-            ])
-            if isinstance(image, list):
-                pixel_values = torch.stack([preprocess(img) for img in image])
-            else:
-                pixel_values = preprocess(image).unsqueeze(0)
-            # ONNX inference
-            onnx_inputs = {"pixel_values": pixel_values.numpy()}
-            embeddings = self.vision_session.run(None, onnx_inputs)[0]
-        else:
-            # PyTorch inference
-            with torch.no_grad():
-                embeddings = self.model.encode_image(image)
-                embeddings = embeddings.cpu().numpy()
         # Normalize nếu cần
         if normalize:
@@ -157,6 +126,7 @@ class JinaClipEmbeddingService:
         self,
         text: Union[str, List[str]] = None,
         image: Union[Image.Image, bytes, List] = None,
         normalize: bool = True
     ) -> np.ndarray:
         """
@@ -165,6 +135,7 @@ class JinaClipEmbeddingService:
         Args:
             text: Text hoặc list of texts (tiếng Việt)
             image: PIL Image, bytes, hoặc list of images
             normalize: Có normalize embeddings không
         Returns:
@@ -173,19 +144,21 @@ class JinaClipEmbeddingService:
         embeddings = []
         if text is not None:
-            text_emb = self.encode_text(text, normalize=False)
             embeddings.append(text_emb)
         if image is not None:
-            image_emb = self.encode_image(image, normalize=False)
             embeddings.append(image_emb)
-        # Combine embeddings (average hoặc concat)
         if len(embeddings) == 2:
             # Average của text và image embeddings
             combined = np.mean(embeddings, axis=0)
-        else:
             combined = embeddings[0]
         # Normalize nếu cần
         if normalize:

 import torch
 import numpy as np
 from PIL import Image
+from transformers import AutoModel
 from typing import Union, List
 import io
 class JinaClipEmbeddingService:
     """
     Jina CLIP v2 Embedding Service với hỗ trợ tiếng Việt
+    Sử dụng AutoModel với trust_remote_code
     """
     def __init__(self, model_path: str = "jinaai/jina-clip-v2"):
         """
         print(f"Loading Jina CLIP v2 model from {model_path}...")
+        # Load model với trust_remote_code
+        self.model = AutoModel.from_pretrained(model_path, trust_remote_code=True)
+        # Chuyển sang eval mode
+        self.model.eval()
+        # Sử dụng GPU nếu có
+        self.device = "cuda" if torch.cuda.is_available() else "cpu"
+        self.model.to(self.device)
+        print(f"✓ Loaded Jina CLIP v2 model on: {self.device}")
+    def encode_text(
+        self,
+        text: Union[str, List[str]],
+        truncate_dim: int = None,
+        normalize: bool = True
+    ) -> np.ndarray:
         """
         Encode text thành vector embeddings (hỗ trợ tiếng Việt)
         Args:
             text: Text hoặc list of texts (tiếng Việt)
+            truncate_dim: Matryoshka dimension (64-1024, None = full 1024)
             normalize: Có normalize embeddings không
         Returns:
         if isinstance(text, str):
             text = [text]
+        # Jina CLIP v2 encode_text method
+        # Automatically handles tokenization internally
+        embeddings = self.model.encode_text(
             text,
+            truncate_dim=truncate_dim  # Optional: 64, 128, 256, 512, 1024
         )
+        # Convert to numpy
+        if isinstance(embeddings, torch.Tensor):
+            embeddings = embeddings.cpu().detach().numpy()
         # Normalize nếu cần
         if normalize:
         return embeddings
+    def encode_image(
+        self,
+        image: Union[Image.Image, bytes, List, str],
+        truncate_dim: int = None,
+        normalize: bool = True
+    ) -> np.ndarray:
         """
         Encode image thành vector embeddings
         Args:
+            image: PIL Image, bytes, URL string, hoặc list of images
+            truncate_dim: Matryoshka dimension (64-1024, None = full 1024)
             normalize: Có normalize embeddings không
         Returns:
             for img in image:
                 if isinstance(img, bytes):
                     processed_images.append(Image.open(io.BytesIO(img)).convert('RGB'))
+                elif isinstance(img, str):
+                    # URL string - keep as is, Jina CLIP can handle URLs
+                    processed_images.append(img)
                 else:
                     processed_images.append(img)
             image = processed_images
+        elif not isinstance(image, list) and not isinstance(image, str):
+            # Single PIL Image
+            image = [image]
+        # Jina CLIP v2 encode_image method
+        # Supports PIL Images, file paths, or URLs
+        embeddings = self.model.encode_image(
+            image,
+            truncate_dim=truncate_dim  # Optional: 64, 128, 256, 512, 1024
+        )
+        # Convert to numpy
+        if isinstance(embeddings, torch.Tensor):
+            embeddings = embeddings.cpu().detach().numpy()
         # Normalize nếu cần
         if normalize:
         self,
         text: Union[str, List[str]] = None,
         image: Union[Image.Image, bytes, List] = None,
+        truncate_dim: int = None,
         normalize: bool = True
     ) -> np.ndarray:
         """
         Args:
             text: Text hoặc list of texts (tiếng Việt)
             image: PIL Image, bytes, hoặc list of images
+            truncate_dim: Matryoshka dimension (64-1024, None = full 1024)
             normalize: Có normalize embeddings không
         Returns:
         embeddings = []
         if text is not None:
+            text_emb = self.encode_text(text, truncate_dim=truncate_dim, normalize=False)
             embeddings.append(text_emb)
         if image is not None:
+            image_emb = self.encode_image(image, truncate_dim=truncate_dim, normalize=False)
             embeddings.append(image_emb)
+        # Combine embeddings (average)
         if len(embeddings) == 2:
             # Average của text và image embeddings
             combined = np.mean(embeddings, axis=0)
+        elif len(embeddings) == 1:
             combined = embeddings[0]
+        else:
+            raise ValueError("Phải cung cấp ít nhất text hoặc image")
         # Normalize nếu cần
         if normalize: