Spaces:

fahmiaziz
/

api-rerank-model

Running

App Files Files Community

fahmiaziz98 committed on Sep 28

Commit

7f8bfb2

1 Parent(s): 8786174

Refactor reranking models and configuration management; add YAML support for model settings

Browse files

Files changed (8) hide show

app.py +14 -328
config.yaml +28 -0
core/__init__.py +3 -0
core/base.py +21 -0
core/cross_encoder.py +239 -0
core/model_manager.py +137 -0
models/__init__.py +4 -0
models/model.py +52 -0

app.py CHANGED Viewed

@@ -1,326 +1,12 @@
-from fastapi import FastAPI, HTTPException
-from pydantic import BaseModel, Field
-from typing import List, Optional, Dict, Any
-from loguru import logger
 import time
-import torch
 from contextlib import asynccontextmanager
-from sentence_transformers import CrossEncoder
-from transformers import AutoTokenizer, AutoModelForCausalLM
-# -------------------------
-# Request/Response Models
-# -------------------------
-class RerankRequest(BaseModel):
-    """
-    Request model for document reranking.
-    Attributes:
-        query: The search query
-        documents: List of documents to rerank
-        model_id: Identifier of the reranking model to use
-        instruction: Optional instruction for instruction-based models
-        top_k: Maximum number of documents to return (optional)
-    """
-    query: str = Field(..., description="Search query text")
-    documents: List[str] = Field(..., min_items=1, description="List of documents to rerank")
-    model_id: str = Field(..., description="Model identifier for reranking")
-    instruction: Optional[str] = Field(None, description="Optional instruction for reranking task")
-    top_k: Optional[int] = Field(None, description="Maximum number of results to return")
-class RerankResult(BaseModel):
-    """
-    Single reranking result.
-    Attributes:
-        text: The document text
-        score: Relevance score from the reranking model
-        index: Original index of the document in input list
-    """
-    text: str
-    score: float
-    index: int
-class RerankResponse(BaseModel):
-    """
-    Response model for document reranking.
-    Attributes:
-        results: List of reranked documents with scores
-        query: The original search query
-        model_id: Identifier of the model used
-        processing_time: Time taken to process the request
-        total_documents: Total number of input documents
-        returned_documents: Number of documents returned
-    """
-    results: List[RerankResult]
-    query: str
-    model_id: str
-    processing_time: float
-    total_documents: int
-    returned_documents: int
-# -------------------------
-# Model Management
-# -------------------------
-class RerankerModel:
-    """Base class for reranking models."""
-    def __init__(self, model_id: str, model_name: str, model_type: str):
-        self.model_id = model_id
-        self.model_name = model_name
-        self.model_type = model_type
-        self.model = None
-        self.tokenizer = None
-        self.loaded = False
-    def load(self):
-        """Load the model. To be implemented by subclasses."""
-        raise NotImplementedError
-    def rerank(self, query: str, documents: List[str], instruction: Optional[str] = None) -> List[float]:
-        """Rerank documents. To be implemented by subclasses."""
-        raise NotImplementedError
-class SentenceTransformersReranker(RerankerModel):
-    """Reranker using sentence-transformers CrossEncoder."""
-    def load(self):
-        """Load sentence-transformers CrossEncoder model."""
-        try:
-            logger.info(f"Loading SentenceTransformers model: {self.model_name}")
-            self.model = CrossEncoder(
-                self.model_name,
-                model_kwargs={"torch_dtype": "auto"},
-                trust_remote_code=True
-            )
-            self.loaded = True
-            logger.success(f"Successfully loaded {self.model_id}")
-        except Exception as e:
-            logger.error(f"Failed to load {self.model_id}: {e}")
-            raise
-    def rerank(self, query: str, documents: List[str], instruction: Optional[str] = None) -> List[float]:
-        """Rerank documents using CrossEncoder."""
-        if not self.loaded:
-            raise RuntimeError(f"Model {self.model_id} not loaded")
-        try:
-            # For sentence-transformers, we can use the rank method directly
-            rankings = self.model.rank(query, documents, convert_to_tensor=True)
-            # Extract scores and maintain original order
-            scores = [0.0] * len(documents)
-            for ranking in rankings:
-                scores[ranking['corpus_id']] = float(ranking['score'])
-            return scores
-        except Exception as e:
-            logger.error(f"Reranking failed with {self.model_id}: {e}")
-            raise
-class QwenReranker(RerankerModel):
-    """Reranker using Qwen3-Reranker model."""
-    def load(self):
-        """Load Qwen reranker model."""
-        try:
-            logger.info(f"Loading Qwen model: {self.model_name}")
-            self.tokenizer = AutoTokenizer.from_pretrained(
-                self.model_name,
-                padding_side='left'
-            )
-            self.model = AutoModelForCausalLM.from_pretrained(
-                self.model_name
-            ).eval()
-            # Set up Qwen-specific tokens
-            self.token_false_id = self.tokenizer.convert_tokens_to_ids("no")
-            self.token_true_id = self.tokenizer.convert_tokens_to_ids("yes")
-            self.max_length = 8192
-            # Set up prompt templates
-            self.prefix = "<|im_start|>system\nJudge whether the Document meets the requirements based on the Query and the Instruct provided. Note that the answer can only be \"yes\" or \"no\".<|im_end|>\n<|im_start|>user\n"
-            self.suffix = "<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n"
-            self.prefix_tokens = self.tokenizer.encode(self.prefix, add_special_tokens=False)
-            self.suffix_tokens = self.tokenizer.encode(self.suffix, add_special_tokens=False)
-            self.loaded = True
-            logger.success(f"Successfully loaded {self.model_id}")
-        except Exception as e:
-            logger.error(f"Failed to load {self.model_id}: {e}")
-            raise
-    def _format_instruction(self, instruction: str, query: str, doc: str) -> str:
-        """Format instruction for Qwen model."""
-        if instruction is None:
-            instruction = 'Given a web search query, retrieve relevant passages that answer the query'
-        return "<Instruct>: {instruction}\n<Query>: {query}\n<Document>: {doc}".format(
-            instruction=instruction, query=query, doc=doc
-        )
-    def _process_inputs(self, pairs: List[str]):
-        """Process input pairs for Qwen model."""
-        inputs = self.tokenizer(
-            pairs,
-            padding=False,
-            truncation='longest_first',
-            return_attention_mask=False,
-            max_length=self.max_length - len(self.prefix_tokens) - len(self.suffix_tokens)
-        )
-        for i, ele in enumerate(inputs['input_ids']):
-            inputs['input_ids'][i] = self.prefix_tokens + ele + self.suffix_tokens
-        inputs = self.tokenizer.pad(
-            inputs,
-            padding=True,
-            return_tensors="pt",
-            max_length=self.max_length
-        )
-        for key in inputs:
-            inputs[key] = inputs[key].to(self.model.device)
-        return inputs
-    @torch.no_grad()
-    def _compute_logits(self, inputs):
-        """Compute relevance scores from model logits."""
-        batch_scores = self.model(**inputs).logits[:, -1, :]
-        true_vector = batch_scores[:, self.token_true_id]
-        false_vector = batch_scores[:, self.token_false_id]
-        batch_scores = torch.stack([false_vector, true_vector], dim=1)
-        batch_scores = torch.nn.functional.log_softmax(batch_scores, dim=1)
-        scores = batch_scores[:, 1].exp().tolist()
-        return scores
-    def rerank(self, query: str, documents: List[str], instruction: Optional[str] = None) -> List[float]:
-        """Rerank documents using Qwen model."""
-        if not self.loaded:
-            raise RuntimeError(f"Model {self.model_id} not loaded")
-        try:
-            # Format instruction pairs
-            pairs = [
-                self._format_instruction(instruction, query, doc)
-                for doc in documents
-            ]
-            # Process inputs
-            inputs = self._process_inputs(pairs)
-            # Compute scores
-            scores = self._compute_logits(inputs)
-            return scores
-        except Exception as e:
-            logger.error(f"Reranking failed with {self.model_id}: {e}")
-            raise
-class ModelManager:
-    """Manager for reranking models with preloading."""
-    def __init__(self):
-        self.models: Dict[str, RerankerModel] = {}
-        self.model_configs = {
-            "jina-reranker-v2": {
-                "model_name": "jinaai/jina-reranker-v2-base-multilingual",
-                "model_type": "sentence_transformers",
-                "description": "Multilingual reranker from Jina AI"
-            },
-            "bge-reranker-v2": {
-                "model_name": "BAAI/bge-reranker-v2-m3",
-                "model_type": "sentence_transformers",
-                "description": "BGE multilingual reranker"
-            },
-            "qwen3-reranker": {
-                "model_name": "Qwen/Qwen3-Reranker-0.6B",
-                "model_type": "qwen",
-                "description": "Qwen3 instruction-based reranker"
-            }
-        }
-    async def preload_all_models(self):
-        """Preload all configured models."""
-        logger.info(f"Starting preload of {len(self.model_configs)} reranking models...")
-        for model_id, config in self.model_configs.items():
-            try:
-                logger.info(f"Loading {model_id}...")
-                if config["model_type"] == "sentence_transformers":
-                    model = SentenceTransformersReranker(
-                        model_id=model_id,
-                        model_name=config["model_name"],
-                        model_type=config["model_type"]
-                    )
-                elif config["model_type"] == "qwen":
-                    model = QwenReranker(
-                        model_id=model_id,
-                        model_name=config["model_name"],
-                        model_type=config["model_type"]
-                    )
-                else:
-                    logger.error(f"Unknown model type: {config['model_type']}")
-                    continue
-                model.load()
-                self.models[model_id] = model
-                logger.success(f"Successfully preloaded {model_id}")
-            except Exception as e:
-                logger.error(f"Failed to preload {model_id}: {e}")
-        loaded_count = len([m for m in self.models.values() if m.loaded])
-        logger.success(f"Preloaded {loaded_count}/{len(self.model_configs)} models successfully")
-    def get_model(self, model_id: str) -> RerankerModel:
-        """Get a loaded model by ID."""
-        if model_id not in self.models:
-            raise ValueError(f"Model {model_id} not found")
-        model = self.models[model_id]
-        if not model.loaded:
-            raise ValueError(f"Model {model_id} not loaded")
-        return model
-    def list_models(self) -> List[Dict[str, Any]]:
-        """List all available models with their status."""
-        models_info = []
-        for model_id, config in self.model_configs.items():
-            model = self.models.get(model_id)
-            info = {
-                "id": model_id,
-                "name": config["model_name"],
-                "type": config["model_type"],
-                "description": config["description"],
-                "loaded": model.loaded if model else False
-            }
-            models_info.append(info)
-        return models_info
-# -------------------------
-# Application Setup
-# -------------------------
 model_manager = None
 @asynccontextmanager
@@ -331,7 +17,7 @@ async def lifespan(app: FastAPI):
     # Startup
     logger.info("Starting reranking API...")
     try:
-        model_manager = ModelManager()
         await model_manager.preload_all_models()
         logger.success("Reranking API startup complete!")
     except Exception as e:
@@ -357,6 +43,7 @@ High-performance API for document reranking using multiple state-of-the-art mode
 🚀 **Features:**
 - Multiple reranking models preloaded at startup
 - Batch document reranking with relevance scoring
 - Optional instruction-based reranking (Qwen3)
 - Comprehensive performance metrics
 - Zero cold start delay
@@ -364,6 +51,8 @@ High-performance API for document reranking using multiple state-of-the-art mode
 📊 **Input/Output:**
 - Input: Query + documents + optional instruction
 - Output: Ranked documents with relevance scores
     """,
     version="1.0.0",
     lifespan=lifespan
@@ -407,7 +96,6 @@ async def rerank_documents(request: RerankRequest):
     if not request.documents:
         raise HTTPException(400, "Documents list cannot be empty")
-    # Filter out empty documents
     valid_docs = [(i, doc.strip()) for i, doc in enumerate(request.documents) if doc.strip()]
     if not valid_docs:
         raise HTTPException(400, "No valid documents found after filtering empty strings")
@@ -415,20 +103,16 @@ async def rerank_documents(request: RerankRequest):
     try:
         start_time = time.time()
-        # Get model
         model = model_manager.get_model(request.model_id)
-        # Extract valid documents and their indices
         original_indices, documents = zip(*valid_docs)
-        # Perform reranking
         scores = model.rerank(
             query=request.query.strip(),
             documents=list(documents),
             instruction=request.instruction
         )
-        # Create results with original indices
         results = []
         for i, (orig_idx, doc, score) in enumerate(zip(original_indices, documents, scores)):
             results.append(RerankResult(
@@ -437,10 +121,8 @@ async def rerank_documents(request: RerankRequest):
                 index=orig_idx
             ))
-        # Sort by score (descending)
         results.sort(key=lambda x: x.score, reverse=True)
-        # Apply top_k limit if specified
         if request.top_k:
             results = results[:request.top_k]
@@ -513,4 +195,8 @@ async def health_check():
             "status": "error",
             "error": str(e)
         }

 import time
+from loguru import logger
+from fastapi import FastAPI, HTTPException
 from contextlib import asynccontextmanager
+from models import RerankRequest, RerankResponse, RerankResult
+from core import ModelManager
 model_manager = None
 @asynccontextmanager
     # Startup
     logger.info("Starting reranking API...")
     try:
+        model_manager = ModelManager("config.yaml")
         await model_manager.preload_all_models()
         logger.success("Reranking API startup complete!")
     except Exception as e:
 🚀 **Features:**
 - Multiple reranking models preloaded at startup
 - Batch document reranking with relevance scoring
+- Fast prototyping app
 - Optional instruction-based reranking (Qwen3)
 - Comprehensive performance metrics
 - Zero cold start delay
 📊 **Input/Output:**
 - Input: Query + documents + optional instruction
 - Output: Ranked documents with relevance scores
+**Warning**: Not use production!.
     """,
     version="1.0.0",
     lifespan=lifespan
     if not request.documents:
         raise HTTPException(400, "Documents list cannot be empty")
     valid_docs = [(i, doc.strip()) for i, doc in enumerate(request.documents) if doc.strip()]
     if not valid_docs:
         raise HTTPException(400, "No valid documents found after filtering empty strings")
     try:
         start_time = time.time()
         model = model_manager.get_model(request.model_id)
         original_indices, documents = zip(*valid_docs)
+        logger.info(f"Query: {request.query.strip()}")
+        logger.info(f"Document: {list(documents)}")
         scores = model.rerank(
             query=request.query.strip(),
             documents=list(documents),
             instruction=request.instruction
         )
         results = []
         for i, (orig_idx, doc, score) in enumerate(zip(original_indices, documents, scores)):
             results.append(RerankResult(
                 index=orig_idx
             ))
         results.sort(key=lambda x: x.score, reverse=True)
         if request.top_k:
             results = results[:request.top_k]
             "status": "error",
             "error": str(e)
         }
@app.get("/")
async def root():
    """Landing endpoint: return a greeting that points to /docs, plus the API version."""
    payload = {
        "message": "Welcome to the Multi-Model Reranking API. Visit /docs for API documentation.",
        "version": "1.0.0",
    }
    return payload

config.yaml ADDED Viewed

	@@ -0,0 +1,28 @@

+# Model configuration for ModelManager
+# You can add or modify model entries as needed
+models:
+  jina-reranker-v2:
+    model_name: jinaai/jina-reranker-v2-base-multilingual
+    model_type: sentence_transformers
+    description: |
+      The Jina Reranker v2 (jina-reranker-v2-base-multilingual) is a transformer-based model that has been fine-tuned for text reranking task, which is a crucial component in many information retrieval systems. It is a cross-encoder model that takes a query and a document pair as input and outputs a score indicating the relevance of the document to the query. The model is trained on a large dataset of query-document pairs and is capable of reranking documents in multiple languages with high accuracy.
+    languages: ["multilingual"]
+    repository: https://huggingface.co/jinaai/jina-reranker-v2-base-multilingual
+  bge-reranker-v2:
+    model_name: BAAI/bge-reranker-v2-m3
+    model_type: sentence_transformers
+    description: |
+      Unlike an embedding model, a reranker takes a question and a document as input and directly outputs a similarity score instead of an embedding. You can get a relevance score by passing a query and a passage to the reranker, and the score can be mapped to a float value in [0, 1] with a sigmoid function.
+    languages: ["multilingual"]
+    repository: https://huggingface.co/BAAI/bge-reranker-v2-m3
+  qwen3-reranker:
+    model_name: Qwen/Qwen3-Reranker-0.6B
+    model_type: qwen
+    description: |
+      The Qwen3 Embedding model series is the latest proprietary model of the Qwen family, specifically designed for text embedding and ranking tasks. Building upon the dense foundational models of the Qwen3 series, it provides a comprehensive range of text embeddings and reranking models in various sizes (0.6B, 4B, and 8B). This series inherits the exceptional multilingual capabilities, long-text understanding, and reasoning skills of its foundational model. The Qwen3 Embedding series represents significant advancements in multiple text embedding and ranking tasks, including text retrieval, code retrieval, text classification, text clustering, and bitext mining.
+    languages: ["multilingual"]
+    repository: https://huggingface.co/Qwen/Qwen3-Reranker-0.6B
+default_model: bge-reranker-v2

core/__init__.py ADDED Viewed

	@@ -0,0 +1,3 @@


1	+ from .model_manager import ModelManager
2	+
3	+ __all__ = ["ModelManager"]

core/base.py ADDED Viewed

	@@ -0,0 +1,21 @@

+from typing import List, Optional
class RerankerModel:
    """
    Common interface shared by every reranking backend.

    The base class only records identifying metadata and starts out in the
    "not loaded" state; concrete backends override ``load`` and ``rerank``.
    """

    def __init__(self, model_id: str, model_name: str, model_type: str):
        """Record the model's identity; no model or tokenizer is created here."""
        # Identifying metadata supplied by the caller.
        self.model_id, self.model_name, self.model_type = model_id, model_name, model_type
        # Runtime handles stay empty until a subclass's load() fills them in.
        self.model = self.tokenizer = None
        self.loaded = False

    def load(self):
        """Load the underlying model. Subclasses must override this."""
        raise NotImplementedError

    def rerank(self, query: str, documents: List[str], instruction: Optional[str] = None) -> List[float]:
        """Score ``documents`` against ``query``. Subclasses must override this."""
        raise NotImplementedError

core/cross_encoder.py ADDED Viewed

	@@ -0,0 +1,239 @@

+import torch
+from typing import List, Optional
+from loguru import logger
+from sentence_transformers import CrossEncoder
+from transformers import AutoTokenizer, AutoModelForCausalLM
+from .base import RerankerModel
class SentenceTransformersReranker(RerankerModel):
    """
    Cross-encoder reranker built on the sentence-transformers library.

    ``load`` constructs a ``CrossEncoder`` from ``self.model_name`` and
    ``rerank`` returns one relevance score per input document, positioned
    to match the order of the input list.

    Attributes:
        model_name (str): Name or path of the model to load.
        model (CrossEncoder): The encoder instance, set by ``load``.
        loaded (bool): True once ``load`` has completed successfully.
        model_id (str): Identifier used in log and error messages.
    """

    def load(self):
        """
        Construct the CrossEncoder for ``self.model_name`` and mark the model as loaded.

        Raises:
            Exception: Any failure while building the encoder is logged and re-raised.
        """
        try:
            logger.info(f"Loading SentenceTransformers model: {self.model_name}")
            encoder_options = {
                "model_kwargs": {"torch_dtype": "auto"},
                # NOTE(review): trust_remote_code allows the model repository to
                # run its own Python code - confirm every configured repo is trusted.
                "trust_remote_code": True,
            }
            self.model = CrossEncoder(self.model_name, **encoder_options)
            self.loaded = True
            logger.success(f"Successfully loaded {self.model_id}")
        except Exception as load_error:
            logger.error(f"Failed to load {self.model_id}: {load_error}")
            raise

    def rerank(self, query: str, documents: List[str], instruction: Optional[str] = None) -> List[float]:
        """
        Score every document against the query with the CrossEncoder.

        Args:
            query (str): The search query string.
            documents (List[str]): Documents to score.
            instruction (Optional[str]): Accepted for interface compatibility; ignored here.

        Returns:
            List[float]: One score per document, at the same position as the
            document in ``documents``. Positions for which the encoder reports
            no entry keep the initial value 0.0.

        Raises:
            RuntimeError: If ``load`` has not completed successfully.
            Exception: Any failure during scoring is logged and re-raised.
        """
        if not self.loaded:
            raise RuntimeError(f"Model {self.model_id} not loaded")

        try:
            ranked_entries = self.model.rank(query, documents, convert_to_tensor=True)
            # Each entry carries a 'corpus_id' (used here as the index into
            # `documents`) and a 'score'; scatter the scores back to input order.
            scores_by_position = [0.0] * len(documents)
            for entry in ranked_entries:
                scores_by_position[entry['corpus_id']] = float(entry['score'])
            return scores_by_position
        except Exception as rerank_error:
            logger.error(f"Reranking failed with {self.model_id}: {rerank_error}")
            raise
class QwenReranker(RerankerModel):
    """
    Reranker using Qwen3-Reranker model (LLM-based).

    Every (instruction, query, document) triple is rendered into a chat-style
    prompt. The score of a document is the probability of the "yes" token
    relative to the "no" token at the last position of the model's output.

    Attributes:
        model_name (str): Name or path of the Qwen model.
        tokenizer (AutoTokenizer): Tokenizer for the Qwen model, set by ``load``.
        model (AutoModelForCausalLM): Loaded Qwen model instance, set by ``load``.
        loaded (bool): Whether the model has been loaded.
        model_id (str): Identifier used in log and error messages.
        token_false_id (int): Token ID for "no".
        token_true_id (int): Token ID for "yes".
        max_length (int): Maximum input token length (prefix and suffix included).
        prefix (str): Prompt prefix holding the system message and the start of the user turn.
        suffix (str): Prompt suffix that closes the user turn and opens the assistant turn.
        prefix_tokens (List[int]): Tokenized prefix.
        suffix_tokens (List[int]): Tokenized suffix.
    """

    def load(self):
        """
        Load the Qwen tokenizer and model, then prepare prompt templates and token IDs.

        Raises:
            Exception: Any failure while loading is logged and re-raised.
        """
        try:
            logger.info(f"Loading Qwen model: {self.model_name}")
            # Left padding keeps each sequence's final real token at the last
            # position, which is where _compute_logits reads the logits.
            self.tokenizer = AutoTokenizer.from_pretrained(
                self.model_name,
                padding_side='left'
            )
            self.model = AutoModelForCausalLM.from_pretrained(
                self.model_name
            ).eval()

            # Token IDs whose logits are compared to derive the relevance score.
            self.token_false_id = self.tokenizer.convert_tokens_to_ids("no")
            self.token_true_id = self.tokenizer.convert_tokens_to_ids("yes")
            self.max_length = 8192

            # Fixed prompt scaffolding wrapped around every formatted pair.
            self.prefix = "<|im_start|>system\nJudge whether the Document meets the requirements based on the Query and the Instruct provided. Note that the answer can only be \"yes\" or \"no\".<|im_end|>\n<|im_start|>user\n"
            self.suffix = "<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n"
            # Tokenize the scaffolding once so it can be spliced in as token IDs.
            self.prefix_tokens = self.tokenizer.encode(self.prefix, add_special_tokens=False)
            self.suffix_tokens = self.tokenizer.encode(self.suffix, add_special_tokens=False)

            self.loaded = True
            logger.success(f"Successfully loaded {self.model_id}")
        except Exception as e:
            logger.error(f"Failed to load {self.model_id}: {e}")
            raise

    def _format_instruction(self, instruction: Optional[str], query: str, doc: str) -> str:
        """
        Render one instruction/query/document triple as the user-turn text.

        Args:
            instruction (Optional[str]): Task instruction; when None, a default
                web-search instruction is used.
            query (str): The search query string.
            doc (str): The document to be evaluated.

        Returns:
            str: Text of the form "<Instruct>: ...\\n<Query>: ...\\n<Document>: ...".
        """
        if instruction is None:
            instruction = 'Given a web search query, retrieve relevant passages that answer the query'
        return "<Instruct>: {instruction}\n<Query>: {query}\n<Document>: {doc}".format(
            instruction=instruction, query=query, doc=doc
        )

    def _process_inputs(self, pairs: List[str]):
        """
        Tokenize the formatted prompts, wrap them in the prefix/suffix tokens, and pad.

        Args:
            pairs (List[str]): Formatted prompt strings, one per document.

        Returns:
            The padded tokenizer output with every tensor moved to the model's device.
        """
        # Reserve room for the prefix/suffix so the final sequence fits in max_length.
        inputs = self.tokenizer(
            pairs,
            padding=False,
            truncation='longest_first',
            return_attention_mask=False,
            max_length=self.max_length - len(self.prefix_tokens) - len(self.suffix_tokens)
        )
        for i, ele in enumerate(inputs['input_ids']):
            inputs['input_ids'][i] = self.prefix_tokens + ele + self.suffix_tokens
        # Pad only after splicing, so padding reflects the final sequence lengths.
        inputs = self.tokenizer.pad(
            inputs,
            padding=True,
            return_tensors="pt",
            max_length=self.max_length
        )
        for key in inputs:
            inputs[key] = inputs[key].to(self.model.device)
        return inputs

    @torch.no_grad()
    def _compute_logits(self, inputs):
        """
        Turn the model's last-position logits into "yes" probabilities.

        Args:
            inputs: Padded input tensors as produced by ``_process_inputs``.

        Returns:
            List[float]: For each sequence, the softmax probability of "yes"
            computed over just the two logits for "no" and "yes".
        """
        batch_scores = self.model(**inputs).logits[:, -1, :]
        true_vector = batch_scores[:, self.token_true_id]
        false_vector = batch_scores[:, self.token_false_id]
        # Column 0 = "no", column 1 = "yes"; normalise over these two only.
        batch_scores = torch.stack([false_vector, true_vector], dim=1)
        batch_scores = torch.nn.functional.log_softmax(batch_scores, dim=1)
        scores = batch_scores[:, 1].exp().tolist()
        return scores

    def rerank(self, query: str, documents: List[str], instruction: Optional[str] = None) -> List[float]:
        """
        Score every document against the query with the Qwen model.

        Args:
            query (str): The search query string.
            documents (List[str]): Documents to score.
            instruction (Optional[str]): Task instruction; a default is used when None.

        Returns:
            List[float]: One score per document, in input order. An empty
            ``documents`` list yields an empty list.

        Raises:
            RuntimeError: If ``load`` has not completed successfully.
            Exception: Any failure during scoring is logged and re-raised.
        """
        if not self.loaded:
            raise RuntimeError(f"Model {self.model_id} not loaded")
        if not documents:
            # Nothing to score; avoid handing an empty batch to the tokenizer and model.
            return []

        try:
            pairs = [
                self._format_instruction(instruction, query, doc)
                for doc in documents
            ]
            inputs = self._process_inputs(pairs)
            scores = self._compute_logits(inputs)
            return scores
        except Exception as e:
            logger.error(f"Reranking failed with {self.model_id}: {e}")
            raise

core/model_manager.py ADDED Viewed

	@@ -0,0 +1,137 @@

+import yaml
+from typing import List, Dict, Any
+from loguru import logger
+from .base import RerankerModel
+from .cross_encoder import SentenceTransformersReranker, QwenReranker
+class ModelManager:
+    """
+    Manager for reranking models with preloading and configuration.
+    This class loads model configurations from a YAML file (default: config.yaml),
+    instantiates and manages multiple reranker models, and provides methods to preload,
+    retrieve, and list the available models. Supports a default model if model_id is not provided.
+    Attributes:
+        models (Dict[str, RerankerModel]): Dictionary of loaded model instances keyed by model ID.
+        model_configs (Dict[str, Dict[str, Any]]): Model configuration loaded from YAML file.
+        default_model_id (str): The default model ID to use if none is provided.
+    """
+    def __init__(self, config_path: str = 'config.yaml'):
+        """
+        Initialize the ModelManager and load model configurations from a YAML file.
+        Args:
+            config_path (str): Path to the YAML configuration file. Defaults to 'config.yaml'.
+        Side Effects:
+            Loads model configuration into self.model_configs.
+            Initializes an empty dictionary for loaded models.
+            Sets the default model ID from config.
+        """
+        self.models: Dict[str, RerankerModel] = {}
+        try:
+            with open(config_path, 'r') as f:
+                config_data = yaml.safe_load(f)
+            self.model_configs = config_data.get('models', {})
+            self.default_model_id = config_data.get('default_model')
+            logger.info(f"Loaded model configs from {config_path}")
+        except Exception as e:
+            logger.error(f"Failed to load config.yaml: {e}")
+            self.model_configs = {}
+            self.default_model_id = None
+    async def preload_all_models(self):
+        """
+        Preload all models defined in the configuration file.
+        Iterates through all model configurations, instantiates the appropriate reranker class
+        (SentenceTransformersReranker or QwenReranker), loads the model, and stores it in self.models.
+        Logs the status of each model load and a summary at the end.
+        Raises:
+            Exception: If a model fails to load, logs the error and continues with the next model.
+        """
+        logger.info(f"Starting preload of {len(self.model_configs)} reranking models...")
+        for model_id, config in self.model_configs.items():
+            try:
+                logger.info(f"Loading {model_id}...")
+                if config["model_type"] == "sentence_transformers":
+                    model = SentenceTransformersReranker(
+                        model_id=model_id,
+                        model_name=config["model_name"],
+                        model_type=config["model_type"]
+                    )
+                elif config["model_type"] == "qwen":
+                    model = QwenReranker(
+                        model_id=model_id,
+                        model_name=config["model_name"],
+                        model_type=config["model_type"]
+                    )
+                else:
+                    logger.error(f"Unknown model type: {config['model_type']}")
+                    continue
+                model.load()
+                self.models[model_id] = model
+                logger.success(f"Successfully preloaded {model_id}")
+            except Exception as e:
+                logger.error(f"Failed to preload {model_id}: {e}")
+        loaded_count = len([m for m in self.models.values() if m.loaded])
+        logger.success(f"Preloaded {loaded_count}/{len(self.model_configs)} models successfully")
+    def get_model(self, model_id: str = None) -> RerankerModel:
+        """
+        Retrieve a loaded model instance by its ID, or use the default model if not specified.
+        Args:
+            model_id (str, optional): The unique identifier of the model to retrieve. If None, uses the default model.
+        Returns:
+            RerankerModel: The loaded reranker model instance.
+        Raises:
+            ValueError: If the model is not found or not loaded.
+        """
+        if model_id is None:
+            if not self.default_model_id:
+                raise ValueError("No model_id provided and no default_model set in config.yaml")
+            model_id = self.default_model_id
+        if model_id not in self.models:
+            raise ValueError(f"Model {model_id} not found")
+        model = self.models[model_id]
+        if not model.loaded:
+            raise ValueError(f"Model {model_id} not loaded")
+        return model
+    def list_models(self) -> List[Dict[str, Any]]:
+        """
+        List all available models with their configuration and load status.
+        Returns:
+            List[Dict[str, Any]]: A list of dictionaries, each containing model ID, name, type, description, and loaded status.
+        """
+        models_info = []
+        for model_id, config in self.model_configs.items():
+            model = self.models.get(model_id)
+            info = {
+                "id": model_id,
+                "name": config.get("model_name"),
+                "type": config.get("model_type"),
+                "language": config.get("language"),
+                "description": config.get("description"),
+                "repository": config.get("repository"),
+                "loaded": model.loaded if model else False
+            }
+            models_info.append(info)
+        return models_info

models/__init__.py ADDED Viewed

	@@ -0,0 +1,4 @@


1	+ from .model import RerankRequest, RerankResponse, RerankResult
2	+
3	+
4	+ __all__ = ["RerankRequest", "RerankResponse", "RerankResult"]

models/model.py ADDED Viewed

	@@ -0,0 +1,52 @@

+from typing import List, Optional
+from pydantic import BaseModel, Field
+class RerankRequest(BaseModel):
+    """
+    Request model for document reranking.
+    Attributes:
+        query: The search query
+        documents: List of documents to rerank
+        model_id: Identifier of the reranking model to use
+        instruction: Optional instruction for instruction-based models
+        top_k: Maximum number of documents to return (optional)
+    """
+    query: str = Field(..., description="Search query text")
+    documents: List[str] = Field(..., min_items=1, description="List of documents to rerank")
+    model_id: Optional[str] = Field(..., description="Model identifier for reranking")
+    instruction: Optional[str] = Field(None, description="Optional instruction for reranking task")
+    top_k: Optional[int] = Field(None, description="Maximum number of results to return")
+class RerankResult(BaseModel):
+    """
+    Single reranking result; one entry of RerankResponse.results.
+    Attributes:
+        text: The document text
+        score: Relevance score from the reranking model (scale is model-dependent
+            and not defined here)
+        index: Original index of the document in input list
+    """
+    text: str
+    score: float
+    index: int
+class RerankResponse(BaseModel):
+    """
+    Response model for document reranking.
+    Attributes:
+        results: List of reranked documents with scores
+        query: The original search query
+        model_id: Identifier of the model used
+        processing_time: Time taken to process the request (unit is not specified
+            here; presumably seconds, confirm against the code that fills it in)
+        total_documents: Total number of input documents
+        returned_documents: Number of documents returned
+    """
+    results: List[RerankResult]
+    query: str
+    # Always a concrete string here, unlike the nullable model_id on RerankRequest.
+    model_id: str
+    processing_time: float
+    total_documents: int
+    returned_documents: int