Spaces:
Configuration error
Configuration error
| from sentence_transformers import CrossEncoder, SentenceTransformer | |
| from transformers import AutoTokenizer, AutoModelForCausalLM | |
| import faiss | |
| import numpy as np | |
| from typing import List, Dict | |
class ArabicRAGSystem:
    """Retrieval-augmented question answering over Arabic documents.

    Pipeline: dense retrieval with a sentence-embedding model and a FAISS
    L2 index, re-ranking of the retrieved candidates with a cross-encoder,
    then answer generation with a causal language model that is prompted
    with the top-ranked passages.

    Each document is a dict that must provide the keys ``"text"``,
    ``"source"`` and ``"page"``.
    """

    def __init__(self):
        """Load the embedding, re-ranking and generation models."""
        self.embedding_model = SentenceTransformer("aubmindlab/bert-base-arabertv2")
        self.cross_encoder = CrossEncoder("Arabic-Misc/roberta-base-arabic-camelbert-da-msa")
        # NOTE(review): some hub models ship custom modeling code and need
        # trust_remote_code=True to load -- confirm for this checkpoint.
        self.tokenizer = AutoTokenizer.from_pretrained("inception-mbzuai/jais-13b-chat")
        self.llm = AutoModelForCausalLM.from_pretrained("inception-mbzuai/jais-13b-chat")
        # The index dimension must equal the embedding model's output size
        # (presumably 768 for the BERT-base encoder above -- confirm if the
        # embedding model is ever changed).
        self.index = faiss.IndexFlatL2(768)

    def _create_index(self, documents: List[Dict]):
        """Rebuild the FAISS index from the ``"text"`` field of *documents*.

        The index is cleared first so that row ``i`` of the index always
        corresponds to ``documents[i]``, even when this is called repeatedly.
        """
        self.index.reset()
        texts = [doc["text"] for doc in documents]
        if not texts:
            return
        # FAISS expects a 2-D float32 matrix.
        embeddings = np.asarray(self.embedding_model.encode(texts), dtype=np.float32)
        self.index.add(embeddings)

    def _retrieve(self, question: str, documents: List[Dict], top_k: int) -> list:
        """Return up to *top_k* ``(document_index, rerank_score)`` pairs, best first."""
        query_embedding = np.asarray(
            self.embedding_model.encode([question]), dtype=np.float32
        )
        # Over-fetch 2x for the re-ranker, but never ask for more neighbours
        # than there are documents: FAISS pads missing results with -1, which
        # would otherwise silently select documents[-1].
        n_candidates = min(top_k * 2, len(documents))
        _, indices = self.index.search(query_embedding, n_candidates)
        candidate_ids = [int(idx) for idx in indices[0] if idx >= 0]

        pairs = [[question, documents[idx]["text"]] for idx in candidate_ids]
        scores = self.cross_encoder.predict(pairs)
        # `scores` is aligned with `candidate_ids` (positions), not with
        # document indices, so both lookups below go through the position.
        best_positions = np.argsort(scores)[::-1][:top_k]
        return [(candidate_ids[pos], float(scores[pos])) for pos in best_positions]

    @staticmethod
    def _build_prompt(context: str, question: str) -> str:
        """Assemble the Arabic instruction prompt around *context* and *question*."""
        return "\n".join([
            "أنت خبير في التحليل الديني. قم بالإجابة على السؤال التالي بناءً على السياق المقدم فقط:",
            "السياق:",
            f"{context}",
            "السؤال:",
            f"{question}",
            "التعليمات:",
            "- أجب باللغة العربية الفصحى",
            "- استخدم علامات التنسيق المناسبة",
            "- أشر إلى المصادر باستخدام التنسيق [المصدر: اسم الملف، الصفحة: رقم]",
            '- إذا لم توجد إجابة واضحة، قل "لا تتوفر معلومات كافية"',
            "الإجابة:",
        ]).strip()

    def generate_answer(self, question: str, documents: List[Dict],
                        top_k: int = 5, temperature: float = 0.7) -> tuple:
        """Answer *question* using only the passages in *documents*.

        Args:
            question: The user question.
            documents: Dicts with ``"text"``, ``"source"`` and ``"page"`` keys.
            top_k: Maximum number of passages given to the language model.
            temperature: Sampling temperature passed to ``generate``.

        Returns:
            ``(answer, sources)`` where ``sources`` is a list of dicts with
            the keys ``"text"``, ``"source"``, ``"page"`` and ``"score"``
            (the cross-encoder score), ordered best first.

        Raises:
            ValueError: If *documents* is empty or *top_k* is less than 1.
        """
        if not documents:
            raise ValueError("documents must not be empty")
        if top_k < 1:
            raise ValueError("top_k must be at least 1")

        # Indexing phase
        self._create_index(documents)

        # Two-stage retrieval: dense search followed by cross-encoder re-ranking
        ranked = self._retrieve(question, documents, top_k)

        # Prepare context
        context = "\n\n".join(
            f"المصدر: {documents[idx]['source']}\n"
            f"الصفحة: {documents[idx]['page']}\n"
            f"النص: {documents[idx]['text']}"
            for idx, _ in ranked
        )

        # Generate answer
        prompt = self._build_prompt(context, question)
        inputs = self.tokenizer(prompt, return_tensors="pt")
        outputs = self.llm.generate(
            inputs.input_ids,
            max_new_tokens=512,
            temperature=temperature,
            do_sample=True,
            pad_token_id=self.tokenizer.eos_token_id,
        )
        # The output sequence starts with the prompt tokens; decode only the
        # continuation so the answer is not cut short when the model itself
        # emits the answer marker again.
        prompt_length = inputs.input_ids.shape[-1]
        answer = self.tokenizer.decode(
            outputs[0][prompt_length:], skip_special_tokens=True
        ).strip()

        # Prepare sources
        sources = [
            {
                "text": documents[idx]["text"],
                "source": documents[idx]["source"],
                "page": documents[idx]["page"],
                "score": score,
            }
            for idx, score in ranked
        ]
        return answer, sources