import json
import warnings
import re
import os

import google.generativeai as genai
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.schema import Document
from langchain.prompts import PromptTemplate
import gradio as gr

# Suppress warnings for cleaner output
warnings.filterwarnings("ignore")


class ZarmaLanguageAnalyzer:
    def __init__(self, grammar_path: str, glossary_path: str):
        """
        Initialize the Zarma Language Analyzer with grammar rules and glossary.
        Optimized for CPU usage on Hugging Face Spaces.
        """
        print("Running on CPU for Hugging Face Spaces.")
        self.grammar_rules = self._load_json(grammar_path).get("grammar_rules", [])
        self.glossary_data = self._load_json(glossary_path)
        self._setup_models()
        self._setup_vectorstore()

    def _load_json(self, file_path: str):
        """Load and parse a JSON file (a dict or a list, depending on the file)."""
        with open(file_path, 'r', encoding='utf-8') as f:
            return json.load(f)

    def _setup_models(self):
        """Set up the Gemini-2.0-flash model via the Google Generative AI API."""
        # Get API key from environment variable
        api_key = os.getenv("GOOGLE_API_KEY")
        if not api_key:
            raise ValueError("GOOGLE_API_KEY environment variable not set.")
        genai.configure(api_key=api_key)
        self.model = genai.GenerativeModel("gemini-2.0-flash")

        self.analysis_template = PromptTemplate(
            input_variables=["sentence", "grammar_check", "glossary_info", "language"],
            template="""
You are a Zarma language expert. Analyze this Zarma sentence: "{sentence}"

Rely primarily on your expertise in Zarma grammar and meaning. Recognize proper nouns (e.g., names or places) as such unless the glossary explicitly contradicts this with a common Zarma meaning. Use the grammar check and glossary below as supplementary aids only—do not override your knowledge unless they provide clear, contextually relevant insight.

Grammar check results (optional guide):
{grammar_check}

Glossary information (use it but prioritize your expertise to confirm):
{glossary_info}

Provide a detailed linguistic analysis in {language} in this exact format, with no extra text outside the sections:

1. WORD BREAKDOWN:
   - [List each word with its grammatical role and meaning, e.g., "Ay: 1st person singular pronoun, meaning 'I'."]

2. LINGUISTIC INSIGHT:
   - Word Order: [Describe typical Zarma word order (e.g., SOV, SVO) and how this sentence aligns or deviates]
   - Tense/Aspect Markers: [Explain tense/aspect markers like 'ga', 'goono ga', or none for past, with examples like "Ay ga koy" (I will go)]
   - Contextual Insight: [Discuss what the sentence might intend to convey and any external influences or errors]

3. CORRECTNESS ASSESSMENT:
   - Is the sentence correct? [Yes/No, with explanation]
   - Reason for Incorrectness (if applicable): [Detailed reason why it’s wrong, e.g., misplaced particle]
   - Corrections (depending on intended meaning):
     - [Option 1: Corrected sentence with explanation]
     - [Option 2: Corrected sentence with explanation]
     - [Option 3: Corrected sentence with explanation]
"""
        )
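    # Illustrative glossary.json entry shape (an assumption inferred from the
    # "fr"/"dje"/"notes" lookups used throughout this class, not a documented schema):
    #   [
    #     {"fr": "aller", "dje": "koy", "notes": "verb of motion"},
    #     {"fr": "demain", "dje": "suba", "notes": "time adverb"}
    #   ]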
    def _setup_vectorstore(self):
        """Set up a FAISS vector store over the glossary for retrieval."""
        embed_model = HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-MiniLM-L6-v2",
            model_kwargs={"device": "cpu"}  # Force CPU usage
        )
        documents = []
        for entry in self.glossary_data:
            fr_word = entry.get("fr", "")
            dje_word = entry.get("dje", "")
            notes = entry.get("notes", "No additional context available")
            content = f"French: {fr_word}\nDjerma: {dje_word}\nNotes: {notes}"
            metadata = {"fr": fr_word, "dje": dje_word, "notes": notes}
            documents.append(Document(page_content=content, metadata=metadata))
        self.vectorstore = FAISS.from_documents(documents, embed_model)

    def check_grammar(self, sentence: str) -> list:
        """Check whether the sentence violates any grammar rules."""
        issues = []
        for rule in self.grammar_rules:
            rule_id = rule.get("rule_id", "")
            category = rule.get("category", "")
            subcategory = rule.get("subcategory", "")
            description = rule.get("description", "")
            examples = rule.get("examples", [])
            for example in examples:
                wrong_phrase = example.get("zarma", "")
                corrected_phrase = example.get("corrected_zarma", "")
                english_example = example.get("english", "")
                if wrong_phrase and wrong_phrase in sentence:
                    explanation = (
                        f"This rule applies because '{wrong_phrase}' doesn't follow {category} norms in Zarma. "
                        f"Specifically, it violates rules related to {subcategory}. "
                        f"The correct form would be '{corrected_phrase or 'unknown'}'. "
                        f"In English, this is similar to: '{english_example}'"
                    )
                    issues.append({
                        "rule_id": rule_id,
                        "category": category,
                        "subcategory": subcategory,
                        "description": description,
                        "wrong_phrase": wrong_phrase,
                        "corrected_phrase": corrected_phrase,
                        "english_example": english_example,
                        "explanation": explanation
                    })
        return issues

    def translate_and_explain_words(self, sentence: str) -> dict:
        """Break the sentence into words and find glossary entries."""
        words = sentence.split()
        word_info = {}
        retrieved_context = []
        for word in words:
            clean_word = word.strip(".,!?;:()\"'")
            if not clean_word:
                continue
            exact_match = None
            for entry in self.glossary_data:
                if (entry.get("dje", "").lower() == clean_word.lower()
                        or entry.get("fr", "").lower() == clean_word.lower()):
                    exact_match = entry
                    break
            if exact_match:
                fr_word = exact_match.get("fr", "")
                dje_word = exact_match.get("dje", "")
                notes = exact_match.get("notes", "No additional context available")
                word_info[clean_word] = {
                    "french": fr_word,
                    "djerma": dje_word,
                    "notes": notes,
                    "match_type": "exact"
                }
                context_entry = f"Word: {clean_word}\nFrench: {fr_word}\nDjerma: {dje_word}\nNotes: {notes}"
                if context_entry not in retrieved_context:
                    retrieved_context.append(context_entry)
            else:
                search_results = self.vectorstore.similarity_search(clean_word, k=1)
                if search_results:
                    result = search_results[0]
                    metadata = result.metadata
                    word_info[clean_word] = {
                        "french": metadata.get("fr", ""),
                        "djerma": metadata.get("dje", ""),
                        "notes": metadata.get("notes", "No additional context available"),
                        "match_type": "semantic"
                    }
                    context_entry = (
                        f"Word: {clean_word}\nFrench: {metadata.get('fr', '')}\n"
                        f"Djerma: {metadata.get('dje', '')}\n"
                        f"Notes: {metadata.get('notes', 'No additional context available')}"
                    )
                    if context_entry not in retrieved_context:
                        retrieved_context.append(context_entry)
        sentence_results = self.vectorstore.similarity_search(sentence, k=5)
        for result in sentence_results:
            context_entry = result.page_content
            if context_entry not in retrieved_context:
                retrieved_context.append(context_entry)
        top_contexts = retrieved_context[:3]
        return {"word_info": word_info, "retrieved_context": top_contexts}
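    # Illustrative return value of translate_and_explain_words (hypothetical data,
    # assuming the glossary entry sketched above, not output from a real run):
    #   {"word_info": {"koy": {"french": "aller", "djerma": "koy",
    #                          "notes": "verb of motion", "match_type": "exact"}},
    #    "retrieved_context": ["Word: koy\nFrench: aller\nDjerma: koy\nNotes: verb of motion"]}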
    def format_grammar_issues(self, issues: list) -> str:
        """Format grammar issues for display."""
        if not issues:
            return "No grammar issues detected."
        result = "Grammar Issues Detected:\n\n"
        for i, issue in enumerate(issues, 1):
            result += f"Issue {i}:\n"
            result += f"Rule ID: {issue.get('rule_id', '')}\n"
            result += f"Category: {issue.get('category', '')}\n"
            result += f"Subcategory: {issue.get('subcategory', '')}\n"
            result += f"Description: {issue.get('description', '')}\n"
            result += f"Wrong phrase: '{issue.get('wrong_phrase', '')}'\n"
            result += f"Corrected phrase: '{issue.get('corrected_phrase', '')}'\n"
            result += f"English example: {issue.get('english_example', '')}\n"
            result += f"Explanation: {issue.get('explanation', '')}\n\n"
        return result

    def format_glossary_info(self, glossary_results: dict) -> str:
        """Format glossary information for model input."""
        word_info = glossary_results.get("word_info", {})
        if not word_info:
            return "No glossary matches found for words in the sentence."
        result = "Glossary information:\n\n"
        for word, info in word_info.items():
            result += f"Word: {word}\n"
            result += f"French: {info.get('french', '')}\n"
            result += f"Djerma: {info.get('djerma', '')}\n"
            result += f"Notes: {info.get('notes', '')}\n\n"
        return result

    def filter_reliable_context(self, glossary_results: dict, analysis_result: str) -> list:
        """Keep only the glossary context entries that Gemini's analysis actually mentions."""
        retrieved_context = glossary_results.get("retrieved_context", [])
        analysis_lower = analysis_result.lower()
        reliable_context = []
        for context in retrieved_context:
            # The first line is either "Word: ..." or "French: ..."; guard against
            # malformed or empty entries so indexing the split can't raise.
            parts = context.split("\n")[0].split(": ", 1)
            if len(parts) < 2 or not parts[1]:
                continue
            word = parts[1].lower()
            if word in analysis_lower:
                reliable_context.append(context)
        return reliable_context[:3]

    def extract_analysis(self, raw_output: str) -> str:
        """Extract the three numbered analysis sections from the raw model output."""
        pattern = (
            r"(1\. WORD BREAKDOWN:\s*-\s*.+?)"
            + r"(2\. LINGUISTIC INSIGHT:\s*-\s*Word Order:\s*.+?)"
            + r"(3\. CORRECTNESS ASSESSMENT:\s*-\s*Is the sentence correct\?.+?)(?=\n\n|$)"
        )
        match = re.search(pattern, raw_output, re.DOTALL)
        if match:
            return match.group(1) + "\n" + match.group(2) + "\n" + match.group(3)
        # Fallback skeleton when the model response doesn't match the expected format
        return (
            "1. WORD BREAKDOWN:\n"
            "   - Analysis incomplete due to model limitations.\n\n"
            "2. LINGUISTIC INSIGHT:\n"
            "   - Word Order: Analysis incomplete.\n"
            "   - Tense/Aspect Markers: Analysis incomplete.\n"
            "   - Contextual Insight: Analysis incomplete.\n\n"
            "3. CORRECTNESS ASSESSMENT:\n"
            "   - Is the sentence correct? Unknown due to model limitations.\n"
            "   - Reason for Incorrectness (if applicable): Unknown.\n"
            "   - Corrections: None provided."
        )
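    # The regex above requires the three numbered headings verbatim; a minimal
    # accepted reply would look like this (illustrative only):
    #   1. WORD BREAKDOWN:
    #      - Ay: 1st person singular pronoun, meaning 'I'.
    #   2. LINGUISTIC INSIGHT:
    #      - Word Order: Zarma is typically SOV; this sentence conforms.
    #   3. CORRECTNESS ASSESSMENT:
    #      - Is the sentence correct? Yes.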
    def analyze_sentence(self, sentence: str, lang: str = "en") -> dict:
        """Full analysis pipeline for a Zarma sentence using Gemini-2.0-flash."""
        grammar_issues = self.check_grammar(sentence)
        formatted_grammar = self.format_grammar_issues(grammar_issues)
        glossary_results = self.translate_and_explain_words(sentence)
        formatted_glossary = self.format_glossary_info(glossary_results)
        language = "English" if lang == "en" else "French"
        prompt = self.analysis_template.format(
            sentence=sentence,
            grammar_check=formatted_grammar,
            glossary_info=formatted_glossary,
            language=language
        )
        raw_analysis = ""
        try:
            response = self.model.generate_content(prompt)
            raw_analysis = response.text
        except Exception as e:
            raw_analysis = f"Error in analysis generation: {str(e)}"
        analysis_result = self.extract_analysis(raw_analysis)
        reliable_context = self.filter_reliable_context(glossary_results, analysis_result)
        return {
            "sentence": sentence,
            "grammar_issues": grammar_issues,
            "formatted_grammar": formatted_grammar,
            "analysis_result": analysis_result,
            "retrieved_context": reliable_context
        }

    def format_output(self, results: dict, lang: str = "en") -> str:
        """Format the analysis results for Gradio output in the selected language."""
        if lang == "fr":
            output = "=" * 80 + "\n"
            output += "ANALYSEUR DE LANGUE ZARMA\n"
            output += "=" * 80 + "\n\n"
            output += f"Phrase Analysée: \"{results['sentence']}\"\n"
            output += f"État de la Grammaire: {'Problèmes détectés' if results['grammar_issues'] else 'Correct'}\n\n"
            output += "Analyse Détaillée:\n"
            output += "-" * 80 + "\n"
            output += results['analysis_result'] + "\n\n"
            output += "Sources de Contexte Fiables:\n"
            output += "-" * 80 + "\n"
            if results["retrieved_context"]:
                for i, context in enumerate(results["retrieved_context"], 1):
                    output += f"Source {i}:\n{context}\n\n"
            else:
                output += "Aucune source de contexte fiable récupérée basée sur l'analyse.\n"
            output += "=" * 80
        else:  # Default to English
            output = "=" * 80 + "\n"
            output += "ZARMA LANGUAGE ANALYZER\n"
            output += "=" * 80 + "\n\n"
            output += f"Sentence Analyzed: \"{results['sentence']}\"\n"
            output += f"Grammar Status: {'Issues detected' if results['grammar_issues'] else 'Correct'}\n\n"
            output += "Detailed Analysis:\n"
            output += "-" * 80 + "\n"
            output += results['analysis_result'] + "\n\n"
            output += "Reliable Context Sources:\n"
            output += "-" * 80 + "\n"
            if results["retrieved_context"]:
                for i, context in enumerate(results["retrieved_context"], 1):
                    output += f"Source {i}:\n{context}\n\n"
            else:
                output += "No reliable context sources retrieved based on the analysis.\n"
            output += "=" * 80
        return output


# Initialize the analyzer (adjust paths to match your Hugging Face Space structure)
analyzer = ZarmaLanguageAnalyzer("grammar_rules.json", "glossary.json")


# Gradio interface
def analyze_zarma_sentence(sentence, output_in_english):
    if not sentence.strip():
        return ("Please enter a valid Zarma sentence." if output_in_english
                else "Veuillez entrer une phrase Zarma valide.")
    lang = "en" if output_in_english else "fr"
    results = analyzer.analyze_sentence(sentence, lang=lang)
    return analyzer.format_output(results, lang=lang)
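# Quick sanity check without the UI (a sketch; assumes GOOGLE_API_KEY is set
# and both JSON files are present in the working directory). Uncomment to run:
#   print(analyze_zarma_sentence("Ay ga koy.", True))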
# Define the Gradio UI
with gr.Blocks(title="Zarma Language Analyzer") as demo:
    gr.Markdown("# Zarma Language Analyzer")
    gr.Markdown("Enter a Zarma sentence below to analyze its grammar and meaning.")
    sentence_input = gr.Textbox(label="Zarma Sentence", placeholder="e.g., Ay ga koy.")
    language_checkbox = gr.Checkbox(label="Output in English (uncheck for French)", value=True)
    analyze_button = gr.Button("Analyze")
    output_text = gr.Textbox(label="Analysis Result", lines=20)
    analyze_button.click(
        fn=analyze_zarma_sentence,
        inputs=[sentence_input, language_checkbox],
        outputs=output_text
    )

# Launch the app (note: share=True is ignored with a warning on Hugging Face Spaces;
# it only matters when running locally)
demo.launch(share=True)
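# Dependencies this script assumes (an unpinned sketch; pin versions as needed):
#   pip install google-generativeai langchain langchain-community \
#       faiss-cpu sentence-transformers gradio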