Spaces:

27Group
/

Zarma_Language_Analyzer

Sleeping

App Files Files Community

Mamadou2727 committed on Apr 10

Commit

d9eb428

1 Parent(s): 7a90a1a

upload the files

Browse files

Files changed (4) hide show

app.py +357 -0
glossary.json +0 -0
grammar_rules.json +231 -0
requirements.txt +7 -0

app.py ADDED Viewed

	@@ -0,0 +1,357 @@

+import json
+import warnings
+import re
+import os
+from google import genai
+from google.genai import types
+from langchain.schema import Document
+from langchain.vectorstores import FAISS
+from langchain.embeddings import HuggingFaceEmbeddings
+from langchain.prompts import PromptTemplate
+import gradio as gr
+# Suppress warnings for cleaner output
+warnings.filterwarnings("ignore")
class ZarmaLanguageAnalyzer:
    """Analyze Zarma sentences with rule matching, a glossary and Gemini.

    Three sources are combined: substring matching against the examples of a
    grammar-rules JSON file, exact and embedding-based lookups in a
    French/Zarma glossary, and a Gemini prompt that receives both of the
    above as supporting material.
    """

    # Placeholder used when a glossary entry carries no "notes" field.
    _DEFAULT_NOTES = "No additional context available"

    def __init__(self, grammar_path: str, glossary_path: str):
        """
        Initialize the Zarma Language Analyzer with grammar rules and glossary.
        Optimized for CPU usage on Hugging Face Spaces.

        Args:
            grammar_path: Path to a JSON file whose top-level object holds a
                "grammar_rules" list.
            glossary_path: Path to a JSON file with the glossary entries
                (iterated as dicts with "fr", "dje" and "notes" keys).

        Raises:
            ValueError: If the GOOGLE_API_KEY environment variable is unset.
        """
        print("Running on CPU for Hugging Face Spaces.")
        self.grammar_rules = self._load_json(grammar_path).get("grammar_rules", [])
        self.glossary_data = self._load_json(glossary_path)
        self._setup_models()
        self._setup_vectorstore()

    def _load_json(self, file_path: str) -> dict:
        """Load and parse a UTF-8 encoded JSON file."""
        with open(file_path, 'r', encoding='utf-8') as f:
            return json.load(f)

    def _setup_models(self) -> None:
        """Set up the Gemini-2.0-Flash client and the analysis prompt template.

        Raises:
            ValueError: If the GOOGLE_API_KEY environment variable is unset.
        """
        # The API key is read from the environment so it never lives in code.
        api_key = os.getenv("GOOGLE_API_KEY")
        if not api_key:
            raise ValueError("GOOGLE_API_KEY environment variable not set.")
        self.client = genai.Client(api_key=api_key)
        self.model = "gemini-2.0-flash"
        # The section headings below are what extract_analysis() searches for;
        # keep the two in sync when editing the template.
        self.analysis_template = PromptTemplate(
            input_variables=["sentence", "grammar_check", "glossary_info"],
            template="""
            You are a Zarma language expert. Analyze this Zarma sentence: "{sentence}"
            Rely primarily on your expertise in Zarma grammar and meaning. Recognize proper nouns (e.g., names or places) as such unless the glossary explicitly contradicts this with a common Zarma meaning. Use the grammar check and glossary below as supplementary aids only—do not override your knowledge unless they provide clear, contextually relevant insight.
            Grammar check results (optional guide):
            {grammar_check}
            Glossary information (use it but prioritize your expertise to confirm):
            {glossary_info}
            Provide a detailed linguistic analysis in this exact format, with no extra text outside the sections:
            1. WORD BREAKDOWN:
               - [List each word with its grammatical role and meaning, e.g., "Ay: 1st person singular pronoun, meaning 'I'."]
            2. LINGUISTIC INSIGHT:
               - Word Order: [Describe typical Zarma word order (e.g., SOV, SVO) and how this sentence aligns or deviates]
               - Tense/Aspect Markers: [Explain tense/aspect markers like 'ga', 'goono ga', or none for past, with examples like "Ay ga koy" (I will go)]
               - Contextual Insight: [Discuss what the sentence might intend to convey and any external influences or errors]
            3. CORRECTNESS ASSESSMENT:
               - Is the sentence correct? [Yes/No, with explanation]
               - Reason for Incorrectness (if applicable): [Detailed reason why it’s wrong, e.g., misplaced particle]
               - Corrections (depending on intended meaning):
                  - [Option 1: Corrected sentence with explanation]
                  - [Option 2: Corrected sentence with explanation]
                  - [Option 3: Corrected sentence with explanation]
            """
        )

    def _setup_vectorstore(self) -> None:
        """Embed every glossary entry and index it in a FAISS vector store."""
        embed_model = HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-MiniLM-L6-v2",
            model_kwargs={"device": "cpu"}  # Force CPU usage
        )
        documents = []
        for entry in self.glossary_data:
            fr_word = entry.get("fr", "")
            dje_word = entry.get("dje", "")
            notes = entry.get("notes", self._DEFAULT_NOTES)
            # The same fields go into the embedded text and into the metadata
            # so search hits can be read back without re-parsing page_content.
            documents.append(Document(
                page_content=f"French: {fr_word}\nDjerma: {dje_word}\nNotes: {notes}",
                metadata={"fr": fr_word, "dje": dje_word, "notes": notes},
            ))
        self.vectorstore = FAISS.from_documents(documents, embed_model)

    def check_grammar(self, sentence: str) -> list:
        """Report every rule example whose "zarma" text occurs in the sentence.

        Matching is a case-sensitive substring test.

        NOTE(review): each matching example is reported as a *violation*, with
        the example text as the "wrong phrase". That only makes sense if the
        rules file lists incorrect forms alongside a "corrected_zarma" value;
        confirm the rules file follows that convention.

        Args:
            sentence: The sentence to check.

        Returns:
            One dict per matching example with the keys rule_id, category,
            subcategory, description, wrong_phrase, corrected_phrase,
            english_example and explanation.
        """
        issues = []
        for rule in self.grammar_rules:
            rule_id = rule.get("rule_id", "")
            category = rule.get("category", "")
            subcategory = rule.get("subcategory", "")
            description = rule.get("description", "")
            for example in rule.get("examples", []):
                wrong_phrase = example.get("zarma", "")
                if not wrong_phrase or wrong_phrase not in sentence:
                    continue
                corrected_phrase = example.get("corrected_zarma", "")
                english_example = example.get("english", "")
                explanation = (
                    f"This rule applies because '{wrong_phrase}' doesn't follow {category} norms in Zarma. "
                    f"Specifically, it violates rules related to {subcategory}. "
                    f"The correct form would be '{corrected_phrase or 'unknown'}'. "
                    f"In English, this is similar to: '{english_example}'"
                )
                issues.append({
                    "rule_id": rule_id,
                    "category": category,
                    "subcategory": subcategory,
                    "description": description,
                    "wrong_phrase": wrong_phrase,
                    "corrected_phrase": corrected_phrase,
                    "english_example": english_example,
                    "explanation": explanation
                })
        return issues

    def translate_and_explain_words(self, sentence: str) -> dict:
        """Look up every word of the sentence in the glossary.

        Each whitespace-separated word is stripped of surrounding punctuation
        and matched case-insensitively against the "dje" and "fr" fields. When
        no exact match exists, the single nearest vector-store hit is used and
        marked as a "semantic" match.

        Args:
            sentence: The sentence to break down.

        Returns:
            A dict with "word_info" (word -> french/djerma/notes/match_type)
            and "retrieved_context" (at most three context strings; per-word
            contexts come first, then whole-sentence search hits).
        """
        word_info = {}
        retrieved_context = []
        for word in sentence.split():
            clean_word = word.strip(".,!?;:()\"'")
            if not clean_word:
                continue
            needle = clean_word.lower()
            source = next(
                (
                    entry for entry in self.glossary_data
                    if entry.get("dje", "").lower() == needle
                    or entry.get("fr", "").lower() == needle
                ),
                None,
            )
            match_type = "exact"
            if source is None:
                hits = self.vectorstore.similarity_search(clean_word, k=1)
                if not hits:
                    continue
                source = hits[0].metadata
                match_type = "semantic"
            # Always read the fields from the matched record itself, never
            # from a leftover loop variable.
            fr_word = source.get("fr", "")
            dje_word = source.get("dje", "")
            notes = source.get("notes", self._DEFAULT_NOTES)
            word_info[clean_word] = {
                "french": fr_word,
                "djerma": dje_word,
                "notes": notes,
                "match_type": match_type
            }
            context_entry = f"Word: {clean_word}\nFrench: {fr_word}\nDjerma: {dje_word}\nNotes: {notes}"
            if context_entry not in retrieved_context:
                retrieved_context.append(context_entry)
        # Whole-sentence search adds glossary entries related to the sentence
        # as a unit; these are only kept if fewer than three per-word contexts
        # were collected above.
        for result in self.vectorstore.similarity_search(sentence, k=5):
            if result.page_content not in retrieved_context:
                retrieved_context.append(result.page_content)
        return {"word_info": word_info, "retrieved_context": retrieved_context[:3]}

    def format_grammar_issues(self, issues: list) -> str:
        """Render the issues from check_grammar() as plain text.

        Returns:
            "No grammar issues detected." for an empty list, otherwise a
            header followed by one numbered paragraph per issue.
        """
        if not issues:
            return "No grammar issues detected."
        paragraphs = [
            f"Issue {number}:\n"
            f"Rule ID: {issue.get('rule_id', '')}\n"
            f"Category: {issue.get('category', '')}\n"
            f"Subcategory: {issue.get('subcategory', '')}\n"
            f"Description: {issue.get('description', '')}\n"
            f"Wrong phrase: '{issue.get('wrong_phrase', '')}'\n"
            f"Corrected phrase: '{issue.get('corrected_phrase', '')}'\n"
            f"English example: {issue.get('english_example', '')}\n"
            f"Explanation: {issue.get('explanation', '')}\n\n"
            for number, issue in enumerate(issues, 1)
        ]
        return "Grammar Issues Detected:\n\n" + "".join(paragraphs)

    def format_glossary_info(self, glossary_results: dict) -> str:
        """Render the "word_info" part of a glossary lookup for the prompt.

        Args:
            glossary_results: The dict returned by
                translate_and_explain_words().

        Returns:
            A fixed message when no word was matched, otherwise a header
            followed by one paragraph per word.
        """
        word_info = glossary_results.get("word_info", {})
        if not word_info:
            return "No glossary matches found for words in the sentence."
        paragraphs = [
            f"Word: {word}\n"
            f"French: {info.get('french', '')}\n"
            f"Djerma: {info.get('djerma', '')}\n"
            f"Notes: {info.get('notes', '')}\n\n"
            for word, info in word_info.items()
        ]
        return "Glossary information:\n\n" + "".join(paragraphs)

    def filter_reliable_context(self, glossary_results: dict, analysis_result: str) -> list:
        """Keep only the contexts whose headword appears in the analysis text.

        The headword is whatever follows the first ": " on the first line of a
        context string ("Word: ..." for per-word contexts, "French: ..." for
        whole-sentence search hits). Contexts without such a value are
        skipped: an empty headword is a substring of every text and would
        otherwise always count as confirmed.

        Args:
            glossary_results: The dict returned by
                translate_and_explain_words().
            analysis_result: The analysis text to search, case-insensitively.

        Returns:
            At most three context strings, in their original order.
        """
        analysis_lower = analysis_result.lower()
        reliable_context = []
        for context in glossary_results.get("retrieved_context", []):
            first_line = context.split("\n", 1)[0]
            _, separator, headword = first_line.partition(": ")
            headword = headword.lower()
            if not separator or not headword.strip():
                continue
            if headword in analysis_lower:
                reliable_context.append(context)
        return reliable_context[:3]

    def extract_analysis(self, raw_output: str) -> str:
        """Pull the three numbered sections out of the raw model output.

        The third section is cut at the first blank line (or at the end of
        the text). When the expected headings are not all present, a fixed
        "analysis incomplete" report is returned instead.
        """
        pattern = (
            r"(1\. WORD BREAKDOWN:\s*-\s*.+?)"
            r"(2\. LINGUISTIC INSIGHT:\s*-\s*Word Order:\s*.+?)"
            r"(3\. CORRECTNESS ASSESSMENT:\s*-\s*Is the sentence correct\?.+?)(?=\n\n|$)"
        )
        match = re.search(pattern, raw_output, re.DOTALL)
        if match:
            return "\n".join(match.group(1, 2, 3))
        return (
            "1. WORD BREAKDOWN:\n"
            "   - Analysis incomplete due to model limitations.\n\n"
            "2. LINGUISTIC INSIGHT:\n"
            "   - Word Order: Analysis incomplete.\n"
            "   - Tense/Aspect Markers: Analysis incomplete.\n"
            "   - Contextual Insight: Analysis incomplete.\n\n"
            "3. CORRECTNESS ASSESSMENT:\n"
            "   - Is the sentence correct? Unknown due to model limitations.\n"
            "   - Reason for Incorrectness (if applicable): Unknown.\n"
            "   - Corrections: None provided."
        )

    def analyze_sentence(self, sentence: str) -> dict:
        """Full analysis pipeline for a Zarma sentence using Gemini-2.0-Flash.

        Runs the rule check and the glossary lookup, sends both to the model
        inside the analysis prompt, extracts the structured sections from the
        reply and keeps only glossary contexts the reply mentions.

        Args:
            sentence: The sentence to analyze.

        Returns:
            A dict with the keys sentence, grammar_issues, formatted_grammar,
            analysis_result and retrieved_context.
        """
        grammar_issues = self.check_grammar(sentence)
        formatted_grammar = self.format_grammar_issues(grammar_issues)
        glossary_results = self.translate_and_explain_words(sentence)
        formatted_glossary = self.format_glossary_info(glossary_results)
        prompt = self.analysis_template.format(
            sentence=sentence,
            grammar_check=formatted_grammar,
            glossary_info=formatted_glossary
        )
        contents = [
            types.Content(
                role="user",
                parts=[types.Part.from_text(text=prompt)],
            ),
        ]
        generate_content_config = types.GenerateContentConfig(
            response_mime_type="text/plain",
        )
        pieces = []
        try:
            for chunk in self.client.models.generate_content_stream(
                model=self.model,
                contents=contents,
                config=generate_content_config,
            ):
                # A streamed chunk may carry no text; skipping it keeps the
                # text received so far instead of failing on str + None.
                if chunk.text:
                    pieces.append(chunk.text)
            raw_analysis = "".join(pieces)
        except Exception as e:
            # Best-effort boundary: any API failure becomes a message that
            # extract_analysis() turns into the "incomplete" report.
            raw_analysis = f"Error in analysis generation: {str(e)}"
        analysis_result = self.extract_analysis(raw_analysis)
        reliable_context = self.filter_reliable_context(glossary_results, analysis_result)
        return {
            "sentence": sentence,
            "grammar_issues": grammar_issues,
            "formatted_grammar": formatted_grammar,
            "analysis_result": analysis_result,
            "retrieved_context": reliable_context
        }

    def format_output(self, results: dict) -> str:
        """Format the analysis results for Gradio output.

        Args:
            results: The dict returned by analyze_sentence().

        Returns:
            A plain-text report with the sentence, the rule-check status, the
            detailed analysis and the retained context sources.
        """
        heavy_rule = "=" * 80
        light_rule = "-" * 80
        status = 'Issues detected' if results['grammar_issues'] else 'Correct'
        pieces = [
            heavy_rule + "\n",
            "ZARMA LANGUAGE ANALYZER\n",
            heavy_rule + "\n\n",
            "SENTENCE ANALYZED:\n",
            f"  \"{results['sentence']}\"\n\n",
            "GRAMMAR STATUS:\n",
            f"  {status}\n",
            light_rule + "\n",
            "DETAILED ANALYSIS:\n",
            results["analysis_result"] + "\n",
            light_rule + "\n",
            "RELIABLE CONTEXT SOURCES:\n",
        ]
        if results["retrieved_context"]:
            for i, context in enumerate(results["retrieved_context"], 1):
                pieces.append(f"Source {i}:\n")
                pieces.append(context + "\n\n")
        else:
            pieces.append("  No reliable context sources retrieved based on the analysis.\n")
        pieces.append(heavy_rule)
        return "".join(pieces)
# Build the single shared analyzer at import time. Both paths are relative, so
# they are resolved against the current working directory (adjust them to match
# your Hugging Face Space structure).
analyzer = ZarmaLanguageAnalyzer(
    grammar_path="grammar_rules.json",
    glossary_path="glossary.json",
)
+# Gradio interface
def analyze_zarma_sentence(sentence):
    """Gradio callback: analyze one sentence and return the text report.

    Args:
        sentence: The text box content; may be None or blank.

    Returns:
        A prompt to enter a sentence when the input is missing or contains
        only whitespace, otherwise the formatted analysis report.
    """
    # Guard against None as well as blank input so the callback never fails
    # on `.strip()` before reaching the analyzer.
    if not sentence or not sentence.strip():
        return "Please enter a valid Zarma sentence."
    results = analyzer.analyze_sentence(sentence)
    return analyzer.format_output(results)
# Gradio UI: a heading, one input box, one button and one result box.
with gr.Blocks(title="Zarma Language Analyzer") as demo:
    gr.Markdown("# Zarma Language Analyzer")
    gr.Markdown("Enter a Zarma sentence below to analyze its grammar and meaning.")
    sentence_input = gr.Textbox(
        label="Zarma Sentence",
        placeholder="e.g., Ay ga koy.",
    )
    analyze_button = gr.Button("Analyze")
    output_text = gr.Textbox(label="Analysis Result", lines=20)
    # Clicking the button runs the analysis on the input box content and
    # writes the returned report into the result box.
    analyze_button.click(analyze_zarma_sentence, inputs=sentence_input, outputs=output_text)

# Start the Gradio app when this script runs.
demo.launch()

glossary.json ADDED Viewed

The diff for this file is too large to render. See raw diff

grammar_rules.json ADDED Viewed

	@@ -0,0 +1,231 @@

+{
+  "grammar_rules": [
+    {
+      "rule_id": 1,
+      "category": "Pronouns",
+      "subcategory": "Personal Pronouns",
+      "description": "Personal pronouns in Zarma are invariable across nominative, objective, and possessive cases.",
+      "examples": [
+        {"zarma": "ay", "english": "I, me, my"},
+        {"zarma": "ni", "english": "you, your (singular)"},
+        {"zarma": "a (nga)", "english": "he, she, it; his, her, its"},
+        {"zarma": "iri (ir)", "english": "we, us, our"},
+        {"zarma": "araŋ", "english": "you (plural), your"},
+        {"zarma": "i (ngey, ey)", "english": "they, them, their"}
+      ]
+    },
+    {
+      "rule_id": 2,
+      "category": "Pronouns",
+      "subcategory": "Demonstrative Pronouns",
+      "description": "Demonstrative pronouns indicate specific items; 'din' suffix can be added to nouns for specificity.",
+      "examples": [
+        {"zarma": "wo", "english": "this, that"},
+        {"zarma": "wey", "english": "these, those"}
+      ]
+    },
+    {
+      "rule_id": 3,
+      "category": "Pronouns",
+      "subcategory": "Indefinite Pronouns",
+      "description": "Indefinite pronouns refer to non-specific entities.",
+      "examples": [
+        {"zarma": "boro", "english": "someone, one (person)"},
+        {"zarma": "hay kulu", "english": "everything"},
+        {"zarma": "hay fo", "english": "something"}
+      ]
+    },
+    {
+      "rule_id": 4,
+      "category": "Nouns",
+      "subcategory": "Definite Article",
+      "description": "Definite articles are expressed by adding 'a' or 'o' to the noun based on its ending.",
+      "patterns": [
+        {"ending": "a", "rule": "add 'a' (e.g., zanka → zankaa)", "exceptions": "Pre-1999 texts may not change"},
+        {"ending": "o", "rule": "change to 'a' or add 'a' (e.g., wayboro → waybora)"},
+        {"ending": "ko", "rule": "change to 'kwa' (e.g., darbayko → darbaykwa)"},
+        {"ending": "e, i, u, consonant", "rule": "change to 'o' or add 'o' (e.g., wande → wando)"},
+        {"ending": "ay", "rule": "change 'ay' to 'a' or add 'o' (e.g., farkay → farka or farkayo)"}
+      ],
+      "examples": [
+        {"zarma": "zanka → zankaa", "english": "a child → the child"},
+        {"zarma": "wayboro → waybora", "english": "a woman → the woman"},
+        {"zarma": "darbayko → darbaykwa", "english": "a fisherman → the fisherman"},
+        {"zarma": "hansi → hanso", "english": "a dog → the dog"},
+        {"zarma": "farkay → farka", "english": "a donkey → the donkey"}
+      ]
+    },
+    {
+      "rule_id": 5,
+      "category": "Nouns",
+      "subcategory": "Definite Plural",
+      "description": "Definite plural is formed by replacing the definite singular vowel with 'ey'.",
+      "pattern": "Replace final vowel with 'ey' (e.g., zankaa → zankey)",
+      "examples": [
+        {"zarma": "zankaa → zankey", "english": "the child → the children"},
+        {"zarma": "hanso → hansey", "english": "the dog → the dogs"},
+        {"zarma": "farka → farkey", "english": "the donkey → the donkeys"}
+      ]
+    },
+    {
+      "rule_id": 6,
+      "category": "Nouns",
+      "subcategory": "Indefinite Article",
+      "description": "No explicit indefinite article; 'fo' (one) is used to specify 'a certain' or 'one'.",
+      "pattern": "Add 'fo' after noun for specificity (e.g., musu → musu fo)",
+      "examples": [
+        {"zarma": "musu", "english": "a cat"},
+        {"zarma": "musu fo", "english": "a (certain) cat, one cat"}
+      ]
+    },
+    {
+      "rule_id": 7,
+      "category": "Nouns",
+      "subcategory": "Gender",
+      "description": "No grammatical gender; specific words indicate male/female for living beings.",
+      "examples": [
+        {"zarma": "alboro", "english": "man"},
+        {"zarma": "wayboro", "english": "woman"}
+      ]
+    },
+    {
+      "rule_id": 8,
+      "category": "Verbs",
+      "subcategory": "Completed Action (Past Tense)",
+      "description": "Verbs without auxiliaries indicate completed actions (past tense).",
+      "pattern": "Subject + Verb (e.g., ay neera)",
+      "examples": [
+        {"zarma": "ay neera", "english": "I sold"},
+        {"zarma": "a neera", "english": "he/she sold"},
+        {"zarma": "zankaa kani", "english": "the child went to bed"}
+      ]
+    },
+    {
+      "rule_id": 9,
+      "category": "Verbs",
+      "subcategory": "Uncompleted Action (Future Tense)",
+      "description": "Future tense uses auxiliary 'ga' before the verb.",
+      "pattern": "Subject + ga + Verb (e.g., ay ga neera)",
+      "examples": [
+        {"zarma": "ay ga neera", "english": "I will sell"},
+        {"zarma": "i ga neera", "english": "they will sell"}
+      ]
+    },
+    {
+      "rule_id": 10,
+      "category": "Verbs",
+      "subcategory": "Continuous Aspect",
+      "description": "Continuous aspect uses 'go no ga' before the verb for ongoing actions.",
+      "pattern": "Subject + go no ga + Verb (e.g., ay go no ga neera)",
+      "examples": [
+        {"zarma": "ay go no ga neera", "english": "I am selling"},
+        {"zarma": "a go no ga neera", "english": "he/she is selling"}
+      ]
+    },
+    {
+      "rule_id": 11,
+      "category": "Verbs",
+      "subcategory": "Subjunctive",
+      "description": "Subjunctive uses 'ma' to indicate possible actions.",
+      "pattern": "Subject + ma + Verb (e.g., ay ma neera)",
+      "examples": [
+        {"zarma": "ay ma neera", "english": "I should sell"},
+        {"zarma": "ni ma neera", "english": "you should sell"}
+      ]
+    },
+    {
+      "rule_id": 12,
+      "category": "Verbs",
+      "subcategory": "Imperative",
+      "description": "Imperative uses 'ma' or 'wa' before the verb, or just the verb alone.",
+      "pattern": "[Ma/Wa] + Verb or Verb alone (e.g., Ma haŋ or Haŋ)",
+      "examples": [
+        {"zarma": "Haŋ!", "english": "Drink!"},
+        {"zarma": "Ma haŋ!", "english": "Drink!"},
+        {"zarma": "Araŋ ma di!", "english": "You (plural) see!"}
+      ]
+    },
+    {
+      "rule_id": 13,
+      "category": "Verbs",
+      "subcategory": "To Be",
+      "description": "The verb 'to be' varies by context: 'go', 'ya...no', or 'ga ti'.",
+      "examples": [
+        {"zarma": "A go fu", "english": "He/she is at home"},
+        {"zarma": "Ay ya alfa no", "english": "I am a teacher"},
+        {"zarma": "Nga ga ti wayboro", "english": "She is a woman"}
+      ]
+    },
+    {
+      "rule_id": 14,
+      "category": "Verbs",
+      "subcategory": "Irregular Verbs",
+      "description": "Some verbs place objects unusually (e.g., direct object before verb without 'na').",
+      "examples": [
+        {"zarma": "Ay di a", "english": "I saw him/her"},
+        {"zarma": "A ne ay se", "english": "He/she said to me"}
+      ]
+    },
+    {
+      "rule_id": 15,
+      "category": "Adjectives",
+      "subcategory": "Qualifying Adjectives",
+      "description": "Adjectives follow the noun they modify.",
+      "pattern": "Noun + Adjective (e.g., fu beeri)",
+      "examples": [
+        {"zarma": "fu beeri", "english": "a big house"},
+        {"zarma": "hansi kayna", "english": "a small dog"}
+      ]
+    },
+    {
+      "rule_id": 16,
+      "category": "Sentence Structure",
+      "subcategory": "Basic Order",
+      "description": "Basic sentence order is Subject-Verb-Object (SVO).",
+      "pattern": "S + V + O (e.g., Ay neera bari)",
+      "examples": [
+        {"zarma": "Ay neera bari", "english": "I sold a horse"}
+      ]
+    },
+    {
+      "rule_id": 17,
+      "category": "Sentence Structure",
+      "subcategory": "Direct Object",
+      "description": "Direct object before verb requires 'na' in past positive.",
+      "pattern": "S + na + O + V (e.g., Ay na bari neera)",
+      "examples": [
+        {"zarma": "Ay na bari neera", "english": "I sold a horse"}
+      ]
+    },
+    {
+      "rule_id": 18,
+      "category": "Sentence Structure",
+      "subcategory": "Indirect Object",
+      "description": "Indirect object is marked with 'se' after the object.",
+      "pattern": "S + V + O + IO + se (e.g., Ay no bari wayboro se)",
+      "examples": [
+        {"zarma": "Ay no bari wayboro se", "english": "I gave a horse to the woman"}
+      ]
+    },
+    {
+      "rule_id": 19,
+      "category": "Negation",
+      "subcategory": "Past Negative",
+      "description": "Past negative uses 'mana' after the subject.",
+      "pattern": "S + mana + V (e.g., Ay mana neera)",
+      "examples": [
+        {"zarma": "Ay mana neera", "english": "I did not sell"}
+      ]
+    },
+    {
+      "rule_id": 20,
+      "category": "Negation",
+      "subcategory": "Present/Future Negative",
+      "description": "Present/future negative uses 'si' instead of 'ga'.",
+      "pattern": "S + si + V (e.g., Ay si neera)",
+      "examples": [
+        {"zarma": "Ay si neera", "english": "I do not/will not sell"}
+      ]
+    }
+  ]
+}

requirements.txt ADDED Viewed

	@@ -0,0 +1,7 @@

+gradio==4.44.0
+google-genai
+langchain==0.3.0
+langchain-community==0.3.0
+faiss-cpu==1.8.0
+sentence-transformers==3.1.1
+torch==2.4.1