tahamueed23 committed on
Commit 0780c88 · verified · 1 Parent(s): 6974eeb

Update app.py

Files changed (1)
  1. app.py +72 -244
app.py CHANGED
@@ -13,18 +13,18 @@ english_model = pipeline(
     model="siebert/sentiment-roberta-large-english"
 )
 
-# same model but we'll ensemble results for Roman+Urdu
 urdu_model = pipeline(
     "sentiment-analysis",
     model="tahamueed23/fine_tuned_cardiffnlp_urdu_and_roman-urdu"
 )
+
 roman_urdu_model = pipeline(
     "sentiment-analysis",
     model="tahamueed23/fine_tuned_cardiffnlp_urdu_and_roman-urdu"
 )
 
 # -----------------------------
-# CSV Setup (persistent)
+# CSV Setup
 # -----------------------------
 SAVE_FILE = "sentiment_logs.csv"
 LOCK_FILE = SAVE_FILE + ".lock"
@@ -37,200 +37,34 @@ if not os.path.exists(SAVE_FILE):
 # -----------------------------
 # Improved Language Detection
 # -----------------------------
-roman_urdu_keywords = {
-    # General Feedback Tone
-    "acha", "bohot_acha", "bhot_acha", "bahut_acha", "bura", "theek", "behtareen", "zabardast", "umda", "ghalit", "galat",
-    "accha", "awesome", "perfect", "kamzor", "behtar", "sahi", "ghalat", "faida", "nuksan",
-
-    # Study / Performance / Behavior
-    "parhai", "parhayi", "parhta", "parhti", "parhne", "parho", "assignment", "homework", "test", "imtihaan", "grade",
-    "result", "mehnat", "kaam", "performance", "focus", "dhyaan", "attendance", "class", "lecture",
-    "samajh", "samajhna", "samjhaya", "samajh_aya", "nahi_samajh_aya", "barhta", "seekhna", "seekh", "seekh_rha", "seekh_rhi",
-    "kaam_chor", "mehnati", "active", "lazy", "shararti", "tawajjo", "discipline", "behavior",
-
-    # Teacher / Student Relationship
-    "ustad", "teacher", "sir", "madam", "miss", "meray_ustad", "respect", "izzat", "ikhtiyar",
-    "mohabbat", "pyar", "taluq", "taaluq", "thoda", "ziyada", "kam", "bohot", "acha_sulook",
-
-    # Feedback Expressions
-    "kyun", "kese", "kaisa", "kaisi", "kyu", "hain", "hai", "tha", "thi", "the", "hoga", "hogaya", "hogi",
-    "karna", "karta", "kartay", "karti", "karne", "kerna", "hoza", "hona", "hota", "hotay", "hoti", "hona_chahiye",
-    "try", "koshish", "koshish_karna", "lagataar", "barhawa", "improve", "improvement", "masla", "problem", "issue",
-
-    # Emotion / Reaction Words
-    "khushi", "dukh", "tension", "fikr", "relax", "comfortable", "confidence", "yaqeen", "jazba", "motivation",
-    "interest", "boriyat", "thakan", "ghussa", "naraz", "khush", "preshan", "shukriya",
-
-    # School / Class Words
-    "school", "college", "university", "classroom", "class_fellow", "principal", "registration", "semester", "assignment_submit",
-    "presentation", "group_work", "project", "notebook", "copy", "kitab", "pencil", "pen", "bag",
-
-    # Time / Experience
-    "aaj", "kal", "kal_tak", "pehle", "baad_mein", "hamesha", "roz", "rozana", "abi", "abhi", "der", "jaldi",
-    "guzra", "raftar", "barh_gayi", "kam_hogi",
-
-    # Misc useful connectors
-    "mera", "meri", "mere", "tera", "teri", "tum", "aap", "hum", "wo", "yahan", "wahan", "ka", "ki", "ke",
-    "se", "tak", "par", "liye", "bhi", "magar", "lekin", "aur"
-}
-
-
 def detect_language(text):
-    urdu_chars = set("ابتثجحخدذرزسشصضطظعغفقکلمنوہیءآؤئۀ")
-    clean = re.sub(r"[^A-Za-z\u0600-\u06FF]+", " ", text)
-    # rule 1: actual Urdu characters
-    if any(ch in urdu_chars for ch in clean):
+    urdu_script = re.compile(r"[\u0600-\u06FF]")
+    if urdu_script.search(text):
         return "Urdu"
 
-    # rule 2: roman urdu keyword ratio
-    tokens = clean.lower().split()
-    roman_hits = sum(w in roman_urdu_keywords for w in tokens)
-    if roman_hits / max(len(tokens), 1) > 0.2 or roman_hits > 0:
-        return "Roman Urdu"
+    roman_urdu_patterns = [
+        r"\b(hai|hain|tha|thi|parhta|parhai|acha|bura|bohot|zabardast)\b",
+        r"\b(sir|madam|ustad|class|parh|samajh)\b",
+    ]
+
+    text_l = text.lower()
+    for p in roman_urdu_patterns:
+        if re.search(p, text_l):
+            return "Roman Urdu"
 
     return "English"
 
 # -----------------------------
 # Roman Urdu Normalization
 # -----------------------------
-
 def normalize_roman_urdu(text):
-    replacements = {
-        # Common "acha/bura" & sentiment phrases
-        "acha ni": "acha nahi",
-        "acha nai": "acha nahi",
-        "acha hy": "acha hai",
-        "acha h": "acha hai",
-        "accha hy": "acha hai",
-        "achha hy": "acha hai",
-        "bura hy": "bura hai",
-        "bura h": "bura hai",
-        "bohot acha": "bohot acha",
-        "bohat acha": "bohot acha",
-        "boht acha": "bohot acha",
-        "zabrdast": "zabardast",
-        "zabardst": "zabardast",
-        "thek": "theek",
-        "thik": "theek",
-
-        # Negation variations
-        "ni": "nahi",
-        "nai": "nahi",
-        "nehi": "nahi",
-        "nahe": "nahi",
-        "nae": "nahi",
-        "nhe": "nahi",
-        "nhi": "nahi",
-
-        # Auxiliary verbs
-        "hy": "hai",
-        "h": "hai",
-        "haii": "hai",
-        "ha": "hai",
-        "hh": "hai",
-        "hu": "hu",
-        "hun": "hoon",
-        "hn": "hain",
-        "hainn": "hain",
-        "hyn": "hain",
-
-        # Pronoun & possessive normalizations
-        "mera": "mera",
-        "meri": "meri",
-        "mere": "mere",
-        "tera": "tera",
-        "teri": "teri",
-        "tumhara": "tumhara",
-        "apna": "apna",
-        "aapka": "aapka",
-
-        # Common teacher/student terms
-        "ustad": "ustad",
-        "ustaad": "ustad",
-        "ostad": "ustad",
-        "ostaad": "ustad",
-        "teacher": "teacher",
-        "sir": "sir",
-        "madam": "madam",
-        "miss": "madam",
-        "student": "student",
-        "talib e ilm": "talib_e_ilm",
-
-        # Study/learning phrases
-        "parhai": "parhai",
-        "parhayi": "parhai",
-        "parhne": "parhne",
-        "parhta": "parhta",
-        "parhti": "parhti",
-        "parhny": "parhne",
-        "parho": "parho",
-        "seekhta": "seekhta",
-        "seekhti": "seekhti",
-        "seekh rha": "seekh raha",
-        "seekh rhi": "seekh rahi",
-
-        # Effort/performance
-        "mehnat kr": "mehnat kar",
-        "mehnat kro": "mehnat karo",
-        "mehnat karna": "mehnat karna",
-        "kaam kr": "kaam kar",
-        "kaam kro": "kaam karo",
-        "koshish kr": "koshish kar",
-        "koshish kro": "koshish karo",
-        "improve kr": "improve kar",
-        "improve kro": "improve karo",
-
-        # Time/experience
-        "aj": "aaj",
-        "kal": "kal",
-        "kl": "kal",
-        "pehly": "pehle",
-        "bad me": "baad mein",
-        "abhi tk": "abhi tak",
-
-        # Common expressions
-        "shukriya": "shukriya",
-        "thanks": "thanks",
-        "thanku": "thankyou",
-        "thanx": "thankyou",
-        "plz": "please",
-        "pls": "please",
-        "okey": "ok",
-        "okk": "ok",
-        "oky": "ok",
-
-        # Misheard or alternate forms
-        "acha lagta": "acha lagta",
-        "bura lagta": "bura lagta",
-        "samjh ni aya": "samajh nahi aya",
-        "samjh nai aya": "samajh nahi aya",
-        "samjh nh aya": "samajh nahi aya",
-        "smjh ni aya": "samajh nahi aya",
-        "smjh gya": "samajh gaya",
-        "smjh gayi": "samajh gayi",
-
-        # Short common fixes
-        "kr": "kar",
-        "kro": "karo",
-        "krta": "karta",
-        "krti": "karti",
-        "kra": "kara",
-        "kia": "kiya",
-        "kiya tha": "kiya tha",
-        "ki thi": "ki thi",
-        "krna": "karna",
-        "krne": "karne",
-        "krny": "karne",
-    }
-
-    for k, v in replacements.items():
-        text = re.sub(rf"\b{k}\b", v, text, flags=re.IGNORECASE)
-
+    text = text.lower()
+    text = text.replace("hy", "hai").replace("h", "hai")
+    text = re.sub(r"\bnhi\b|\bnai\b|\bnhi\b", "nahi", text)
     return text
 
-
 # -----------------------------
-# Label Normalization
+# Normalize Labels
 # -----------------------------
 def normalize_label(label):
     label = label.lower()
@@ -242,78 +76,73 @@ def normalize_label(label):
     return "Neutral"
 
 # -----------------------------
-# Add Emojis + Tips
+# Polarity Explanation
 # -----------------------------
-def sentiment_with_tips(sentiment):
-    tips = {
-        "Positive": "😊 Great! Keep spreading positivity.",
-        "Negative": "😞 Looks negative — maybe reflect and improve things.",
-        "Neutral": "😐 Neutral observation balanced view."
+def polarity_explanation(text, sentiment):
+    explanations = {
+        "Positive": "Contains praise words or positive evaluation.",
+        "Negative": "Contains criticism or negative expressions.",
+        "Neutral": "Factual statement or balanced observation."
     }
-    return tips.get(sentiment, "")
+    return explanations.get(sentiment, "")
 
 # -----------------------------
-# Neutral Adjuster (Urdu/Descriptive)
-# -----------------------------
-def adjust_for_neutral(text, sentiment, score):
-    neutral_triggers = ["ہورہی ہے", "ہو رہی ہے", "ہے", "tha", "thi"]
-    if sentiment != "Neutral" and any(p in text for p in neutral_triggers):
-        if score < 0.9:  # descriptive statements, low emotional intensity
-            return "Neutral", 0.7
-    return sentiment, score
-
-# -----------------------------
-# Combine Roman Urdu & Urdu Models (Ensemble)
+# Ensemble Roman Urdu + Urdu
 # -----------------------------
 def ensemble_roman_urdu(text):
     ru = roman_urdu_model(text)[0]
     ur = urdu_model(text)[0]
+
     ru_sent, ur_sent = normalize_label(ru["label"]), normalize_label(ur["label"])
+
     if ru_sent == ur_sent:
-        result = ru if ru["score"] >= ur["score"] else ur
-    else:
-        result = ru if ru["score"] * 0.9 >= ur["score"] else ur
-    return result
+        return ru if ru["score"] >= ur["score"] else ur
+
+    # Weight Roman Urdu higher for Roman Urdu input
+    weight_ru = ru["score"] * 1.25
+    weight_ur = ur["score"]
+    return ru if weight_ru >= weight_ur else ur
 
 # -----------------------------
-# Main Sentiment Function
+# Adjust sentiment if low intensity
 # -----------------------------
-def analyze_sentiment(text, lang_hint):
-    try:
-        if not text.strip():
-            return "⚠️ Please enter a sentence.", "", "", SAVE_FILE
+def adjust_for_neutral(text, sentiment, score):
+    if sentiment in ["Positive", "Negative"] and score < 0.7:
+        return "Neutral", score
+    return sentiment, score
 
-        # auto detect if needed
-        lang = lang_hint if lang_hint != "Auto Detect" else detect_language(text)
+# -----------------------------
+# Main Analysis Function
+# -----------------------------
+def analyze_sentiment(text, lang_hint):
+    if not text.strip():
+        return "⚠️ Please enter a sentence.", "", "", SAVE_FILE
 
-        # select & possibly normalize
-        if lang == "English":
-            result = english_model(text)[0]
-        elif lang == "Urdu":
-            result = urdu_model(text)[0]
-        else:  # Roman Urdu
-            text = normalize_roman_urdu(text)
-            result = ensemble_roman_urdu(text)
+    lang = lang_hint if lang_hint != "Auto Detect" else detect_language(text)
 
-        # get normalized sentiment
-        sentiment = normalize_label(result["label"])
-        score = round(float(result["score"]), 3)
-        sentiment, score = adjust_for_neutral(text, sentiment, score)
-        explanation = sentiment_with_tips(sentiment)
+    if lang == "English":
+        result = english_model(text)[0]
+    elif lang == "Urdu":
+        result = urdu_model(text)[0]
+    else:
+        text = normalize_roman_urdu(text)
+        result = ensemble_roman_urdu(text)
 
-        # store results (thread-safe)
-        with FileLock(LOCK_FILE):
-            df = pd.read_csv(SAVE_FILE, encoding="utf-8-sig") \
-                if os.path.exists(SAVE_FILE) else pd.DataFrame(columns=["Sentence", "Language", "Sentiment", "Confidence"])
-            new_row = pd.DataFrame([[text, lang, sentiment, score]],
-                                   columns=["Sentence", "Language", "Sentiment", "Confidence"])
-            df = pd.concat([df, new_row], ignore_index=True)
-            df.to_csv(SAVE_FILE, index=False, encoding="utf-8-sig")
+    sentiment = normalize_label(result["label"])
+    score = round(float(result["score"]), 3)
+    sentiment, score = adjust_for_neutral(text, sentiment, score)
+    explanation = polarity_explanation(text, sentiment)
 
-        return sentiment, str(score), explanation, SAVE_FILE
+    # Save logs
+    with FileLock(LOCK_FILE):
+        df = pd.read_csv(SAVE_FILE, encoding="utf-8-sig") \
+            if os.path.exists(SAVE_FILE) else pd.DataFrame(columns=["Sentence", "Language", "Sentiment", "Confidence"])
+        new_row = pd.DataFrame([[text, lang, sentiment, score]],
+                               columns=["Sentence", "Language", "Sentiment", "Confidence"])
+        df = pd.concat([df, new_row], ignore_index=True)
        df.to_csv(SAVE_FILE, index=False, encoding="utf-8-sig")
 
-    except Exception as e:
-        return f"⚠️ Error: {str(e)}", "", "", SAVE_FILE
+    return sentiment, str(score), explanation, SAVE_FILE
 
 # -----------------------------
 # Show Logs
@@ -331,13 +160,12 @@ with gr.Blocks() as demo:
     gr.Markdown(
         "## 🌍 Multilingual Sentiment Analysis (English • Urdu • Roman Urdu)\n"
         "Detect **Positive**, **Negative**, or **Neutral** tone with confidence score.\n\n"
-        "🪶 **Improvements:** refined Urdu/Roman Urdu detection, better Roman Urdu normalization, ensemble correction, and neutral balancing.\n\n"
-        "💾 All analyzed text is stored permanently in the same CSV, even across shared sessions."
+        "🪶 Improved Roman Urdu normalization + ensemble + polarity explanation.\n"
     )
 
     with gr.Row():
         with gr.Column():
-            user_text = gr.Textbox(label="✍️ Enter text", placeholder="Type in English, Urdu, or RomanUrdu...")
+            user_text = gr.Textbox(label="✍️ Enter text", placeholder="Type English, Urdu, or Roman Urdu...")
             lang_dropdown = gr.Dropdown(
                 ["Auto Detect", "English", "Urdu", "Roman Urdu"],
                 value="Auto Detect", label="🌐 Language"
@@ -347,13 +175,13 @@ with gr.Blocks() as demo:
 
         with gr.Column():
             out_sent = gr.Textbox(label="Sentiment")
-            out_conf = gr.Textbox(label="Confidence(0–1)")
-            out_exp = gr.Textbox(label="Explanation")
-            out_file = gr.File(label="⬇️ DownloadLogs(.csv)", type="filepath")
+            out_conf = gr.Textbox(label="Confidence (0–1)")
+            out_exp = gr.Textbox(label="Polarity Explanation")
+            out_file = gr.File(label="⬇️ Download Logs (.csv)", type="filepath")
 
     logs_df = gr.Dataframe(
         headers=["Sentence", "Language", "Sentiment", "Confidence"],
-        label="🧾SentimentLogs", interactive=False
+        label="🧾 Sentiment Logs", interactive=False
     )
 
     btn_analyze.click(analyze_sentiment,
@@ -363,4 +191,4 @@ with gr.Blocks() as demo:
     btn_show.click(show_logs, outputs=[logs_df])
 
 if __name__ == "__main__":
-    demo.launch()
+    demo.launch()
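
Below is a minimal, self-contained sketch (not part of the commit) for sanity-checking the new rule-based helpers without loading any models. It copies the committed detect_language and normalize_roman_urdu as-is, and adds a hypothetical word-boundary variant, since the blanket .replace("h", "hai") in the committed version also rewrites the "h" inside longer words:

import re

# Copied from the new app.py: Urdu script check first, then Roman Urdu keyword patterns.
def detect_language(text):
    urdu_script = re.compile(r"[\u0600-\u06FF]")
    if urdu_script.search(text):
        return "Urdu"
    roman_urdu_patterns = [
        r"\b(hai|hain|tha|thi|parhta|parhai|acha|bura|bohot|zabardast)\b",
        r"\b(sir|madam|ustad|class|parh|samajh)\b",
    ]
    text_l = text.lower()
    for p in roman_urdu_patterns:
        if re.search(p, text_l):
            return "Roman Urdu"
    return "English"

# Copied from the new app.py. The second replace rewrites every "h", even inside words
# ("acha" becomes "achaia"), which is probably broader than the normalization intends.
def normalize_roman_urdu(text):
    text = text.lower()
    text = text.replace("hy", "hai").replace("h", "hai")
    text = re.sub(r"\bnhi\b|\bnai\b|\bnhi\b", "nahi", text)
    return text

# Hypothetical word-boundary variant (NOT in the commit): only standalone tokens change.
def normalize_roman_urdu_boundary(text):
    text = text.lower()
    text = re.sub(r"\bhy\b|\bh\b", "hai", text)
    text = re.sub(r"\bnhi\b|\bnai\b", "nahi", text)
    return text

if __name__ == "__main__":
    sample = "Ustad ka parhai ka tareeqa acha hy"
    print(detect_language(sample))                # Roman Urdu
    print(normalize_roman_urdu(sample))           # extra "hai" fragments appear inside words
    print(normalize_roman_urdu_boundary(sample))  # ustad ka parhai ka tareeqa acha hai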