"""Gradio app for English / Urdu / Roman Urdu sentiment analysis.

Each analyzed sentence is classified by a Hugging Face ``sentiment-analysis``
pipeline, mapped to Positive / Negative / Neutral, and appended to a CSV log
that can be downloaded or displayed from the UI.
"""

import os
import re

import gradio as gr
import pandas as pd
from filelock import FileLock
from transformers import pipeline

# -----------------------------
# Load Transformer Models
# -----------------------------
english_model = pipeline(
    "sentiment-analysis",
    model="siebert/sentiment-roberta-large-english",
)

urdu_model = pipeline(
    "sentiment-analysis",
    model="tahamueed23/fine_tuned_cardiffnlp_urdu_and_roman-urdu",
)

# Urdu and Roman Urdu use the same checkpoint, so reuse the already-loaded
# pipeline instead of loading a second copy of the same weights.
roman_urdu_model = urdu_model

# -----------------------------
# CSV Setup
# -----------------------------
SAVE_FILE = "sentiment_logs.csv"
LOCK_FILE = SAVE_FILE + ".lock"
LOG_COLUMNS = ["Sentence", "Language", "Sentiment", "Confidence"]

# Positive/Negative predictions scoring below this are reported as Neutral.
NEUTRAL_THRESHOLD = 0.7

if not os.path.exists(SAVE_FILE):
    pd.DataFrame(columns=LOG_COLUMNS).to_csv(
        SAVE_FILE, index=False, encoding="utf-8-sig"
    )


def _read_logs():
    """Return the saved log as a DataFrame, or an empty one if unavailable.

    A missing or zero-byte log file yields an empty frame with LOG_COLUMNS.
    """
    if not os.path.exists(SAVE_FILE):
        return pd.DataFrame(columns=LOG_COLUMNS)
    try:
        return pd.read_csv(SAVE_FILE, encoding="utf-8-sig")
    except pd.errors.EmptyDataError:
        # The file exists but has no header row (e.g. it was truncated).
        return pd.DataFrame(columns=LOG_COLUMNS)


# -----------------------------
# Improved Language Detection
# -----------------------------
_URDU_SCRIPT_RE = re.compile(r"[\u0600-\u06FF]")
_ROMAN_URDU_RE = re.compile(
    r"\b(?:hai|hain|tha|thi|parhta|parhai|acha|bura|bohot|zabardast"
    r"|sir|madam|ustad|class|parh|samajh)\b"
)


def detect_language(text):
    """Guess the language of *text*.

    Returns "Urdu" if any character in the U+0600-U+06FF range is present,
    "Roman Urdu" if the lowercased text contains one of the marker words,
    and "English" otherwise.
    """
    if _URDU_SCRIPT_RE.search(text):
        return "Urdu"
    if _ROMAN_URDU_RE.search(text.lower()):
        return "Roman Urdu"
    return "English"


# -----------------------------
# Roman Urdu Normalization
# -----------------------------
def normalize_roman_urdu(text):
    """Lowercase *text* and expand common Roman Urdu shorthand.

    The standalone tokens "hy" and "h" become "hai"; "nhi" and "nai" become
    "nahi". Matching is on whole words only, so the letter "h" inside other
    words (e.g. "acha") is left untouched.
    """
    text = text.lower()
    text = re.sub(r"\b(?:hy|h)\b", "hai", text)
    text = re.sub(r"\b(?:nhi|nai)\b", "nahi", text)
    return text


# -----------------------------
# Normalize Labels
# -----------------------------
def normalize_label(label):
    """Map a raw model label to "Positive", "Negative" or "Neutral".

    The check is a case-insensitive substring match on "pos" / "neg"; any
    label containing neither is reported as "Neutral".
    """
    label = label.lower()
    if "pos" in label:
        return "Positive"
    if "neg" in label:
        return "Negative"
    return "Neutral"


# -----------------------------
# Polarity Explanation
# -----------------------------
def polarity_explanation(text, sentiment):
    """Return a fixed one-line explanation for *sentiment*.

    *text* is accepted for interface compatibility but not inspected.
    Unknown sentiments yield an empty string.
    """
    explanations = {
        "Positive": "Contains praise words or positive evaluation.",
        "Negative": "Contains criticism or negative expressions.",
        "Neutral": "Factual statement or balanced observation.",
    }
    return explanations.get(sentiment, "")


# -----------------------------
# Ensemble Roman Urdu + Urdu
# -----------------------------
def ensemble_roman_urdu(text):
    """Classify *text* with the Roman Urdu and Urdu models and pick one result.

    If both models agree on the normalized label, the higher-scoring result
    is returned. If they disagree, the Roman Urdu score is weighted by 1.25
    before comparison. Returns the chosen pipeline result dict.
    """
    ru = roman_urdu_model(text)[0]
    if urdu_model is roman_urdu_model:
        # Both names refer to the same pipeline, so a second pass would only
        # repeat the same model on the same input.
        return ru
    ur = urdu_model(text)[0]

    ru_sent = normalize_label(ru["label"])
    ur_sent = normalize_label(ur["label"])
    if ru_sent == ur_sent:
        return ru if ru["score"] >= ur["score"] else ur

    # Weight Roman Urdu higher for Roman Urdu input
    weight_ru = ru["score"] * 1.25
    weight_ur = ur["score"]
    return ru if weight_ru >= weight_ur else ur


# -----------------------------
# Adjust sentiment if low intensity
# -----------------------------
def adjust_for_neutral(text, sentiment, score, threshold=NEUTRAL_THRESHOLD):
    """Downgrade a low-confidence Positive/Negative prediction to Neutral.

    *text* is accepted for interface compatibility but not inspected.
    Returns a ``(sentiment, score)`` tuple; the score is never modified.
    """
    if sentiment in ("Positive", "Negative") and score < threshold:
        return "Neutral", score
    return sentiment, score


# -----------------------------
# Main Analysis Function
# -----------------------------
def analyze_sentiment(text, lang_hint):
    """Analyze *text*, append the result to the CSV log, and return UI values.

    *lang_hint* is one of the dropdown choices; "Auto Detect" defers to
    detect_language(). Returns ``(sentiment, score_str, explanation,
    log_file_path)``. Blank input returns a warning message in place of the
    sentiment and logs nothing.
    """
    if not text or not text.strip():
        return "⚠️ Please enter a sentence.", "", "", SAVE_FILE

    lang = lang_hint if lang_hint != "Auto Detect" else detect_language(text)

    if lang == "English":
        result = english_model(text)[0]
    elif lang == "Urdu":
        result = urdu_model(text)[0]
    else:
        # Roman Urdu: the normalized form is what gets classified and logged.
        text = normalize_roman_urdu(text)
        result = ensemble_roman_urdu(text)

    sentiment = normalize_label(result["label"])
    score = round(float(result["score"]), 3)
    sentiment, score = adjust_for_neutral(text, sentiment, score)
    explanation = polarity_explanation(text, sentiment)

    # Save logs
    with FileLock(LOCK_FILE):
        df = _read_logs()
        new_row = pd.DataFrame(
            [[text, lang, sentiment, score]], columns=LOG_COLUMNS
        )
        # Concatenating with an empty frame is deprecated in recent pandas.
        if df.empty:
            df = new_row
        else:
            df = pd.concat([df, new_row], ignore_index=True)
        df.to_csv(SAVE_FILE, index=False, encoding="utf-8-sig")

    return sentiment, str(score), explanation, SAVE_FILE


# -----------------------------
# Show Logs
# -----------------------------
def show_logs():
    """Return the saved sentiment log as a DataFrame (empty if none exists)."""
    # Take the same lock as the writer so a rewrite in progress is not read.
    with FileLock(LOCK_FILE):
        return _read_logs()


# -----------------------------
# Gradio UI
# -----------------------------
with gr.Blocks() as demo:
    gr.Markdown(
        "## 🌍 Multilingual Sentiment Analysis (English • Urdu • Roman Urdu)\n"
        "Detect **Positive**, **Negative**, or **Neutral** tone with confidence score.\n\n"
        "🪶 Improved Roman Urdu normalization + ensemble + polarity explanation.\n"
    )

    with gr.Row():
        with gr.Column():
            user_text = gr.Textbox(
                label="✍️ Enter text",
                placeholder="Type English, Urdu, or Roman Urdu...",
            )
            lang_dropdown = gr.Dropdown(
                ["Auto Detect", "English", "Urdu", "Roman Urdu"],
                value="Auto Detect",
                label="🌐 Language",
            )
            btn_analyze = gr.Button("🔍 Analyze Sentiment")
            btn_show = gr.Button("📂 Show Saved Logs")

        with gr.Column():
            out_sent = gr.Textbox(label="Sentiment")
            out_conf = gr.Textbox(label="Confidence (0–1)")
            out_exp = gr.Textbox(label="Polarity Explanation")
            out_file = gr.File(label="⬇️ Download Logs (.csv)", type="filepath")

    logs_df = gr.Dataframe(
        headers=["Sentence", "Language", "Sentiment", "Confidence"],
        label="🧾 Sentiment Logs",
        interactive=False,
    )

    btn_analyze.click(
        analyze_sentiment,
        inputs=[user_text, lang_dropdown],
        outputs=[out_sent, out_conf, out_exp, out_file],
    )
    btn_show.click(show_logs, outputs=[logs_df])

if __name__ == "__main__":
    demo.launch()