tahamueed23 committed on
Commit 0780c88 · verified · 1 Parent(s): 6974eeb

Update app.py

Files changed (1)
  1. app.py +72 -244
app.py CHANGED
@@ -13,18 +13,18 @@ english_model = pipeline(
     model="siebert/sentiment-roberta-large-english"
 )
 
-# same model but we'll ensemble results for Roman+Urdu
 urdu_model = pipeline(
     "sentiment-analysis",
     model="tahamueed23/fine_tuned_cardiffnlp_urdu_and_roman-urdu"
 )
+
 roman_urdu_model = pipeline(
     "sentiment-analysis",
     model="tahamueed23/fine_tuned_cardiffnlp_urdu_and_roman-urdu"
 )
 
 # -----------------------------
-# CSV Setup (persistent)
+# CSV Setup
 # -----------------------------
 SAVE_FILE = "sentiment_logs.csv"
 LOCK_FILE = SAVE_FILE + ".lock"
@@ -37,200 +37,34 @@ if not os.path.exists(SAVE_FILE):
 # -----------------------------
 # Improved Language Detection
 # -----------------------------
-roman_urdu_keywords = {
-    # General Feedback Tone
-    "acha", "bohot_acha", "bhot_acha", "bahut_acha", "bura", "theek", "behtareen", "zabardast", "umda", "ghalit", "galat",
-    "accha", "awesome", "perfect", "kamzor", "behtar", "sahi", "ghalat", "faida", "nuksan",
-
-    # Study / Performance / Behavior
-    "parhai", "parhayi", "parhta", "parhti", "parhne", "parho", "assignment", "homework", "test", "imtihaan", "grade",
-    "result", "mehnat", "kaam", "performance", "focus", "dhyaan", "attendance", "class", "lecture",
-    "samajh", "samajhna", "samjhaya", "samajh_aya", "nahi_samajh_aya", "barhta", "seekhna", "seekh", "seekh_rha", "seekh_rhi",
-    "kaam_chor", "mehnati", "active", "lazy", "shararti", "tawajjo", "discipline", "behavior",
-
-    # Teacher / Student Relationship
-    "ustad", "teacher", "sir", "madam", "miss", "meray_ustad", "respect", "izzat", "ikhtiyar",
-    "mohabbat", "pyar", "taluq", "taaluq", "thoda", "ziyada", "kam", "bohot", "acha_sulook",
-
-    # Feedback Expressions
-    "kyun", "kese", "kaisa", "kaisi", "kyu", "hain", "hai", "tha", "thi", "the", "hoga", "hogaya", "hogi",
-    "karna", "karta", "kartay", "karti", "karne", "kerna", "hoza", "hona", "hota", "hotay", "hoti", "hona_chahiye",
-    "try", "koshish", "koshish_karna", "lagataar", "barhawa", "improve", "improvement", "masla", "problem", "issue",
-
-    # Emotion / Reaction Words
-    "khushi", "dukh", "tension", "fikr", "relax", "comfortable", "confidence", "yaqeen", "jazba", "motivation",
-    "interest", "boriyat", "thakan", "ghussa", "naraz", "khush", "preshan", "shukriya",
-
-    # School / Class Words
-    "school", "college", "university", "classroom", "class_fellow", "principal", "registration", "semester", "assignment_submit",
-    "presentation", "group_work", "project", "notebook", "copy", "kitab", "pencil", "pen", "bag",
-
-    # Time / Experience
-    "aaj", "kal", "kal_tak", "pehle", "baad_mein", "hamesha", "roz", "rozana", "abi", "abhi", "der", "jaldi",
-    "guzra", "raftar", "barh_gayi", "kam_hogi",
-
-    # Misc useful connectors
-    "mera", "meri", "mere", "tera", "teri", "tum", "aap", "hum", "wo", "yahan", "wahan", "ka", "ki", "ke",
-    "se", "tak", "par", "liye", "bhi", "magar", "lekin", "aur"
-}
-
-
 def detect_language(text):
-    urdu_chars = set("ابتثجحخدذرزسشصضطظعغفقکلمنوہیءآؤئۀ")
-    clean = re.sub(r"[^A-Za-z\u0600-\u06FF]+", " ", text)
-    # rule 1: actual Urdu characters
-    if any(ch in urdu_chars for ch in clean):
+    urdu_script = re.compile(r"[\u0600-\u06FF]")
+    if urdu_script.search(text):
         return "Urdu"
 
-    # rule 2: roman urdu keyword ratio
-    tokens = clean.lower().split()
-    roman_hits = sum(w in roman_urdu_keywords for w in tokens)
-    if roman_hits / max(len(tokens), 1) > 0.2 or roman_hits > 0:
-        return "Roman Urdu"
+    roman_urdu_patterns = [
+        r"\b(hai|hain|tha|thi|parhta|parhai|acha|bura|bohot|zabardast)\b",
+        r"\b(sir|madam|ustad|class|parh|samajh)\b",
+    ]
+
+    text_l = text.lower()
+    for p in roman_urdu_patterns:
+        if re.search(p, text_l):
+            return "Roman Urdu"
 
     return "English"
 
 # -----------------------------
 # Roman Urdu Normalization
 # -----------------------------
-
 def normalize_roman_urdu(text):
-    replacements = {
-        # Common "acha/bura" & sentiment phrases
-        "acha ni": "acha nahi",
-        "acha nai": "acha nahi",
-        "acha hy": "acha hai",
-        "acha h": "acha hai",
-        "accha hy": "acha hai",
-        "achha hy": "acha hai",
-        "bura hy": "bura hai",
-        "bura h": "bura hai",
-        "bohot acha": "bohot acha",
-        "bohat acha": "bohot acha",
-        "boht acha": "bohot acha",
-        "zabrdast": "zabardast",
-        "zabardst": "zabardast",
-        "thek": "theek",
-        "thik": "theek",
-
-        # Negation variations
-        "ni": "nahi",
-        "nai": "nahi",
-        "nehi": "nahi",
-        "nahe": "nahi",
-        "nae": "nahi",
-        "nhe": "nahi",
-        "nhi": "nahi",
-
-        # Auxiliary verbs
-        "hy": "hai",
-        "h": "hai",
-        "haii": "hai",
-        "ha": "hai",
-        "hh": "hai",
-        "hu": "hu",
-        "hun": "hoon",
-        "hn": "hain",
-        "hainn": "hain",
-        "hyn": "hain",
-
-        # Pronoun & possessive normalizations
-        "mera": "mera",
-        "meri": "meri",
-        "mere": "mere",
-        "tera": "tera",
-        "teri": "teri",
-        "tumhara": "tumhara",
-        "apna": "apna",
-        "aapka": "aapka",
-
-        # Common teacher/student terms
-        "ustad": "ustad",
-        "ustaad": "ustad",
-        "ostad": "ustad",
-        "ostaad": "ustad",
-        "teacher": "teacher",
-        "sir": "sir",
-        "madam": "madam",
-        "miss": "madam",
-        "student": "student",
-        "talib e ilm": "talib_e_ilm",
-
-        # Study/learning phrases
-        "parhai": "parhai",
-        "parhayi": "parhai",
-        "parhne": "parhne",
-        "parhta": "parhta",
-        "parhti": "parhti",
-        "parhny": "parhne",
-        "parho": "parho",
-        "seekhta": "seekhta",
-        "seekhti": "seekhti",
-        "seekh rha": "seekh raha",
-        "seekh rhi": "seekh rahi",
-
-        # Effort/performance
-        "mehnat kr": "mehnat kar",
-        "mehnat kro": "mehnat karo",
-        "mehnat karna": "mehnat karna",
-        "kaam kr": "kaam kar",
-        "kaam kro": "kaam karo",
-        "koshish kr": "koshish kar",
-        "koshish kro": "koshish karo",
-        "improve kr": "improve kar",
-        "improve kro": "improve karo",
-
-        # Time/experience
-        "aj": "aaj",
-        "kal": "kal",
-        "kl": "kal",
-        "pehly": "pehle",
-        "bad me": "baad mein",
-        "abhi tk": "abhi tak",
-
-        # Common expressions
-        "shukriya": "shukriya",
-        "thanks": "thanks",
-        "thanku": "thankyou",
-        "thanx": "thankyou",
-        "plz": "please",
-        "pls": "please",
-        "okey": "ok",
-        "okk": "ok",
-        "oky": "ok",
-
-        # Misheard or alternate forms
-        "acha lagta": "acha lagta",
-        "bura lagta": "bura lagta",
-        "samjh ni aya": "samajh nahi aya",
-        "samjh nai aya": "samajh nahi aya",
-        "samjh nh aya": "samajh nahi aya",
-        "smjh ni aya": "samajh nahi aya",
-        "smjh gya": "samajh gaya",
-        "smjh gayi": "samajh gayi",
-
-        # Short common fixes
-        "kr": "kar",
-        "kro": "karo",
-        "krta": "karta",
-        "krti": "karti",
-        "kra": "kara",
-        "kia": "kiya",
-        "kiya tha": "kiya tha",
-        "ki thi": "ki thi",
-        "krna": "karna",
-        "krne": "karne",
-        "krny": "karne",
-    }
-
-    for k, v in replacements.items():
-        text = re.sub(rf"\b{k}\b", v, text, flags=re.IGNORECASE)
-
+    text = text.lower()
+    text = text.replace("hy", "hai").replace("h", "hai")
+    text = re.sub(r"\bnhi\b|\bnai\b|\bnhi\b", "nahi", text)
     return text
 
-
 # -----------------------------
-# Label Normalization
+# Normalize Labels
 # -----------------------------
 def normalize_label(label):
     label = label.lower()
@@ -242,78 +76,73 @@ def normalize_label(label):
     return "Neutral"
 
 # -----------------------------
-# Add Emojis + Tips
+# Polarity Explanation
 # -----------------------------
-def sentiment_with_tips(sentiment):
-    tips = {
-        "Positive": "😊 Great! Keep spreading positivity.",
-        "Negative": "😞 Looks negative — maybe reflect and improve things.",
-        "Neutral": "😐 Neutral observation balanced view."
+def polarity_explanation(text, sentiment):
+    explanations = {
+        "Positive": "Contains praise words or positive evaluation.",
+        "Negative": "Contains criticism or negative expressions.",
+        "Neutral": "Factual statement or balanced observation."
     }
-    return tips.get(sentiment, "")
+    return explanations.get(sentiment, "")
 
 # -----------------------------
-# Neutral Adjuster (Urdu/Descriptive)
-# -----------------------------
-def adjust_for_neutral(text, sentiment, score):
-    neutral_triggers = ["ہورہی ہے", "ہو رہی ہے", "ہے", "tha", "thi"]
-    if sentiment != "Neutral" and any(p in text for p in neutral_triggers):
-        if score < 0.9:  # descriptive statements, low emotional intensity
-            return "Neutral", 0.7
-    return sentiment, score
-
-# -----------------------------
-# Combine Roman Urdu & Urdu Models (Ensemble)
+# Ensemble Roman Urdu + Urdu
 # -----------------------------
 def ensemble_roman_urdu(text):
     ru = roman_urdu_model(text)[0]
     ur = urdu_model(text)[0]
+
     ru_sent, ur_sent = normalize_label(ru["label"]), normalize_label(ur["label"])
+
     if ru_sent == ur_sent:
-        result = ru if ru["score"] >= ur["score"] else ur
-    else:
-        result = ru if ru["score"] * 0.9 >= ur["score"] else ur
-    return result
+        return ru if ru["score"] >= ur["score"] else ur
+
+    # Weight Roman Urdu higher for Roman Urdu input
+    weight_ru = ru["score"] * 1.25
+    weight_ur = ur["score"]
+    return ru if weight_ru >= weight_ur else ur
 
 # -----------------------------
-# Main Sentiment Function
+# Adjust sentiment if low intensity
 # -----------------------------
-def analyze_sentiment(text, lang_hint):
-    try:
-        if not text.strip():
-            return "⚠️ Please enter a sentence.", "", "", SAVE_FILE
+def adjust_for_neutral(text, sentiment, score):
+    if sentiment in ["Positive", "Negative"] and score < 0.7:
+        return "Neutral", score
+    return sentiment, score
 
-        # auto detect if needed
-        lang = lang_hint if lang_hint != "Auto Detect" else detect_language(text)
+# -----------------------------
+# Main Analysis Function
+# -----------------------------
+def analyze_sentiment(text, lang_hint):
+    if not text.strip():
+        return "⚠️ Please enter a sentence.", "", "", SAVE_FILE
 
-        # select & possibly normalize
-        if lang == "English":
-            result = english_model(text)[0]
-        elif lang == "Urdu":
-            result = urdu_model(text)[0]
-        else:  # Roman Urdu
-            text = normalize_roman_urdu(text)
-            result = ensemble_roman_urdu(text)
+    lang = lang_hint if lang_hint != "Auto Detect" else detect_language(text)
 
-        # get normalized sentiment
-        sentiment = normalize_label(result["label"])
-        score = round(float(result["score"]), 3)
-        sentiment, score = adjust_for_neutral(text, sentiment, score)
-        explanation = sentiment_with_tips(sentiment)
+    if lang == "English":
+        result = english_model(text)[0]
+    elif lang == "Urdu":
+        result = urdu_model(text)[0]
+    else:
+        text = normalize_roman_urdu(text)
+        result = ensemble_roman_urdu(text)
 
-        # store results (thread-safe)
-        with FileLock(LOCK_FILE):
-            df = pd.read_csv(SAVE_FILE, encoding="utf-8-sig") \
-                if os.path.exists(SAVE_FILE) else pd.DataFrame(columns=["Sentence", "Language", "Sentiment", "Confidence"])
-            new_row = pd.DataFrame([[text, lang, sentiment, score]],
-                                   columns=["Sentence", "Language", "Sentiment", "Confidence"])
-            df = pd.concat([df, new_row], ignore_index=True)
-            df.to_csv(SAVE_FILE, index=False, encoding="utf-8-sig")
+    sentiment = normalize_label(result["label"])
+    score = round(float(result["score"]), 3)
+    sentiment, score = adjust_for_neutral(text, sentiment, score)
+    explanation = polarity_explanation(text, sentiment)
 
-        return sentiment, str(score), explanation, SAVE_FILE
+    # Save logs
+    with FileLock(LOCK_FILE):
+        df = pd.read_csv(SAVE_FILE, encoding="utf-8-sig") \
+            if os.path.exists(SAVE_FILE) else pd.DataFrame(columns=["Sentence", "Language", "Sentiment", "Confidence"])
+        new_row = pd.DataFrame([[text, lang, sentiment, score]],
+                               columns=["Sentence", "Language", "Sentiment", "Confidence"])
+        df = pd.concat([df, new_row], ignore_index=True)
        df.to_csv(SAVE_FILE, index=False, encoding="utf-8-sig")
 
-    except Exception as e:
-        return f"⚠️ Error: {str(e)}", "", "", SAVE_FILE
+    return sentiment, str(score), explanation, SAVE_FILE
 
 # -----------------------------
 # Show Logs
@@ -331,13 +160,12 @@ with gr.Blocks() as demo:
     gr.Markdown(
         "## 🌍 Multilingual Sentiment Analysis (English • Urdu • Roman Urdu)\n"
         "Detect **Positive**, **Negative**, or **Neutral** tone with confidence score.\n\n"
-        "🪶 **Improvements:** refined Urdu/Roman Urdu detection, better Roman Urdu normalization, ensemble correction, and neutral balancing.\n\n"
-        "💾 All analyzed text is stored permanently in the same CSV, even across shared sessions."
+        "🪶 Improved Roman Urdu normalization + ensemble + polarity explanation.\n"
     )
 
     with gr.Row():
         with gr.Column():
-            user_text = gr.Textbox(label="✍️ Enter text", placeholder="Type in English, Urdu, or RomanUrdu...")
+            user_text = gr.Textbox(label="✍️ Enter text", placeholder="Type English, Urdu, or Roman Urdu...")
             lang_dropdown = gr.Dropdown(
                 ["Auto Detect", "English", "Urdu", "Roman Urdu"],
                 value="Auto Detect", label="🌐 Language"
@@ -347,13 +175,13 @@ with gr.Blocks() as demo:
 
         with gr.Column():
             out_sent = gr.Textbox(label="Sentiment")
-            out_conf = gr.Textbox(label="Confidence(0–1)")
-            out_exp = gr.Textbox(label="Explanation")
-            out_file = gr.File(label="⬇️ DownloadLogs(.csv)", type="filepath")
+            out_conf = gr.Textbox(label="Confidence (0–1)")
+            out_exp = gr.Textbox(label="Polarity Explanation")
+            out_file = gr.File(label="⬇️ Download Logs (.csv)", type="filepath")
 
     logs_df = gr.Dataframe(
         headers=["Sentence", "Language", "Sentiment", "Confidence"],
-        label="🧾SentimentLogs", interactive=False
+        label="🧾 Sentiment Logs", interactive=False
     )
 
     btn_analyze.click(analyze_sentiment,
@@ -363,4 +191,4 @@ with gr.Blocks() as demo:
     btn_show.click(show_logs, outputs=[logs_df])
 
 if __name__ == "__main__":
-    demo.launch()
+    demo.launch()
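
Below is a minimal, self-contained sketch (not part of the commit) for sanity-checking the new rule-based helpers without loading any models. It copies the committed detect_language and normalize_roman_urdu as-is, and adds a hypothetical word-boundary variant, since the blanket .replace("h", "hai") in the committed version also rewrites the "h" inside longer words:

import re

# Copied from the new app.py: Urdu script check first, then Roman Urdu keyword patterns.
def detect_language(text):
    urdu_script = re.compile(r"[\u0600-\u06FF]")
    if urdu_script.search(text):
        return "Urdu"
    roman_urdu_patterns = [
        r"\b(hai|hain|tha|thi|parhta|parhai|acha|bura|bohot|zabardast)\b",
        r"\b(sir|madam|ustad|class|parh|samajh)\b",
    ]
    text_l = text.lower()
    for p in roman_urdu_patterns:
        if re.search(p, text_l):
            return "Roman Urdu"
    return "English"

# Copied from the new app.py. The second replace rewrites every "h", even inside words
# ("acha" becomes "achaia"), which is probably broader than the normalization intends.
def normalize_roman_urdu(text):
    text = text.lower()
    text = text.replace("hy", "hai").replace("h", "hai")
    text = re.sub(r"\bnhi\b|\bnai\b|\bnhi\b", "nahi", text)
    return text

# Hypothetical word-boundary variant (NOT in the commit): only standalone tokens change.
def normalize_roman_urdu_boundary(text):
    text = text.lower()
    text = re.sub(r"\bhy\b|\bh\b", "hai", text)
    text = re.sub(r"\bnhi\b|\bnai\b", "nahi", text)
    return text

if __name__ == "__main__":
    sample = "Ustad ka parhai ka tareeqa acha hy"
    print(detect_language(sample))                # Roman Urdu
    print(normalize_roman_urdu(sample))           # extra "hai" fragments appear inside words
    print(normalize_roman_urdu_boundary(sample))  # ustad ka parhai ka tareeqa acha hai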