File size: 13,518 Bytes
4f4e064 f536a8a 63f1c24 b97cadb 63f1c24 4f4e064 63f1c24 93683f0 4f4e064 63f1c24 93683f0 f536a8a 93683f0 f536a8a 93683f0 63f1c24 4f4e064 b97cadb 725d97d b97cadb 725d97d 93683f0 b97cadb 725d97d b97cadb 725d97d b97cadb 725d97d 63f1c24 725d97d 63f1c24 725d97d 93683f0 63f1c24 725d97d 63f1c24 725d97d 63f1c24 725d97d 63f1c24 725d97d 63f1c24 93683f0 63f1c24 93683f0 b97cadb 93683f0 b97cadb 63f1c24 b97cadb 93683f0 b97cadb 725d97d 93683f0 725d97d 93683f0 725d97d 93683f0 725d97d 93683f0 964322b 63f1c24 725d97d b97cadb 725d97d 93683f0 725d97d b97cadb 93683f0 725d97d 63f1c24 725d97d 93683f0 725d97d b97cadb 93683f0 b97cadb f536a8a 63f1c24 f536a8a 93683f0 f536a8a 63f1c24 f536a8a 725d97d b97cadb 63f1c24 a0cbc8f 725d97d 63f1c24 725d97d b97cadb 63f1c24 725d97d 63f1c24 a0cbc8f 63f1c24 725d97d f536a8a 93683f0 f536a8a 93683f0 f536a8a 63f1c24 b97cadb 725d97d 63f1c24 725d97d 63f1c24 725d97d 63f1c24 725d97d f536a8a 63f1c24 725d97d 4f4e064 f536a8a 4f4e064 f536a8a 63f1c24 f536a8a 63f1c24 f536a8a 63f1c24 b97cadb 63f1c24 f536a8a b97cadb f536a8a b97cadb f536a8a b97cadb 725d97d 93683f0 4f4e064 0615894 4f4e064 f536a8a 4f4e064 f536a8a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 |
import gradio as gr
import torch
from transformers import pipeline
import requests
import re
import os
from huggingface_hub import login
# Log in to the Hugging Face Hub, but only when a token has been supplied
# through the HF_TOKEN environment variable.
if os.environ.get("HF_TOKEN") is not None:
    login(token=os.environ["HF_TOKEN"])
# Module-level cache for the Atlas-Chat pipeline; stays None until first use.
atlas_pipe = None


def load_atlas_model():
    """Return the shared Atlas-Chat pipeline, building it on the first call.

    The pipeline is stored in the module-level ``atlas_pipe`` so later calls
    reuse the same object instead of loading the model again.
    """
    global atlas_pipe

    # Already built by an earlier call: hand back the cached pipeline.
    if atlas_pipe is not None:
        return atlas_pipe

    print("🏔️ Loading Atlas-Chat-2B model...")
    target_device = "cuda" if torch.cuda.is_available() else "cpu"
    atlas_pipe = pipeline(
        "text-generation",
        model="MBZUAI-Paris/Atlas-Chat-2B",
        model_kwargs={"torch_dtype": torch.bfloat16},
        device=target_device,
    )
    print("✅ Atlas-Chat model loaded!")
    return atlas_pipe
# Any character from these Arabic Unicode blocks means the text is written in
# Arabic script and therefore is not Arabizi.
_ARABIC_SCRIPT_RE = re.compile(
    r'[\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF\uFB50-\uFDFF\uFE70-\uFEFF]'
)

# A digit only counts as an Arabizi "letter" when it is glued to a Latin
# letter (e.g. "n9der", "3lach", "mr7ba"). A free-standing number such as the
# "2" in "I have 2 cats" is an ordinary quantity and must not trigger.
_ARABIZI_DIGIT_RE = re.compile(r'[a-z][2356789]|[2356789][a-z]')

# Common Darija words and phrases as usually spelled in Latin script.
_ARABIZI_WORDS = (
    'wach', 'wash', 'ach', 'achno', 'chno', 'shno', 'shkoun', 'chkoun',
    'kif', 'kifash', 'ki', 'kayf', 'kien', 'kima',
    'feen', 'fin', 'fen', 'fain', 'mnin',
    'imta', 'meta', 'waqt', 'mata', 'emta',
    'hna', 'ahna', 'ana', 'nta', 'nti', 'ntuma', 'ntouma',
    'howa', 'hiya', 'huma', 'houma', 'hoa', 'hia',
    'had', 'hadchi', 'hada', 'hadi', 'hadou', 'hadouk',
    'bghit', 'bghiti', 'bgha', 'bghina', 'bghitiou',
    'galt', 'galti', 'gal', 'galet', 'galou',
    'rah', 'raha', 'rahi', 'rahom', 'rahin',
    'kan', 'kanu', 'kana', 'kanet', 'kano',
    'ghadi', 'ghad', 'gha', 'ghadia', 'ghadiyin',
    'daba', 'dak', 'dakchi', 'dik', 'dok',
    'bzf', 'bzzaf', 'bezzaf', 'bzaaaaf',
    'chway', 'chwiya', 'shwiya', 'chwia',
    'khoya', 'khuya', 'akhi', 'kho',
    'khti', 'khtiya', 'ukhti', 'kht',
    'mama', 'baba', 'lwaldin', 'lwalidin',
    'salam', 'salamu aleikum', 'slm',
    'yallah', 'yalla', 'hya', 'aji',
    'mabghitsh', 'mabghach', 'makansh', 'machi',
    'walakin', 'walaken', 'ama', 'mais',
    'kayn', 'makaynsh', 'chi', 'tayi',
)

# Whole-word matcher for the vocabulary above. Matching on word boundaries
# (instead of raw substrings) stops short entries such as 'ach', 'chi' or 'ki'
# from firing inside unrelated words like "teach", "children" or "kind".
_ARABIZI_WORD_RE = re.compile(
    r'\b(?:' + '|'.join(re.escape(word) for word in _ARABIZI_WORDS) + r')\b'
)


def detect_arabizi(text):
    """Heuristically decide whether *text* is Arabizi.

    Arabizi here means Moroccan Arabic typed with Latin letters, often using
    digits such as 3, 7 or 9 in place of Arabic letters.

    Args:
        text: The user message. ``None`` and empty strings are accepted.

    Returns:
        True when the text looks like Arabizi, otherwise False. Text that is
        empty, shorter than two non-blank characters, or that contains any
        Arabic-script character is never reported as Arabizi.

    Note:
        This is a heuristic. Vocabulary entries that are also ordinary words
        in other languages (for example 'had' or 'mais') can still produce
        false positives.
    """
    if not text or len(text.strip()) < 2:
        return False

    # Arabic script present: by definition not Arabizi.
    if _ARABIC_SCRIPT_RE.search(text):
        return False

    text_lower = text.lower()
    has_arabizi_numbers = _ARABIZI_DIGIT_RE.search(text_lower) is not None
    has_arabizi_words = _ARABIZI_WORD_RE.search(text_lower) is not None

    # Both signals together are conclusive.
    if has_arabizi_numbers and has_arabizi_words:
        return True

    # A single signal is accepted only when the text is mostly letters; the
    # vocabulary-only case uses the stricter threshold.
    alpha_count = sum(1 for char in text if char.isalpha())
    if has_arabizi_numbers and alpha_count > len(text) * 0.6:
        return True
    if has_arabizi_words and alpha_count > len(text) * 0.7:
        return True
    return False
def _extract_transliteration(result):
    """Pull the generated text out of a decoded Inference API JSON payload.

    Accepts a dict, a bare string, or a non-empty list whose first item is
    one of those. Returns the stripped text, or None when no text is found.
    """
    if isinstance(result, list) and result:
        result = result[0]
    if isinstance(result, str):
        return result.strip()
    if isinstance(result, dict):
        # "generated_text" is what the original code expected;
        # "translation_text" is accepted too because translation-style
        # tasks on the Inference API report their output under that key.
        for key in ("generated_text", "translation_text"):
            value = result.get(key)
            if isinstance(value, str):
                return value.strip()
    return None


def arabizi_to_arabic_api(arabizi_text):
    """Convert Arabizi text to Arabic script via the Hugging Face Inference API.

    This is best-effort: on any failure the input is returned unchanged, so
    the caller always receives usable text. Failures include a missing
    HF_TOKEN, a timeout, a non-200 status, an unparsable or unexpected
    payload, and an empty result.

    Args:
        arabizi_text: Latin-script text to transliterate.

    Returns:
        The transliterated text, or ``arabizi_text`` itself on failure.
    """
    # Without a token the request cannot be authorised; skip the round trip.
    if "HF_TOKEN" not in os.environ:
        print("❌ HF_TOKEN not found, falling back to original text")
        return arabizi_text

    api_url = "https://api-inference.huggingface.co/models/atlasia/Transliteration-Moroccan-Darija"
    headers = {"Authorization": f"Bearer {os.environ['HF_TOKEN']}"}
    payload = {
        "inputs": arabizi_text,
        "parameters": {
            "max_length": 512,
            "num_beams": 4,
            "early_stopping": True,
        },
    }

    try:
        # The timeout keeps a stalled API call from hanging the chat handler.
        response = requests.post(api_url, headers=headers, json=payload, timeout=30)

        if response.status_code == 503:
            print("⏳ Model is loading, falling back to original text")
            return arabizi_text
        if response.status_code != 200:
            print(f"❌ API error {response.status_code}: {response.text}")
            return arabizi_text

        result = response.json()
        converted = _extract_transliteration(result)
        if converted is None:
            print(f"❌ Unexpected API response format: {result}")
            return arabizi_text
        if not converted:
            # An empty generation would leave the chat model with no input.
            print("❌ API returned empty text, falling back to original text")
            return arabizi_text
        return converted
    except requests.exceptions.Timeout:
        print("⏰ API timeout, falling back to original text")
        return arabizi_text
    except requests.exceptions.RequestException as e:
        print(f"❌ API request failed: {e}")
        return arabizi_text
    except Exception as e:
        # Deliberate catch-all: conversion is optional and must never raise.
        print(f"❌ Unexpected error in API conversion: {e}")
        return arabizi_text
# Whole-word overrides (Arabic -> Arabizi), applied before the per-character
# fallback. Each key appears once: the original table listed 'هيا' and
# 'ماكاينش' twice, and only the later values ('hya', 'makaynsh') ever took
# effect, so those are the ones kept here.
_ARABIC_WORD_MAP = {
    'أنا': 'ana', 'نتا': 'nta', 'نتي': 'nti', 'هوا': 'howa',
    'حنا': 'hna', 'أحنا': 'ahna', 'نتوما': 'ntuma', 'هوما': 'huma',
    'شكون': 'shkoun', 'أشنو': 'achno', 'شنو': 'chno', 'واش': 'wach',
    'كيفاش': 'kifash', 'كيف': 'kif', 'فين': 'feen', 'منين': 'mnin',
    'إمتا': 'imta', 'متا': 'meta', 'علاش': '3lach', 'أش': 'ach',
    'بغيت': 'bghit', 'بغيتي': 'bghiti', 'بغا': 'bgha', 'بغينا': 'bghina',
    'كان': 'kan', 'كانا': 'kana', 'كانت': 'kanet', 'كانو': 'kanu',
    'قلت': 'galt', 'قلتي': 'galti', 'قال': 'gal', 'قالت': 'galet',
    'راح': 'rah', 'راها': 'raha', 'راهي': 'rahi', 'راهم': 'rahom',
    'غادي': 'ghadi', 'غاد': 'ghad', 'غا': 'gha',
    'هاد': 'had', 'هادا': 'hada', 'هادي': 'hadi', 'هادشي': 'hadchi',
    'داك': 'dak', 'ديك': 'dik', 'داكشي': 'dakchi',
    'بزاف': 'bzzaf', 'شوياة': 'chwiya', 'كولشي': 'kolchi',
    'ماشي': 'machi', 'مابغيتش': 'mabghitsh',
    'دابا': 'daba', 'توا': 'tawa', 'غدا': 'ghda',
    'ماما': 'mama', 'بابا': 'baba', 'خويا': 'khoya', 'ختي': 'khti',
    'سلام': 'salam', 'يالاه': 'yallah', 'هيا': 'hya',
    'المغرب': 'lmaghrib', 'مغرب': 'maghrib',
    'طاجين': 'tajine', 'أتاي': 'atay', 'خوبز': 'khobz',
    'كاين': 'kayn', 'ماكاينش': 'makaynsh', 'شي': 'chi',
    'زوين': 'zwin', 'زوينا': 'zwina', 'مزيان': 'mzyan', 'مزيانا': 'mzyana',
    'كاينين': 'kaynin', 'مطعم': 'ma63am', 'مطاعم': 'ma6a3im',
    'مشهور': 'mashhur', 'مشهورين': 'mashhurin', 'وسط': 'wost',
    'المدينة': 'lmdina', 'مدينة': 'mdina', 'إيطالي': 'italiy',
    'ياباني': 'yabani', 'مغربي': 'maghribi', 'فرنسي': 'fransi',
    'أمريكي': 'amriki', 'صيني': 'sini', 'هندي': 'hindi',
    'لحم': 'la7m', 'دجاج': 'djaj', 'حوت': '7ut', 'خضرة': 'khodra',
    'فواكه': 'fawakeh', 'جبن': 'jben', 'زبدة': 'zebda', 'حليب': '7lib',
    'قهوة': 'qahwa', 'شاي': 'atay', 'ماء': 'ma', 'عصير': '3asir',
    'خبز': 'khobz', 'رز': 'roz', 'مكرونة': 'makarona', 'بطاطا': 'batata',
    'طماطم': 'toma6im', 'بصل': 'basal', 'ثوم': 'tum', 'فلفل': 'felfel',
    'ملح': 'mel7', 'سكر': 'sokkar', 'زيت': 'zit', 'خل': 'khall',
}

# One alternation over every mapped word, anchored on word boundaries so a
# short entry never rewrites part of a longer word. Longest-first ordering
# makes the "most specific first" intent explicit.
_ARABIC_WORD_RE = re.compile(
    r'\b(?:'
    + '|'.join(
        re.escape(word)
        for word in sorted(_ARABIC_WORD_MAP, key=len, reverse=True)
    )
    + r')\b'
)

# Per-character fallback (Arabic -> Arabizi). Hamza forms, diacritics and
# other easily-confused code points are written as escapes so each key is
# guaranteed to be a single character, as str.maketrans requires.
_ARABIC_CHAR_TABLE = str.maketrans({
    'ا': 'a', 'ب': 'b', 'ت': 't', 'ث': 'th', 'ج': 'j', 'ح': '7',
    'خ': 'kh', 'د': 'd', 'ذ': 'dh', 'ر': 'r', 'ز': 'z', 'س': 's',
    'ش': 'sh', 'ص': 's', 'ض': 'd', 'ط': '6', 'ظ': 'z', 'ع': '3',
    'غ': 'gh', 'ف': 'f', 'ق': '9', 'ك': 'k', 'ل': 'l', 'م': 'm',
    'ن': 'n', 'ه': 'h', 'و': 'w', 'ي': 'y',
    '\u0621': '2',   # hamza
    '\u0622': 'aa',  # alef with madda
    '\u0623': 'a',   # alef with hamza above
    '\u0625': 'i',   # alef with hamza below
    '\u0629': 'a',   # teh marbuta
    '\u0649': 'a',   # alef maksura
    '\u061f': '?',   # Arabic question mark
    '\u060c': ',',   # Arabic comma
    '\u061b': ';',   # Arabic semicolon
    '\u064e': 'a',   # fatha
    '\u064f': 'o',   # damma
    '\u0650': 'i',   # kasra
    '\u064b': 'an',  # fathatan
    '\u064c': 'on',  # dammatan
    '\u064d': 'in',  # kasratan
    # Characters the original table missed, which therefore leaked into the
    # Latin output untouched:
    '\u0624': '2',   # waw with hamza above
    '\u0626': '2',   # yeh with hamza above
    '\u06ad': 'g',   # Moroccan gaf (kaf with three dots)
    '\u06af': 'g',   # gaf
    '\u067e': 'p',   # peh
    '\u06a4': 'v',   # veh
    '\u0651': '',    # shadda (gemination mark) - dropped
    '\u0652': '',    # sukun - dropped
    '\u0640': '',    # tatweel (elongation filler) - dropped
})


def arabic_to_arabizi(arabic_text):
    """Transliterate Arabic-script text into Arabizi with fixed lookup tables.

    Known whole words are replaced first using the word table; every
    remaining Arabic character is then mapped individually. Characters in
    neither table, including Latin text, pass through unchanged.

    Args:
        arabic_text: Text to convert. Falsy values (``None``, ``""``) are
            returned as-is.

    Returns:
        The transliterated text with surrounding whitespace stripped.
    """
    if not arabic_text:
        return arabic_text

    # Step 1: whole-word overrides, in a single regex pass.
    with_words = _ARABIC_WORD_RE.sub(
        lambda match: _ARABIC_WORD_MAP[match.group(0)], arabic_text
    )
    # Step 2: per-character fallback for everything still in Arabic script.
    return with_words.translate(_ARABIC_CHAR_TABLE).strip()
def _preview(text, limit=100):
    """Return *text* cut to *limit* characters, with '...' appended if cut."""
    return text[:limit] + ("..." if len(text) > limit else "")


def chat_with_atlas(message, history):
    """Answer one chat turn with Atlas-Chat, round-tripping Arabizi input.

    Arabizi messages are transliterated to Arabic script before generation
    and the reply is transliterated back; all other messages go to the model
    unchanged.

    Args:
        message: The user's message. Blank or ``None`` yields a greeting.
        history: Earlier turns passed in by the chat UI. Currently unused:
            every message is answered on its own.

    Returns:
        The reply text, or a user-facing error message if anything fails.
    """
    if not message or not message.strip():
        return "ahlan wa sahlan! kifash n9der n3awnek? / مرحبا! كيفاش نقدر نعاونك؟"

    # Computed once and reused by the error handler. Calling detect_arabizi
    # again from inside ``except`` could itself raise and escape unhandled.
    is_arabizi_input = False
    try:
        # Detect before loading the model so that a load failure is still
        # reported in the user's script.
        is_arabizi_input = detect_arabizi(message)
        atlas_model = load_atlas_model()

        print("\n" + "="*50)
        print("🔍 ATLAS-CHAT DEBUG LOG")
        print("="*50)
        print(f"📥 INPUT: '{message}'")
        print(f"🔍 ARABIZI: {is_arabizi_input}")

        if is_arabizi_input:
            print("🔄 Converting Arabizi→Arabic via API...")
            arabic_input = arabizi_to_arabic_api(message)
            if not arabic_input:
                # Never hand the model None or an empty prompt; use the raw
                # message when transliteration produced nothing.
                arabic_input = message
            print(f"✅ ARABIC: '{arabic_input}'")
            model_input = arabic_input
        else:
            print("➡️ No conversion needed")
            model_input = message

        print("🤖 Sending to Atlas-Chat...")
        messages = [{"role": "user", "content": model_input}]
        outputs = atlas_model(
            messages,
            max_new_tokens=256,
            temperature=0.1,
            do_sample=True,
            pad_token_id=atlas_model.tokenizer.eos_token_id,
        )

        # The last entry of the returned conversation is the model's reply.
        response = outputs[0]["generated_text"][-1]["content"].strip()
        print(f"✅ RESPONSE: '{_preview(response)}'")

        if not is_arabizi_input:
            print("="*50 + "\n")
            return response

        # Mirror the user's script: reply in Arabizi to Arabizi input.
        print("🔄 Converting Arabic→Arabizi...")
        arabizi_response = arabic_to_arabizi(response)
        print(f"✅ FINAL: '{_preview(arabizi_response)}'")
        print("="*50 + "\n")
        return arabizi_response
    except Exception as e:
        # Top-level boundary for the UI callback: log, then return a
        # readable message instead of raising.
        print(f"\n❌ ERROR: {str(e)}")
        print("="*50 + "\n")
        if is_arabizi_input:
            return f"sorry, kan chi mochkil: {str(e)}. 3awd jar'b!"
        return f"عذراً، واجهت خطأ: {str(e)}. جرب مرة أخرى! / Sorry, error occurred: {str(e)}. Try again!"
# Build the Gradio chat UI around chat_with_atlas. The description is
# Markdown assembled line by line; the leading and trailing empty entries
# reproduce the newline that opens and closes the text.
demo = gr.ChatInterface(
    fn=chat_with_atlas,
    cache_examples=False,
    title="🏔️ Atlas-Chat: AI-Powered Moroccan Arabic Assistant",
    description="\n".join([
        "",
        "**مرحبا بك في أطلس شات!** Welcome to Atlas-Chat! 🇲🇦",
        "**🚀 Powered by Hugging Face Inference API:**",
        "- **Arabic Script (العربية)** → Direct conversation",
        "- **Arabizi (3arabi bi 7oruf latin)** → API conversion → Arabizi response",
        "- **English** → Direct conversation",
        "**⚡ Features:**",
        "- Professional AI Arabizi conversion via API",
        "- No local model conflicts",
        "- Fast and reliable responses",
        "- Comprehensive language detection",
        "**جرب هذه الأسئلة / Try these questions:**",
        "",
    ]),
    # Sample prompts covering Arabic script, Arabizi and English.
    examples=[
        "شكون لي صنعك؟",
        "shkoun li sna3ek?",
        "اشنو هو الطاجين؟",
        "achno howa tajine?",
        "شنو كيتسمى المنتخب المغربي؟",
        "chno kaytsma lmontakhab lmaghribi?",
        "What is Morocco famous for?",
        "كيفاش نقدر نتعلم الدارجة؟",
        "kifash n9der nt3elem darija?",
        "wach kayn atay f lmaghrib?",
        "3lach lmaghrib zwien bzzaf?",
        "kifash nsali tajine?",
        "chno homa l2aklat lmaghribiya?",
        "kayn chi restaurants zwinin f casa?",
        "mr7ba! kif dayr?",
    ],
)
# Start the Gradio server only when this file is executed as a script, so
# importing the module does not launch the app.
if __name__ == "__main__":
    demo.launch()