File size: 13,518 Bytes
4f4e064
f536a8a
63f1c24
 
b97cadb
63f1c24
 
4f4e064
63f1c24
 
 
 
 
93683f0
4f4e064
63f1c24
 
 
93683f0
f536a8a
93683f0
f536a8a
 
 
 
 
93683f0
63f1c24
4f4e064
b97cadb
 
 
 
 
 
 
 
 
 
 
 
 
725d97d
 
 
 
 
b97cadb
725d97d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
93683f0
 
b97cadb
 
725d97d
 
b97cadb
725d97d
 
 
 
 
 
 
b97cadb
725d97d
 
63f1c24
725d97d
63f1c24
725d97d
93683f0
63f1c24
 
 
 
725d97d
63f1c24
 
725d97d
63f1c24
 
 
 
 
 
 
 
 
725d97d
63f1c24
 
725d97d
63f1c24
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
93683f0
63f1c24
93683f0
b97cadb
93683f0
b97cadb
63f1c24
b97cadb
93683f0
 
b97cadb
725d97d
 
93683f0
725d97d
 
93683f0
 
 
725d97d
 
 
 
93683f0
 
 
 
 
 
725d97d
93683f0
 
 
 
964322b
63f1c24
 
 
 
 
 
 
 
 
 
 
725d97d
b97cadb
725d97d
 
 
 
 
 
 
93683f0
 
 
725d97d
b97cadb
93683f0
725d97d
63f1c24
725d97d
 
93683f0
725d97d
 
 
 
b97cadb
93683f0
b97cadb
f536a8a
63f1c24
f536a8a
93683f0
f536a8a
 
63f1c24
 
f536a8a
725d97d
 
b97cadb
63f1c24
 
 
 
 
a0cbc8f
725d97d
 
63f1c24
 
 
725d97d
b97cadb
63f1c24
725d97d
 
63f1c24
a0cbc8f
63f1c24
725d97d
f536a8a
93683f0
f536a8a
 
 
 
93683f0
f536a8a
 
 
 
63f1c24
b97cadb
725d97d
 
63f1c24
725d97d
63f1c24
 
725d97d
 
63f1c24
725d97d
f536a8a
 
63f1c24
 
725d97d
 
 
 
 
4f4e064
f536a8a
4f4e064
f536a8a
63f1c24
f536a8a
63f1c24
f536a8a
63f1c24
 
 
 
b97cadb
63f1c24
 
 
 
 
f536a8a
 
 
 
 
b97cadb
 
 
f536a8a
b97cadb
f536a8a
b97cadb
725d97d
 
 
 
93683f0
 
 
4f4e064
0615894
4f4e064
 
f536a8a
4f4e064
f536a8a
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
import gradio as gr
import torch
from transformers import pipeline
import requests
import re
import os
from huggingface_hub import login

# Authenticate with the Hugging Face Hub when a token is supplied through the
# HF_TOKEN environment variable; when the variable is absent the login step
# is skipped entirely.
if "HF_TOKEN" in os.environ:
    login(token=os.environ["HF_TOKEN"])

# Module-level cache for the Atlas-Chat pipeline. It starts as None and is
# assigned by load_atlas_model() the first time that function runs.
atlas_pipe = None

def load_atlas_model():
    """Return the shared Atlas-Chat pipeline, creating it on first use.

    The pipeline is stored in the module-level ``atlas_pipe`` so the model is
    only constructed once; later calls hand back the cached object. The model
    is placed on CUDA when ``torch.cuda.is_available()`` reports a GPU and on
    the CPU otherwise.
    """
    global atlas_pipe

    # Fast path: the pipeline has already been built by an earlier call.
    if atlas_pipe is not None:
        return atlas_pipe

    print("🏔️ Loading Atlas-Chat-2B model...")

    if torch.cuda.is_available():
        target_device = "cuda"
    else:
        target_device = "cpu"

    atlas_pipe = pipeline(
        "text-generation",
        model="MBZUAI-Paris/Atlas-Chat-2B",
        model_kwargs={"torch_dtype": torch.bfloat16},
        device=target_device,
    )

    print("✅ Atlas-Chat model loaded!")
    return atlas_pipe

# Any character from the Arabic Unicode blocks means the text is Arabic script.
_ARABIC_SCRIPT_RE = re.compile(
    r'[\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF\uFB50-\uFDFF\uFE70-\uFEFF]'
)

# Digits that Arabizi writers use in place of letters. A digit only counts
# when it is glued to a Latin letter ("mr7ba", "3lach", "n9der"); a
# free-standing number such as the 3 in "3 apples" is ordinary text.
_ARABIZI_DIGIT_IN_WORD_RE = re.compile(r'[a-z][2356789]|[2356789][a-z]')

# Common Darija words and phrases written in Latin script.
_ARABIZI_WORDS = (
    'wach', 'wash', 'ach', 'achno', 'chno', 'shno', 'shkoun', 'chkoun',
    'kif', 'kifash', 'ki', 'kayf', 'kien', 'kima',
    'feen', 'fin', 'fen', 'fain', 'mnin',
    'imta', 'meta', 'waqt', 'mata', 'emta',
    'hna', 'ahna', 'ana', 'nta', 'nti', 'ntuma', 'ntouma',
    'howa', 'hiya', 'huma', 'houma', 'hoa', 'hia',
    'had', 'hadchi', 'hada', 'hadi', 'hadou', 'hadouk',
    'bghit', 'bghiti', 'bgha', 'bghina', 'bghitiou',
    'galt', 'galti', 'gal', 'galet', 'galou',
    'rah', 'raha', 'rahi', 'rahom', 'rahin',
    'kan', 'kanu', 'kana', 'kanet', 'kano',
    'ghadi', 'ghad', 'gha', 'ghadia', 'ghadiyin',
    'daba', 'dak', 'dakchi', 'dik', 'dok',
    'bzf', 'bzzaf', 'bezzaf', 'bzaaaaf',
    'chway', 'chwiya', 'shwiya', 'chwia',
    'khoya', 'khuya', 'akhi', 'kho',
    'khti', 'khtiya', 'ukhti', 'kht',
    'mama', 'baba', 'lwaldin', 'lwalidin',
    'salam', 'salamu aleikum', 'slm',
    'yallah', 'yalla', 'hya', 'aji',
    'mabghitsh', 'mabghach', 'makansh', 'machi',
    'walakin', 'walaken', 'ama', 'mais',
    'kayn', 'makaynsh', 'chi', 'tayi',
)

# Whole-word matcher for the list above. Matching raw substrings would fire
# on unrelated text (e.g. 'ach' and 'chi' inside "machine").
_ARABIZI_WORD_RE = re.compile(
    r'\b(?:'
    + '|'.join(
        re.escape(word)
        for word in sorted(_ARABIZI_WORDS, key=len, reverse=True)
    )
    + r')\b'
)


def detect_arabizi(text):
    """
    Detect if input text is written in Arabizi (Latin script with numbers).

    Two signals are combined:
      * an Arabizi digit (2, 3, 5, 6, 7, 8, 9) directly attached to a Latin
        letter, and
      * a known Darija word or phrase appearing as a whole word.

    Both signals together are decisive. A single signal is accepted only when
    the text is mostly letters (more than 60% of all characters for the digit
    signal, more than 70% for the word signal).

    Args:
        text: The user message to classify. ``None`` and strings with fewer
            than two non-whitespace-trimmed characters are never Arabizi.

    Returns:
        True if the text looks like Arabizi, False otherwise. Text containing
        any Arabic-script character is always reported as not Arabizi.

    Note:
        This is a heuristic. A few markers are also ordinary words in other
        languages (e.g. "had", "mama", "mais"), so short English or French
        sentences that contain them can still be classified as Arabizi.
    """
    if not text or len(text.strip()) < 2:
        return False

    # Arabic script present: by definition not Arabizi.
    if _ARABIC_SCRIPT_RE.search(text):
        return False

    lowered = text.lower()
    has_arabizi_numbers = _ARABIZI_DIGIT_IN_WORD_RE.search(lowered) is not None
    has_arabizi_words = _ARABIZI_WORD_RE.search(lowered) is not None

    if has_arabizi_numbers and has_arabizi_words:
        return True

    # With only one signal, require the text to be predominantly letters.
    letter_count = sum(1 for char in text if char.isalpha())
    if has_arabizi_numbers and letter_count > len(text) * 0.6:
        return True
    if has_arabizi_words and letter_count > len(text) * 0.7:
        return True

    return False

def _extract_api_text(result):
    """Return the generated string from an Inference API payload, or None."""
    # Payloads may be wrapped in one or more lists; descend to the first item.
    while isinstance(result, list) and result:
        result = result[0]

    if isinstance(result, dict):
        for key in ("generated_text", "translation_text"):
            value = result.get(key)
            if isinstance(value, str):
                return value.strip()
        return None

    if isinstance(result, str):
        return result.strip()

    return None


def arabizi_to_arabic_api(arabizi_text):
    """
    Convert Arabizi text to Arabic using the Hugging Face Inference API.

    This is a best-effort conversion: on every failure path the original
    ``arabizi_text`` is returned unchanged so the caller always receives a
    usable string. Failure paths are a missing ``HF_TOKEN``, a non-200 status,
    a timeout or other request error, an unrecognised payload shape, and an
    empty generated string.

    Args:
        arabizi_text: The Latin-script Darija text to transliterate.

    Returns:
        The transliterated text with surrounding whitespace stripped, or
        ``arabizi_text`` itself when the conversion could not be performed.
    """
    hf_token = os.environ.get("HF_TOKEN")
    if hf_token is None:
        print("❌ HF_TOKEN not found, falling back to original text")
        return arabizi_text

    API_URL = "https://api-inference.huggingface.co/models/atlasia/Transliteration-Moroccan-Darija"
    headers = {"Authorization": f"Bearer {hf_token}"}
    payload = {
        "inputs": arabizi_text,
        "parameters": {
            "max_length": 512,
            "num_beams": 4,
            "early_stopping": True
        }
    }

    try:
        # The timeout keeps a stalled API call from hanging the chat handler.
        response = requests.post(API_URL, headers=headers, json=payload, timeout=30)

        if response.status_code == 503:
            print("⏳ Model is loading, falling back to original text")
            return arabizi_text

        if response.status_code != 200:
            print(f"❌ API error {response.status_code}: {response.text}")
            return arabizi_text

        result = response.json()
        converted = _extract_api_text(result)

        if converted is None:
            print(f"❌ Unexpected API response format: {result}")
            return arabizi_text

        if not converted:
            # An empty transliteration would hand the chat model a blank prompt.
            print("❌ API returned empty text, falling back to original text")
            return arabizi_text

        return converted

    except requests.exceptions.Timeout:
        print("⏰ API timeout, falling back to original text")
        return arabizi_text
    except requests.exceptions.RequestException as e:
        print(f"❌ API request failed: {e}")
        return arabizi_text
    except Exception as e:
        # Deliberate catch-all: conversion must never break the chat flow.
        print(f"❌ Unexpected error in API conversion: {e}")
        return arabizi_text

# Whole-word replacements (Arabic → Arabizi), applied before the per-character
# fallback. Each key appears exactly once; for 'هيا' and 'ماكاينش', which used
# to be listed twice, the value kept is the one that was effective at runtime.
_ARABIC_WORD_TO_ARABIZI = {
    'أنا': 'ana', 'نتا': 'nta', 'نتي': 'nti', 'هوا': 'howa',
    'حنا': 'hna', 'أحنا': 'ahna', 'نتوما': 'ntuma', 'هوما': 'huma',
    'شكون': 'shkoun', 'أشنو': 'achno', 'شنو': 'chno', 'واش': 'wach',
    'كيفاش': 'kifash', 'كيف': 'kif', 'فين': 'feen', 'منين': 'mnin',
    'إمتا': 'imta', 'متا': 'meta', 'علاش': '3lach', 'أش': 'ach',
    'بغيت': 'bghit', 'بغيتي': 'bghiti', 'بغا': 'bgha', 'بغينا': 'bghina',
    'كان': 'kan', 'كانا': 'kana', 'كانت': 'kanet', 'كانو': 'kanu',
    'قلت': 'galt', 'قلتي': 'galti', 'قال': 'gal', 'قالت': 'galet',
    'راح': 'rah', 'راها': 'raha', 'راهي': 'rahi', 'راهم': 'rahom',
    'غادي': 'ghadi', 'غاد': 'ghad', 'غا': 'gha',
    'هاد': 'had', 'هادا': 'hada', 'هادي': 'hadi', 'هادشي': 'hadchi',
    'داك': 'dak', 'ديك': 'dik', 'داكشي': 'dakchi',
    'بزاف': 'bzzaf', 'شوياة': 'chwiya', 'كولشي': 'kolchi',
    'ماشي': 'machi', 'مابغيتش': 'mabghitsh',
    'دابا': 'daba', 'توا': 'tawa', 'غدا': 'ghda',
    'ماما': 'mama', 'بابا': 'baba', 'خويا': 'khoya', 'ختي': 'khti',
    'سلام': 'salam', 'يالاه': 'yallah', 'هيا': 'hya',
    'المغرب': 'lmaghrib', 'مغرب': 'maghrib',
    'طاجين': 'tajine', 'أتاي': 'atay', 'خوبز': 'khobz',
    'كاين': 'kayn', 'ماكاينش': 'makaynsh', 'شي': 'chi',
    'زوين': 'zwin', 'زوينا': 'zwina', 'مزيان': 'mzyan', 'مزيانا': 'mzyana',
    'كاينين': 'kaynin', 'مطعم': 'ma63am', 'مطاعم': 'ma6a3im',
    'مشهور': 'mashhur', 'مشهورين': 'mashhurin', 'وسط': 'wost',
    'المدينة': 'lmdina', 'مدينة': 'mdina', 'إيطالي': 'italiy',
    'ياباني': 'yabani', 'مغربي': 'maghribi', 'فرنسي': 'fransi',
    'أمريكي': 'amriki', 'صيني': 'sini', 'هندي': 'hindi',
    'لحم': 'la7m', 'دجاج': 'djaj', 'حوت': '7ut', 'خضرة': 'khodra',
    'فواكه': 'fawakeh', 'جبن': 'jben', 'زبدة': 'zebda', 'حليب': '7lib',
    'قهوة': 'qahwa', 'شاي': 'atay', 'ماء': 'ma', 'عصير': '3asir',
    'خبز': 'khobz', 'رز': 'roz', 'مكرونة': 'makarona', 'بطاطا': 'batata',
    'طماطم': 'toma6im', 'بصل': 'basal', 'ثوم': 'tum', 'فلفل': 'felfel',
    'ملح': 'mel7', 'سكر': 'sokkar', 'زيت': 'zit', 'خل': 'khall',
}

# One compiled alternation replaces a separate re.sub per dictionary entry.
# The \b anchors keep a short key from matching inside a longer word; longest
# keys are listed first so the alternation reads most-specific-first.
_ARABIC_WORD_RE = re.compile(
    r'\b(?:'
    + '|'.join(
        re.escape(word)
        for word in sorted(_ARABIC_WORD_TO_ARABIZI, key=len, reverse=True)
    )
    + r')\b'
)

# Per-character fallback (Arabic → Arabizi) for anything the word table did
# not cover. Hamza-carrying alefs and the vowel marks are written as escapes
# so each key is unambiguously a single code point.
_ARABIC_CHAR_TABLE = str.maketrans({
    'ا': 'a', 'ب': 'b', 'ت': 't', 'ث': 'th', 'ج': 'j', 'ح': '7',
    'خ': 'kh', 'د': 'd', 'ذ': 'dh', 'ر': 'r', 'ز': 'z', 'س': 's',
    'ش': 'sh', 'ص': 's', 'ض': 'd', 'ط': '6', 'ظ': 'z', 'ع': '3',
    'غ': 'gh', 'ف': 'f', 'ق': '9', 'ك': 'k', 'ل': 'l', 'م': 'm',
    'ن': 'n', 'ه': 'h', 'و': 'w', 'ي': 'y', 'ء': '2',
    '\u0622': 'aa',  # alef with madda
    '\u0623': 'a',   # alef with hamza above
    '\u0625': 'i',   # alef with hamza below
    'ة': 'a', 'ى': 'a',
    '؟': '?', '،': ',', '؛': ';',
    '\u064e': 'a',   # fatha
    '\u064f': 'o',   # damma
    '\u0650': 'i',   # kasra
    '\u064b': 'an',  # fathatan
    '\u064c': 'on',  # dammatan
    '\u064d': 'in',  # kasratan
})


def arabic_to_arabizi(arabic_text):
    """
    Convert Arabic script to Arabizi using hard-coded mappings.

    The conversion runs in two passes: whole words found in the word table are
    replaced first, then every remaining Arabic character is transliterated
    one by one. Characters absent from both tables (Latin text, digits,
    unmapped symbols) pass through unchanged.

    Args:
        arabic_text: Text that may contain Arabic script.

    Returns:
        The transliterated text with leading and trailing whitespace removed.
        Falsy input (``None`` or ``""``) is returned as-is.
    """
    if not arabic_text:
        return arabic_text

    # Pass 1: whole-word substitutions.
    with_words = _ARABIC_WORD_RE.sub(
        lambda match: _ARABIC_WORD_TO_ARABIZI[match.group(0)],
        arabic_text,
    )

    # Pass 2: character-level fallback in a single translate() sweep.
    return with_words.translate(_ARABIC_CHAR_TABLE).strip()

def chat_with_atlas(message, history):
    """Answer one chat turn with Atlas-Chat, mirroring the user's script.

    A blank message gets a fixed bilingual greeting. Otherwise the message is
    classified with detect_arabizi(); Arabizi input is converted to Arabic via
    arabizi_to_arabic_api() before being sent to the model, and the model's
    reply is converted back with arabic_to_arabizi(). Any other input is sent
    to the model and returned as-is. A debug trace is printed along the way.

    Args:
        message: The user's message text.
        history: Prior conversation turns; accepted but not used here.

    Returns:
        The reply string, or a localized error message if anything raised.
    """
    if not message.strip():
        return "ahlan wa sahlan! kifash n9der n3awnek? / مرحبا! كيفاش نقدر نعاونك؟"

    separator = "=" * 50

    try:
        chat_pipe = load_atlas_model()
        arabizi_mode = detect_arabizi(message)

        print("\n" + separator)
        print("🔍 ATLAS-CHAT DEBUG LOG")
        print(separator)
        print(f"📥 INPUT: '{message}'")
        print(f"🔍 ARABIZI: {arabizi_mode}")

        # The model is prompted in Arabic script when the user wrote Arabizi.
        prompt_text = message
        if arabizi_mode:
            print("🔄 Converting Arabizi→Arabic via API...")
            prompt_text = arabizi_to_arabic_api(message)
            print(f"✅ ARABIC: '{prompt_text}'")
        else:
            print("➡️  No conversion needed")

        print(f"🤖 Sending to Atlas-Chat...")

        generation = chat_pipe(
            [{"role": "user", "content": prompt_text}],
            max_new_tokens=256,
            temperature=0.1,
            do_sample=True,
            pad_token_id=chat_pipe.tokenizer.eos_token_id,
        )

        # The last entry of the returned conversation is read as the reply.
        reply = generation[0]["generated_text"][-1]["content"].strip()
        reply_tail = '...' if len(reply) > 100 else ''
        print(f"✅ RESPONSE: '{reply[:100]}{reply_tail}'")

        if not arabizi_mode:
            print(separator + "\n")
            return reply

        # Answer in the same script the user wrote in.
        print("🔄 Converting Arabic→Arabizi...")
        latin_reply = arabic_to_arabizi(reply)
        latin_tail = '...' if len(latin_reply) > 100 else ''
        print(f"✅ FINAL: '{latin_reply[:100]}{latin_tail}'")
        print(separator + "\n")
        return latin_reply

    except Exception as exc:
        failure = str(exc)
        print(f"\n❌ ERROR: {failure}")
        print(separator + "\n")
        # Report the failure in the script the user wrote in.
        if detect_arabizi(message):
            return f"sorry, kan chi mochkil: {failure}. 3awd jar'b!"
        return f"عذراً، واجهت خطأ: {failure}. جرب مرة أخرى! / Sorry, error occurred: {failure}. Try again!"

# Create the Gradio interface.
# chat_with_atlas is registered as the chat handler. The example prompts mix
# Arabic-script questions, Arabizi questions and one English question, and
# cache_examples=False is passed so example outputs are not cached.
demo = gr.ChatInterface(
    fn=chat_with_atlas,
    title="🏔️ Atlas-Chat: AI-Powered Moroccan Arabic Assistant",
    description="""
    **مرحبا بك في أطلس شات!** Welcome to Atlas-Chat! 🇲🇦
    
    **🚀 Powered by Hugging Face Inference API:**
    - **Arabic Script (العربية)** → Direct conversation
    - **Arabizi (3arabi bi 7oruf latin)** → API conversion → Arabizi response  
    - **English** → Direct conversation
    
    **⚡ Features:**
    - Professional AI Arabizi conversion via API
    - No local model conflicts
    - Fast and reliable responses
    - Comprehensive language detection
    
    **جرب هذه الأسئلة / Try these questions:**
    """,
    examples=[
        "شكون لي صنعك؟",
        "shkoun li sna3ek?",
        "اشنو هو الطاجين؟",
        "achno howa tajine?", 
        "شنو كيتسمى المنتخب المغربي؟",
        "chno kaytsma lmontakhab lmaghribi?",
        "What is Morocco famous for?",
        "كيفاش نقدر نتعلم الدارجة؟",
        "kifash n9der nt3elem darija?",
        "wach kayn atay f lmaghrib?",
        "3lach lmaghrib zwien bzzaf?",
        "kifash nsali tajine?",
        "chno homa l2aklat lmaghribiya?",
        "kayn chi restaurants zwinin f casa?",
        "mr7ba! kif dayr?"
    ],
    cache_examples=False
)

# Launch the app only when this file is executed as a script, not on import.
if __name__ == "__main__":
    demo.launch()