File size: 11,390 Bytes
c6c2928
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
# filepath: [sentiment_api.py](http://_vscodecontentref_/0)
from fastapi import FastAPI, Request
from fastapi.middleware.cors import CORSMiddleware  # ← TAMBAH INI
from pydantic import BaseModel
from fastapi.responses import JSONResponse
import re
import os

app = FastAPI(title="Indonesian Sentiment Analysis API", 
              description="API untuk analisis sentimen bahasa Indonesia dengan dukungan bahasa gaul",
              version="1.0.0")

# ← TAMBAH CORS MIDDLEWARE
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # Allows all origins
    allow_credentials=True,
    allow_methods=["*"],  # Allows all methods
    allow_headers=["*"],  # Allows all headers
)

# Global variable untuk model (akan diload jika tersedia)
model = None
tokenizer = None
model_loaded = False

def load_model():
    """Try to load IndoBERT sentiment model, fallback to enhanced keyword if failed"""
    global model, tokenizer, model_loaded
    
    # List model alternatif yang bisa dicoba
    model_options = [
        "indolem/indobert-base-uncased",  # Model yang lebih umum dan pasti tersedia
        "cahya/bert-base-indonesian-1.5G",  # Alternative Indonesian BERT
        "mdhugol/indonesia-bert-sentiment-classification"  # Specific sentiment model
    ]
    
    for model_name in model_options:
        try:
            print(f"πŸ”„ Trying to load model: {model_name}")
            from transformers import AutoTokenizer, AutoModelForSequenceClassification
            
            # Load tokenizer
            print(f"πŸ“₯ Downloading tokenizer for {model_name}...")
            tokenizer = AutoTokenizer.from_pretrained(model_name)
            print("βœ… Tokenizer loaded successfully!")
            
            # Load model
            print(f"πŸ“₯ Downloading model {model_name} (this may take a while)...")
            model = AutoModelForSequenceClassification.from_pretrained(model_name)
            print("βœ… Model loaded successfully!")
            
            model_loaded = True
            print(f"πŸŽ‰ {model_name} ready for sentiment analysis!")
            return  # Exit jika berhasil
            
        except Exception as e:
            print(f"❌ Failed to load {model_name}: {e}")
            continue  # Coba model berikutnya
    
    # Jika semua model gagal
    print("❌ All models failed to load")
    print("πŸ”„ Using enhanced keyword-based analysis instead")
    model_loaded = False

# Try to load model on startup
load_model()

@app.get("/")
async def root():
    """Root endpoint"""
    model_name = "Unknown"
    if model_loaded and model is not None:
        model_name = model.config.name_or_path if hasattr(model.config, 'name_or_path') else "Indonesian BERT Model"
    
    return {
        "message": "Indonesian Sentiment Analysis API", 
        "version": "1.0.0",
        "docs": "/docs",
        "model_loaded": model_loaded,
        "model_name": model_name if model_loaded else "Enhanced Keyword Analysis",
        "model_type": "πŸ€– AI Model" if model_loaded else "πŸ“ Keyword Analysis",
        "status": "πŸŽ‰ Ready!" if model_loaded else "πŸ“ Keyword Ready!"
    }

@app.get("/health")
async def health_check():
    """Health check endpoint"""
    return {
        "status": "healthy", 
        "model_loaded": model_loaded,
        "model_type": "IndoBERTweet" if model_loaded else "Enhanced Keyword Analysis",
        "ready": True
    }

class TextRequest(BaseModel):
    text: str

def normalize_slang(text):
    """Normalisasi kata gaul/slang ke bahasa baku"""
    slang_dict = {
        'gw': 'saya', 'gue': 'saya', 'w': 'saya',
        'lu': 'kamu', 'elu': 'kamu', 'lo': 'kamu',
        'gk': 'tidak', 'ga': 'tidak', 'gak': 'tidak', 'engga': 'tidak',
        'bgt': 'banget', 'bgt': 'sangat',
        'btw': 'ngomong ngomong', 'fyi': 'informasi',
        'yg': 'yang', 'yng': 'yang',
        'dgn': 'dengan', 'dg': 'dengan',
        'org': 'orang', 'orng': 'orang',
        'udh': 'sudah', 'udah': 'sudah', 'dah': 'sudah',
        'blm': 'belum', 'blom': 'belum',
        'bkn': 'bukan', 'bukan': 'bukan',
        'krn': 'karena', 'krna': 'karena',
        'trs': 'terus', 'trus': 'terus',
        'jg': 'juga', 'jga': 'juga',
        'aja': 'saja', 'ajah': 'saja',
        'emg': 'memang', 'emang': 'memang',
        'tp': 'tapi', 'tapi': 'tetapi',
        'kalo': 'kalau', 'klo': 'kalau',
        'gimana': 'bagaimana', 'gmn': 'bagaimana',
        'knp': 'kenapa', 'knapa': 'kenapa',
        'mantap': 'bagus', 'mantul': 'bagus',
        'anjay': 'wah', 'anjir': 'wah',
        'gabut': 'tidak ada kegiatan',
        'mager': 'malas gerak',
        'baper': 'bawa perasaan',
        'santuy': 'santai',
        'kepo': 'ingin tahu',
        'php': 'pemberi harapan palsu',
        'bucin': 'budak cinta',
        # Tambahan kata positif yang sering dipakai
        'seneng': 'senang', 'senang': 'senang',
        'bahagia': 'bahagia', 'happy': 'senang',
        'kamaren': 'kemarin', 'kemaren': 'kemarin'
    }
    
    # Convert to lowercase
    text = text.lower()
    
    # Replace slang words
    for slang, formal in slang_dict.items():
        text = re.sub(r'\b' + slang + r'\b', formal, text)
    
    return text

def analyze_sentiment(text):
    """Analisis sentimen dengan IndoBERTweet atau enhanced keyword"""
    global model, tokenizer, model_loaded
    
    # Normalisasi kata gaul
    normalized_text = normalize_slang(text)
    
    # Debug info
    print(f"πŸ” Analyzing: '{text}'")
    print(f"πŸ”§ Normalized: '{normalized_text}'")
    print(f"πŸ€– Model loaded: {model_loaded}")
    
    # Coba gunakan IndoBERTweet jika tersedia
    if model_loaded and model is not None and tokenizer is not None:
        try:
            import torch
            print("🎯 Using IndoBERTweet model...")
            
            # Tokenize input
            inputs = tokenizer(normalized_text, return_tensors="pt", truncation=True, padding=True, max_length=512)
            
            # Get prediction
            with torch.no_grad():
                outputs = model(**inputs)
                logits = outputs.logits
                probabilities = torch.softmax(logits, dim=1)
                pred = torch.argmax(logits, dim=1).item()
                confidence = torch.max(probabilities).item()
            
            print(f"πŸ“Š IndoBERTweet prediction: {pred} (confidence: {confidence:.3f})")
            print(f"πŸ“Š Probabilities: {probabilities.numpy()}")
            
            # Mapping label IndoBERTweet ke rating bintang
            # IndoBERTweet: 0=negative, 1=neutral, 2=positive
            if pred == 2:  # positive
                result = 5 if confidence > 0.8 else 4
                print(f"βœ… Result: {result} stars (Positive)")
                return result
            elif pred == 1:  # neutral
                result = 3
                print(f"😐 Result: {result} stars (Neutral)")
                return result
            else:  # negative (pred == 0)
                result = 1 if confidence > 0.8 else 2
                print(f"❌ Result: {result} stars (Negative)")
                return result
                
        except Exception as e:
            print(f"⚠️ Error using IndoBERTweet: {e}")
            print("πŸ”„ Falling back to keyword analysis...")
    
    # Enhanced keyword-based analysis (fallback)
    print("πŸ”€ Using enhanced keyword analysis...")
    result = enhanced_keyword_analysis(normalized_text, text)
    print(f"πŸ“ Keyword analysis result: {result} stars")
    return result

def enhanced_keyword_analysis(normalized_text, original_text):
    """Enhanced keyword analysis untuk bahasa Indonesia + slang"""
    text_lower = normalized_text.lower()
    
    # Positive keywords (diperbanyak dan lebih sensitif)
    positive_words = [
        "senang", "bahagia", "happy", "mantap", "bagus", "keren", "suka", "cinta", "love",
        "amazing", "luar biasa", "hebat", "fantastis", "sempurna", "excellent", "good",
        "positif", "optimis", "gembiraan", "kebahagiaan", "sukses", "berhasil", "menang",
        "excited", "antusias", "semangat", "motivasi", "inspirasi", "grateful", "bersyukur",
        "mantul", "jos", "top", "juara", "recommended", "worth it", "puas", "satisfied",
        "gembira", "asyik", "asik", "cool", "nice", "wonderful", "great", "awesome"
    ]
    
    # Strong positive words (kata yang sangat positif)
    strong_positive_words = [
        "banget", "sangat", "luar biasa", "fantastis", "sempurna", "amazing", "awesome",
        "gembira", "bahagia banget", "senang banget", "happy banget"
    ]
    
    # Negative keywords (diperbanyak)
    negative_words = [
        "marah", "kesal", "benci", "jelek", "buruk", "jahat", "sedih", "kecewa", "galau",
        "frustrated", "angry", "hate", "bad", "terrible", "awful", "horrible", "disgusting",
        "menyebalkan", "annoying", "stress", "depresi", "down", "hopeless", "putus asa",
        "fail", "gagal", "rugi", "loss", "disappointed", "broken heart", "sakit hati",
        "toxic", "drama", "problem", "masalah", "susah", "sulit", "capek", "tired"
    ]
    
    # Neutral/Mixed keywords
    neutral_words = [
        "biasa", "standard", "normal", "okay", "ok", "fine", "lumayan", "so so",
        "average", "medium", "moderate", "netral", "balanced", "mixed feelings"
    ]
    
    # Negation words (kata negasi)
    negation_words = ["tidak", "bukan", "jangan", "gak", "ga", "engga", "no", "nope", "never"]
    
    # Count sentiment words
    positive_count = sum(1 for word in positive_words if word in text_lower)
    strong_positive_count = sum(1 for word in strong_positive_words if word in text_lower)
    negative_count = sum(1 for word in negative_words if word in text_lower)
    neutral_count = sum(1 for word in neutral_words if word in text_lower)
    
    # Check for combinations like "senang banget"
    if "senang banget" in text_lower or "bahagia banget" in text_lower or "happy banget" in text_lower:
        strong_positive_count += 2
    
    # Check for negations
    has_negation = any(neg in text_lower for neg in negation_words)
    
    # Advanced scoring with context
    if has_negation:
        # If there's negation, flip the sentiment partially
        if positive_count > negative_count:
            return 3  # Neutral instead of positive
        elif negative_count > positive_count:
            return 4  # Less negative
    
    # Calculate sentiment score with strong positive bonus
    total_positive = positive_count + (strong_positive_count * 2)  # Strong words worth double
    
    if total_positive > negative_count + neutral_count:
        return 5  # Strong positive
    elif total_positive > negative_count:
        return 4  # Mild positive
    elif negative_count > total_positive + neutral_count:
        return 1  # Strong negative
    elif negative_count > total_positive:
        return 2  # Mild negative
    else:
        return 3  # Neutral

@app.post("/predict")
async def predict(req: TextRequest):
    stars = analyze_sentiment(req.text)
    return JSONResponse(content={"stars": stars})