# indonesian-sentiment-api / sentiment_api.py
# Uploaded by rifal2024 ("Upload sentiment_api.py", commit c6c2928, verified).
# filepath: sentiment_api.py
from fastapi import FastAPI, Request
from fastapi.middleware.cors import CORSMiddleware # ← TAMBAH INI
from pydantic import BaseModel
from fastapi.responses import JSONResponse
import re
import os
# Application object; title/description/version are shown in the generated docs.
app = FastAPI(
    title="Indonesian Sentiment Analysis API",
    description="API untuk analisis sentimen bahasa Indonesia dengan dukungan bahasa gaul",
    version="1.0.0",
)

# CORS: the API is open to every origin. Credentials (cookies / Authorization
# headers) are deliberately NOT allowed: combining wildcard origins with
# allow_credentials=True is rejected by the CORS specification and would let any
# website issue credentialed requests. To support credentials, replace the
# wildcard with an explicit list of trusted origins first.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # any origin may call the API
    allow_credentials=False,
    allow_methods=["*"],  # any HTTP method
    allow_headers=["*"],  # any request header
)
# Module-level model state; the values stay empty/False until a checkpoint
# has been loaded successfully.
model, tokenizer = None, None
model_loaded = False
def load_model():
    """Try to load an Indonesian BERT sequence-classification checkpoint.

    Candidate checkpoints are tried in order; the first one whose tokenizer
    AND model both load is published to the module globals ``tokenizer`` and
    ``model`` and ``model_loaded`` is set to True. If ``transformers`` cannot
    be imported, or every candidate fails, the globals are reset
    (``None``/``None``/``False``) so callers fall back to keyword analysis.

    Returns:
        None. The outcome is reported through the module globals and stdout.
    """
    global model, tokenizer, model_loaded

    # Candidate checkpoints, tried in this order.
    # NOTE(review): the first two names look like general-purpose (base)
    # checkpoints rather than sentiment fine-tunes; if so, their
    # classification head would be freshly initialised. Confirm against the
    # model cards before relying on their predictions.
    model_options = [
        "indolem/indobert-base-uncased",  # more general model, expected to be available
        "cahya/bert-base-indonesian-1.5G",  # alternative Indonesian BERT
        "mdhugol/indonesia-bert-sentiment-classification",  # sentiment-specific model
    ]

    # Import once, outside the loop: if the library itself is unusable there
    # is no point in retrying the import for every candidate.
    try:
        from transformers import AutoTokenizer, AutoModelForSequenceClassification
    except Exception as e:  # best-effort: any import problem means "no model"
        print(f"❌ Failed to import transformers: {e}")
        model_options = []

    for model_name in model_options:
        try:
            print(f"πŸ”„ Trying to load model: {model_name}")

            # Load into locals first so a half-finished attempt never leaks
            # a tokenizer without its matching model into the globals.
            print(f"πŸ“₯ Downloading tokenizer for {model_name}...")
            candidate_tokenizer = AutoTokenizer.from_pretrained(model_name)
            print("βœ… Tokenizer loaded successfully!")

            print(f"πŸ“₯ Downloading model {model_name} (this may take a while)...")
            candidate_model = AutoModelForSequenceClassification.from_pretrained(model_name)
            print("βœ… Model loaded successfully!")
        except Exception as e:
            print(f"❌ Failed to load {model_name}: {e}")
            continue  # try the next candidate

        # Publish the matching pair only after both pieces loaded.
        tokenizer = candidate_tokenizer
        model = candidate_model
        model_loaded = True
        print(f"πŸŽ‰ {model_name} ready for sentiment analysis!")
        return

    # Every candidate failed (or transformers is unavailable).
    print("❌ All models failed to load")
    print("πŸ”„ Using enhanced keyword-based analysis instead")
    model = None
    tokenizer = None
    model_loaded = False


# Try to load a model once, at import/startup time.
load_model()
@app.get("/")
async def root():
    """Return API metadata and a description of the active analysis backend."""
    if model_loaded:
        # A loaded flag without a model object is reported as "Unknown".
        if model is not None:
            reported_name = getattr(model.config, "name_or_path", "Indonesian BERT Model")
        else:
            reported_name = "Unknown"
        backend_label = "πŸ€– AI Model"
        readiness = "πŸŽ‰ Ready!"
    else:
        reported_name = "Enhanced Keyword Analysis"
        backend_label = "πŸ“ Keyword Analysis"
        readiness = "πŸ“ Keyword Ready!"

    return {
        "message": "Indonesian Sentiment Analysis API",
        "version": "1.0.0",
        "docs": "/docs",
        "model_loaded": model_loaded,
        "model_name": reported_name,
        "model_type": backend_label,
        "status": readiness,
    }
@app.get("/health")
async def health_check():
    """Report service health together with the active backend type."""
    backend = "IndoBERTweet" if model_loaded else "Enhanced Keyword Analysis"
    payload = {"status": "healthy"}
    payload["model_loaded"] = model_loaded
    payload["model_type"] = backend
    payload["ready"] = True
    return payload
# Request body accepted by the POST /predict endpoint.
class TextRequest(BaseModel):
    # Raw text whose sentiment should be analysed.
    text: str
# Slang / abbreviation -> standard Indonesian replacement table.
# Every key is a single lowercase word; values may be multi-word phrases.
_SLANG_DICT = {
    'gw': 'saya', 'gue': 'saya', 'w': 'saya',
    'lu': 'kamu', 'elu': 'kamu', 'lo': 'kamu',
    'gk': 'tidak', 'ga': 'tidak', 'gak': 'tidak', 'engga': 'tidak',
    # The table used to list 'bgt' twice ('banget', then 'sangat'); the later
    # entry silently won, so that effective value is kept here.
    'bgt': 'sangat',
    'btw': 'ngomong ngomong', 'fyi': 'informasi',
    'yg': 'yang', 'yng': 'yang',
    'dgn': 'dengan', 'dg': 'dengan',
    'org': 'orang', 'orng': 'orang',
    'udh': 'sudah', 'udah': 'sudah', 'dah': 'sudah',
    'blm': 'belum', 'blom': 'belum',
    'bkn': 'bukan',
    'krn': 'karena', 'krna': 'karena',
    'trs': 'terus', 'trus': 'terus',
    'jg': 'juga', 'jga': 'juga',
    'aja': 'saja', 'ajah': 'saja',
    'emg': 'memang', 'emang': 'memang',
    # 'tp' used to become 'tapi' and was then rewritten again by the 'tapi'
    # rule; the final result is stored directly so a single pass suffices.
    'tp': 'tetapi', 'tapi': 'tetapi',
    'kalo': 'kalau', 'klo': 'kalau',
    'gimana': 'bagaimana', 'gmn': 'bagaimana',
    'knp': 'kenapa', 'knapa': 'kenapa',
    'mantap': 'bagus', 'mantul': 'bagus',
    'anjay': 'wah', 'anjir': 'wah',
    'gabut': 'tidak ada kegiatan',
    'mager': 'malas gerak',
    'baper': 'bawa perasaan',
    'santuy': 'santai',
    'kepo': 'ingin tahu',
    'php': 'pemberi harapan palsu',
    'bucin': 'budak cinta',
    # Frequently used positive words
    'seneng': 'senang',
    'happy': 'senang',
    'kamaren': 'kemarin', 'kemaren': 'kemarin',
}

# One alternation over all slang keys, anchored on word boundaries so only
# whole words are rewritten. Compiled once instead of once per key per call.
_SLANG_PATTERN = re.compile(
    r'\b(?:' + '|'.join(re.escape(key) for key in _SLANG_DICT) + r')\b'
)


def normalize_slang(text):
    """Normalise Indonesian slang/abbreviations to their standard form.

    The text is lowercased, then every whole word found in the slang table is
    replaced by its standard equivalent in a single pass.

    Args:
        text: Input string.

    Returns:
        The lowercased string with slang words replaced.
    """
    return _SLANG_PATTERN.sub(lambda match: _SLANG_DICT[match.group(0)], text.lower())
def analyze_sentiment(text):
    """Rate the sentiment of *text* on a 1-5 star scale.

    The text is first passed through ``normalize_slang``. If a model and
    tokenizer are loaded, the model is used; on any model error, or when the
    model's output does not have exactly three classes, the function falls
    back to ``enhanced_keyword_analysis``.

    Args:
        text: Raw input string.

    Returns:
        int: 1-2 (negative), 3 (neutral) or 4-5 (positive).
    """
    # Normalise slang before any analysis.
    normalized_text = normalize_slang(text)

    # Debug info
    print(f"πŸ” Analyzing: '{text}'")
    print(f"πŸ”§ Normalized: '{normalized_text}'")
    print(f"πŸ€– Model loaded: {model_loaded}")

    # Use the transformer model when one is available.
    if model_loaded and model is not None and tokenizer is not None:
        try:
            import torch

            print("🎯 Using IndoBERTweet model...")

            # Tokenize input
            inputs = tokenizer(normalized_text, return_tensors="pt", truncation=True, padding=True, max_length=512)

            # Get prediction
            with torch.no_grad():
                outputs = model(**inputs)
            logits = outputs.logits

            # The star mapping below only makes sense for a 3-class head.
            # With any other head size (e.g. a 2-class head) "positive" could
            # never be predicted, so treat it as a model error and fall back.
            num_classes = logits.shape[-1]
            if num_classes != 3:
                raise ValueError(
                    f"expected a 3-class sentiment head, got {num_classes} classes"
                )

            probabilities = torch.softmax(logits, dim=1)
            pred = torch.argmax(logits, dim=1).item()
            confidence = torch.max(probabilities).item()
            print(f"πŸ“Š IndoBERTweet prediction: {pred} (confidence: {confidence:.3f})")
            print(f"πŸ“Š Probabilities: {probabilities.numpy()}")

            # Map the class index to a star rating.
            # Assumed label order: 0=negative, 1=neutral, 2=positive.
            # NOTE(review): this order is an assumption about the loaded
            # checkpoint; confirm it against that checkpoint's model card.
            if pred == 2:  # positive
                result = 5 if confidence > 0.8 else 4
                print(f"βœ… Result: {result} stars (Positive)")
                return result
            elif pred == 1:  # neutral
                result = 3
                print(f"😐 Result: {result} stars (Neutral)")
                return result
            else:  # negative (pred == 0)
                result = 1 if confidence > 0.8 else 2
                print(f"❌ Result: {result} stars (Negative)")
                return result
        except Exception as e:
            # Deliberate best-effort: any model failure degrades to keywords.
            print(f"⚠️ Error using IndoBERTweet: {e}")
            print("πŸ”„ Falling back to keyword analysis...")

    # Enhanced keyword-based analysis (fallback)
    print("πŸ”€ Using enhanced keyword analysis...")
    result = enhanced_keyword_analysis(normalized_text, text)
    print(f"πŸ“ Keyword analysis result: {result} stars")
    return result
def _compile_terms(terms):
    """Compile whole-word patterns for *terms* (optional -nya/-lah/-kah enclitic allowed)."""
    return tuple(
        re.compile(r"\b" + re.escape(term) + r"(?:nya|lah|kah)?\b") for term in terms
    )


def _count_terms(patterns, text):
    """Count how many of the compiled *patterns* occur at least once in *text*."""
    return sum(1 for pattern in patterns if pattern.search(text))


# Positive keywords
_POSITIVE_PATTERNS = _compile_terms([
    "senang", "bahagia", "happy", "mantap", "bagus", "keren", "suka", "cinta", "love",
    "amazing", "luar biasa", "hebat", "fantastis", "sempurna", "excellent", "good",
    "positif", "optimis", "gembiraan", "kebahagiaan", "sukses", "berhasil", "menang",
    "excited", "antusias", "semangat", "motivasi", "inspirasi", "grateful", "bersyukur",
    "mantul", "jos", "top", "juara", "recommended", "worth it", "puas", "satisfied",
    "gembira", "asyik", "asik", "cool", "nice", "wonderful", "great", "awesome",
])

# Strongly positive words/phrases (each one is worth double)
_STRONG_POSITIVE_PATTERNS = _compile_terms([
    "luar biasa", "fantastis", "sempurna", "amazing", "awesome",
    "gembira", "bahagia banget", "senang banget", "happy banget",
])

# Positive word + intensifier combinations that earn an extra bonus
_STRONG_COMBO_PATTERNS = _compile_terms([
    "senang banget", "bahagia banget", "happy banget",
])

# Intensifiers carry no polarity of their own ("jelek banget" is MORE
# negative, not positive), so they are scored separately.
_INTENSIFIER_PATTERNS = _compile_terms(["banget", "sangat"])

# Negative keywords
_NEGATIVE_PATTERNS = _compile_terms([
    "marah", "kesal", "benci", "jelek", "buruk", "jahat", "sedih", "kecewa", "galau",
    "frustrated", "angry", "hate", "bad", "terrible", "awful", "horrible", "disgusting",
    "menyebalkan", "annoying", "stress", "depresi", "down", "hopeless", "putus asa",
    "fail", "gagal", "rugi", "loss", "disappointed", "broken heart", "sakit hati",
    "toxic", "drama", "problem", "masalah", "susah", "sulit", "capek", "tired",
])

# Neutral / mixed keywords
_NEUTRAL_PATTERNS = _compile_terms([
    "biasa", "standard", "normal", "okay", "ok", "fine", "lumayan", "so so",
    "average", "medium", "moderate", "netral", "balanced", "mixed feelings",
])

# Negation words
_NEGATION_PATTERNS = _compile_terms(
    ["tidak", "bukan", "jangan", "gak", "ga", "engga", "no", "nope", "never"]
)


def enhanced_keyword_analysis(normalized_text, original_text):
    """Keyword-based sentiment rating for Indonesian text (including slang).

    Keywords are matched as whole words (optionally followed by the enclitics
    -nya/-lah/-kah), so short entries such as "ga", "no", "top" or "down" no
    longer fire inside unrelated words like "bagus", "normal", "stop" or
    "download". Each keyword counts at most once, however often it occurs.

    Args:
        normalized_text: Text to score (expected to be slang-normalised).
        original_text: Unused; kept so existing callers keep working.

    Returns:
        int: star rating from 1 (strongly negative) to 5 (strongly positive);
        3 means neutral or undecided.
    """
    text_lower = normalized_text.lower()

    # Count sentiment words
    positive_count = _count_terms(_POSITIVE_PATTERNS, text_lower)
    strong_positive_count = _count_terms(_STRONG_POSITIVE_PATTERNS, text_lower)
    negative_count = _count_terms(_NEGATIVE_PATTERNS, text_lower)
    neutral_count = _count_terms(_NEUTRAL_PATTERNS, text_lower)
    intensifier_count = _count_terms(_INTENSIFIER_PATTERNS, text_lower)

    # Extra bonus for combinations like "senang banget"
    if any(pattern.search(text_lower) for pattern in _STRONG_COMBO_PATTERNS):
        strong_positive_count += 2

    # With a negation present the leading sentiment is partially flipped.
    has_negation = any(pattern.search(text_lower) for pattern in _NEGATION_PATTERNS)
    if has_negation:
        if positive_count > negative_count:
            return 3  # negated positive -> neutral
        if negative_count > positive_count:
            return 4  # negated negative -> mildly positive

    # Strong positive words are worth double.
    positive_score = positive_count + strong_positive_count * 2
    negative_score = negative_count

    # Intensifiers amplify whichever polarity is already leading; on a tie
    # (or when no sentiment word is present) they are ignored.
    if positive_score > negative_score:
        positive_score += intensifier_count * 2
    elif negative_score > positive_score:
        negative_score += intensifier_count * 2

    if positive_score > negative_score + neutral_count:
        return 5  # strong positive
    if positive_score > negative_score:
        return 4  # mild positive
    if negative_score > positive_score + neutral_count:
        return 1  # strong negative
    if negative_score > positive_score:
        return 2  # mild negative
    return 3  # neutral
@app.post("/predict")
async def predict(req: TextRequest):
    """Analyse the sentiment of the submitted text.

    Args:
        req: Request body carrying the text to analyse.

    Returns:
        JSONResponse with a single key, ``stars``, holding the rating
        produced by ``analyze_sentiment``.
    """
    # Local import: fastapi is already a dependency of this file.
    from fastapi.concurrency import run_in_threadpool

    # analyze_sentiment is synchronous and may run model inference; running
    # it in the threadpool keeps the event loop free for other requests.
    stars = await run_in_threadpool(analyze_sentiment, req.text)
    return JSONResponse(content={"stars": stars})