Spaces:

NLong
/

FakeNews_Detector

Running

App Files Files Community

NLong committed on Sep 27

Commit

6c48551

verified ·

1 Parent(s): 72f7858

Upload app.py

Browse files

Files changed (1) hide show

app.py +118 -36

app.py CHANGED Viewed

@@ -8,7 +8,7 @@ import re
 import os
 import numpy as np
-GOOGLE_API_KEY = "AIzaSyDu0819TPX_Z1AcAT5xT1SNjjmb64PSc1I"
 SEARCH_ENGINE_ID = "f34f8a4816771488b"
 GEMINI_API_KEY = "AIzaSyAHPzJ_VjTX3gZLBV28d3sq97SdER2qfkc"
 MODEL_PATH = "./vietnamese_fake_news_model"
@@ -137,8 +137,36 @@ def process_search_results(items):
     return search_results
 def google_search_fallback(news_text):
-    print("Google Search is unavailable due to quota exceeded")
-    return []
@@ -344,29 +372,42 @@ def analyze_with_gemini(news_text, search_results, distilbert_prediction, distil
         else:
             search_summary = "Không tìm thấy kết quả tìm kiếm Google cho tin tức này. Điều này có thể do API bị giới hạn hoặc tin tức quá mới/chưa được đăng tải."
-        # Include our model results in the analysis
-        distilbert_analysis = ""
-        if distilbert_prediction:
-            distilbert_analysis = f"Phân tích DistilBERT: Dự đoán '{distilbert_prediction}' với độ tin cậy {distilbert_confidence:.3f}"
-        else:
-            distilbert_analysis = "DistilBERT: Không thể phân tích"
         prompt = f"""
-Hãy phân tích tin tức sau và đánh giá độ tin cậy của nó một cách đơn giản, dễ hiểu:
 "{news_text}"
 {search_summary}
-{distilbert_analysis}
-Hãy trả lời bằng tiếng Việt, ngắn gọn và dễ hiểu cho người dùng bình thường:
-1. Tin tức này có vẻ THẬT hay GIẢ? (Chỉ trả lời THẬT hoặc GIẢ)
-2. Tại sao bạn nghĩ vậy? (Giải thích ngắn gọn, dễ hiểu)
-3. Người đọc nên làm gì? (Lời khuyên đơn giản)
-Tránh dùng thuật ngữ kỹ thuật, hãy viết như đang nói chuyện với bạn bè.
 """
         print("Calling Gemini API...")
@@ -375,12 +416,12 @@ Tránh dùng thuật ngữ kỹ thuật, hãy viết như đang nói chuyện v
         if search_results:
             print(f"DEBUG - First search result title: {search_results[0].get('title', 'No title')}")
-        # Use consistent settings to get reliable results
         generation_config = genai.types.GenerationConfig(
-            temperature=0.1,  # Low temperature for more consistent results
-            top_p=0.8,        # Focus on most likely tokens
-            top_k=20,         # Limit vocabulary choices
-            max_output_tokens=1000
         )
         response = model.generate_content(prompt, generation_config=generation_config)
         print("Gemini API response received successfully")
@@ -392,20 +433,59 @@ Tránh dùng thuật ngữ kỹ thuật, hãy viết như đang nói chuyện v
         # If we hit the API limit, provide a basic analysis
         if "429" in str(e) or "quota" in str(e).lower():
-            print("Gemini API quota exceeded, providing fallback analysis...")
-            fallback_analysis = f"""
-**Phân tích cơ bản (do giới hạn API):**
-🤖 **Kết quả AI:** {'Tin tức này có vẻ THẬT' if distilbert_prediction == 'REAL' else 'Tin tức này có vẻ GIẢ' if distilbert_prediction == 'FAKE' else 'Không thể xác định'}
-📊 **Độ tin cậy:** {f"{distilbert_confidence:.0%}" if distilbert_confidence else 'Không có'}
-🌐 **Nguồn tin:** {len(search_results) if search_results else 0} nguồn được tìm thấy
-⚠️ **Lưu ý:** Google Search không khả dụng do hết quota. Phân tích chỉ dựa trên AI.
-💡 **Khuyến nghị:** Hãy kiểm tra thêm từ các nguồn tin chính thống trước khi tin tưởng hoàn toàn.
-"""
             return fallback_analysis
         # For other errors, see what models are available
@@ -634,9 +714,11 @@ def analyze_news(news_text):
 </div>
 ### 🧠 **Phân tích thông minh**
-<div style="background: #f8f9fa; padding: 15px; border-radius: 8px; border-left: 4px solid #ffc107; margin: 10px 0;">
 {gemini_analysis}
 </div>
 ### 📊 **KẾT LUẬN CUỐI CÙNG**
 <div style="background: linear-gradient(135deg, #f093fb 0%, #f5576c 100%); color: white; padding: 15px; border-radius: 8px; margin: 20px 0;">

 import os
 import numpy as np
# SECURITY NOTE(review): the string literals below are credentials stored in
# committed source. They should be rotated and supplied only through the
# environment (e.g. the hosting platform's secret settings). Each value is now
# read from an environment variable of the same name; the literal is kept only
# as a backward-compatible fallback and should be deleted once the secret is
# configured.
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY", "AIzaSyASwqVh3ELFVKH-W3WuHtmjg3XgtwjJQKg")
SEARCH_ENGINE_ID = os.environ.get("SEARCH_ENGINE_ID", "f34f8a4816771488b")
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY", "AIzaSyAHPzJ_VjTX3gZLBV28d3sq97SdER2qfkc")
# Relative path of the local model directory.
MODEL_PATH = "./vietnamese_fake_news_model"
     return search_results
 def google_search_fallback(news_text):
     """Build one pseudo search result from keyword heuristics on the text.

     Used when Google Search cannot be queried: instead of returning nothing,
     the text itself is scanned for suspicion / authenticity keywords and a
     few surface patterns, and the findings are packed into a single
     result-shaped dict.

     Args:
         news_text: The news text to inspect. ``None`` is treated as an empty
             string so the fallback path itself never crashes.

     Returns:
         A one-element list containing a dict with the keys ``title``,
         ``snippet`` (the human-readable findings) and ``link`` (the fixed
         marker ``'content-analysis-only'``).
     """
     import unicodedata  # local import keeps this block self-contained

     print("Google Search is unavailable - using enhanced content analysis")

     # Enhanced content analysis without external search
     fake_indicators = ['giả', 'sai', 'không đúng', 'bịa đặt', 'lừa đảo', 'fake news', 'tin đồn', 'nghi vấn']
     real_indicators = ['chính thức', 'xác nhận', 'chính xác', 'đúng', 'verified', 'chính phủ', 'bộ y tế', 'cơ quan']

     def _canon(text):
         # Vietnamese text may arrive with decomposed diacritics (NFD);
         # compose to NFC so substring matching sees one spelling per word.
         return unicodedata.normalize('NFC', text).lower()

     raw_text = news_text or ""
     text_lower = _canon(raw_text)
     fake_terms = [_canon(word) for word in fake_indicators]
     real_terms = [_canon(word) for word in real_indicators]

     fake_count = sum(1 for word in fake_terms if word in text_lower)

     # Blank out matched suspicion phrases before counting authenticity
     # keywords; otherwise 'không đúng' would also be counted as 'đúng'.
     text_without_fake = text_lower
     for word in fake_terms:
         text_without_fake = text_without_fake.replace(word, ' ')
     real_count = sum(1 for word in real_terms if word in text_without_fake)

     # Create more detailed analysis
     analysis_details = []
     if fake_count > real_count:
         analysis_details.append("Nhiều từ khóa nghi ngờ được sử dụng")
     elif real_count > fake_count:
         analysis_details.append("Có từ khóa xác thực từ nguồn chính thức")

     # Check for other patterns
     if len(raw_text) < 100:
         analysis_details.append("Tin tức quá ngắn, thiếu thông tin chi tiết")
     # "Excessive" means at least three exclamation marks; a single '!' is
     # ordinary punctuation and must not trigger the warning.
     if raw_text.count('!') >= 3:
         analysis_details.append("Sử dụng dấu chấm than thái quá")

     snippet = f"Phân tích nội dung: {fake_count} từ nghi ngờ, {real_count} từ xác thực. "
     snippet += "; ".join(analysis_details) if analysis_details else "Không phát hiện dấu hiệu đặc biệt"

     return [{
         'title': 'Phân tích nội dung chi tiết (không có tìm kiếm Google)',
         'snippet': snippet,
         'link': 'content-analysis-only'
     }]
         else:
             search_summary = "Không tìm thấy kết quả tìm kiếm Google cho tin tức này. Điều này có thể do API bị giới hạn hoặc tin tức quá mới/chưa được đăng tải."
+        # Note: We're not including DistilBERT results to keep Gemini analysis independent
         prompt = f"""
+Bạn là một chuyên gia phân tích tin tức chuyên nghiệp. Hãy phân tích chi tiết tin tức sau và đánh giá độ tin cậy của nó:
 "{news_text}"
 {search_summary}
+Hãy thực hiện phân tích toàn diện theo các tiêu chí sau:
+1. **Phân tích nội dung**: Kiểm tra tính logic, mâu thuẫn, ngôn ngữ cảm xúc thái quá
+2. **Phân tích nguồn tin**: Đánh giá uy tín và độ tin cậy của nguồn
+3. **Phân tích ngữ cảnh**: So sánh với thông tin có sẵn và kiến thức thực tế
+4. **Phân tích ngôn ngữ**: Tìm dấu hiệu của tin giả như từ ngữ gây sốc, cảm xúc
+5. **Phân tích thời gian**: Kiểm tra tính hợp lý về mặt thời gian
+Trả lời theo định dạng sau (chỉ bằng tiếng Việt, viết chi tiết và chuyên nghiệp):
+**1. KẾT LUẬN:** [THẬT/GIẢ/KHÔNG XÁC ĐỊNH]
+**2. PHÂN TÍCH CHI TIẾT:**
+- **Nội dung:** [Phân tích chi tiết về nội dung tin tức]
+- **Nguồn tin:** [Đánh giá về nguồn và độ tin cậy]
+- **Ngữ cảnh:** [So sánh với thông tin có sẵn]
+- **Ngôn ngữ:** [Phân tích cách sử dụng từ ngữ]
+- **Thời gian:** [Kiểm tra tính hợp lý về mặt thời gian]
+**3. CÁC DẤU HIỆU CẢNH BÁO:** [Liệt kê các dấu hiệu đáng ngờ nếu có]
+**4. KHUYẾN NGHỊ CHO NGƯỜI ĐỌC:**
+- [Hướng dẫn cụ thể để kiểm chứng thông tin]
+- [Các nguồn tin đáng tin cậy để tham khảo]
+- [Cách phân biệt tin thật và tin giả]
+Viết chi tiết, chuyên nghiệp và hữu ích cho người đọc.
 """
         print("Calling Gemini API...")
         if search_results:
             print(f"DEBUG - First search result title: {search_results[0].get('title', 'No title')}")
+        # Use settings optimized for detailed analysis
         generation_config = genai.types.GenerationConfig(
+            temperature=0.3,  # Slightly higher for more creative analysis
+            top_p=0.9,        # Allow more diverse vocabulary
+            top_k=40,         # More vocabulary choices for detailed writing
+            max_output_tokens=2000  # Allow much longer responses
         )
         response = model.generate_content(prompt, generation_config=generation_config)
         print("Gemini API response received successfully")
         # If we hit the API limit, provide a basic analysis
         if "429" in str(e) or "quota" in str(e).lower():
+            print("Gemini API quota exceeded, providing enhanced fallback analysis...")
+            # Enhanced analysis based on content patterns
+            fake_patterns = ['giả', 'sai', 'không đúng', 'bịa đặt', 'lừa đảo', 'fake news', 'tin đồn']
+            real_patterns = ['chính thức', 'xác nhận', 'chính xác', 'đúng', 'verified', 'chính phủ', 'bộ y tế']
+            news_lower = news_text.lower()
+            fake_score = sum(1 for pattern in fake_patterns if pattern in news_lower)
+            real_score = sum(1 for pattern in real_patterns if pattern in news_lower)
+            # Adjust prediction based on patterns
+            if fake_score > real_score and distilbert_prediction == 'FAKE':
+                confidence_boost = "Cao (có từ khóa nghi ngờ)"
+            elif real_score > fake_score and distilbert_prediction == 'REAL':
+                confidence_boost = "Cao (có từ khóa xác thực)"
+            else:
+                confidence_boost = "Trung bình"
+            # Create detailed fallback analysis
+            conclusion = 'THẬT' if distilbert_prediction == 'REAL' else 'GIẢ' if distilbert_prediction == 'FAKE' else 'KHÔNG XÁC ĐỊNH'
+            # Enhanced analysis based on content patterns
+            suspicious_patterns = []
+            if fake_score > 0:
+                suspicious_patterns.append(f"Tìm thấy {fake_score} từ khóa nghi ngờ")
+            if real_score > 0:
+                suspicious_patterns.append(f"Tìm thấy {real_score} từ khóa xác thực")
+            warning_signs = []
+            if 'cảnh báo' in news_lower or 'nguy hiểm' in news_lower:
+                warning_signs.append("Sử dụng từ ngữ gây sợ hãi")
+            if 'ngay lập tức' in news_lower or 'khẩn cấp' in news_lower:
+                warning_signs.append("Tạo cảm giác cấp bách không cần thiết")
+            if len(news_text) < 100:
+                warning_signs.append("Tin tức quá ngắn, thiếu thông tin chi tiết")
+            fallback_analysis = f"""**1. KẾT LUẬN:** {conclusion}
+**2. PHÂN TÍCH CHI TIẾT:**
+- **Nội dung:** {'Tin tức có vẻ hợp lý' if distilbert_prediction == 'REAL' else 'Tin tức có nhiều dấu hiệu đáng ngờ' if distilbert_prediction == 'FAKE' else 'Nội dung không rõ ràng'}
+- **Nguồn tin:** Google Search không khả dụng (hết quota) - không thể kiểm tra nguồn
+- **Ngữ cảnh:** Phân tích từ khóa: {confidence_boost}
+- **Ngôn ngữ:** {'Ngôn ngữ trung tính' if fake_score == real_score else 'Có dấu hiệu cảm xúc thái quá' if fake_score > real_score else 'Ngôn ngữ khách quan'}
+- **Thời gian:** Không thể xác minh do thiếu thông tin bổ sung
+**3. CÁC DẤU HIỆU CẢNH BÁO:**
+{chr(10).join([f"- {sign}" for sign in warning_signs]) if warning_signs else "- Không phát hiện dấu hiệu cảnh báo rõ ràng"}
+**4. KHUYẾN NGHỊ CHO NGƯỜI ĐỌC:**
+- **Kiểm tra nguồn:** Tìm kiếm thông tin tương tự trên các trang báo uy tín như VnExpress, Tuổi Trẻ, Thanh Niên
+- **Xác minh thời gian:** Kiểm tra xem tin tức có được đăng tải đồng thời trên nhiều nguồn không
+- **Đánh giá ngôn ngữ:** Tránh chia sẻ tin tức có ngôn ngữ cảm xúc thái quá hoặc tạo cảm giác cấp bách
+- **Lưu ý:** Do hệ thống API tạm thời không khả dụng, kết quả phân tích có thể không hoàn toàn chính xác"""
             return fallback_analysis
         # For other errors, see what models are available
 </div>
 ### 🧠 **Phân tích thông minh**
+<div style="background: #f8f9fa; padding: 20px; border-radius: 10px; border-left: 4px solid #ffc107; margin: 15px 0; font-family: 'Segoe UI', Arial, sans-serif; line-height: 1.6;">
+<div style="white-space: pre-line; color: #333;">
 {gemini_analysis}
 </div>
+</div>
 ### 📊 **KẾT LUẬN CUỐI CÙNG**
 <div style="background: linear-gradient(135deg, #f093fb 0%, #f5576c 100%); color: white; padding: 15px; border-radius: 8px; margin: 20px 0;">