Spaces:

NLong
/

FakeNews_Detector

Running

App Files Files Community

NLong committed on Sep 27

Commit

59dfe15

verified ·

1 Parent(s): 6c48551

Upload app.py

Browse files

Files changed (1) hide show

app.py +219 -52

app.py CHANGED Viewed

@@ -393,20 +393,24 @@ Trả lời theo định dạng sau (chỉ bằng tiếng Việt, viết chi ti
 **1. KẾT LUẬN:** [THẬT/GIẢ/KHÔNG XÁC ĐỊNH]
-**2. PHÂN TÍCH CHI TIẾT:**
 - **Nội dung:** [Phân tích chi tiết về nội dung tin tức]
 - **Nguồn tin:** [Đánh giá về nguồn và độ tin cậy]
 - **Ngữ cảnh:** [So sánh với thông tin có sẵn]
 - **Ngôn ngữ:** [Phân tích cách sử dụng từ ngữ]
 - **Thời gian:** [Kiểm tra tính hợp lý về mặt thời gian]
-**3. CÁC DẤU HIỆU CẢNH BÁO:** [Liệt kê các dấu hiệu đáng ngờ nếu có]
-**4. KHUYẾN NGHỊ CHO NGƯỜI ĐỌC:**
 - [Hướng dẫn cụ thể để kiểm chứng thông tin]
 - [Các nguồn tin đáng tin cậy để tham khảo]
 - [Cách phân biệt tin thật và tin giả]
 Viết chi tiết, chuyên nghiệp và hữu ích cho người đọc.
 """
@@ -471,6 +475,121 @@ Viết chi tiết, chuyên nghiệp và hữu ích cho người đọc.
             fallback_analysis = f"""**1. KẾT LUẬN:** {conclusion}
 **2. PHÂN TÍCH CHI TIẾT:**
 - **Nội dung:** {'Tin tức có vẻ hợp lý' if distilbert_prediction == 'REAL' else 'Tin tức có nhiều dấu hiệu đáng ngờ' if distilbert_prediction == 'FAKE' else 'Nội dung không rõ ràng'}
 - **Nguồn tin:** Google Search không khả dụng (hết quota) - không thể kiểm tra nguồn
@@ -500,67 +619,115 @@ Viết chi tiết, chuyên nghiệp và hữu ích cho người đọc.
         return f"Lỗi phân tích Gemini: {e}"
 def calculate_combined_confidence(distilbert_prediction, distilbert_confidence, source_credibility, popularity_score, gemini_analysis, source_support=0.5):
-    """Calculate combined confidence from all three tools"""
-    # Base confidence from DistilBERT
     if distilbert_prediction == "REAL":
-        base_confidence = distilbert_confidence
     else:
-        base_confidence = 1 - distilbert_confidence
-    # Adjust based on source credibility (stronger adjustments)
-    if source_credibility > 0.7:
-        credibility_adjustment = 0.2  # Increased from 0.1
-    elif source_credibility > 0.4:
-        credibility_adjustment = 0.05  # Small positive adjustment
-    else:
-        credibility_adjustment = -0.1
-    # Adjust based on popularity
-    if popularity_score > 0.7:
-        popularity_adjustment = 0.1  # Increased from 0.05
-    elif popularity_score > 0.4:
-        popularity_adjustment = 0.0
     else:
-        popularity_adjustment = -0.05
-    # Adjust based on source support (whether sources support or contradict the news)
-    if source_support > 0.7:
-        support_adjustment = 0.15  # Sources strongly support
-    elif source_support > 0.4:
-        support_adjustment = 0.0   # Sources are neutral
     else:
-        support_adjustment = -0.15  # Sources contradict
-    # Adjust based on Gemini analysis (stronger adjustments)
-    gemini_lower = gemini_analysis.lower()
-    if "độ tin cậy cao" in gemini_lower or "tin cậy cao" in gemini_lower or "cao" in gemini_lower:
-        gemini_adjustment = 0.2  # Increased from 0.1
-    elif "độ tin cậy thấp" in gemini_lower or "tin cậy thấp" in gemini_lower or "thấp" in gemini_lower:
-        gemini_adjustment = -0.2  # Increased from -0.1
     else:
-        gemini_adjustment = 0.0
-    # Special case: If DistilBERT confidence is very low but sources and Gemini agree it's real
-    if (distilbert_confidence < 0.6 and
-        source_credibility > 0.6 and
-        ("cao" in gemini_lower or "chính xác" in gemini_lower or "đáng tin cậy" in gemini_lower) and
-        not ("thấp" in gemini_lower or "giả" in gemini_lower or "fake" in gemini_lower)):
-        # Override with higher confidence ONLY if Gemini says it's real
-        base_confidence = 0.8
-        print("Overriding low DistilBERT confidence due to strong source and Gemini agreement for REAL")
-    # Special case: If DistilBERT and Gemini both say FAKE, respect that
-    elif (distilbert_prediction == "FAKE" and
-          ("thấp" in gemini_lower or "giả" in gemini_lower or "fake" in gemini_lower)):
-        # Override with low confidence for FAKE
-        base_confidence = 0.2
-        print("Overriding confidence due to DistilBERT and Gemini agreement for FAKE")
-    # Calculate final confidence
-    final_confidence = base_confidence + credibility_adjustment + popularity_adjustment + gemini_adjustment + support_adjustment
     final_confidence = max(0.05, min(0.95, final_confidence))
     return final_confidence
 def analyze_news(news_text):

 **1. KẾT LUẬN:** [THẬT/GIẢ/KHÔNG XÁC ĐỊNH]
+**2. ĐỘ TIN CẬY:** [X%/Y%] (Trong đó X% là độ tin cậy tin THẬT, Y% là độ tin cậy tin GIẢ, X+Y=100%)
+**3. PHÂN TÍCH CHI TIẾT:**
 - **Nội dung:** [Phân tích chi tiết về nội dung tin tức]
 - **Nguồn tin:** [Đánh giá về nguồn và độ tin cậy]
 - **Ngữ cảnh:** [So sánh với thông tin có sẵn]
 - **Ngôn ngữ:** [Phân tích cách sử dụng từ ngữ]
 - **Thời gian:** [Kiểm tra tính hợp lý về mặt thời gian]
+**4. CÁC DẤU HIỆU CẢNH BÁO:** [Liệt kê các dấu hiệu đáng ngờ nếu có]
+**5. KHUYẾN NGHỊ CHO NGƯỜI ĐỌC:**
 - [Hướng dẫn cụ thể để kiểm chứng thông tin]
 - [Các nguồn tin đáng tin cậy để tham khảo]
 - [Cách phân biệt tin thật và tin giả]
+QUAN TRỌNG: Trong phần "ĐỘ TIN CẬY", hãy cung cấp tỷ lệ phần trăm chính xác dựa trên phân tích của bạn. Ví dụ: "95%/5%" nghĩa là 95% tin tức này là THẬT, 5% là GIẢ.
 Viết chi tiết, chuyên nghiệp và hữu ích cho người đọc.
 """
             fallback_analysis = f"""**1. KẾT LUẬN:** {conclusion}
+**2. ĐỘ TIN CẬY:** {'5%/95%' if conclusion == 'GIẢ' else '95%/5%' if conclusion == 'THẬT' else '50%/50%'}
+**3. PHÂN TÍCH CHI TIẾT:**
+- **Nội dung:** {'Tin tức có vẻ hợp lý' if distilbert_prediction == 'REAL' else 'Tin tức có nhiều dấu hiệu đáng ngờ' if distilbert_prediction == 'FAKE' else 'Nội dung không rõ ràng'}
+- **Nguồn tin:** Google Search không khả dụng (hết quota) - không thể kiểm tra nguồn
+- **Ngữ cảnh:** Phân tích từ khóa: {confidence_boost}
+- **Ngôn ngữ:** {'Ngôn ngữ trung tính' if fake_score == real_score else 'Có dấu hiệu cảm xúc thái quá' if fake_score > real_score else 'Ngôn ngữ khách quan'}
+- **Thời gian:** Không thể xác minh do thiếu thông tin bổ sung
+**4. CÁC DẤU HIỆU CẢNH BÁO:**
+{chr(10).join([f"- {sign}" for sign in warning_signs]) if warning_signs else "- Không phát hiện dấu hiệu cảnh báo rõ ràng"}
+**5. KHUYẾN NGHỊ CHO NGƯỜI ĐỌC:**
+- **Kiểm tra nguồn:** Tìm kiếm thông tin tương tự trên các trang báo uy tín như VnExpress, Tuổi Trẻ, Thanh Niên
+- **Xác minh thời gian:** Kiểm tra xem tin tức có được đăng tải đồng thời trên nhiều nguồn không
+- **Đánh giá ngôn ngữ:** Tránh chia sẻ tin tức có ngôn ngữ cảm xúc thái quá hoặc tạo cảm giác cấp bách
+- **Lưu ý:** Do hệ thống API tạm thời không khả dụng, kết quả phân tích có thể không hoàn toàn chính xác"""
+            return fallback_analysis
+        # For other errors, see what models are available
+        try:
+            models = genai.list_models()
+            print("Available models:")
+            for model in models:
+                if 'gemini' in model.name.lower():
+                    print(f"  - {model.name}")
+        except Exception as list_error:
+            print(f"Could not list models: {list_error}")
+        return f"Lỗi phân tích Gemini: {e}"
def _normalize_percent_pair(real_value, fake_value):
    """Scale a (real, fake) pair so the shares sum to 1; (0.5, 0.5) if both are zero."""
    total = real_value + fake_value
    if total <= 0:
        return 0.5, 0.5
    return real_value / total, fake_value / total


def extract_gemini_percentage(gemini_analysis):
    """Extract the REAL/FAKE confidence split from a Gemini analysis text.

    Two formats are recognised, in this order:

    1. The "X%/Y%" pair that follows the "độ tin cậy" label on the same
       line (optional spaces around "/", optional decimal part).
    2. Two separately labelled percentages, one directly followed by
       "thật" and one directly followed by "giả". A percentage is only
       attached to a label when no other number or "%" sits between them,
       so "95% thật, 5% giả" is read as 95 / 5 rather than 95 / 95.

    Args:
        gemini_analysis: Free-text analysis returned by Gemini.

    Returns:
        A tuple ``(real_fraction, fake_fraction)`` whose members sum to 1
        (``(0.5, 0.5)`` when both percentages are zero), or
        ``(None, None)`` when no percentages are found or the input is not
        a string.
    """
    import re  # function-scope import keeps this block self-contained

    if not isinstance(gemini_analysis, str):
        print(f"Error extracting Gemini percentages: expected str, got {type(gemini_analysis).__name__}")
        return None, None

    gemini_lower = gemini_analysis.lower()
    number = r'(\d+(?:[.,]\d+)?)'

    def as_float(token):
        # Accept a decimal comma as well as a decimal point.
        return float(token.replace(',', '.'))

    # Primary format: "độ tin cậy ... X%/Y%" on one line.
    pair_match = re.search(rf'độ tin cậy.*?{number}\s*%\s*/\s*{number}\s*%', gemini_lower)
    if pair_match:
        real_percent, fake_percent = _normalize_percent_pair(
            as_float(pair_match.group(1)), as_float(pair_match.group(2))
        )
        print(f"Extracted Gemini percentages: {real_percent:.1%} real, {fake_percent:.1%} fake")
        return real_percent, fake_percent

    # Fallback: individually labelled percentages. The gap between the
    # number and its label may not contain another digit, "%" or a line
    # break, which prevents both labels from capturing the same number.
    real_match = re.search(rf'{number}\s*%[^%\d\n]*?thật', gemini_lower)
    fake_match = re.search(rf'{number}\s*%[^%\d\n]*?giả', gemini_lower)
    if real_match and fake_match:
        real_percent, fake_percent = _normalize_percent_pair(
            as_float(real_match.group(1)), as_float(fake_match.group(1))
        )
        print(f"Extracted Gemini percentages (fallback): {real_percent:.1%} real, {fake_percent:.1%} fake")
        return real_percent, fake_percent

    print("Could not extract Gemini percentages, using conclusion analysis")
    return None, None
+        # If we hit the API limit, provide a basic analysis
+        if "429" in str(e) or "quota" in str(e).lower():
+            print("Gemini API quota exceeded, providing enhanced fallback analysis...")
+            # Enhanced analysis based on content patterns
+            fake_patterns = ['giả', 'sai', 'không đúng', 'bịa đặt', 'lừa đảo', 'fake news', 'tin đồn']
+            real_patterns = ['chính thức', 'xác nhận', 'chính xác', 'đúng', 'verified', 'chính phủ', 'bộ y tế']
+            news_lower = news_text.lower()
+            fake_score = sum(1 for pattern in fake_patterns if pattern in news_lower)
+            real_score = sum(1 for pattern in real_patterns if pattern in news_lower)
+            # Adjust prediction based on patterns
+            if fake_score > real_score and distilbert_prediction == 'FAKE':
+                confidence_boost = "Cao (có từ khóa nghi ngờ)"
+            elif real_score > fake_score and distilbert_prediction == 'REAL':
+                confidence_boost = "Cao (có từ khóa xác thực)"
+            else:
+                confidence_boost = "Trung bình"
+            # Create detailed fallback analysis
+            conclusion = 'THẬT' if distilbert_prediction == 'REAL' else 'GIẢ' if distilbert_prediction == 'FAKE' else 'KHÔNG XÁC ĐỊNH'
+            # Enhanced analysis based on content patterns
+            suspicious_patterns = []
+            if fake_score > 0:
+                suspicious_patterns.append(f"Tìm thấy {fake_score} từ khóa nghi ngờ")
+            if real_score > 0:
+                suspicious_patterns.append(f"Tìm thấy {real_score} từ khóa xác thực")
+            warning_signs = []
+            if 'cảnh báo' in news_lower or 'nguy hiểm' in news_lower:
+                warning_signs.append("Sử dụng từ ngữ gây sợ hãi")
+            if 'ngay lập tức' in news_lower or 'khẩn cấp' in news_lower:
+                warning_signs.append("Tạo cảm giác cấp bách không cần thiết")
+            if len(news_text) < 100:
+                warning_signs.append("Tin tức quá ngắn, thiếu thông tin chi tiết")
+            fallback_analysis = f"""**1. KẾT LUẬN:** {conclusion}
 **2. PHÂN TÍCH CHI TIẾT:**
 - **Nội dung:** {'Tin tức có vẻ hợp lý' if distilbert_prediction == 'REAL' else 'Tin tức có nhiều dấu hiệu đáng ngờ' if distilbert_prediction == 'FAKE' else 'Nội dung không rõ ràng'}
 - **Nguồn tin:** Google Search không khả dụng (hết quota) - không thể kiểm tra nguồn
         return f"Lỗi phân tích Gemini: {e}"
 def calculate_combined_confidence(distilbert_prediction, distilbert_confidence, source_credibility, popularity_score, gemini_analysis, source_support=0.5):
+    """Calculate combined confidence using weighted approach:
+    - DistilBERT: 30% weight
+    - Gemini AI: 30% weight
+    - Google Search (source credibility + support): 20% weight
+    - Other factors: 20% weight
+    """
+    # 1. DISTILBERT SCORE (30% weight)
     if distilbert_prediction == "REAL":
+        distilbert_score = distilbert_confidence
     else:
+        distilbert_score = 1 - distilbert_confidence
+    print(f"DistilBERT Score: {distilbert_score:.3f} (30% weight)")
+    # 2. GEMINI AI SCORE (30% weight)
+    gemini_lower = gemini_analysis.lower()
+    # Try to extract percentage from Gemini analysis first
+    gemini_real_percent, gemini_fake_percent = extract_gemini_percentage(gemini_analysis)
+    if gemini_real_percent is not None and gemini_fake_percent is not None:
+        # Use the extracted percentage directly
+        gemini_score = gemini_real_percent
+        print(f"Gemini Score (from percentage): {gemini_score:.3f} (30% weight) - {gemini_real_percent:.1%} real, {gemini_fake_percent:.1%} fake")
     else:
+        # Fallback to conclusion analysis
+        conclusion_score = 0.5  # Default neutral
+        if "**kết luận:** giả" in gemini_lower or "kết luận:** fake" in gemini_lower or "kết luận:** giả" in gemini_lower:
+            conclusion_score = 0.1  # Very low for FAKE
+            print("Gemini Conclusion: FAKE")
+        elif "**kết luận:** thật" in gemini_lower or "kết luận:** real" in gemini_lower or "kết luận:** thật" in gemini_lower:
+            conclusion_score = 0.9  # Very high for REAL
+            print("Gemini Conclusion: REAL")
+        elif "giả" in gemini_lower and "kết luận" in gemini_lower:
+            # Check if "giả" appears near "kết luận"
+            conclusion_start = gemini_lower.find("kết luận")
+            if conclusion_start != -1:
+                conclusion_section = gemini_lower[conclusion_start:conclusion_start + 50]
+                if "giả" in conclusion_section:
+                    conclusion_score = 0.1
+                    print("Gemini Conclusion: FAKE (detected in conclusion section)")
+                elif "thật" in conclusion_section:
+                    conclusion_score = 0.9
+                    print("Gemini Conclusion: REAL (detected in conclusion section)")
+        # Additional analysis indicators
+        fake_indicators = ["giả", "fake", "vô lý", "phi thực tế", "absurd", "preposterous", "impossible",
+                          "không thể xảy ra", "không có căn cứ", "tin giả"]
+        real_indicators = ["thật", "real", "chính xác", "đúng", "xác nhận", "verified", "đáng tin cậy"]
+        fake_count = sum(1 for indicator in fake_indicators if indicator in gemini_lower)
+        real_count = sum(1 for indicator in real_indicators if indicator in gemini_lower)
+        # Adjust based on analysis indicators (but conclusion takes priority)
+        if fake_count > real_count:
+            analysis_adjustment = -0.2
+            print(f"Gemini Analysis: {fake_count} fake indicators vs {real_count} real indicators")
+        elif real_count > fake_count:
+            analysis_adjustment = 0.2
+            print(f"Gemini Analysis: {real_count} real indicators vs {fake_count} fake indicators")
+        else:
+            analysis_adjustment = 0.0
+        gemini_score = max(0.1, min(0.9, conclusion_score + analysis_adjustment))
+        print(f"Gemini Score (from conclusion): {gemini_score:.3f} (30% weight)")
+    # 3. GOOGLE SEARCH SCORE (20% weight - source credibility + support)
+    # Source credibility component (10%)
+    credibility_component = source_credibility * 0.5  # Convert to 0-0.5 scale
+    # Source support component (10%)
+    support_component = source_support * 0.5  # Convert to 0-0.5 scale
+    google_search_score = credibility_component + support_component + 0.5  # Add base 0.5 for neutral
+    # If Gemini strongly says FAKE, reduce Google Search score
+    if gemini_score < 0.3:  # Gemini says FAKE (low score)
+        google_search_score = min(google_search_score, 0.4)  # Cap at 0.4 when Gemini says fake
+        print(f"Google Search Score: {google_search_score:.3f} (20% weight) - Credibility: {source_credibility:.2f}, Support: {source_support:.2f} - CAPPED due to Gemini FAKE")
     else:
+        print(f"Google Search Score: {google_search_score:.3f} (20% weight) - Credibility: {source_credibility:.2f}, Support: {source_support:.2f}")
+    # 4. OTHER FACTORS (20% weight - popularity, etc.)
+    other_factors_score = popularity_score * 0.4 + 0.6  # Convert popularity to 0.6-1.0 scale
+    # If Gemini strongly says FAKE, reduce Other Factors score
+    if gemini_score < 0.3:  # Gemini says FAKE (low score)
+        other_factors_score = min(other_factors_score, 0.5)  # Cap at 0.5 when Gemini says fake
+        print(f"Other Factors Score: {other_factors_score:.3f} (20% weight) - Popularity: {popularity_score:.2f} - CAPPED due to Gemini FAKE")
     else:
+        print(f"Other Factors Score: {other_factors_score:.3f} (20% weight) - Popularity: {popularity_score:.2f}")
+    # 5. COMBINE WITH WEIGHTS
+    final_confidence = (
+        distilbert_score * 0.30 +      # DistilBERT: 30%
+        gemini_score * 0.30 +          # Gemini AI: 30%
+        google_search_score * 0.20 +   # Google Search: 20%
+        other_factors_score * 0.20     # Other factors: 20%
+    )
     final_confidence = max(0.05, min(0.95, final_confidence))
+    print(f"Final Weighted Confidence: {final_confidence:.3f}")
+    print(f"  - DistilBERT (30%): {distilbert_score:.3f} × 0.30 = {distilbert_score * 0.30:.3f}")
+    print(f"  - Gemini (30%): {gemini_score:.3f} × 0.30 = {gemini_score * 0.30:.3f}")
+    print(f"  - Google Search (20%): {google_search_score:.3f} × 0.20 = {google_search_score * 0.20:.3f}")
+    print(f"  - Other Factors (20%): {other_factors_score:.3f} × 0.20 = {other_factors_score * 0.20:.3f}")
     return final_confidence
 def analyze_news(news_text):