Upload 8 files
- .gitattributes +2 -0
- Dockerfile +22 -0
- IMG_PIPELINE.py +28 -0
- TEXT_PIPELINE.py +50 -0
- app.py +105 -0
- data.csv +3 -0
- evidence_index.faiss +3 -0
- pmo_func.py +313 -0
- requirements.txt +0 -0
.gitattributes
CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+data.csv filter=lfs diff=lfs merge=lfs -text
+evidence_index.faiss filter=lfs diff=lfs merge=lfs -text
Dockerfile
ADDED
@@ -0,0 +1,22 @@
+# Use an official Python runtime
+FROM python:3.11-slim
+
+# Set the working directory
+WORKDIR /app
+
+# Set environment variables
+ENV HF_HUB_DISABLE_SYMLINKS_WARNING=1
+ENV PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
+
+# Copy and install dependencies
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+
+# Copy the rest of your application code
+COPY . .
+
+# Expose the port the app runs on
+EXPOSE 7860
+
+# Run the FastAPI server when the container launches (the app module is app.py, and the port must match EXPOSE)
+CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
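
As a quick sanity check of the container wiring, here is a minimal smoke-test sketch, assuming the image was built and started locally with something like `docker run -p 7860:7860`; the host, port, and retry budget are all assumptions, not part of the commit:

# Hypothetical smoke test for a locally running container.
import time
import requests

for _ in range(30):
    try:
        if requests.get("http://localhost:7860/", timeout=5).status_code == 200:
            print("Server is up")
            break
    except requests.RequestException:
        time.sleep(10)  # model loading in the lifespan hook can take a while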
IMG_PIPELINE.py
ADDED
@@ -0,0 +1,28 @@
+from TEXT_PIPELINE import run_text_pipeline
+
+def run_img_pipeline(img_pth: str, state: dict):
+    """
+    Orchestrates the image analysis workflow using pre-loaded tools.
+    """
+    # Unpack the image-specific tools
+    manipulation_analyzer = state['manipulation_analyzer']
+    ocr_analyzer = state['ocr_analyzer']
+
+    # Run the analyses
+    manipulation_results = manipulation_analyzer.run_image_forensics(img_pth)
+    in_image_report = ocr_analyzer.get_in_image_anal(img_pth)
+    rev_img_search_res = ocr_analyzer.rev_img_search(img_pth)
+
+    text_analysis_report = {}
+
+    # If text is found, run the text pipeline, passing all the necessary state
+    if in_image_report.get("Extracted Text", "").strip():
+        text_analysis_report = run_text_pipeline(in_image_report["Extracted Text"], state)
+
+    return {
+        'image_manipulation_report': manipulation_results,
+        'in_image_content_report': in_image_report,
+        'reverse_image_search_report': rev_img_search_res,
+        'extracted_text_analysis_report': text_analysis_report
+    }
+
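
For reference, a minimal sketch of driving this pipeline directly; the image path is hypothetical, and OCR() assumes the Google Vision credentials file from pmo_func.py is present:

from pmo_func import img_manipulation, OCR
from IMG_PIPELINE import run_img_pipeline

state = {
    'manipulation_analyzer': img_manipulation(),
    'ocr_analyzer': OCR(),  # needs the Google Vision credentials file
    # plus the text-pipeline keys (retriever, reranker, classifier, summarizer,
    # fact_checker, df, evidence_corpus, faiss_index) if the image contains text
}
report = run_img_pipeline("sample.jpg", state)
print(report['image_manipulation_report'])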
TEXT_PIPELINE.py
ADDED
@@ -0,0 +1,50 @@
+import pandas as pd
+import faiss
+import os
+import json
+
+def run_text_pipeline(claim: str, state: dict):
+    """
+    Executes the text analysis pipeline using pre-loaded tools.
+    """
+    # Unpack all the necessary tools and data from the state dictionary
+    retriever = state['retriever']
+    reranker = state['reranker']
+    classifier = state['classifier']
+    summarizer = state['summarizer']
+    fact_checker = state['fact_checker']
+    df = state['df']
+    evidence_corpus = state['evidence_corpus']
+    faiss_index = state['faiss_index']
+
+    # --- RAG Pipeline ---
+    retrieved_docs, indices = retriever.retrieve_evidence(claim, faiss_index, evidence_corpus)
+    reranked_docs = reranker.rerank_evidence(claim, retrieved_docs)
+
+    if not reranked_docs:
+        # --- Fallback to Google Fact Check ---
+        print("No results from RAG, trying Google Fact Check...")
+        result = fact_checker.check_claim(claim)
+        return {
+            "final_verdict": result.get('verdict', 'NEUTRAL'),
+            "explanation": result.get('summary', 'Could not verify claim.'),
+            "source": {(result.get('source') or ['Unknown'])[0]: (result.get('URLs') or ['#'])[0]} if result else {}
+        }
+
+    final_verdict, _ = classifier(claim, reranked_docs)
+    top_evidence_for_summary = reranked_docs[:3]
+    _, explanation = summarizer(claim, top_evidence_for_summary, final_verdict)
+
+    # Get sources from the original dataframe (FAISS indices are positions in evidence_corpus, so this assumes df['text'] had no NaN rows dropped)
+    sources_dict = {}
+    if len(indices) > 0 and 'source' in df.columns and 'url' in df.columns:
+        df_rel = df.iloc[indices]
+        # Handle potential duplicate sources by taking the first URL for each source
+        sources_dict = df_rel.groupby('source')['url'].first().to_dict()
+
+    return {
+        "final_verdict": final_verdict,
+        "explanation": explanation,
+        "source": sources_dict
+    }
+
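
A minimal sketch of assembling the state dict that run_text_pipeline expects, mirroring app.py's lifespan hook; the data.csv path and its text/source/url columns are assumptions about the LFS-tracked file:

import pandas as pd
from pmo_func import retriever, reranker, Classifier, summarizer, FactChecker
from TEXT_PIPELINE import run_text_pipeline

df = pd.read_csv('data.csv', low_memory=False)
state = {
    'retriever': retriever(),
    'reranker': reranker(),
    'classifier': Classifier(),
    'summarizer': summarizer(),
    'fact_checker': FactChecker(),
    'df': df,
    'evidence_corpus': df['text'].dropna().tolist(),
    'faiss_index': None,
}
# Building the index from scratch is slow for a large corpus; app.py prefers
# loading the prebuilt evidence_index.faiss when it exists.
state['faiss_index'] = state['retriever'].build_faiss_idx(state['evidence_corpus'])

report = run_text_pipeline("Example claim to verify.", state)
print(report["final_verdict"], report["explanation"])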
app.py
ADDED
@@ -0,0 +1,105 @@
+from fastapi import FastAPI, UploadFile, File, Form, HTTPException, Request
+from fastapi.responses import JSONResponse, FileResponse
+from fastapi.staticfiles import StaticFiles
+from fastapi.middleware.cors import CORSMiddleware
+from contextlib import asynccontextmanager
+import uvicorn
+import os
+import shutil
+import pandas as pd
+import faiss
+# Import your classes and pipeline functions
+from pmo_func import retriever, reranker, Classifier, summarizer, img_manipulation, OCR, FactChecker
+from TEXT_PIPELINE import run_text_pipeline
+from IMG_PIPELINE import run_img_pipeline
+
+# This dictionary will hold all our initialized models and data
+app_state = {}
+
+@asynccontextmanager
+async def lifespan(app: FastAPI):
+    """Loads all models and data once when the server starts up."""
+    print("--- 🚀 Server starting up... Loading all models... 🚀 ---")
+    app_state['retriever'] = retriever()
+    app_state['reranker'] = reranker()
+    app_state['classifier'] = Classifier()
+    app_state['summarizer'] = summarizer()
+    app_state['manipulation_analyzer'] = img_manipulation()
+    app_state['ocr_analyzer'] = OCR()
+    app_state['fact_checker'] = FactChecker()
+
+    try:
+        df = pd.read_csv('data.csv', low_memory=False)
+        app_state['evidence_corpus'] = df['text'].dropna().tolist()
+        app_state['df'] = df
+    except Exception as e:
+        print(f"CRITICAL ERROR: Could not load data.csv: {e}")
+        app_state['evidence_corpus'] = []
+        app_state['df'] = pd.DataFrame()
+
+    index_file = "evidence_index.faiss"
+    if os.path.exists(index_file):
+        app_state['faiss_index'] = faiss.read_index(index_file)
+    elif app_state['evidence_corpus']:
+        print("Building FAISS index for the first time...")
+        app_state['faiss_index'] = app_state['retriever'].build_faiss_idx(app_state['evidence_corpus'])
+    else:
+        app_state['faiss_index'] = None
+
+    print("--- ✅ All models and data loaded successfully! ✅ ---")
+    yield
+    print("--- Shutting down ---")
+
+app = FastAPI(lifespan=lifespan)
+
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],  # Allows all origins (fine for a hackathon)
+    allow_credentials=True,
+    allow_methods=["*"],  # Allows all methods
+    allow_headers=["*"],  # Allows all headers
+)
+
+# Mounts the 'frontend_by_gemini' folder at the '/static' URL path
+app.mount("/static", StaticFiles(directory="frontend_by_gemini"), name="static")
+# Mounts the root directory to serve files like 'ela_result.png'
+app.mount("/results", StaticFiles(directory="."), name="results")
+
+@app.get("/")
+async def read_index():
+    return FileResponse('frontend_by_gemini/index.html')
+
+@app.post("/analyze")
+async def analyze_content(
+    text_input: str = Form(None),
+    image_file: UploadFile = File(None)
+):
+    # This logic prioritizes the image if both are sent
+    if image_file and image_file.filename:
+        try:
+            temp_dir = "temp_uploads"
+            os.makedirs(temp_dir, exist_ok=True)
+            temp_path = os.path.join(temp_dir, image_file.filename)
+            with open(temp_path, "wb") as buffer:
+                shutil.copyfileobj(image_file.file, buffer)
+
+            report = run_img_pipeline(temp_path, app_state)
+            shutil.rmtree(temp_dir)
+            return JSONResponse(content=report)
+        except Exception as e:
+            print(f"Error in image pipeline: {e}")
+            raise HTTPException(status_code=500, detail="Error processing image.")
+
+    elif text_input:
+        try:
+            report = run_text_pipeline(text_input, app_state)
+            return JSONResponse(content=report)
+        except Exception as e:
+            print(f"Error in text pipeline: {e}")
+            raise HTTPException(status_code=500, detail="Error processing text.")
+
+    else:
+        raise HTTPException(status_code=400, detail="No valid input provided.")
+
+if __name__ == "__main__":
+    uvicorn.run("app:app", host="0.0.0.0", port=int(os.environ.get("PORT", 7860)), reload=True)
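
A hedged client sketch for the /analyze endpoint; the base URL and file name are assumptions. Text goes in a form field, images in a multipart upload, and the image takes priority when both are sent:

import requests

BASE = "http://localhost:7860"  # assumption: server running locally

# Text claim, sent as a form field
r = requests.post(f"{BASE}/analyze", data={"text_input": "Example claim."})
print(r.json())

# Image, sent as a multipart upload
with open("sample.jpg", "rb") as f:
    r = requests.post(f"{BASE}/analyze", files={"image_file": ("sample.jpg", f, "image/jpeg")})
print(r.json())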
data.csv
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:54f359a9ec2dd519c4ac6f24ff1002f6b6aab1ed3a227422fa4e5ef63c93afc0
+size 401398654
evidence_index.faiss
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9e2b94c8c2f9c2411f93b6b3edbdb5b400355dd4176f65bb1c93bbeb63e4f9e6
+size 542823981
pmo_func.py
ADDED
@@ -0,0 +1,313 @@
+import numpy as np
+import faiss
+from sentence_transformers import SentenceTransformer
+from sentence_transformers.cross_encoder import CrossEncoder
+from transformers import pipeline
+from PIL import Image, ImageChops, ImageEnhance
+import torch
+from google.cloud import vision
+import os
+import io
+from transformers import AutoModelForSequenceClassification, AutoTokenizer
+from transformers import T5Tokenizer, T5ForConditionalGeneration
+from dotenv import load_dotenv
+import requests
+from bs4 import BeautifulSoup
+import trafilatura as tra
+
+DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
+
+class retriever:
+    def __init__(self):
+        self.retriever_model = SentenceTransformer('all-MiniLM-L6-v2')
+
+    def build_faiss_idx(self, evidence_corpus):
+        embeddings = self.retriever_model.encode(evidence_corpus)
+        index = faiss.IndexFlatIP(embeddings.shape[1])
+        index.add(np.array(embeddings, dtype=np.float32))
+        faiss.write_index(index, "evidence_index.faiss")
+        return index
+
+    def retrieve_evidence(self, claim, index, evidence_corpus, top_k=10):
+        claim_embedding = self.retriever_model.encode([claim])
+        distances, indices = index.search(np.array(claim_embedding, dtype=np.float32), top_k)
+        retrieved_docs = [evidence_corpus[i] for i in indices[0]]
+        return retrieved_docs, indices[0]
+
+class reranker:
+    def __init__(self):
+        self.reranker_model = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2', device=DEVICE)
+
+    def rerank_evidence(self, claim, evidence_list):
+        sentence_pairs = [[claim, evidence] for evidence in evidence_list]
+        scores = self.reranker_model.predict(sentence_pairs)
+        scored_evidence = sorted(zip(scores, evidence_list), reverse=True)
+        return scored_evidence
+
+class Classifier:
+    def __init__(self):
+        self.model_name = "MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli"
+        self.label_names = ["entailment", "neutral", "contradiction"]
+        self.device = torch.device(DEVICE)
+        print(f"Classifier device: {self.device}")
+        self.model = AutoModelForSequenceClassification.from_pretrained(self.model_name).to(self.device)
+        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
+        self.model.eval()
+
+    def classify(self, claim, top_evidence):
+        verdicts = []
+        evidences = [e[1] for e in top_evidence]
+        if not evidences:
+            return "NEUTRAL", []
+
+        inputs = self.tokenizer(evidences, [claim] * len(evidences), return_tensors="pt", padding=True, truncation=True, max_length=512)
+        with torch.no_grad():
+            inputs = {k: v.to(self.device) for k, v in inputs.items()}
+            outputs = self.model(**inputs)
+
+        probs = torch.softmax(outputs.logits, dim=-1)
+        for i, evidence in enumerate(evidences):
+            pred = torch.argmax(probs[i]).item()
+            verdicts.append({
+                "evidence": evidence,
+                "verdict": self.label_names[pred],
+                "scores": {name: float(probs[i][j]) for j, name in enumerate(self.label_names)}
+            })
+
+        top_verdict_info = verdicts[0]
+        if top_verdict_info["verdict"] == "entailment" and top_verdict_info["scores"]["entailment"] > 0.8:
+            result = "TRUE"
+        elif top_verdict_info["verdict"] == "contradiction" and top_verdict_info["scores"]["contradiction"] > 0.8:
+            result = "FALSE"
+        else:
+            for v in verdicts[1:]:
+                if v["verdict"] == "contradiction" and v["scores"]["contradiction"] > 0.9:
+                    result = "FALSE"
+                    break
+            else:
+                result = "NEUTRAL"
+        return result, verdicts
+
+    def __call__(self, claim, evidences):
+        return self.classify(claim, evidences)
+
+class summarizer:
+    def __init__(self):
+        self.model_name = "google/flan-t5-base"  # Using a smaller model for server efficiency
+        self.model = T5ForConditionalGeneration.from_pretrained(self.model_name)
+        self.tokenizer = T5Tokenizer.from_pretrained(self.model_name)
+        self.device = torch.device(DEVICE)
+        self.model.to(self.device)
+        self.model.eval()
+        print(f"Summarizer device: {self.device}")
+
+    def forward(self, claim, top_evidence, verdict, max_input_len=1024, max_output_len=150):
+        evidence_texts = [e[1] for e in top_evidence]
+        if not evidence_texts:
+            return verdict, "No evidence was provided to generate a summary."
+
+        # Join outside the f-string braces: backslashes inside f-string expressions are a SyntaxError before Python 3.12
+        input_text = f'Claim: "{claim}"\nVerdict: {verdict}\nEvidence:\n' + "\n---\n".join(evidence_texts) + f"\n\nWrite a short, neutral explanation for why the verdict is {verdict}, based only on the evidence provided."
+        inputs = self.tokenizer(input_text, return_tensors="pt", truncation=True, max_length=max_input_len).to(self.device)
+
+        with torch.no_grad():
+            summary_ids = self.model.generate(inputs["input_ids"], max_length=max_output_len, num_beams=4, early_stopping=True)
+
+        summary = self.tokenizer.decode(summary_ids[0], skip_special_tokens=True)
+        return verdict, summary
+
+    def __call__(self, claim, top_evidence, verdict):
+        return self.forward(claim, top_evidence, verdict)
+
+class FactChecker:
+    def __init__(self):
+        self.factcheck_api = "https://factchecktools.googleapis.com/v1alpha1/claims:search"
+        self.google_search = "https://www.google.com/search"
+        load_dotenv()
+        self.factcheck_api_key = os.getenv("GOOGLE_FACT_CHECK_API_KEY")
+        # Lazy load heavy models
+        self.reranker = None
+        self.classifier = None
+        self.summarizer = None
+
+    def check_google_factcheck(self, claim: str, pages: int = 5):
+        if not self.factcheck_api_key:
+            print("Google FactCheck API key not found in .env file.")
+            return None
+
+        params = {'key': self.factcheck_api_key, 'query': claim, 'languageCode': 'en-US', 'pageSize': pages}
+        try:
+            response = requests.get(self.factcheck_api, params=params, timeout=10)
+            response.raise_for_status()
+            data = response.json()
+            if 'claims' in data and data['claims']:
+                claim_data = data['claims'][0]
+                review = claim_data.get('claimReview', [{}])[0]
+                return {
+                    'claim': claim_data.get('text', claim),
+                    'verdict': review.get('textualRating', 'Unknown'),
+                    'summary': f"Rated by {review.get('publisher', {}).get('name', 'Unknown')}",
+                    'source': [review.get('publisher', {}).get('name', 'Unknown')],
+                    'method': 'google_factcheck',
+                    'URLs': [review.get('url', '')]
+                }
+        except Exception as e:
+            print(f"FactCheck API error: {e}")
+        return None
+
+    def google_news_search(self, query: str, num_pages: int = 1):
+        print("Searching the Web...")
+        headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"}
+        articles_gg = []
+        for page in range(num_pages):
+            params = {"q": query, "tbm": "nws", 'start': page * 10}
+            try:
+                res = requests.get(self.google_search, params=params, headers=headers, timeout=15)
+                soup = BeautifulSoup(res.text, 'html.parser')
+                # Note: This selector is fragile and may break if Google changes its HTML.
+                for article_link in soup.select("a.WlydOe"):
+                    title_div = article_link.find('div', class_="n0jPhd")
+                    source_div = article_link.find('div', class_="MgUUmf")
+
+                    if not (title_div and source_div): continue
+
+                    title = title_div.text
+                    a_url = article_link['href']
+                    source = source_div.text
+
+                    content = tra.extract(tra.fetch_url(a_url)) if a_url else "No content extracted"
+                    articles_gg.append({'title': title, 'url': a_url, 'text': content or "", 'source': source})
+            except Exception as e:
+                print(f"Error during web search: {e}")
+
+        top_evidences = [d.get('text', '') for d in articles_gg]
+        urls = [d.get('url', '') for d in articles_gg]
+        return top_evidences, urls, articles_gg
+
+    def search_and_analyze_claim(self, claim: str):
+        print("Performing web analysis...")
+
+        if self.reranker is None:
+            print("Loading AI models for web analysis...")
+            self.reranker = reranker()
+            self.classifier = Classifier()
+            self.summarizer = summarizer()
+
+        top_evidences, urls, article_list = self.google_news_search(claim)
+
+        if not top_evidences:
+            return {'claim': claim, 'verdict': 'Unverifiable', 'summary': 'No relevant sources found.', 'source': [], 'method': 'web_search', 'URLs': []}
+
+        reranked_articles = self.reranker.rerank_evidence(claim, top_evidences)
+        if not reranked_articles:
+            return {'claim': claim, 'verdict': 'Unverifiable', 'summary': 'No relevant sources found after reranking.', 'source': [], 'method': 'web_search', 'URLs': []}
+
+        verdict, _ = self.classifier(claim, reranked_articles)
+        _, summary = self.summarizer(claim, reranked_articles[:3], verdict)
+
+        return {
+            'claim': claim,
+            'verdict': verdict,
+            'summary': summary,
+            'source': [arc.get('source', '') for arc in article_list],
+            'method': 'web_analysis',
+            'URLs': urls
+        }
+
+    def check_claim(self, claim: str):
+        """Main function to check a claim using the fallback pipeline."""
+        print(f"\n--- Checking claim: '{claim}' ---")
+        factcheck_result = self.check_google_factcheck(claim)
+        if factcheck_result:
+            print("Found result in FactCheck database.")
+            return factcheck_result
+
+        print("No FactCheck result, falling back to live web analysis...")
+        return self.search_and_analyze_claim(claim)
+
+class img_manipulation:
+    def __init__(self):
+        self.GEN_AI_IMAGE = pipeline("image-classification", model="umm-maybe/AI-image-detector", device=DEVICE)
+
+    def Gen_AI_IMG(self, img_pth):
+        try:
+            with Image.open(img_pth) as img:
+                img = img.convert('RGB')
+                result = self.GEN_AI_IMAGE(img)
+            proba = next((item['score'] for item in result if item['label'] == 'artificial'), 0.0)
+            return proba * 100
+        except Exception as e:
+            print(f'AI image detection error: {e}')
+            return 0.0
+
+    def generated_image(self, img_pth, quality=90, scale=15):
+        try:
+            with Image.open(img_pth) as orig_img:
+                orig_img = orig_img.convert('RGB')
+                temp_path = 'temp_resaved.jpg'
+                orig_img.save(temp_path, 'JPEG', quality=quality)
+                with Image.open(temp_path) as resaved_img:
+                    ela_image = ImageChops.difference(orig_img, resaved_img)
+            os.remove(temp_path)
+            ela_data = np.array(ela_image)
+            mean_intensity = ela_data.mean()
+            scaled_score = min(100, (mean_intensity / 25.0) * 100)
+
+            # Save the ELA image and return its path for serving
+            ela_path = "ela_result.png"
+            enhancer = ImageEnhance.Brightness(ela_image)
+            max_diff = max(1, max([ex[1] for ex in ela_image.getextrema()]))
+            ela_image_enhanced = enhancer.enhance(scale / max_diff)
+            ela_image_enhanced.save(ela_path)
+            return scaled_score, ela_path
+        except Exception as e:
+            print(f'ELA generation error: {e}')
+            return 0.0, None
+
+    def run_image_forensics(self, image_path):
+        ai_score = self.Gen_AI_IMG(image_path)
+        classic_score, ela_path = self.generated_image(image_path)
+        return {
+            "ai_generated_score_percent": ai_score,
+            "classic_edit_score_percent": classic_score,
+            "ela_image_path": ela_path
+        }
+
+class OCR:
+    def __init__(self, key_path='GOOGLE_VISION_API.json'):
+        os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = key_path
+        self.client = vision.ImageAnnotatorClient()
+
+    def _get_full_vision_analysis(self, img_pth):
+        try:
+            with open(img_pth, 'rb') as image_file:
+                content = image_file.read()
+            image = vision.Image(content=content)
+            features = [{'type_': vision.Feature.Type.DOCUMENT_TEXT_DETECTION}, {'type_': vision.Feature.Type.SAFE_SEARCH_DETECTION}, {'type_': vision.Feature.Type.LANDMARK_DETECTION}, {'type_': vision.Feature.Type.LOGO_DETECTION}, {'type_': vision.Feature.Type.WEB_DETECTION}]
+            response = self.client.annotate_image({'image': image, 'features': features})
+            return response, None
+        except Exception as e:
+            return None, str(e)
+
+    def get_in_image_anal(self, img_pth):
+        response, error = self._get_full_vision_analysis(img_pth)
+        if error: return {'error': error}
+        report = {}
+        if response.full_text_annotation: report['Extracted Text'] = response.full_text_annotation.text
+        if response.safe_search_annotation:
+            safe = response.safe_search_annotation
+            report['Safe Search'] = {'adult': vision.Likelihood(safe.adult).name, 'violence': vision.Likelihood(safe.violence).name}
+        entities = []
+        if response.landmark_annotations: entities.extend([f'Landmark: {l.description}' for l in response.landmark_annotations])
+        if response.logo_annotations: entities.extend([f'Logo: {l.description}' for l in response.logo_annotations])
+        if entities: report['Identified Entities'] = entities
+        return report
+
+    def rev_img_search(self, img_pth):
+        response, error = self._get_full_vision_analysis(img_pth)
+        if error: return {'error': error}
+        report = {}
+        if response.web_detection and response.web_detection.pages_with_matching_images:
+            matches = [{'title': p.page_title, 'url': p.url} for p in response.web_detection.pages_with_matching_images[:5]]
+            report['Reverse Image Matches'] = matches
+        return report
+
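
To exercise the retrieval layer in isolation, here is a toy retrieve-then-rerank sketch over a made-up three-sentence corpus; note that build_faiss_idx also writes evidence_index.faiss to the working directory as a side effect:

from pmo_func import retriever, reranker

corpus = [
    "The Eiffel Tower is in Paris.",
    "Bananas are a good source of potassium.",
    "Paris is the capital of France.",
]
r = retriever()
index = r.build_faiss_idx(corpus)
# top_k must not exceed the corpus size for a meaningful result
docs, idxs = r.retrieve_evidence("Where is the Eiffel Tower?", index, corpus, top_k=2)

rr = reranker()
scored = rr.rerank_evidence("Where is the Eiffel Tower?", docs)
print(scored[0])  # (score, best-matching sentence)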
requirements.txt
ADDED
Binary file (8.01 kB)