import os
import torch
import math
import gradio as gr
from PIL import Image
from transformers import (
    GPT2LMHeadModel, GPT2Tokenizer,
    AutoTokenizer, AutoModelForSequenceClassification,
    AutoImageProcessor, AutoModelForImageClassification,
    logging
)
from openai import OpenAI
from groq import Groq
import cv2
import numpy as np
import torch.nn as nn
import librosa

logging.set_verbosity_error()

# -----------------------------
# API Keys (set via Space secrets)
# -----------------------------
HF_TOKEN = os.getenv("HF_TOKEN")
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
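# A single Groq client generates the natural-language explanations for every modality below.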
client = Groq(api_key=GROQ_API_KEY)

device = "cuda" if torch.cuda.is_available() else "cpu"

# -----------------------------
# TEXT DETECTION
# -----------------------------
def run_hf_detector(text, model_id="roberta-base-openai-detector"):
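    # from_pretrained caches weights on disk, but reloading the tokenizer and model on every call
    # still adds latency; hoist these loads to module scope if throughput matters.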
    tokenizer = AutoTokenizer.from_pretrained(model_id, token=HF_TOKEN)
    model = AutoModelForSequenceClassification.from_pretrained(model_id, token=HF_TOKEN).to(device)
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512).to(device)
    with torch.no_grad():
        outputs = model(**inputs)
        probs = torch.nn.functional.softmax(outputs.logits, dim=-1).cpu().numpy()[0]
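    # Assumption: index 0 is the human/"Real" class and index 1 the AI/"Fake" class;
    # verify against model.config.id2label if the detector checkpoint is changed.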
    human_score, ai_score = float(probs[0]), float(probs[1])
    label = "AI-generated" if ai_score > human_score else "Human-generated"
    return {"ai_score": ai_score, "human_score": human_score, "hf_label": label}

def calculate_perplexity(text):
    model = GPT2LMHeadModel.from_pretrained("gpt2").to(device)
    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    max_length = model.config.n_positions
    # Truncate at tokenization time; mutating a BatchEncoding's attributes afterwards
    # does not change the tensors that **encodings actually passes to the model.
    encodings = tokenizer(text, return_tensors="pt", truncation=True, max_length=max_length).to(device)
    with torch.no_grad():
        outputs = model(**encodings, labels=encodings.input_ids)
    perplexity = math.exp(outputs.loss.item())
    # Heuristic: text that GPT-2 finds very predictable (perplexity < 60) is flagged as AI-generated.
    label = "AI-generated" if perplexity < 60 else "Human-generated"
    return {"perplexity": perplexity, "perplexity_label": label}

def generate_text_explanation(text, ai_score, human_score):
    decision = "AI-generated" if ai_score > human_score else "Human-generated"
    prompt = f"""
    You are an AI text analysis expert. Explain concisely why this text was classified as '{decision}'.
    Text: "{text}"
    Explanation:"""
    response = client.chat.completions.create(
        model="gemma2-9b-it",
        messages=[{"role":"user","content":prompt}],
        max_tokens=150,
        temperature=0.7
    )
    return response.choices[0].message.content.strip()

def analyze_text(text):
    try:
        hf_out = run_hf_detector(text)
        diff = abs(hf_out["ai_score"] - hf_out["human_score"])
        confidence = "High" if diff > 0.8 else "Medium" if diff >= 0.3 else "Low"
        perp_out = calculate_perplexity(text)  # secondary perplexity signal (not yet surfaced in the UI)
        explanation = generate_text_explanation(text, hf_out["ai_score"], hf_out["human_score"])
        return {"ai_score": hf_out["ai_score"], "confidence": confidence, "explanation": explanation}
    except Exception:
        return {"ai_score": 0.0, "confidence": "Low", "explanation": "Error analyzing text."}

# -----------------------------
# IMAGE DETECTION
# -----------------------------
image_model_name = "Ateeqq/ai-vs-human-image-detector"
image_processor = AutoImageProcessor.from_pretrained(image_model_name)
image_model = AutoModelForImageClassification.from_pretrained(image_model_name)
image_model.eval()
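# The image detector is loaded once at import time and shared by the image and video pipelines.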

def generate_image_explanation(ai_probability,human_probability,confidence):
    prompt = f"""
    You are an AI image analysis expert.
    AI: {ai_probability:.4f}, Human: {human_probability:.4f}, Confidence: {confidence}
    Explain in 1-2 sentences why it was classified as {'AI-generated' if ai_probability>human_probability else 'Human-generated'}.
    """
    response = client.chat.completions.create(
        model="llama-3.3-70b-versatile",
        messages=[{"role":"user","content":prompt}],
        temperature=0.6
    )
    return response.choices[0].message.content.strip()

def analyze_image(image):
    image = image.convert("RGB")
    inputs = image_processor(images=image, return_tensors="pt")
    with torch.no_grad():
        logits = image_model(**inputs).logits
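    # Divide the logits by a temperature of 6.0 to soften the softmax and damp overconfident scores.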
    probabilities = torch.nn.functional.softmax(logits/6.0, dim=-1)[0]
    ai_prob, human_prob = probabilities[0].item(), probabilities[1].item()
    diff = abs(ai_prob-human_prob)
    confidence = "High" if diff>=0.7 else "Medium" if diff>=0.3 else "Low"
    explanation = generate_image_explanation(ai_prob, human_prob, confidence)
    return {"ai_probability": ai_prob, "confidence": confidence, "explanation": explanation}

# -----------------------------
# VIDEO DETECTION
# -----------------------------
def extract_frames(video_path, frame_rate=1):
    cap = cv2.VideoCapture(video_path)
    fps = cap.get(cv2.CAP_PROP_FPS)
    # Guard against containers that report 0 FPS so the modulo below never divides by zero.
    interval = max(int(fps * frame_rate), 1)
    frames, count = [], 0
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        if count % interval == 0:
            frames.append(Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)))
        count += 1
    cap.release()
    return frames

def analyze_video(video_path):
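    # Samples about one frame per second (frame_rate=1) and averages the image detector's
    # AI/human probabilities over the sampled frames.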
    frames = extract_frames(video_path, frame_rate=1)
    if not frames: return {"error":"No frames extracted."}
    ai_probs,human_probs = [],[]
    for img in frames:
        inputs = image_processor(images=img, return_tensors="pt")
        with torch.no_grad(): logits = image_model(**inputs).logits
        probs = torch.nn.functional.softmax(logits, dim=-1)[0]
        ai_probs.append(probs[0].item())
        human_probs.append(probs[1].item())
    avg_ai,avg_human = float(np.mean(ai_probs)), float(np.mean(human_probs))
    diff = abs(avg_ai-avg_human)
    confidence = "High" if diff>=0.7 else "Medium" if diff>=0.3 else "Low"
    prompt = f"Video processed {len(frames)} frames. AI: {avg_ai:.4f}, Human: {avg_human:.4f}. Confidence: {confidence}. Explain why it was {'AI-generated' if avg_ai>avg_human else 'Human-generated'}."
    response = client.chat.completions.create(model="llama-3.3-70b-versatile", messages=[{"role":"user","content":prompt}], temperature=0.6)
    explanation = response.choices[0].message.content.strip()
    return {"ai_probability":avg_ai,"confidence":confidence,"explanation":explanation}

# -----------------------------
# AUDIO DETECTION
# -----------------------------
class AudioCNNRNN(nn.Module):
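    """Per-slice CNN features (pooled to 64 channels) fed to an LSTM over the slice sequence,
    with a linear head on the final hidden state."""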
    def __init__(self, lstm_hidden_size=128, num_classes=2):
        super(AudioCNNRNN, self).__init__()
        self.cnn = nn.Sequential(
            nn.Conv2d(1, 32, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),
        )
        self.lstm = nn.LSTM(input_size=64, hidden_size=lstm_hidden_size, batch_first=True)
        self.fc = nn.Linear(lstm_hidden_size, num_classes)

    def forward(self, x):
        batch_size, seq_len, c, h, w = x.size()
        c_in = x.view(batch_size * seq_len, c, h, w)
        features = self.cnn(c_in)
        features = features.mean(dim=[2, 3])
        features = features.view(batch_size, seq_len, -1)
        lstm_out, _ = self.lstm(features)
        out = self.fc(lstm_out[:, -1, :])
        return out

def extract_mel_spectrogram(audio_path, sr=16000, n_mels=64):
    waveform, sample_rate = librosa.load(audio_path, sr=sr)
    mel_spec = librosa.feature.melspectrogram(y=waveform, sr=sr, n_mels=n_mels)
    mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)
    return mel_spec_db

def slice_spectrogram(mel_spec, slice_size=128, step=64):
    # Slide a fixed window over the time axis; "+ 1" keeps a clip that is exactly slice_size frames long.
    slices = []
    for start in range(0, mel_spec.shape[1] - slice_size + 1, step):
        slices.append(mel_spec[:, start:start + slice_size])
    return slices

def analyze_audio(audio_path):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model = AudioCNNRNN()
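    # NOTE: no trained checkpoint is loaded here, so the audio classifier runs with randomly
    # initialized weights; load a state_dict at this point before relying on its probabilities.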
    model.eval()
    model.to(device)

    mel_spec = extract_mel_spectrogram(audio_path)
    mel_slices = slice_spectrogram(mel_spec, slice_size=128, step=64)

    if len(mel_slices) == 0:
        raise RuntimeError("No mel slices generated. Check audio length.")

    tensor_slices = [torch.tensor(s).unsqueeze(0) for s in mel_slices]
    data = torch.stack(tensor_slices)
    data = data.unsqueeze(0)
    data = data.to(device)

    with torch.no_grad():
        outputs = model(data)
        logits = outputs

    temperature = 3.0
    probabilities = torch.nn.functional.softmax(logits / temperature, dim=-1)

    ai_probability = probabilities[0][0].item()
    human_probability = probabilities[0][1].item()

    diff = abs(ai_probability - human_probability)
    if diff >= 0.7:
        confidence = "High"
    elif diff >= 0.3:
        confidence = "Medium"
    else:
        confidence = "Low"

    prompt = f"""
    You are an AI audio analysis expert.
    The detector outputs:
    - AI-generated probability: {ai_probability:.4f}
    - Human-generated probability: {human_probability:.4f}
    - Confidence level: {confidence}

    Give a short, human-readable explanation (1-2 sentences) of why the audio was likely classified as {'AI-generated' if ai_probability > human_probability else 'human-generated'}.
    Base it on audio cues such as tone, pitch patterns, unnatural pauses, synthesis artifacts, or other hints you might infer.
    Avoid repeating probabilities; focus on the reasoning.
    """

    response = client.chat.completions.create(
        model="llama-3.3-70b-versatile",
        messages=[{"role": "user", "content": prompt}],
        temperature=0.6,
    )

    return {
        "ai_probability": ai_probability,
        "confidence": confidence,
        "explanation": response.choices[0].message.content.strip()
    }

# -----------------------------
# GRADIO UI
# -----------------------------
def format_text_results(text):
    res = analyze_text(text)
    conf_map = {"High":"🟒 High","Medium":"🟑 Medium","Low":"πŸ”΄ Low"}
    return f"### Text Detection\nAI Score: {res['ai_score']:.4f}\nConfidence: {conf_map.get(res['confidence'],res['confidence'])}\nExplanation: {res['explanation']}"

def format_image_results(image):
    res = analyze_image(image)
    return f"### Image Detection\nAI Probability: {res['ai_probability']:.4f}\n\nConfidence: {res['confidence']}\n\nExplanation: {res['explanation']}"

def format_video_results(video_file):
    res = analyze_video(video_file)
    if "error" in res: return res["error"]
    return f"### Video Detection\nAI Probability: {res['ai_probability']:.4f}\n\nConfidence: {res['confidence']}\n\nExplanation: {res['explanation']}"

def format_audio_results(audio_file):
    res = analyze_audio(audio_file)
    return f"### Audio Detection\nAI Probability: {res['ai_probability']:.4f}\n\nConfidence: {res['confidence']}\n\nExplanation: {res['explanation']}"

with gr.Blocks() as app:
    # Home Page
    home_page = gr.Column(visible=True)
    with home_page:
        gr.Markdown("## 🏠 AI Detection Tool")
        gr.Markdown("Select an option below to continue:")
        with gr.Row():
            text_page_btn = gr.Button("🧠 Text Detection")
            image_page_btn = gr.Button("πŸ–ΌοΈ Image Detection")
            video_page_btn = gr.Button("🎬 Video Detection")
            audio_page_btn = gr.Button("🎡 Audio Detection")

    # Text Page
    text_page = gr.Column(visible=False)
    with text_page:
        gr.Markdown("## 🧠 Text Detection")
        text_input = gr.Textbox(lines=5, placeholder="Paste your text here...", label="Input Text")
        text_output = gr.Markdown("⚑ Result will appear here after submission...", label="Result")
        analyze_text_btn = gr.Button("Analyze Text")
        back_btn_text = gr.Button("⬅️ Back")

    # Image Page
    image_page = gr.Column(visible=False)
    with image_page:
        gr.Markdown("## πŸ–ΌοΈ Image Detection")
        image_input = gr.Image(type="pil", label="Upload Image")
        image_output = gr.Markdown("⚑ Result will appear here after image upload...", label="Result")
        analyze_image_btn = gr.Button("Analyze Image")
        back_btn_image = gr.Button("⬅️ Back")

    # Video Page
    video_page = gr.Column(visible=False)
    with video_page:
        gr.Markdown("## 🎬 Video Detection")
        video_input = gr.Video(label="Upload Video")
        video_output = gr.Markdown("⚑ Result will appear here after video upload...", label="Result")
        analyze_video_btn = gr.Button("Analyze Video")
        back_btn_video = gr.Button("⬅️ Back")

    # Audio Page
    audio_page = gr.Column(visible=False)
    with audio_page:
        gr.Markdown("## 🎡 Audio Detection")
        audio_input = gr.Audio(label="Upload Audio", type="filepath")  # Use type="filepath" to get local path
        audio_output = gr.Markdown("⚑ Result will appear here after audio upload...", label="Result")
        analyze_audio_btn = gr.Button("Analyze Audio")
        back_btn_audio = gr.Button("⬅️ Back")
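
    # Page navigation toggles each Column's `visible` flag; every handler must return one
    # gr.update(...) per component listed in `outputs`, in the same order.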

    def show_video_page():
        return gr.update(visible=False), gr.update(visible=False), gr.update(visible=False), gr.update(visible=True)
    def show_audio_page():
        return gr.update(visible=False), gr.update(visible=False), gr.update(visible=False), gr.update(visible=False), gr.update(visible=True)

    audio_page_btn.click(show_audio_page, outputs=[home_page, text_page, image_page, video_page, audio_page])

    # Navigation functions (back buttons return to the home page)
    def show_text_page():
        return gr.update(visible=False), gr.update(visible=True), gr.update(visible=False)
    def show_image_page():
        return gr.update(visible=False), gr.update(visible=False), gr.update(visible=True)
    def show_home():
        return gr.update(visible=True), gr.update(visible=False), gr.update(visible=False)

    # Bind navigation buttons
    text_page_btn.click(show_text_page, outputs=[home_page, text_page, image_page])
    image_page_btn.click(show_image_page, outputs=[home_page, text_page, image_page])
    back_btn_text.click(show_home, outputs=[home_page, text_page, image_page])
    back_btn_image.click(show_home, outputs=[home_page, text_page, image_page])
    video_page_btn.click(show_video_page, outputs=[home_page, text_page, image_page, video_page])
    back_btn_video.click(lambda: (gr.update(visible=True), gr.update(visible=False), gr.update(visible=False), gr.update(visible=False)),
                         outputs=[home_page, text_page, image_page, video_page])
    back_btn_audio.click(lambda: (
        gr.update(visible=True), gr.update(visible=False), gr.update(visible=False), gr.update(visible=False), gr.update(visible=False)
    ), outputs=[home_page, text_page, image_page, video_page, audio_page])


    # Bind analysis buttons
    analyze_text_btn.click(format_text_results, inputs=text_input, outputs=text_output)
    analyze_image_btn.click(format_image_results, inputs=image_input, outputs=image_output)
    analyze_video_btn.click(format_video_results, inputs=video_input, outputs=video_output)
    analyze_audio_btn.click(format_audio_results, inputs=audio_input, outputs=audio_output)

app.launch(share=True, debug=True)