# (Hugging Face Spaces page-header residue from scrape — Space status: Running)
| # app.py - 文本检测多模型集成系统 | |
| import gradio as gr | |
| from transformers import pipeline | |
| import numpy as np | |
| import re | |
# Registry of ensemble detector models.
# Each entry holds the Hugging Face model id, a lazily-loaded
# text-classification pipeline (None until/unless loading succeeds),
# and the model's voting weight in the ensemble (weights sum to 1.0).
models = {
    "model1": {
        "name": "Xenova/distilbert-base-ai-generated-text-detection",
        "detector": None,
        "weight": 0.4,
    },
    "model2": {
        "name": "Hello-SimpleAI/chatgpt-detector-roberta",
        "detector": None,
        "weight": 0.3,
    },
    "model3": {
        "name": "roberta-base-openai-detector",
        "detector": None,
        "weight": 0.3,
    },
}

# Initialize every detector at import time. A failed load simply leaves
# "detector" as None so the ensemble degrades gracefully instead of
# crashing the whole app. (Iterate .items() rather than re-indexing
# models[key] on every access; the redundant "detector = None"
# reassignment in the except branch was dropped — it is already None.)
for _key, _info in models.items():
    try:
        _info["detector"] = pipeline("text-classification", model=_info["name"])
        print(f"成功加载模型: {_info['name']}")
    except Exception as e:
        print(f"加载模型 {_info['name']} 失败: {str(e)}")
def analyze_text_features(text):
    """Compute surface-level statistics of *text*.

    Args:
        text: Input string to analyze (may be empty).

    Returns:
        dict with keys: length, word_count, avg_word_length,
        unique_words_ratio, sentence_count, avg_sentence_length,
        punctuation_ratio, and (only when the text has words)
        lexical_diversity. All divisions are guarded against empty input.
    """
    features = {}
    features["length"] = len(text)

    # Word-level statistics (whitespace tokenization).
    words = text.split()
    features["word_count"] = len(words)
    features["avg_word_length"] = sum(len(word) for word in words) / max(1, len(words))
    features["unique_words_ratio"] = len(set(text.lower().split())) / max(1, len(words))

    # Sentence analysis. Bug fix: re.split leaves empty fragments around
    # terminators (e.g. "Hi." -> ["Hi", ""]), which previously inflated
    # sentence_count and skewed avg_sentence_length; drop them.
    sentences = [s for s in re.split(r'[.!?]+', text) if s.strip()]
    features["sentence_count"] = len(sentences)
    features["avg_sentence_length"] = sum(len(s.split()) for s in sentences) / max(1, len(sentences))

    # Lexical diversity (type/token ratio), only defined for non-empty text.
    if len(words) > 0:
        features["lexical_diversity"] = len(set(words)) / len(words)

    # Share of characters that are punctuation.
    punctuation_count = sum(1 for char in text if char in ",.!?;:\"'()[]{}")
    features["punctuation_ratio"] = punctuation_count / max(1, len(text))
    return features
def detect_ai_text(text):
    """Ensemble AI-text detection: weighted vote over all loaded models.

    Args:
        text: Candidate text; must be at least 50 characters after stripping.

    Returns:
        dict with "ai_probability", "confidence_level",
        "individual_model_results" and "features" on success, or a dict
        with an "error" key when the text is too short or no detector
        model is usable.
    """
    if not text or len(text.strip()) < 50:
        return {"error": "文本太短,无法可靠检测"}

    results = {}
    valid_models = 0
    weighted_ai_probability = 0.0
    # Bug fix: accumulate only the weight of models that actually produced
    # a prediction. Previously the denominator counted every *loaded* model,
    # so a model that failed at inference time silently deflated the
    # ensemble probability toward 0.
    used_weight = 0.0

    for key, model_info in models.items():
        if model_info["detector"] is None:
            continue
        try:
            result = model_info["detector"](text)
            label = result[0]["label"]
            score = result[0]["score"]
            # Map the model-specific label to an "AI generated" probability.
            # Labels vary per model; anything mentioning ai/chatgpt/generated
            # is treated as the AI class, otherwise the score is inverted.
            if "ai" in label.lower() or "chatgpt" in label.lower() or "generated" in label.lower():
                ai_probability = score
            else:
                ai_probability = 1 - score
            results[key] = {
                "model_name": model_info["name"],
                "ai_probability": ai_probability,
                "label": label,
                "score": score,
            }
            weighted_ai_probability += ai_probability * model_info["weight"]
            used_weight += model_info["weight"]
            valid_models += 1
        except Exception as e:
            results[key] = {
                "model_name": model_info["name"],
                "error": str(e),
            }

    # Bug fix: with zero usable models the old code computed 0/1 = 0.0 and
    # confidently reported "高概率人类创作"; report an explicit error instead.
    if valid_models == 0:
        return {
            "error": "没有可用的检测模型",
            "individual_model_results": results,
        }

    final_ai_probability = weighted_ai_probability / used_weight

    text_features = analyze_text_features(text)

    # Map the ensemble probability to a coarse confidence label.
    if final_ai_probability > 0.7:
        confidence_level = "高概率AI生成"
    elif final_ai_probability < 0.3:
        confidence_level = "高概率人类创作"
    else:
        confidence_level = "无法确定"

    return {
        "ai_probability": final_ai_probability,
        "confidence_level": confidence_level,
        "individual_model_results": results,
        "features": text_features,
    }
# Build and launch the Gradio UI that exposes detect_ai_text as a web API.
# The example entry is long enough (>= 50 chars) to pass the length guard.
demo_examples = [
    ["这是一段示例文本,用于测试AI文本检测功能。请输入至少50个字符的文本以获得准确的检测结果。"]
]

iface = gr.Interface(
    fn=detect_ai_text,
    inputs=gr.Textbox(lines=10, placeholder="粘贴要检测的文本..."),
    outputs=gr.JSON(),
    title="增强型AI文本检测API",
    description="多模型集成检测文本是否由AI生成",
    examples=demo_examples,
    allow_flagging="never",
)
iface.launch()