# advanced_tools.py
from typing import Dict

from transformers import pipeline  # type: ignore[import]
from sentence_transformers import SentenceTransformer


class AdvancedTools:
    def __init__(self):
        """Initialize with lazy model loading."""
        self.models = {}
        self.embedder = None
        self.cache = {}

    def _load_sentiment(self):
        """Load the sentiment-analysis model on first use."""
        if "sentiment" not in self.models:
            print("🔄 Loading sentiment model...")
            self.models["sentiment"] = pipeline(  # type: ignore[call-overload]
                "sentiment-analysis",
                model="distilbert-base-uncased-finetuned-sst-2-english",
                device=-1  # CPU
            )
        return self.models["sentiment"]

    def _load_ner(self):
        """Load the NER model on first use."""
        if "ner" not in self.models:
            print("🔄 Loading NER model...")
            self.models["ner"] = pipeline(  # type: ignore[call-overload]
                "ner",
                model="dslim/bert-base-NER",
                aggregation_strategy="simple",
                device=-1
            )
        return self.models["ner"]

    def _load_embedder(self):
        """Load the embedding model on first use."""
        if self.embedder is None:
            print("🔄 Loading embedding model...")
            # Set clean_up_tokenization_spaces explicitly (silences a FutureWarning)
            self.embedder = SentenceTransformer(
                'all-MiniLM-L6-v2',
                tokenizer_kwargs={'clean_up_tokenization_spaces': True}
            )
        return self.embedder

    def sentiment_analysis(self, input_data: Dict) -> Dict:
        """Sentence-level sentiment analysis with an overall verdict."""
        text = input_data.get("text", "")
        if not text:
            return {"error": "No text provided"}

        model = self._load_sentiment()

        # Split the text into sentences
        sentences = text.split('. ')
        results = []

        for sentence in sentences:
            if len(sentence.strip()) > 3:
                result = model(sentence[:512])[0]  # type: ignore[misc]
                results.append({
                    "sentence": sentence[:50] + "..." if len(sentence) > 50 else sentence,
                    "sentiment": result["label"],
                    "confidence": float(result["score"])  # numpy type -> Python float
                })

        # Compute the overall sentiment by majority vote (ties fall to NEGATIVE)
        if results:
            positive_count = sum(1 for r in results if r["sentiment"] == "POSITIVE")
            negative_count = sum(1 for r in results if r["sentiment"] == "NEGATIVE")
            overall = "POSITIVE" if positive_count > negative_count else "NEGATIVE"
            confidence = float(max(r["confidence"] for r in results))  # ensure Python float
        else:
            overall = "NEUTRAL"
            confidence = 0.5

        return {
            "overall_sentiment": overall,
            "confidence": confidence,
            "sentence_analysis": results,
            "summary": {
                "positive_sentences": sum(1 for r in results if r["sentiment"] == "POSITIVE"),
                "negative_sentences": sum(1 for r in results if r["sentiment"] == "NEGATIVE"),
                "total_sentences": len(results)
            }
        }

    def entity_extraction(self, input_data: Dict) -> Dict:
        """Named entity recognition."""
        text = input_data.get("text", "")
        if not text:
            return {"error": "No text provided"}

        model = self._load_ner()
        entities = model(text[:512])  # type: ignore[misc]

        # Group entities by type and convert numpy types to plain Python types
        grouped = {}
        serializable_entities = []
        for entity in entities:
            entity_type = entity["entity_group"]
            if entity_type not in grouped:
                grouped[entity_type] = []
            grouped[entity_type].append({
                "word": entity["word"],
                "score": float(entity["score"])  # numpy.float32 -> Python float
            })
            # Keep a serializable version of the first 10 entities
            if len(serializable_entities) < 10:
                serializable_entities.append({
                    "entity_group": entity["entity_group"],
                    "word": entity["word"],
                    "score": float(entity["score"]),  # numpy.float32 -> Python float
                    "start": int(entity["start"]) if "start" in entity else None,
                    "end": int(entity["end"]) if "end" in entity else None
                })

        return {
            "entities": serializable_entities,  # first 10 entities (serializable)
            "grouped": grouped,
            "summary": {
                "total_entities": len(entities),
                "entity_types": list(grouped.keys()),
                "most_common_type": max(grouped.keys(), key=lambda k: len(grouped[k])) if grouped else None
            }
        }

    def semantic_similarity(self, input_data: Dict) -> Dict:
        """Semantic similarity between two texts."""
        text1 = input_data.get("text1", "")
        text2 = input_data.get("text2", "")
        if not text1 or not text2:
            return {"error": "Both text1 and text2 are required"}

        embedder = self._load_embedder()

        # Embed both texts
        embeddings = embedder.encode([text1, text2])  # type: ignore[misc]

        # Compute cosine similarity (sklearn imported lazily, like the models)
        from sklearn.metrics.pairwise import cosine_similarity
        similarity_score = cosine_similarity(
            [embeddings[0]],
            [embeddings[1]]
        )[0][0]

        return {
            "text1": text1[:100] + "..." if len(text1) > 100 else text1,
            "text2": text2[:100] + "..." if len(text2) > 100 else text2,
            "similarity_score": float(similarity_score),
            "similarity_percentage": round(float(similarity_score) * 100, 2)
        }

    def text_embedding(self, input_data: Dict) -> Dict:
        """Convert text into an embedding vector."""
        text = input_data.get("text", "")
        if not text:
            return {"error": "No text provided"}

        embedder = self._load_embedder()
        embedding = embedder.encode(text)  # type: ignore[misc]

        return {
            "text": text[:100] + "..." if len(text) > 100 else text,
            "embedding": embedding.tolist()[:50],  # first 50 dimensions only
            "embedding_dimension": len(embedding),
            "embedding_size_kb": round(len(embedding) * 4 / 1024, 2)  # float32 = 4 bytes
        }

    def smart_cache(self, input_data: Dict) -> Dict:
        """In-memory cache operations and statistics."""
        operation = input_data.get("operation", "stats")

        if operation == "clear":
            size_before = len(self.cache)
            self.cache.clear()
            return {
                "operation": "clear",
                "items_cleared": size_before,
                "cache_size": 0
            }

        elif operation == "stats":
            return {
                "operation": "stats",
                "cached_items": len(self.cache),
                "cache_keys": list(self.cache.keys())[:10],
                "cache_memory_estimate_kb": round(len(str(self.cache)) / 1024, 2)
            }

        elif operation == "set":
            key = input_data.get("key")
            value = input_data.get("value")
            if not key or value is None:
                return {"error": "key and value required for set operation"}
            self.cache[key] = value
            return {
                "operation": "set",
                "key": key,
                "cache_size": len(self.cache)
            }

        elif operation == "get":
            key = input_data.get("key")
            if not key:
                return {"error": "key required for get operation"}
            value = self.cache.get(key)
            return {
                "operation": "get",
                "key": key,
                "found": value is not None,
                "value": value
            }

        return {"error": "Unknown operation"}


# Create a single shared instance
advanced_tools = AdvancedTools()
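

# --- Illustrative usage sketch (not part of the original module) ---
# A minimal demo of each tool, assuming the transformers, sentence-transformers,
# and scikit-learn dependencies are installed. Running it downloads the models
# on first use; the input texts below are made-up examples.
if __name__ == "__main__":
    print(advanced_tools.sentiment_analysis({
        "text": "The new release is fantastic. The setup process was painful though."
    })["overall_sentiment"])

    print(advanced_tools.entity_extraction({
        "text": "Ada Lovelace worked with Charles Babbage in London."
    })["summary"])

    print(advanced_tools.semantic_similarity({
        "text1": "The cat sat on the mat.",
        "text2": "A cat is resting on a rug."
    })["similarity_percentage"])

    # Cache round-trip: set a value, then read it back
    advanced_tools.smart_cache({"operation": "set", "key": "demo", "value": 42})
    print(advanced_tools.smart_cache({"operation": "get", "key": "demo"}))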