# advanced_tools.py
from typing import Dict, List
from transformers import pipeline  # type: ignore[import]
from sentence_transformers import SentenceTransformer


class AdvancedTools:
    def __init__(self):
        """Models are loaded lazily: nothing is downloaded until first use."""
        self.models = {}      # task name -> loaded transformers pipeline
        self.embedder = None  # SentenceTransformer, loaded on demand
        self.cache = {}       # simple in-memory key/value cache

    def _load_sentiment(self):
        """Load the sentiment model on first use."""
        if "sentiment" not in self.models:
            print("🔄 Loading sentiment model...")
            self.models["sentiment"] = pipeline(  # type: ignore[call-overload]
                "sentiment-analysis",
                model="distilbert-base-uncased-finetuned-sst-2-english",
                device=-1  # CPU
            )
        return self.models["sentiment"]

    def _load_ner(self):
        """Load the NER model on first use."""
        if "ner" not in self.models:
            print("🔄 Loading NER model...")
            self.models["ner"] = pipeline(  # type: ignore[call-overload]
                "ner",
                model="dslim/bert-base-NER",
                aggregation_strategy="simple",  # merge sub-word tokens into whole entities
                device=-1
            )
        return self.models["ner"]

    def _load_embedder(self):
        """Load the embedding model on first use."""
        if self.embedder is None:
            print("🔄 Loading embedding model...")
            # Set clean_up_tokenization_spaces explicitly to avoid the
            # transformers FutureWarning about its changing default.
            self.embedder = SentenceTransformer(
                'all-MiniLM-L6-v2',
                tokenizer_kwargs={'clean_up_tokenization_spaces': True}
            )
        return self.embedder

    def sentiment_analysis(self, input_data: Dict) -> Dict:
        """Sentiment analysis, sentence by sentence."""
        text = input_data.get("text", "")
        if not text:
            return {"error": "No text provided"}

        model = self._load_sentiment()

        # Split the text into sentences (naive split on ". ")
        sentences = text.split('. ')

        results = []
        for sentence in sentences:
            if len(sentence.strip()) > 3:
                result = model(sentence[:512])[0]  # type: ignore[misc]
                results.append({
                    "sentence": sentence[:50] + "..." if len(sentence) > 50 else sentence,
                    "sentiment": result["label"],
                    "confidence": float(result["score"])  # numpy type -> Python float
                })

        # Compute the overall sentiment by majority vote
        if results:
            positive_count = sum(1 for r in results if r["sentiment"] == "POSITIVE")
            negative_count = sum(1 for r in results if r["sentiment"] == "NEGATIVE")
            overall = "POSITIVE" if positive_count > negative_count else "NEGATIVE"
            confidence = float(max(r["confidence"] for r in results))  # ensure Python float
        else:
            overall = "NEUTRAL"
            confidence = 0.5

        return {
            "overall_sentiment": overall,
            "confidence": confidence,
            "sentence_analysis": results,
            "summary": {
                "positive_sentences": sum(1 for r in results if r["sentiment"] == "POSITIVE"),
                "negative_sentences": sum(1 for r in results if r["sentiment"] == "NEGATIVE"),
                "total_sentences": len(results)
            }
        }
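
    # A hypothetical call, to illustrate the return shape (the SST-2 model
    # only emits POSITIVE/NEGATIVE labels; values here are made up):
    #   advanced_tools.sentiment_analysis({"text": "Great food. Slow service."})
    #   -> {"overall_sentiment": ..., "confidence": ..., "sentence_analysis": [...], "summary": {...}}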

    def entity_extraction(self, input_data: Dict) -> Dict:
        """Named entity recognition."""
        text = input_data.get("text", "")
        if not text:
            return {"error": "No text provided"}

        model = self._load_ner()
        entities = model(text[:512])  # type: ignore[misc]

        # Group entities by type and convert numpy types to plain Python types
        grouped: Dict[str, List[Dict]] = {}
        serializable_entities = []

        for entity in entities:
            entity_type = entity["entity_group"]
            if entity_type not in grouped:
                grouped[entity_type] = []
            grouped[entity_type].append({
                "word": entity["word"],
                "score": float(entity["score"])  # numpy.float32 -> Python float
            })

            # Keep a JSON-serializable version of the first 10 entities
            if len(serializable_entities) < 10:
                serializable_entities.append({
                    "entity_group": entity["entity_group"],
                    "word": entity["word"],
                    "score": float(entity["score"]),  # numpy.float32 -> Python float
                    "start": int(entity["start"]) if "start" in entity else None,
                    "end": int(entity["end"]) if "end" in entity else None
                })

        return {
            "entities": serializable_entities,  # first 10 entities (serializable)
            "grouped": grouped,
            "summary": {
                "total_entities": len(entities),
                "entity_types": list(grouped.keys()),
                "most_common_type": max(grouped.keys(), key=lambda k: len(grouped[k])) if grouped else None
            }
        }
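
    # With aggregation_strategy="simple" the pipeline merges sub-word pieces,
    # so a grouped entry might look like this (values illustrative):
    #   {"PER": [{"word": "Ada Lovelace", "score": 0.99}], "LOC": [...]}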

    def semantic_similarity(self, input_data: Dict) -> Dict:
        """Similarity between two texts."""
        text1 = input_data.get("text1", "")
        text2 = input_data.get("text2", "")
        if not text1 or not text2:
            return {"error": "Both text1 and text2 are required"}

        embedder = self._load_embedder()

        # Embed both texts in one batch
        embeddings = embedder.encode([text1, text2])  # type: ignore[misc]

        # Compute cosine similarity; imported lazily so scikit-learn is only
        # required when this tool is actually used
        from sklearn.metrics.pairwise import cosine_similarity
        similarity_score = cosine_similarity(
            [embeddings[0]],
            [embeddings[1]]
        )[0][0]

        return {
            "text1": text1[:100] + "..." if len(text1) > 100 else text1,
            "text2": text2[:100] + "..." if len(text2) > 100 else text2,
            "similarity_score": float(similarity_score),
            "similarity_percentage": round(float(similarity_score) * 100, 2)
        }
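
    # Cosine similarity falls in roughly [-1, 1]; with all-MiniLM-L6-v2,
    # unrelated texts tend to score near 0 and close paraphrases near 1
    # (a rule of thumb, not a guarantee).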

    def text_embedding(self, input_data: Dict) -> Dict:
        """Convert text to a vector (embedding)."""
        text = input_data.get("text", "")
        if not text:
            return {"error": "No text provided"}

        embedder = self._load_embedder()
        embedding = embedder.encode(text)  # type: ignore[misc]

        return {
            "text": text[:100] + "..." if len(text) > 100 else text,
            "embedding": embedding.tolist()[:50],  # first 50 dimensions only
            "embedding_dimension": len(embedding),
            "embedding_size_kb": round(len(embedding) * 4 / 1024, 2)  # assumes float32 (4 bytes/dim)
        }

    def smart_cache(self, input_data: Dict) -> Dict:
        """In-memory caching and cache statistics."""
        operation = input_data.get("operation", "stats")

        if operation == "clear":
            size_before = len(self.cache)
            self.cache.clear()
            return {
                "operation": "clear",
                "items_cleared": size_before,
                "cache_size": 0
            }

        elif operation == "stats":
            return {
                "operation": "stats",
                "cached_items": len(self.cache),
                "cache_keys": list(self.cache.keys())[:10],
                "cache_memory_estimate_kb": round(len(str(self.cache)) / 1024, 2)
            }

        elif operation == "set":
            key = input_data.get("key")
            value = input_data.get("value")
            if not key or value is None:
                return {"error": "key and value required for set operation"}
            self.cache[key] = value
            return {
                "operation": "set",
                "key": key,
                "cache_size": len(self.cache)
            }

        elif operation == "get":
            key = input_data.get("key")
            if not key:
                return {"error": "key required for get operation"}
            value = self.cache.get(key)
            return {
                "operation": "get",
                "key": key,
                "found": value is not None,
                "value": value
            }

        return {"error": "Unknown operation"}


# Create a single shared instance for the app to import
advanced_tools = AdvancedTools()
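

# Minimal smoke test: a sketch only. It assumes the model weights above can be
# downloaded (network access) and that scikit-learn is installed.
if __name__ == "__main__":
    print(advanced_tools.sentiment_analysis({"text": "I loved the demo. The setup was painful."}))
    print(advanced_tools.semantic_similarity({"text1": "a happy dog", "text2": "a joyful puppy"}))
    print(advanced_tools.smart_cache({"operation": "stats"}))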