Spaces:

Hasitha16
/

churnsight-ai

Running

App Files Files Community

churnsight-ai / model.py

Hasitha16

Update model.py

cd098b8 verified 4 months ago

raw

history blame

3.95 kB

	import os

	# Point every model/data cache at /tmp (presumably the writable location in
	# the deployment container -- confirm). All of these are exported *before*
	# the libraries that consult them are imported.
	os.environ["TRANSFORMERS_CACHE"] = "/tmp/hf-cache"
	os.environ["HF_HOME"] = "/tmp/hf-home"
	# NLTK builds its resource search path from NLTK_DATA when the package is
	# imported, so the variable has to be set before `import nltk` to matter.
	os.environ["NLTK_DATA"] = "/tmp/nltk_data"

	import nltk

	# Fetch the Punkt sentence-tokenizer data into the directory NLTK_DATA names.
	nltk.download("punkt", download_dir="/tmp/nltk_data")

	from typing import List, Optional
	from pydantic import BaseModel
	from transformers import pipeline

	# ✅ Extra: Smart Summarization Imports
	from sklearn.feature_extraction.text import TfidfVectorizer
	from sklearn.cluster import KMeans
	from nltk.tokenize import sent_tokenize
	from sklearn.metrics.pairwise import cosine_similarity
	import numpy as np

	# Abstractive summarization pipeline using the distilbart-cnn-12-6 checkpoint.
	# Built once at import time and reused by summarize_review.
	summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")
	# Sentiment pipeline created without an explicit model name, so the library's
	# default checkpoint for the task is used; reused by analyze_sentiment.
	# NOTE(review): consider pinning a model name so results do not shift if the
	# library default changes -- confirm which checkpoint is intended.
	sentiment_analyzer = pipeline("sentiment-analysis")

	# 🧠 Basic Summarization (Abstractive)
	def summarize_review(text):
	    """Return a short abstractive summary of ``text``.

	    The module-level ``summarizer`` pipeline is run deterministically
	    (``do_sample=False``) with a 10-60 token summary budget and
	    3-gram repetition blocking.

	    Args:
	        text: Review text to summarize.

	    Returns:
	        The generated summary string. Blank input (``None``, empty or
	        whitespace-only) is returned unchanged without calling the model.
	    """
	    # Nothing to summarize: skip the model call entirely for blank input.
	    if not text or not text.strip():
	        return text

	    outputs = summarizer(
	        text,
	        max_length=60,
	        min_length=10,
	        do_sample=False,
	        no_repeat_ngram_size=3,
	        # Clip over-long reviews to the model's input limit instead of
	        # failing inside the model on too many tokens.
	        truncation=True,
	    )
	    return outputs[0]["summary_text"]

	# 🧠 Smart Summarization (Clustered Key Sentences)
	def smart_summarize(text, n_clusters=1):
	    """Return an extractive summary made of the most representative sentences.

	    Sentences are vectorized with TF-IDF (English stop words removed) and
	    grouped with KMeans; from each non-empty cluster the sentence closest
	    (cosine similarity) to the cluster's mean vector is kept. The kept
	    sentences are joined with single spaces in their original order.

	    Args:
	        text: Text to summarize.
	        n_clusters: Number of clusters, i.e. the maximum number of
	            sentences in the summary.

	    Returns:
	        The summary string. ``text`` itself is returned when it is blank,
	        has at most one sentence, or yields no TF-IDF vocabulary; all
	        sentences are returned joined when there are no more sentences
	        than clusters.
	    """
	    if not text or not text.strip():
	        return text

	    tokenizer = nltk.tokenize.PunktSentenceTokenizer()  # default, untrained Punkt parameters
	    sentences = tokenizer.tokenize(text)

	    # Cheap exits first, before any vectorization work is done.
	    if len(sentences) <= 1:
	        return text
	    if len(sentences) <= n_clusters:
	        return " ".join(sentences)

	    try:
	        tfidf_matrix = TfidfVectorizer(stop_words="english").fit_transform(sentences)
	    except ValueError:
	        # Raised when no vocabulary survives (e.g. only stop words or
	        # punctuation); there is nothing to rank, so hand the text back.
	        return text

	    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
	    kmeans.fit(tfidf_matrix)

	    chosen = []
	    for cluster_id in range(n_clusters):
	        members = np.where(kmeans.labels_ == cluster_id)[0]
	        if len(members) == 0:
	            continue
	        cluster_rows = tfidf_matrix[members]
	        # np.asarray copes with both np.matrix and ndarray results of mean().
	        centroid = np.asarray(cluster_rows.mean(axis=0)).reshape(1, -1)
	        similarity = cosine_similarity(centroid, cluster_rows)
	        chosen.append(int(members[np.argmax(similarity)]))

	    # Sort by sentence position rather than by text lookup, so repeated
	    # sentences keep their true place in the document.
	    return " ".join(sentences[position] for position in sorted(chosen))

	# 📊 Sentiment Detection
	def analyze_sentiment(text):
	    """Classify ``text`` with the module-level sentiment pipeline.

	    Labels containing "star" are read as a rating whose first character is
	    the star count and folded into NEGATIVE (1-2), NEUTRAL (3) or
	    POSITIVE (4 and above). Any other label is passed through untouched.

	    Returns:
	        A dict with the final ``"label"`` and the pipeline's ``"score"``.
	    """
	    top_prediction = sentiment_analyzer(text)[0]
	    verdict, confidence = top_prediction["label"], top_prediction["score"]

	    if "star" in verdict:
	        rating = int(verdict[0])
	        if rating >= 4:
	            verdict = "POSITIVE"
	        elif rating == 3:
	            verdict = "NEUTRAL"
	        else:
	            verdict = "NEGATIVE"

	    return {"label": verdict, "score": confidence}

	# 🔥 Emotion Detection (heuristic-based)
	def detect_emotion(text):
	    """Guess the dominant emotion of ``text`` from a small keyword list.

	    Matching is case-insensitive and anchored at the start of each word, so
	    inflections such as "loved" or "sadly" still count while unrelated
	    words that merely contain a keyword ("whatever", "unhappy", "glove")
	    do not.

	    Args:
	        text: Text to inspect.

	    Returns:
	        One of ``"anger"``, ``"joy"``, ``"sadness"``, ``"confusion"`` or
	        ``"neutral"``. Emotions are tested in that order and the first hit
	        wins; empty input is ``"neutral"``.
	    """
	    if not text:
	        return "neutral"

	    # Replace every non-alphanumeric character with a space, then split
	    # into lower-cased words.
	    lowered = text.lower()
	    words = "".join(ch if ch.isalnum() else " " for ch in lowered).split()

	    # Ordered by priority: an earlier entry beats a later one.
	    keyword_stems = (
	        ("anger", ("angry", "hate")),
	        ("joy", ("happy", "love")),
	        ("sadness", ("sad", "disappointed")),
	        ("confusion", ("confused", "unclear")),
	    )
	    for emotion, stems in keyword_stems:
	        # str.startswith accepts a tuple, testing every stem in one call.
	        if any(word.startswith(stems) for word in words):
	            return emotion
	    return "neutral"

	# 🧩 Aspect-Based Sentiment (mock)
	def extract_aspect_sentiment(text, aspects: list):
	    """Assign a placeholder sentiment to each aspect (mock heuristic).

	    An aspect is labelled ``"positive"`` when it appears in ``text``
	    (case-insensitive substring match) and the text contains no negation
	    word; otherwise it is ``"neutral"``. Negation means the whole word
	    "not" or "cannot", or any "n't" contraction -- words that merely
	    contain "not" ("nothing", "note", "another") do not count.

	    Args:
	        text: Review text to inspect.
	        aspects: Aspect names to look for.

	    Returns:
	        A dict keyed by each aspect exactly as given, mapping to
	        ``{"label": ..., "confidence": 0.85}``. The confidence is a fixed
	        placeholder value. Empty or missing ``aspects`` yields ``{}``.
	    """
	    if not aspects:
	        return {}

	    text_lower = (text or "").lower()

	    # Tokenize while keeping apostrophes so contractions stay intact; the
	    # typographic apostrophe is folded into the ASCII one first.
	    normalized = text_lower.replace("\u2019", "'")
	    spaced = "".join(ch if ch.isalnum() or ch == "'" else " " for ch in normalized)
	    words = [word.strip("'") for word in spaced.split()]
	    negated = any(
	        word in ("not", "cannot") or word.endswith("n't") for word in words
	    )

	    results = {}
	    for asp in aspects:
	        # Compare in lower case: the text has already been lower-cased.
	        mentioned = asp.lower() in text_lower
	        results[asp] = {
	            "label": "positive" if mentioned and not negated else "neutral",
	            "confidence": 0.85,
	        }
	    return results

	# ✅ Pydantic Schemas for FastAPI
	class ReviewInput(BaseModel):
	    # Request schema for a single review. Only `text` is required; every
	    # other field falls back to the default declared here.
	    # NOTE(review): the field meanings noted below are inferred from the
	    # names -- confirm against the API handlers that consume this model.

	    # Required: the review body.
	    text: str

	    # Options with concrete defaults.
	    model: str = "distilbert-base-uncased-finetuned-sst-2-english"
	    industry: str = "Generic"
	    aspects: bool = False  # presumably toggles aspect-level analysis

	    # Optional extras; None when the client omits them.
	    follow_up: Optional[str] = None
	    product_category: Optional[str] = None
	    device: Optional[str] = None

	class BulkReviewInput(BaseModel):
	    # Request schema for a batch of reviews. Only `reviews` is required;
	    # the remaining fields carry the same defaults as the single-review
	    # schema declared in this module.

	    # Required: one string per review.
	    reviews: List[str]

	    # Options with concrete defaults.
	    model: str = "distilbert-base-uncased-finetuned-sst-2-english"
	    industry: str = "Generic"
	    aspects: bool = False  # presumably toggles aspect-level analysis -- confirm

	    # Optional extras; None when the client omits them.
	    product_category: Optional[str] = None
	    device: Optional[str] = None

	class TranslationInput(BaseModel):
	    # Request schema for a translation call.

	    # Required: the text to translate.
	    text: str

	    # Target language; defaults to "fr" (presumably a language code for
	    # French -- confirm the format the translation backend expects).
	    target_lang: str = "fr"

	class ChatInput(BaseModel):
	    # Request schema for a question asked against some context.
	    # Both fields are required strings.
	    # NOTE(review): presumably `context` is the passage the answer is drawn
	    # from -- confirm against the handler that consumes this model.

	    question: str
	    context: str