import spacy
from transformers import pipeline
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import csv
import re
import nltk
import os

# Ensure NLTK uses a writable directory; downloads are handled in Dockerfile/app startup
NLTK_DIR = os.environ.get('NLTK_DATA', os.path.join(os.getcwd(), 'nltk_data'))
os.makedirs(NLTK_DIR, exist_ok=True)
if NLTK_DIR not in nltk.data.path:
    nltk.data.path.insert(0, NLTK_DIR)

# Load spaCy medium English model for text preprocessing (NER, POS tagging)
nlp = spacy.load("en_core_web_md")

# Pre-load transformer models once for faster response
models = {
    "default": pipeline('sentiment-analysis', model="distilbert-base-uncased-finetuned-sst-2-english"),
    "roberta": pipeline('sentiment-analysis', model="cardiffnlp/twitter-roberta-base-sentiment"),
    "emotion": pipeline('sentiment-analysis', model="j-hartmann/emotion-english-distilroberta-base")
}

# Initialize stemmer
stemmer = PorterStemmer()


def preprocess_text(text):
    """
    Preprocesses the input text by cleaning, normalizing, tokenizing, stemming, lemmatizing, 
    and extracting Named Entities and POS tags.

    Returns:
    - cleaned_text: Text after removing stop words, punctuation, URLs, and emails.
    - removed_text: Text that was removed during cleaning.
    - normalized_text: Lowercased version of cleaned text.
    - tokenized_text: List of tokens (words) from normalized text.
    - stemmed_tokens: List of stemmed tokens.
    - lemmatized_tokens: List of lemmatized tokens.
    - ner: List of named entities found in the original text.
    - pos: List of Part-of-Speech (POS) tags using normalized text.
    """
    
    # Step 1: Clean the text, removing newlines, multiple spaces, and other unwanted characters
    text = re.sub(r'\s+', ' ', text).strip()  # Replaces any form of multiple whitespace (including \r\n) with a single space

    # Step 2: Apply spaCy NLP for further processing
    doc = nlp(text)

    # Step 3: Cleaning: remove stop words, punctuation, URLs, and emails
    cleaned_text = " ".join(
        token.text for token in doc
        if not (token.is_stop or token.is_punct or token.like_url or token.like_email)
    )

    # Removed text: everything that was filtered out above
    removed_text = " ".join(
        token.text for token in doc
        if token.is_stop or token.is_punct or token.like_url or token.like_email
    )

    # Step 4: Normalization (lowercasing)
    normalized_text = cleaned_text.lower()

    # Step 5: Tokenization
    tokenized_text = word_tokenize(normalized_text)

    # Step 6: POS tagging on the normalized text (so that it's consistent with tokenized/lemmatized text)
    normalized_doc = nlp(" ".join(tokenized_text))
    pos = [(token.text, token.pos_) for token in normalized_doc if token.pos_ != 'SPACE']


    # Step 7: Stemming
    stemmed_tokens = [stemmer.stem(word) for word in tokenized_text]

    # Step 8: Lemmatization
    lemmatized_tokens = [token.lemma_ for token in normalized_doc]

    # Step 9: Named Entity Recognition (NER)
    ner = [(ent.text, ent.label_) for ent in doc.ents]

    return cleaned_text, removed_text, normalized_text, tokenized_text, stemmed_tokens, lemmatized_tokens, ner, pos
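
# Illustrative usage of preprocess_text (a sketch, not part of the original module;
# the sample sentence below is an assumption for demonstration only):
#
#   cleaned, removed, normalized, tokens, stems, lemmas, ner, pos = preprocess_text(
#       "Apple is looking at buying a U.K. startup for $1 billion."
#   )
#   # cleaned / removed : content words kept vs. stop words, punctuation, URLs, emails dropped
#   # tokens            : word-level tokens of the lowercased cleaned text
#   # ner               : (entity, label) pairs from the original text, e.g. ("Apple", "ORG")
#   # pos               : (token, POS-tag) pairs computed on the normalized tokens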

def analyze_sentiment(text, model_type="default"):
    """
    Analyze the sentiment of the given text using the specified model type.

    Arguments:
    - text: The input text to analyze.
    - model_type: The sentiment model to use ('default', 'roberta', or 'emotion').

    Returns:
    - sentiment: The overall sentiment of the text (e.g., POSITIVE, NEGATIVE).
    - probabilities: The sentiment probabilities or confidence scores for each label.
    """
    
    # Fall back to the default model if an unrecognised model_type is passed
    classifier = models.get(model_type, models["default"])
    results = classifier(text)

    if model_type == 'roberta':
        sentiment_mapping = {
            "LABEL_0": "NEGATIVE",
            "LABEL_1": "NEUTRAL",
            "LABEL_2": "POSITIVE"
        }
        sentiment = sentiment_mapping[results[0]['label']]
        confidence = results[0]['score']
        probabilities = [0, 0, 0]
        
        if sentiment == "NEGATIVE":
            probabilities = [confidence, 1 - confidence, 0]
        elif sentiment == "NEUTRAL":
            probabilities = [0, confidence, 1 - confidence]
        else:
            probabilities = [0, 1 - confidence, confidence]

    elif model_type == 'emotion':
        emotions = ['ANGER', 'DISGUST', 'FEAR', 'JOY', 'NEUTRAL', 'SADNESS', 'SURPRISE']
        emotion_probs = [0] * len(emotions)
        # Depending on the transformers version, an all-scores pipeline may return a flat
        # list of label dicts or nest it one level deeper; handle both shapes.
        label_scores = results[0] if isinstance(results[0], list) else results
        for res in label_scores:
            emotion_idx = emotions.index(res['label'].upper())
            emotion_probs[emotion_idx] = res['score']
        probabilities = emotion_probs
        # Report the highest-scoring emotion as the overall sentiment
        sentiment = max(label_scores, key=lambda res: res['score'])['label'].upper()

    else:
        sentiment = results[0]['label'].upper()
        confidence = results[0]['score']
        probabilities = {
            'NEGATIVE': [confidence, 1 - confidence, 0],
            'POSITIVE': [0, 1 - confidence, confidence]
        }.get(sentiment, [0.3, 0.4, 0.3])

    return sentiment, probabilities
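
# Illustrative calls to analyze_sentiment (a sketch, not part of the original module;
# the example sentences are assumptions):
#
#   sentiment, probs = analyze_sentiment("I love this product!", model_type="roberta")
#   # sentiment -> "POSITIVE" / "NEUTRAL" / "NEGATIVE" after the LABEL_* mapping
#   # probs     -> three values ordered [negative, neutral, positive]
#
#   sentiment, probs = analyze_sentiment("That trailer gave me chills", model_type="emotion")
#   # probs     -> seven values in the order of the `emotions` list inside the function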

def read_file(file):
    """
    Reads the uploaded file and returns its content as a string.
    Supports .txt and .csv file formats.

    Arguments:
    - file: The uploaded file.

    Returns:
    - Content of the file as a single string.
    """
    
    # Compare extensions case-insensitively so ".TXT" and ".CSV" are also accepted
    filename = file.filename.lower()
    if filename.endswith('.txt'):
        return file.read().decode('utf-8')
    elif filename.endswith('.csv'):
        reader = csv.reader(file.read().decode('utf-8').splitlines())
        return ' '.join([' '.join(row) for row in reader])
    else:
        return None
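

# Minimal smoke test, assuming this module is run directly rather than imported by the
# web app. The sample text is illustrative only and not part of the original code.
if __name__ == "__main__":
    sample = "The new update is fantastic, although the installer crashed once."

    cleaned, removed, normalized, tokens, stems, lemmas, ner, pos = preprocess_text(sample)
    print("Cleaned text:", cleaned)
    print("Tokens:", tokens)
    print("Named entities:", ner)

    for model_name in models:
        sentiment, probabilities = analyze_sentiment(sample, model_type=model_name)
        print(f"{model_name}: {sentiment} {probabilities}")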