import spacy
from transformers import pipeline
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import csv
import re
import nltk
import os

# Ensure NLTK uses a writable directory; downloads are handled in Dockerfile/app startup
NLTK_DIR = os.environ.get('NLTK_DATA', os.path.join(os.getcwd(), 'nltk_data'))
os.makedirs(NLTK_DIR, exist_ok=True)
if NLTK_DIR not in nltk.data.path:
    nltk.data.path.insert(0, NLTK_DIR)

# Load spaCy medium English model for text preprocessing (NER, POS tagging)
nlp = spacy.load("en_core_web_md")

# Pre-load transformer models once for faster response
models = {
    "default": pipeline('sentiment-analysis', model="distilbert-base-uncased-finetuned-sst-2-english"),
    "roberta": pipeline('sentiment-analysis', model="cardiffnlp/twitter-roberta-base-sentiment"),
    "emotion": pipeline('sentiment-analysis', model="j-hartmann/emotion-english-distilroberta-base")
}

# Initialize stemmer
stemmer = PorterStemmer()


def preprocess_text(text):
    """
    Preprocesses the input text by cleaning, normalizing, tokenizing, stemming, lemmatizing, 
    and extracting Named Entities and POS tags.

    Returns:
    - cleaned_text: Text after removing stop words, punctuation, URLs, and emails.
    - removed_text: Text that was removed during cleaning.
    - normalized_text: Lowercased version of cleaned text.
    - tokenized_text: List of tokens (words) from normalized text.
    - stemmed_tokens: List of stemmed tokens.
    - lemmatized_tokens: List of lemmatized tokens.
    - ner: List of named entities found in the original text.
    - pos: List of Part-of-Speech (POS) tags using normalized text.
    """
    
    # Step 1: Clean the text, removing newlines, multiple spaces, and other unwanted characters
    text = re.sub(r'\s+', ' ', text).strip()  # Replaces any form of multiple whitespace (including \r\n) with a single space

    # Step 2: Apply spaCy NLP for further processing
    doc = nlp(text)

    # Step 3: Cleaning: remove stop words, punctuation, URLs, and emails
    cleaned_text = " ".join(
        token.text for token in doc
        if not (token.is_stop or token.is_punct or token.like_url or token.like_email)
    )

    # Removed text: everything that was filtered out above
    removed_text = " ".join(
        token.text for token in doc
        if token.is_stop or token.is_punct or token.like_url or token.like_email
    )

    # Step 4: Normalization (lowercasing)
    normalized_text = cleaned_text.lower()

    # Step 5: Tokenization
    tokenized_text = word_tokenize(normalized_text)

    # Step 6: POS tagging on the normalized text (so that it's consistent with tokenized/lemmatized text)
    normalized_doc = nlp(" ".join(tokenized_text))
    pos = [(token.text, token.pos_) for token in normalized_doc if token.pos_ != 'SPACE']


    # Step 7: Stemming
    stemmed_tokens = [stemmer.stem(word) for word in tokenized_text]

    # Step 8: Lemmatization
    lemmatized_tokens = [token.lemma_ for token in normalized_doc]

    # Step 9: Named Entity Recognition (NER)
    ner = [(ent.text, ent.label_) for ent in doc.ents]

    return cleaned_text, removed_text, normalized_text, tokenized_text, stemmed_tokens, lemmatized_tokens, ner, pos
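
# Illustrative usage of preprocess_text (a sketch, not part of the original module;
# the sample sentence below is an assumption for demonstration only):
#
#   cleaned, removed, normalized, tokens, stems, lemmas, ner, pos = preprocess_text(
#       "Apple is looking at buying a U.K. startup for $1 billion."
#   )
#   # cleaned / removed : content words kept vs. stop words, punctuation, URLs, emails dropped
#   # tokens            : word-level tokens of the lowercased cleaned text
#   # ner               : (entity, label) pairs from the original text, e.g. ("Apple", "ORG")
#   # pos               : (token, POS-tag) pairs computed on the normalized tokens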

def analyze_sentiment(text, model_type="default"):
    """
    Analyze the sentiment of the given text using the specified model type.

    Arguments:
    - text: The input text to analyze.
    - model_type: The sentiment model to use ('default', 'roberta', or 'emotion').

    Returns:
    - sentiment: The overall sentiment of the text (e.g., POSITIVE, NEGATIVE).
    - probabilities: The sentiment probabilities or confidence scores for each label.
    """
    
    # Fall back to the default model if an unrecognised model_type is passed
    classifier = models.get(model_type, models["default"])
    results = classifier(text)

    if model_type == 'roberta':
        sentiment_mapping = {
            "LABEL_0": "NEGATIVE",
            "LABEL_1": "NEUTRAL",
            "LABEL_2": "POSITIVE"
        }
        sentiment = sentiment_mapping[results[0]['label']]
        confidence = results[0]['score']
        probabilities = [0, 0, 0]
        
        if sentiment == "NEGATIVE":
            probabilities = [confidence, 1 - confidence, 0]
        elif sentiment == "NEUTRAL":
            probabilities = [0, confidence, 1 - confidence]
        else:
            probabilities = [0, 1 - confidence, confidence]

    elif model_type == 'emotion':
        emotions = ['ANGER', 'DISGUST', 'FEAR', 'JOY', 'NEUTRAL', 'SADNESS', 'SURPRISE']
        emotion_probs = [0] * len(emotions)
        # Depending on the transformers version, an all-scores pipeline may return a flat
        # list of label dicts or nest it one level deeper; handle both shapes.
        label_scores = results[0] if isinstance(results[0], list) else results
        for res in label_scores:
            emotion_idx = emotions.index(res['label'].upper())
            emotion_probs[emotion_idx] = res['score']
        probabilities = emotion_probs
        # Report the highest-scoring emotion as the overall sentiment
        sentiment = max(label_scores, key=lambda res: res['score'])['label'].upper()

    else:
        sentiment = results[0]['label'].upper()
        confidence = results[0]['score']
        probabilities = {
            'NEGATIVE': [confidence, 1 - confidence, 0],
            'POSITIVE': [0, 1 - confidence, confidence]
        }.get(sentiment, [0.3, 0.4, 0.3])

    return sentiment, probabilities
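
# Illustrative calls to analyze_sentiment (a sketch, not part of the original module;
# the example sentences are assumptions):
#
#   sentiment, probs = analyze_sentiment("I love this product!", model_type="roberta")
#   # sentiment -> "POSITIVE" / "NEUTRAL" / "NEGATIVE" after the LABEL_* mapping
#   # probs     -> three values ordered [negative, neutral, positive]
#
#   sentiment, probs = analyze_sentiment("That trailer gave me chills", model_type="emotion")
#   # probs     -> seven values in the order of the `emotions` list inside the function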

def read_file(file):
    """
    Reads the uploaded file and returns its content as a string.
    Supports .txt and .csv file formats.

    Arguments:
    - file: The uploaded file.

    Returns:
    - Content of the file as a single string.
    """
    
    # Compare extensions case-insensitively so ".TXT" and ".CSV" are also accepted
    filename = file.filename.lower()
    if filename.endswith('.txt'):
        return file.read().decode('utf-8')
    elif filename.endswith('.csv'):
        reader = csv.reader(file.read().decode('utf-8').splitlines())
        return ' '.join([' '.join(row) for row in reader])
    else:
        return None
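

# Minimal smoke test, assuming this module is run directly rather than imported by the
# web app. The sample text is illustrative only and not part of the original code.
if __name__ == "__main__":
    sample = "The new update is fantastic, although the installer crashed once."

    cleaned, removed, normalized, tokens, stems, lemmas, ner, pos = preprocess_text(sample)
    print("Cleaned text:", cleaned)
    print("Tokens:", tokens)
    print("Named entities:", ner)

    for model_name in models:
        sentiment, probabilities = analyze_sentiment(sample, model_type=model_name)
        print(f"{model_name}: {sentiment} {probabilities}")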