Spaces:

ashhadahsan
/

summarizer-space

Runtime error

App Files Files Community

summarizer-space / app.py

ashhadahsan

Update app.py

58f1444 over 2 years ago

raw

history blame

13.8 kB

	import streamlit as st
	import pandas as pd
	from transformers import pipeline
	from stqdm import stqdm
	from simplet5 import SimpleT5
	from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
	from transformers import BertTokenizer, TFBertForSequenceClassification
	from tensorflow.keras.models import load_model
	from tensorflow.nn import softmax
	import numpy as np
	from datetime import datetime
	import logging
	from constants import sub_themes_dict

	date = datetime.now().strftime(r"%Y-%m-%d")
	model_classes = {
	0: "Ads",
	1: "Apps",
	2: "Battery",
	3: "Charging",
	4: "Delivery",
	5: "Display",
	6: "FOS",
	7: "HW",
	8: "Order",
	9: "Refurb",
	10: "SD",
	11: "Setup",
	12: "Unknown",
	13: "WiFi",
	}


	@st.cache(allow_output_mutation=True, suppress_st_warning=True)
	# @st.cache_resource
	def load_t5():
	model = AutoModelForSeq2SeqLM.from_pretrained("t5-base")

	tokenizer = AutoTokenizer.from_pretrained("t5-base")
	return model, tokenizer


	@st.cache(allow_output_mutation=True, suppress_st_warning=True)
	# @st.cache_resource
	def custom_model():
	return pipeline("summarization", model="my_awesome_sum/")


	@st.cache(allow_output_mutation=True, suppress_st_warning=True)
	# @st.cache_resource
	def convert_df(df):
	# IMPORTANT: Cache the conversion to prevent computation on every rerun
	return df.to_csv(index=False).encode("utf-8")


	@st.cache(allow_output_mutation=True, suppress_st_warning=True)
	# @st.cache_resource
	def load_one_line_summarizer(model):
	return model.load_model("t5", "snrspeaks/t5-one-line-summary")


	@st.cache(allow_output_mutation=True, suppress_st_warning=True)
	# @st.cache_resource
	def classify_category():
	tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
	new_model = load_model("model")
	return tokenizer, new_model


	@st.cache(allow_output_mutation=True, suppress_st_warning=True)
	# @st.cache_resource
	def classify_sub_theme():
	tokenizer = BertTokenizer.from_pretrained(
	"ashhadahsan/amazon-subtheme-bert-base-finetuned"
	)
	new_model = TFBertForSequenceClassification.from_pretrained(
	"ashhadahsan/amazon-subtheme-bert-base-finetuned"
	)
	return tokenizer, new_model


	st.set_page_config(layout="wide", page_title="Amazon Review Summarizer")
	st.title("Amazon Review Summarizer")

	uploaded_file = st.file_uploader("Choose a file", type=["xlsx", "xls", "csv"])
	summarizer_option = st.selectbox(
	"Select Summarizer",
	("Custom trained on the dataset", "t5-base", "t5-one-line-summary"),
	)
	col1, col2, col3 = st.columns([1, 1, 1])

	with col1:
	summary_yes = st.checkbox("Summrization", value=False)

	with col2:
	classification = st.checkbox("Classify Category", value=True)

	with col3:
	sub_theme = st.checkbox("Sub theme classification", value=True)

	ps = st.empty()

	if st.button("Process", type="primary"):
	cancel_button = st.empty()
	cancel_button2 = st.empty()
	cancel_button3 = st.empty()
	if uploaded_file is not None:
	if uploaded_file.name.split(".")[-1] in ["xls", "xlsx"]:
	df = pd.read_excel(uploaded_file, engine="openpyxl")
	if uploaded_file.name.split(".")[-1] in [".csv"]:
	df = pd.read_csv(uploaded_file)
	columns = df.columns.values.tolist()
	columns = [x.lower() for x in columns]
	df.columns = columns
	print(summarizer_option)
	output = pd.DataFrame()
	try:
	text = df["text"].values.tolist()
	output["text"] = text
	if summarizer_option == "Custom trained on the dataset":
	if summary_yes:
	model = custom_model()

	progress_text = "Summarization in progress. Please wait."
	summary = []

	for x in stqdm(range(len(text))):
	if cancel_button.button("Cancel", key=x):
	del model
	break
	try:
	summary.append(
	model(
	f"summarize: {text[x]}",
	max_length=50,
	early_stopping=True,
	)[0]["summary_text"]
	)
	except:
	pass
	output["summary"] = summary
	del model
	if classification:
	classification_token, classification_model = classify_category()
	tf_batch = classification_token(
	text,
	max_length=128,
	padding=True,
	truncation=True,
	return_tensors="tf",
	)
	with st.spinner(text="identifying theme"):
	tf_outputs = classification_model(tf_batch)
	classes = []
	with st.spinner(text="creating output file"):
	for x in stqdm(range(len(text))):
	tf_o = softmax(tf_outputs["logits"][x], axis=-1)
	label = np.argmax(tf_o, axis=0)
	keys = model_classes
	classes.append(keys.get(label))
	output["category"] = classes
	del classification_token, classification_model
	if sub_theme:
	classification_token, classification_model = classify_sub_theme()
	tf_batch = classification_token(
	text,
	max_length=128,
	padding=True,
	truncation=True,
	return_tensors="tf",
	)
	with st.spinner(text="identifying sub theme"):
	tf_outputs = classification_model(tf_batch)
	classes = []
	with st.spinner(text="creating output file"):
	for x in stqdm(range(len(text))):
	tf_o = softmax(tf_outputs["logits"][x], axis=-1)
	label = np.argmax(tf_o, axis=0)
	keys = sub_themes_dict
	classes.append(keys.get(label))
	output["sub theme"] = classes
	del classification_token, classification_model

	csv = convert_df(output)
	st.download_button(
	label="Download data as CSV",
	data=csv,
	file_name=f"{summarizer_option}_{date}_df.csv",
	mime="text/csv",
	)
	if summarizer_option == "t5-base":
	if summary_yes:
	model, tokenizer = load_t5()
	summary = []
	for x in stqdm(range(len(text))):
	if cancel_button2.button("Cancel", key=x):
	del model, tokenizer
	break
	tokens_input = tokenizer.encode(
	"summarize: " + text[x],
	return_tensors="pt",
	max_length=tokenizer.model_max_length,
	truncation=True,
	)
	summary_ids = model.generate(
	tokens_input,
	min_length=80,
	max_length=150,
	length_penalty=20,
	num_beams=2,
	)
	summary_gen = tokenizer.decode(
	summary_ids[0], skip_special_tokens=True
	)
	summary.append(summary_gen)
	del model, tokenizer
	output["summary"] = summary

	if classification:
	classification_token, classification_model = classify_category()
	tf_batch = classification_token(
	text,
	max_length=128,
	padding=True,
	truncation=True,
	return_tensors="tf",
	)
	with st.spinner(text="identifying theme"):
	tf_outputs = classification_model(tf_batch)
	classes = []
	with st.spinner(text="creating output file"):
	for x in stqdm(range(len(text))):
	tf_o = softmax(tf_outputs["logits"][x], axis=-1)
	label = np.argmax(tf_o, axis=0)
	keys = model_classes
	classes.append(keys.get(label))
	output["category"] = classes
	del classification_token, classification_model
	if sub_theme:
	classification_token, classification_model = classify_sub_theme()
	tf_batch = classification_token(
	text,
	max_length=128,
	padding=True,
	truncation=True,
	return_tensors="tf",
	)
	with st.spinner(text="identifying sub theme"):
	tf_outputs = classification_model(tf_batch)
	classes = []
	with st.spinner(text="creating output file"):
	for x in stqdm(range(len(text))):
	tf_o = softmax(tf_outputs["logits"][x], axis=-1)
	label = np.argmax(tf_o, axis=0)
	keys = sub_themes_dict
	classes.append(keys.get(label))
	output["sub theme"] = classes
	del classification_token, classification_model
	csv = convert_df(output)
	st.download_button(
	label="Download data as CSV",
	data=csv,
	file_name=f"{summarizer_option}_{date}_df.csv",
	mime="text/csv",
	)

	if summarizer_option == "t5-one-line-summary":
	if summary_yes:
	model = SimpleT5()
	load_one_line_summarizer(model=model)

	summary = []
	for x in stqdm(range(len(text))):
	if cancel_button3.button("Cancel", key=x):
	del model
	break
	try:
	summary.append(model.predict(text[x])[0])
	except:
	pass
	output["summary"] = summary
	del model

	if classification:
	classification_token, classification_model = classify_category()
	tf_batch = classification_token(
	text,
	max_length=128,
	padding=True,
	truncation=True,
	return_tensors="tf",
	)
	with st.spinner(text="identifying theme"):
	tf_outputs = classification_model(tf_batch)
	classes = []
	with st.spinner(text="creating output file"):
	for x in stqdm(range(len(text))):
	tf_o = softmax(tf_outputs["logits"][x], axis=-1)
	label = np.argmax(tf_o, axis=0)
	keys = model_classes
	classes.append(keys.get(label))
	output["category"] = classes
	del classification_token, classification_model
	if sub_theme:
	classification_token, classification_model = classify_sub_theme()
	tf_batch = classification_token(
	text,
	max_length=128,
	padding=True,
	truncation=True,
	return_tensors="tf",
	)
	with st.spinner(text="identifying sub theme"):
	tf_outputs = classification_model(tf_batch)
	classes = []
	with st.spinner(text="creating output file"):
	for x in stqdm(range(len(text))):
	tf_o = softmax(tf_outputs["logits"][x], axis=-1)
	label = np.argmax(tf_o, axis=0)
	keys = sub_themes_dict
	classes.append(keys.get(label))
	output["sub theme"] = classes
	del classification_token, classification_model

	csv = convert_df(output)
	st.download_button(
	label="Download data as CSV",
	data=csv,
	file_name=f"{summarizer_option}_{date}_df.csv",
	mime="text/csv",
	)

	except KeyError:
	st.error(
	"Please Make sure that your data must have a column named text",
	icon="🚨",
	)
	st.info("Text column must have amazon reviews", icon="ℹ️")
	except BaseException as e:
	logging.exception("An exception was occurred")