Spaces:
Sleeping
Sleeping
| import pickle | |
| import numpy as np | |
| import pandas as pd | |
| import nltk | |
| from nltk.stem import * | |
| nltk.download("punkt_tab") | |
| FILE_PATH = "/app/src/ressources/technologies_database.xlsx" | |
def set_prompt(problem):
    """Build the constraint-extraction prompt for a technical problem.

    The fixed task instruction is prepended verbatim to *problem*.
    """
    header = (
        "Task : Find all the constraints in this technical problem making sure each are premised on the problem only.\n"
        "Take into account different technical domains to encompass the whole problem.\n"
        'Output each constraints in a json such as : ({"title of the constraints1":"description1","title of the constraintsN":"descriptionN"})\n'
        "Technical problem :\n"
    )
    return header + problem
def load_technologies_excel():
    """Read the technologies spreadsheet at FILE_PATH into a DataFrame."""
    return pd.read_excel(FILE_PATH)
def load_technologies():
    """Load the pre-computed technology list and embeddings from disk.

    Returns the (global_tech, global_tech_embeddings) pair on success.
    On any failure the error is printed and the function falls through,
    implicitly returning None (best-effort load, kept as-is).
    """
    embeddings_path = '/app/src/ressources/global_tech_embeddings.pkl'
    try:
        with open(embeddings_path, 'rb') as fh:
            payload = pickle.load(fh)
        return payload['global_tech'], payload['global_tech_embeddings']
    except Exception as e:
        print(f"Error: {e}")
def tech_to_dict(technologies):
    """Parse raw technology description strings into a list of dicts.

    Each accepted entry is split on newlines, its first and last lines are
    discarded, and the five remaining "key: value" lines populate the dict
    (plus the entry's positional "id").

    NOTE(review): the original guard `not tech.find("<title>") > 1` also
    accepts strings that do not contain "<title>" at all (find() == -1);
    that behavior is preserved here — confirm it is intended.
    """
    parsed = []
    field_names = ("title", "purpose", "key_components", "advantages", "limitations")
    for idx, raw in enumerate(technologies):
        if raw.find("<title>") > 1:
            continue
        lines = raw.split("\n")
        del lines[0]
        del lines[-1]
        entry = {
            name: lines[i][lines[i].find(": ") + 2:]
            for i, name in enumerate(field_names)
        }
        entry["id"] = idx
        parsed.append(entry)
    return parsed
def save_dataframe(df, title):
    """Write *df* to an Excel file named *title* and return that name."""
    frame = pd.DataFrame(df)
    frame.to_excel(title)
    return title
def stem(data, data_type):
    """Stem the text fields of *data* with an English Snowball stemmer.

    Parameters:
        data: when data_type == "technologies", a list of dicts carrying
            "title"/"purpose"/"key_components"/"advantages"/"limitations"/"id";
            otherwise a mapping of title -> description.
        data_type: selects which of the two input layouts is expected.

    Returns:
        A list of dicts with the stemmed text; "id" is carried over
        unchanged for technologies.
    """
    stemmer = SnowballStemmer("english")
    processed_data = []
    if data_type == "technologies":
        for t_item in data:
            processed_data.append({
                "title": stemmer.stem(t_item["title"]),
                "purpose": stemmer.stem(t_item["purpose"]),
                "key_components": stemmer.stem(t_item["key_components"]),
                "advantages": stemmer.stem(t_item["advantages"]),
                "limitations": stemmer.stem(t_item["limitations"]),
                "id": t_item["id"]
            })
    else:
        # Fix: removed leftover debug `print(t_item)` that flooded stdout
        # with every key of the input mapping.
        for t_item in data:
            processed_data.append({
                "title": stemmer.stem(t_item),
                "description": stemmer.stem(data[t_item])
            })
    return processed_data
def get_technologies_by_id(id_list, technologies):
    """Return the technology dicts whose 'id' appears in *id_list*.

    Order of *technologies* is preserved; ids are matched via a set for
    O(1) membership tests.
    """
    wanted = set(id_list)
    return [tech for tech in technologies if tech.get('id') in wanted]
def save_to_pickle(result_similarites):
    """Build a (constraint x id2) similarity matrix and pickle it with labels.

    Parameters:
        result_similarites: non-empty list of dicts with keys
            'constraint' (a dict carrying a 'title'),
            'id2' (1-based integer column id), and
            'similarity' (a numeric scalar — plain float or array scalar).

    Returns:
        The name of the pickle file written to the current directory;
        the file holds {'matrix', 'row_labels', 'col_labels'}.

    Raises:
        ValueError: if result_similarites is empty (max() of no ids).
    """
    constraint_titles = sorted({item['constraint']['title'] for item in result_similarites})
    max_id2 = max(item['id2'] for item in result_similarites)
    row_label_to_index = {title: i for i, title in enumerate(constraint_titles)}
    col_labels = list(range(1, max_id2 + 1))
    num_rows = len(constraint_titles)
    num_cols = max_id2
    # NaN marks (row, col) pairs that received no similarity entry.
    matrix = np.full((num_rows, num_cols), np.nan, dtype=np.float32)
    for item in result_similarites:
        row_idx = row_label_to_index[item['constraint']['title']]
        col_idx = item['id2'] - 1  # id2 is 1-based; matrix columns are 0-based
        # Fix: float() accepts plain Python numbers as well as NumPy/torch
        # scalars; the previous `.item()` crashed (AttributeError) whenever
        # 'similarity' was already a plain float.
        matrix[row_idx, col_idx] = float(item['similarity'])
    print(f"Successfully created matrix with shape: {matrix.shape}")
    print(f"Number of rows (unique constraints): {num_rows}")
    print(f"Number of columns (max id2): {num_cols}")
    print("\nExample 5x5 block of the created matrix (NaN for missing values):")
    print(matrix[:5, :5])
    output_filename = "cosine_similarity_matrix_with_labels.pkl"
    data_to_save = {
        'matrix': matrix,
        'row_labels': constraint_titles,
        'col_labels': col_labels
    }
    with open(output_filename, 'wb') as f:
        pickle.dump(data_to_save, f)
    print(f"\nMatrix and labels saved to {output_filename}")
    return output_filename