Spaces:
Sleeping
Sleeping
| import pickle | |
| import numpy as np | |
| import pandas as pd | |
| import nltk | |
| from nltk.stem import * | |
| nltk.download("punkt_tab") | |
| FILE_PATH = "/app/src/ressources/technologies_database.xlsx" | |
def set_prompt(problem):
    """Build the constraint-extraction prompt for a technical problem.

    The fixed task instruction is prepended verbatim to *problem*.
    """
    header = (
        "Task : Find all the constraints in this technical problem making sure each are premised on the problem only.\n"
        "Take into account different technical domains to encompass the whole problem.\n"
        'Output each constraints in a json such as : ({"title of the constraints1":"description1","title of the constraintsN":"descriptionN"})\n'
        "Technical problem :\n"
    )
    return header + problem
def load_technologies_excel():
    """Read the technologies spreadsheet at FILE_PATH into a DataFrame."""
    return pd.read_excel(FILE_PATH)
def load_technologies():
    """Load the pre-computed technology list and embeddings from disk.

    Returns the (global_tech, global_tech_embeddings) pair on success.
    On any failure the error is printed and the function falls through,
    implicitly returning None (best-effort load, kept as-is).
    """
    embeddings_path = '/app/src/ressources/global_tech_embeddings.pkl'
    try:
        with open(embeddings_path, 'rb') as fh:
            payload = pickle.load(fh)
        return payload['global_tech'], payload['global_tech_embeddings']
    except Exception as e:
        print(f"Error: {e}")
def tech_to_dict(technologies):
    """Parse raw technology description strings into a list of dicts.

    Each accepted entry is split on newlines, its first and last lines are
    discarded, and the five remaining "key: value" lines populate the dict
    (plus the entry's positional "id").

    NOTE(review): the original guard `not tech.find("<title>") > 1` also
    accepts strings that do not contain "<title>" at all (find() == -1);
    that behavior is preserved here — confirm it is intended.
    """
    parsed = []
    field_names = ("title", "purpose", "key_components", "advantages", "limitations")
    for idx, raw in enumerate(technologies):
        if raw.find("<title>") > 1:
            continue
        lines = raw.split("\n")
        del lines[0]
        del lines[-1]
        entry = {
            name: lines[i][lines[i].find(": ") + 2:]
            for i, name in enumerate(field_names)
        }
        entry["id"] = idx
        parsed.append(entry)
    return parsed
def save_dataframe(df, title):
    """Write *df* to an Excel file named *title* and return that name."""
    frame = pd.DataFrame(df)
    frame.to_excel(title)
    return title
def stem(data, data_type):
    """Stem the text fields of *data* with an English Snowball stemmer.

    Parameters:
        data: when data_type == "technologies", a list of dicts carrying
            "title"/"purpose"/"key_components"/"advantages"/"limitations"/"id";
            otherwise a mapping of title -> description.
        data_type: selects which of the two input layouts is expected.

    Returns:
        A list of dicts with the stemmed text; "id" is carried over
        unchanged for technologies.
    """
    stemmer = SnowballStemmer("english")
    processed_data = []
    if data_type == "technologies":
        for t_item in data:
            processed_data.append({
                "title": stemmer.stem(t_item["title"]),
                "purpose": stemmer.stem(t_item["purpose"]),
                "key_components": stemmer.stem(t_item["key_components"]),
                "advantages": stemmer.stem(t_item["advantages"]),
                "limitations": stemmer.stem(t_item["limitations"]),
                "id": t_item["id"]
            })
    else:
        # Fix: removed leftover debug `print(t_item)` that flooded stdout
        # with every key of the input mapping.
        for t_item in data:
            processed_data.append({
                "title": stemmer.stem(t_item),
                "description": stemmer.stem(data[t_item])
            })
    return processed_data
def get_technologies_by_id(id_list, technologies):
    """Return the technology dicts whose 'id' appears in *id_list*.

    Order of *technologies* is preserved; ids are matched via a set for
    O(1) membership tests.
    """
    wanted = set(id_list)
    return [tech for tech in technologies if tech.get('id') in wanted]
def save_to_pickle(result_similarites):
    """Build a (constraint x id2) similarity matrix and pickle it with labels.

    Parameters:
        result_similarites: non-empty list of dicts with keys
            'constraint' (a dict carrying a 'title'),
            'id2' (1-based integer column id), and
            'similarity' (a numeric scalar — plain float or array scalar).

    Returns:
        The name of the pickle file written to the current directory;
        the file holds {'matrix', 'row_labels', 'col_labels'}.

    Raises:
        ValueError: if result_similarites is empty (max() of no ids).
    """
    constraint_titles = sorted({item['constraint']['title'] for item in result_similarites})
    max_id2 = max(item['id2'] for item in result_similarites)
    row_label_to_index = {title: i for i, title in enumerate(constraint_titles)}
    col_labels = list(range(1, max_id2 + 1))
    num_rows = len(constraint_titles)
    num_cols = max_id2
    # NaN marks (row, col) pairs that received no similarity entry.
    matrix = np.full((num_rows, num_cols), np.nan, dtype=np.float32)
    for item in result_similarites:
        row_idx = row_label_to_index[item['constraint']['title']]
        col_idx = item['id2'] - 1  # id2 is 1-based; matrix columns are 0-based
        # Fix: float() accepts plain Python numbers as well as NumPy/torch
        # scalars; the previous `.item()` crashed (AttributeError) whenever
        # 'similarity' was already a plain float.
        matrix[row_idx, col_idx] = float(item['similarity'])
    print(f"Successfully created matrix with shape: {matrix.shape}")
    print(f"Number of rows (unique constraints): {num_rows}")
    print(f"Number of columns (max id2): {num_cols}")
    print("\nExample 5x5 block of the created matrix (NaN for missing values):")
    print(matrix[:5, :5])
    output_filename = "cosine_similarity_matrix_with_labels.pkl"
    data_to_save = {
        'matrix': matrix,
        'row_labels': constraint_titles,
        'col_labels': col_labels
    }
    with open(output_filename, 'wb') as f:
        pickle.dump(data_to_save, f)
    print(f"\nMatrix and labels saved to {output_filename}")
    return output_filename