Delete app
- app/__init__.py +0 -0
- app/core.py +0 -17
- app/services/processor.py +0 -217
- app/services/technologies_database.xlsx +0 -3
- app/services/utils.py +0 -110
app/__init__.py
DELETED
File without changes
app/core.py
DELETED
@@ -1,17 +0,0 @@
-from app.services.utils import *
-from app.services.processor import *
-
-def process_input(data):
-    prompt = set_prompt(data)
-    constraints = retrieve_constraints(prompt)
-    constraints_stemmed = stem(constraints, "constraints")
-    save_dataframe(constraints_stemmed, "constraints_stemmed.xlsx")
-    df = load_technologies()
-    global_tech, keys, original_tech = preprocess_tech_data(df)
-    save_dataframe(global_tech, "global_tech.xlsx")
-    result_similarities, matrix = get_contrastive_similarities(global_tech, constraints_stemmed)
-    save_to_pickle(result_similarities)
-    best_combinations = find_best_list_combinations(constraints_stemmed, global_tech, matrix)
-    best_technologies_id = select_technologies(best_combinations)
-    best_technologies = get_technologies_by_id(best_technologies_id, global_tech)
-    return best_technologies
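
For anyone rebuilding the Space from this commit, the whole pipeline was driven through process_input. A minimal usage sketch follows; the problem text is hypothetical, and the import assumes the deleted modules are restored:

from app.core import process_input

# set_prompt() reads the problem statement from the 'problem' key.
data = {"problem": "Design a low-power wireless sensor node for harsh environments."}
best_technologies = process_input(data)  # technology dicts with 'id', 'title', 'purpose', ...
for tech in best_technologies:
    print(tech["id"], tech["title"])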
app/services/processor.py
DELETED
@@ -1,217 +0,0 @@
-from app.services.utils import tech_to_dict, stem
-import requests as r
-import json
-import nltk
-import itertools
-import numpy as np
-
-from sentence_transformers import *
-model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
-
-def retrieve_constraints(prompt):
-    request_input = {"models": ["meta-llama/llama-4-scout-17b-16e-instruct"], "messages": [{"role": "user", "content": prompt}]}
-    response = r.post("https://organizedprogrammers-bettergroqinterface.hf.space/chat", json=request_input)
-
-    decoded_content = json.loads(response.content.decode())
-    llm_response = decoded_content["content"][0]["message"]["content"]
-
-    # Extract the first {...} block from the LLM reply and re-parse it as JSON.
-    start_marker = '{'
-    end_marker = '}'
-    start_index = llm_response.find(start_marker) + len(start_marker)
-    end_index = llm_response.find(end_marker, start_index)
-    json_str = llm_response[start_index:end_index].strip()
-
-    constraints_json = json.loads("{" + json_str + "}")
-
-    return constraints_json
-
-
-def preprocess_tech_data(_df):
-    if _df is None or "description" not in _df.columns:
-        return [], []
-
-    technologies_list = _df["description"].to_list()
-    tech_dict_raw = tech_to_dict(technologies_list)
-
-    tech_dict_filtered = [
-        t for t in tech_dict_raw if (
-            len(t.get("title", "")) >= 5 and
-            len(t.get("advantages", "")) >= 5 and
-            len(t.get("key_components", "")) >= 5
-        )
-    ]
-
-    if not tech_dict_filtered:
-        return [], []
-
-    processed_tech_wt = stem(tech_dict_filtered, "technologies")
-
-    for t_item_wt in processed_tech_wt:
-        kc = t_item_wt.get("key_components")
-        if isinstance(kc, str):
-            t_item_wt["key_components"] = ''.join(nltk.sent_tokenize(kc))
-        else:
-            t_item_wt["key_components"] = ""
-
-    original_tech_for_display = tech_dict_filtered[:len(processed_tech_wt)]
-
-    _keys = list(processed_tech_wt[0].keys()) if processed_tech_wt else []
-    return processed_tech_wt, _keys, original_tech_for_display
-
-
-def remove_over_repeated_technologies(result):
-    total_lists = len(result)
-    tech_title = {}
-
-    for idx, item in enumerate(result):
-        for tech in item['technologies']:
-            tech_title[tech[0]['title']] = 0 if tech[0]['title'] not in tech_title else tech_title[tech[0]['title']] + 1
-
-    threshold = total_lists * 0.3
-    print(threshold)
-    print(tech_title)
-    to_delete = []
-    for tech, lists in tech_title.items():
-        if lists > threshold:
-            print("This technology has been found over-repeated: " + tech)
-            to_delete.append(tech)
-
-    for idx, item in enumerate(result):
-        result[idx]['technologies'] = [tech for tech in item['technologies'] if tech[0]['title'] not in to_delete]
-
-    return result
-
-def get_contrastive_similarities(global_tech, constraints):
-    selected_pairs = []
-    matrix = []
-
-    for i, constraint in enumerate(constraints):
-        print(constraint)
-        for j, tech2 in enumerate(global_tech):
-            if i >= j:
-                continue
-
-            purpose_sim = model.similarity(model.encode(constraint["description"]), model.encode(tech2["purpose"]))
-
-            print(f"Constraint: {constraint}, Tech 2: {tech2['title']}")
-            print(f"Purpose Similarity: {purpose_sim}")
-            selected_pairs.append({
-                "constraint": constraint,
-                "id2": tech2["id"],
-                "similarity": purpose_sim
-            })
-            if purpose_sim == np.float32(None):
-                purpose_sim = 0.0
-            matrix.append(purpose_sim)
-
-    return selected_pairs, matrix
-
-
-def find_best_list_combinations(list1: list[str], list2: list[str], matrix) -> list[dict]:
-    if not list1 or not list2:
-        print("Warning: One or both input lists are empty. Returning an empty list.")
-        return []
-
-    MIN_SIMILARITY = 0.3
-    MAX_SIMILARITY = 0.8
-
-    possible_matches_for_each_l1 = []
-    for i in range(len(list1)):
-        valid_matches_for_l1_element = []
-        for j in range(len(list2)):
-            score = matrix[i, j]
-
-            if MIN_SIMILARITY <= score <= MAX_SIMILARITY:
-                valid_matches_for_l1_element.append((list2[j], score))
-
-        if not valid_matches_for_l1_element:
-            print(f"No valid matches found in list2 for '{list1[i]}' from list1 "
-                  f"(score between {MIN_SIMILARITY} and {MAX_SIMILARITY}). "
-                  "Returning an empty list as no complete combinations can be formed.")
-        else:
-            possible_matches_for_each_l1.append((valid_matches_for_l1_element, list1[i]))
-
-    result = []
-    for tech_list, problem in possible_matches_for_each_l1:
-        sorted_list = sorted(
-            tech_list,
-            key=lambda x: x[1].item() if hasattr(x[1], 'item') else float(x[1]),
-            reverse=True
-        )
-        top5 = sorted_list[:5]
-        result.append({
-            'technologies': top5,
-            'problem': problem
-        })
-
-    result = remove_over_repeated_technologies(result)
-    return result
-
-
-def select_technologies(problem_technology_list):
-    distinct_techs = set()
-    candidate_map = []
-
-    for problem_data in problem_technology_list:
-        cand_dict = {}
-        for tech_info, sim in problem_data['technologies']:
-            tech_id = tech_info['id']
-            distinct_techs.add(tech_id)
-            cand_dict[tech_id] = float(sim)
-        candidate_map.append(cand_dict)
-
-    distinct_techs = sorted(list(distinct_techs))
-    n = len(problem_technology_list)
-
-    if n == 0:
-        return set()
-
-    min_k = None
-    best_set = None
-    best_avg = -1
-
-    print(f"Distinct technologies: {distinct_techs}")
-    print(f"Candidate map: {candidate_map}")
-    print(f"Number of problems: {n}")
-
-    # Smallest set of technologies covering every problem; ties broken by average similarity.
-    for k in range(1, len(distinct_techs) + 1):
-        if min_k is not None and k > min_k:
-            break
-
-        for T in itertools.combinations(distinct_techs, k):
-            total_sim = 0.0
-            covered = True
-            print(f"Trying combination: {T}")
-            for i in range(n):
-                max_sim = -1.0
-                found = False
-                for tech in T:
-                    if tech in candidate_map[i]:
-                        found = True
-                        sim_val = candidate_map[i][tech]
-                        if sim_val > max_sim:
-                            max_sim = sim_val
-                if not found:
-                    covered = False
-                    break
-                else:
-                    total_sim += max_sim
-
-            if covered:
-                avg_sim = total_sim / n
-                if min_k is None or k < min_k:
-                    min_k = k
-                    best_set = T
-                    best_avg = avg_sim
-                elif k == min_k and avg_sim > best_avg:
-                    best_set = T
-                    best_avg = avg_sim
-
-        if min_k is not None and k == min_k:
-            break
-
-    if best_set is None:
-        return set()
-    return set(best_set)
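
Note on the deleted processor.py, for anyone restoring it: it carries three latent bugs. The early exits in preprocess_tech_data return two values while process_input unpacks three, so a missing or empty spreadsheet raises a ValueError; they would need to be return [], [], []. The pair loop in get_contrastive_similarities skips every pair with i >= j, a triangular-matrix guard that makes no sense when the two lists differ, and it returns matrix as a flat Python list even though find_best_list_combinations indexes it as matrix[i, j], which requires a 2-D NumPy array. Finally, purpose_sim == np.float32(None) compares against NaN and is never true, so the 0.0 fallback is dead code. A possible repair of the scoring loop, sketched on the assumption that model is the SentenceTransformer defined at the top of the file:

import numpy as np

def get_contrastive_similarities(global_tech, constraints):
    # Score every (constraint, technology) pair into a dense 2-D matrix.
    selected_pairs = []
    matrix = np.zeros((len(constraints), len(global_tech)), dtype=np.float32)

    for i, constraint in enumerate(constraints):
        for j, tech2 in enumerate(global_tech):
            # model.similarity returns a 1x1 tensor for two single embeddings.
            sim = model.similarity(
                model.encode(constraint["description"]),
                model.encode(tech2["purpose"]),
            ).item()
            if np.isnan(sim):  # effective NaN guard, unlike `== np.float32(None)`
                sim = 0.0
            selected_pairs.append({
                "constraint": constraint,
                "id2": tech2["id"],
                "similarity": sim,
            })
            matrix[i, j] = sim

    return selected_pairs, matrix

Two caveats: save_to_pickle calls item['similarity'].item(), so with plain floats as above that line would become float(item['similarity']); and pre-encoding each list once with model.encode([...]) and passing the two batches to a single model.similarity call would avoid re-encoding every pair.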
app/services/technologies_database.xlsx
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:370d7a151085850b5fb7a6f9de41313e83686e4da434b6e8be94da38838c1ef7
-size 213138
app/services/utils.py
DELETED
@@ -1,110 +0,0 @@
-import pickle
-import numpy as np
-import pandas as pd
-
-import nltk
-from nltk.stem import *
-nltk.download("punkt_tab")
-
-
-def set_prompt(InputData):
-    prompt = """Task: Find all the constraints in this technical problem, making sure each is premised on the problem only.
-Take into account different technical domains to encompass the whole problem.
-Output each constraint in a JSON object such as: ({"title of the constraints1":"description1","title of the constraintsN":"descriptionN"})
-Technical problem:
-""" + InputData['problem']
-    return prompt
-
-def load_technologies():
-    df = pd.read_excel('technologies_database.xlsx')
-    return df
-
-def tech_to_dict(technologies):
-    tech_dict = []
-    for index, tech in enumerate(technologies):
-        # True when "<title>" is absent (find() returns -1) or sits within the first two characters.
-        if not tech.find("<title>") > 1:
-            tab = tech.split("\n")
-            tab.pop(0)
-            tab.pop(len(tab)-1)
-            tech_dict.append({"title": tab[0][tab[0].find(": ")+2:],
-                              "purpose": tab[1][tab[1].find(": ")+2:],
-                              "key_components": tab[2][tab[2].find(": ")+2:],
-                              "advantages": tab[3][tab[3].find(": ")+2:],
-                              "limitations": tab[4][tab[4].find(": ")+2:],
-                              "id": index})
-    return tech_dict
-
-def save_dataframe(df, title):
-    pd.DataFrame(df).to_excel(title)
-    return title
-
-def stem(data, data_type):
-    stemmer = SnowballStemmer("english")
-    processed_data = []
-    if data_type == "technologies":
-        for t_item in data:
-            processed_data.append({
-                "title": stemmer.stem(t_item["title"]),
-                "purpose": stemmer.stem(t_item["purpose"]),
-                "key_components": stemmer.stem(t_item["key_components"]),
-                "advantages": stemmer.stem(t_item["advantages"]),
-                "limitations": stemmer.stem(t_item["limitations"]),
-                "id": t_item["id"]
-            })
-    else:
-        for t_item in data:
-            print(t_item)
-            processed_data.append({
-                "title": stemmer.stem(t_item),
-                "description": stemmer.stem(data[t_item])
-            })
-
-    return processed_data
-
-
-def get_technologies_by_id(id_list, technologies):
-    result = []
-    id_set = set(id_list)
-    for tech in technologies:
-        if tech.get('id') in id_set:
-            result.append(tech)
-    return result
-
-def save_to_pickle(result_similarities):
-    constraint_titles = sorted(list(set([item['constraint']['title'] for item in result_similarities])))
-    max_id2 = max([item['id2'] for item in result_similarities])
-
-    row_label_to_index = {title: i for i, title in enumerate(constraint_titles)}
-    col_labels = list(range(1, max_id2 + 1))
-
-    num_rows = len(constraint_titles)
-    num_cols = max_id2
-
-    matrix = np.full((num_rows, num_cols), np.nan, dtype=np.float32)
-
-    for item in result_similarities:
-        row_idx = row_label_to_index[item['constraint']['title']]
-        col_idx = item['id2'] - 1  # id2 is 1-based; columns are 0-based
-        similarity_value = item['similarity'].item()
-
-        matrix[row_idx, col_idx] = similarity_value
-
-    print(f"Successfully created matrix with shape: {matrix.shape}")
-    print(f"Number of rows (unique constraints): {num_rows}")
-    print(f"Number of columns (max id2): {num_cols}")
-    print("\nExample 5x5 block of the created matrix (NaN for missing values):")
-    print(matrix[:5, :5])
-
-    output_filename = "cosine_similarity_matrix_with_labels.pkl"
-    data_to_save = {
-        'matrix': matrix,
-        'row_labels': constraint_titles,
-        'col_labels': col_labels
-    }
-
-    with open(output_filename, 'wb') as f:
-        pickle.dump(data_to_save, f)
-
-    print(f"\nMatrix and labels saved to {output_filename}")
-    return output_filename
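
The pickle written by save_to_pickle can be consumed with the same structure it stores; a small reader sketch (filename as hard-coded above):

import pickle

with open("cosine_similarity_matrix_with_labels.pkl", "rb") as f:
    data = pickle.load(f)

matrix = data['matrix']          # shape (num_constraints, max_id2); NaN where a pair was never scored
row_labels = data['row_labels']  # sorted constraint titles, one per matrix row
col_labels = data['col_labels']  # technology ids 1..max_id2; column j holds id j+1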