ALLOUNE committed · Commit c97a50e · Parent(s): 367af23

add dataset

Browse files:
- app.py +9 -7
- requirements.txt +2 -1
- src/core.py +8 -12
- src/services/processor.py +18 -50
- src/services/utils.py +23 -51
app.py CHANGED

@@ -27,12 +27,12 @@ class InputConstraints(BaseModel):
 # This schema defines the structure for a single technology object
 class Technology(BaseModel):
     """Represents a single technology entry with its details."""
-    title: str
+    name: str
     purpose: str
-    key_components: str
+    problem_types_solved: str
     advantages: str
     limitations: str
-    id: int
+    domain_tags: str
 
 class OutputPriorArt(BaseModel):
     """Represents the search of prior art using the technology combinations"""
@@ -55,12 +55,12 @@ class TechnologyData(BaseModel):
 
 @app.post("/process", response_model=TechnologyData)
 async def process(data: InputProblem):
-    result= process_input(data, global_tech, global_tech_embeddings, "problem")
+    result= process_input(data, dataset, "problem")
     return {"technologies": result}
 
 @app.post("/process-constraints", response_model=TechnologyData)
 async def process_constraints(constraints: InputConstraints):
-    result= process_input(constraints.constraints, global_tech, global_tech_embeddings, "constraints")
+    result= process_input(constraints.constraints, dataset, "constraints")
     return {"technologies": result}
 
 @app.post("/prior-art-constraints", response_model=OutputPriorArt)
@@ -70,7 +70,7 @@ async def prior_art_constraints(data: InputPriorArtConstraints):
 
 @app.post("/prior-art-problems", response_model=OutputPriorArt)
 async def prior_art_problems(data: InputPriorArtProblem):
-    prior_art = process_prior_art(data.technologies, data.…)
+    prior_art = process_prior_art(data.technologies, data.problem, "problem", "pydantic")
    return prior_art
 
 def make_json_serializable(data):
@@ -268,7 +268,6 @@ def process_input_gradio(problem_description: str):
     # Step 3: Stem Constraints
     constraints_stemmed = stem(constraints, "constraints")
     save_dataframe(pd.DataFrame({"stemmed_constraints": constraints_stemmed}), "constraints_stemmed.xlsx")
-    print(constraints_stemmed)
 
     # Step 4: Global Tech (already loaded, just acknowledge)
     # save_dataframe(global_tech_df, "global_tech.xlsx") # This is already done implicitly by loading
@@ -282,6 +281,9 @@ def process_input_gradio(problem_description: str):
     # Step 6: Find Best List Combinations
     best_combinations = find_best_list_combinations(constraints_stemmed, global_tech, matrix)
 
+    print("best_combinations")
+    print(best_combinations)
+
     # Step 7: Select Technologies
     best_technologies_id = select_technologies(best_combinations)
 
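The endpoint changes are easiest to see from the client side: both /process routes now resolve against the shared `dataset` instead of the pickled embeddings. A minimal request sketch, assuming a local server on port 8000 and an `InputProblem` model with a single `problem` field (only `data.problem` is visible in this diff, so the payload shape is an assumption):

import requests

BASE_URL = "http://localhost:8000"  # assumption: default local host/port

# /process takes an InputProblem; the handler forwards it as
# process_input(data, dataset, "problem").
resp = requests.post(f"{BASE_URL}/process",
                     json={"problem": "Reduce battery drain in remote sensors"})
resp.raise_for_status()

# TechnologyData wraps a list of Technology entries with the new fields.
for tech in resp.json()["technologies"]:
    print(tech["name"], "|", tech["domain_tags"])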
requirements.txt CHANGED

@@ -9,4 +9,5 @@ pydantic
 openpyxl
 gradio
 google.generativeai
-google.genai
+google.genai
+datasets
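The only functional addition here is `datasets`, which backs the new `load_data()` helper in src/services/utils.py (the google.genai line is re-pinned unchanged). A quick sanity check that the updated pins resolve:

# Run after `pip install -r requirements.txt`.
from datasets import load_dataset  # new dependency introduced by this commit
from google.genai import Client    # unchanged; used by set_gemini() in utils.py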
src/core.py CHANGED

@@ -1,10 +1,10 @@
 from src.services.utils import *
 from src.services.processor import *
 
-global_tech, global_tech_embeddings = load_technologies()
+dataset = load_data()
 
 
-def process_input(data, global_tech, global_tech_embeddings, data_type):
+def process_input(data, dataset, data_type):
     if data_type == "problem":
         prompt = set_prompt(data.problem)
         constraints = retrieve_constraints(prompt)
@@ -14,19 +14,13 @@ def process_input(data, global_tech, global_tech_embeddings, data_type):
 
     constraints_stemmed = stem(constraints, "constraints")
 
-
-
-    save_dataframe(global_tech, "global_tech.xlsx")
-
-    result_similarities, matrix = get_contrastive_similarities(constraints_stemmed, global_tech, global_tech_embeddings, )
+    result_similarities, matrix = get_contrastive_similarities(constraints_stemmed, dataset)
 
     save_to_pickle(result_similarities)
 
-
-
-    best_combinations = find_best_list_combinations(constraints_stemmed, global_tech, matrix)
+    best_combinations = find_best_list_combinations(constraints_stemmed, dataset, matrix)
     best_technologies_id = select_technologies(best_combinations)
-    best_technologies = get_technologies_by_id(best_technologies_id, global_tech)
+    best_technologies = get_technologies_by_id(best_technologies_id, dataset)
 
     return best_technologies
 
@@ -38,5 +32,7 @@ def process_prior_art(technologies, data, data_type, techno_type):
         print(f"An error occured during the process, trying again : {e}")
     prior_art_reponse = search_prior_art(technologies, data, data_type, techno_type)
     prior_art_search = add_citations_and_collect_uris(prior_art_reponse)
-
+    print("PRIOR ART SEARCH")
+    print(prior_art_reponse)
+    print(prior_art_search)
     return prior_art_search
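core.py now builds its corpus once at import time (`dataset = load_data()`) and threads it through every stage in place of the old global_tech/embeddings pair. A sketch of the resulting call chain, assuming a GEMINI_API key is configured (retrieve_constraints calls the LLM) and using a hypothetical stand-in for app.py's InputProblem model:

from types import SimpleNamespace
from src.core import dataset, process_input

# Hypothetical stand-in for the InputProblem pydantic model used by app.py.
data = SimpleNamespace(problem="Cool a sealed enclosure without moving parts")

# data_type="problem" routes through set_prompt/retrieve_constraints before
# stemming; the same `dataset` then drives similarity scoring, combination
# ranking, and the final get_technologies_by_id lookup.
technologies = process_input(data, dataset, "problem")
for tech in technologies:
    print(tech["name"], "-", tech["purpose"])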
src/services/processor.py CHANGED

@@ -1,4 +1,4 @@
-from src.services.utils import *
+from src.services.utils import load_data, stem, set_gemini
 import requests as r
 import json
 import nltk
@@ -23,51 +23,15 @@ def retrieve_constraints(prompt):
 
     constraints_json = json.loads("{"+json_str+"}")
 
-    print(f"Whats returned : {constraints_json}")
     return constraints_json
 
-
-def preprocess_tech_data(_df):
-    if _df is None or "description" not in _df.columns:
-        return [], []
-
-    technologies_list = _df["description"].to_list()
-    tech_dict_raw = tech_to_dict(technologies_list)
-
-    tech_dict_filtered = [
-        t for t in tech_dict_raw if (
-            len(t.get("title", "")) >= 5 and
-            len(t.get("advantages", "")) >= 5 and
-            len(t.get("key_components", "")) >= 5
-        )
-    ]
-
-    if not tech_dict_filtered:
-        return [], []
-
-    processed_tech_wt = stem(tech_dict_filtered,"technologies")
-
-    for t_item_wt in processed_tech_wt:
-        kc = t_item_wt.get("key_components")
-        if isinstance(kc, str):
-            t_item_wt["key_components"] = ''.join(nltk.sent_tokenize(kc))
-        else:
-            t_item_wt["key_components"] = ""
-
-    original_tech_for_display = tech_dict_filtered[:len(processed_tech_wt)]
-
-
-    _keys = list(processed_tech_wt[0].keys()) if processed_tech_wt else []
-    return processed_tech_wt, _keys, original_tech_for_display
-
-
 def remove_over_repeated_technologies(result):
     total_lists = len(result)
     tech_title = {}
 
     for idx, item in enumerate(result):
         for tech in item['technologies']:
-            tech_title[tech[0]['title']] = 0 if tech[0]['title'] not in tech_title else tech_title[tech[0]['title']] + 1
+            tech_title[tech[0]['name']] = 0 if tech[0]['name'] not in tech_title else tech_title[tech[0]['name']] + 1
 
     threshold = total_lists * 0.3
     print(threshold)
@@ -79,11 +43,11 @@ def remove_over_repeated_technologies(result):
             to_delete.append(tech)
 
     for idx, item in enumerate(result):
-        result[idx]['technologies'] = [tech for tech in item['technologies'] if tech[0]['title'] not in to_delete]
+        result[idx]['technologies'] = [tech for tech in item['technologies'] if tech[0]['name'] not in to_delete]
 
     return result
 
-def get_contrastive_similarities(constraints, pre_encoded_tech_data, pre_encoded_tech_embeddings):
+def get_contrastive_similarities(constraints, dataset):
     selected_pairs = []
     matrix = []
 
@@ -93,8 +57,8 @@ def get_contrastive_similarities(constraints, pre_encoded_tech_data, pre_encoded_tech_embeddings):
     for i, constraint in enumerate(constraints):
         constraint_embedding = constraint_embeddings[i]
         constraint_matrix = []
-        for j, tech in enumerate(pre_encoded_tech_data):
-            tech_embedding = pre_encoded_tech_embeddings[j]
+        for j, row in enumerate(dataset):
+            tech_embedding = row["embeddings"]
 
             purpose_sim = model.similarity(constraint_embedding, tech_embedding)
 
@@ -103,7 +67,7 @@ def get_contrastive_similarities(constraints, pre_encoded_tech_data, pre_encoded_tech_embeddings):
 
             selected_pairs.append({
                 "constraint": constraint,
-                "id2": tech["id"],
+                "id2": j,
                 "similarity": purpose_sim
             })
             constraint_matrix.append(purpose_sim)
@@ -119,21 +83,25 @@ def find_best_list_combinations(list1: list[str], list2: list[str], matrix) -> list:
     MAX_SIMILARITY = 0.8
 
     possible_matches_for_each_l1 = []
-    for i in range(len(list1)):
+    for i, row_i in enumerate(list1):
         valid_matches_for_l1_element = []
-        for j in range(len(list2)):
+        for j, row_j in enumerate(list2):
             score = matrix[i][j]
 
+            # print(row_j)
+            # print(type(row_j))
             if MIN_SIMILARITY <= score <= MAX_SIMILARITY:
-                valid_matches_for_l1_element.append((list2[j], score))
+                del row_j["embeddings"]
+                row_j["id"] = j
+                valid_matches_for_l1_element.append((row_j, score))
 
         if not valid_matches_for_l1_element:
-            print(f"No valid matches found in list2 for '{list1[i]}' from list1 "
+            print(f"No valid matches found in list2 for '{row_i}' from list1 "
                   f"(score between {MIN_SIMILARITY} and {MAX_SIMILARITY}). "
                   "Returning an empty list as no complete combinations can be formed.")
 
         else:
-            possible_matches_for_each_l1.append((valid_matches_for_l1_element, list1[i]))
+            possible_matches_for_each_l1.append((valid_matches_for_l1_element, row_i))
 
     result = []
     for tech_list, problem in possible_matches_for_each_l1:
@@ -219,10 +187,10 @@ def select_technologies(problem_technology_list):
 
 def load_titles(techno, data_type):
     if data_type == "pydantic":
-        technology_titles = [tech.title for tech in techno]
+        technology_titles = [tech.name for tech in techno]
     else: # data_type == "dict"
         technologies = techno["technologies"]
-        technology_titles = [tech["title"] for tech in technologies]
+        technology_titles = [tech["name"] for tech in technologies]
     return technology_titles
 
 def search_prior_art(technologies_input: list, data: str, data_type: str, techno_type: str) -> json:
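The heart of find_best_list_combinations is the similarity window: scores below MIN_SIMILARITY are treated as noise, scores above MAX_SIMILARITY (0.8 in the diff) as degenerate near-duplicate matches. A toy sketch of just that selection rule, with hypothetical scores and a stand-in MIN_SIMILARITY of 0.3 (the real constant is defined above the hunk shown here):

MIN_SIMILARITY, MAX_SIMILARITY = 0.3, 0.8  # 0.3 is a stand-in; 0.8 matches the diff

constraints = ["low power consumption", "waterproof housing"]
techs = [{"name": "e-ink display"}, {"name": "conformal coating"}]
matrix = [
    [0.55, 0.12],  # hypothetical similarity scores per (constraint, tech) pair
    [0.95, 0.42],  # 0.95 falls outside the window and is discarded
]

for i, constraint in enumerate(constraints):
    matches = [(techs[j], score) for j, score in enumerate(matrix[i])
               if MIN_SIMILARITY <= score <= MAX_SIMILARITY]
    print(constraint, "->", [(t["name"], s) for t, s in matches])
# low power consumption -> [('e-ink display', 0.55)]
# waterproof housing -> [('conformal coating', 0.42)]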
src/services/utils.py CHANGED

@@ -1,20 +1,20 @@
 import pickle
 import numpy as np
 import pandas as pd
-
 import nltk
 from nltk.stem import *
 nltk.download("punkt_tab")
-
 from pathlib import Path
+from dotenv import load_dotenv
+load_dotenv()
 import os
 import google.generativeai as genai
 import json
 from google.genai import Client, types
+from datasets import load_dataset
+
 
-BASE_DIR = Path(__file__).resolve().parent.parent
 
-FILE_PATH = BASE_DIR / 'ressources' / 'global_tech_embeddings.pkl'
 
 def set_prompt(problem):
     prompt = """
@@ -51,71 +51,44 @@ Output each constraints in a JSON such as : {"title of the constraints1":"descri
 """ + problem
     return prompt
 
-
-
-    return
-
-def load_technologies():
-    EMBEDDINGS_FILE = FILE_PATH
-
-    try:
-        with open(EMBEDDINGS_FILE, 'rb') as f:
-            loaded_data = pickle.load(f)
-            global_tech = loaded_data['global_tech']
-            global_tech_embedding = loaded_data['global_tech_embeddings']
-            return global_tech, global_tech_embedding
-    except Exception as e:
-        print(f"Error: {e}")
-
-def tech_to_dict(technologies):
-    tech_dict = []
-    for index, tech in enumerate(technologies):
-        if not tech.find("<title>") > 1:
-            tab = tech.split("\n")
-            tab.pop(0)
-            tab.pop(len(tab)-1)
-            tech_dict.append({"title": tab[0][tab[0].find(": ")+2:],
-                              "purpose": tab[1][tab[1].find(": ")+2:],
-                              "key_components": tab[2][tab[2].find(": ")+2:],
-                              "advantages": tab[3][tab[3].find(": ")+2:],
-                              "limitations": tab[4][tab[4].find(": ")+2:],
-                              "id": index})
-    return tech_dict
-
-def save_dataframe(df, title):
-    pd.DataFrame(df).to_excel(title)
-    return title
+
+def load_data():
+    return load_dataset("heymenn/Technologies", split="train")
+
 
 def stem(data,data_type):
     stemmer = SnowballStemmer("english")
     processed_data = []
     if data_type == "technologies":
-        for t_item in data:
+        for index, t_item in enumerate(data):
             processed_data.append({
-                "title": stemmer.stem(t_item["title"]),
+                "name": stemmer.stem(t_item["name"]),
                 "purpose": stemmer.stem(t_item["purpose"]),
-                "key_components": stemmer.stem(t_item["key_components"]),
+                "problem_types_solved": stemmer.stem(t_item["problem_types_solved"]),
                 "advantages": stemmer.stem(t_item["advantages"]),
                 "limitations": stemmer.stem(t_item["limitations"]),
-                "id": t_item["id"]
+                "domain_tags": stemmer.stem(t_item["domain_tags"]),
+                "id": index
             })
+
     else:
         for t_item in data:
-            print(t_item)
             processed_data.append({
                 "title": stemmer.stem(t_item),
                 "description": stemmer.stem(data[t_item])
             })
-
+
     return processed_data
 
 
-def get_technologies_by_id(…):
+def get_technologies_by_id(technologies,dataset):
     result = []
-    …
-    …
-    …
-    …
+    for id in technologies:
+        print(id)
+        data = dataset[id]
+        del data["embeddings"]
+        print(data)
+        result.append(data)
     return result
 
 def save_to_pickle(result_similarites):
@@ -133,7 +106,7 @@ def save_to_pickle(result_similarites):
 
     for item in result_similarites:
         row_idx = row_label_to_index[item['constraint']['title']]
-        col_idx = item['id2'] - 1
+        col_idx = item['id2'] - 1
         similarity_value = item['similarity'].item()
 
         matrix[row_idx, col_idx] = similarity_value
@@ -157,7 +130,6 @@
     print(f"\nMatrix and labels saved to {output_filename}")
     return output_filename
 
-
 def set_gemini():
     gemini_api = os.getenv("GEMINI_API")
     client = Client(api_key=gemini_api)
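load_data() swaps the pickle round-trip for a Hub pull, and the access pattern elsewhere in the commit relies on a datasets-library detail worth spelling out: indexing a Dataset by integer returns a fresh dict, so the `del data["embeddings"]` in get_technologies_by_id strips the returned copy without touching the stored column. A short sketch (repo id and column names exactly as in this diff):

from datasets import load_dataset

dataset = load_dataset("heymenn/Technologies", split="train")

row = dataset[0]          # indexing yields a plain dict copy of the row
vec = row["embeddings"]   # per-row vector consumed by get_contrastive_similarities
del row["embeddings"]     # mutates only the copy; dataset[0] still has the column

print(row["name"])        # remaining fields match the new Technology schema
print(len(vec))           # embedding dimensionality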