ALLOUNE committed · Commit c97a50e · Parent(s): 367af23

add dataset

Browse files:
- app.py +9 -7
- requirements.txt +2 -1
- src/core.py +8 -12
- src/services/processor.py +18 -50
- src/services/utils.py +23 -51
app.py CHANGED

@@ -27,12 +27,12 @@ class InputConstraints(BaseModel):
 # This schema defines the structure for a single technology object
 class Technology(BaseModel):
     """Represents a single technology entry with its details."""
-    title: str
+    name: str
     purpose: str
-    key_components: str
+    problem_types_solved: str
     advantages: str
     limitations: str
-    id: int
+    domain_tags: str
 
 class OutputPriorArt(BaseModel):
     """Represents the search of prior art using the technology combinations"""
@@ -55,12 +55,12 @@ class TechnologyData(BaseModel):
 
 @app.post("/process", response_model=TechnologyData)
 async def process(data: InputProblem):
-    result= process_input(data, global_tech, global_tech_embeddings, "problem")
+    result= process_input(data, dataset, "problem")
     return {"technologies": result}
 
 @app.post("/process-constraints", response_model=TechnologyData)
 async def process_constraints(constraints: InputConstraints):
-    result= process_input(constraints.constraints, global_tech, global_tech_embeddings, "constraints")
+    result= process_input(constraints.constraints, dataset, "constraints")
     return {"technologies": result}
 
 @app.post("/prior-art-constraints", response_model=OutputPriorArt)
@@ -70,7 +70,7 @@ async def prior_art_constraints(data: InputPriorArtConstraints):
 
 @app.post("/prior-art-problems", response_model=OutputPriorArt)
 async def prior_art_problems(data: InputPriorArtProblem):
-    prior_art = process_prior_art(data.technologies, data.…)
+    prior_art = process_prior_art(data.technologies, data.problem, "problem", "pydantic")
    return prior_art
 
 def make_json_serializable(data):
@@ -268,7 +268,6 @@ def process_input_gradio(problem_description: str):
     # Step 3: Stem Constraints
     constraints_stemmed = stem(constraints, "constraints")
     save_dataframe(pd.DataFrame({"stemmed_constraints": constraints_stemmed}), "constraints_stemmed.xlsx")
-    print(constraints_stemmed)
 
     # Step 4: Global Tech (already loaded, just acknowledge)
     # save_dataframe(global_tech_df, "global_tech.xlsx") # This is already done implicitly by loading
@@ -282,6 +281,9 @@ def process_input_gradio(problem_description: str):
     # Step 6: Find Best List Combinations
     best_combinations = find_best_list_combinations(constraints_stemmed, global_tech, matrix)
 
+    print("best_combinations")
+    print(best_combinations)
+
     # Step 7: Select Technologies
     best_technologies_id = select_technologies(best_combinations)
 
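The endpoint changes are easiest to see from the client side: both /process routes now resolve against the shared `dataset` instead of the pickled embeddings. A minimal request sketch, assuming a local server on port 8000 and an `InputProblem` model with a single `problem` field (only `data.problem` is visible in this diff, so the payload shape is an assumption):

import requests

BASE_URL = "http://localhost:8000"  # assumption: default local host/port

# /process takes an InputProblem; the handler forwards it as
# process_input(data, dataset, "problem").
resp = requests.post(f"{BASE_URL}/process",
                     json={"problem": "Reduce battery drain in remote sensors"})
resp.raise_for_status()

# TechnologyData wraps a list of Technology entries with the new fields.
for tech in resp.json()["technologies"]:
    print(tech["name"], "|", tech["domain_tags"])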
requirements.txt CHANGED

@@ -9,4 +9,5 @@ pydantic
 openpyxl
 gradio
 google.generativeai
-google.genai
+google.genai
+datasets
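The only functional addition here is `datasets`, which backs the new `load_data()` helper in src/services/utils.py (the google.genai line is re-pinned unchanged). A quick sanity check that the updated pins resolve:

# Run after `pip install -r requirements.txt`.
from datasets import load_dataset  # new dependency introduced by this commit
from google.genai import Client    # unchanged; used by set_gemini() in utils.py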
src/core.py CHANGED

@@ -1,10 +1,10 @@
 from src.services.utils import *
 from src.services.processor import *
 
-global_tech, global_tech_embeddings = load_technologies()
+dataset = load_data()
 
 
-def process_input(data, global_tech, global_tech_embeddings, data_type):
+def process_input(data, dataset, data_type):
     if data_type == "problem":
         prompt = set_prompt(data.problem)
         constraints = retrieve_constraints(prompt)
@@ -14,19 +14,13 @@ def process_input(data, global_tech, global_tech_embeddings, data_type):
 
     constraints_stemmed = stem(constraints, "constraints")
 
-
-
-    save_dataframe(global_tech, "global_tech.xlsx")
-
-    result_similarities, matrix = get_contrastive_similarities(constraints_stemmed, global_tech, global_tech_embeddings, )
+    result_similarities, matrix = get_contrastive_similarities(constraints_stemmed, dataset)
 
     save_to_pickle(result_similarities)
 
-
-
-    best_combinations = find_best_list_combinations(constraints_stemmed, global_tech, matrix)
+    best_combinations = find_best_list_combinations(constraints_stemmed, dataset, matrix)
     best_technologies_id = select_technologies(best_combinations)
-    best_technologies = get_technologies_by_id(best_technologies_id, global_tech)
+    best_technologies = get_technologies_by_id(best_technologies_id, dataset)
 
     return best_technologies
 
@@ -38,5 +32,7 @@ def process_prior_art(technologies, data, data_type, techno_type):
         print(f"An error occured during the process, trying again : {e}")
     prior_art_reponse = search_prior_art(technologies, data, data_type, techno_type)
     prior_art_search = add_citations_and_collect_uris(prior_art_reponse)
-
+    print("PRIOR ART SEARCH")
+    print(prior_art_reponse)
+    print(prior_art_search)
     return prior_art_search
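core.py now builds its corpus once at import time (`dataset = load_data()`) and threads it through every stage in place of the old global_tech/embeddings pair. A sketch of the resulting call chain, assuming a GEMINI_API key is configured (retrieve_constraints calls the LLM) and using a hypothetical stand-in for app.py's InputProblem model:

from types import SimpleNamespace
from src.core import dataset, process_input

# Hypothetical stand-in for the InputProblem pydantic model used by app.py.
data = SimpleNamespace(problem="Cool a sealed enclosure without moving parts")

# data_type="problem" routes through set_prompt/retrieve_constraints before
# stemming; the same `dataset` then drives similarity scoring, combination
# ranking, and the final get_technologies_by_id lookup.
technologies = process_input(data, dataset, "problem")
for tech in technologies:
    print(tech["name"], "-", tech["purpose"])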
src/services/processor.py CHANGED

@@ -1,4 +1,4 @@
-from src.services.utils import *
+from src.services.utils import load_data, stem, set_gemini
 import requests as r
 import json
 import nltk
@@ -23,51 +23,15 @@ def retrieve_constraints(prompt):
 
     constraints_json = json.loads("{"+json_str+"}")
 
-    print(f"Whats returned : {constraints_json}")
     return constraints_json
 
-
-def preprocess_tech_data(_df):
-    if _df is None or "description" not in _df.columns:
-        return [], []
-
-    technologies_list = _df["description"].to_list()
-    tech_dict_raw = tech_to_dict(technologies_list)
-
-    tech_dict_filtered = [
-        t for t in tech_dict_raw if (
-            len(t.get("title", "")) >= 5 and
-            len(t.get("advantages", "")) >= 5 and
-            len(t.get("key_components", "")) >= 5
-        )
-    ]
-
-    if not tech_dict_filtered:
-        return [], []
-
-    processed_tech_wt = stem(tech_dict_filtered,"technologies")
-
-    for t_item_wt in processed_tech_wt:
-        kc = t_item_wt.get("key_components")
-        if isinstance(kc, str):
-            t_item_wt["key_components"] = ''.join(nltk.sent_tokenize(kc))
-        else:
-            t_item_wt["key_components"] = ""
-
-    original_tech_for_display = tech_dict_filtered[:len(processed_tech_wt)]
-
-
-    _keys = list(processed_tech_wt[0].keys()) if processed_tech_wt else []
-    return processed_tech_wt, _keys, original_tech_for_display
-
-
 def remove_over_repeated_technologies(result):
     total_lists = len(result)
     tech_title = {}
 
     for idx, item in enumerate(result):
         for tech in item['technologies']:
-            tech_title[tech[0]['title']] = 0 if tech[0]['title'] not in tech_title else tech_title[tech[0]['title']] + 1
+            tech_title[tech[0]['name']] = 0 if tech[0]['name'] not in tech_title else tech_title[tech[0]['name']] + 1
 
     threshold = total_lists * 0.3
     print(threshold)
@@ -79,11 +43,11 @@ def remove_over_repeated_technologies(result):
             to_delete.append(tech)
 
     for idx, item in enumerate(result):
-        result[idx]['technologies'] = [tech for tech in item['technologies'] if tech[0]['title'] not in to_delete]
+        result[idx]['technologies'] = [tech for tech in item['technologies'] if tech[0]['name'] not in to_delete]
 
     return result
 
-def get_contrastive_similarities(constraints, pre_encoded_tech_data, pre_encoded_tech_embeddings):
+def get_contrastive_similarities(constraints, dataset):
     selected_pairs = []
     matrix = []
 
@@ -93,8 +57,8 @@ def get_contrastive_similarities(constraints, pre_encoded_tech_data, pre_encoded_tech_embeddings):
     for i, constraint in enumerate(constraints):
         constraint_embedding = constraint_embeddings[i]
         constraint_matrix = []
-        for j, tech in enumerate(pre_encoded_tech_data):
-            tech_embedding = pre_encoded_tech_embeddings[j]
+        for j, row in enumerate(dataset):
+            tech_embedding = row["embeddings"]
 
             purpose_sim = model.similarity(constraint_embedding, tech_embedding)
 
@@ -103,7 +67,7 @@ def get_contrastive_similarities(constraints, pre_encoded_tech_data, pre_encoded_tech_embeddings):
 
             selected_pairs.append({
                 "constraint": constraint,
-                "id2": tech["id"],
+                "id2": j,
                 "similarity": purpose_sim
             })
             constraint_matrix.append(purpose_sim)
@@ -119,21 +83,25 @@ def find_best_list_combinations(list1: list[str], list2: list[str], matrix) -> list:
     MAX_SIMILARITY = 0.8
 
     possible_matches_for_each_l1 = []
-    for i in range(len(list1)):
+    for i, row_i in enumerate(list1):
         valid_matches_for_l1_element = []
-        for j in range(len(list2)):
+        for j, row_j in enumerate(list2):
             score = matrix[i][j]
 
+            # print(row_j)
+            # print(type(row_j))
             if MIN_SIMILARITY <= score <= MAX_SIMILARITY:
-                valid_matches_for_l1_element.append((list2[j], score))
+                del row_j["embeddings"]
+                row_j["id"] = j
+                valid_matches_for_l1_element.append((row_j, score))
 
         if not valid_matches_for_l1_element:
-            print(f"No valid matches found in list2 for '{list1[i]}' from list1 "
+            print(f"No valid matches found in list2 for '{row_i}' from list1 "
                   f"(score between {MIN_SIMILARITY} and {MAX_SIMILARITY}). "
                   "Returning an empty list as no complete combinations can be formed.")
 
         else:
-            possible_matches_for_each_l1.append((valid_matches_for_l1_element, list1[i]))
+            possible_matches_for_each_l1.append((valid_matches_for_l1_element, row_i))
 
     result = []
     for tech_list, problem in possible_matches_for_each_l1:
@@ -219,10 +187,10 @@ def select_technologies(problem_technology_list):
 
 def load_titles(techno, data_type):
     if data_type == "pydantic":
-        technology_titles = [tech.title for tech in techno]
+        technology_titles = [tech.name for tech in techno]
     else: # data_type == "dict"
         technologies = techno["technologies"]
-        technology_titles = [tech["title"] for tech in technologies]
+        technology_titles = [tech["name"] for tech in technologies]
     return technology_titles
 
 def search_prior_art(technologies_input: list, data: str, data_type: str, techno_type: str) -> json:
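The heart of find_best_list_combinations is the similarity window: scores below MIN_SIMILARITY are treated as noise, scores above MAX_SIMILARITY (0.8 in the diff) as degenerate near-duplicate matches. A toy sketch of just that selection rule, with hypothetical scores and a stand-in MIN_SIMILARITY of 0.3 (the real constant is defined above the hunk shown here):

MIN_SIMILARITY, MAX_SIMILARITY = 0.3, 0.8  # 0.3 is a stand-in; 0.8 matches the diff

constraints = ["low power consumption", "waterproof housing"]
techs = [{"name": "e-ink display"}, {"name": "conformal coating"}]
matrix = [
    [0.55, 0.12],  # hypothetical similarity scores per (constraint, tech) pair
    [0.95, 0.42],  # 0.95 falls outside the window and is discarded
]

for i, constraint in enumerate(constraints):
    matches = [(techs[j], score) for j, score in enumerate(matrix[i])
               if MIN_SIMILARITY <= score <= MAX_SIMILARITY]
    print(constraint, "->", [(t["name"], s) for t, s in matches])
# low power consumption -> [('e-ink display', 0.55)]
# waterproof housing -> [('conformal coating', 0.42)]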
src/services/utils.py CHANGED

@@ -1,20 +1,20 @@
 import pickle
 import numpy as np
 import pandas as pd
-
 import nltk
 from nltk.stem import *
 nltk.download("punkt_tab")
-
 from pathlib import Path
+from dotenv import load_dotenv
+load_dotenv()
 import os
 import google.generativeai as genai
 import json
 from google.genai import Client, types
+from datasets import load_dataset
+
 
-BASE_DIR = Path(__file__).resolve().parent.parent
 
-FILE_PATH = BASE_DIR / 'ressources' / 'global_tech_embeddings.pkl'
 
 def set_prompt(problem):
     prompt = """
@@ -51,71 +51,44 @@ Output each constraints in a JSON such as : {"title of the constraints1":"descri
 """ + problem
     return prompt
 
-
-
-    return
-
-def load_technologies():
-    EMBEDDINGS_FILE = FILE_PATH
-
-    try:
-        with open(EMBEDDINGS_FILE, 'rb') as f:
-            loaded_data = pickle.load(f)
-            global_tech = loaded_data['global_tech']
-            global_tech_embedding = loaded_data['global_tech_embeddings']
-            return global_tech, global_tech_embedding
-    except Exception as e:
-        print(f"Error: {e}")
-
-def tech_to_dict(technologies):
-    tech_dict = []
-    for index, tech in enumerate(technologies):
-        if not tech.find("<title>") > 1:
-            tab = tech.split("\n")
-            tab.pop(0)
-            tab.pop(len(tab)-1)
-            tech_dict.append({"title": tab[0][tab[0].find(": ")+2:],
-                              "purpose": tab[1][tab[1].find(": ")+2:],
-                              "key_components": tab[2][tab[2].find(": ")+2:],
-                              "advantages": tab[3][tab[3].find(": ")+2:],
-                              "limitations": tab[4][tab[4].find(": ")+2:],
-                              "id": index})
-    return tech_dict
-
-def save_dataframe(df, title):
-    pd.DataFrame(df).to_excel(title)
-    return title
+
+def load_data():
+    return load_dataset("heymenn/Technologies", split="train")
+
 
 def stem(data,data_type):
     stemmer = SnowballStemmer("english")
     processed_data = []
     if data_type == "technologies":
-        for t_item in data:
+        for index, t_item in enumerate(data):
             processed_data.append({
-                "title": stemmer.stem(t_item["title"]),
+                "name": stemmer.stem(t_item["name"]),
                 "purpose": stemmer.stem(t_item["purpose"]),
-                "key_components": stemmer.stem(t_item["key_components"]),
+                "problem_types_solved": stemmer.stem(t_item["problem_types_solved"]),
                 "advantages": stemmer.stem(t_item["advantages"]),
                 "limitations": stemmer.stem(t_item["limitations"]),
-                "id": t_item["id"]
+                "domain_tags": stemmer.stem(t_item["domain_tags"]),
+                "id": index
             })
+
     else:
         for t_item in data:
-            print(t_item)
             processed_data.append({
                 "title": stemmer.stem(t_item),
                 "description": stemmer.stem(data[t_item])
             })
-
+
     return processed_data
 
 
-def get_technologies_by_id(…):
+def get_technologies_by_id(technologies,dataset):
     result = []
-    …
-    …
-    …
-    …
+    for id in technologies:
+        print(id)
+        data = dataset[id]
+        del data["embeddings"]
+        print(data)
+        result.append(data)
     return result
 
 def save_to_pickle(result_similarites):
@@ -133,7 +106,7 @@ def save_to_pickle(result_similarites):
 
     for item in result_similarites:
         row_idx = row_label_to_index[item['constraint']['title']]
-        col_idx = item['id2'] - 1
+        col_idx = item['id2'] - 1
         similarity_value = item['similarity'].item()
 
         matrix[row_idx, col_idx] = similarity_value
@@ -157,7 +130,6 @@
     print(f"\nMatrix and labels saved to {output_filename}")
     return output_filename
 
-
 def set_gemini():
     gemini_api = os.getenv("GEMINI_API")
     client = Client(api_key=gemini_api)
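load_data() swaps the pickle round-trip for a Hub pull, and the access pattern elsewhere in the commit relies on a datasets-library detail worth spelling out: indexing a Dataset by integer returns a fresh dict, so the `del data["embeddings"]` in get_technologies_by_id strips the returned copy without touching the stored column. A short sketch (repo id and column names exactly as in this diff):

from datasets import load_dataset

dataset = load_dataset("heymenn/Technologies", split="train")

row = dataset[0]          # indexing yields a plain dict copy of the row
vec = row["embeddings"]   # per-row vector consumed by get_contrastive_similarities
del row["embeddings"]     # mutates only the copy; dataset[0] still has the column

print(row["name"])        # remaining fields match the new Technology schema
print(len(vec))           # embedding dimensionality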