Spaces:
Running
Running
Sean-Case
committed on
Commit
·
e0f53cc
1
Parent(s):
ff32b4a
Now should save embeddings by default. Added random seed to representation
Browse files- app.py +9 -1
- funcs/representation_model.py +2 -2
app.py
CHANGED
|
@@ -242,7 +242,15 @@ def extract_topics(in_files, in_file, min_docs_slider, in_colnames, max_topics_s
|
|
| 242 |
zip_folder(topic_model_save_name_folder, topic_model_save_name_zip)
|
| 243 |
output_list.append(topic_model_save_name_zip)
|
| 244 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 245 |
# Visualise the topics:
|
|
|
|
| 246 |
topics_vis = topic_model.visualize_documents(label_col, reduced_embeddings=reduced_embeddings, hide_annotations=True, hide_document_hover=False, custom_labels=True)
|
| 247 |
|
| 248 |
return output_text, output_list, topics_vis
|
|
@@ -290,7 +298,7 @@ with block:
|
|
| 290 |
with gr.Accordion("Data load and processing options", open = True):
|
| 291 |
with gr.Row():
|
| 292 |
anonymise_drop = gr.Dropdown(value = "No", choices=["Yes", "No"], multiselect=False, label="Anonymise data on file load. Names and other details are replaced with tags e.g. '<person>'.")
|
| 293 |
-
return_intermediate_files = gr.Dropdown(label = "Return intermediate processing files from file preparation. Files can be loaded in to save processing time in future.", value="No", choices=["Yes", "No"])
|
| 294 |
embedding_super_compress = gr.Dropdown(label = "Round embeddings to three dp for smaller files with less accuracy.", value="No", choices=["Yes", "No"])
|
| 295 |
with gr.Row():
|
| 296 |
low_resource_mode_opt = gr.Dropdown(label = "Use low resource embeddings model based on TF-IDF (consider if embedding generation is slow).", value="No", choices=["Yes", "No"])
|
|
|
|
| 242 |
zip_folder(topic_model_save_name_folder, topic_model_save_name_zip)
|
| 243 |
output_list.append(topic_model_save_name_zip)
|
| 244 |
|
| 245 |
+
if return_intermediate_files == "Yes":
|
| 246 |
+
print("Saving embeddings to file")
|
| 247 |
+
semantic_search_file_name = data_file_name_no_ext + '_' + 'embeddings.npz'
|
| 248 |
+
np.savez_compressed(semantic_search_file_name, embeddings_out)
|
| 249 |
+
|
| 250 |
+
output_list.append(semantic_search_file_name)
|
| 251 |
+
|
| 252 |
# Visualise the topics:
|
| 253 |
+
print("Creating visualisation")
|
| 254 |
topics_vis = topic_model.visualize_documents(label_col, reduced_embeddings=reduced_embeddings, hide_annotations=True, hide_document_hover=False, custom_labels=True)
|
| 255 |
|
| 256 |
return output_text, output_list, topics_vis
|
|
|
|
| 298 |
with gr.Accordion("Data load and processing options", open = True):
|
| 299 |
with gr.Row():
|
| 300 |
anonymise_drop = gr.Dropdown(value = "No", choices=["Yes", "No"], multiselect=False, label="Anonymise data on file load. Names and other details are replaced with tags e.g. '<person>'.")
|
| 301 |
+
return_intermediate_files = gr.Dropdown(label = "Return intermediate processing files from file preparation. Files can be loaded in to save processing time in future.", value="Yes", choices=["Yes", "No"])
|
| 302 |
embedding_super_compress = gr.Dropdown(label = "Round embeddings to three dp for smaller files with less accuracy.", value="No", choices=["Yes", "No"])
|
| 303 |
with gr.Row():
|
| 304 |
low_resource_mode_opt = gr.Dropdown(label = "Use low resource embeddings model based on TF-IDF (consider if embedding generation is slow).", value="No", choices=["Yes", "No"])
|
funcs/representation_model.py
CHANGED
|
@@ -9,7 +9,7 @@ import torch.cuda
|
|
| 9 |
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance, TextGeneration
|
| 10 |
from funcs.prompts import capybara_prompt, capybara_start, open_hermes_prompt, open_hermes_start, stablelm_prompt, stablelm_start
|
| 11 |
|
| 12 |
-
|
| 13 |
|
| 14 |
chosen_prompt = open_hermes_prompt # stablelm_prompt
|
| 15 |
chosen_start_tag = open_hermes_start # stablelm_start
|
|
@@ -117,7 +117,7 @@ llm_config = LLamacppInitConfigGpu(last_n_tokens_size=last_n_tokens_size,
|
|
| 117 |
|
| 118 |
## Create representation model parameters ##
|
| 119 |
# KeyBERT
|
| 120 |
-
keybert = KeyBERTInspired()
|
| 121 |
|
| 122 |
def create_representation_model(create_llm_topic_labels, llm_config, hf_model_name, hf_model_file, chosen_start_tag):
|
| 123 |
|
|
|
|
| 9 |
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance, TextGeneration
|
| 10 |
from funcs.prompts import capybara_prompt, capybara_start, open_hermes_prompt, open_hermes_start, stablelm_prompt, stablelm_start
|
| 11 |
|
| 12 |
+
random_seed = 42
|
| 13 |
|
| 14 |
chosen_prompt = open_hermes_prompt # stablelm_prompt
|
| 15 |
chosen_start_tag = open_hermes_start # stablelm_start
|
|
|
|
| 117 |
|
| 118 |
## Create representation model parameters ##
|
| 119 |
# KeyBERT
|
| 120 |
+
keybert = KeyBERTInspired(random_state=random_seed)
|
| 121 |
|
| 122 |
def create_representation_model(create_llm_topic_labels, llm_config, hf_model_name, hf_model_file, chosen_start_tag):
|
| 123 |
|