Spaces:
Runtime error
Runtime error
| import faiss | |
| import gradio as gr | |
| import numpy as np | |
| import pandas as pd | |
| from datasets import load_dataset | |
| from sentence_transformers import SentenceTransformer | |
# Corpus of decisions with precomputed sentence embeddings, flattened to a
# DataFrame. Columns used in this file: url, sentences, embedding, summary.
dataset = load_dataset("tollefj/rettsavgjoerelser_100samples_embeddings")
df = dataset["train"].to_pandas()

# Sentence encoder used to embed free-text search queries.
model = SentenceTransformer("NbAiLab/nb-sbert-base")

# Search state shared by the UI callbacks (rebound there via ``global``).
idx = 0  # row of the currently selected decision in ``df``
index = None  # FAISS index over the selected decision's sentence embeddings
newdoc = None  # per-sentence frame of the selected decision
def build_doc_frame(df, idx):
    """Return one row per sentence for the document at position ``idx``.

    Args:
        df: Frame with at least the columns ``url``, ``sentences`` and
            ``embedding``; ``sentences`` and ``embedding`` hold equally long
            list-likes for each document.
        idx: Positional (``iloc``) row number of the document.

    Returns:
        A frame with columns ``url``, ``sentences``, ``embedding`` in which
        the two list-like columns are unpacked in lockstep, so every row is
        a single sentence with its embedding. All rows keep the original
        document's index label.
    """
    columns = ["url", "sentences", "embedding"]
    # Slice with a one-element list to get a one-row DataFrame directly,
    # instead of round-tripping through a Series and a transpose (which
    # coerces every column to object dtype).
    doc_df = df.iloc[[idx]][columns]
    # Unpack the sentences and their embeddings into separate rows.
    return doc_df.explode(["sentences", "embedding"])
def get_doc_embeddings(doc):
    """Collect the ``embedding`` column of ``doc`` into one float32 array.

    Args:
        doc: Object exposing an ``embedding`` attribute with ``tolist()``
            (e.g. the per-sentence frame from ``build_doc_frame``).

    Returns:
        A new ``numpy.ndarray`` of dtype float32 built from all embeddings.
    """
    vectors = doc.embedding.tolist()
    return np.array(vectors, dtype=np.float32)
def faiss_search(query_str, K=5):
    """Find the sentences of the selected decision most similar to a query.

    The query is embedded with the module-level ``model``, L2-normalised and
    searched against the module-level FAISS ``index``. Hits are mapped back
    to sentences by position in the module-level ``newdoc`` frame. None of
    the module-level state is modified.

    Args:
        query_str: Free-text query.
        K: Maximum number of hits to return. Coerced to ``int`` because UI
            sliders may deliver the value as a float.

    Returns:
        A multi-line string: a ``Top K results for: ...`` header followed by
        one ``Score: <score>\\t\\t<sentence>`` line per hit. If no decision
        has been selected yet, a short hint is returned instead.
    """
    # No decision selected yet: there is nothing to search.
    if index is None or newdoc is None:
        return "Velg en rettsavgjørelse først."

    requested = int(K)
    top_k_str = f"Top {requested} results for: {query_str}"

    # Never ask for more neighbours than the index holds; FAISS would pad
    # the result with id -1, which ``iloc`` would map to the last sentence.
    k = min(requested, index.ntotal)
    if k < 1:
        return f"{top_k_str}\n"

    # Shape (1, dim) float32, as FAISS expects; normalised in place so the
    # inner product equals cosine similarity.
    target_emb = np.array(model.encode([query_str]), dtype=np.float32).reshape(1, -1)
    faiss.normalize_L2(target_emb)
    scores, ids = index.search(target_emb, k)
    print(list(zip(scores[0], ids[0])))

    sentences = newdoc["sentences"]
    pretty_results = [
        (round(float(score), 3), sentences.iloc[int(pos)])
        for pos, score in zip(ids[0], scores[0])
        if pos >= 0  # defensive: skip FAISS "no neighbour" padding
    ]
    pretty_results_str = "\n".join(
        f"Score: {score}\t\t{sent}" for score, sent in pretty_results
    )
    return f"{top_k_str}\n{pretty_results_str}"
| # def DropdownSummary(): | |
| # next_opts = df.iloc[idx].summary.tolist() | |
| # return gr.Dropdown.update(choices=next_opts, label="Velg fra oppsummeringene") | |
# Dropdown choices: the URL of every decision, in corpus order. Read the
# column directly instead of materialising a Series per row with iterrows().
dropdown_opts = df["url"].tolist()
with gr.Blocks() as demo:
    gr.HTML(
        """
        <h1>Lovdata rettsavgjørelser - semantisk søk</h1>
        """
    )

    # Shown in the summary panel while no decision is selected.
    no_case_html = "<p>Velg en sak først</p>"

    def on_selection_change(selected_case):
        """Load the selected decision and rebuild the sentence index.

        Rebinds the module-level ``idx``, ``index`` and ``newdoc`` so that
        ``faiss_search`` operates on the newly selected decision.
        NOTE: this state is module-global, so it is shared by every user of
        the running app.

        Args:
            selected_case: URL chosen in the dropdown; may be ``None`` or
                unknown when the dropdown is cleared.

        Returns:
            A ``(summary_html, url_html)`` tuple of HTML strings for the
            summary list and the link to the decision.
        """
        global idx, index, newdoc
        import html  # stdlib; local import keeps this block self-contained

        # Positional lookup, because everything below uses ``iloc``.
        position = next(
            (pos for pos, url in enumerate(df["url"]) if url == selected_case),
            None,
        )
        if position is None:
            # Nothing (valid) selected: drop the previous search state.
            index = None
            newdoc = None
            return no_case_html, ""

        idx = position
        print("Selection changed!")
        print(selected_case)

        newdoc = build_doc_frame(df, idx)
        embeddings = get_doc_embeddings(newdoc)
        # Normalise in place so inner product equals cosine similarity.
        faiss.normalize_L2(embeddings)
        # Take the dimension from the data instead of hard-coding 768.
        index = faiss.IndexFlatIP(embeddings.shape[1])
        index.add(embeddings)

        # Render the predefined summary as an HTML list. Text is escaped so
        # characters like "<" or "&" in the data cannot break the markup.
        summary = df.iloc[idx]["summary"]
        items = "".join(f"<li>{html.escape(str(sent))}</li>" for sent in summary)
        summary_markup = f"<ul>{items}</ul>"

        safe_url = html.escape(str(selected_case), quote=True)
        url_markup = f"<a href='{safe_url}' target='_blank'>{safe_url}</a>"
        return summary_markup, url_markup

    # NOTE(review): the original indentation was lost, so the nesting of the
    # components below is a reconstruction - confirm the intended layout.
    with gr.Row():
        with gr.Column():
            case_dropdown = gr.Dropdown(
                label="Velg en rettsavgjørelse",
                choices=dropdown_opts,
            )
            # gr.HTML has no ``placeholder`` parameter; the initial content
            # is passed as ``value``.
            summary_html = gr.HTML(value=no_case_html, label="Predefinert oppsummering")
            case_url = gr.HTML(value="https://lovdata.no/...", label="URL til rettsavgjørelse")
        with gr.Column():
            query = gr.Textbox(
                label="Søk etter setninger",
                lines=1,
                placeholder="Kollisjon mellom to kjøretøy.",
            )
            k_slider = gr.Slider(minimum=1, maximum=10, label="Antall treff", value=5, step=1)
            search_btn = gr.Button("Søk")
            output = gr.Textbox(label="Resultater", lines=10)

    case_dropdown.change(
        on_selection_change,
        inputs=[case_dropdown],
        outputs=[summary_html, case_url],
    )
    search_btn.click(faiss_search, inputs=[query, k_slider], outputs=[output])

demo.launch()