import requests
from sentence_transformers import SentenceTransformer, CrossEncoder, util
import os, re
import torch
from rank_bm25 import BM25Okapi
from sklearn.feature_extraction import _stop_words
import string
import numpy as np
import pandas as pd
import base64
from io import StringIO
import validators
import nltk
import warnings
import streamlit as st
from PIL import Image
from beir.datasets.data_loader_hf import HFDataLoader
from beir.reranking.models.mono_t5 import MonoT5

warnings.filterwarnings("ignore")

auth_token = os.environ.get("auth_token")
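# Note: the token is read from the environment (presumably a Space secret named
# "auth_token") and passed to SentenceTransformer below to access gated models.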
def load_data(dataset_type):
    corpus, queries, qrels = HFDataLoader(hf_repo="clarin-knext/" + dataset_type, streaming=False, keep_in_memory=False).load(split="test")
    corpus = [doc['text'] for doc in corpus]
    queries = [query['text'] for query in queries]
    return queries, corpus
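# HFDataLoader also returns qrels (query-passage relevance judgments); they are
# discarded here because the demo only needs the raw query and passage texts,
# not evaluation labels.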
def bi_encode(bi_encoder_name, passages, dataset_name='scifact-pl'):
    global bi_encoder
    # We use the bi-encoder to encode all passages, so that we can run semantic search over them
    bi_encoder = SentenceTransformer(bi_encoder_name, use_auth_token=auth_token)
    # This code would embed the passages on the fly; to keep the demo fast, we
    # load pre-computed embedding tensors instead:
    # with st.spinner('Encoding passages into a vector space...'):
    #     if bi_encoder_name == 'intfloat/multilingual-e5-base':
    #         corpus_embeddings = bi_encoder.encode(['passage: ' + sentence for sentence in passages], convert_to_tensor=True)
    #     else:
    #         corpus_embeddings = bi_encoder.encode(passages, convert_to_tensor=True)
    with st.spinner('Loading encoded passages...'):
        if bi_encoder_name == "sentence-transformers/distiluse-base-multilingual-cased-v1":
            name = 'distiluse-base-multilingual-cased-v1'
        elif bi_encoder_name == 'intfloat/multilingual-e5-base':
            name = 'multilingual-e5-base'
        elif bi_encoder_name == 'nthakur/mcontriever-base-msmarco':
            name = 'mcontriever'
        corpus_embeddings_name = "-".join([name, dataset_name, "corpus"])
        corpus_embeddings = torch.load(corpus_embeddings_name, map_location=torch.device('cpu'))
        st.success(f"Embeddings loaded. Shape: {corpus_embeddings.shape}")
    return bi_encoder, corpus_embeddings
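# A minimal sketch of how the cached tensor files loaded above could have been
# produced offline (assumption: the files are plain torch.save dumps of the
# encoded corpus, named "<model>-<dataset>-corpus"; the e5 model needs the
# 'passage: ' prefix, as in the commented-out code above):
#
# corpus_embeddings = bi_encoder.encode(passages, convert_to_tensor=True)
# torch.save(corpus_embeddings, "multilingual-e5-base-scifact-pl-corpus")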
def cross_encode(cross_encoder_name):
    global cross_encoder
    # The bi-encoder retrieves a list of candidate passages; we use a cross-encoder
    # to re-rank that list and improve result quality
    if cross_encoder_name == "clarin-knext/plt5-base-msmarco":
        cross_encoder = MonoT5(cross_encoder_name, use_amp=False, token_true='▁prawda', token_false='▁fałsz')
    else:
        cross_encoder = CrossEncoder(cross_encoder_name)  # e.g. 'cross-encoder/mmarco-mMiniLMv2-L12-H384-v1'
    return cross_encoder
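# MonoT5 scores a (query, passage) pair by comparing the logits of a designated
# "true" and "false" output token; for the Polish plT5 re-ranker those tokens
# are '▁prawda' ("true") and '▁fałsz' ("false").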
def bm25_tokenizer(text):
    # We also compare the results to lexical search (keyword search). Here, we use
    # the BM25 algorithm as implemented in the rank_bm25 package.
    # We lowercase our text and remove stop-words from indexing
    tokenized_doc = []
    for token in text.lower().split():
        token = token.strip(string.punctuation)
        if len(token) > 0 and token not in _stop_words.ENGLISH_STOP_WORDS:
            tokenized_doc.append(token)
    return tokenized_doc
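# Note: the filter uses scikit-learn's English stop-word list even though the
# BEIR-PL corpora are Polish, so in practice few tokens are removed; a Polish
# stop-word list would likely filter more aggressively.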
def bm25_api(passages):
    tokenized_corpus = []
    for passage in passages:
        tokenized_corpus.append(bm25_tokenizer(passage))
    bm25 = BM25Okapi(tokenized_corpus)
    return bm25
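# Usage sketch (illustrative only; the query string is hypothetical) — this is
# exactly how search_func() below scores the corpus:
#
# bm25 = bm25_api(passages)
# scores = bm25.get_scores(bm25_tokenizer("some example query"))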
bi_enc_options = ["sentence-transformers/distiluse-base-multilingual-cased-v1", 'intfloat/multilingual-e5-base', 'nthakur/mcontriever-base-msmarco']
# Other candidates: "all-mpnet-base-v2", "multi-qa-MiniLM-L6-cos-v1", 'intfloat/e5-base-v2', "neeva/query2query"
cross_enc_options = ['clarin-knext/plt5-base-msmarco', 'clarin-knext/herbert-base-reranker-msmarco', 'cross-encoder/mmarco-mMiniLMv2-L12-H384-v1']
datasets_options = ["nfcorpus-pl", "scifact-pl", "fiqa-pl"]
def display_df_as_table(model, top_k, score='score'):
    # Display the hits with text and scores as a table
    df = pd.DataFrame([(hit[score], passages[hit['corpus_id']]) for hit in model[0:top_k]], columns=['Score', 'Text'])
    df['Score'] = round(df['Score'], 2)
    return df
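# The `score` argument selects which field of each hit ('score' for BM25 and
# bi-encoder results, 'cross-score' for re-ranked results) becomes the Score
# column of the rendered table.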
# Streamlit App
st.title("Retrieval BEIR-PL Demo")
"""
Example of retrieval over the BEIR-PL dataset.
"""

# window_size = st.sidebar.slider("Paragraph Window Size", min_value=1, max_value=10, value=3, key='slider')
st.sidebar.title("Menu")
dataset_type = st.sidebar.selectbox("Dataset", options=datasets_options, key='dataset_select')
bi_encoder_type = st.sidebar.selectbox("Bi-Encoder", options=bi_enc_options, key='bi_select')
cross_encoder_type = st.sidebar.selectbox("Cross-Encoder", options=cross_enc_options, key='cross_select')
top_k = st.sidebar.slider("Number of Top Hits Generated", min_value=1, max_value=5, value=2)
hide_bm25 = st.sidebar.checkbox("Hide BM25 results?")
hide_biencoder = st.sidebar.checkbox("Hide Bi-Encoder results?")
hide_crossencoder = st.sidebar.checkbox("Hide Cross-Encoder results?")
# This function searches all passages of the selected dataset for those that
# answer the query
def search_func(query, bi_encoder_type, top_k=top_k):
    global bi_encoder, cross_encoder

    st.subheader(f"Search Query:\n_{query}_")

    ##### BM25 search (lexical search) #####
    bm25_scores = bm25.get_scores(bm25_tokenizer(query))
    top_n = np.argpartition(bm25_scores, -5)[-5:]
    bm25_hits = [{'corpus_id': idx, 'score': bm25_scores[idx]} for idx in top_n]
    bm25_hits = sorted(bm25_hits, key=lambda x: x['score'], reverse=True)

    if not hide_bm25:
        st.subheader(f"Top-{top_k} lexical search (BM25) hits")
        bm25_df = display_df_as_table(bm25_hits, top_k)
        st.write(bm25_df.to_html(index=False), unsafe_allow_html=True)

    ##### Semantic Search #####
    # Encode the query using the bi-encoder and find potentially relevant passages
    if bi_encoder_type == 'intfloat/multilingual-e5-base':
        question_embedding = bi_encoder.encode("query: " + query, convert_to_tensor=True)
    else:
        question_embedding = bi_encoder.encode(query, convert_to_tensor=True)
    question_embedding = question_embedding.cpu()
    HITS_NUM = 20  # Number of hits passed to the re-ranker
    hits = util.semantic_search(question_embedding, corpus_embeddings, top_k=HITS_NUM, score_function=util.dot_score)
    hits = hits[0]  # Get the hits for the first query

    ##### Re-Ranking #####
    # Now, score all retrieved passages with the cross-encoder
    cross_inp = [[query, passages[hit['corpus_id']]] for hit in hits]
    cross_scores = cross_encoder.predict(cross_inp)

    # Attach the cross-encoder scores to the hits; sorting happens per view below
    for idx in range(len(cross_scores)):
        hits[idx]['cross-score'] = cross_scores[idx]

    if not hide_biencoder:
        # Output of top-k hits from the bi-encoder
        st.markdown("\n-------------------------\n")
        st.subheader(f"Top-{top_k} Bi-Encoder Retrieval hits")
        hits = sorted(hits, key=lambda x: x['score'], reverse=True)
        biencoder_df = display_df_as_table(hits, top_k)
        st.write(biencoder_df.to_html(index=False), unsafe_allow_html=True)

    if not hide_crossencoder:
        # Output of top-k hits from the re-ranker
        st.markdown("\n-------------------------\n")
        st.subheader(f"Top-{top_k} Cross-Encoder Re-ranker hits")
        hits = sorted(hits, key=lambda x: x['cross-score'], reverse=True)
        rerank_df = display_df_as_table(hits, top_k, 'cross-score')
        st.write(rerank_df.to_html(index=False), unsafe_allow_html=True)

    st.markdown("---")
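# The pipeline above runs three retrieval stages side by side: a BM25 lexical
# baseline over the tokenized corpus, dense retrieval via dot-product semantic
# search over the pre-computed bi-encoder embeddings, and cross-encoder
# re-ranking of the top HITS_NUM dense candidates.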
def clear_text():
    st.session_state["text_input"] = ""
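# clear_text() is registered as the on_click callback of the "Clear Text Input"
# button below; writing to st.session_state["text_input"] resets the text_input
# widget because they share the same key.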
question, passages = load_data(dataset_type)

st.write(pd.DataFrame(question[:5], columns=["Example queries from dataset"]).to_html(index=False, justify='center'), unsafe_allow_html=True)

search_query = st.text_input("Ask your question:",
                             value=question[0],
                             key="text_input")

col1, col2 = st.columns(2)
with col1:
    search = st.button("Search", key='search_but', help='Click to Search!')
with col2:
    clear = st.button("Clear Text Input", on_click=clear_text, key='clear', help='Click to clear the search query')

if search:
    if bi_encoder_type:
        with st.spinner(
            text=f"Loading the {bi_encoder_type} bi-encoder and its pre-computed corpus embeddings. This might take a few seconds..."
        ):
            bi_encoder, corpus_embeddings = bi_encode(bi_encoder_type, passages, dataset_name=dataset_type)
            cross_encoder = cross_encode(cross_encoder_type)
            bm25 = bm25_api(passages)

        with st.spinner(
                text="Embeddings loaded, searching for passages relevant to the query..."):
            search_func(search_query, bi_encoder_type, top_k)

st.markdown("""
""")