File size: 2,210 Bytes
5dacc4a
 
8540184
f058a76
7e62e79
80d5eef
 
5dacc4a
213df71
 
 
 
 
 
 
 
 
 
 
f058a76
 
cac1fed
f058a76
 
 
 
 
 
 
 
 
 
 
8efdfea
 
 
 
 
 
 
 
 
 
 
 
 
 
213df71
648283c
3f41bd5
8efdfea
97c0116
 
8efdfea
 
139459b
bbfa082
213df71
7e25f90
50281b0
e292ca9
 
24e14d1
e292ca9
24e14d1
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
import streamlit as st
from transformers import pipeline
import sqlite3
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
# NOTE(review): removed `pipe = pipeline('sentiment-analysis')` — it downloaded
# and loaded a full sentiment model at startup but `pipe` was never used
# anywhere in this file.

# Intro text; it doubles as the label of the text area below. The variable is
# then rebound to the user's input, which the rest of the script consumes.
text = """
Welcome to SorboBot, a Hugging Face Space designed to revolutionize the way you find published articles.

Powered by a full export from ScanR and Hal at Sorbonne University, SorboBot utilizes advanced language model technology to provide you with a list of published articles based on your prompt

Work in progress

Write your request:
"""
text = st.text_area(text)


if text:
    # ---- Keyword extraction (KeyBERT-style) --------------------------------
    n_gram_range = (2, 2)   # candidate phrases are bigrams
    stop_words = "english"

    # Extract candidate words/phrases from the user's prompt.
    try:
        count = CountVectorizer(ngram_range=n_gram_range, stop_words=stop_words).fit([text])
    except ValueError:
        # CountVectorizer raises ValueError when the prompt yields no valid
        # bigrams (e.g. only stop words); fail gracefully instead of crashing.
        st.write("Could not extract keywords from your request, please add more detail.")
        st.stop()
    candidates = count.get_feature_names_out()

    # Embed the whole prompt and each candidate phrase, then keep the top_n
    # candidates whose embeddings are closest (cosine similarity) to the prompt.
    model = SentenceTransformer('distilbert-base-nli-mean-tokens')
    doc_embedding = model.encode([text])
    candidate_embeddings = model.encode(candidates)
    top_n = 5
    distances = cosine_similarity(doc_embedding, candidate_embeddings)
    keywords = [candidates[index] for index in distances.argsort()[0][-top_n:]]

    # ---- Database lookup ---------------------------------------------------
    conn = sqlite3.connect('SU_CSV.db')
    try:
        cursor = conn.cursor()
        mots_cles_recherches = keywords
        # Build the SQL query: one parameterized LIKE clause per keyword,
        # OR-ed together. `keywords` is non-empty here (candidates is
        # non-empty after the guard above), so the WHERE clause is valid.
        query = f"SELECT title_s FROM BDD_Provisoire_SU WHERE {' OR '.join(['keyword_s LIKE ?'] * len(mots_cles_recherches))}"
        params = ['%' + mot + '%' for mot in mots_cles_recherches]

        cursor.execute(query, params)
        resultats = cursor.fetchall()
    finally:
        # Close the connection even when the query fails (it was leaked on
        # error before).
        conn.close()

    # Display the titles of the articles found (at most 3).
    if resultats:
        st.write("Titles of articles corresponding to your search:")
        for row in resultats[:3]:
            st.json(row[0])
    else:
        st.write("No article found in the database\n\n")
        st.json({})

    # ---- Text generation ---------------------------------------------------
    generator = pipeline("text-generation", model="gpt2") # to modify for another model
    txt = generator(
        text,
        max_length=150,
        num_return_sequences=1,
    )[0]["generated_text"]

    st.write("Model output")
    st.write(txt)