Spaces:

SorbonneUniversity
/

SorboBot

Running

App Files Community

Léo Bourrel committed on Nov 17, 2023

Commit

39a3f86

1 Parent(s): 8ee0a1b

feat: install pre-commit && clean

Browse files

Files changed (9) hide show

.pre-commit-config.yaml +60 -0
Dockerfile +1 -1
execution.sh +1 -1
requirements.txt +3 -1
setup.py +0 -1
sorbobotapp/app.py +16 -8
sorbobotapp/conversation_retrieval_chain.py +3 -1
sorbobotapp/static/styles.css +1 -1
sorbobotapp/vector_store.py +7 -5

.pre-commit-config.yaml ADDED Viewed

	@@ -0,0 +1,60 @@

+default_install_hook_types:
+# Required so that `pre-commit install` sets up both the pre-commit and pre-push hooks (see https://pre-commit.com/#top_level-default_install_hook_types)
+# Add new hook types here to ensure automatic installation when running `pre-commit install`
+- pre-commit
+- pre-push
+repos:
+- repo: https://github.com/pre-commit/pre-commit-hooks
+  rev: v4.3.0
+  hooks:
+  - id: trailing-whitespace
+  - id: end-of-file-fixer
+  - id: check-yaml
+  - id: check-json
+  - id: check-added-large-files
+- repo: https://github.com/srstevenson/nb-clean
+  rev: 3.0.0
+  hooks:
+    - id: nb-clean
+      args:
+      - --remove-empty-cells
+      - --preserve-cell-metadata
+      - --
+# - repo: https://github.com/pre-commit/mirrors-mypy
+#   rev: 'v1.5.1'
+#   hooks:
+#   -   id: mypy
+- repo: local
+  hooks:
+  - id: black
+    name: Formatting (black)
+    entry: black
+    language: system
+    types: [python]
+    stages: [commit]
+  # - id: ruff
+  #   name: Linter (ruff)
+  #   entry: ruff
+  #   language: system
+  #   types: [python]
+  #   stages: [commit]
+  # - id: test
+  #   name: Unit tests (pytest)
+  #   entry: make test
+  #   pass_filenames: false
+  #   language: system
+  #   types: [python]
+  #   stages: [push]
+  # - id: dvc-pre-push
+  #   name: DVC pre-push
+  #   entry: dvc
+  #   args:
+  #     - git-hook
+  #     - pre-push
+  #   require_serial: true
+  #   verbose: true
+  #   language: system
+  #   stages: [push]

Dockerfile CHANGED Viewed

@@ -53,4 +53,4 @@ STOPSIGNAL SIGINT
 HEALTHCHECK CMD curl --fail http://localhost:7860/_stcore/health
-CMD ["postgres"]


53
54	HEALTHCHECK CMD curl --fail http://localhost:7860/_stcore/health
55
56	+ CMD ["postgres"]

execution.sh CHANGED Viewed

@@ -4,4 +4,4 @@ bash /usr/local/bin/docker-entrypoint.sh "$@" &
 postgres &
 sleep 2
-streamlit run sorbobotapp/app.py --server.port=7860 --server.address=0.0.0.0

 postgres &
 sleep 2
+streamlit run sorbobotapp/app.py --server.port=7860 --server.address=0.0.0.0

requirements.txt CHANGED Viewed

@@ -1,12 +1,14 @@
 gpt4all==1.0.12
 langchain==0.0.313
 openai==0.28.1
 pandas==2.1.1
 pgvector==0.2.3
 psycopg2-binary==2.9.9
 psycopg2==2.9.9
 streamlit==1.27.2
 streamlit-chat==0.1.1
 SQLAlchemy==2.0.22
 sqlite-vss==0.1.2
-tiktoken==0.5.1

+black==23.11.0
 gpt4all==1.0.12
 langchain==0.0.313
 openai==0.28.1
 pandas==2.1.1
 pgvector==0.2.3
+pre-commit==3.5.0
 psycopg2-binary==2.9.9
 psycopg2==2.9.9
 streamlit==1.27.2
 streamlit-chat==0.1.1
 SQLAlchemy==2.0.22
 sqlite-vss==0.1.2
+tiktoken==0.5.1

setup.py CHANGED Viewed

@@ -8,4 +8,3 @@ setup(
     authors=["Leo Bourrel <[email protected]>"],
     package_dir={"": "sorbobotapp"},
 )

     authors=["Leo Bourrel <[email protected]>"],
     package_dir={"": "sorbobotapp"},
 )

sorbobotapp/app.py CHANGED Viewed

@@ -45,8 +45,12 @@ def send_message_callback():
             )
             st.session_state.token_count += cb.total_tokens
             if os.environ.get("ENVIRONMENT") == "dev":
-                history_id = insert_chat_history(conn, human_prompt, llm_response["answer"])
-                insert_chat_history_articles(conn, history_id, llm_response["source_documents"])
 def exemple_message_callback_button(args):
@@ -90,10 +94,10 @@ with chat_column:
         for chat in st.session_state.history:
             div = f"""
-                <div class="chat-row
                     {'' if chat.origin == 'ai' else 'row-reverse'}">
                     <img class="chat-icon" src="https://cdn-icons-png.flaticon.com/512/{
-                        '1129/1129398.png' if chat.origin == 'ai'
                                     else '1077/1077012.png'}"
                         width=32 height=32>
                     <div class="chat-bubble
@@ -128,16 +132,18 @@ with chat_column:
                 exemple,
                 key=f"{idx_exemple}_button",
                 on_click=exemple_message_callback_button,
-                args=(exemple,)
             )
-    st.button(":new: Start a new conversation", on_click=clear_history, type="secondary")
     if os.environ.get("ENVIRONMENT") == "dev":
         information_placeholder.caption(
             f"""
         Used {st.session_state.token_count} tokens \n
-        Debug Langchain conversation:
         {st.session_state.history}
         """
         )
@@ -175,7 +181,9 @@ with doc_column:
             doc_metadata = doc.metadata
             expander = st.expander(doc_content["title"])
-            expander.markdown(f"**HalID** : https://hal.science/{doc_metadata['hal_id']}")
             expander.markdown(doc_metadata["abstract"])
             expander.markdown(f"**Authors** : {doc_content['authors']}")
             expander.markdown(f"**Keywords** : {doc_content['keywords']}")

             )
             st.session_state.token_count += cb.total_tokens
             if os.environ.get("ENVIRONMENT") == "dev":
+                history_id = insert_chat_history(
+                    conn, human_prompt, llm_response["answer"]
+                )
+                insert_chat_history_articles(
+                    conn, history_id, llm_response["source_documents"]
+                )
 def exemple_message_callback_button(args):
         for chat in st.session_state.history:
             div = f"""
+                <div class="chat-row
                     {'' if chat.origin == 'ai' else 'row-reverse'}">
                     <img class="chat-icon" src="https://cdn-icons-png.flaticon.com/512/{
+                        '1129/1129398.png' if chat.origin == 'ai'
                                     else '1077/1077012.png'}"
                         width=32 height=32>
                     <div class="chat-bubble
                 exemple,
                 key=f"{idx_exemple}_button",
                 on_click=exemple_message_callback_button,
+                args=(exemple,),
             )
+    st.button(
+        ":new: Start a new conversation", on_click=clear_history, type="secondary"
+    )
     if os.environ.get("ENVIRONMENT") == "dev":
         information_placeholder.caption(
             f"""
         Used {st.session_state.token_count} tokens \n
+        Debug Langchain conversation:
         {st.session_state.history}
         """
         )
             doc_metadata = doc.metadata
             expander = st.expander(doc_content["title"])
+            expander.markdown(
+                f"**HalID** : https://hal.science/{doc_metadata['hal_id']}"
+            )
             expander.markdown(doc_metadata["abstract"])
             expander.markdown(f"**Authors** : {doc_content['authors']}")
             expander.markdown(f"**Keywords** : {doc_content['keywords']}")

sorbobotapp/conversation_retrieval_chain.py CHANGED Viewed

@@ -5,7 +5,9 @@ from typing import Any, Dict, Optional
 from keyword_extraction import KeywordExtractor
 from langchain.callbacks.manager import CallbackManagerForChainRun
 from langchain.chains.conversational_retrieval.base import (
-    ConversationalRetrievalChain, _get_chat_history)
 from langchain.schema import Document

 from keyword_extraction import KeywordExtractor
 from langchain.callbacks.manager import CallbackManagerForChainRun
 from langchain.chains.conversational_retrieval.base import (
+    ConversationalRetrievalChain,
+    _get_chat_history,
+)
 from langchain.schema import Document

sorbobotapp/static/styles.css CHANGED Viewed

@@ -33,4 +33,4 @@
 .chat-icon {
     border-radius: 5px;
-}

 .chat-icon {
     border-radius: 5px;
+}

sorbobotapp/vector_store.py CHANGED Viewed

@@ -222,7 +222,7 @@ class CustomVectorStore(VectorStore):
         return self._results_to_docs_and_scores(results)
     @staticmethod
-    def _fetch_title(title:str, abstract:str):
         if len(title) > 0:
             return title
         return abstract.split(".")[0]
@@ -234,7 +234,9 @@ class CustomVectorStore(VectorStore):
                 Document(
                     page_content=json.dumps(
                         {
-                            "title": self._fetch_title(result["title"][0], result["abstract"][0]),
                             "authors": result["authors"],
                             "keywords": result["keywords"],
                         }
@@ -271,14 +273,14 @@ class CustomVectorStore(VectorStore):
                         a.doi,
                         a.hal_id,
                         a.abstract_en,
-                        string_agg(distinct keyword."name", ', ') as keywords,
                         string_agg(distinct author."name", ', ') as authors,
                         abstract_embedding_en {self.distance_strategy} '{str(embedding)}' as distance
                     from article a
-                    left join article_keyword ON article_keyword.article_id = a.id
                     left join keyword on article_keyword.keyword_id = keyword.id
                     left join article_author ON article_author.article_id = a.id
-                    left join author on author.id = article_author.author_id
                     where
                         abstract_en != '' and
                         abstract_en != 'None' and

         return self._results_to_docs_and_scores(results)
     @staticmethod
+    def _fetch_title(title: str, abstract: str):
         if len(title) > 0:
             return title
         return abstract.split(".")[0]
                 Document(
                     page_content=json.dumps(
                         {
+                            "title": self._fetch_title(
+                                result["title"][0], result["abstract"][0]
+                            ),
                             "authors": result["authors"],
                             "keywords": result["keywords"],
                         }
                         a.doi,
                         a.hal_id,
                         a.abstract_en,
+                        string_agg(distinct keyword."name", ', ') as keywords,
                         string_agg(distinct author."name", ', ') as authors,
                         abstract_embedding_en {self.distance_strategy} '{str(embedding)}' as distance
                     from article a
+                    left join article_keyword ON article_keyword.article_id = a.id
                     left join keyword on article_keyword.keyword_id = keyword.id
                     left join article_author ON article_author.article_id = a.id
+                    left join author on author.id = article_author.author_id
                     where
                         abstract_en != '' and
                         abstract_en != 'None' and