Spaces:

HanLee
/

linkedin-learning

Sleeping

App Files Files Community

HanLee committed on Nov 17, 2023

Commit

4a49d79

1 Parent(s): 2dd27c9

feat: final

Browse files

Files changed (9) hide show

.chainlit/config.toml +78 -0
.env.sample +2 -0
.gitignore +166 -0
.vscode/settings.json +0 -1
app/app.py +207 -0
app/prompt.py +26 -0
chainlit.md +8 -0
requirements.txt +8 -0
sample_pdf/NVDA 2QFY24.pdf +0 -0

.chainlit/config.toml ADDED Viewed

	@@ -0,0 +1,78 @@

+[project]
+# Whether to enable telemetry (default: true). No personal data is collected.
+enable_telemetry = true
+# List of environment variables to be provided by each user to use the app.
+user_env = []
+# Duration (in seconds) during which the session is saved when the connection is lost
+session_timeout = 3600
+# Enable third-party caching (e.g., LangChain cache)
+cache = false
+# Follow symlink for asset mount (see https://github.com/Chainlit/chainlit/issues/317)
+# follow_symlink = false
+[features]
+# Show the prompt playground
+prompt_playground = true
+# Authorize users to upload files with messages
+multi_modal = true
+# Allows user to use speech to text
+[features.speech_to_text]
+    enabled = false
+    # See all languages here https://github.com/JamesBrill/react-speech-recognition/blob/HEAD/docs/API.md#language-string
+    # language = "en-US"
+[UI]
+# Name of the app and chatbot.
+name = "Chatbot"
+# Show the readme while the conversation is empty.
+show_readme_as_default = true
+# Description of the app and chatbot. This is used for HTML tags.
+# description = ""
+# Large content is collapsed by default for a cleaner UI
+default_collapse_content = true
+# The default value for the expand messages settings.
+default_expand_messages = false
+# Hide the chain of thought details from the user in the UI.
+hide_cot = false
+# Link to your github repo. This will add a github button in the UI's header.
+github = "https://github.com/LinkedInLearning/hands-on-ai-building-and-deploying-llm-powered-apps-4511409"
+# Specify a CSS file that can be used to customize the user interface.
+# The CSS file can be served from the public directory or via an external link.
+# custom_css = "/public/test.css"
+# Override default MUI light theme. (Check theme.ts)
+[UI.theme.light]
+    #background = "#FAFAFA"
+    #paper = "#FFFFFF"
+    [UI.theme.light.primary]
+        #main = "#F80061"
+        #dark = "#980039"
+        #light = "#FFE7EB"
+# Override default MUI dark theme. (Check theme.ts)
+[UI.theme.dark]
+    #background = "#FAFAFA"
+    #paper = "#FFFFFF"
+    [UI.theme.dark.primary]
+        #main = "#F80061"
+        #dark = "#980039"
+        #light = "#FFE7EB"
+[meta]
+generated_by = "0.7.501"

.env.sample ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ ALLOW_RESET=TRUE
2	+ OPENAI_API_KEY="sk-your-openai-api-key"

.gitignore CHANGED Viewed

@@ -1,4 +1,170 @@
 .DS_Store
 node_modules
 .tmp
 npm-debug.log

+# Ruff
+.ruff_cache/
+# Chainlit
+.chainlit/.langchain.db
+# Chroma
+.chromadb/
 .DS_Store
 node_modules
 .tmp
 npm-debug.log
+# VSCode
+.vscode/
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+# C extensions
+*.so
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+# Translations
+*.mo
+*.pot
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+# Flask stuff:
+instance/
+.webassets-cache
+# Scrapy stuff:
+.scrapy
+# Sphinx documentation
+docs/_build/
+# PyBuilder
+.pybuilder/
+target/
+# Jupyter Notebook
+.ipynb_checkpoints
+# IPython
+profile_default/
+ipython_config.py
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+#   in version control.
+#   https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+# SageMath parsed files
+*.sage.py
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+# Spyder project settings
+.spyderproject
+.spyproject
+# Rope project settings
+.ropeproject
+# mkdocs documentation
+/site
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+# Pyre type checker
+.pyre/
+# pytype static type analyzer
+.pytype/
+# Cython debug symbols
+cython_debug/

.vscode/settings.json CHANGED Viewed

@@ -17,7 +17,6 @@
   "files.autoSave": "afterDelay",
   "screencastMode.onlyKeyboardShortcuts": true,
   "terminal.integrated.fontSize": 18,
-  "workbench.activityBar.visible": true,
   "workbench.colorTheme": "Visual Studio Dark",
   "workbench.fontAliasing": "antialiased",
   "workbench.statusBar.visible": true

   "files.autoSave": "afterDelay",
   "screencastMode.onlyKeyboardShortcuts": true,
   "terminal.integrated.fontSize": 18,
   "workbench.colorTheme": "Visual Studio Dark",
   "workbench.fontAliasing": "antialiased",
   "workbench.statusBar.visible": true

app/app.py ADDED Viewed

	@@ -0,0 +1,207 @@

+# Chroma compatibility issue resolution
+# https://docs.trychroma.com/troubleshooting#sqlite
+__import__('pysqlite3')
+import sys
+sys.modules['sqlite3'] = sys.modules.pop('pysqlite3')
+from tempfile import NamedTemporaryFile
+import chainlit as cl
+from chainlit.types import AskFileResponse
+import chromadb
+from chromadb.config import Settings
+from langchain.chains import ConversationalRetrievalChain, RetrievalQAWithSourcesChain
+from langchain.chains.base import Chain
+from langchain.chat_models import ChatOpenAI
+from langchain.document_loaders import PDFPlumberLoader
+from langchain.embeddings.openai import OpenAIEmbeddings
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain.vectorstores import Chroma
+from langchain.vectorstores.base import VectorStore
+from prompt import EXAMPLE_PROMPT, PROMPT, WELCOME_MESSAGE
+# NOTE(review): module-level set that none of the functions shown in this
+# file read or write — confirm whether it is still needed.
+namespaces = set()
+def process_file(*, file: AskFileResponse) -> list:
+    if file.type != "application/pdf":
+        raise TypeError("Only PDF files are supported")
+    with NamedTemporaryFile() as tempfile:
+        tempfile.write(file.content)
+        ######################################################################
+        #
+        # 1. Load the PDF
+        #
+        ######################################################################
+        loader = PDFPlumberLoader(tempfile.name)
+        ######################################################################
+        documents = loader.load()
+        ######################################################################
+        #
+        # 2. Split the text
+        #
+        ######################################################################
+        text_splitter = RecursiveCharacterTextSplitter(
+            chunk_size=3000,
+            chunk_overlap=100
+        )
+        ######################################################################
+        docs = text_splitter.split_documents(documents)
+        for i, doc in enumerate(docs):
+            doc.metadata["source"] = f"source_{i}"
+        if not docs:
+            raise ValueError("PDF file parsing failed.")
+        return docs
+def create_search_engine(*, file: AskFileResponse) -> VectorStore:
+    # Process and save data in the user session
+    docs = process_file(file=file)
+    cl.user_session.set("docs", docs)
+    ##########################################################################
+    #
+    # 3. Set the Encoder model for creating embeddings
+    #
+    ##########################################################################
+    encoder = OpenAIEmbeddings(
+        model="text-embedding-ada-002"
+    )
+    ##########################################################################
+    # Initialize Chromadb client and settings, reset to ensure we get a clean
+    # search engine
+    client = chromadb.EphemeralClient()
+    client_settings=Settings(
+        allow_reset=True,
+        anonymized_telemetry=False
+    )
+    search_engine = Chroma(
+        client=client,
+        client_settings=client_settings
+    )
+    search_engine._client.reset()
+    ##########################################################################
+    #
+    # 4. Create the document search engine. Remember to add
+    # client_settings using the above settings.
+    #
+    ##########################################################################
+    search_engine = Chroma.from_documents(
+        client=client,
+        documents=docs,
+        embedding=encoder,
+        client_settings=client_settings
+    )
+    ##########################################################################
+    return search_engine
+@cl.on_chat_start
+async def start():
+    files = None
+    while files is None:
+        files = await cl.AskFileMessage(
+            content=WELCOME_MESSAGE,
+            accept=["application/pdf"],
+            max_size_mb=20,
+        ).send()
+    file = files[0]
+    msg = cl.Message(content=f"Processing `{file.name}`...")
+    await msg.send()
+    try:
+        search_engine = await cl.make_async(create_search_engine)(file=file)
+    except Exception as e:
+        await cl.Message(content=f"Error: {e}").send()
+        raise SystemError
+    llm = ChatOpenAI(
+        model='gpt-3.5-turbo-16k-0613',
+        temperature=0,
+        streaming=True
+    )
+    ##########################################################################
+    #
+    # 5. Create the chain / tool for RetrievalQAWithSourcesChain.
+    #
+    ##########################################################################
+    chain = RetrievalQAWithSourcesChain.from_chain_type(
+        llm=llm,
+        chain_type="stuff",
+        retriever=search_engine.as_retriever(max_tokens_limit=4097),
+        ######################################################################
+        # 6. Customize prompts to improve summarization and question
+        # answering performance. Perhaps create your own prompt in prompts.py?
+        ######################################################################
+        chain_type_kwargs={
+            "prompt": PROMPT,
+            "document_prompt": EXAMPLE_PROMPT
+        },
+    )
+    ##########################################################################
+    # await msg.update(content=f"`{file.name}` processed. You can now ask questions!")
+    msg.content = f"`{file.name}` processed. You can now ask questions!"
+    await msg.update()
+    cl.user_session.set("chain", chain)
+@cl.on_message
+async def main(message: cl.Message):
+    chain = cl.user_session.get("chain")  # type: ConversationalRetrievalChain
+    cb = cl.AsyncLangchainCallbackHandler()
+    response = await chain.acall(message.content, callbacks=[cb])
+    answer = response["answer"]
+    sources = response["sources"].strip()
+    source_elements = []
+    # Get the documents from the user session
+    docs = cl.user_session.get("docs")
+    metadatas = [doc.metadata for doc in docs]
+    all_sources = [m["source"] for m in metadatas]
+    # Adding sources to the answer
+    if sources:
+        found_sources = []
+        # Add the sources to the message
+        for source in sources.split(","):
+            source_name = source.strip().replace(".", "")
+            # Get the index of the source
+            try:
+                index = all_sources.index(source_name)
+            except ValueError:
+                continue
+            text = docs[index].page_content
+            found_sources.append(source_name)
+            # Create the text element referenced in the message
+            source_elements.append(cl.Text(content=text, name=source_name))
+        if found_sources:
+            answer += f"\nSources: {', '.join(found_sources)}"
+        else:
+            answer += "\nNo sources found"
+    await cl.Message(content=answer, elements=source_elements).send()

app/prompt.py ADDED Viewed

	@@ -0,0 +1,26 @@

+# flake8: noqa
+from langchain.prompts import PromptTemplate
+WELCOME_MESSAGE = """\
+Welcome to Introduction to LLM App Development Sample PDF QA Application!
+To get started:
+1. Upload a PDF or text file
+2. Ask any question about the file!
+"""
+template = """Given the following extracted parts of a long document and a question, create a final answer with references ("SOURCES").
+If you don't know the answer, just say that you don't know. Don't try to make up an answer.
+ALWAYS return a "SOURCES" field in your answer, with the format "SOURCES: <source1>, <source2>, <source3>, ...".
+QUESTION: {question}
+=========
+{summaries}
+=========
+FINAL ANSWER:"""
+PROMPT = PromptTemplate(template=template, input_variables=["summaries", "question"])
+EXAMPLE_PROMPT = PromptTemplate(
+    template="Content: {page_content}\nSource: {source}",
+    input_variables=["page_content", "source"],
+)

chainlit.md ADDED Viewed

	@@ -0,0 +1,8 @@

+# Welcome to your PDF QA Sample Application! 🚀🤖
+Hi Team! 👋 Congratulations on launching your first LLM Application. This application is built using OpenAI, Langchain, Chainlit, and Chroma. The goal of this application is to provide a quick overview of the most basic archetype of an LLM application and the prototyping and debugging environment.
+## Useful Links 🔗
+- **Langchain Documentation:** Get started with [Langchain Documentation](https://python.langchain.com/) 🔗
+- **Chainlit Documentation:** Get started with [Chainlit Documentation](https://docs.chainlit.io) 📚

requirements.txt CHANGED Viewed

	@@ -1 +1,9 @@
1	# Specify Python package requirements for your project here (e.g., Mako==1.1.1). If your project doesn't require these, you can leave this file unchanged or delete it.

 # Specify Python package requirements for your project here (e.g., Mako==1.1.1). If your project doesn't require these, you can leave this file unchanged or delete it.
+openai==1.2.3
+langchain==0.0.334
+chainlit==0.7.501
+tiktoken==0.5.1
+pdfplumber==0.10.3
+chromadb==0.4.17
+pysqlite3-binary==0.5.2.post1
+ruff==0.1.5

sample_pdf/NVDA 2QFY24.pdf ADDED Viewed

Binary file (85.3 kB). View file