Spaces:
Sleeping
Sleeping
| from typing import Any | |
| from langchain.chat_models import ChatOpenAI | |
| from langchain.output_parsers import NumberedListOutputParser | |
| from langchain.prompts import ChatPromptTemplate | |
| from utils import str_to_list | |
# Prompt sent to the chat model for keyword extraction.
# - `{query}` is the only template variable; KeywordExtractor fills it in with
#   the user's query when the chain is invoked.
# - The prompt asks for a numbered list of "Keyword/Phrase - Category" lines;
#   KeywordExtractor.__call__ relies on that " - " separator when it filters
#   keywords by category, so keep the two in sync.
query_template = """
You are a bi-lingual (french and english) linguistic teacher working at a top-tier university.
We are conducting a research project that requires the extraction of keywords from chatbot queries.
Below, you will find a query. Please identify and rank the three most important keywords or phrases (n-grams) based on their relevance to the main topic of the query.
For each keyword or phrase, assign it to one of the following categories: ["University / Company", "Research domain", "Country", "Name", "Other"].
An 'n-gram' refers to a contiguous sequence of words, where 'n' can be 1 for a single word, 2 for a pair of words, and so on, up to two words in length.
Please ensure not to list more than three n-grams in total.
Your expertise in linguistic analysis is crucial for the success of this project. Thank you for your contribution.
Please attach your ranked list in the following format:
1. Keyword/Phrase - Category
2. Keyword/Phrase - Category
3. Keyword/Phrase - Category
You must be concise and don't need to justify your choices.
```
{query}
```
"""

# Shared parser used by KeywordExtractor.__call__ to turn the model's
# numbered-list reply into a list of strings (one per list item).
output_parser = NumberedListOutputParser()
# NOTE(review): format_instructions is not referenced anywhere in the visible
# part of this file and is not injected into query_template — confirm whether
# it is used elsewhere before removing it.
format_instructions = output_parser.get_format_instructions()
class KeywordExtractor:
    """Extract category-filtered keywords from a free-text query via a chat model.

    The instance is callable: it formats ``query_template`` with the query,
    sends it to ``ChatOpenAI``, parses the numbered-list reply with the
    module-level ``output_parser`` and keeps only the keywords whose category
    is in the requested set.
    """

    # Categories kept when the caller does not pass ``filter_categories``.
    # Stored as a tuple so the default can never be mutated between calls.
    DEFAULT_FILTER_CATEGORIES = ("Research domain",)

    # Separator between keyword and category that the prompt asks the model
    # to use ("Keyword/Phrase - Category").
    _SEPARATOR = " - "

    def __init__(self):
        """Build the prompt -> model chain used by ``__call__``."""
        super().__init__()
        self.model = ChatOpenAI()
        self.prompt = ChatPromptTemplate.from_template(
            template=query_template,
        )
        # The output parser is deliberately not part of the chain; __call__
        # applies it to the reply's ``content`` instead.
        self.chain = self.prompt | self.model

    @staticmethod
    def _filter_keywords(keywords, filter_categories):
        """Return the keywords from ``keywords`` whose category is wanted.

        Args:
            keywords: Iterable of strings shaped like ``"keyword - category"``.
            filter_categories: Container of category names to keep.

        Returns:
            A list of keyword strings, in input order. Entries without the
            ``" - "`` separator are skipped.
        """
        selected = []
        for entry in keywords:
            # Split on the LAST separator: the category is always the final
            # field, while the keyword itself may legitimately contain " - ".
            keyword, separator, category = entry.rpartition(
                KeywordExtractor._SEPARATOR
            )
            if not separator:
                continue
            # Tolerate stray whitespace around either field in the model output.
            if category.strip() in filter_categories:
                selected.append(keyword.strip())
        return selected

    def __call__(
        self, inputs: str, filter_categories: "list[str] | None" = None
    ) -> list[str]:
        """Run keyword extraction on ``inputs``.

        Args:
            inputs: The query text to extract keywords from.
            filter_categories: Categories to keep. Defaults to
                ``["Research domain"]`` when omitted or ``None``.

        Returns:
            The keywords (without their category) whose category is in
            ``filter_categories``, in the order the model ranked them.
        """
        if filter_categories is None:
            filter_categories = list(self.DEFAULT_FILTER_CATEGORIES)

        output = self.chain.invoke({"query": inputs})
        keywords = output_parser.parse(output.content)
        return self._filter_keywords(keywords, filter_categories)