Update app.py
app.py
CHANGED
@@ -217,34 +217,6 @@ MODEL_TITLE = """
 </div>
 </div>
 """
-# <a href='https://arxiv.org/pdf/2312.00738.pdf'><img src='https://img.shields.io/badge/Paper-PDF-red'></a>
-# MODEL_DESC = """
-# <div style='display:flex; gap: 0.25rem; '>
-# <a href='https://github.com/SeaLLMs/SeaLLMs'><img src='https://img.shields.io/badge/Github-Code-success'></a>
-# <a href='https://huggingface.co/spaces/SeaLLMs/SeaLLM-Chat-13b'><img src='https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue'></a>
-# <a href='https://huggingface.co/SeaLLMs/SeaLLM-Chat-13b'><img src='https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Model-blue'></a>
-# </div>
-# <span style="font-size: larger">
-# This is <a href="https://huggingface.co/SeaLLMs/SeaLLM-Chat-13b" target="_blank">SeaLLM-13B-Chat</a> - a chatbot assistant optimized for Southeast Asian Languages. It produces helpful responses in English 🇬🇧, Vietnamese 🇻🇳, Indonesian 🇮🇩 and Thai 🇹🇭.
-# Explore <a href="https://huggingface.co/SeaLLMs/SeaLLM-Chat-13b" target="_blank">our article</a> for more details.
-# </span>
-# <br>
-# <span >
-# NOTE: The chatbot may produce inaccurate and harmful information about people, places, or facts.
-# <span style="color: red">By using our service, you are required to agree to our <a href="https://huggingface.co/SeaLLMs/SeaLLM-Chat-13b/blob/main/LICENSE" target="_blank" style="color: red">SeaLLM Terms Of Use</a>, which include:</span><br>
-# <ul>
-# <li >
-# You must not use our service to generate any harmful, unethical or illegal content that violates locally applicable and international laws or regulations,
-# including but not limited to hate speech, violence, pornography and deception.</li>
-# <li >
-# The service collects user dialogue data for testing and performance improvement, and reserves the right to distribute it under
-# <a href="https://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution (CC-BY)</a> or similar license. So do not enter any personal information!
-# </li>
-# </ul>
-# </span>
-# """.strip()
-
-# <a href="https://huggingface.co/SeaLLMs/SeaLLM-Chat-13b" target="_blank">SeaLLM-13B-Chat</a> - a helpful chatbot assistant for Southeast Asian Languages. It supports English 🇬🇧, Vietnamese 🇻🇳, Indonesian 🇮🇩, Thai 🇹🇭, Malay 🇲🇾, Khmer🇰🇭, Lao🇱🇦, Tagalog🇵🇭 and Burmese🇲🇲.
 
 
 MODEL_DESC = f"""
@@ -1047,11 +1019,28 @@ class CustomTabbedInterface(gr.Blocks):
 
 
 
-def vllm_abort(self: Any):
+# def vllm_abort(self: Any):
+#     sh = self.llm_engine.scheduler
+#     for g in (sh.waiting + sh.running + sh.swapped):
+#         sh.abort_seq_group(g.request_id)
+
+#     from vllm.sequence import SequenceStatus
+#     scheduler = self.llm_engine.scheduler
+#     for state_queue in [scheduler.waiting, scheduler.running, scheduler.swapped]:
+#         for seq_group in state_queue:
+#             # if seq_group.request_id == request_id:
+#             # Remove the sequence group from the state queue.
+#             state_queue.remove(seq_group)
+#             for seq in seq_group.seqs:
+#                 if seq.is_finished():
+#                     continue
+#                 scheduler.free_seq(seq, SequenceStatus.FINISHED_ABORTED)
+
+
+def vllm_abort(self):
     sh = self.llm_engine.scheduler
     for g in (sh.waiting + sh.running + sh.swapped):
         sh.abort_seq_group(g.request_id)
-
     from vllm.sequence import SequenceStatus
     scheduler = self.llm_engine.scheduler
     for state_queue in [scheduler.waiting, scheduler.running, scheduler.swapped]:
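
For context, a minimal sketch of how the vllm_abort above is presumably used: it is written as an unbound function taking self, so the app can bind it to the vLLM LLM instance and call it when a user presses Stop. The binding and the model id below are illustrative assumptions, not code from this commit; only the scheduler attributes it touches (waiting, running, swapped, abort_seq_group) come from the diff.

import types

from vllm import LLM

llm = LLM(model="SeaLLMs/SeaLLM-Chat-13b")          # illustrative model id
llm.vllm_abort = types.MethodType(vllm_abort, llm)  # bind the function above as a method

def on_stop_click():
    # Abort every waiting/running/swapped sequence group in the scheduler,
    # cancelling all in-flight generations for this engine.
    llm.vllm_abort()
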
@@ -1195,6 +1184,35 @@ def safety_check(text, history=None, ) -> Optional[str]:
     return None
 
 
+
+TURN_TEMPLATE = "<|im_start|>{role}\n{content}</s>"
+TURN_PREFIX = "<|im_start|>{role}\n"
+
+
+def chatml_chat_convo_format(conversations, add_assistant_prefix: bool, default_system=SYSTEM_PROMPT_1):
+    if conversations[0]['role'] != 'system':
+        conversations = [{"role": "system", "content": default_system}] + conversations
+    text = ''
+    for turn_id, turn in enumerate(conversations):
+        prompt = TURN_TEMPLATE.format(role=turn['role'], content=turn['content'])
+        text += prompt
+    if add_assistant_prefix:
+        prompt = TURN_PREFIX.format(role='assistant')
+        text += prompt
+    return text
+
+
+def chatml_format(message, history=None, system_prompt=None):
+    conversations = []
+    system_prompt = system_prompt or "You are a helpful assistant."
+    if history is not None and len(history) > 0:
+        for i, (prompt, res) in enumerate(history):
+            conversations.append({"role": "user", "content": prompt.strip()})
+            conversations.append({"role": "assistant", "content": res.strip()})
+    conversations.append({"role": "user", "content": message.strip()})
+    return chatml_chat_convo_format(conversations, True, default_system=system_prompt)
+
+
 def chat_response_stream_multiturn(
     message: str,
     history: List[Tuple[str, str]],
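
A usage sketch of the formatter added above (function names from the diff; the conversation content is invented). chatml_chat_convo_format prepends the system turn when missing and concatenates the turns without separators; chatml_format ends the string with an assistant prefix so generation continues as the assistant.

# One past exchange plus a new user message.
prompt = chatml_format("How are you?", history=[("Hello", "Hi there!")])

# Expected result (wrapped here for readability; the real string has no breaks between turns):
#   <|im_start|>system\nYou are a helpful assistant.</s>
#   <|im_start|>user\nHello</s>
#   <|im_start|>assistant\nHi there!</s>
#   <|im_start|>user\nHow are you?</s>
#   <|im_start|>assistant\n
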
@@ -1242,9 +1260,12 @@ def chat_response_stream_multiturn(
         return
 
     # history will be appended with message later on
-    full_prompt = llama_chat_multiturn_sys_input_seq_constructor(
-        message, history, sys_prompt=system_prompt
-    )
+
+    # full_prompt = llama_chat_multiturn_sys_input_seq_constructor(
+    #     message, history, sys_prompt=system_prompt
+    # )
+    full_prompt = chatml_format(message.strip(), history=history, system_prompt=system_prompt)
+    print(full_prompt)
 
     if len(tokenizer.encode(full_prompt, add_special_tokens=False)) >= 4050:
         raise gr.Error(f"Conversation or prompt is too long, please clear the chatbox or try shorter input.")
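
A sketch of the same length guard in isolation, assuming tokenizer is the Hugging Face tokenizer app.py loads for the served model; the model id and the standalone error type are illustrative, while the 4050-token threshold comes from the diff.

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("SeaLLMs/SeaLLM-Chat-13b")  # illustrative

full_prompt = chatml_format("How are you?", history=[("Hello", "Hi there!")])
# Count prompt tokens without adding BOS/EOS, leaving headroom for the reply.
if len(tokenizer.encode(full_prompt, add_special_tokens=False)) >= 4050:
    raise ValueError("Conversation or prompt is too long.")
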
@@ -1254,13 +1275,14 @@ def chat_response_stream_multiturn(
         max_tokens=max_tokens,
         frequency_penalty=frequency_penalty,
         presence_penalty=presence_penalty,
-        stop=['<s>', '</s>', '<<SYS>>', '<</SYS>>', '[INST]', '[/INST]']
+        # stop=['<s>', '</s>', '<<SYS>>', '<</SYS>>', '[INST]', '[/INST]'],
+        stop=['<s>', '</s>', '<|im_start|>', '<|im_end|>'],
     )
     cur_out = None
 
     for j, gen in enumerate(vllm_generate_stream(llm, full_prompt, sampling_params)):
         if cur_out is not None and (STREAM_YIELD_MULTIPLE < 1 or j % STREAM_YIELD_MULTIPLE == 0) and j > 0:
-            cur_out = cur_out.replace("\\n", "\n")
+            # cur_out = cur_out.replace("\\n", "\n")
 
             # optionally check safety, and respond
             if STREAM_CHECK_MULTIPLE > 0 and j % STREAM_CHECK_MULTIPLE == 0:
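
For reference, the full SamplingParams construction would now look roughly like this; temperature, top_p and max_tokens values are placeholders (the app passes its UI values), while the stop list is the one introduced above.

from vllm import SamplingParams

sampling_params = SamplingParams(
    temperature=0.7,          # placeholder values
    top_p=0.9,
    max_tokens=512,
    frequency_penalty=0.0,
    presence_penalty=0.0,
    stop=['<s>', '</s>', '<|im_start|>', '<|im_end|>'],  # ChatML-style stop markers
)
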
@@ -1569,7 +1591,7 @@ def batch_inference(
        max_tokens: int,
        frequency_penalty: float,
        presence_penalty: float,
-        stop_strings: str = "[STOP],<s>,</s>",
+        stop_strings: str = "[STOP],<s>,</s>,<|im_start|>",
        current_time: Optional[float] = None,
        system_prompt: Optional[str] = SYSTEM_PROMPT_1
 ):
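
The batch path takes its stop markers as one comma-separated string; app.py presumably splits it into a list before building SamplingParams. A hypothetical parser (the helper name is not from this commit):

def parse_stop_strings(stop_strings: str) -> list:
    # Split on commas and drop empty entries and stray whitespace.
    return [s for s in (part.strip() for part in stop_strings.split(",")) if s]

parse_stop_strings("[STOP],<s>,</s>,<|im_start|>")
# -> ['[STOP]', '<s>', '</s>', '<|im_start|>']
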
@@ -1603,11 +1625,11 @@
     remove_gradio_cache(exclude_names=['upload_chat.json', 'upload_few_shot.json'])
 
     if prompt_mode == 'chat':
-        prompt_format_fn =
+        prompt_format_fn = chatml_format
     elif prompt_mode == 'few-shot':
         from functools import partial
         prompt_format_fn = partial(
-
+            chatml_format, include_end_instruct=False
         )
     else:
         raise gr.Error(f'Wrong mode {prompt_mode}')
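
Downstream, prompt_format_fn is presumably applied to each uploaded record to build a prompt; a sketch of that call under the chat branch (the sample message is invented, SYSTEM_PROMPT_1 is defined elsewhere in app.py). Note that chatml_format as added in this commit takes no include_end_instruct parameter, so the few-shot partial(...) above would need that keyword added to the function before it can be called.

prompt_format_fn = chatml_format

# Chat mode: each record becomes a single-turn ChatML prompt.
example_prompt = prompt_format_fn(
    "Translate 'good morning' into Vietnamese.",
    history=None,
    system_prompt=SYSTEM_PROMPT_1,
)
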
@@ -1702,7 +1724,7 @@ def launch():
         f'\n| frequence_penalty={frequence_penalty} '
         f'\n| presence_penalty={presence_penalty} '
         f'\n| temperature={temperature} '
-        f'\n| hf_model_name={hf_model_name} '
+        # f'\n| hf_model_name={hf_model_name} '
         f'\n| model_path={model_path} '
         f'\n| DOWNLOAD_SNAPSHOT={DOWNLOAD_SNAPSHOT} '
         f'\n| gpu_memory_utilization={gpu_memory_utilization} '
@@ -1748,9 +1770,9 @@ def launch():
         print(f'Cannot print model worker: {e}')
 
     try:
-        llm.llm_engine.scheduler_config.max_model_len =
-        llm.llm_engine.scheduler_config.max_num_batched_tokens =
-        llm.llm_engine.tokenizer.add_special_tokens = False
+        llm.llm_engine.scheduler_config.max_model_len = 8192
+        llm.llm_engine.scheduler_config.max_num_batched_tokens = 8192
+        # llm.llm_engine.tokenizer.add_special_tokens = False
     except Exception as e:
         print(f'Cannot set parameters: {e}')
 
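
A sketch of where those overrides sit relative to engine construction, assuming the vLLM version used here exposes scheduler_config on llm.llm_engine as the diff does; the constructor arguments are illustrative, not from this commit.

from vllm import LLM

llm = LLM(model="SeaLLMs/SeaLLM-Chat-13b", gpu_memory_utilization=0.9)  # illustrative

try:
    # Raise the context and batching limits on the already-built engine.
    llm.llm_engine.scheduler_config.max_model_len = 8192
    llm.llm_engine.scheduler_config.max_num_batched_tokens = 8192
except Exception as e:
    print(f'Cannot set parameters: {e}')
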
@@ -1902,4 +1924,4 @@ def main():
 
 
 if __name__ == "__main__":
-    main()
+    main()