Update app.py
app.py CHANGED
@@ -14,20 +14,8 @@ hf_hub_download(
     filename="fluentlylm-prinum-q4_k_m.gguf",
     local_dir="./models"
 )
-model = "fluentlylm-prinum-q4_k_m.gguf"
 
-
-
-llm = Llama(
-    model_path=f"models/{model}",
-    flash_attn=True,
-    n_gpu_layers=90,
-    n_batch=1536,
-    n_ctx=8192,
-)
-provider = LlamaCppPythonProvider(llm)
-
-@spaces.GPU(duration=120)
+@spaces.GPU(duration=110)
 def respond(
     message,
     history: list[tuple[str, str]],
@@ -39,6 +27,16 @@ def respond(
     top_k,
     repeat_penalty,
 ):
+    chat_template = MessagesFormatterType.GEMMA_2
+
+    llm = Llama(
+        model_path=f"models/{model}",
+        flash_attn=True,
+        n_gpu_layers=81,
+        n_batch=1024,
+        n_ctx=8192,
+    )
+    provider = LlamaCppPythonProvider(llm)
 
     agent = LlamaCppAgent(
         provider,
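These first two hunks are the substance of the commit: the Llama instance and its LlamaCppPythonProvider move from module scope into the @spaces.GPU-decorated respond handler, the GPU lease shrinks from 120 s to 110 s, and the load settings are trimmed (n_gpu_layers 90 to 81, n_batch 1536 to 1024). On ZeroGPU Spaces a GPU is attached only while a decorated function runs, so building the model at import time is the usual failure this layout avoids; the trade-off is a fresh model load on every request. One caveat: the diff also removes the module-level model = "fluentlylm-prinum-q4_k_m.gguf" assignment while the new body still interpolates f"models/{model}", so unless model is defined elsewhere in the file, respond would raise a NameError. A minimal sketch of the pattern, with MODEL_FILE and generate as illustrative names rather than app.py code:

```python
# Minimal sketch of the ZeroGPU lazy-load pattern this commit adopts.
# Weights are fetched once at import time; the CUDA-backed Llama object
# is built inside the @spaces.GPU function, where the GPU actually exists.
import spaces
from llama_cpp import Llama

MODEL_FILE = "fluentlylm-prinum-q4_k_m.gguf"  # must stay defined at module level

@spaces.GPU(duration=110)           # GPU is leased only for this call
def generate(prompt: str) -> str:   # illustrative helper, not app.py's API
    llm = Llama(
        model_path=f"models/{MODEL_FILE}",
        flash_attn=True,
        n_gpu_layers=81,  # fewer offloaded layers than the old value of 90
        n_batch=1024,
        n_ctx=8192,
    )
    out = llm(prompt, max_tokens=64)
    return out["choices"][0]["text"]
```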
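The remaining new line in this hunk, chat_template = MessagesFormatterType.GEMMA_2, selects the prompt format for the agent. The middle of respond sits outside the diff, but with llama-cpp-agent the formatter, provider, and sampling settings usually come together roughly as sketched below, which would also account for the accumulating yield outputs loop visible in the next hunk. stream_reply is an illustrative name and the stock get_chat_response API is assumed:

```python
# Sketch of llama-cpp-agent's usual streaming flow with the GEMMA_2
# formatter; the real body of respond() is not shown in this diff.
from llama_cpp_agent import LlamaCppAgent, MessagesFormatterType
from llama_cpp_agent.providers import LlamaCppPythonProvider

def stream_reply(provider: LlamaCppPythonProvider, message: str, system_message: str):
    agent = LlamaCppAgent(
        provider,
        system_prompt=system_message,
        predefined_messages_formatter_type=MessagesFormatterType.GEMMA_2,
    )
    settings = provider.get_provider_default_settings()
    settings.stream = True
    stream = agent.get_chat_response(
        message,
        llm_sampling_settings=settings,
        returns_streaming_generator=True,  # yields tokens as they arrive
        print_output=False,
    )
    outputs = ""
    for token in stream:
        outputs += token
        yield outputs  # the UI re-renders the growing reply on each yield
```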
@@ -82,13 +80,13 @@ def respond(
         outputs += output
         yield outputs
 
-def create_interface(description):
+def create_interface(model_name, description):
     return gr.ChatInterface(
         respond,
         additional_inputs=[
-            gr.Textbox(value=…
-            gr.Textbox(value="You are Fluently, a helpful assistant. You should think step-by-step. You should answer all question by prioritizing the principle of openness…
-            gr.Slider(minimum=1, maximum=4096, value=…
+            gr.Textbox(value=model_name, label="Model", interactive=False),
+            gr.Textbox(value="You are Fluently, a helpful assistant. You should think step-by-step. You should answer all question by prioritizing the principle of openness", label="System message"),
+            gr.Slider(minimum=1, maximum=4096, value=2048, step=1, label="Max tokens"),
             gr.Slider(minimum=0.1, maximum=4.0, value=0.6, step=0.1, label="Temperature"),
             gr.Slider(
                 minimum=0.1,
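The third hunk threads a model_name parameter through create_interface and pins it in a non-interactive Textbox, alongside a completed system prompt and a Max tokens slider with a concrete default of 2048. gr.ChatInterface hands each entry of additional_inputs to the handler as an extra positional argument after (message, history); that is how these widget values reach respond's parameter list. A self-contained toy, with echo as an illustrative handler:

```python
# Toy demonstration of how gr.ChatInterface wires additional_inputs:
# each widget's value arrives as a positional argument after
# (message, history). echo() is illustrative, not app.py code.
import gradio as gr

def echo(message, history, model_name, system_message):
    return f"[{model_name}] {system_message} -> {message}"

demo = gr.ChatInterface(
    echo,
    additional_inputs=[
        gr.Textbox(value="fluentlylm-prinum-q4_k_m.gguf", label="Model", interactive=False),
        gr.Textbox(value="You are Fluently, a helpful assistant.", label="System message"),
    ],
)

if __name__ == "__main__":
    demo.launch()
```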
@@ -112,19 +110,19 @@ def create_interface(description):
                 label="Repetition penalty",
             ),
         ],
-        title="",
+        title=f"**FluentlyLM Prinum** ```on ZeroGPU```",
         description=description,
         chatbot=gr.Chatbot(
-            label=…
+            label=None,
             scale=1,
             show_copy_button=True
         )
     )
 
-description = """# …
-interface = create_interface(description)
+description = """# **FluentlyLM Prinum ```on ZeroGPU```"""
+interface = create_interface('fluentlylm-prinum-q4_k_m.gguf', description)
 
-demo = gr.Blocks(…
+demo = gr.Blocks()
 
 with demo:
     interface.render()
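The final hunk fills in the page chrome: a real title and label=None replace the old placeholders, create_interface now receives the model filename, and the truncated gr.Blocks( call becomes a bare gr.Blocks(). Two small quirks ship with it: the title's f-string contains no placeholder, and the description string opens ** without closing it. Rendering a prebuilt ChatInterface inside a Blocks page, as the closing lines do, looks like this in isolation (the lambda handler is a stand-in):

```python
# The wrapper pattern at the end of app.py: build the interface once,
# then render it inside a Blocks page (leaves room for extra layout).
import gradio as gr

interface = gr.ChatInterface(lambda message, history: message)  # stand-in handler

demo = gr.Blocks()
with demo:
    interface.render()

if __name__ == "__main__":
    demo.launch()
```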