Update app.py
app.py CHANGED
@@ -22,7 +22,7 @@ ROLE_TOKENS = {
 }

 CONTEXT_SIZE = 2000
-ENABLE_GPU =
+ENABLE_GPU = False
 GPU_LAYERS = 70

 # Create a lock object
@@ -39,8 +39,8 @@ app.logger.setLevel(logging.DEBUG) # Set the desired logging level
 #repo_name = "IlyaGusev/saiga2_13b_gguf"
 #model_name = "model-q4_K.gguf"

-repo_name = "IlyaGusev/
-model_name = "
+repo_name = "IlyaGusev/saiga2_13b_gguf"
+model_name = "model-q8_0.gguf"

 #repo_name = "IlyaGusev/saiga2_7b_gguf"
 #model_name = "model-q4_K.gguf"
@@ -98,8 +98,8 @@ def init_model(context_size, enable_gpu=False, gpu_layer_number=35):
             logits_all=True,
             #n_threads=12,
             verbose=True,
-            n_gpu_layers=gpu_layer_number
-            n_gqa=8 #must be set for 70b models
+            n_gpu_layers=gpu_layer_number#,
+            #n_gqa=8 #must be set for 70b models
         )
         return model
     else:
@@ -110,8 +110,8 @@ def init_model(context_size, enable_gpu=False, gpu_layer_number=35):
             #n_batch=100,
             logits_all=True,
             #n_threads=12,
-            verbose=True
-            n_gqa=8 #must be set for 70b models
+            verbose=True#,
+            #n_gqa=8 #must be set for 70b models
         )
         return model

@@ -236,7 +236,7 @@ def generate_and_log_tokens(user_request, model, generator):
     global response_tokens
     for token in generate_tokens(model, generator):
         if token == b'': # or (max_new_tokens is not None and i >= max_new_tokens):
-            log(user_request, response_tokens.decode("utf-8", errors="ignore"))
+            #log(user_request, response_tokens.decode("utf-8", errors="ignore"))
             response_tokens = bytearray()
             break
         response_tokens.extend(token)
@@ -271,6 +271,8 @@ def generate_response():
     for message in messages:
         if message.get("from") == "assistant":
             message_tokens = get_message_tokens(model=model, role="bot", content=message.get("content", ""))
+        elif message.get("from") == "system":
+            message_tokens = get_message_tokens(model=model, role="system", content=message.get("content", ""))
         else:
             message_tokens = get_message_tokens(model=model, role="user", content=message.get("content", ""))

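Net effect of the two init_model hunks: the n_gqa=8 argument (whose own comment says it must be set for 70b models) is no longer passed, and the comma on the preceding argument is commented out along with it, while n_gpu_layers stays in the GPU branch. Together with ENABLE_GPU = False and the switch to repo_name = "IlyaGusev/saiga2_13b_gguf" / model_name = "model-q8_0.gguf", the Space now loads the 13B checkpoint on CPU. A minimal sketch of how the function plausibly reads after the change, assuming the app builds on llama-cpp-python's Llama class (which the n_gpu_layers, logits_all and n_batch keywords suggest) and fetches the GGUF file with huggingface_hub.hf_hub_download; neither the imports nor the model-path handling are visible in the hunks:

# Sketch only: the Llama keyword arguments mirror the hunks above, but the
# download step and the surrounding module layout are assumptions.
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

CONTEXT_SIZE = 2000
ENABLE_GPU = False
GPU_LAYERS = 70

repo_name = "IlyaGusev/saiga2_13b_gguf"
model_name = "model-q8_0.gguf"

def init_model(context_size, enable_gpu=False, gpu_layer_number=35):
    model_path = hf_hub_download(repo_id=repo_name, filename=model_name)
    if enable_gpu:
        return Llama(
            model_path=model_path,
            n_ctx=context_size,
            logits_all=True,
            verbose=True,
            n_gpu_layers=gpu_layer_number
            # n_gqa=8 no longer passed (its comment noted it "must be set for 70b models")
        )
    return Llama(
        model_path=model_path,
        n_ctx=context_size,
        logits_all=True,
        verbose=True
        # n_gqa=8 dropped here as well
    )

model = init_model(CONTEXT_SIZE, enable_gpu=ENABLE_GPU, gpu_layer_number=GPU_LAYERS)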
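The generate_and_log_tokens hunk only comments out the log(...) call, so the completed response is no longer persisted, but the surrounding buffer handling is easy to miss in isolation: tokens arrive as bytes, are accumulated in a module-level bytearray, and the buffer is reset when the empty-bytes sentinel marks the end of generation. A small self-contained sketch of that flow, with a fake token stream standing in for generate_tokens and an assumed yield back to the caller (the diff does not show what the function does with each token):

# Sketch only: generate_tokens and the trailing yield are assumptions; the
# sentinel check, the global bytearray and the commented-out log call come
# from the hunk above.
response_tokens = bytearray()

def generate_tokens(model, generator):
    # stand-in for the real llama.cpp token stream
    yield from (b"Hello", b", ", b"world", b"")

def generate_and_log_tokens(user_request, model, generator):
    global response_tokens
    for token in generate_tokens(model, generator):
        if token == b'':  # end-of-generation sentinel
            # log(user_request, response_tokens.decode("utf-8", errors="ignore"))
            response_tokens = bytearray()
            break
        response_tokens.extend(token)
        yield token  # assumed: stream each chunk back to the HTTP response

print(b"".join(generate_and_log_tokens("hi", None, None)))  # b'Hello, world'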
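The last hunk adds a dedicated branch for system messages, so a request can now carry its own system prompt instead of every non-assistant message being tokenized as a user turn. A minimal, self-contained sketch of the routing; the role names ("assistant" mapped to "bot", the new "system" branch, "user" as the default) come from the diff, while the ROLE_TOKENS placeholder values, the stub tokenizer and the example payload are illustrative:

# Sketch only: placeholder token ids and a stub tokenizer; the branching logic
# mirrors the hunk above.
ROLE_TOKENS = {"user": [1], "bot": [2], "system": [3]}  # placeholder ids

def get_message_tokens(model, role, content):
    # stand-in for the app's real prompt tokenizer
    return ROLE_TOKENS[role] + [ord(ch) for ch in content]

model = None  # the Llama instance in the real app

messages = [
    {"from": "system", "content": "You are a helpful assistant."},
    {"from": "user", "content": "Hi!"},
    {"from": "assistant", "content": "Hello!"},
]

tokens = []
for message in messages:
    if message.get("from") == "assistant":
        message_tokens = get_message_tokens(model=model, role="bot", content=message.get("content", ""))
    elif message.get("from") == "system":
        message_tokens = get_message_tokens(model=model, role="system", content=message.get("content", ""))
    else:
        message_tokens = get_message_tokens(model=model, role="user", content=message.get("content", ""))
    tokens.extend(message_tokens)

print(tokens)  # system, user and assistant turns flattened into one prompt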