from flask import Flask, request, jsonify, send_file
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
import time

app = Flask(__name__)
start_time = time.time()

# NOTE: the original repo "TheBloke/Qwen2.5-1.8B-Chat-GGUF" does not exist
# (Qwen2.5 has no 1.8B variant); the official Qwen1.5 1.8B chat GGUF is
# assumed here instead. Verify the repo and filename before deploying.
REPO = "Qwen/Qwen1.5-1.8B-Chat-GGUF"
FILE = "qwen1_5-1_8b-chat-q4_k_m.gguf"

print("🔽 Downloading model...")
MODEL_PATH = hf_hub_download(REPO, FILE, local_dir=".", local_dir_use_symlinks=False)

print("🔄 Loading model...")
llm = Llama(model_path=MODEL_PATH, n_ctx=2048, n_threads=8)

@app.route("/", methods=["GET"])
def root():
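    """Serve the chat UI; index.html is assumed to sit next to this file."""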
    return send_file("index.html")

@app.route("/chat", methods=["POST"])
def chat():
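    """Generate one chat reply. Expects JSON like {"message": "..."}."""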
    data = request.get_json(silent=True) or {}
    msg = (data.get("message") or "").strip()
    if not msg:
        return jsonify({"error": "Empty message"}), 400
    prompt = f"<|user|>\n{msg}\n<|assistant|>"
    out = llm(prompt, max_tokens=300, temperature=0.7, stop=["<|user|>", "<|assistant|>"])
    return jsonify({"reply": out["choices"][0]["text"].strip()})

@app.route("/status")
def status():
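    """Report server uptime in seconds and the loaded model filename."""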
    return jsonify({
        "uptime": round(time.time() - start_time),
        "model": FILE
    })

if __name__ == "__main__":
    app.run(host="0.0.0.0", port=7860)
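
# Example request once the server is up (a sketch; port matches app.run above):
#   curl -X POST http://localhost:7860/chat \
#        -H "Content-Type: application/json" \
#        -d '{"message": "Hello!"}'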