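"""Minimal Flask chat server: downloads a GGUF chat model from the Hugging Face
Hub and serves a small chat API with llama-cpp-python."""
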
from flask import Flask, request, jsonify, send_file
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
import time

app = Flask(__name__)
start_time = time.time()

# GGUF model to pull from the Hugging Face Hub (verify the repo/filename pair
# is still available before deploying).
REPO = "TheBloke/Qwen2.5-1.8B-Chat-GGUF"
FILE = "qwen2_5-1.8b-chat.Q4_K_M.gguf"

print("🔽 Downloading model...")
MODEL_PATH = hf_hub_download(REPO, FILE, local_dir=".", local_dir_use_symlinks=False)
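# hf_hub_download returns the local path to the downloaded file. Note that
# local_dir_use_symlinks is deprecated (a no-op) in recent huggingface_hub
# releases; it is kept here for compatibility with older versions.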

print("🔄 Loading model...")
# 2048-token context window; n_threads should roughly match available CPU cores.
llm = Llama(model_path=MODEL_PATH, n_ctx=2048, n_threads=8)


@app.route("/", methods=["GET"])
def root():
    # Serve the chat front-end (index.html expected in the working directory).
    return send_file("index.html")


@app.route("/chat", methods=["POST"])
def chat():
    # get_json(silent=True) returns None instead of raising on a bad/missing body.
    data = request.get_json(silent=True) or {}
    msg = data.get("message", "").strip()
    if not msg:
        return jsonify({"error": "Empty message"}), 400
    # Qwen chat models are trained on ChatML, so the prompt uses
    # <|im_start|>/<|im_end|> markers rather than bare <|user|>/<|assistant|> tags.
    prompt = f"<|im_start|>user\n{msg}<|im_end|>\n<|im_start|>assistant\n"
    out = llm(prompt, max_tokens=300, temperature=0.7, stop=["<|im_end|>"])
    return jsonify({"reply": out["choices"][0]["text"].strip()})


@app.route("/status")
def status():
    return jsonify({
        "uptime": round(time.time() - start_time),  # seconds since startup
        "model": FILE
    })


if __name__ == "__main__":
    app.run(host="0.0.0.0", port=7860)
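
# Example requests (server listening on port 7860 as configured above):
#   curl -X POST http://localhost:7860/chat \
#        -H "Content-Type: application/json" \
#        -d '{"message": "Hello!"}'
#   curl http://localhost:7860/status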