from flask import Flask, request, jsonify, send_file
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
import threading, time

app = Flask(__name__)
start_time = time.time()  # used by /status to report uptime

# GGUF build of the chat model, fetched from the Hugging Face Hub
REPO = "TheBloke/Qwen2.5-1.8B-Chat-GGUF"
FILE = "qwen2_5-1.8b-chat.Q4_K_M.gguf"

print("🔽 Downloading model...")
MODEL_PATH = hf_hub_download(REPO, FILE, local_dir=".", local_dir_use_symlinks=False)

print("🔄 Loading model...")
llm = Llama(model_path=MODEL_PATH, n_ctx=2048, n_threads=8)
# llama.cpp model handles are not safe for concurrent calls; serialize inference
llm_lock = threading.Lock()
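# Optional tweak (assumes a GPU-enabled build of llama-cpp-python is installed):
# pass n_gpu_layers=-1 to Llama(...) above to offload all layers to the GPU.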
@app.route("/", methods=["GET"])
def root():
return send_file("index.html")
@app.route("/chat", methods=["POST"])
def chat():
msg = request.json.get("message", "").strip()
if not msg:
return jsonify({"error": "Empty message"}), 400
prompt = f"<|user|>\n{msg}\n<|assistant|>"
out = llm(prompt, max_tokens=300, temperature=0.7, stop=["<|user|>", "<|assistant|>"])
return jsonify({"reply": out["choices"][0]["text"].strip()})
@app.route("/status")
def status():
return jsonify({
"uptime": round(time.time() - start_time),
"model": FILE
})
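
# Example: GET http://localhost:7860/status
# -> {"uptime": 128, "model": "qwen2_5-1.8b-chat.Q4_K_M.gguf"}  (illustrative values)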

if __name__ == "__main__":
    # Listen on all interfaces; 7860 is the conventional Hugging Face Spaces port
    app.run(host="0.0.0.0", port=7860)
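
# For anything beyond local testing, prefer a WSGI server over app.run, e.g.
# (assuming this file is saved as app.py):
#   gunicorn -w 1 -b 0.0.0.0:7860 app:app
# A single worker keeps exactly one copy of the model in memory.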