from flask import Flask, request, jsonify, send_file
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
import time

app = Flask(__name__)
start_time = time.time()

REPO = "TheBloke/Qwen2.5-1.8B-Chat-GGUF"
FILE = "qwen2_5-1.8b-chat.Q4_K_M.gguf"

# Download the GGUF weights once at startup, then load them into llama.cpp.
print("🔽 Downloading model...")
MODEL_PATH = hf_hub_download(REPO, FILE, local_dir=".", local_dir_use_symlinks=False)

print("🔄 Loading model...")
llm = Llama(model_path=MODEL_PATH, n_ctx=2048, n_threads=8)


@app.route("/", methods=["GET"])
def root():
    return send_file("index.html")


@app.route("/chat", methods=["POST"])
def chat():
    # Tolerate a missing or non-JSON body instead of raising a 500.
    data = request.get_json(silent=True) or {}
    msg = data.get("message", "").strip()
    if not msg:
        return jsonify({"error": "Empty message"}), 400
    # Simple user/assistant prompt template; the stop strings cut generation
    # off before the model starts writing the next turn.
    prompt = f"<|user|>\n{msg}\n<|assistant|>"
    out = llm(prompt, max_tokens=300, temperature=0.7,
              stop=["<|user|>", "<|assistant|>"])
    return jsonify({"reply": out["choices"][0]["text"].strip()})


@app.route("/status")
def status():
    return jsonify({
        "uptime": round(time.time() - start_time),
        "model": FILE,
    })


if __name__ == "__main__":
    app.run(host="0.0.0.0", port=7860)
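
# Example usage (a sketch, assuming the server is running locally on the
# port configured above):
#
#   curl http://localhost:7860/status
#   curl -X POST http://localhost:7860/chat \
#        -H "Content-Type: application/json" \
#        -d '{"message": "Hello!"}'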