Spaces:

Agents-MCP-Hackathon
/

PodcastGenerator

Runtime error

App Files Files Community

PodcastGenerator / modal_setup.py

imessam

DEV: first

ee980d6 5 months ago

raw

history blame contribute delete

2.3 kB

	import modal
	import os


	# Name under which the Modal app object below is created.
	app_name: str = "example-vllm-openai-compatible"

	app = modal.App(name=app_name)

	print("setting up container image ...")

	# Container image definition: Debian slim + Python 3.12, pinned inference
	# packages, then two environment layers applied in the same order as before.
	vllm_image = (
	    modal.Image.debian_slim(python_version="3.12")
	    .pip_install(
	        "vllm==0.7.2",
	        "huggingface_hub[hf_transfer]==0.26.2",
	        "flashinfer-python==0.2.0.post2",  # pinning, very unstable
	        extra_index_url="https://flashinfer.ai/whl/cu124/torch2.5",
	    )
	    .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})  # faster model transfers
	    .env({"VLLM_USE_V1": "1"})
	)

	print(" done setting up container image.")




	# Directory path reserved for model files. It must be a plain string: a
	# trailing comma here would silently turn the value into a 1-tuple.
	MODELS_DIR = "/llamas"
	# Hugging Face model identifier and the exact revision (commit hash) to serve;
	# both are passed to `vllm serve` further down in this file.
	MODEL_NAME = "neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w4a16"
	MODEL_REVISION = "a7c09948d9a632c2c840722f519672cd94af885d"


	print(" downloading model weights...")

	# Look up (creating if missing) the two named Modal volumes that are later
	# mounted as the Hugging Face and vLLM cache directories.
	# NOTE(review): nothing in this span downloads weights itself; the status
	# messages only bracket the volume lookups.
	hf_cache_vol, vllm_cache_vol = (
	    modal.Volume.from_name(volume_name, create_if_missing=True)
	    for volume_name in ("huggingface-cache", "vllm-cache")
	)

	print(" done downloading model weights.")



	print("building engine...")

	# Number of GPUs requested per replica.
	# tip: for best results, first upgrade to more powerful GPUs, and only then increase GPU count
	N_GPU: int = 1

	# Multiplier for expressing durations in minutes (value is in seconds).
	MINUTES: int = 60

	# Port the vLLM server listens on and Modal's web server forwards to.
	VLLM_PORT: int = 8000


	@app.function(
	    image=vllm_image,
	    secrets=[modal.Secret.from_name("api_key")],
	    gpu=f"H100:{N_GPU}",
	    scaledown_window=15 * MINUTES,  # how long should we stay up with no requests?
	    timeout=10 * MINUTES,  # how long should we wait for container start?
	    volumes={
	        "/root/.cache/huggingface": hf_cache_vol,
	        "/root/.cache/vllm": vllm_cache_vol,
	    },
	)
	@modal.concurrent(
	    max_inputs=100
	)  # how many requests can one replica handle? tune carefully!
	@modal.web_server(port=VLLM_PORT, startup_timeout=50 * MINUTES)
	def serve():
	    """Start a `vllm serve` process for MODEL_NAME on VLLM_PORT.

	    The process is launched with `subprocess.Popen` and not waited on, so
	    this function returns as soon as the child has been spawned.

	    Raises:
	        KeyError: if the MODAL_API_KEY environment variable is not set.
	    """
	    import subprocess

	    # presumably injected by the "api_key" Modal secret attached above --
	    # confirm that the secret actually defines MODAL_API_KEY.
	    api_key = os.environ["MODAL_API_KEY"]

	    cmd = [
	        "vllm",
	        "serve",
	        "--uvicorn-log-level=info",
	        MODEL_NAME,
	        "--revision",
	        MODEL_REVISION,
	        "--host",
	        "0.0.0.0",
	        "--port",
	        str(VLLM_PORT),
	        "--api-key",
	        api_key,
	        "--enable-auto-tool-choice",
	        "--tool-call-parser",
	        "llama3_json",
	    ]

	    # Pass the argv list directly instead of a joined shell string: the API
	    # key then reaches vllm verbatim, whatever characters it contains, and
	    # is never interpreted by a shell.
	    subprocess.Popen(cmd)


	# Emitted at module level, right after `serve` has been defined and decorated.
	print("done building engine.")