bai-granite

Build error

bai-granite / main.py

Pratham Bhat

Trace logs, and model loading occurs only once at the start

a6c4516 7 months ago

4.21 kB

	# from https://huggingface.co/spaces/iiced/mixtral-46.7b-fastapi/blob/main/main.py
	# example of use:
	# curl -X POST \
	# -H "Content-Type: application/json" \
	# -d '{
	# "prompt": "What is the capital of France?",
	# "history": [],
	# "system_prompt": "You are a very powerful AI assistant."
	# }' \
	# https://phk0-bai.hf.space/generate/

	from fastapi import FastAPI
	from pydantic import BaseModel
	from transformers import AutoModelForCausalLM, AutoTokenizer
	import sys
	import uvicorn
	import torch
	# torch.mps.empty_cache()
	# torch.set_num_threads(1)

	import os
	# os.environ["HF_HOME"] = "/.cache"
	# os.environ["TRANSFORMERS_CACHE"] = "/.cache"
	os.environ["TQDM_DISABLE"] = "0"
	os.environ["TQDM_FORCE"] = "1"

	from transformers.utils import logging
	logging.set_verbosity_info()
	logger = logging.get_logger() # optional: get a logger instance if you want to customize
	logger.info("Hugging Face Transformers download started.")


	app = FastAPI()

	class Item(BaseModel):
	prompt: str
	history: list
	system_prompt: str
	temperature: float = 0.0
	max_new_tokens: int = 900
	top_p: float = 0.15
	repetition_penalty: float = 1.0

	def format_prompt(system, message, history):
	prompt = [{"role": "system", "content": system}]
	for user_prompt, bot_response in history:
	prompt += {"role": "user", "content": user_prompt}
	prompt += {"role": "assistant", "content": bot_response}
	prompt += {"role": "user", "content": message}
	return prompt

	def setup():
	device = "cuda" if torch.cuda.is_available() else "cpu"

	# if torch.backends.mps.is_available():
	# device = torch.device("mps")
	# x = torch.ones(1, device=device)
	# print (x)
	# else:
	# device="cpu"
	# print ("MPS device not found.")

	# device = "auto"
	# device=torch.device("cpu")

	model_path = "ibm-granite/granite-34b-code-instruct-8k"
	print("Loading tokenizer for model: " + model_path, file=sys.stderr)
	tokenizer = AutoTokenizer.from_pretrained(model_path)

	print("Loading Model for causal LM for model: " + model_path, file=sys.stderr)
	# drop device_map if running on CPU
	model = AutoModelForCausalLM.from_pretrained(model_path, device_map=device)
	model.eval()

	return model, tokenizer, device

	def generate(item: Item, model, tokenizer, device):
	# device = "cuda" if torch.cuda.is_available() else "cpu"

	# model_path = "ibm-granite/granite-34b-code-instruct-8k"

	# print("Loading tokenizer for model: " + model_path, file=sys.stderr)
	# tokenizer = AutoTokenizer.from_pretrained(model_path, cache_dir="/code/huggingface/transformers")
	# # drop device_map if running on CPU

	# print("Loading Model for causal LM for model: " + model_path, file=sys.stderr)
	# model = AutoModelForCausalLM.from_pretrained(model_path, device_map=device)
	# model.eval()

	print("Adapting the input into a template...", file=sys.stderr)
	# change input text as desired
	chat = format_prompt(item.system_prompt, item.prompt, item.history)
	chat = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)

	print("Tokenizing text", file=sys.stderr)
	# tokenize the text
	input_tokens = tokenizer(chat, return_tensors="pt")

	print("Transferring tokens to device: " + device, file=sys.stderr)
	# transfer tokenized inputs to the device
	for i in input_tokens:
	input_tokens[i] = input_tokens[i].to(device)

	print("Generating output tokens", file=sys.stderr)
	# generate output tokens
	output = model.generate(**input_tokens, max_new_tokens=900)

	print("Decoding output tokens", file=sys.stderr)
	output_text = tokenizer.batch_decode(output, skip_special_tokens=True)[0]
	return output_text

	model, tokenizer, device = setup()

	# model, tokenizer, device = setup()

	@app.post("/generate/")
	async def generate_text(item: Item):
	# return {"response": generate(item)}
	return {"response": generate(item, model, tokenizer, device)}

	@app.get("/")
	async def generate_text_root(item: Item):
	return {"response": "try entry point: /generate/"}