thinkingnew committed
Commit cd55902 · 1 Parent(s): 589af9a
Files changed (1)
  1. app.py +5 -32
app.py CHANGED
@@ -2,51 +2,24 @@ from fastapi import FastAPI
 from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
 from peft import PeftModel
 import torch
-import os
 
 app = FastAPI()
 
-# Define paths
+# Load Model from Hugging Face Hub
 base_model_path = "NousResearch/Hermes-3-Llama-3.2-3B"
 adapter_path = "thinkingnew/llama_invs_adapter"
 
-# Check if GPU is available
-device = "cuda" if torch.cuda.is_available() else "cpu"
-
-# Create offload folder if needed
-offload_dir = "./offload"
-os.makedirs(offload_dir, exist_ok=True)
-
-# Load base model with offloading support
 base_model = AutoModelForCausalLM.from_pretrained(
-    base_model_path,
-    torch_dtype=torch.float16 if device == "cuda" else torch.float32,
-    device_map="auto",
-    offload_folder=offload_dir if device == "cpu" else None  # Offload to disk if running on CPU
+    base_model_path, torch_dtype=torch.float16, device_map="auto"
 )
-
-# Load adapter
 model = PeftModel.from_pretrained(base_model, adapter_path)
-
-# Load tokenizer
 tokenizer = AutoTokenizer.from_pretrained(base_model_path)
 
-# Load pipeline once for better performance
-text_pipe = pipeline(
-    task="text-generation",
-    model=model,
-    tokenizer=tokenizer,
-    max_length=512,
-    device=0 if device == "cuda" else -1  # Use GPU index 0 if available, otherwise CPU
-)
-
-# Root endpoint for testing
 @app.get("/")
 async def root():
     return {"message": "Model is running! Use /generate/ for text generation."}
-
-# Text generation endpoint
 @app.post("/generate/")
 async def generate_text(prompt: str):
-    result = text_pipe(f"<s>[INST] {prompt} [/INST]")
-    return {"response": result[0]['generated_text']}
+    pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=512)
+    result = pipe(f"<s>[INST] {prompt} [/INST]")
+    return {"response": result[0]['generated_text']}