thinkingnew committed
Commit c6509f9 · Parent: cd55902

Files changed (2):
  1. .app.py.swp +0 -0
  2. app.py +45 -8
.app.py.swp ADDED
Binary file (4.1 kB)
 
app.py CHANGED
@@ -2,24 +2,61 @@ from fastapi import FastAPI
 from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
 from peft import PeftModel
 import torch
+import os
 
 app = FastAPI()
 
-# Load Model from Hugging Face Hub
+# Define paths
 base_model_path = "NousResearch/Hermes-3-Llama-3.2-3B"
 adapter_path = "thinkingnew/llama_invs_adapter"
 
-base_model = AutoModelForCausalLM.from_pretrained(
-    base_model_path, torch_dtype=torch.float16, device_map="auto"
-)
-model = PeftModel.from_pretrained(base_model, adapter_path)
+# Check if GPU is available
+device = "cuda" if torch.cuda.is_available() else "cpu"
+
+# Create offload directory if running on CPU
+offload_dir = "./offload"
+os.makedirs(offload_dir, exist_ok=True)
+
+# Load base model
+try:
+    base_model = AutoModelForCausalLM.from_pretrained(
+        base_model_path,
+        torch_dtype=torch.float16 if device == "cuda" else torch.float32,
+        device_map="auto",
+        offload_folder=offload_dir if device == "cpu" else None  # Offload to disk if running on CPU
+    )
+except Exception as e:
+    print(f"Error loading base model: {e}")
+    raise
+
+# Load adapter
+try:
+    model = PeftModel.from_pretrained(
+        base_model, adapter_path, offload_dir=offload_dir if device == "cpu" else None
+    )
+except Exception as e:
+    print(f"Error loading adapter: {e}")
+    raise
+
+# Load tokenizer
 tokenizer = AutoTokenizer.from_pretrained(base_model_path)
 
+# Load pipeline once for better performance
+text_pipe = pipeline(
+    task="text-generation",
+    model=model,
+    tokenizer=tokenizer,
+    max_length=512,
+    device=0 if device == "cuda" else -1
+)
+
+# Root endpoint for testing
 @app.get("/")
 async def root():
     return {"message": "Model is running! Use /generate/ for text generation."}
+
+# Text generation endpoint
 @app.post("/generate/")
 async def generate_text(prompt: str):
-    pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=512)
-    result = pipe(f"<s>[INST] {prompt} [/INST]")
-    return {"response": result[0]['generated_text']}
+    result = text_pipe(f"<s>[INST] {prompt} [/INST]")
+    return {"response": result[0]['generated_text']}