thinkingnew committed
Commit 589af9a
1 Parent(s): 4ec308a
Files changed (1)
  1. app.py +15 -6
app.py CHANGED
@@ -2,6 +2,7 @@ from fastapi import FastAPI
 from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
 from peft import PeftModel
 import torch
+import os
 
 app = FastAPI()
 
@@ -12,23 +13,31 @@ adapter_path = "thinkingnew/llama_invs_adapter"
 # Check if GPU is available
 device = "cuda" if torch.cuda.is_available() else "cpu"
 
-# Load base model
+# Create offload folder if needed
+offload_dir = "./offload"
+os.makedirs(offload_dir, exist_ok=True)
+
+# Load base model with offloading support
 base_model = AutoModelForCausalLM.from_pretrained(
-    base_model_path, torch_dtype=torch.float16 if device == "cuda" else torch.float32, device_map="auto"
-).to(device)
+    base_model_path,
+    torch_dtype=torch.float16 if device == "cuda" else torch.float32,
+    device_map="auto",
+    offload_folder=offload_dir if device == "cpu" else None  # Offload to disk if running on CPU
+)
 
 # Load adapter
-model = PeftModel.from_pretrained(base_model, adapter_path).to(device)
+model = PeftModel.from_pretrained(base_model, adapter_path)
 
 # Load tokenizer
 tokenizer = AutoTokenizer.from_pretrained(base_model_path)
 
-# Load pipeline once (for better performance)
+# Load pipeline once for better performance
 text_pipe = pipeline(
     task="text-generation",
     model=model,
     tokenizer=tokenizer,
-    max_length=512
+    max_length=512,
+    device=0 if device == "cuda" else -1  # Use GPU index 0 if available, otherwise CPU
 )
 
 # Root endpoint for testing
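
The diff cuts off just before the endpoint definitions, so the routes themselves are not part of this commit view. As a rough sketch only, a continuation of app.py that uses the `text_pipe` built above could look like the following; the route paths, request schema, and field names here are assumptions for illustration, not taken from the repository.

# Hypothetical continuation of app.py (not shown in this commit's diff).
from pydantic import BaseModel


class GenerateRequest(BaseModel):
    prompt: str                 # assumed field name, not shown in the diff
    max_new_tokens: int = 128   # assumed default


@app.get("/")
def root():
    # Simple liveness check reporting which device the model runs on.
    return {"status": "ok", "device": device}


@app.post("/generate")
def generate(req: GenerateRequest):
    # Reuse the pipeline created once at startup instead of rebuilding it
    # per request, which is the point of defining text_pipe at module level.
    outputs = text_pipe(
        req.prompt,
        max_new_tokens=req.max_new_tokens,
        do_sample=True,
    )
    return {"generated_text": outputs[0]["generated_text"]}

Served with e.g. `uvicorn app:app`, such a layout would expose a liveness check at `/` and run the adapter-patched model through the single shared pipeline on each request.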