Update app.py

app.py CHANGED
@@ -36,16 +36,15 @@ class ModelManager:
             print("Tokenizer loading complete")
 
             print("Starting model loading...")
+            # Settings chosen to avoid initializing CUDA at load time
             self.model = AutoModelForCausalLM.from_pretrained(
                 MODEL_ID,
                 token=HF_TOKEN,
                 torch_dtype=torch.float16,
-                device_map=
+                device_map=None,  # do not set a device_map initially
                 trust_remote_code=True,
-                low_cpu_mem_usage=True,
-                max_memory={0: "13GB"}  # GPU memory limit
+                low_cpu_mem_usage=True
             )
-            self.model.eval()
             print("Model loading complete")
 
         except Exception as e:
@@ -55,11 +54,13 @@ class ModelManager:
     @spaces.GPU
     def generate_text(self, prompt, max_tokens, temperature, top_p):
         try:
+            # Set the device inside the GPU context
+            self.model = self.model.to("cuda")
             input_ids = self.tokenizer.encode(
                 prompt,
                 return_tensors="pt",
                 add_special_tokens=True
-            ).to(
+            ).to("cuda")
 
             with torch.no_grad():
                 output_ids = self.model.generate(
@@ -73,11 +74,15 @@ class ModelManager:
                     num_return_sequences=1
                 )
 
+            # Move the model back to the CPU
+            self.model = self.model.to("cpu")
             return self.tokenizer.decode(
                 output_ids[0][input_ids.shape[1]:],
                 skip_special_tokens=True
             )
         except Exception as e:
+            if self.model.device.type == "cuda":
+                self.model = self.model.to("cpu")
             raise Exception(f"Text generation failed: {e}")
 
     def generate_response(self, messages, max_tokens=4000, temperature=0.7, top_p=0.9):
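For context, this commit appears to target a ZeroGPU Space and follows the standard pattern for it: load the model on the CPU in the main process (so CUDA is never initialized outside a GPU context) and move it to `cuda` only inside a `@spaces.GPU`-decorated function. Below is a minimal, self-contained sketch of that pattern, not the Space's actual code: the MODEL_ID value is a placeholder, the generation arguments are simplified, and a try/finally stands in for the commit's except-based cleanup so the model always returns to the CPU.

import os
import torch
import spaces  # Hugging Face Spaces GPU helper package
from transformers import AutoModelForCausalLM, AutoTokenizer

# Placeholders -- the Space's real MODEL_ID and token handling are not shown in the diff.
MODEL_ID = "some-org/some-model"
HF_TOKEN = os.environ.get("HF_TOKEN")

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, token=HF_TOKEN)

# Load on the CPU: device_map=None means no device placement happens here,
# so the main process never touches CUDA.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    token=HF_TOKEN,
    torch_dtype=torch.float16,
    device_map=None,
    trust_remote_code=True,
    low_cpu_mem_usage=True,
)

@spaces.GPU
def generate_text(prompt: str, max_tokens: int = 256) -> str:
    global model
    try:
        # A GPU is attached only for the duration of this call.
        model = model.to("cuda")
        input_ids = tokenizer.encode(prompt, return_tensors="pt").to("cuda")
        with torch.no_grad():
            output_ids = model.generate(input_ids, max_new_tokens=max_tokens)
        # Strip the prompt tokens and decode only the newly generated ones.
        return tokenizer.decode(output_ids[0][input_ids.shape[1]:], skip_special_tokens=True)
    finally:
        # Return the weights to the CPU so no CUDA state outlives the call.
        model = model.to("cpu")

Moving the weights back to the CPU before the decorated function returns matters because the GPU is released once the call ends; doing the back-move in a finally block (or, as in the commit, also in the except handler) ensures it happens even when generation fails.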