amis5895 committed
Commit 6dfd72e · 1 Parent(s): dbc5a3f

Fix AutoTrain command arguments - use correct format

Files changed (2):
  1. app.py +34 -28
  2. app_corrected_autotrain.py +360 -0
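
In short: the AutoTrain CLI registers these options with hyphens (`--use-peft`, `--lora-r`, ...), while the old code passed underscore spellings (`--use_peft`, `--lora_r`, ...), which an argparse-style parser rejects as unrecognized arguments. A minimal sketch of the failure mode (my illustration, not from this repo, using plain argparse as a stand-in for the AutoTrain parser):

import argparse

parser = argparse.ArgumentParser(prog="demo")
parser.add_argument("--use-peft", action="store_true")  # the hyphenated spelling is what gets registered
parser.add_argument("--lora-r", type=int, default=16)

args = parser.parse_args(["--use-peft", "--lora-r", "32"])
print(args.use_peft, args.lora_r)  # -> True 32 (attribute names use underscores)

# parser.parse_args(["--use_peft"]) would exit with:
# demo: error: unrecognized arguments: --use_peft

Note that only the option string on the command line must match the registered, hyphenated spelling; the attribute names on the parsed namespace still use underscores.
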
app.py CHANGED
@@ -1,6 +1,6 @@
 #!/usr/bin/env python3
 """
-EXAONE Fine-tuning Space FastAPI application with the permission issue fixed
+EXAONE Fine-tuning Space FastAPI application using the correct AutoTrain commands
 """
 
 import os
@@ -34,7 +34,7 @@ training_status = {
     "total_epochs": 3,
     "loss": 0.0,
     "status": "idle",
-    "log_file": "/tmp/training.log"  # use /tmp to work around the permission issue
+    "log_file": "/tmp/training.log"
 }
 
 class TrainingRequest(BaseModel):
@@ -65,7 +65,7 @@ async def start_training(request: TrainingRequest, background_tasks: BackgroundT
     })
 
     # Start training in the background
-    background_tasks.add_task(run_real_training, request)
+    background_tasks.add_task(run_corrected_training, request)
 
     return {
         "message": "Training started",
@@ -73,12 +73,12 @@ async def start_training(request: TrainingRequest, background_tasks: BackgroundT
         "model_name": request.model_name
     }
 
-async def run_real_training(request: TrainingRequest):
-    """Run training with the real AutoTrain"""
+async def run_corrected_training(request: TrainingRequest):
+    """Run training with the corrected AutoTrain command"""
     global training_status
 
     try:
-        logger.info("Starting real AutoTrain training process...")
+        logger.info("Starting corrected AutoTrain training process...")
         training_status["status"] = "running"
 
         # Check the data files
@@ -113,18 +113,23 @@ async def run_real_training(request: TrainingRequest):
            })
            return
 
-        logger.info("All files found, starting real AutoTrain training...")
+        logger.info("All files found, starting corrected AutoTrain training...")
 
-        # Initialize the log file (use /tmp)
+        # Initialize the log file
         log_file = Path(training_status["log_file"])
         try:
-            log_file.write_text("Starting AutoTrain training...\n", encoding="utf-8")
+            log_file.write_text("Starting corrected AutoTrain training...\n", encoding="utf-8")
         except Exception as e:
             logger.warning(f"Could not write to log file: {e}")
-            # Fall back to in-memory logs if the log file is not writable
-            training_status["log_content"] = "Starting AutoTrain training...\n"
+            training_status["log_content"] = "Starting corrected AutoTrain training...\n"
 
-        # Run the AutoTrain command
+        # Set environment variables
+        env = os.environ.copy()
+        env["TRANSFORMERS_CACHE"] = "/tmp/huggingface_cache"
+        env["HF_HOME"] = "/tmp/huggingface"
+        env["OMP_NUM_THREADS"] = "1"
+
+        # Corrected AutoTrain command (using the correct flag format)
         cmd = [
             "autotrain", "llm",
             "--train",
@@ -132,34 +137,34 @@
            "--model", "LGAI-EXAONE/EXAONE-4.0-1.2B",
            "--data_path", "/app",
            "--text_column", "text",
-           "--use_peft",
+           "--use-peft",  # --use-peft instead of --use_peft
            "--quantization", "int4",
-           "--lora_r", "16",
-           "--lora_alpha", "32",
-           "--lora_dropout", "0.05",
-           "--target_modules", "all-linear",
+           "--lora-r", "16",  # --lora-r instead of --lora_r
+           "--lora-alpha", "32",  # --lora-alpha instead of --lora_alpha
+           "--lora-dropout", "0.05",  # --lora-dropout instead of --lora_dropout
+           "--target-modules", "all-linear",  # --target-modules instead of --target_modules
            "--epochs", "3",
-           "--batch_size", "4",
-           "--gradient_accumulation", "4",
-           "--learning_rate", "2e-4",
-           "--warmup_ratio", "0.03",
-           "--mixed_precision", "fp16",
-           "--push_to_hub",
-           "--hub_model_id", request.model_name,
+           "--batch-size", "4",  # --batch-size instead of --batch_size
+           "--gradient-accumulation", "4",  # --gradient-accumulation instead of --gradient_accumulation
+           "--learning-rate", "2e-4",  # --learning-rate instead of --learning_rate
+           "--warmup-ratio", "0.03",  # --warmup-ratio instead of --warmup_ratio
+           "--mixed-precision", "fp16",  # --mixed-precision instead of --mixed_precision
+           "--push-to-hub",  # --push-to-hub instead of --push_to_hub
+           "--hub-model-id", request.model_name,  # --hub-model-id instead of --hub_model_id
            "--username", "amis5895"
        ]
 
-        logger.info(f"Running command: {' '.join(cmd)}")
+        logger.info(f"Running corrected command: {' '.join(cmd)}")
 
        # Write the command to the log file
        try:
            with open(log_file, "a", encoding="utf-8") as f:
-               f.write(f"Command: {' '.join(cmd)}\n")
+               f.write(f"Corrected Command: {' '.join(cmd)}\n")
                f.write("=" * 50 + "\n")
        except:
            if "log_content" not in training_status:
                training_status["log_content"] = ""
-           training_status["log_content"] += f"Command: {' '.join(cmd)}\n" + "=" * 50 + "\n"
+           training_status["log_content"] += f"Corrected Command: {' '.join(cmd)}\n" + "=" * 50 + "\n"
 
        # Run the AutoTrain process
        process = subprocess.Popen(
@@ -169,7 +174,8 @@
            text=True,
            bufsize=1,
            universal_newlines=True,
-           cwd="/app"
+           cwd="/app",
+           env=env
        )
 
        # Monitor training progress
app_corrected_autotrain.py ADDED
@@ -0,0 +1,360 @@
+#!/usr/bin/env python3
+"""
+EXAONE Fine-tuning Space FastAPI application using the correct AutoTrain commands
+"""
+
+import os
+import json
+import subprocess
+import asyncio
+from pathlib import Path
+from typing import Dict, Any
+import logging
+
+from fastapi import FastAPI, HTTPException, BackgroundTasks
+from fastapi.responses import StreamingResponse
+from pydantic import BaseModel
+import uvicorn
+
+# Logging setup
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+app = FastAPI(
+    title="EXAONE Fine-tuning",
+    description="EXAONE 4.0 1.2B model fine-tuning API",
+    version="1.0.0"
+)
+
+# Global variables
+training_status = {
+    "is_running": False,
+    "progress": 0,
+    "current_epoch": 0,
+    "total_epochs": 3,
+    "loss": 0.0,
+    "status": "idle",
+    "log_file": "/tmp/training.log"
+}
+
+class TrainingRequest(BaseModel):
+    model_name: str = "amis5895/exaone-1p2b-nutrition-kdri"
+
+@app.get("/")
+async def root():
+    """Root endpoint"""
+    return {
+        "message": "EXAONE Fine-tuning API",
+        "status": "running",
+        "version": "1.0.0"
+    }
+
+@app.post("/start_training")
+async def start_training(request: TrainingRequest, background_tasks: BackgroundTasks):
+    """Start training"""
+    global training_status
+
+    if training_status["is_running"]:
+        raise HTTPException(status_code=400, detail="Training is already running")
+
+    training_status.update({
+        "is_running": True,
+        "progress": 0,
+        "current_epoch": 0,
+        "status": "starting"
+    })
+
+    # Start training in the background
+    background_tasks.add_task(run_corrected_training, request)
+
+    return {
+        "message": "Training started",
+        "status": "starting",
+        "model_name": request.model_name
+    }
+
+async def run_corrected_training(request: TrainingRequest):
+    """Run training with the corrected AutoTrain command"""
+    global training_status
+
+    try:
+        logger.info("Starting corrected AutoTrain training process...")
+        training_status["status"] = "running"
+
+        # Check the data files
+        train_file = Path("/app/train.csv")
+        val_file = Path("/app/validation.csv")
+        config_file = Path("/app/autotrain_ultra_low_final.yaml")
+
+        if not train_file.exists():
+            logger.error(f"Training file not found: {train_file}")
+            training_status.update({
+                "is_running": False,
+                "status": "failed",
+                "error": "Training file not found"
+            })
+            return
+
+        if not val_file.exists():
+            logger.error(f"Validation file not found: {val_file}")
+            training_status.update({
+                "is_running": False,
+                "status": "failed",
+                "error": "Validation file not found"
+            })
+            return
+
+        if not config_file.exists():
+            logger.error(f"Config file not found: {config_file}")
+            training_status.update({
+                "is_running": False,
+                "status": "failed",
+                "error": "Config file not found"
+            })
+            return
+
+        logger.info("All files found, starting corrected AutoTrain training...")
+
+        # Initialize the log file
+        log_file = Path(training_status["log_file"])
+        try:
+            log_file.write_text("Starting corrected AutoTrain training...\n", encoding="utf-8")
+        except Exception as e:
+            logger.warning(f"Could not write to log file: {e}")
+            training_status["log_content"] = "Starting corrected AutoTrain training...\n"
+
+        # Set environment variables
+        env = os.environ.copy()
+        env["TRANSFORMERS_CACHE"] = "/tmp/huggingface_cache"
+        env["HF_HOME"] = "/tmp/huggingface"
+        env["OMP_NUM_THREADS"] = "1"
+
+        # Corrected AutoTrain command (using the correct flag format)
+        cmd = [
+            "autotrain", "llm",
+            "--train",
+            "--project_name", "exaone-finetuning",
+            "--model", "LGAI-EXAONE/EXAONE-4.0-1.2B",
+            "--data_path", "/app",
+            "--text_column", "text",
+            "--use-peft",  # --use-peft instead of --use_peft
+            "--quantization", "int4",
+            "--lora-r", "16",  # --lora-r instead of --lora_r
+            "--lora-alpha", "32",  # --lora-alpha instead of --lora_alpha
+            "--lora-dropout", "0.05",  # --lora-dropout instead of --lora_dropout
+            "--target-modules", "all-linear",  # --target-modules instead of --target_modules
+            "--epochs", "3",
+            "--batch-size", "4",  # --batch-size instead of --batch_size
+            "--gradient-accumulation", "4",  # --gradient-accumulation instead of --gradient_accumulation
+            "--learning-rate", "2e-4",  # --learning-rate instead of --learning_rate
+            "--warmup-ratio", "0.03",  # --warmup-ratio instead of --warmup_ratio
+            "--mixed-precision", "fp16",  # --mixed-precision instead of --mixed_precision
+            "--push-to-hub",  # --push-to-hub instead of --push_to_hub
+            "--hub-model-id", request.model_name,  # --hub-model-id instead of --hub_model_id
+            "--username", "amis5895"
+        ]
+
+        logger.info(f"Running corrected command: {' '.join(cmd)}")
+
+        # Write the command to the log file
+        try:
+            with open(log_file, "a", encoding="utf-8") as f:
+                f.write(f"Corrected Command: {' '.join(cmd)}\n")
+                f.write("=" * 50 + "\n")
+        except:
+            if "log_content" not in training_status:
+                training_status["log_content"] = ""
+            training_status["log_content"] += f"Corrected Command: {' '.join(cmd)}\n" + "=" * 50 + "\n"
+
+        # Run the AutoTrain process
+        process = subprocess.Popen(
+            cmd,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.STDOUT,
+            text=True,
+            bufsize=1,
+            universal_newlines=True,
+            cwd="/app",
+            env=env
+        )
+
+        # Monitor training progress
+        for line in process.stdout:
+            logger.info(line.strip())
+
+            # Append to the log file
+            try:
+                with open(log_file, "a", encoding="utf-8") as f:
+                    f.write(line)
+            except:
+                if "log_content" not in training_status:
+                    training_status["log_content"] = ""
+                training_status["log_content"] += line
+
+            # Parse the progress
+            if "epoch" in line.lower() and "/" in line:
+                try:
+                    # Extract the progress from lines like "Epoch 1/3"
+                    parts = line.split()
+                    for i, part in enumerate(parts):
+                        if part.lower() == "epoch" and i + 1 < len(parts):
+                            epoch_info = parts[i + 1]
+                            if "/" in epoch_info:
+                                current, total = epoch_info.split("/")
+                                training_status["current_epoch"] = int(current)
+                                training_status["total_epochs"] = int(total)
+                                training_status["progress"] = (int(current) / int(total)) * 100
+                            break
+                except:
+                    pass
+
+            # Parse the loss value
+            if "loss" in line.lower():
+                try:
+                    parts = line.split()
+                    for i, part in enumerate(parts):
+                        if part.lower() == "loss" and i + 1 < len(parts):
+                            loss_value = float(parts[i + 1])
+                            training_status["loss"] = loss_value
+                            break
+                except:
+                    pass
+
+        process.wait()
+
+        if process.returncode == 0:
+            training_status.update({
+                "is_running": False,
+                "progress": 100,
+                "status": "completed"
+            })
+            logger.info("Training completed successfully!")
+
+            # Write the completion log
+            try:
+                with open(log_file, "a", encoding="utf-8") as f:
+                    f.write("\n" + "=" * 50 + "\n")
+                    f.write("Training completed successfully!\n")
+            except:
+                if "log_content" not in training_status:
+                    training_status["log_content"] = ""
+                training_status["log_content"] += "\n" + "=" * 50 + "\nTraining completed successfully!\n"
+        else:
+            training_status.update({
+                "is_running": False,
+                "status": "failed"
+            })
+            logger.error("Training failed!")
+
+            # Write the failure log
+            try:
+                with open(log_file, "a", encoding="utf-8") as f:
+                    f.write("\n" + "=" * 50 + "\n")
+                    f.write(f"Training failed with return code: {process.returncode}\n")
+            except:
+                if "log_content" not in training_status:
+                    training_status["log_content"] = ""
+                training_status["log_content"] += "\n" + "=" * 50 + f"\nTraining failed with return code: {process.returncode}\n"
+
+    except Exception as e:
+        logger.error(f"Training error: {str(e)}")
+        training_status.update({
+            "is_running": False,
+            "status": "error",
+            "error": str(e)
+        })
+
+        # Write the error log
+        try:
+            with open(log_file, "a", encoding="utf-8") as f:
+                f.write(f"\nError: {str(e)}\n")
+        except:
+            if "log_content" not in training_status:
+                training_status["log_content"] = ""
+            training_status["log_content"] += f"\nError: {str(e)}\n"
+
+@app.get("/status")
+async def get_status():
+    """Get the training status"""
+    return training_status
+
+@app.get("/logs")
+async def get_logs():
+    """Get the logs"""
+    log_file = Path(training_status["log_file"])
+    if log_file.exists():
+        try:
+            with open(log_file, "r", encoding="utf-8") as f:
+                logs = f.read()
+            return {"logs": logs}
+        except:
+            pass
+
+    # Fall back to the in-memory logs if the file cannot be read
+    if "log_content" in training_status:
+        return {"logs": training_status["log_content"]}
+    else:
+        return {"logs": "No logs available"}
+
+@app.get("/logs/stream")
+async def stream_logs():
+    """Stream the logs in real time"""
+    def generate_logs():
+        log_file = Path(training_status["log_file"])
+        if log_file.exists():
+            try:
+                with open(log_file, "r", encoding="utf-8") as f:
+                    for line in f:
+                        yield f"data: {line}\n\n"  # real newlines: a blank line terminates each SSE event
+            except:
+                pass
+
+        # Fall back to the in-memory logs if the file cannot be read
+        if "log_content" in training_status:
+            for line in training_status["log_content"].split('\n'):
+                yield f"data: {line}\n\n"
+        else:
+            yield "data: No logs available\n\n"
+
+    return StreamingResponse(generate_logs(), media_type="text/plain")
+
+@app.post("/stop_training")
+async def stop_training():
+    """Stop training"""
+    global training_status
+
+    if not training_status["is_running"]:
+        raise HTTPException(status_code=400, detail="No training is running")
+
+    training_status.update({
+        "is_running": False,
+        "status": "stopped"
+    })
+
+    return {"message": "Training stopped"}
+
+@app.get("/health")
+async def health_check():
+    """Health check"""
+    return {"status": "healthy", "timestamp": "2024-01-01T00:00:00Z"}  # static placeholder timestamp
+
+@app.get("/data_info")
+async def get_data_info():
+    """Get data info"""
+    train_file = Path("/app/train.csv")
+    val_file = Path("/app/validation.csv")
+    config_file = Path("/app/autotrain_ultra_low_final.yaml")
+
+    info = {
+        "train_file_exists": train_file.exists(),
+        "validation_file_exists": val_file.exists(),
+        "config_file_exists": config_file.exists(),
+        "train_file_size": train_file.stat().st_size if train_file.exists() else 0,
+        "validation_file_size": val_file.stat().st_size if val_file.exists() else 0,
+        "config_file_size": config_file.stat().st_size if config_file.exists() else 0
+    }
+
+    return info
+
+if __name__ == "__main__":
+    uvicorn.run(app, host="0.0.0.0", port=7860)
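
For completeness, a minimal client-side smoke test of the Space's API — a sketch using only the standard library, assuming the app is reachable at localhost:7860 (the port passed to uvicorn.run above):

import json
from urllib.request import Request, urlopen

BASE = "http://localhost:7860"  # port from uvicorn.run(...) in this file

# Kick off a run with the default model name from TrainingRequest.
payload = json.dumps({"model_name": "amis5895/exaone-1p2b-nutrition-kdri"}).encode()
req = Request(f"{BASE}/start_training", data=payload,
              headers={"Content-Type": "application/json"}, method="POST")
print(json.load(urlopen(req)))

# Poll the status endpoint and fetch the start of the accumulated logs.
print(json.load(urlopen(f"{BASE}/status")))
print(json.load(urlopen(f"{BASE}/logs"))["logs"][:500])
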