amis5895 committed on
Commit
fc9016a
·
1 Parent(s): 44c69fc

Update app.py with simple training simulation

Browse files
Files changed (2) hide show
  1. app.py +56 -73
  2. app_simple.py +197 -0
app.py CHANGED
@@ -1,6 +1,6 @@
1
  #!/usr/bin/env python3
2
  """
3
- EXAONE Fine-tuning Space FastAPI 애플리케이션
4
  """
5
 
6
  import os
@@ -38,8 +38,6 @@ training_status = {
38
 
39
  class TrainingRequest(BaseModel):
40
  model_name: str = "amis5895/exaone-1p2b-nutrition-kdri"
41
- dataset_path: str = "/app/data"
42
- config_path: str = "/app/autotrain_ultra_low_final.yaml"
43
 
44
  @app.get("/")
45
  async def root():
@@ -66,7 +64,7 @@ async def start_training(request: TrainingRequest, background_tasks: BackgroundT
66
  })
67
 
68
  # ๋ฐฑ๊ทธ๋ผ์šด๋“œ์—์„œ ํ•™์Šต ์‹œ์ž‘
69
- background_tasks.add_task(run_training, request)
70
 
71
  return {
72
  "message": "Training started",
@@ -74,89 +72,60 @@ async def start_training(request: TrainingRequest, background_tasks: BackgroundT
74
  "model_name": request.model_name
75
  }
76
 
77
- async def run_training(request: TrainingRequest):
78
- """์‹ค์ œ ํ•™์Šต ์‹คํ–‰"""
79
  global training_status
80
 
81
  try:
82
- logger.info("Starting training process...")
83
  training_status["status"] = "running"
84
 
85
- # AutoTrain ๋ช…๋ น์–ด ์‹คํ–‰
86
- cmd = [
87
- "autotrain", "llm",
88
- "--train",
89
- "--project_name", "exaone-finetuning",
90
- "--model", "LGAI-EXAONE/EXAONE-4.0-1.2B",
91
- "--data_path", request.dataset_path,
92
- "--text_column", "text",
93
- "--use_peft",
94
- "--quantization", "int4",
95
- "--lora_r", "16",
96
- "--lora_alpha", "32",
97
- "--lora_dropout", "0.05",
98
- "--target_modules", "all-linear",
99
- "--epochs", "3",
100
- "--batch_size", "4",
101
- "--gradient_accumulation", "4",
102
- "--learning_rate", "2e-4",
103
- "--warmup_ratio", "0.03",
104
- "--mixed_precision", "fp16",
105
- "--push_to_hub",
106
- "--hub_model_id", request.model_name,
107
- "--username", "amis5895"
108
- ]
109
 
110
- process = subprocess.Popen(
111
- cmd,
112
- stdout=subprocess.PIPE,
113
- stderr=subprocess.STDOUT,
114
- text=True,
115
- bufsize=1,
116
- universal_newlines=True
117
- )
118
-
119
- # ํ•™์Šต ์ง„ํ–‰ ์ƒํ™ฉ ๋ชจ๋‹ˆํ„ฐ๋ง
120
- for line in process.stdout:
121
- logger.info(line.strip())
122
-
123
- # ์ง„ํ–‰๋ฅ  ํŒŒ์‹ฑ (๊ฐ„๋‹จํ•œ ์˜ˆ์‹œ)
124
- if "epoch" in line.lower():
125
- training_status["current_epoch"] += 1
126
- training_status["progress"] = (training_status["current_epoch"] / training_status["total_epochs"]) * 100
127
-
128
- if "loss" in line.lower():
129
- try:
130
- # ์†์‹ค๊ฐ’ ์ถ”์ถœ (๊ฐ„๋‹จํ•œ ์˜ˆ์‹œ)
131
- parts = line.split()
132
- for i, part in enumerate(parts):
133
- if part == "loss" and i + 1 < len(parts):
134
- training_status["loss"] = float(parts[i + 1])
135
- break
136
- except:
137
- pass
138
-
139
- process.wait()
140
-
141
- if process.returncode == 0:
142
  training_status.update({
143
  "is_running": False,
144
- "progress": 100,
145
- "status": "completed"
146
  })
147
- logger.info("Training completed successfully!")
148
- else:
 
 
149
  training_status.update({
150
  "is_running": False,
151
- "status": "failed"
 
152
  })
153
- logger.error("Training failed!")
 
 
 
 
 
 
 
 
154
 
 
 
 
 
 
 
 
 
 
 
155
  except Exception as e:
156
  logger.error(f"Training error: {str(e)}")
157
  training_status.update({
158
  "is_running": False,
159
- "status": "error"
 
160
  })
161
 
162
  @app.get("/status")
@@ -183,9 +152,9 @@ async def stream_logs():
183
  if log_file.exists():
184
  with open(log_file, "r", encoding="utf-8") as f:
185
  for line in f:
186
- yield f"data: {line}\n\n"
187
  else:
188
- yield "data: No logs available\n\n"
189
 
190
  return StreamingResponse(generate_logs(), media_type="text/plain")
191
 
@@ -197,7 +166,6 @@ async def stop_training():
197
  if not training_status["is_running"]:
198
  raise HTTPException(status_code=400, detail="No training is running")
199
 
200
- # ํ•™์Šต ํ”„๋กœ์„ธ์Šค ์ค‘์ง€ (๊ฐ„๋‹จํ•œ ์˜ˆ์‹œ)
201
  training_status.update({
202
  "is_running": False,
203
  "status": "stopped"
@@ -210,5 +178,20 @@ async def health_check():
210
  """ํ—ฌ์Šค ์ฒดํฌ"""
211
  return {"status": "healthy", "timestamp": "2024-01-01T00:00:00Z"}
212
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
213
  if __name__ == "__main__":
214
  uvicorn.run(app, host="0.0.0.0", port=7860)
 
1
  #!/usr/bin/env python3
2
  """
3
+ 간단한 EXAONE Fine-tuning Space FastAPI 애플리케이션
4
  """
5
 
6
  import os
 
38
 
39
  class TrainingRequest(BaseModel):
40
  model_name: str = "amis5895/exaone-1p2b-nutrition-kdri"
 
 
41
 
42
  @app.get("/")
43
  async def root():
 
64
  })
65
 
66
  # ๋ฐฑ๊ทธ๋ผ์šด๋“œ์—์„œ ํ•™์Šต ์‹œ์ž‘
67
+ background_tasks.add_task(run_training_simple, request)
68
 
69
  return {
70
  "message": "Training started",
 
72
  "model_name": request.model_name
73
  }
74
 
75
+ async def run_training_simple(request: TrainingRequest):
76
+ """๊ฐ„๋‹จํ•œ ํ•™์Šต ์‹คํ–‰ (์‹œ๋ฎฌ๋ ˆ์ด์…˜)"""
77
  global training_status
78
 
79
  try:
80
+ logger.info("Starting simple training process...")
81
  training_status["status"] = "running"
82
 
83
+ # ๋ฐ์ดํ„ฐ ํŒŒ์ผ ํ™•์ธ
84
+ train_file = Path("/app/train.csv")
85
+ val_file = Path("/app/validation.csv")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
86
 
87
+ if not train_file.exists():
88
+ logger.error(f"Training file not found: {train_file}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
89
  training_status.update({
90
  "is_running": False,
91
+ "status": "failed",
92
+ "error": "Training file not found"
93
  })
94
+ return
95
+
96
+ if not val_file.exists():
97
+ logger.error(f"Validation file not found: {val_file}")
98
  training_status.update({
99
  "is_running": False,
100
+ "status": "failed",
101
+ "error": "Validation file not found"
102
  })
103
+ return
104
+
105
+ logger.info("Data files found, starting training simulation...")
106
+
107
+ # ๊ฐ„๋‹จํ•œ ํ›ˆ๋ จ ์‹œ๋ฎฌ๋ ˆ์ด์…˜
108
+ for epoch in range(1, 4):
109
+ training_status["current_epoch"] = epoch
110
+ training_status["progress"] = (epoch / 3) * 100
111
+ training_status["loss"] = 2.5 - (epoch * 0.5) # ์‹œ๋ฎฌ๋ ˆ์ด์…˜ ์†์‹ค๊ฐ’
112
 
113
+ logger.info(f"Epoch {epoch}/3 - Loss: {training_status['loss']:.3f}")
114
+ await asyncio.sleep(5) # 5์ดˆ ๋Œ€๊ธฐ (์‹œ๋ฎฌ๋ ˆ์ด์…˜)
115
+
116
+ training_status.update({
117
+ "is_running": False,
118
+ "progress": 100,
119
+ "status": "completed"
120
+ })
121
+ logger.info("Training completed successfully!")
122
+
123
  except Exception as e:
124
  logger.error(f"Training error: {str(e)}")
125
  training_status.update({
126
  "is_running": False,
127
+ "status": "error",
128
+ "error": str(e)
129
  })
130
 
131
  @app.get("/status")
 
152
  if log_file.exists():
153
  with open(log_file, "r", encoding="utf-8") as f:
154
  for line in f:
155
+ yield f"data: {line}\\n\\n"
156
  else:
157
+ yield "data: No logs available\\n\\n"
158
 
159
  return StreamingResponse(generate_logs(), media_type="text/plain")
160
 
 
166
  if not training_status["is_running"]:
167
  raise HTTPException(status_code=400, detail="No training is running")
168
 
 
169
  training_status.update({
170
  "is_running": False,
171
  "status": "stopped"
 
178
  """ํ—ฌ์Šค ์ฒดํฌ"""
179
  return {"status": "healthy", "timestamp": "2024-01-01T00:00:00Z"}
180
 
181
+ @app.get("/data_info")
182
+ async def get_data_info():
183
+ """๋ฐ์ดํ„ฐ ์ •๋ณด ์กฐํšŒ"""
184
+ train_file = Path("/app/train.csv")
185
+ val_file = Path("/app/validation.csv")
186
+
187
+ info = {
188
+ "train_file_exists": train_file.exists(),
189
+ "validation_file_exists": val_file.exists(),
190
+ "train_file_size": train_file.stat().st_size if train_file.exists() else 0,
191
+ "validation_file_size": val_file.stat().st_size if val_file.exists() else 0
192
+ }
193
+
194
+ return info
195
+
196
  if __name__ == "__main__":
197
  uvicorn.run(app, host="0.0.0.0", port=7860)
app_simple.py ADDED
@@ -0,0 +1,197 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ 간단한 EXAONE Fine-tuning Space FastAPI 애플리케이션
4
+ """
5
+
6
+ import os
7
+ import json
8
+ import subprocess
9
+ import asyncio
10
+ from pathlib import Path
11
+ from typing import Dict, Any
12
+ import logging
13
+
14
+ from fastapi import FastAPI, HTTPException, BackgroundTasks
15
+ from fastapi.responses import StreamingResponse
16
+ from pydantic import BaseModel
17
+ import uvicorn
18
+
19
# Logging configuration: INFO level, with a logger named after this module.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

app = FastAPI(
    title="EXAONE Fine-tuning",
    # Korean for "EXAONE 4.0 1.2B model fine-tuning API"; the text had been
    # mis-encoded and is restored here so the generated API docs are readable.
    description="EXAONE 4.0 1.2B 모델 파인튜닝 API",
    version="1.0.0",
)

# Module-level training state. It is mutated in place by the endpoints and by
# the background training task, and returned unchanged by GET /status.
training_status: Dict[str, Any] = {
    "is_running": False,   # True between /start_training and a terminal state
    "progress": 0,         # percentage, 0-100
    "current_epoch": 0,
    "total_epochs": 3,
    "loss": 0.0,
    "status": "idle",
}
38
+
39
class TrainingRequest(BaseModel):
    """Request body accepted by POST /start_training."""

    # Identifier echoed back in the /start_training response.
    model_name: str = "amis5895/exaone-1p2b-nutrition-kdri"
41
+
42
@app.get("/")
async def root():
    """Root endpoint: return a static banner with the API name and version."""
    banner = dict(
        message="EXAONE Fine-tuning API",
        status="running",
        version="1.0.0",
    )
    return banner
50
+
51
@app.post("/start_training")
async def start_training(request: TrainingRequest, background_tasks: BackgroundTasks):
    """Start a training run in the background.

    Args:
        request: Training parameters; only ``model_name`` is used here and it
            is echoed back in the response.
        background_tasks: FastAPI task queue on which the run is scheduled.

    Returns:
        A dict with a confirmation message, the "starting" status and the
        requested model name.

    Raises:
        HTTPException: 400 if a run is already in progress.
    """
    if training_status["is_running"]:
        raise HTTPException(status_code=400, detail="Training is already running")

    # Clear leftovers of a previous run so /status does not report a stale
    # failure reason or loss value next to the new run.
    training_status.pop("error", None)
    training_status.update({
        "is_running": True,
        "progress": 0,
        "current_epoch": 0,
        "loss": 0.0,
        "status": "starting",
    })

    # Run the training itself in the background.
    background_tasks.add_task(run_training_simple, request)

    return {
        "message": "Training started",
        "status": "starting",
        "model_name": request.model_name,
    }
74
+
75
def _mark_training_failed(reason: str) -> None:
    """Put the shared state into the terminal "failed" state with a reason."""
    training_status.update({
        "is_running": False,
        "status": "failed",
        "error": reason,
    })


async def run_training_simple(request: TrainingRequest):
    """Run a simulated training job and publish progress in ``training_status``.

    No model is trained. After checking that the two CSV data files exist,
    the function walks through the configured number of epochs, updating the
    epoch, progress and a synthetic loss value, and sleeping 5 seconds per
    epoch. A stop request (``is_running`` set to False by another handler)
    is honoured at each epoch boundary, and the "stopped" status is left
    untouched instead of being overwritten with "completed".

    Args:
        request: The request that triggered the run; currently unused.
    """
    try:
        logger.info("Starting simple training process...")
        training_status["status"] = "running"

        # Check that the data files exist; training file first, then validation.
        required_files = (
            ("Training", Path("/app/train.csv")),
            ("Validation", Path("/app/validation.csv")),
        )
        for label, data_file in required_files:
            if not data_file.exists():
                logger.error(f"{label} file not found: {data_file}")
                _mark_training_failed(f"{label} file not found")
                return

        logger.info("Data files found, starting training simulation...")

        # Simple training simulation.
        total_epochs = training_status.get("total_epochs", 3)
        for epoch in range(1, total_epochs + 1):
            if not training_status["is_running"]:
                # Stopped from outside; keep the status set by the stopper.
                logger.info("Training stopped before epoch %d", epoch)
                return

            training_status["current_epoch"] = epoch
            training_status["progress"] = (epoch / total_epochs) * 100
            training_status["loss"] = 2.5 - (epoch * 0.5)  # simulated loss value

            logger.info(f"Epoch {epoch}/{total_epochs} - Loss: {training_status['loss']:.3f}")
            await asyncio.sleep(5)  # wait 5 seconds (simulation)

        if not training_status["is_running"]:
            logger.info("Training stopped during the final epoch")
            return

        training_status.update({
            "is_running": False,
            "progress": 100,
            "status": "completed",
        })
        logger.info("Training completed successfully!")

    except Exception as e:
        # Top-level boundary of the background task: record the failure and
        # log it with its traceback rather than letting it vanish.
        logger.exception("Training error: %s", e)
        training_status.update({
            "is_running": False,
            "status": "error",
            "error": str(e),
        })
130
+
131
@app.get("/status")
async def get_status():
    """Return the shared training-state dictionary as it currently stands."""
    current_state = training_status
    return current_state
135
+
136
@app.get("/logs")
async def get_logs():
    """Return the whole training log file as one string under the "logs" key.

    When the log file does not exist, a placeholder message is returned
    under the same key.
    """
    log_path = Path("/app/training.log")
    if not log_path.exists():
        return {"logs": "No logs available"}

    contents = log_path.read_text(encoding="utf-8")
    return {"logs": contents}
146
+
147
@app.get("/logs/stream")
async def stream_logs():
    """Stream the training log line by line as ``data:`` events.

    The log file is read once from start to end; the stream finishes at end
    of file rather than following later writes. When the file does not exist
    a single placeholder event is sent.

    Returns:
        A StreamingResponse whose body is a sequence of ``data: <line>``
        records, each ended by a blank line.
    """
    def generate_logs():
        log_file = Path("/app/training.log")
        if not log_file.exists():
            yield "data: No logs available\n\n"
            return

        with open(log_file, "r", encoding="utf-8") as f:
            for line in f:
                # Drop the line's own terminator and end every event with a
                # real blank line. The previous "\\n\\n" sent a literal
                # backslash-n, so events were never delimited.
                text = line.rstrip("\r\n")
                yield f"data: {text}\n\n"

    # NOTE(review): the payload uses server-sent-event framing but is served as
    # text/plain; kept unchanged for existing clients. Confirm whether
    # "text/event-stream" is wanted.
    return StreamingResponse(generate_logs(), media_type="text/plain")
160
+
161
@app.post("/stop_training")
async def stop_training():
    """Mark the current training run as stopped.

    Only the shared state flags are changed here.

    Raises:
        HTTPException: 400 if no run is in progress.
    """
    global training_status

    currently_running = training_status["is_running"]
    if not currently_running:
        raise HTTPException(status_code=400, detail="No training is running")

    training_status["is_running"] = False
    training_status["status"] = "stopped"

    return {"message": "Training stopped"}
175
+
176
@app.get("/health")
async def health_check():
    """Health check: report liveness together with the current UTC time.

    Returns:
        A dict with ``status`` set to "healthy" and ``timestamp`` as an
        ISO-8601 UTC string such as ``2024-01-01T00:00:00Z``.
    """
    # Imported locally so this block does not depend on file-level imports.
    from datetime import datetime, timezone

    # The timestamp used to be a fixed constant, which made it useless for
    # telling a live response from a cached one; the format is unchanged.
    now_utc = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
    return {"status": "healthy", "timestamp": now_utc}
180
+
181
@app.get("/data_info")
async def get_data_info():
    """Report whether the training and validation CSV files exist and their sizes.

    Sizes are in bytes, with 0 reported for a missing file.
    """
    data_files = {
        "train": Path("/app/train.csv"),
        "validation": Path("/app/validation.csv"),
    }
    present = {label: path.exists() for label, path in data_files.items()}

    # Keys are emitted in the same order as before: both *_exists, then both *_size.
    info = {f"{label}_file_exists": present[label] for label in data_files}
    for label, path in data_files.items():
        size = path.stat().st_size if present[label] else 0
        info[f"{label}_file_size"] = size

    return info
195
+
196
if __name__ == "__main__":
    # Serve the app on all interfaces, port 7860, when run as a script.
    server_options = {"host": "0.0.0.0", "port": 7860}
    uvicorn.run(app, **server_options)