Enable removing the GPU after model training is done
- app.py +1 -1
- app_training.py +6 -3
- requirements.txt +1 -1
- trainer.py +13 -2
app.py

@@ -43,7 +43,7 @@ def show_warning(warning_text: str) -> gr.Blocks:
 
 
 pipe = InferencePipeline(HF_TOKEN)
-trainer = Trainer()
+trainer = Trainer(HF_TOKEN)
 
 with gr.Blocks(css='style.css') as demo:
     if os.getenv('IS_SHARED_UI'):
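The only change here is threading the Hub token through the `Trainer` constructor. A minimal sketch of the two call patterns this enables (the `offline_trainer` name is hypothetical, not from the commit):

```python
import os

from trainer import Trainer

# With a token, the Trainer can push to the Hub and manage Space hardware.
trainer = Trainer(os.getenv('HF_TOKEN'))

# hf_token defaults to None (see trainer.py below), so purely local
# training without Hub access still constructs fine.
offline_trainer = Trainer()
```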
app_training.py

@@ -92,9 +92,10 @@ def create_training_demo(trainer: Trainer,
             - **Note:** Due to [this issue](https://github.com/huggingface/accelerate/issues/944), currently, training will not terminate properly if you use W&B.
             ''')
 
-            # TODO currently disabled
             remove_gpu_after_training = gr.Checkbox(
-                label='Remove GPU after training',
+                label='Remove GPU after training',
+                value=False,
+                interactive=bool(os.getenv('SPACE_ID')))
             run_button = gr.Button('Start Training')
 
             with gr.Box():

@@ -125,12 +126,14 @@ def create_training_demo(trainer: Trainer,
                     use_private_repo,
                     delete_existing_repo,
                     upload_to,
+                    remove_gpu_after_training,
                 ],
                 outputs=output_message)
     return demo
 
 
 if __name__ == '__main__':
-    trainer = Trainer()
+    hf_token = os.getenv('HF_TOKEN')
+    trainer = Trainer(hf_token)
     demo = create_training_demo(trainer)
     demo.queue(max_size=1).launch(share=False)
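`SPACE_ID` is set automatically inside a Hugging Face Space, so `interactive=bool(os.getenv('SPACE_ID'))` grays the checkbox out during local runs, where there is no Space hardware to release. A minimal standalone sketch of that gating pattern, assuming gradio 3.x:

```python
import os

import gradio as gr

# The checkbox defaults to off and is only clickable when the app is
# actually running inside a Space (SPACE_ID is defined there and only there).
running_on_spaces = bool(os.getenv('SPACE_ID'))
remove_gpu_after_training = gr.Checkbox(label='Remove GPU after training',
                                        value=False,
                                        interactive=running_on_spaces)
```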
requirements.txt

@@ -4,7 +4,7 @@ datasets==2.8.0
 git+https://github.com/huggingface/diffusers@febaf863026bd014b7a14349336544fc109d0f57#egg=diffusers
 ftfy==6.1.1
 gradio==3.14.0
-huggingface-hub
+git+https://github.com/huggingface/huggingface_hub@bdb9d06b5e67269d702860ca60e1cdb106a66c91#egg=huggingface-hub
 Pillow==9.4.0
 python-slugify==7.0.0
 tensorboard==2.11.2
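The pin swaps the PyPI release for a specific `huggingface_hub` commit, presumably because `HfApi.request_space_hardware` (used in trainer.py below) had not shipped in a tagged release yet. A quick hedged sanity check for any installed build:

```python
# Fails fast if the installed huggingface_hub predates the Space-hardware API.
from huggingface_hub import HfApi

assert hasattr(HfApi, 'request_space_hardware'), (
    'huggingface_hub too old; install the commit pinned in requirements.txt')
```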
trainer.py

@@ -11,6 +11,7 @@ import gradio as gr
 import PIL.Image
 import slugify
 import torch
+from huggingface_hub import HfApi
 
 from constants import UploadTarget
 

@@ -30,6 +31,10 @@ def pad_image(image: PIL.Image.Image) -> PIL.Image.Image:
 
 
 class Trainer:
+    def __init__(self, hf_token: str | None = None):
+        self.hf_token = hf_token
+        self.api = HfApi(token=hf_token)
+
     def prepare_dataset(self, instance_images: list, resolution: int,
                         instance_data_dir: pathlib.Path) -> None:
         shutil.rmtree(instance_data_dir, ignore_errors=True)

@@ -64,6 +69,7 @@ class Trainer:
             use_private_repo: bool,
             delete_existing_repo: bool,
             upload_to: str,
+            remove_gpu_after_training: bool,
     ) -> str:
         if not torch.cuda.is_available():
             raise gr.Error('CUDA is not available.')

@@ -116,8 +122,7 @@ class Trainer:
         if use_wandb:
             command += ' --report_to wandb'
         if upload_to_hub:
-            hf_token = os.getenv('HF_TOKEN')
-            command += f' --push_to_hub --hub_token {hf_token}'
+            command += f' --push_to_hub --hub_token {self.hf_token}'
         if use_private_repo:
             command += ' --private_repo'
         if delete_existing_repo:

@@ -127,6 +132,12 @@ class Trainer:
 
         subprocess.run(shlex.split(command))
 
+        if remove_gpu_after_training:
+            space_id = os.getenv('SPACE_ID')
+            if space_id:
+                self.api.request_space_hardware(repo_id=space_id,
+                                                hardware='cpu-basic')
+
         with open(output_dir / 'train.sh', 'w') as f:
             command_s = ' '.join(command.split())
             f.write(command_s)
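Since `subprocess.run` blocks, the downgrade request only fires after training, including any `--push_to_hub` upload done by the training script, has exited; requesting new hardware restarts the Space, so anything still running at that point would be interrupted. A hedged sketch of the same flow as a standalone helper, additionally checking the current hardware first (assuming the pinned build also provides `HfApi.get_space_runtime`):

```python
import os

from huggingface_hub import HfApi

def release_gpu(api: HfApi) -> None:
    """Switch the current Space back to free hardware after a job."""
    space_id = os.getenv('SPACE_ID')
    if space_id is None:
        return  # not running inside a Space; nothing to release
    runtime = api.get_space_runtime(repo_id=space_id)
    # SpaceHardware is a str enum, so comparing against 'cpu-basic' works.
    if runtime.hardware != 'cpu-basic':
        # This restarts the Space, so call it only as the very last step.
        api.request_space_hardware(repo_id=space_id, hardware='cpu-basic')
```

One caveat worth noting: trainer.py still writes `train.sh` after the hardware request, which may race with the Space restart; making the downgrade the final statement would avoid that.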