Spaces:
Runtime error
Runtime error
def generate_ds_config(ds_bf16, train_batch_size, nvme_offload_dir):
    """
    Build the DeepSpeed configuration dictionary.

    Reference:
    https://huggingface.co/docs/transformers/main_classes/deepspeed

    The configuration always uses ZeRO stage 3 with parameter offload.
    Parameters are offloaded to NVMe when ``nvme_offload_dir`` is truthy
    and to CPU otherwise.

    Args:
        ds_bf16: Stored as-is under ``bf16.enabled``; its logical negation
            is stored under ``fp16.enabled``.
        train_batch_size: Stored as-is under ``train_batch_size``.
        nvme_offload_dir: Stored as ``nvme_path`` of the parameter offload
            section when truthy. A falsy value (``None``, ``""``) selects
            CPU offload.

    Returns:
        A new ``dict`` holding the DeepSpeed settings.
    """
    use_nvme = bool(nvme_offload_dir)

    # Where the ZeRO-3 parameters get offloaded to.
    if use_nvme:
        param_offload = {
            "device": "nvme",
            "nvme_path": nvme_offload_dir,
            "pin_memory": True,
            "buffer_count": 5,
            "buffer_size": 1e9,
            "max_in_cpu": 1e9,
        }
    else:
        param_offload = {
            "device": "cpu",
            "pin_memory": True,
        }

    # ZeRO section: shared head, then the branch-specific entries, then the
    # "auto" knobs. Entries are added in the same order as each original
    # literal so a serialized dump keeps its layout.
    zero_section = {
        "stage": 3,
        "offload_param": param_offload,
        "overlap_comm": True,
    }
    if use_nvme:
        zero_section["reduce_bucket_size"] = "auto"
        zero_section["contiguous_gradients"] = True
        zero_section["sub_group_size"] = 1e8
    else:
        zero_section["contiguous_gradients"] = True
        zero_section["reduce_bucket_size"] = "auto"
    for auto_key in (
        "stage3_prefetch_bucket_size",
        "stage3_param_persistence_threshold",
        "stage3_max_live_parameters",
        "stage3_max_reuse_distance",
    ):
        zero_section[auto_key] = "auto"

    # Exactly one of fp16 / bf16 is enabled for boolean ``ds_bf16``.
    settings = {
        "fp16": {"enabled": not ds_bf16},
        "bf16": {"enabled": ds_bf16},
        "zero_optimization": zero_section,
    }

    # The async I/O section is only emitted together with NVMe offload.
    if use_nvme:
        settings["aio"] = {
            "block_size": 262144,
            "queue_depth": 32,
            "thread_count": 1,
            "single_submit": False,
            "overlap_events": True,
        }

    settings["steps_per_print"] = 2000
    settings["train_batch_size"] = train_batch_size
    settings["train_micro_batch_size_per_gpu"] = 1
    settings["wall_clock_breakdown"] = False
    return settings