#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Distributed process launcher.

This code is modified from https://github.com/pytorch/pytorch/blob/v1.3.0/torch/distributed/launch.py.
"""
import os
import subprocess
import sys
from argparse import ArgumentParser
from argparse import REMAINDER


def parse_args():
    """Parse arguments."""
    parser = ArgumentParser(
        description="PyTorch distributed training launch "
        "helper utility that will spawn up "
        "multiple distributed processes"
    )
    # Optional arguments for the launch helper
    parser.add_argument(
        "--nnodes",
        type=int,
        default=1,
        help="The number of nodes to use for distributed training",
    )
    parser.add_argument(
        "--node_rank",
        type=int,
        default=0,
        help="The rank of the node for multi-node distributed training",
    )
    parser.add_argument(
        "--nproc_per_node",
        type=int,
        default=1,
        help="The number of processes to launch on each node. "
        "For GPU training, this is recommended to be set "
        "to the number of GPUs in your system so that "
        "each process can be bound to a single GPU.",
    )
    parser.add_argument(
        "--master_addr",
        default="127.0.0.1",
        type=str,
        help="Master node (rank 0)'s address; it should be either "
        "the IP address or the hostname of node 0. For "
        "single-node multi-process training, "
        "--master_addr can simply be 127.0.0.1.",
    )
    parser.add_argument(
        "--master_port",
        default=29500,
        type=int,
        help="Master node (rank 0)'s free port that needs to "
        "be used for communication during distributed "
        "training",
    )
    parser.add_argument(
        "--use_env",
        default=False,
        action="store_true",
        help="Use an environment variable to pass the "
        "'local rank'. For legacy reasons, the default value is False. "
        "If set to True, the script will not pass "
        "--local_rank as an argument, and will instead set LOCAL_RANK.",
    )
    parser.add_argument(
        "-m",
        "--module",
        default=False,
        action="store_true",
        help="Changes each process to interpret the launch script "
        "as a Python module, executing with the same behavior as "
        "'python -m'.",
    )
    parser.add_argument(
        "-c",
        "--command",
        default=False,
        action="store_true",
        help="Changes each process to interpret the launch script as a command.",
    )
    # positional
    parser.add_argument(
        "training_script",
        type=str,
        help="The full path to the single GPU training "
        "program/script/command to be launched in parallel, "
        "followed by all the arguments for the "
        "training script",
    )
    # rest from the training program
    parser.add_argument("training_script_args", nargs=REMAINDER)
    return parser.parse_args()


def main():
    """Launch distributed processes."""
    args = parse_args()
    # world size in terms of number of processes
    dist_world_size = args.nproc_per_node * args.nnodes
    # set PyTorch distributed related environment variables
    current_env = os.environ.copy()
    current_env["MASTER_ADDR"] = args.master_addr
    current_env["MASTER_PORT"] = str(args.master_port)
    current_env["WORLD_SIZE"] = str(dist_world_size)
    processes = []
    if "OMP_NUM_THREADS" not in os.environ and args.nproc_per_node > 1:
        current_env["OMP_NUM_THREADS"] = str(1)
        print(
            "*****************************************\n"
            "Setting OMP_NUM_THREADS environment variable for each process "
            "to be {} by default, to avoid your system being overloaded. "
            "Please further tune the variable for optimal performance in "
            "your application as needed.\n"
            "*****************************************".format(
                current_env["OMP_NUM_THREADS"]
            )
        )
    for local_rank in range(0, args.nproc_per_node):
        # each process's rank
        dist_rank = args.nproc_per_node * args.node_rank + local_rank
        current_env["RANK"] = str(dist_rank)
        current_env["LOCAL_RANK"] = str(local_rank)
        # spawn the processes
        if args.command:
            cmd = [args.training_script]
        else:
            cmd = [sys.executable, "-u"]
            if args.module:
                cmd.append("-m")
            cmd.append(args.training_script)
        if not args.use_env:
            cmd.append("--local_rank={}".format(local_rank))
        cmd.extend(args.training_script_args)
        process = subprocess.Popen(cmd, env=current_env)
        processes.append(process)
    for process in processes:
        process.wait()
        if process.returncode != 0:
            # report the failing process's own command line rather than the
            # command of the last process spawned
            raise subprocess.CalledProcessError(
                returncode=process.returncode, cmd=process.args
            )


if __name__ == "__main__":
    main()
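
# For reference, a minimal sketch of how a launched training script could read
# its rank, matching the --use_env behavior described above (illustrative only;
# the variable names below are placeholders):
#
#   import argparse, os
#   parser = argparse.ArgumentParser()
#   parser.add_argument("--local_rank", type=int, default=0)
#   local_rank = parser.parse_args().local_rank   # default: rank passed as --local_rank
#   # local_rank = int(os.environ["LOCAL_RANK"])  # when launched with --use_env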