# From PyTorch:
#
# Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2016-      Facebook, Inc            (Adam Paszke)
# Copyright (c) 2014-      Facebook, Inc            (Soumith Chintala)
# Copyright (c) 2011-2014  Idiap Research Institute (Ronan Collobert)
# Copyright (c) 2012-2014  Deepmind Technologies    (Koray Kavukcuoglu)
# Copyright (c) 2011-2012  NEC Laboratories America (Koray Kavukcuoglu)
# Copyright (c) 2011-2013  NYU                      (Clement Farabet)
# Copyright (c) 2006-2010  NEC Laboratories America (Ronan Collobert, Leon Bottou, Iain Melvin, Jason Weston)
# Copyright (c) 2006       Idiap Research Institute (Samy Bengio)
# Copyright (c) 2001-2004  Idiap Research Institute (Ronan Collobert, Samy Bengio, Johnny Mariethoz)
#
# From Caffe2:
#
# Copyright (c) 2016-present, Facebook Inc. All rights reserved.
#
# All contributions by Facebook:
# Copyright (c) 2016 Facebook Inc.
#
# All contributions by Google:
# Copyright (c) 2015 Google Inc.
# All rights reserved.
#
# All contributions by Yangqing Jia:
# Copyright (c) 2015 Yangqing Jia
# All rights reserved.
#
# All contributions from Caffe:
# Copyright(c) 2013, 2014, 2015, the respective contributors
# All rights reserved.
#
# All other contributions:
# Copyright(c) 2015, 2016 the respective contributors
# All rights reserved.
#
# Caffe2 uses a copyright model similar to Caffe: each contributor holds
# copyright over their contributions to Caffe2. The project versioning records
# all such contribution and copyright details. If a contributor wants to further
# mark their specific copyright on a particular contribution, they should
# indicate their copyright solely in the commit message of the change when it is
# committed.
#
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#
# 3. Neither the names of Facebook, Deepmind Technologies, NYU, NEC Laboratories America
#    and IDIAP Research Institute nor the names of its contributors may be
#    used to endorse or promote products derived from this software without
#    specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
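"""
Launch helper that spawns one training process per GPU and sets the
environment variables consumed by torch.distributed (MASTER_ADDR, MASTER_PORT,
WORLD_SIZE, RANK and LOCAL_RANK) for each of them.

Illustrative invocation only -- the launcher file name ``launch.py``,
``train.py`` and its trailing arguments are placeholders for your own paths
and options:

    python launch.py --nnodes=1 --node_rank=0 --nproc_per_node=8 train.py --arg1 --arg2

For multi-node training, run the same command on every node with that node's
--node_rank and point --master_addr / --master_port at the rank-0 node.
"""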
import os
import subprocess
import sys
import time
from argparse import ArgumentParser, REMAINDER


def parse_args():
    """
    Parse the command line options for the launch helper.

    Returns the parsed arguments as an argparse.Namespace.
    """
    parser = ArgumentParser(description="PyTorch distributed training launch "
                                        "helper utility that will spawn up "
                                        "multiple distributed processes")

    # Optional arguments for the launch helper
    parser.add_argument("--nnodes", type=int, default=1,
                        help="The number of nodes to use for distributed "
                             "training")
    parser.add_argument("--node_rank", type=int, default=0,
                        help="The rank of the node for multi-node distributed "
                             "training")
    parser.add_argument("--nproc_per_node", type=int, default=1,
                        help="The number of processes to launch on each node. "
                             "For GPU training, this is recommended to be set "
                             "to the number of GPUs in your system so that "
                             "each process can be bound to a single GPU.")
    parser.add_argument("--master_addr", default="127.0.0.1", type=str,
                        help="Master node (rank 0)'s address; should be either "
                             "the IP address or the hostname of node 0. For "
                             "single-node multi-process training, "
                             "--master_addr can simply be 127.0.0.1")
    parser.add_argument("--master_port", default=29500, type=int,
                        help="Master node (rank 0)'s free port that needs to "
                             "be used for communication during distributed "
                             "training")

    # positional
    parser.add_argument("training_script", type=str,
                        help="The full path to the single-GPU training "
                             "program/script to be launched in parallel, "
                             "followed by all the arguments for the "
                             "training script")

    # the rest of the arguments are passed through to the training program
    parser.add_argument("training_script_args", nargs=REMAINDER)
    return parser.parse_args()


def main():
    args = parse_args()

    # world size in terms of the total number of processes across all nodes
    dist_world_size = args.nproc_per_node * args.nnodes

    # set the PyTorch-distributed-related environment variables
    current_env = os.environ.copy()
    current_env["MASTER_ADDR"] = args.master_addr
    current_env["MASTER_PORT"] = str(args.master_port)
    current_env["WORLD_SIZE"] = str(dist_world_size)

    processes = []

    for local_rank in range(args.nproc_per_node):
        # each process's global rank
        dist_rank = args.nproc_per_node * args.node_rank + local_rank
        current_env["RANK"] = str(dist_rank)
        current_env["LOCAL_RANK"] = str(local_rank)

        # spawn the process; rank 0 inherits this launcher's stdout, every
        # other rank writes to its own log file such as GPU_1.log
        cmd = [sys.executable, "-u", args.training_script] + args.training_script_args
        print(cmd)

        stdout = None if local_rank == 0 else open("GPU_" + str(local_rank) + ".log", "w")

        process = subprocess.Popen(cmd, env=current_env, stdout=stdout)
        processes.append(process)

    try:
        # poll the workers once per second; stop as soon as they have all
        # exited cleanly or any one of them has failed
        up = True
        error = False
        while up and not error:
            up = False
            for p in processes:
                ret = p.poll()
                if ret is None:
                    up = True
                elif ret != 0:
                    error = True
            time.sleep(1)

        if error:
            # a worker failed: terminate the survivors and propagate the failure
            for p in processes:
                if p.poll() is None:
                    p.terminate()
            sys.exit(1)
    except BaseException:
        # covers KeyboardInterrupt, SystemExit and any other exception: make
        # sure no worker process is left running, then re-raise
        for p in processes:
            p.terminate()
        raise


if __name__ == "__main__":
    main()