215 lines
7.1 KiB
Python
215 lines
7.1 KiB
Python
# From PyTorch:
|
|
#
|
|
# Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
|
|
# Copyright (c) 2016- Facebook, Inc (Adam Paszke)
|
|
# Copyright (c) 2014- Facebook, Inc (Soumith Chintala)
|
|
# Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert)
|
|
# Copyright (c) 2012-2014 Deepmind Technologies (Koray Kavukcuoglu)
|
|
# Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu)
|
|
# Copyright (c) 2011-2013 NYU (Clement Farabet)
|
|
# Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, Iain Melvin, Jason Weston)
|
|
# Copyright (c) 2006 Idiap Research Institute (Samy Bengio)
|
|
# Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio, Johnny Mariethoz)
|
|
#
|
|
# From Caffe2:
|
|
#
|
|
# Copyright (c) 2016-present, Facebook Inc. All rights reserved.
|
|
#
|
|
# All contributions by Facebook:
|
|
# Copyright (c) 2016 Facebook Inc.
|
|
#
|
|
# All contributions by Google:
|
|
# Copyright (c) 2015 Google Inc.
|
|
# All rights reserved.
|
|
#
|
|
# All contributions by Yangqing Jia:
|
|
# Copyright (c) 2015 Yangqing Jia
|
|
# All rights reserved.
|
|
#
|
|
# All contributions from Caffe:
|
|
# Copyright(c) 2013, 2014, 2015, the respective contributors
|
|
# All rights reserved.
|
|
#
|
|
# All other contributions:
|
|
# Copyright(c) 2015, 2016 the respective contributors
|
|
# All rights reserved.
|
|
#
|
|
# Caffe2 uses a copyright model similar to Caffe: each contributor holds
|
|
# copyright over their contributions to Caffe2. The project versioning records
|
|
# all such contribution and copyright details. If a contributor wants to further
|
|
# mark their specific copyright on a particular contribution, they should
|
|
# indicate their copyright solely in the commit message of the change when it is
|
|
# committed.
|
|
#
|
|
# All rights reserved.
|
|
#
|
|
# Redistribution and use in source and binary forms, with or without
|
|
# modification, are permitted provided that the following conditions are met:
|
|
#
|
|
# 1. Redistributions of source code must retain the above copyright
|
|
# notice, this list of conditions and the following disclaimer.
|
|
#
|
|
# 2. Redistributions in binary form must reproduce the above copyright
|
|
# notice, this list of conditions and the following disclaimer in the
|
|
# documentation and/or other materials provided with the distribution.
|
|
#
|
|
# 3. Neither the names of Facebook, Deepmind Technologies, NYU, NEC Laboratories America
|
|
# and IDIAP Research Institute nor the names of its contributors may be
|
|
# used to endorse or promote products derived from this software without
|
|
# specific prior written permission.
|
|
#
|
|
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
|
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
|
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
|
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
|
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
|
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
|
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
|
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
|
# POSSIBILITY OF SUCH DAMAGE.
|
|
import sys
|
|
import subprocess
|
|
import os
|
|
import socket
|
|
import time
|
|
from argparse import ArgumentParser, REMAINDER
|
|
|
|
import torch
|
|
|
|
|
|
def parse_args():
    """Parse and validate the command line options of the launch helper.

    The options are read from ``sys.argv``. Everything that follows the
    positional ``training_script`` argument is collected verbatim into
    ``training_script_args`` so that it can be forwarded to the script.

    Returns:
        argparse.Namespace: the parsed options, with the attributes
        ``nnodes``, ``node_rank``, ``nproc_per_node``, ``master_addr``,
        ``master_port``, ``training_script`` and ``training_script_args``.

    Raises:
        SystemExit: raised by argparse when the command line is malformed,
            when ``--nnodes`` or ``--nproc_per_node`` is smaller than 1, or
            when ``--node_rank`` is outside ``[0, nnodes)``.
    """
    parser = ArgumentParser(
        description="PyTorch distributed training launch "
        "helper utility that will spawn up "
        "multiple distributed processes"
    )

    # Optional arguments for the launch helper
    parser.add_argument(
        "--nnodes",
        type=int,
        default=1,
        help="The number of nodes to use for distributed training",
    )
    parser.add_argument(
        "--node_rank",
        type=int,
        default=0,
        help="The rank of the node for multi-node distributed training",
    )
    parser.add_argument(
        "--nproc_per_node",
        type=int,
        default=1,
        help="The number of processes to launch on each node, "
        "for GPU training, this is recommended to be set "
        "to the number of GPUs in your system so that "
        "each process can be bound to a single GPU.",
    )
    parser.add_argument(
        "--master_addr",
        default="127.0.0.1",
        type=str,
        help="Master node (rank 0)'s address, should be either "
        "the IP address or the hostname of node 0, for "
        "single node multi-proc training, the "
        "--master_addr can simply be 127.0.0.1",
    )
    parser.add_argument(
        "--master_port",
        default=29500,
        type=int,
        help="Master node (rank 0)'s free port that needs to "
        "be used for communication during distributed "
        "training",
    )

    # Positional: the script to launch
    parser.add_argument(
        "training_script",
        type=str,
        help="The full path to the single GPU training "
        "program/script to be launched in parallel, "
        "followed by all the arguments for the "
        "training script",
    )

    # Everything after the script path belongs to the training program
    parser.add_argument("training_script_args", nargs=REMAINDER)

    args = parser.parse_args()

    # Reject topologies that cannot produce a valid set of ranks. The launcher
    # derives each rank as nproc_per_node * node_rank + local_rank and the
    # world size as nproc_per_node * nnodes, so these bounds keep every rank
    # inside [0, world_size).
    if args.nnodes < 1:
        parser.error("--nnodes must be at least 1")
    if args.nproc_per_node < 1:
        parser.error("--nproc_per_node must be at least 1")
    if not 0 <= args.node_rank < args.nnodes:
        parser.error("--node_rank must satisfy 0 <= node_rank < nnodes")

    return args
|
|
|
|
|
|
def main():
    """Spawn the training script once per local process and supervise the children.

    The command line is parsed with ``parse_args``. A copy of the current
    environment is extended with ``MASTER_ADDR``, ``MASTER_PORT`` and
    ``WORLD_SIZE``, and for every local rank with ``RANK`` and ``LOCAL_RANK``,
    before the training script is started with the current interpreter in
    unbuffered mode (``-u``).

    The child with local rank 0 inherits the launcher's stdout/stderr; every
    other child writes both streams to ``GPU_<local_rank>.log`` in the current
    working directory.

    The children are polled until all of them have exited or one of them has
    exited with a non-zero status. In the latter case the children that are
    still running are terminated and the launcher exits with status 1.

    Raises:
        SystemExit: with code 1 when any child exits with a non-zero status.
        BaseException: anything raised while spawning or supervising
            (including ``KeyboardInterrupt``) is re-raised after the children
            that are still running have been terminated.
    """
    args = parse_args()

    # world size in terms of number of processes
    dist_world_size = args.nproc_per_node * args.nnodes

    # set PyTorch distributed related environment variables
    current_env = os.environ.copy()
    current_env["MASTER_ADDR"] = args.master_addr
    current_env["MASTER_PORT"] = str(args.master_port)
    current_env["WORLD_SIZE"] = str(dist_world_size)

    # The command is identical for every local rank; only the environment differs.
    cmd = [sys.executable, "-u", args.training_script] + args.training_script_args

    processes = []
    log_files = []

    try:
        # Spawning happens inside the try block so that a failure part-way
        # through (e.g. a log file that cannot be opened) still terminates the
        # children that were already started.
        for local_rank in range(args.nproc_per_node):
            # each process's rank
            dist_rank = args.nproc_per_node * args.node_rank + local_rank
            current_env["RANK"] = str(dist_rank)
            current_env["LOCAL_RANK"] = str(local_rank)

            print(cmd)

            if local_rank == 0:
                # Local rank 0 shares the launcher's console.
                stdout = None
            else:
                stdout = open("GPU_" + str(local_rank) + ".log", "w")
                log_files.append(stdout)

            # current_env is reused across iterations; each child receives the
            # values present at the moment it is spawned.
            process = subprocess.Popen(
                cmd, env=current_env, stdout=stdout, stderr=stdout
            )
            processes.append(process)

        up = True
        error = False
        while up and not error:
            up = False
            for p in processes:
                ret = p.poll()
                if ret is None:
                    up = True
                elif ret != 0:
                    error = True
            # Only wait when another polling round will actually happen.
            if up and not error:
                time.sleep(1)

        if error:
            # sys.exit rather than the site-provided exit(), which is meant for
            # the interactive shell and is missing under ``python -S``. The
            # handler below terminates the surviving children.
            sys.exit(1)

    except BaseException:
        # Covers KeyboardInterrupt and SystemExit as well as ordinary errors:
        # never leave workers running once the launcher is going down.
        for p in processes:
            if p.poll() is None:
                p.terminate()
        raise
    finally:
        # The children hold their own duplicates of the descriptors, so closing
        # the launcher's handles does not affect their output.
        for log_file in log_files:
            log_file.close()
|
|
|
|
|
|
# Script entry point: start the launcher only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()
|