[MRCNN] use native AMP, upgrade opencv-python
This commit is contained in:
parent
cc194960e6
commit
abe062867f
|
@ -19,7 +19,7 @@ FROM ${FROM_IMAGE_NAME}
|
||||||
RUN pip install --upgrade --no-cache-dir pip \
|
RUN pip install --upgrade --no-cache-dir pip \
|
||||||
&& pip install --no-cache-dir \
|
&& pip install --no-cache-dir \
|
||||||
mlperf-compliance==0.0.10 \
|
mlperf-compliance==0.0.10 \
|
||||||
opencv-python==3.4.1.15 \
|
opencv-python==4.2.0.32 \
|
||||||
git+https://github.com/NVIDIA/dllogger \
|
git+https://github.com/NVIDIA/dllogger \
|
||||||
yacs
|
yacs
|
||||||
|
|
||||||
|
|
|
@ -47,7 +47,7 @@ Mask R-CNN is a convolution based neural network for the task of object instance
|
||||||
The repository also contains scripts to interactively launch training, benchmarking and inference routines in a Docker container.
|
The repository also contains scripts to interactively launch training, benchmarking and inference routines in a Docker container.
|
||||||
|
|
||||||
The major differences between the official implementation of the paper and our version of Mask R-CNN are as follows:
|
The major differences between the official implementation of the paper and our version of Mask R-CNN are as follows:
|
||||||
- Mixed precision support with [PyTorch AMP](https://github.com/NVIDIA/apex).
|
- Mixed precision support with [PyTorch AMP](https://pytorch.org/docs/stable/amp.html).
|
||||||
- Gradient accumulation to simulate larger batches.
|
- Gradient accumulation to simulate larger batches.
|
||||||
- Custom fused CUDA kernels for faster computations.
|
- Custom fused CUDA kernels for faster computations.
|
||||||
|
|
||||||
|
@ -117,7 +117,8 @@ The following features are supported by this model.
|
||||||
|
|
||||||
| **Feature** | **Mask R-CNN** |
|
| **Feature** | **Mask R-CNN** |
|
||||||
|:---------:|:----------:|
|
|:---------:|:----------:|
|
||||||
|APEX AMP|Yes|
|
|APEX AMP|No|
|
||||||
|
|PyTorch AMP|Yes|
|
||||||
|APEX DDP|Yes|
|
|APEX DDP|Yes|
|
||||||
|
|
||||||
#### Features
|
#### Features
|
||||||
|
@ -150,22 +151,8 @@ For information about:
|
||||||
|
|
||||||
#### Enabling mixed precision
|
#### Enabling mixed precision
|
||||||
|
|
||||||
In this repository, mixed precision training is enabled by NVIDIA’s [APEX](https://github.com/NVIDIA/apex) library. The APEX library has an automatic mixed precision module that allows mixed precision to be enabled with minimal code changes.
|
In this repository, mixed precision training is enabled by using Pytorch's [AMP](https://pytorch.org/docs/stable/amp.html).
|
||||||
|
|
||||||
Automatic mixed precision can be enabled with the following code changes:
|
|
||||||
|
|
||||||
```
|
|
||||||
from apex import amp
|
|
||||||
if fp16:
|
|
||||||
# Wrap optimizer and model
|
|
||||||
model, optimizer = amp.initialize(model, optimizer, opt_level=<opt_level>, loss_scale="dynamic")
|
|
||||||
|
|
||||||
if fp16:
|
|
||||||
with amp.scale_loss(loss, optimizer) as scaled_loss:
|
|
||||||
scaled_loss.backward()
|
|
||||||
```
|
|
||||||
|
|
||||||
Where <opt_level> is the optimization level. In the MaskRCNN, "O1" is set as the optimization level. Mixed precision training can be turned on by passing in the argument fp16 to the pre-training and fine-tuning Python scripts. Shell scripts all have a positional argument available to enable mixed precision training.
|
|
||||||
|
|
||||||
#### Enabling TF32
|
#### Enabling TF32
|
||||||
|
|
||||||
|
@ -608,6 +595,10 @@ To achieve these same results, follow the steps in the [Quick Start Guide](#quic
|
||||||
|
|
||||||
### Changelog
|
### Changelog
|
||||||
|
|
||||||
|
October 2021
|
||||||
|
- Replace APEX AMP with PyTorch native AMP
|
||||||
|
- Use opencv-python version 4.2.0.32
|
||||||
|
|
||||||
June 2020
|
June 2020
|
||||||
- Updated accuracy and performance tables to include A100 results
|
- Updated accuracy and performance tables to include A100 results
|
||||||
|
|
||||||
|
|
|
@ -19,7 +19,7 @@ FROM ${FROM_IMAGE_NAME}
|
||||||
RUN pip install --upgrade --no-cache-dir pip \
|
RUN pip install --upgrade --no-cache-dir pip \
|
||||||
&& pip install --no-cache-dir \
|
&& pip install --no-cache-dir \
|
||||||
mlperf-compliance==0.0.10 \
|
mlperf-compliance==0.0.10 \
|
||||||
opencv-python==3.4.1.15 \
|
opencv-python==4.2.0.32 \
|
||||||
git+https://github.com/NVIDIA/dllogger \
|
git+https://github.com/NVIDIA/dllogger \
|
||||||
yacs
|
yacs
|
||||||
|
|
||||||
|
|
|
@ -318,8 +318,6 @@ _C.PATHS_CATALOG = os.path.join(os.path.dirname(__file__), "paths_catalog.py")
|
||||||
# Precision of input, allowable: (float32, float16)
|
# Precision of input, allowable: (float32, float16)
|
||||||
_C.DTYPE = "float32"
|
_C.DTYPE = "float32"
|
||||||
|
|
||||||
# Enable verbosity in apex.amp
|
|
||||||
_C.AMP_VERBOSE = False
|
|
||||||
|
|
||||||
# Evaluate every epoch
|
# Evaluate every epoch
|
||||||
_C.PER_EPOCH_EVAL = False
|
_C.PER_EPOCH_EVAL = False
|
||||||
|
|
|
@ -10,13 +10,6 @@ import torch.distributed as dist
|
||||||
from maskrcnn_benchmark.utils.comm import get_world_size
|
from maskrcnn_benchmark.utils.comm import get_world_size
|
||||||
from maskrcnn_benchmark.utils.metric_logger import MetricLogger
|
from maskrcnn_benchmark.utils.metric_logger import MetricLogger
|
||||||
|
|
||||||
try:
|
|
||||||
from apex import amp
|
|
||||||
use_amp = True
|
|
||||||
except ImportError:
|
|
||||||
print('Use APEX for multi-precision via apex.amp')
|
|
||||||
use_amp = False
|
|
||||||
|
|
||||||
def reduce_loss_dict(loss_dict):
|
def reduce_loss_dict(loss_dict):
|
||||||
"""
|
"""
|
||||||
Reduce the loss dictionary from all processes so that process with rank
|
Reduce the loss dictionary from all processes so that process with rank
|
||||||
|
@ -63,6 +56,8 @@ def do_train(
|
||||||
model.train()
|
model.train()
|
||||||
start_training_time = time.time()
|
start_training_time = time.time()
|
||||||
end = time.time()
|
end = time.time()
|
||||||
|
if use_amp:
|
||||||
|
scaler = torch.cuda.amp.GradScaler(init_scale=8192.0)
|
||||||
for iteration, (images, targets, _) in enumerate(data_loader, start_iter):
|
for iteration, (images, targets, _) in enumerate(data_loader, start_iter):
|
||||||
data_time = time.time() - end
|
data_time = time.time() - end
|
||||||
iteration = iteration + 1
|
iteration = iteration + 1
|
||||||
|
@ -71,7 +66,11 @@ def do_train(
|
||||||
images = images.to(device)
|
images = images.to(device)
|
||||||
targets = [target.to(device) for target in targets]
|
targets = [target.to(device) for target in targets]
|
||||||
|
|
||||||
loss_dict = model(images, targets)
|
if use_amp:
|
||||||
|
with torch.cuda.amp.autocast():
|
||||||
|
loss_dict = model(images, targets)
|
||||||
|
else:
|
||||||
|
loss_dict = model(images, targets)
|
||||||
|
|
||||||
losses = sum(loss for loss in loss_dict.values())
|
losses = sum(loss for loss in loss_dict.values())
|
||||||
|
|
||||||
|
@ -84,23 +83,27 @@ def do_train(
|
||||||
# Note: If mixed precision is not used, this ends up doing nothing
|
# Note: If mixed precision is not used, this ends up doing nothing
|
||||||
# Otherwise apply loss scaling for mixed-precision recipe
|
# Otherwise apply loss scaling for mixed-precision recipe
|
||||||
if use_amp:
|
if use_amp:
|
||||||
with amp.scale_loss(losses, optimizer) as scaled_losses:
|
scaler.scale(losses).backward()
|
||||||
scaled_losses.backward()
|
|
||||||
else:
|
else:
|
||||||
losses.backward()
|
losses.backward()
|
||||||
|
|
||||||
if not cfg.SOLVER.ACCUMULATE_GRAD:
|
def _take_step():
|
||||||
optimizer.step()
|
if use_amp:
|
||||||
|
scaler.step(optimizer)
|
||||||
|
scaler.update()
|
||||||
|
else:
|
||||||
|
optimizer.step()
|
||||||
scheduler.step()
|
scheduler.step()
|
||||||
optimizer.zero_grad()
|
optimizer.zero_grad()
|
||||||
|
|
||||||
|
if not cfg.SOLVER.ACCUMULATE_GRAD:
|
||||||
|
_take_step()
|
||||||
else:
|
else:
|
||||||
if (iteration + 1) % cfg.SOLVER.ACCUMULATE_STEPS == 0:
|
if (iteration + 1) % cfg.SOLVER.ACCUMULATE_STEPS == 0:
|
||||||
for param in model.parameters():
|
for param in model.parameters():
|
||||||
if param.grad is not None:
|
if param.grad is not None:
|
||||||
param.grad.data.div_(cfg.SOLVER.ACCUMULATE_STEPS)
|
param.grad.data.div_(cfg.SOLVER.ACCUMULATE_STEPS)
|
||||||
optimizer.step()
|
_take_step()
|
||||||
scheduler.step()
|
|
||||||
optimizer.zero_grad()
|
|
||||||
|
|
||||||
batch_time = time.time() - end
|
batch_time = time.time() - end
|
||||||
end = time.time()
|
end = time.time()
|
||||||
|
|
|
@ -3,10 +3,10 @@
|
||||||
# from ._utils import _C
|
# from ._utils import _C
|
||||||
from maskrcnn_benchmark import _C
|
from maskrcnn_benchmark import _C
|
||||||
|
|
||||||
from apex import amp
|
from torch.cuda.amp import custom_fwd
|
||||||
|
|
||||||
# Only valid with fp32 inputs - give AMP the hint
|
# Only valid with fp32 inputs - give AMP the hint
|
||||||
nms = amp.float_function(_C.nms)
|
nms = custom_fwd(_C.nms)
|
||||||
|
|
||||||
# nms.__doc__ = """
|
# nms.__doc__ = """
|
||||||
# This function performs Non-maximum suppresion"""
|
# This function performs Non-maximum suppresion"""
|
||||||
|
|
|
@ -8,8 +8,6 @@ from torch.nn.modules.utils import _pair
|
||||||
|
|
||||||
from maskrcnn_benchmark import _C
|
from maskrcnn_benchmark import _C
|
||||||
|
|
||||||
from apex import amp
|
|
||||||
|
|
||||||
class _ROIAlign(Function):
|
class _ROIAlign(Function):
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def forward(ctx, input, roi, output_size, spatial_scale, sampling_ratio):
|
def forward(ctx, input, roi, output_size, spatial_scale, sampling_ratio):
|
||||||
|
@ -55,7 +53,7 @@ class ROIAlign(nn.Module):
|
||||||
self.spatial_scale = spatial_scale
|
self.spatial_scale = spatial_scale
|
||||||
self.sampling_ratio = sampling_ratio
|
self.sampling_ratio = sampling_ratio
|
||||||
|
|
||||||
@amp.float_function
|
@torch.cuda.amp.custom_fwd(cast_inputs=torch.float32)
|
||||||
def forward(self, input, rois):
|
def forward(self, input, rois):
|
||||||
return roi_align(
|
return roi_align(
|
||||||
input, rois, self.output_size, self.spatial_scale, self.sampling_ratio
|
input, rois, self.output_size, self.spatial_scale, self.sampling_ratio
|
||||||
|
|
|
@ -7,8 +7,6 @@ from torch.nn.modules.utils import _pair
|
||||||
|
|
||||||
from maskrcnn_benchmark import _C
|
from maskrcnn_benchmark import _C
|
||||||
|
|
||||||
from apex import amp
|
|
||||||
|
|
||||||
class _ROIPool(Function):
|
class _ROIPool(Function):
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def forward(ctx, input, roi, output_size, spatial_scale):
|
def forward(ctx, input, roi, output_size, spatial_scale):
|
||||||
|
@ -53,7 +51,7 @@ class ROIPool(nn.Module):
|
||||||
self.output_size = output_size
|
self.output_size = output_size
|
||||||
self.spatial_scale = spatial_scale
|
self.spatial_scale = spatial_scale
|
||||||
|
|
||||||
@amp.float_function
|
@torch.cuda.amp.custom_fwd(cast_inputs=torch.float32)
|
||||||
def forward(self, input, rois):
|
def forward(self, input, rois):
|
||||||
return roi_pool(input, rois, self.output_size, self.spatial_scale)
|
return roi_pool(input, rois, self.output_size, self.spatial_scale)
|
||||||
|
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
mlperf-compliance==0.0.10
|
mlperf-compliance==0.0.10
|
||||||
opencv-python==3.4.1.15
|
opencv-python==4.2.0.32
|
||||||
yacs
|
yacs
|
||||||
git+https://github.com/NVIDIA/cocoapi.git@nvidia/master#egg=cocoapi&subdirectory=PythonAPI
|
git+https://github.com/NVIDIA/cocoapi.git@nvidia/master#egg=cocoapi&subdirectory=PythonAPI
|
||||||
|
|
|
@ -20,13 +20,6 @@ from maskrcnn_benchmark.utils.miscellaneous import mkdir
|
||||||
from maskrcnn_benchmark.utils.logger import format_step
|
from maskrcnn_benchmark.utils.logger import format_step
|
||||||
import dllogger
|
import dllogger
|
||||||
|
|
||||||
# Check if we can enable mixed-precision via apex.amp
|
|
||||||
try:
|
|
||||||
from apex import amp
|
|
||||||
except ImportError:
|
|
||||||
raise ImportError('Use APEX for mixed precision via apex.amp')
|
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
parser = argparse.ArgumentParser(description="PyTorch Object Detection Inference")
|
parser = argparse.ArgumentParser(description="PyTorch Object Detection Inference")
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
|
@ -100,9 +93,7 @@ def main():
|
||||||
if args.fp16:
|
if args.fp16:
|
||||||
use_mixed_precision = True
|
use_mixed_precision = True
|
||||||
else:
|
else:
|
||||||
use_mixed_precision = cfg.DTYPE == "float16"
|
use_mixed_precision = cfg.DTYPE == "float16"
|
||||||
amp_opt_level = 'O1' if use_mixed_precision else 'O0'
|
|
||||||
model = amp.initialize(model, opt_level=amp_opt_level)
|
|
||||||
|
|
||||||
output_dir = cfg.OUTPUT_DIR
|
output_dir = cfg.OUTPUT_DIR
|
||||||
checkpointer = DetectronCheckpointer(cfg, model, save_dir=output_dir)
|
checkpointer = DetectronCheckpointer(cfg, model, save_dir=output_dir)
|
||||||
|
@ -122,19 +113,35 @@ def main():
|
||||||
|
|
||||||
results = []
|
results = []
|
||||||
for output_folder, dataset_name, data_loader_val in zip(output_folders, dataset_names, data_loaders_val):
|
for output_folder, dataset_name, data_loader_val in zip(output_folders, dataset_names, data_loaders_val):
|
||||||
result = inference(
|
if use_mixed_precision:
|
||||||
model,
|
with torch.cuda.amp.autocast():
|
||||||
data_loader_val,
|
result = inference(
|
||||||
dataset_name=dataset_name,
|
model,
|
||||||
iou_types=iou_types,
|
data_loader_val,
|
||||||
box_only=cfg.MODEL.RPN_ONLY,
|
dataset_name=dataset_name,
|
||||||
device=cfg.MODEL.DEVICE,
|
iou_types=iou_types,
|
||||||
expected_results=cfg.TEST.EXPECTED_RESULTS,
|
box_only=cfg.MODEL.RPN_ONLY,
|
||||||
expected_results_sigma_tol=cfg.TEST.EXPECTED_RESULTS_SIGMA_TOL,
|
device=cfg.MODEL.DEVICE,
|
||||||
output_folder=output_folder,
|
expected_results=cfg.TEST.EXPECTED_RESULTS,
|
||||||
skip_eval=args.skip_eval,
|
expected_results_sigma_tol=cfg.TEST.EXPECTED_RESULTS_SIGMA_TOL,
|
||||||
dllogger=dllogger,
|
output_folder=output_folder,
|
||||||
)
|
skip_eval=args.skip_eval,
|
||||||
|
dllogger=dllogger,
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
result = inference(
|
||||||
|
model,
|
||||||
|
data_loader_val,
|
||||||
|
dataset_name=dataset_name,
|
||||||
|
iou_types=iou_types,
|
||||||
|
box_only=cfg.MODEL.RPN_ONLY,
|
||||||
|
device=cfg.MODEL.DEVICE,
|
||||||
|
expected_results=cfg.TEST.EXPECTED_RESULTS,
|
||||||
|
expected_results_sigma_tol=cfg.TEST.EXPECTED_RESULTS_SIGMA_TOL,
|
||||||
|
output_folder=output_folder,
|
||||||
|
skip_eval=args.skip_eval,
|
||||||
|
dllogger=dllogger,
|
||||||
|
)
|
||||||
synchronize()
|
synchronize()
|
||||||
results.append(result)
|
results.append(result)
|
||||||
|
|
||||||
|
|
|
@ -34,13 +34,6 @@ import dllogger
|
||||||
from maskrcnn_benchmark.utils.logger import format_step
|
from maskrcnn_benchmark.utils.logger import format_step
|
||||||
|
|
||||||
# See if we can use apex.DistributedDataParallel instead of the torch default,
|
# See if we can use apex.DistributedDataParallel instead of the torch default,
|
||||||
# and enable mixed-precision via apex.amp
|
|
||||||
try:
|
|
||||||
from apex import amp
|
|
||||||
use_amp = True
|
|
||||||
except ImportError:
|
|
||||||
print('Use APEX for multi-precision via apex.amp')
|
|
||||||
use_amp = False
|
|
||||||
try:
|
try:
|
||||||
from apex.parallel import DistributedDataParallel as DDP
|
from apex.parallel import DistributedDataParallel as DDP
|
||||||
use_apex_ddp = True
|
use_apex_ddp = True
|
||||||
|
@ -98,15 +91,11 @@ def train(cfg, local_rank, distributed, fp16, dllogger):
|
||||||
optimizer = make_optimizer(cfg, model)
|
optimizer = make_optimizer(cfg, model)
|
||||||
scheduler = make_lr_scheduler(cfg, optimizer)
|
scheduler = make_lr_scheduler(cfg, optimizer)
|
||||||
|
|
||||||
if use_amp:
|
use_amp = False
|
||||||
# Initialize mixed-precision training
|
if fp16:
|
||||||
if fp16:
|
use_amp = True
|
||||||
use_mixed_precision = True
|
else:
|
||||||
else:
|
use_amp = cfg.DTYPE == "float16"
|
||||||
use_mixed_precision = cfg.DTYPE == "float16"
|
|
||||||
|
|
||||||
amp_opt_level = 'O1' if use_mixed_precision else 'O0'
|
|
||||||
model, optimizer = amp.initialize(model, optimizer, opt_level=amp_opt_level)
|
|
||||||
|
|
||||||
if distributed:
|
if distributed:
|
||||||
if use_apex_ddp:
|
if use_apex_ddp:
|
||||||
|
|
Loading…
Reference in a new issue