[MRCNN] use native AMP, upgrade opencv-python

2021-10-21 13:21:50 -07:00 · 2021-10-21 13:21:50 -07:00 · abe062867f
parent cc194960e6
commit abe062867f
11 changed files with 69 additions and 85 deletions
--- a/PyTorch/Segmentation/MaskRCNN/Dockerfile
+++ b/PyTorch/Segmentation/MaskRCNN/Dockerfile
@ -19,7 +19,7 @@ FROM ${FROM_IMAGE_NAME}
 RUN pip install --upgrade --no-cache-dir pip \
 && pip install --no-cache-dir \
      mlperf-compliance==0.0.10 \
-      opencv-python==3.4.1.15 \
+      opencv-python==4.2.0.32 \
      git+https://github.com/NVIDIA/dllogger \
      yacs
--- a/PyTorch/Segmentation/MaskRCNN/README.md
+++ b/PyTorch/Segmentation/MaskRCNN/README.md
@ -47,7 +47,7 @@ Mask R-CNN is a convolution based neural network for the task of object instance
 The repository also contains scripts to interactively launch training, benchmarking and inference routines in a Docker container.
 The major differences between the official implementation of the paper and our version of Mask R-CNN are as follows:
-  - Mixed precision support with [PyTorch AMP](https://github.com/NVIDIA/apex).
+  - Mixed precision support with [PyTorch AMP](https://pytorch.org/docs/stable/amp.html).
  - Gradient accumulation to simulate larger batches.
  - Custom fused CUDA kernels for faster computations.
@ -117,7 +117,8 @@ The following features are supported by this model.
 | **Feature** | **Mask R-CNN** |
 |:---------:|:----------:|
-|APEX AMP|Yes|
+|APEX AMP|No|
 |PyTorch AMP|Yes|
 |APEX DDP|Yes|
 #### Features
@ -150,22 +151,8 @@ For information about:
 #### Enabling mixed precision
-In this repository, mixed precision training is enabled by NVIDIA’s [APEX](https://github.com/NVIDIA/apex) library. The APEX library has an automatic mixed precision module that allows mixed precision to be enabled with minimal code changes.
+In this repository, mixed precision training is enabled by using Pytorch's [AMP](https://pytorch.org/docs/stable/amp.html).
- 
+
 Automatic mixed precision can be enabled with the following code changes: 
 ```
 from apex import amp
 if fp16:
    # Wrap optimizer and model
    model, optimizer = amp.initialize(model, optimizer, opt_level=<opt_level>, loss_scale="dynamic")
 if fp16:
    with amp.scale_loss(loss, optimizer) as scaled_loss:
        scaled_loss.backward()
   ```
 Where <opt_level> is the optimization level. In the MaskRCNN, "O1" is set as the optimization level. Mixed precision training can be turned on by passing in the argument fp16 to the pre-training and fine-tuning Python scripts. Shell scripts all have a positional argument available to enable mixed precision training.
 #### Enabling TF32
@ -608,6 +595,10 @@ To achieve these same results, follow the steps in the [Quick Start Guide](#quic
 ### Changelog
 October 2021
 - Replace APEX AMP with PyTorch native AMP
 - Use opencv-python version 4.2.0.32
 June 2020
 - Updated accuracy and performance tables to include A100 results
--- a/PyTorch/Segmentation/MaskRCNN/pytorch/Dockerfile
+++ b/PyTorch/Segmentation/MaskRCNN/pytorch/Dockerfile
@ -19,7 +19,7 @@ FROM ${FROM_IMAGE_NAME}
 RUN pip install --upgrade --no-cache-dir pip \
 && pip install --no-cache-dir \
      mlperf-compliance==0.0.10 \
-      opencv-python==3.4.1.15 \
+      opencv-python==4.2.0.32 \
      git+https://github.com/NVIDIA/dllogger \
      yacs
--- a/PyTorch/Segmentation/MaskRCNN/pytorch/maskrcnn_benchmark/config/defaults.py
+++ b/PyTorch/Segmentation/MaskRCNN/pytorch/maskrcnn_benchmark/config/defaults.py
@ -318,8 +318,6 @@ _C.PATHS_CATALOG = os.path.join(os.path.dirname(__file__), "paths_catalog.py")
 # Precision of input, allowable: (float32, float16)
 _C.DTYPE = "float32"
 # Enable verbosity in apex.amp
 _C.AMP_VERBOSE = False
 # Evaluate every epoch
 _C.PER_EPOCH_EVAL = False
--- a/PyTorch/Segmentation/MaskRCNN/pytorch/maskrcnn_benchmark/engine/trainer.py
+++ b/PyTorch/Segmentation/MaskRCNN/pytorch/maskrcnn_benchmark/engine/trainer.py
@ -10,13 +10,6 @@ import torch.distributed as dist
 from maskrcnn_benchmark.utils.comm import get_world_size
 from maskrcnn_benchmark.utils.metric_logger import MetricLogger
 try:
    from apex import amp
    use_amp = True
 except ImportError:
    print('Use APEX for multi-precision via apex.amp')
    use_amp = False
 def reduce_loss_dict(loss_dict):
    """
    Reduce the loss dictionary from all processes so that process with rank
@ -63,6 +56,8 @@ def do_train(
    model.train()
    start_training_time = time.time()
    end = time.time()
    if use_amp:
        scaler = torch.cuda.amp.GradScaler(init_scale=8192.0)
    for iteration, (images, targets, _) in enumerate(data_loader, start_iter):
        data_time = time.time() - end
        iteration = iteration + 1
@ -71,7 +66,11 @@ def do_train(
        images = images.to(device)
        targets = [target.to(device) for target in targets]
-        loss_dict = model(images, targets)
+        if use_amp:
            with torch.cuda.amp.autocast():
                loss_dict = model(images, targets)
        else:
            loss_dict = model(images, targets)
        losses = sum(loss for loss in loss_dict.values())
@ -84,23 +83,27 @@ def do_train(
        # Note: If mixed precision is not used, this ends up doing nothing
        # Otherwise apply loss scaling for mixed-precision recipe
        if use_amp:        
-            with amp.scale_loss(losses, optimizer) as scaled_losses:
+            scaler.scale(losses).backward()
                scaled_losses.backward()
        else:
            losses.backward()
-        if not cfg.SOLVER.ACCUMULATE_GRAD:
+        def _take_step():
-            optimizer.step()
+            if use_amp:
                scaler.step(optimizer)
                scaler.update()
            else:
                optimizer.step()
            scheduler.step()
            optimizer.zero_grad()
        if not cfg.SOLVER.ACCUMULATE_GRAD:
            _take_step()
        else:
            if (iteration + 1) % cfg.SOLVER.ACCUMULATE_STEPS == 0:
                for param in model.parameters():
                    if param.grad is not None:
                        param.grad.data.div_(cfg.SOLVER.ACCUMULATE_STEPS)
-                optimizer.step()
+                _take_step()
                scheduler.step()
                optimizer.zero_grad()
        batch_time = time.time() - end
        end = time.time()
--- a/PyTorch/Segmentation/MaskRCNN/pytorch/maskrcnn_benchmark/layers/nms.py
+++ b/PyTorch/Segmentation/MaskRCNN/pytorch/maskrcnn_benchmark/layers/nms.py
@ -3,10 +3,10 @@
 # from ._utils import _C
 from maskrcnn_benchmark import _C
-from apex import amp
+from torch.cuda.amp import custom_fwd
 # Only valid with fp32 inputs - give AMP the hint
-nms = amp.float_function(_C.nms)
+nms = custom_fwd(_C.nms)
 # nms.__doc__ = """
 # This function performs Non-maximum suppresion"""
--- a/PyTorch/Segmentation/MaskRCNN/pytorch/maskrcnn_benchmark/layers/roi_align.py
+++ b/PyTorch/Segmentation/MaskRCNN/pytorch/maskrcnn_benchmark/layers/roi_align.py
@ -8,8 +8,6 @@ from torch.nn.modules.utils import _pair
 from maskrcnn_benchmark import _C
 from apex import amp
 class _ROIAlign(Function):
    @staticmethod
    def forward(ctx, input, roi, output_size, spatial_scale, sampling_ratio):
@ -55,7 +53,7 @@ class ROIAlign(nn.Module):
        self.spatial_scale = spatial_scale
        self.sampling_ratio = sampling_ratio
-    @amp.float_function
+    @torch.cuda.amp.custom_fwd(cast_inputs=torch.float32)
    def forward(self, input, rois):
        return roi_align(
            input, rois, self.output_size, self.spatial_scale, self.sampling_ratio
--- a/PyTorch/Segmentation/MaskRCNN/pytorch/maskrcnn_benchmark/layers/roi_pool.py
+++ b/PyTorch/Segmentation/MaskRCNN/pytorch/maskrcnn_benchmark/layers/roi_pool.py
@ -7,8 +7,6 @@ from torch.nn.modules.utils import _pair
 from maskrcnn_benchmark import _C
 from apex import amp
 class _ROIPool(Function):
    @staticmethod
    def forward(ctx, input, roi, output_size, spatial_scale):
@ -53,7 +51,7 @@ class ROIPool(nn.Module):
        self.output_size = output_size
        self.spatial_scale = spatial_scale
-    @amp.float_function
+    @torch.cuda.amp.custom_fwd(cast_inputs=torch.float32)
    def forward(self, input, rois):
        return roi_pool(input, rois, self.output_size, self.spatial_scale)
--- a/PyTorch/Segmentation/MaskRCNN/pytorch/requirements.txt
+++ b/PyTorch/Segmentation/MaskRCNN/pytorch/requirements.txt
@ -1,4 +1,4 @@
 mlperf-compliance==0.0.10
-opencv-python==3.4.1.15
+opencv-python==4.2.0.32
 yacs
 git+https://github.com/NVIDIA/cocoapi.git@nvidia/master#egg=cocoapi&subdirectory=PythonAPI
--- a/PyTorch/Segmentation/MaskRCNN/pytorch/tools/test_net.py
+++ b/PyTorch/Segmentation/MaskRCNN/pytorch/tools/test_net.py
@ -20,13 +20,6 @@ from maskrcnn_benchmark.utils.miscellaneous import mkdir
 from maskrcnn_benchmark.utils.logger import format_step
 import dllogger
 # Check if we can enable mixed-precision via apex.amp
 try:
    from apex import amp
 except ImportError:
    raise ImportError('Use APEX for mixed precision via apex.amp')
 def main():
    parser = argparse.ArgumentParser(description="PyTorch Object Detection Inference")
    parser.add_argument(
@ -100,9 +93,7 @@ def main():
    if args.fp16:
        use_mixed_precision = True
    else:
-        use_mixed_precision = cfg.DTYPE == "float16"
+        use_mixed_precision = cfg.DTYPE == "float16" 
    amp_opt_level = 'O1' if use_mixed_precision else 'O0'
    model = amp.initialize(model, opt_level=amp_opt_level)    
    output_dir = cfg.OUTPUT_DIR
    checkpointer = DetectronCheckpointer(cfg, model, save_dir=output_dir)
@ -122,19 +113,35 @@ def main():
    results = []
    for output_folder, dataset_name, data_loader_val in zip(output_folders, dataset_names, data_loaders_val):
-        result = inference(
+        if use_mixed_precision:
-            model,
+            with torch.cuda.amp.autocast():
-            data_loader_val,
+                result = inference(
-            dataset_name=dataset_name,
+                    model,
-            iou_types=iou_types,
+                    data_loader_val,
-            box_only=cfg.MODEL.RPN_ONLY,
+                    dataset_name=dataset_name,
-            device=cfg.MODEL.DEVICE,
+                    iou_types=iou_types,
-            expected_results=cfg.TEST.EXPECTED_RESULTS,
+                    box_only=cfg.MODEL.RPN_ONLY,
-            expected_results_sigma_tol=cfg.TEST.EXPECTED_RESULTS_SIGMA_TOL,
+                    device=cfg.MODEL.DEVICE,
-            output_folder=output_folder,
+                    expected_results=cfg.TEST.EXPECTED_RESULTS,
-            skip_eval=args.skip_eval,
+                    expected_results_sigma_tol=cfg.TEST.EXPECTED_RESULTS_SIGMA_TOL,
-            dllogger=dllogger,
+                    output_folder=output_folder,
-        )
+                    skip_eval=args.skip_eval,
                    dllogger=dllogger,
                )
        else:
            result = inference(
                    model,
                    data_loader_val,
                    dataset_name=dataset_name,
                    iou_types=iou_types,
                    box_only=cfg.MODEL.RPN_ONLY,
                    device=cfg.MODEL.DEVICE,
                    expected_results=cfg.TEST.EXPECTED_RESULTS,
                    expected_results_sigma_tol=cfg.TEST.EXPECTED_RESULTS_SIGMA_TOL,
                    output_folder=output_folder,
                    skip_eval=args.skip_eval,
                    dllogger=dllogger,
                )
        synchronize()
        results.append(result)
--- a/PyTorch/Segmentation/MaskRCNN/pytorch/tools/train_net.py
+++ b/PyTorch/Segmentation/MaskRCNN/pytorch/tools/train_net.py
@ -34,13 +34,6 @@ import dllogger
 from maskrcnn_benchmark.utils.logger import format_step
 # See if we can use apex.DistributedDataParallel instead of the torch default,
 # and enable mixed-precision via apex.amp
 try:
    from apex import amp
    use_amp = True
 except ImportError:
    print('Use APEX for multi-precision via apex.amp')
    use_amp = False
 try:
    from apex.parallel import DistributedDataParallel as DDP
    use_apex_ddp = True
@ -98,15 +91,11 @@ def train(cfg, local_rank, distributed, fp16, dllogger):
    optimizer = make_optimizer(cfg, model)
    scheduler = make_lr_scheduler(cfg, optimizer)
-    if use_amp:
+    use_amp = False
-        # Initialize mixed-precision training
+    if fp16:
-        if fp16:
+        use_amp = True
-            use_mixed_precision = True
+    else:
-        else:
+        use_amp = cfg.DTYPE == "float16"
            use_mixed_precision = cfg.DTYPE == "float16"
        amp_opt_level = 'O1' if use_mixed_precision else 'O0'
        model, optimizer = amp.initialize(model, optimizer, opt_level=amp_opt_level)
    if distributed:
        if use_apex_ddp: