# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import json
import math
import os

import torch
import torch.distributed as dist

from .iterator import DaliIterator, SyntheticDataIterator
from .pipeline import make_dali_asr_pipeline
from common.helpers import print_once


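# NOTE: each manifest passed to _parse_json below is assumed to be a JSON list
# of sample descriptors; only the fields accessed in this file are relied upon.
# A hypothetical entry looks roughly like:
#
#   {
#       "transcript": "a quick brown fox ...",
#       "original_duration": 5.86,
#       "files": [..., {"fname": "train-clean-100/19/198/19-198-0001.wav"}]
#   }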
def _parse_json(json_path: str, start_label=0, predicate=lambda json: True):
    """
    Parses a JSON manifest into the format required by DALI.

    Args:
        json_path: path to the JSON manifest file
        start_label: the label from which DALI starts assigning consecutive
            int labels to the transcripts
        predicate: function that accepts a sample descriptor (i.e. a JSON
            dictionary) as an argument. If the predicate returns True for a
            given sample, the sample is included in the dataset.

    Returns:
        output_files: dict that maps file name to label assigned by DALI
        transcripts: dict that maps label assigned by DALI to the transcript
    """
    with open(json_path) as f:
        librispeech_json = json.load(f)
    output_files = {}
    transcripts = {}
    curr_label = start_label
    for original_sample in librispeech_json:
        if not predicate(original_sample):
            continue
        transcripts[curr_label] = original_sample['transcript']
        output_files[original_sample['files'][-1]['fname']] = curr_label
        curr_label += 1
    return output_files, transcripts


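# NOTE: _dict_to_file below serializes the {fname: label} mapping produced by
# _parse_json into the plain-text file list consumed by the DALI file reader,
# one "<path> <label>" pair per line, e.g. (hypothetical paths):
#
#   train-clean-100/19/198/19-198-0001.wav 0
#   train-clean-100/19/198/19-198-0002.wav 1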
def _dict_to_file(d: dict, filename: str):
    with open(filename, "w") as f:
        for key, value in d.items():
            f.write("{} {}\n".format(key, value))


class DaliDataLoader:
    """
    DataLoader is the main entry point to the data preprocessing pipeline.
    To use it, create an object and then just iterate over `data_iterator()`.
    DataLoader will do the rest for you.

    Example:
        data_layer = DaliDataLoader(gpu_id, dataset_path, config_data,
                                    config_features, json_names, symbols,
                                    batch_size, pipeline_type)
        data_it = data_layer.data_iterator()
        for data in data_it:
            print(data)  # Here's your preprocessed data

    Args:
        device_type: Which device to use for preprocessing. Choose: "cpu", "gpu"
        pipeline_type: Choose: "train", "val", "synth"
    """

    def __init__(self, gpu_id, dataset_path: str, config_data: dict,
                 config_features: dict, json_names: list, symbols: list,
                 batch_size: int, pipeline_type: str,
                 grad_accumulation_steps: int = 1,
                 synth_iters_per_epoch: int = 544, device_type: str = "gpu"):

        self.batch_size = batch_size
        self.grad_accumulation_steps = grad_accumulation_steps
        self.drop_last = (pipeline_type == 'train')
        self.device_type = device_type
        pipeline_type = self._parse_pipeline_type(pipeline_type)
        if pipeline_type == "synth":
            self._dali_data_iterator = self._init_synth_iterator(
                self.batch_size,
                config_features['nfilt'],
                iters_per_epoch=synth_iters_per_epoch,
                ngpus=torch.distributed.get_world_size())
        else:
            self._dali_data_iterator = self._init_iterator(
                gpu_id=gpu_id,
                dataset_path=dataset_path,
                config_data=config_data,
                config_features=config_features,
                json_names=json_names,
                symbols=symbols,
                train_pipeline=pipeline_type == "train")

    def _init_iterator(self, gpu_id, dataset_path, config_data,
                       config_features, json_names: list, symbols: list,
                       train_pipeline: bool):
        """Returns an iterator over data preprocessed with Dali."""

        def hash_list_of_strings(li):
            return str(abs(hash(''.join(li))))

        output_files, transcripts = {}, {}
        max_duration = config_data['max_duration']
        for jname in json_names:
            of, tr = _parse_json(
                jname if jname[0] == '/' else os.path.join(dataset_path, jname),
                len(output_files),
                predicate=lambda json: json['original_duration'] <= max_duration)
            output_files.update(of)
            transcripts.update(tr)
        file_list_path = os.path.join(
            "/tmp", "asr_dali.file_list." + hash_list_of_strings(json_names))
        _dict_to_file(output_files, file_list_path)
        self.dataset_size = len(output_files)
        print_once('Dataset read by DALI. '
                   f'Number of samples: {self.dataset_size}')

        pipeline = make_dali_asr_pipeline(
            config_data=config_data,
            config_features=config_features,
            device_id=gpu_id,
            file_root=dataset_path,
            file_list=file_list_path,
            device_type=self.device_type,
            batch_size=self.batch_size,
            train_pipeline=train_pipeline)

        return DaliIterator([pipeline], transcripts=transcripts,
                            symbols=symbols, batch_size=self.batch_size,
                            reader_name="file_reader",
                            train_iterator=train_pipeline)

    def _init_synth_iterator(self, batch_size, nfeatures, iters_per_epoch,
                             ngpus):
        self.dataset_size = ngpus * iters_per_epoch * batch_size
        return SyntheticDataIterator(batch_size, nfeatures, regenerate=True)

    @staticmethod
    def _parse_pipeline_type(pipeline_type):
        pipe = pipeline_type.lower()
        assert pipe in ("train", "val", "synth"), \
            'Invalid pipeline type (choices: "train", "val", "synth").'
        return pipe

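    # Sharding arithmetic used by _shard_size() and __len__() below, with
    # hypothetical numbers: for dataset_size=10000, world_size=4, batch_size=32
    # and grad_accumulation_steps=2, the drop_last divisor is 4 * 32 * 2 = 256,
    # so each GPU handles 10000 // 256 * 256 // 4 = 2496 samples per epoch and
    # __len__() reports 2496 / 32 = 78 batches.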
    def _shard_size(self):
        """
        Total number of samples handled by a single GPU in a single epoch.
        """
        world_size = dist.get_world_size() if dist.is_initialized() else 1
        if self.drop_last:
            divisor = world_size * self.batch_size * self.grad_accumulation_steps
            return self.dataset_size // divisor * divisor // world_size
        else:
            return int(math.ceil(self.dataset_size / world_size))

    def __len__(self):
        """
        Number of batches handled by each GPU.
        """
        if self.drop_last:
            assert self._shard_size() % self.batch_size == 0, \
                f'{self._shard_size()} {self.batch_size}'

        return int(math.ceil(self._shard_size() / self.batch_size))

    def data_iterator(self):
        return self._dali_data_iterator

    def __iter__(self):
        return self._dali_data_iterator
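
# Example usage (a sketch; the dataset path, manifest name and batch size are
# hypothetical, and config_data / config_features / symbols would normally come
# from the training script's configuration):
#
#   loader = DaliDataLoader(gpu_id=local_rank,
#                           dataset_path="/datasets/LibriSpeech",
#                           config_data=config_data,
#                           config_features=config_features,
#                           json_names=["librispeech-train-clean-100-wav.json"],
#                           symbols=symbols,
#                           batch_size=32,
#                           pipeline_type="train")
#   for data in loader:
#       ...  # preprocessed batch, ready for the model
#
# One loader is created per rank (gpu_id is the local rank); the "synth"
# pipeline additionally requires torch.distributed to be initialized, since it
# queries the world size.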