# DeepLearningExamples/PyTorch/SpeechRecognition/QuartzNet/utils/download_librispeech.py

# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

#!/usr/bin/env python

import os
import argparse
import pandas as pd

from download_utils import download_file, md5_checksum, extract

# Command-line interface: two positional paths plus switches that let each of
# the three stages (download, checksum, extract) be skipped independently.
parser = argparse.ArgumentParser(description='Download, verify and extract dataset files')
parser.add_argument('csv', type=str,
                    help='CSV file with urls and checksums to download.')
parser.add_argument('dest', type=str,
                    help='Download destination folder.')
# metavar only affects the usage/help text; the value is still stored as args.e.
parser.add_argument('-e', type=str, default=None, metavar='EXTRACT_DEST',
                    help='Extraction destination folder. Defaults to download folder if not provided')
parser.add_argument('--skip_download', action='store_true',
                    help='Skip downloading the files')
parser.add_argument('--skip_checksum', action='store_true',
                    help='Skip checksum')
parser.add_argument('--skip_extract', action='store_true',
                    help='Skip extracting files')
args = parser.parse_args()
# An omitted (or empty) -e falls back to the download folder.
args.e = args.e or args.dest


# Load the comma-separated manifest named on the command line; the stages
# below read its `url` and `md5` columns.
df = pd.read_csv(args.csv, sep=',')


# Stage 1: pass every entry of the manifest's `url` column to download_file,
# unless the user opted out with --skip_download.
if args.skip_download:
    print("Skipping file download")
else:
    for link in df.url:
        # The last path component of the URL is used as the file name.
        archive_name = link.rsplit('/', 1)[-1]
        print("Downloading %s:" % archive_name)
        download_file(url=link, dest_folder=args.dest, fname=archive_name)


# Stage 2: check each file in the download folder against the MD5 recorded in
# the manifest, unless the user opted out with --skip_checksum.
if not args.skip_checksum:
    failed = []
    for url, md5 in zip(df['url'], df['md5']):
        fname = url.split('/')[-1]
        fpath = os.path.join(args.dest, fname)
        # Flush so the file name is visible while the check is still running;
        # without it the partial line may sit in the stdout buffer until the
        # result is printed.
        print("Verifying %s: " % fname, end='', flush=True)
        ok = md5_checksum(fpath=fpath, target_hash=md5)
        print("Passed" if ok else "Failed")
        if not ok:
            failed.append(fname)
    # Report every file first, then stop with a non-zero exit status so that
    # files which failed verification are never handed to the extract stage.
    # Use --skip_checksum to bypass verification deliberately.
    if failed:
        raise SystemExit("Checksum verification failed for: %s" % ', '.join(failed))
else:
    print("Skipping checksum")


# Stage 3: hand every file from the download folder to extract, targeting the
# extraction folder, unless the user opted out with --skip_extract.
if args.skip_extract:
    print("Skipping file extraction")
else:
    for link in df.url:
        # Same naming rule as the download stage: last URL path component.
        archive_path = os.path.join(args.dest, link.rsplit('/', 1)[-1])
        print("Decompressing %s:" % archive_path)
        extract(fpath=archive_path, dest_folder=args.e)