DeepLearningExamples/PyTorch/SpeechRecognition/QuartzNet/utils/download_librispeech.py
2021-09-14 06:03:36 -07:00

73 lines
2.5 KiB
Python

# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#!/usr/bin/env python
import os
import argparse
import pandas as pd
from download_utils import download_file, md5_checksum, extract
# Command-line interface: a CSV manifest of archives, the download folder,
# an optional extraction folder, and one skip switch per processing stage.
parser = argparse.ArgumentParser(
    description='Download, verify and extract dataset files',
)
parser.add_argument(
    'csv',
    type=str,
    help='CSV file with urls and checksums to download.',
)
parser.add_argument(
    'dest',
    type=str,
    help='Download destnation folder.',
)
parser.add_argument(
    '-e',
    type=str,
    default=None,
    help='Extraction destnation folder. Defaults to download folder if not provided',
)
# The three stage switches have the same shape, so register them in one pass
# (order is preserved so the generated --help output stays the same).
for _flag, _help in (
    ('--skip_download', 'Skip downloading the files'),
    ('--skip_checksum', 'Skip checksum'),
    ('--skip_extract', 'Skip extracting files'),
):
    parser.add_argument(_flag, action='store_true', help=_help)

args = parser.parse_args()
# Without an explicit -e, archives are extracted into the download folder.
if not args.e:
    args.e = args.dest

# Manifest consumed by the stages below via its `url` and `md5` columns.
df = pd.read_csv(args.csv, delimiter=',')
# Stage 1: fetch every archive listed in the manifest into the download folder.
if args.skip_download:
    print("Skipping file download")
else:
    for link in df.url:
        # The local file name is whatever follows the last '/' in the URL.
        archive_name = link.rsplit('/', 1)[-1]
        print("Downloading %s:" % archive_name)
        download_file(url=link, dest_folder=args.dest, fname=archive_name)
# Stage 2: check each downloaded archive against the hash listed in the manifest.
if not args.skip_checksum:
    for _, row in df.iterrows():
        url = row['url']
        md5 = row['md5']
        # Same file-name rule as the download stage: text after the last '/'.
        fname = url.split('/')[-1]
        fpath = os.path.join(args.dest, fname)
        # flush=True: the label has no trailing newline, so without an explicit
        # flush it can sit in the stdout buffer and only become visible after
        # the checksum call has already returned.
        print("Verifing %s: " % fname, end='', flush=True)
        ret = md5_checksum(fpath=fpath, target_hash=md5)
        # The outcome is only reported; a "Failed" result does not stop the
        # script, so the extraction stage still runs afterwards.
        print("Passed" if ret else "Failed")
else:
    print("Skipping checksum")
# Stage 3: unpack every archive from the download folder into the extraction folder.
if args.skip_extract:
    print("Skipping file extraction")
else:
    for link in df.url:
        # Archives are looked up under the download folder by the URL's last segment.
        archive_path = os.path.join(args.dest, link.rsplit('/', 1)[-1])
        print("Decompressing %s:" % archive_path)
        extract(fpath=archive_path, dest_folder=args.e)