[BERT/PyT][BERT/TF] Use mirror and wget (#833)

* Use mirror and wget

The mirror speeds up the download by 10x.
wget fixes the stall seen with urllib.

* add comment

* Update the WikiDownloader of BERT TF1 to use the mirror and wget

Co-authored-by: Swetha Mandava <sweth.mandava@gmail.com>
This commit is contained in:
Sharath TS 2021-02-17 14:49:39 -08:00 committed by GitHub
parent 0f5ff94854
commit 3459f97d58
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 17 additions and 11 deletions

View file

@ -16,6 +16,7 @@ import os
import urllib.request
import subprocess
import sys
import subprocess
class WikiDownloader:
def __init__(self, language, save_path):
@ -25,9 +26,10 @@ class WikiDownloader:
os.makedirs(self.save_path)
self.language = language
# Use a mirror from https://dumps.wikimedia.org/mirrors.html if the below links do not work
self.download_urls = {
'en' : 'https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2',
'zh' : 'https://dumps.wikimedia.org/zhwiki/latest/zhwiki-latest-pages-articles.xml.bz2'
'en' : 'https://dumps.wikimedia.your.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2',
'zh' : 'https://dumps.wikimedia.your.org/zhwiki/latest/zhwiki-latest-pages-articles.xml.bz2'
}
self.output_files = {
@ -45,13 +47,15 @@ class WikiDownloader:
if os.path.isfile(self.save_path + '/' + filename):
print('** Download file already exists, skipping download')
else:
response = urllib.request.urlopen(url)
with open(self.save_path + '/' + filename, "wb") as handle:
handle.write(response.read())
cmd = ['wget', url, '--output-document={}'.format(self.save_path + '/' + filename)]
print('Running:', cmd)
status = subprocess.run(cmd)
if status.returncode != 0:
raise RuntimeError('Wiki download not successful')
# Always unzipping since this is relatively fast and will overwrite
print('Unzipping:', self.output_files[self.language])
subprocess.run('bzip2 -dk ' + self.save_path + '/' + filename, shell=True, check=True)
else:
assert False, 'WikiDownloader not implemented for this language yet.'
assert False, 'WikiDownloader not implemented for this language yet.'

View file

@ -26,8 +26,8 @@ class WikiDownloader:
self.language = language
self.download_urls = {
'en' : 'https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2',
'zh' : 'https://dumps.wikimedia.org/zhwiki/latest/zhwiki-latest-pages-articles.xml.bz2'
'en' : 'https://dumps.wikimedia.your.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2',
'zh' : 'https://dumps.wikimedia.your.org/zhwiki/latest/zhwiki-latest-pages-articles.xml.bz2'
}
self.output_files = {
@ -45,9 +45,11 @@ class WikiDownloader:
if os.path.isfile(self.save_path + '/' + filename):
print('** Download file already exists, skipping download')
else:
response = urllib.request.urlopen(url)
with open(self.save_path + '/' + filename, "wb") as handle:
handle.write(response.read())
cmd = ['wget', url, '--output-document={}'.format(self.save_path + '/' + filename)]
print('Running:', cmd)
status = subprocess.run(cmd)
if status.returncode != 0:
raise RuntimeError('Wiki download not successful')
# Always unzipping since this is relatively fast and will overwrite
print('Unzipping:', self.output_files[self.language])