[BERT/PyT][BERT/TF] Use mirror and wget (#833)
* Use mirror and wget. The mirror speeds up the download by ~10x; wget fixes the stall seen with urllib. * Add comment. * Update the WikiDownloader of BERT TF1 to use the mirror and wget. Co-authored-by: Swetha Mandava <sweth.mandava@gmail.com>
This commit is contained in:
parent
0f5ff94854
commit
3459f97d58
|
@ -16,6 +16,7 @@ import os
|
|||
import urllib.request
|
||||
import subprocess
|
||||
import sys
|
||||
import subprocess
|
||||
|
||||
class WikiDownloader:
|
||||
def __init__(self, language, save_path):
|
||||
|
@ -25,9 +26,10 @@ class WikiDownloader:
|
|||
os.makedirs(self.save_path)
|
||||
|
||||
self.language = language
|
||||
# Use a mirror from https://dumps.wikimedia.org/mirrors.html if the below links do not work
|
||||
self.download_urls = {
|
||||
'en' : 'https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2',
|
||||
'zh' : 'https://dumps.wikimedia.org/zhwiki/latest/zhwiki-latest-pages-articles.xml.bz2'
|
||||
'en' : 'https://dumps.wikimedia.your.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2',
|
||||
'zh' : 'https://dumps.wikimedia.your.org/zhwiki/latest/zhwiki-latest-pages-articles.xml.bz2'
|
||||
}
|
||||
|
||||
self.output_files = {
|
||||
|
@ -45,13 +47,15 @@ class WikiDownloader:
|
|||
if os.path.isfile(self.save_path + '/' + filename):
|
||||
print('** Download file already exists, skipping download')
|
||||
else:
|
||||
response = urllib.request.urlopen(url)
|
||||
with open(self.save_path + '/' + filename, "wb") as handle:
|
||||
handle.write(response.read())
|
||||
cmd = ['wget', url, '--output-document={}'.format(self.save_path + '/' + filename)]
|
||||
print('Running:', cmd)
|
||||
status = subprocess.run(cmd)
|
||||
if status.returncode != 0:
|
||||
raise RuntimeError('Wiki download not successful')
|
||||
|
||||
# Always unzipping since this is relatively fast and will overwrite
|
||||
print('Unzipping:', self.output_files[self.language])
|
||||
subprocess.run('bzip2 -dk ' + self.save_path + '/' + filename, shell=True, check=True)
|
||||
|
||||
else:
|
||||
assert False, 'WikiDownloader not implemented for this language yet.'
|
||||
assert False, 'WikiDownloader not implemented for this language yet.'
|
||||
|
|
|
@ -26,8 +26,8 @@ class WikiDownloader:
|
|||
|
||||
self.language = language
|
||||
self.download_urls = {
|
||||
'en' : 'https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2',
|
||||
'zh' : 'https://dumps.wikimedia.org/zhwiki/latest/zhwiki-latest-pages-articles.xml.bz2'
|
||||
'en' : 'https://dumps.wikimedia.your.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2',
|
||||
'zh' : 'https://dumps.wikimedia.your.org/zhwiki/latest/zhwiki-latest-pages-articles.xml.bz2'
|
||||
}
|
||||
|
||||
self.output_files = {
|
||||
|
@ -45,9 +45,11 @@ class WikiDownloader:
|
|||
if os.path.isfile(self.save_path + '/' + filename):
|
||||
print('** Download file already exists, skipping download')
|
||||
else:
|
||||
response = urllib.request.urlopen(url)
|
||||
with open(self.save_path + '/' + filename, "wb") as handle:
|
||||
handle.write(response.read())
|
||||
cmd = ['wget', url, '--output-document={}'.format(self.save_path + '/' + filename)]
|
||||
print('Running:', cmd)
|
||||
status = subprocess.run(cmd)
|
||||
if status.returncode != 0:
|
||||
raise RuntimeError('Wiki download not successful')
|
||||
|
||||
# Always unzipping since this is relatively fast and will overwrite
|
||||
print('Unzipping:', self.output_files[self.language])
|
||||
|
|
Loading…
Reference in a new issue