[BERT/PyT][BERT/TF] Use mirror and wget (#833)

* Use mirror and wget

  The mirror speeds up the download by 10x; wget fixes the stall seen with urllib.

* Add comment

* Update WikiDownloader of BERT TF1 to use mirror and wget

Co-authored-by: Swetha Mandava <sweth.mandava@gmail.com>
2021-02-17 14:49:39 -08:00 · 2021-02-17 14:49:39 -08:00 · 3459f97d58
parent 0f5ff94854
commit 3459f97d58
2 changed files with 17 additions and 11 deletions
--- a/PyTorch/LanguageModeling/BERT/data/WikiDownloader.py
+++ b/PyTorch/LanguageModeling/BERT/data/WikiDownloader.py
@ -16,6 +16,7 @@ import os
 import urllib.request
 import subprocess
 import sys
+import subprocess

 class WikiDownloader:
    def __init__(self, language, save_path):
@ -25,9 +26,10 @@ class WikiDownloader:
            os.makedirs(self.save_path)

        self.language = language
+        # Use a mirror from https://dumps.wikimedia.org/mirrors.html if the below links do not work
        self.download_urls = {
-            'en' : 'https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2',
-            'zh' : 'https://dumps.wikimedia.org/zhwiki/latest/zhwiki-latest-pages-articles.xml.bz2'
+            'en' : 'https://dumps.wikimedia.your.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2',
+            'zh' : 'https://dumps.wikimedia.your.org/zhwiki/latest/zhwiki-latest-pages-articles.xml.bz2'
        }

        self.output_files = {
@ -45,13 +47,15 @@ class WikiDownloader:
            if os.path.isfile(self.save_path + '/' + filename):
                print('** Download file already exists, skipping download')
            else:
-                response = urllib.request.urlopen(url)
-                with open(self.save_path + '/' + filename, "wb") as handle:
-                    handle.write(response.read())
+                cmd = ['wget', url, '--output-document={}'.format(self.save_path + '/' + filename)]
+                print('Running:', cmd)
+                status = subprocess.run(cmd)
+                if status.returncode != 0:
+                    raise RuntimeError('Wiki download not successful')

            # Always unzipping since this is relatively fast and will overwrite
            print('Unzipping:', self.output_files[self.language])
            subprocess.run('bzip2 -dk ' + self.save_path + '/' + filename, shell=True, check=True)

        else:
-            assert False, 'WikiDownloader not implemented for this language yet.'
+            assert False, 'WikiDownloader not implemented for this language yet.'
--- a/TensorFlow/LanguageModeling/BERT/data/WikiDownloader.py
+++ b/TensorFlow/LanguageModeling/BERT/data/WikiDownloader.py
@ -26,8 +26,8 @@ class WikiDownloader:

        self.language = language
        self.download_urls = {
-            'en' : 'https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2',
-            'zh' : 'https://dumps.wikimedia.org/zhwiki/latest/zhwiki-latest-pages-articles.xml.bz2'
+            'en' : 'https://dumps.wikimedia.your.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2',
+            'zh' : 'https://dumps.wikimedia.your.org/zhwiki/latest/zhwiki-latest-pages-articles.xml.bz2'
        }

        self.output_files = {
@ -45,9 +45,11 @@ class WikiDownloader:
            if os.path.isfile(self.save_path + '/' + filename):
                print('** Download file already exists, skipping download')
            else:
-                response = urllib.request.urlopen(url)
-                with open(self.save_path + '/' + filename, "wb") as handle:
-                    handle.write(response.read())
+                cmd = ['wget', url, '--output-document={}'.format(self.save_path + '/' + filename)]
+                print('Running:', cmd)
+                status = subprocess.run(cmd)
+                if status.returncode != 0:
+                    raise RuntimeError('Wiki download not successful')

            # Always unzipping since this is relatively fast and will overwrite
            print('Unzipping:', self.output_files[self.language])