DeepLearningExamples/PyTorch/LanguageModeling/BERT/data/wikipedia_corpus/download_wikipedia.sh
#!/bin/bash
WIKI_DUMP="ftp://ftpmirror.your.org/pub/wikimedia/dumps/enwiki/20190301/enwiki-20190301-pages-articles-multistream.xml.bz2"
N_PROCS_PREPROCESS=4 # Adjust based on available memory and the number of CPU cores
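# Sketch (not part of the original script): derive the worker count from the machine
# instead of hard-coding it; the cap of 8 is an assumption to limit memory pressure.
# N_PROCS_PREPROCESS=$(nproc)
# if [ "${N_PROCS_PREPROCESS}" -gt 8 ]; then N_PROCS_PREPROCESS=8; fi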
# Download Wikipedia dump file
mkdir -p ./download
# Not using wget --no-clobber since it emits an error when the file already exists (incompatible with bash 'set -e')
echo "Downloading Wikidump"
if [ ! -f ./download/wikidump.xml.bz2 ]; then
    wget -O ./download/wikidump.xml.bz2 "${WIKI_DUMP}"
fi
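# Optional integrity check (sketch, assumes bzip2 is installed): test the archive
# before spending time on extraction.
# bunzip2 -t ./download/wikidump.xml.bz2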
# Extract dump
echo "Extracting Wikidump"
mkdir -p ./raw_data
if [ ! -f ./raw_data/wikidump.xml ]; then
    pv ./download/wikidump.xml.bz2 | bunzip2 -kdc > ./raw_data/wikidump.xml
fi
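# Optional guard (sketch): fail early if the decompressed dump is missing or empty,
# e.g. because an earlier run was interrupted mid-pipe.
# [ -s ./raw_data/wikidump.xml ] || { echo "Extraction produced no data" >&2; exit 1; }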
# WikiExtractor.py - creates many folders/files of articles in "doc" format (<doc>...</doc> blocks)
echo "Running Wikiextractor"
mkdir -p ./extracted_articles
/workspace/wikiextractor/WikiExtractor.py ./raw_data/wikidump.xml -b 1000M --processes ${N_PROCS_PREPROCESS} -o ./extracted_articles
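# The extractor shards its output into subdirectories of ./extracted_articles
# (e.g. AA/wiki_00, AA/wiki_01, ...). Optional check (sketch) that something was produced:
# find ./extracted_articles -type f | head -n 3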
# Remove XML tags and extraneous titles (since they are not sentences)
# Also join the paragraphs of each article into a single space-separated line
echo "Cleaning and formatting files (one article per line)"
python3 ./remove_tags_and_clean.py ./extracted_articles ./wikipedia_corpus.txt
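# Quick sanity check (sketch): the output should contain one article per line.
# wc -l ./wikipedia_corpus.txt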