#! /usr/bin/env bash

# Copyright 2017 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

# Downloads the WMT16 German-English corpora, extracts them, tokenizes and
# cleans them with the Moses scripts, filters them with the project's
# scripts/filter_dataset.py, and learns/applies a shared BPE model.
#
# Usage: bash <this script> [OUTPUT_DIR]
#   OUTPUT_DIR  directory that receives all artifacts
#               (default: data/wmt16_de_en)
#
# External tools used: wget, tar, git, perl (Moses scripts), python3,
# subword-nmt. scripts/filter_dataset.py is referenced by a relative path, so
# the script has to be started from the directory that contains scripts/.

# -e: stop on the first failing command; -u: unset variables are errors;
# pipefail: a pipeline fails when any stage fails (otherwise a failing `cat`
# or `subword-nmt get-vocab` on the left of a pipe would go unnoticed).
set -euo pipefail

export LANG=C.UTF-8
export LC_ALL=C.UTF-8

OUTPUT_DIR=${1:-"data/wmt16_de_en"}
# The directory comes from the first positional argument, not from the
# environment, so the hint must say so.
echo "Writing to ${OUTPUT_DIR}. To change this, pass the output directory as the first argument."

OUTPUT_DIR_DATA="${OUTPUT_DIR}/data"
MOSES_DIR="${OUTPUT_DIR}/mosesdecoder"

mkdir -p "${OUTPUT_DIR_DATA}"

#######################################
# Fetch a URL into a file unless a non-empty copy is already present.
# The transfer goes to "<dest>.part" first and is renamed on success, so an
# interrupted download never leaves a truncated archive under the final name.
# (wget's -nc is not combined with -O here: the wget manual states that
# combination is only accepted when the output file does not exist, which
# would make re-running this script fail.)
# Arguments: $1 - URL, $2 - destination file
#######################################
download() {
  local url=$1
  local dest=$2
  if [[ -s "${dest}" ]]; then
    echo "${dest} already exists; skipping download."
    return 0
  fi
  wget -nv -O "${dest}.part" "${url}"
  mv -- "${dest}.part" "${dest}"
}

#######################################
# Unpack ${OUTPUT_DIR_DATA}/<name>.tgz into ${OUTPUT_DIR_DATA}/<name>/.
# Arguments: $1 - archive base name (without .tgz)
#######################################
extract() {
  local name=$1
  mkdir -p "${OUTPUT_DIR_DATA}/${name}"
  tar -xvzf "${OUTPUT_DIR_DATA}/${name}.tgz" -C "${OUTPUT_DIR_DATA}/${name}"
}

#######################################
# Convert the src (de) and ref (en) SGM files of one newstest year into raw
# text with Moses' input-from-sgm.perl.
# Arguments: $1 - directory holding the .sgm files, $2 - year (e.g. 2014)
#######################################
convert_newstest() {
  local dir=$1
  local year=$2
  local sgm2txt="${MOSES_DIR}/scripts/ems/support/input-from-sgm.perl"
  "${sgm2txt}" \
    < "${dir}/newstest${year}-deen-src.de.sgm" \
    > "${dir}/newstest${year}.de"
  "${sgm2txt}" \
    < "${dir}/newstest${year}-deen-ref.en.sgm" \
    > "${dir}/newstest${year}.en"
}

echo "Downloading Europarl v7. This may take a while..."
download http://www.statmt.org/europarl/v7/de-en.tgz \
  "${OUTPUT_DIR_DATA}/europarl-v7-de-en.tgz"

echo "Downloading Common Crawl corpus. This may take a while..."
download http://www.statmt.org/wmt13/training-parallel-commoncrawl.tgz \
  "${OUTPUT_DIR_DATA}/common-crawl.tgz"

echo "Downloading News Commentary v11. This may take a while..."
download http://data.statmt.org/wmt16/translation-task/training-parallel-nc-v11.tgz \
  "${OUTPUT_DIR_DATA}/nc-v11.tgz"

echo "Downloading dev/test sets"
download http://data.statmt.org/wmt16/translation-task/dev.tgz \
  "${OUTPUT_DIR_DATA}/dev.tgz"
download http://data.statmt.org/wmt16/translation-task/test.tgz \
  "${OUTPUT_DIR_DATA}/test.tgz"

# Extract everything
echo "Extracting all files..."
for archive in europarl-v7-de-en common-crawl nc-v11 dev test; do
  extract "${archive}"
done

# Concatenate Training data
for lang in en de; do
  cat "${OUTPUT_DIR_DATA}/europarl-v7-de-en/europarl-v7.de-en.${lang}" \
    "${OUTPUT_DIR_DATA}/common-crawl/commoncrawl.de-en.${lang}" \
    "${OUTPUT_DIR_DATA}/nc-v11/training-parallel-nc-v11/news-commentary-v11.de-en.${lang}" \
    > "${OUTPUT_DIR}/train.${lang}"
  wc -l "${OUTPUT_DIR}/train.${lang}"
done

# Clone Moses (pinned to a fixed commit). The reset runs in a subshell so the
# caller's working directory is never changed.
if [[ ! -d "${MOSES_DIR}" ]]; then
  echo "Cloning moses for data processing"
  git clone https://github.com/moses-smt/mosesdecoder.git "${MOSES_DIR}"
  (
    cd "${MOSES_DIR}"
    git reset --hard 8c5eaa1a122236bbf927bde4ec610906fea599e6
  )
fi

# Convert SGM files into raw text format:
# newstest2014/2015 come from the dev archive, newstest2016 from the test one.
convert_newstest "${OUTPUT_DIR_DATA}/dev/dev" 2014
convert_newstest "${OUTPUT_DIR_DATA}/dev/dev" 2015
convert_newstest "${OUTPUT_DIR_DATA}/test/test" 2016

# Copy dev/test data to output dir
cp -- "${OUTPUT_DIR_DATA}"/dev/dev/newstest20*.de "${OUTPUT_DIR}"
cp -- "${OUTPUT_DIR_DATA}"/dev/dev/newstest20*.en "${OUTPUT_DIR}"
cp -- "${OUTPUT_DIR_DATA}"/test/test/newstest20*.de "${OUTPUT_DIR}"
cp -- "${OUTPUT_DIR_DATA}"/test/test/newstest20*.en "${OUTPUT_DIR}"

# Tokenize data
for lang in de en; do
  for f in "${OUTPUT_DIR}"/*."${lang}"; do
    # When the script is re-run in a populated directory the glob also matches
    # this script's own outputs; skip them so no *.tok.tok.* files appear.
    case "${f##*/}" in
      *.tok.* | *.clean.* | *.bpe.*) continue ;;
    esac
    echo "Tokenizing $f..."
    "${MOSES_DIR}/scripts/tokenizer/tokenizer.perl" -q -l "${lang}" -threads 8 \
      < "$f" > "${f%.*}.tok.${lang}"
  done
done

# Clean all corpora (both the raw and the tokenized variants, keyed on the
# .en side; the matching .de file is addressed through the shared base name).
# NOTE(review): the trailing "1 80" are presumably the minimum/maximum
# sentence lengths accepted by clean-corpus-n.perl — confirm against Moses.
for f in "${OUTPUT_DIR}"/*.en; do
  # Skip outputs of an earlier run (see the tokenization loop).
  case "${f##*/}" in
    *.clean.* | *.bpe.*) continue ;;
  esac
  fbase=${f%.*}
  echo "Cleaning ${fbase}..."
  "${MOSES_DIR}/scripts/training/clean-corpus-n.perl" \
    "${fbase}" de en "${fbase}.clean" 1 80
done

# Create dev dataset from newstest2015 + newstest2016
for lang in en de; do
  cat "${OUTPUT_DIR}/newstest2015.tok.clean.${lang}" \
    "${OUTPUT_DIR}/newstest2016.tok.clean.${lang}" \
    > "${OUTPUT_DIR}/newstest_dev.tok.clean.${lang}"
done

# Filter datasets (project-local script, resolved relative to the current
# working directory).
python3 scripts/filter_dataset.py \
  -f1 "${OUTPUT_DIR}/train.tok.clean.en" \
  -f2 "${OUTPUT_DIR}/train.tok.clean.de"
python3 scripts/filter_dataset.py \
  -f1 "${OUTPUT_DIR}/newstest_dev.tok.clean.en" \
  -f2 "${OUTPUT_DIR}/newstest_dev.tok.clean.de"

# Generate Subword Units (BPE)
# Learn Shared BPE
for merge_ops in 32000; do
  echo "Learning BPE with merge_ops=${merge_ops}. This may take a while..."
  cat "${OUTPUT_DIR}/train.tok.clean.de" "${OUTPUT_DIR}/train.tok.clean.en" \
    | subword-nmt learn-bpe -s "${merge_ops}" > "${OUTPUT_DIR}/bpe.${merge_ops}"

  echo "Apply BPE with merge_ops=${merge_ops} to tokenized files..."
  for lang in en de; do
    for f in "${OUTPUT_DIR}"/*.tok."${lang}" "${OUTPUT_DIR}"/*.tok.clean."${lang}"; do
      outfile="${f%.*}.bpe.${merge_ops}.${lang}"
      subword-nmt apply-bpe -c "${OUTPUT_DIR}/bpe.${merge_ops}" < "$f" > "${outfile}"
      echo "${outfile}"
    done
  done

  # Create vocabulary file for BPE (first space-separated field of the
  # get-vocab output).
  cat "${OUTPUT_DIR}/train.tok.clean.bpe.${merge_ops}.en" \
    "${OUTPUT_DIR}/train.tok.clean.bpe.${merge_ops}.de" \
    | subword-nmt get-vocab \
    | cut -f1 -d ' ' > "${OUTPUT_DIR}/vocab.bpe.${merge_ops}"
done

echo "All done."