#!/bin/bash

# Note: Several directories are created so that it is clear what has been performed at each
# stage of preprocessing. The intermediate files may be useful if you want to further
# clean/prepare/augment the data for your own applications.

# NLTK was chosen as the default over spaCy simply because its sentence segmentation is
# faster on the large input files.

MERGED_DIR=$1
args="${*:2}"

source utils/config.sh

mkdir -p "${MERGED_DIR}"

corpus_file=${MERGED_DIR}/corpus.txt

## Merge and shuffle the full corpus texts.
## If more than one input file is given ($3 is set), concatenate them, drop blank lines, and
## shuffle the result into ${MERGED_DIR}/corpus.txt; otherwise use the single file in $2 as-is.
if [ ! -z "$3" ]
then
  echo "Merging $args"
  cat $args | sed "/^$/d" | shuf > "$corpus_file"
else
  corpus_file=$2
fi

# Split articles into one-sentence-per-line format for use with the BERT scripts
echo "Applying sentence segmentation to get one sentence per line"
mkdir -p "${MERGED_DIR}/final_text_file_single"
python3 utils/sentence_segmentation_nltk.py "$corpus_file" "${MERGED_DIR}/final_text_file_single/corpus.segmented.nltk.txt"

## Shard the finalized text so that each shard has a chance of fitting in memory when the
## hdf5 pretraining data is created (choose an appropriate number of shards for distributed training)
echo "Shard text files - size is approximate to prevent splitting an article across shards"
mkdir -p "${MERGED_DIR}/final_text_files_sharded"
python3 utils/shard_text_input_file.py "${MERGED_DIR}/final_text_file_single/corpus.segmented.nltk.txt" "${MERGED_DIR}/final_text_files_sharded/corpus.segmented.part."

# Convert the sharded text files into hdf5 shards that are ready for BERT pretraining
echo "Creating hdf5 for each text shard"
mkdir -p "${MERGED_DIR}/hdf5_shards"
export TARGET_DIR=${MERGED_DIR}
. utils/preprocessing_xargs_wrapper.sh ${N_PROCS_PREPROCESS}
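
# Example invocation (an illustrative sketch; the script filename "merge_and_preprocess.sh"
# and the data paths below are placeholders, not names defined in this repository). Run it
# from the directory that contains utils/ so that the relative paths (utils/config.sh, the
# utils/*.py helpers) resolve.
#
#   # Merge two raw corpora, shuffle them, then segment, shard, and build hdf5 shards under /data/merged
#   bash merge_and_preprocess.sh /data/merged /data/raw/wiki_corpus.txt /data/raw/books_corpus.txt
#
#   # Preprocess a single, already-merged corpus file (no merging or shuffling happens in this case)
#   bash merge_and_preprocess.sh /data/merged /data/raw/corpus.txt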