DeepLearningExamples/PyTorch/LanguageModeling/BERT/data/merge_datasets_after_creation.sh
Przemek Strzelczyk 0663b67c1a Updating models
2019-07-08 22:51:28 +02:00

30 lines
1.1 KiB
Bash
Executable file

#!/bin/bash
# Merge several already-created hdf5 datasets into one mixed, sharded dataset.
#
# Usage: merge_datasets_after_creation.sh MERGED_DIR INPUTFILES NUM_SHARDS
#   MERGED_DIR  output directory, e.g. wikipedia+bookcorpus
#   INPUTFILES  directories with hdf5 files, separated by commas
#   NUM_SHARDS  number of output shards (positive integer)
#
# Steps:
#   1. utils/create_mixed_dataset_ids.py writes its output into MERGED_DIR/meta.
#   2. utils/create_mixed_dataset.py writes the merged shards into
#      MERGED_DIR/hdf5_shards, reading MERGED_DIR/meta/lookup_table.pkl and the
#      index files in MERGED_DIR/meta.
#   3. MERGED_DIR/meta is removed once both steps have succeeded.
#
# Must be run from the directory that contains utils/, because the config file
# and both Python helpers are referenced by relative path.
# NOTE(review): SEED is not assigned in this script; presumably
# utils/config.sh defines it -- confirm.
#
# Exit status: 0 on success, 2 on bad arguments, non-zero if any step fails.
# On failure MERGED_DIR/meta is left in place so it can be inspected.

# Extra arguments beyond the third are ignored.
if [[ $# -lt 3 ]]; then
  printf 'Usage: %s MERGED_DIR INPUTFILES NUM_SHARDS\n' "${0##*/}" >&2
  exit 2
fi

MERGED_DIR=$1 # e.g wikipedia+bookcorpus
INPUTFILES=$2 # directories with hdf5 files separated by comma
NUM_SHARDS=$3

# An empty MERGED_DIR would turn META_DIR into /meta, which is later removed
# with rm -rf, so refuse it up front.
if [[ -z "${MERGED_DIR}" ]]; then
  printf 'error: MERGED_DIR must not be empty\n' >&2
  exit 2
fi
if [[ -z "${INPUTFILES}" ]]; then
  printf 'error: INPUTFILES must not be empty\n' >&2
  exit 2
fi
# NUM_SHARDS feeds shell arithmetic below; require a plain positive decimal so
# that stray text or a leading zero (octal) cannot change the computed range.
if [[ ! "${NUM_SHARDS}" =~ ^[1-9][0-9]*$ ]]; then
  printf 'error: NUM_SHARDS must be a positive integer, got "%s"\n' "${NUM_SHARDS}" >&2
  exit 2
fi

if ! source utils/config.sh; then
  printf 'error: cannot source utils/config.sh (run from the directory containing utils/)\n' >&2
  exit 1
fi

META_DIR=${MERGED_DIR}/meta
mkdir -p -- "${MERGED_DIR}" || exit 1
mkdir -p -- "${META_DIR}" || exit 1

echo "create mixed dataset ids"
# Each command is built once as an array so that the line that is logged and
# the command that is executed can never drift apart.
ids_cmd=(
  python utils/create_mixed_dataset_ids.py
  "--input_files=${INPUTFILES}"
  "--num_output_shards=${NUM_SHARDS}"
  "--output_dir=${META_DIR}"
  "--random_seed=${SEED}"
)
echo "${ids_cmd[*]}"
if ! "${ids_cmd[@]}"; then
  printf 'error: create_mixed_dataset_ids.py failed\n' >&2
  exit 1
fi

echo "Creating hdf5 for each text shard"
mkdir -p -- "${MERGED_DIR}/hdf5_shards" || exit 1

echo "create mixed datasets with hdf5 files"
# The index range passed to the helper ends at NUM_SHARDS-1.
mix_cmd=(
  python utils/create_mixed_dataset.py
  "--input_files=${INPUTFILES}"
  "--output_dir=${MERGED_DIR}/hdf5_shards"
  "--lookup=${META_DIR}/lookup_table.pkl"
  "--indices_dir=${META_DIR}"
  "--index_range=0-$((NUM_SHARDS - 1))"
  "--random_seed=${SEED}"
)
echo "${mix_cmd[*]}"
if ! "${mix_cmd[@]}"; then
  printf 'error: create_mixed_dataset.py failed\n' >&2
  exit 1
fi

# Intermediate files are no longer needed once the merged shards exist.
# ${META_DIR:?} aborts instead of expanding to an empty path.
rm -rf -- "${META_DIR:?}"