From 65bf7dbe482ecf2ebf5ce5e111a814e1a2f5f1b4 Mon Sep 17 00:00:00 2001 From: Hoo Chang Shin Date: Thu, 30 May 2019 12:55:57 -0400 Subject: [PATCH] initial commit on training PubMed-gvocab-base. --- .../BERT/data/pubmed/config_gvocab_base.sh | 28 + .../data/pubmed/preprocessing_gvocab_base.sh | 23 + ...preprocessing_xargs_wrapper_gvocab_base.sh | 13 + .../BERT/data/pubmed/rename_tf-records.ipynb | 687 ++++++++++++++++++ .../pubmed/run_preprocessing_gvocab_base.sh | 28 + .../shard_text_input_file_gvocab_base.py | 43 ++ .../run_pretraining-pubmed_gvocab_base.sh | 108 +++ 7 files changed, 930 insertions(+) create mode 100755 TensorFlow/LanguageModeling/BERT/data/pubmed/config_gvocab_base.sh create mode 100755 TensorFlow/LanguageModeling/BERT/data/pubmed/preprocessing_gvocab_base.sh create mode 100755 TensorFlow/LanguageModeling/BERT/data/pubmed/preprocessing_xargs_wrapper_gvocab_base.sh create mode 100644 TensorFlow/LanguageModeling/BERT/data/pubmed/rename_tf-records.ipynb create mode 100755 TensorFlow/LanguageModeling/BERT/data/pubmed/run_preprocessing_gvocab_base.sh create mode 100755 TensorFlow/LanguageModeling/BERT/data/pubmed/shard_text_input_file_gvocab_base.py create mode 100755 TensorFlow/LanguageModeling/BERT/scripts/run_pretraining-pubmed_gvocab_base.sh diff --git a/TensorFlow/LanguageModeling/BERT/data/pubmed/config_gvocab_base.sh b/TensorFlow/LanguageModeling/BERT/data/pubmed/config_gvocab_base.sh new file mode 100755 index 00000000..08b95eb3 --- /dev/null +++ b/TensorFlow/LanguageModeling/BERT/data/pubmed/config_gvocab_base.sh @@ -0,0 +1,28 @@ +#! 
/bin/bash + +set -e + +USE_BERT_LARGE=true +MAX_SEQUENCE_LENGTH=512 +MAX_PREDICTIONS_PER_SEQUENCE=80 +MASKED_LM_PROB=0.15 +SEED=12345 +DUPE_FACTOR=5 +DO_LOWER_CASE="True" +N_LINES_PER_SHARD_APPROX=396000 # Default=396000 creates 256 shards + +N_PROCS_PREPROCESS=20 # Adjust this based on memory requirements and available number of cores +export WORKING_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" + +#BERT_BASE_DIR="${WORKING_DIR}/pretrained_models_google/uncased_L-12_H-768_A-12" +#BERT_LARGE_DIR="${WORKING_DIR}/pretrained_models_google/uncased_L-24_H-1024_A-16" +BERT_BASE_DIR="/workspace/bert/data/pretrained_models_google/uncased_L-12_H-768_A-12" + +#if [ "$USE_BERT_LARGE" = true ] ; then +# VOCAB_FILE="${BERT_LARGE_DIR}/vocab.txt" +#else +VOCAB_FILE="${BERT_BASE_DIR}/vocab.txt" +#fi + +OUTPUT_DIR="${WORKING_DIR}/final_tfrecords_sharded_gvocab_base/bert_pubmed_gvocab_base_seq_${MAX_SEQUENCE_LENGTH}_pred_${MAX_PREDICTIONS_PER_SEQUENCE}" + diff --git a/TensorFlow/LanguageModeling/BERT/data/pubmed/preprocessing_gvocab_base.sh b/TensorFlow/LanguageModeling/BERT/data/pubmed/preprocessing_gvocab_base.sh new file mode 100755 index 00000000..113a1991 --- /dev/null +++ b/TensorFlow/LanguageModeling/BERT/data/pubmed/preprocessing_gvocab_base.sh @@ -0,0 +1,23 @@ +#! 
/bin/bash + +SHARD_INDEX=${1} +INPUT_FILE="${WORKING_DIR}/final_text_files_sharded_gvocab_base/pubmed_sentence.part.${SHARD_INDEX}.txt" + +source /workspace/bert/data/pubmed/config_gvocab_base.sh + +OUTPUT_DIR=${WORKING_DIR}/final_tfrecords_sharded_gvocab_base +mkdir -p ${OUTPUT_DIR} + +OUTPUT_FILE="${OUTPUT_DIR}/tf_examples.tfrecord000${SHARD_INDEX}" + +python /workspace/bert/create_pretraining_data.py \ + --input_file=${INPUT_FILE} \ + --output_file=${OUTPUT_FILE} \ + --vocab_file=${VOCAB_FILE} \ + --do_lower_case=${DO_LOWER_CASE} \ + --max_seq_length=${MAX_SEQUENCE_LENGTH} \ + --max_predictions_per_seq=${MAX_PREDICTIONS_PER_SEQUENCE} \ + --masked_lm_prob=${MASKED_LM_PROB} \ + --random_seed=${SEED} \ + --dupe_factor=${DUPE_FACTOR} + diff --git a/TensorFlow/LanguageModeling/BERT/data/pubmed/preprocessing_xargs_wrapper_gvocab_base.sh b/TensorFlow/LanguageModeling/BERT/data/pubmed/preprocessing_xargs_wrapper_gvocab_base.sh new file mode 100755 index 00000000..7379f1ae --- /dev/null +++ b/TensorFlow/LanguageModeling/BERT/data/pubmed/preprocessing_xargs_wrapper_gvocab_base.sh @@ -0,0 +1,13 @@ +#! 
/bin/bash + +source /workspace/bert/data/pubmed/config_gvocab_base.sh + +SHARD_COUNT=0 +rm -rf /workspace/bert/data/pubmed/xarg_list_gvocab_base.txt +touch /workspace/bert/data/pubmed/xarg_list_gvocab_base.txt +for file in /workspace/bert/data/pubmed/final_text_files_sharded_gvocab_base/*; do + echo ${SHARD_COUNT} >> /workspace/bert/data/pubmed/xarg_list_gvocab_base.txt + SHARD_COUNT=$((SHARD_COUNT+1)) +done + +xargs -n 1 --max-procs=${N_PROCS_PREPROCESS} --arg-file=/workspace/bert/data/pubmed/xarg_list_gvocab_base.txt /workspace/bert/data/pubmed/preprocessing_gvocab_base.sh diff --git a/TensorFlow/LanguageModeling/BERT/data/pubmed/rename_tf-records.ipynb b/TensorFlow/LanguageModeling/BERT/data/pubmed/rename_tf-records.ipynb new file mode 100644 index 00000000..d7910489 --- /dev/null +++ b/TensorFlow/LanguageModeling/BERT/data/pubmed/rename_tf-records.ipynb @@ -0,0 +1,687 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import glob, os, sys" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "files = glob.glob('final_tfrecords_sharded_gvocab_base/*tfrecord*')" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000466',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000136',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000183',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000254',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00038',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000525',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00012',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00066',\n", + " 
'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000451',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000343',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000290',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000194',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000363',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000340',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000357',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000291',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000117',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000517',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000444',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000408',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000520',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000168',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000325',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000396',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000382',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000273',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000478',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000259',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000333',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000392',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000263',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000100',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000253',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000504',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000375',\n", + " 
'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000465',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000127',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000239',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00033',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000356',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00081',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000164',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000119',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000512',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00086',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00063',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000454',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000523',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00058',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000483',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00015',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000370',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000352',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000349',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000302',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00052',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000438',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000422',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00070',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000436',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00089',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000270',\n", + " 
'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000498',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000215',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000509',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000202',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00051',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000332',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000141',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000176',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00071',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000191',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000271',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00073',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000295',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000329',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000516',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000533',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000417',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000334',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000547',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000515',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00065',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000206',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000344',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000282',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00021',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000485',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000150',\n", + " 
'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000439',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000441',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000106',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000493',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00010',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000405',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord0003',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00025',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000249',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000310',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000157',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000233',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000505',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00042',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00096',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000135',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000435',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000321',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000537',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00040',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000330',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000530',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000427',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000256',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000383',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00097',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000182',\n", + " 
'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00064',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000303',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000139',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000223',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000146',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000534',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000286',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000374',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000430',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000114',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord0004',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000123',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000531',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00077',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000447',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000402',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000299',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00098',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000242',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00014',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000293',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000388',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000297',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000507',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000362',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000540',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000161',\n", + " 
'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000198',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000210',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000519',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000204',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000494',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000197',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000442',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000212',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00028',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00045',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00044',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000480',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000317',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000312',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000225',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000348',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000292',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000219',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000101',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000440',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000137',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000143',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000394',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000301',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000250',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000412',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000217',\n", + " 
'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000546',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000347',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00023',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000185',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000481',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000234',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000462',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000326',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000335',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00099',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000111',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000434',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000506',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000118',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000414',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000243',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00095',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000513',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000190',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000351',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000323',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000423',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00076',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000300',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000503',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000393',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000499',\n", + " 
'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000425',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000209',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000456',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000345',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000205',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000337',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000522',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000470',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000368',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000151',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000371',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000508',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000160',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000145',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000541',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000389',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000260',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000366',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00037',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000112',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000227',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00057',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000269',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000342',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000526',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000156',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000120',\n", + " 
'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000122',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000255',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000319',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000391',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000488',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000527',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000192',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000490',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000187',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000339',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00080',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000262',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00054',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000372',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000529',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000472',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000193',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000124',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000140',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000449',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000420',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000338',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00092',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00085',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000102',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00088',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000380',\n", + " 
'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000458',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000158',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000130',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000236',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00011',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00075',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000384',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000486',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00020',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000147',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000459',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000354',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00079',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00026',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000387',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000163',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000521',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000476',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00019',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000471',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord0001',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000395',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00087',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000229',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000220',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000518',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000410',\n", + " 
'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000409',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000257',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000195',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000174',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000132',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000110',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000320',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000252',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000431',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000247',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000154',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000433',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000235',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000179',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000180',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000341',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000178',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00061',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord0006',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000386',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000437',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00031',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000463',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00074',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000381',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000171',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000543',\n", + " 
'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000167',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00047',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00034',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000331',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00036',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000361',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000479',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000358',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000285',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000173',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000428',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00022',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000266',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000116',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000487',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000224',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00018',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000426',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000419',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000413',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord0007',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000500',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00016',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00053',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000222',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000201',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000109',\n", + " 
'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000105',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000492',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000353',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000221',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000199',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000376',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000496',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000445',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000153',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00029',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000208',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000131',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000181',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord0008',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000103',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000311',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000125',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00062',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000549',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000489',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000491',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000165',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000429',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000305',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00069',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000275',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000113',\n", + " 
'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000407',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00059',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord0002',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000406',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000203',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000328',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00056',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000152',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000246',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000467',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000398',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000360',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000322',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000377',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000464',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000280',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000264',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000294',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000172',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000501',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000121',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000245',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000373',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000468',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000548',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000142',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00043',\n", + " 
'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000484',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000211',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000364',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00082',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000267',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000162',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00041',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000355',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000308',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000144',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000502',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000514',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000457',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000367',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000460',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000390',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000283',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000289',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000532',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000418',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000274',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000453',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000365',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000482',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000378',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00067',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000544',\n", + " 
'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000284',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00078',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00091',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00030',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000216',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000241',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000251',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000279',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000307',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000237',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000475',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000400',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000169',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000277',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000379',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000138',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000276',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000214',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000495',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000196',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000129',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000248',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000126',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000314',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000281',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000397',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000261',\n", + " 
'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000304',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000133',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000207',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000228',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000477',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000148',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000411',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000539',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00013',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000416',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000511',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000403',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000524',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00024',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000401',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000336',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000188',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000149',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000399',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000107',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000226',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000538',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00060',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000432',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000369',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000545',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000528',\n", + " 
'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000309',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000272',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00068',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00027',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000296',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000115',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000450',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000346',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000268',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000108',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000385',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000359',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00084',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000230',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000448',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000415',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00093',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000189',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000536',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000177',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000306',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000298',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000315',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000316',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00083',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord0005',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00050',\n", + " 
'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00055',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000200',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000186',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000244',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00072',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000175',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000218',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000443',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000213',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000240',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000265',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000170',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00090',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000535',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00017',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000159',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000155',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000497',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000461',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000510',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000327',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000318',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00032',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000424',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00039',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000313',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000474',\n", + " 
'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000469',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000287',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000421',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000231',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000455',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000452',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000166',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000542',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00094',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000404',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000278',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000288',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000446',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000473',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00035',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00048',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000258',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000350',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000238',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000128',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000184',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000104',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000134',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00049',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000232',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord0000',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000324',\n", + " 
'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord0009',\n", + " 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00046']" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "files" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "550" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(files)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord0003\n", + "final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord0004\n", + "final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord0001\n", + "final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord0006\n", + "final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord0007\n", + "final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord0008\n", + "final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord0002\n", + "final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord0005\n", + "final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord0000\n", + "final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord0009\n" + ] + } + ], + "source": [ + "prefix = 'final_tfrecords_sharded_gvocab_base/tf_examples.'\n", + "for filei in files:\n", + " fname = filei.split('.')[-1]\n", + " numi = int(fname[8:])\n", + " \n", + " if numi < 10:\n", + " print(filei)\n", + " continue\n", + " elif numi >= 10 and numi < 100:\n", + " suffix = '00' + str(numi)\n", + " elif numi >= 100 and numi < 1000:\n", + " suffix = '0' + str(numi)\n", + " elif numi >= 1000:\n", + " suffix = str(numi)\n", + " else:\n", + " print(fname)\n", + " print('!!!')\n", + " break\n", + " \n", + " fname_ = fname[:8] + suffix\n", + " newfname = prefix + fname_\n", + " \n", + " os.rename(filei, 
newfname)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.5.2" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/TensorFlow/LanguageModeling/BERT/data/pubmed/run_preprocessing_gvocab_base.sh b/TensorFlow/LanguageModeling/BERT/data/pubmed/run_preprocessing_gvocab_base.sh new file mode 100755 index 00000000..c0e0659a --- /dev/null +++ b/TensorFlow/LanguageModeling/BERT/data/pubmed/run_preprocessing_gvocab_base.sh @@ -0,0 +1,28 @@ +#! /bin/bash + +source /workspace/bert/data/pubmed/config_gvocab_base.sh + +# Download books +#mkdir -p download +#python3 /workspace/bookcorpus/download_files.py --list /workspace/bookcorpus/url_list.jsonl --out ${WORKING_DIR}/download --trash-bad-count + +# Clean and prep (one book per line) +#mkdir -p ${WORKING_DIR}/intermediate_files +#python3 ${WORKING_DIR}/clean_and_merge_text.py + +# Split books into one-sentence-per-line format for use with BERT scripts +#echo "Applying sentence segmentation to get one sentence per line" +#mkdir -p ${WORKING_DIR}/final_text_file_single +#python3 ${WORKING_DIR}/sentence_segmentation_nltk.py +# Note: NLTK can be replaced with Spacy, although it is slower (2 variations provided) + +# Shard finalized text so that it has a chance of fitting in memory when creating pretraining data into tfrecords (choose appropriate number of shards for distributed training) +echo "Shard text files - size is approximate to prevent splitting a book across shards" +mkdir -p ${WORKING_DIR}/final_text_files_sharded_gvocab_base +python3 ${WORKING_DIR}/shard_text_input_file_gvocab_base.py 
# NVIDIA
#
# shard_text_input_file_gvocab_base.py
# (TensorFlow/LanguageModeling/BERT/data/pubmed/)
#
# Splits the single one-sentence-per-line PubMed text file into numbered shard
# files so that each shard can be converted to tfrecords independently.
#
# NOTE: the flattened patch line this script was recovered from also carries
# the final step of run_preprocessing_gvocab_base.sh. It is kept verbatim here
# so that it is not lost:
#   # Convert sharded text files into tfrecords that are ready for BERT pretraining
#   echo "Creating tfrecords for each text shard"
#   mkdir -p ${WORKING_DIR}/final_tfrecords_sharded_gvocab_base
#   . ${WORKING_DIR}/preprocessing_xargs_wrapper_gvocab_base.sh ${N_PROCS_PREPROCESS}

import os

# A line consisting of a bare newline separates two documents (articles).
DOC_SEPARATOR = "\n"

# Approximate number of lines per shard; a shard is only closed at the next
# document separator once this many lines have been collected.
DEFAULT_SHARD_SIZE = 396000


def _write_shard(output_prefix, shard_index, lines):
    """Write `lines` to `<output_prefix><shard_index>.txt`."""
    with open(output_prefix + str(shard_index) + ".txt", "w") as ofile:
        ofile.writelines(lines)


def shard_text_file(input_file, output_prefix, shard_size=DEFAULT_SHARD_SIZE):
    """Split `input_file` into shards of roughly `shard_size` lines.

    Lines are accumulated until at least `shard_size` of them are buffered;
    the shard is then closed at the next document separator (blank line), so
    a document is never split across two shards. The separator that closes a
    shard is not written to either shard. Whatever is still buffered when the
    input ends is written as the final shard.

    The previous implementation never wrote that final shard once any shard
    had been flushed mid-file (its end-of-file counter stopped advancing on a
    flush), and dropped the last line of the input when no flush had happened.

    Args:
        input_file: path of the text file to split.
        output_prefix: shard files are named `<output_prefix><index>.txt`,
            with `index` starting at 0.
        shard_size: minimum number of lines per shard (the last shard may be
            shorter).

    Returns:
        A tuple `(num_shards, num_lines)`: shards written and input lines read.
    """
    shard_index = 0
    num_lines = 0
    line_buffer = []

    with open(input_file) as ifile:
        for line in ifile:
            num_lines += 1
            if len(line_buffer) >= shard_size and line == DOC_SEPARATOR:
                # Shard is full and we are at a document boundary: close it.
                _write_shard(output_prefix, shard_index, line_buffer)
                shard_index += 1
                line_buffer = []
            else:
                line_buffer.append(line)

    # Flush the remainder; without this the tail of the corpus is lost.
    if line_buffer:
        _write_shard(output_prefix, shard_index, line_buffer)
        shard_index += 1

    return shard_index, num_lines


def main():
    """Shard the PubMed sentence file located under $WORKING_DIR."""
    working_dir = os.environ['WORKING_DIR']
    input_file = working_dir + '/final_text_file_single/pubmed_sentence_nltk_uncased.txt'
    output_file = working_dir + '/final_text_files_sharded_gvocab_base/pubmed_sentence.part.'

    # Optional override; falls back to the historical hard-coded value when
    # the variable is not present in the environment.
    shard_size = int(os.environ.get('N_LINES_PER_SHARD_APPROX', DEFAULT_SHARD_SIZE))

    num_shards, num_lines = shard_text_file(input_file, output_file, shard_size)
    print("Input file contains", num_lines, "lines.")
    print("Wrote", num_shards, "shards.")


if __name__ == "__main__":
    main()
#! /bin/bash
#
# run_pretraining-pubmed_gvocab_base.sh
# (TensorFlow/LanguageModeling/BERT/scripts/)
#
# Launches /workspace/bert/run_pretraining.py on the sharded PubMed tfrecords,
# wrapping it in mpiexec when more than one GPU is requested, and optionally
# tees all output into a timestamped log file under the results directory.
#
# Usage:
#   run_pretraining-pubmed_gvocab_base.sh [train_batch_size] [eval_batch_size] \
#       [learning_rate] [precision] [num_gpus] [warmup_steps] [train_steps] \
#       [save_checkpoint_steps] [create_logfile]
#
#   precision is one of: fp16, fp16_xla, fp32, amp, amp_xla
#
# Exit status: 255 when a required directory/file is missing, 254 for an
# unknown precision, otherwise the exit status of the training command.

echo "Container nvidia build = " "$NVIDIA_BUILD_ID"

# WIKI_DIR=/workspace/bert/data/wikipedia_corpus/final_tfrecords_sharded
# BOOKS_DIR=/workspace/bert/data/bookcorpus/final_tfrecords_sharded
PUBMED_DIR=/workspace/bert/data/pubmed/final_tfrecords_sharded_gvocab_base
# NOTE(review): this is the L-24/H-1024 (BERT large) config although the script
# name ends in "_base" -- confirm that this is the intended model size.
BERT_CONFIG=/workspace/bert/data/pretrained_models_google/uncased_L-24_H-1024_A-16/bert_config.json
RESULTS_DIR=/results
CHECKPOINTS_DIR=/results/checkpoints

# if [ ! -d "$WIKI_DIR" ] ; then
#    echo "Error! $WIKI_DIR directory missing. Please mount wikipedia dataset."
#    exit -1
# else
#    SOURCES="$WIKI_DIR/*"
# fi
# if [ ! -d "$BOOKS_DIR" ] ; then
#    echo "Warning! $BOOKS_DIR directory missing. Training will proceed without book corpus."
# else
#    SOURCES+=" $BOOKS_DIR/*"
# fi

# 'exit 255' / 'exit 254' below are the statuses bash produced for the
# original 'exit -1' / 'exit -2'; they are spelled out to stay in range.
if [[ ! -d "$PUBMED_DIR" ]]; then
  # The message previously interpolated $WIKI_DIR, which is never set here.
  echo "Error! $PUBMED_DIR directory missing. Please mount pubmed dataset."
  exit 255
fi
if [[ ! -d "$RESULTS_DIR" ]]; then
  echo "Error! $RESULTS_DIR directory missing."
  exit 255
fi
if [[ ! -d "$CHECKPOINTS_DIR" ]]; then
  echo "Warning! $CHECKPOINTS_DIR directory missing."
  echo "Checkpoints will be written to $RESULTS_DIR instead."
  CHECKPOINTS_DIR=$RESULTS_DIR
fi
if [[ ! -f "$BERT_CONFIG" ]]; then
  echo "Error! BERT large configuration file not found at $BERT_CONFIG"
  exit 255
fi

train_batch_size=${1:-14}
eval_batch_size=${2:-8}
learning_rate=${3:-"1e-4"}
precision=${4:-"fp16_xla"}
num_gpus=${5:-8}
warmup_steps=${6:-"10000"}
train_steps=${7:-1144000}
save_checkpoint_steps=${8:-5000}
create_logfile=${9:-"true"}

# Map the precision keyword onto run_pretraining.py flags.
prec_flags=()
case "$precision" in
  fp16)     prec_flags=(--use_fp16) ;;
  fp16_xla) prec_flags=(--use_fp16 --use_xla) ;;
  fp32)     prec_flags=() ;;
  amp)      prec_flags=(--amp) ;;
  amp_xla)  prec_flags=(--amp --use_xla) ;;
  *)
    echo "Unknown argument"
    exit 254
    ;;
esac

# Collect the tfrecord shards with a glob instead of parsing `ls` output.
shopt -s nullglob
input_files=("$PUBMED_DIR"/*)
shopt -u nullglob
if (( ${#input_files[@]} == 0 )); then
  echo "Error! No input files found in $PUBMED_DIR."
  exit 255
fi
echo "${input_files[*]}"

# run_pretraining.py takes the shards as one comma-separated list.
INPUT_FILES=$(IFS=,; printf '%s' "${input_files[*]}")

# Build the command as an array so that no argument is re-split or re-globbed.
cmd=(
  python3 /workspace/bert/run_pretraining.py
  "--input_file=$INPUT_FILES"
  "--output_dir=$CHECKPOINTS_DIR"
  "--bert_config_file=$BERT_CONFIG"
  --do_train=True
  --do_eval=True
  "--train_batch_size=$train_batch_size"
  "--eval_batch_size=$eval_batch_size"
  --max_seq_length=512
  --max_predictions_per_seq=80
  "--num_train_steps=$train_steps"
  "--num_warmup_steps=$warmup_steps"
  "--save_checkpoint_steps=$save_checkpoint_steps"
  "--learning_rate=$learning_rate"
  --report_loss
  --horovod
  "${prec_flags[@]}"
)

# Numeric comparison. The previous `[ $num_gpus > 1 ]` was a redirection: it
# was always true and created a file named "1" in the working directory.
if [[ "$num_gpus" -gt 1 ]]; then
  cmd=(mpiexec --allow-run-as-root -np "$num_gpus" --bind-to socket "${cmd[@]}")
fi

if [[ "$create_logfile" == "true" ]]; then
  export GBS=$(( train_batch_size * num_gpus ))
  printf -v TAG "tf_bert_1n_%s_gbs%d" "$precision" "$GBS"
  DATESTAMP=$(date +'%y%m%d%H%M%S')
  LOGFILE=$RESULTS_DIR/$TAG.$DATESTAMP.log
  printf "Logs written to %s\n" "$LOGFILE"
fi

set -x
if [[ -z "$LOGFILE" ]]; then
  "${cmd[@]}"
  status=$?
else
  "${cmd[@]}" |& tee "$LOGFILE"
  # Report the training command's status, not tee's.
  status=${PIPESTATUS[0]}
fi
set +x

exit "$status"