initial commit on training PubMed-gvocab-base.

This commit is contained in:
Hoo Chang Shin 2019-05-30 12:55:57 -04:00
parent b7eff3da41
commit 65bf7dbe48
7 changed files with 930 additions and 0 deletions

View file

@ -0,0 +1,28 @@
#! /bin/bash
# Settings for building PubMed BERT pre-training data with the Google
# uncased base vocabulary. Presumably meant to be sourced by the
# preprocessing scripts rather than executed on its own -- confirm with callers.
#
# NOTE: errexit is enabled here, so it also takes effect in whatever shell
# sources this file.
set -o errexit

# ---------------------------------------------------------------------------
# Locations
# ---------------------------------------------------------------------------

# Absolute path of the directory that holds this file; exported so that child
# processes inherit it.
export WORKING_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" &>/dev/null && pwd)"

# Pretrained Google checkpoint that supplies the vocabulary.
# Alternatives relative to WORKING_DIR, kept for reference:
#   BERT_BASE_DIR="${WORKING_DIR}/pretrained_models_google/uncased_L-12_H-768_A-12"
#   BERT_LARGE_DIR="${WORKING_DIR}/pretrained_models_google/uncased_L-24_H-1024_A-16"
BERT_BASE_DIR=/workspace/bert/data/pretrained_models_google/uncased_L-12_H-768_A-12

# The vocabulary is always taken from the base checkpoint. The former switch
#   if [ "$USE_BERT_LARGE" = true ]; then VOCAB_FILE="${BERT_LARGE_DIR}/vocab.txt"; ...
# is disabled, so USE_BERT_LARGE below is not consulted anywhere in this file.
VOCAB_FILE="${BERT_BASE_DIR}/vocab.txt"

# ---------------------------------------------------------------------------
# Model / masking parameters
# ---------------------------------------------------------------------------
USE_BERT_LARGE='true'
MAX_SEQUENCE_LENGTH='512'
MAX_PREDICTIONS_PER_SEQUENCE='80'
MASKED_LM_PROB='0.15'
SEED='12345'
DUPE_FACTOR='5'
DO_LOWER_CASE='True'

# ---------------------------------------------------------------------------
# Sharding / parallelism
# ---------------------------------------------------------------------------
# Default=396000 creates 256 shards
N_LINES_PER_SHARD_APPROX='396000'
# Adjust this based on memory requirements and available number of cores
N_PROCS_PREPROCESS='20'

# ---------------------------------------------------------------------------
# Derived output location (encodes sequence length and predictions per sequence)
# ---------------------------------------------------------------------------
OUTPUT_DIR="${WORKING_DIR}/final_tfrecords_sharded_gvocab_base"
OUTPUT_DIR="${OUTPUT_DIR}/bert_pubmed_gvocab_base_seq_${MAX_SEQUENCE_LENGTH}_pred_${MAX_PREDICTIONS_PER_SEQUENCE}"

View file

@ -0,0 +1,23 @@
#! /bin/bash
# Convert one sharded PubMed sentence file into a BERT pre-training TFRecord.
#
# Usage: preprocessing_gvocab_base.sh SHARD_INDEX
#   SHARD_INDEX  suffix of the input shard to process; the input file is
#                pubmed_sentence.part.<SHARD_INDEX>.txt.
#
# Reads VOCAB_FILE, DO_LOWER_CASE, MAX_SEQUENCE_LENGTH,
# MAX_PREDICTIONS_PER_SEQUENCE, MASKED_LM_PROB, SEED, DUPE_FACTOR and
# WORKING_DIR from config_gvocab_base.sh.

# Refuse to run without a shard index instead of silently building paths
# such as "pubmed_sentence.part..txt".
SHARD_INDEX=${1:?usage: $0 SHARD_INDEX}

# The configuration must be loaded BEFORE any path is derived from
# WORKING_DIR; otherwise the paths only resolve correctly when the caller
# happens to have exported WORKING_DIR already.
source /workspace/bert/data/pubmed/config_gvocab_base.sh || {
  echo "ERROR: could not load config_gvocab_base.sh" >&2
  exit 1
}

INPUT_FILE="${WORKING_DIR}/final_text_files_sharded_gvocab_base/pubmed_sentence.part.${SHARD_INDEX}.txt"

# Deliberately overrides the OUTPUT_DIR value coming from the config: all
# shards are written directly into this directory.
OUTPUT_DIR="${WORKING_DIR}/final_tfrecords_sharded_gvocab_base"
mkdir -p "${OUTPUT_DIR}"

# "000" is a fixed literal prefix, not zero-padding: shard 3 becomes
# tf_examples.tfrecord0003 and shard 38 becomes tf_examples.tfrecord00038.
OUTPUT_FILE="${OUTPUT_DIR}/tf_examples.tfrecord000${SHARD_INDEX}"

python /workspace/bert/create_pretraining_data.py \
  --input_file="${INPUT_FILE}" \
  --output_file="${OUTPUT_FILE}" \
  --vocab_file="${VOCAB_FILE}" \
  --do_lower_case="${DO_LOWER_CASE}" \
  --max_seq_length="${MAX_SEQUENCE_LENGTH}" \
  --max_predictions_per_seq="${MAX_PREDICTIONS_PER_SEQUENCE}" \
  --masked_lm_prob="${MASKED_LM_PROB}" \
  --random_seed="${SEED}" \
  --dupe_factor="${DUPE_FACTOR}"

View file

@ -0,0 +1,13 @@
#! /bin/bash
# Run preprocessing_gvocab_base.sh over every text shard in parallel.
#
# Counts the entries in the shard directory, writes the indices 0..N-1 (one
# per line) to an xargs argument file, then invokes the preprocessing script
# once per index with at most N_PROCS_PREPROCESS concurrent processes.
# Only the NUMBER of entries is used, not their names, so this presumably
# relies on the shards being numbered contiguously from 0 -- confirm against
# the sharding step.
source /workspace/bert/data/pubmed/config_gvocab_base.sh || {
  echo "ERROR: could not load config_gvocab_base.sh" >&2
  exit 1
}

pubmed_dir=/workspace/bert/data/pubmed
shard_dir="${pubmed_dir}/final_text_files_sharded_gvocab_base"
arg_file="${pubmed_dir}/xarg_list_gvocab_base.txt"

# With nullglob an empty or missing shard directory yields zero matches;
# without it the unmatched pattern stays literal and would be counted as one
# shard, dispatching a job for a non-existent shard 0.
shopt -s nullglob
shard_files=("${shard_dir}"/*)
shopt -u nullglob

SHARD_COUNT=${#shard_files[@]}
if [[ "${SHARD_COUNT}" -eq 0 ]]; then
  echo "ERROR: no shard files found in ${shard_dir}" >&2
  exit 1
fi

# Rebuild the argument list from scratch on every run.
rm -rf -- "${arg_file}"
for ((shard_index = 0; shard_index < SHARD_COUNT; shard_index++)); do
  printf '%s\n' "${shard_index}"
done > "${arg_file}"

xargs -n 1 \
  --max-procs="${N_PROCS_PREPROCESS:?N_PROCS_PREPROCESS must be set by the config}" \
  --arg-file="${arg_file}" \
  "${pubmed_dir}/preprocessing_gvocab_base.sh"

View file

@ -0,0 +1,687 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import glob, os, sys"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"files = glob.glob('final_tfrecords_sharded_gvocab_base/*tfrecord*')"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000466',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000136',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000183',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000254',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00038',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000525',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00012',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00066',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000451',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000343',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000290',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000194',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000363',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000340',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000357',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000291',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000117',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000517',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000444',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000408',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000520',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000168',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000325',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000396',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000382',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000273',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000478',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000259',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000333',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000392',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000263',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000100',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000253',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000504',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000375',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000465',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000127',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000239',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00033',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000356',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00081',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000164',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000119',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000512',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00086',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00063',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000454',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000523',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00058',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000483',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00015',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000370',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000352',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000349',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000302',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00052',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000438',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000422',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00070',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000436',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00089',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000270',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000498',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000215',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000509',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000202',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00051',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000332',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000141',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000176',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00071',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000191',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000271',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00073',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000295',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000329',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000516',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000533',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000417',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000334',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000547',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000515',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00065',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000206',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000344',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000282',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00021',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000485',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000150',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000439',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000441',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000106',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000493',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00010',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000405',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord0003',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00025',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000249',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000310',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000157',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000233',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000505',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00042',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00096',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000135',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000435',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000321',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000537',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00040',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000330',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000530',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000427',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000256',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000383',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00097',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000182',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00064',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000303',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000139',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000223',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000146',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000534',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000286',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000374',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000430',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000114',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord0004',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000123',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000531',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00077',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000447',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000402',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000299',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00098',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000242',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00014',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000293',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000388',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000297',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000507',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000362',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000540',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000161',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000198',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000210',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000519',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000204',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000494',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000197',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000442',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000212',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00028',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00045',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00044',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000480',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000317',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000312',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000225',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000348',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000292',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000219',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000101',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000440',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000137',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000143',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000394',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000301',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000250',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000412',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000217',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000546',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000347',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00023',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000185',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000481',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000234',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000462',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000326',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000335',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00099',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000111',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000434',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000506',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000118',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000414',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000243',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00095',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000513',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000190',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000351',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000323',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000423',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00076',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000300',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000503',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000393',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000499',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000425',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000209',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000456',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000345',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000205',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000337',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000522',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000470',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000368',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000151',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000371',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000508',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000160',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000145',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000541',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000389',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000260',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000366',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00037',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000112',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000227',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00057',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000269',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000342',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000526',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000156',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000120',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000122',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000255',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000319',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000391',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000488',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000527',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000192',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000490',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000187',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000339',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00080',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000262',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00054',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000372',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000529',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000472',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000193',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000124',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000140',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000449',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000420',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000338',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00092',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00085',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000102',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00088',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000380',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000458',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000158',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000130',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000236',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00011',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00075',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000384',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000486',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00020',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000147',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000459',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000354',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00079',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00026',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000387',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000163',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000521',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000476',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00019',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000471',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord0001',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000395',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00087',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000229',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000220',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000518',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000410',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000409',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000257',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000195',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000174',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000132',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000110',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000320',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000252',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000431',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000247',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000154',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000433',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000235',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000179',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000180',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000341',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000178',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00061',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord0006',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000386',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000437',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00031',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000463',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00074',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000381',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000171',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000543',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000167',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00047',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00034',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000331',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00036',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000361',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000479',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000358',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000285',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000173',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000428',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00022',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000266',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000116',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000487',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000224',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00018',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000426',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000419',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000413',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord0007',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000500',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00016',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00053',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000222',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000201',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000109',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000105',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000492',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000353',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000221',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000199',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000376',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000496',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000445',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000153',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00029',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000208',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000131',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000181',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord0008',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000103',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000311',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000125',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00062',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000549',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000489',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000491',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000165',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000429',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000305',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00069',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000275',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000113',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000407',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00059',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord0002',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000406',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000203',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000328',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00056',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000152',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000246',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000467',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000398',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000360',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000322',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000377',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000464',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000280',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000264',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000294',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000172',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000501',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000121',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000245',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000373',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000468',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000548',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000142',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00043',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000484',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000211',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000364',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00082',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000267',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000162',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00041',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000355',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000308',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000144',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000502',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000514',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000457',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000367',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000460',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000390',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000283',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000289',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000532',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000418',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000274',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000453',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000365',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000482',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000378',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00067',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000544',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000284',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00078',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00091',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00030',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000216',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000241',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000251',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000279',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000307',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000237',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000475',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000400',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000169',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000277',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000379',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000138',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000276',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000214',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000495',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000196',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000129',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000248',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000126',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000314',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000281',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000397',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000261',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000304',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000133',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000207',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000228',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000477',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000148',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000411',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000539',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00013',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000416',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000511',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000403',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000524',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00024',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000401',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000336',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000188',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000149',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000399',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000107',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000226',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000538',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00060',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000432',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000369',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000545',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000528',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000309',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000272',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00068',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00027',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000296',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000115',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000450',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000346',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000268',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000108',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000385',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000359',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00084',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000230',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000448',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000415',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00093',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000189',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000536',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000177',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000306',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000298',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000315',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000316',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00083',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord0005',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00050',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00055',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000200',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000186',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000244',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00072',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000175',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000218',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000443',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000213',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000240',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000265',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000170',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00090',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000535',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00017',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000159',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000155',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000497',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000461',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000510',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000327',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000318',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00032',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000424',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00039',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000313',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000474',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000469',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000287',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000421',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000231',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000455',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000452',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000166',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000542',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00094',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000404',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000278',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000288',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000446',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000473',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00035',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00048',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000258',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000350',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000238',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000128',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000184',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000104',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000134',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00049',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000232',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord0000',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000324',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord0009',\n",
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00046']"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"files"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"550"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(files)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord0003\n",
"final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord0004\n",
"final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord0001\n",
"final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord0006\n",
"final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord0007\n",
"final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord0008\n",
"final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord0002\n",
"final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord0005\n",
"final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord0000\n",
"final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord0009\n"
]
}
],
"source": [
"prefix = 'final_tfrecords_sharded_gvocab_base/tf_examples.'\n",
"for filei in files:\n",
" fname = filei.split('.')[-1]\n",
" numi = int(fname[8:])\n",
" \n",
" if numi < 10:\n",
" print(filei)\n",
" continue\n",
" elif numi >= 10 and numi < 100:\n",
" suffix = '00' + str(numi)\n",
" elif numi >= 100 and numi < 1000:\n",
" suffix = '0' + str(numi)\n",
" elif numi >= 1000:\n",
" suffix = str(numi)\n",
" else:\n",
" print(fname)\n",
" print('!!!')\n",
" break\n",
" \n",
" fname_ = fname[:8] + suffix\n",
" newfname = prefix + fname_\n",
" \n",
" os.rename(filei, newfname)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.2"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View file

@ -0,0 +1,28 @@
#! /bin/bash
# Build BERT pretraining data for the PubMed corpus (gvocab_base variant):
#   1. shard the single one-sentence-per-line text file into smaller text files;
#   2. convert every text shard into a tfrecord file.
#
# All paths are relative to WORKING_DIR, which is exported by the sourced
# config file; N_PROCS_PREPROCESS is also defined there.

# Abort early if the config cannot be loaded: every path below is derived from
# WORKING_DIR, and an empty value would make mkdir operate on "/".
source /workspace/bert/data/pubmed/config_gvocab_base.sh || exit 1
: "${WORKING_DIR:?WORKING_DIR must be set by config_gvocab_base.sh}"

# The steps below are disabled. They reference the /workspace/bookcorpus
# tooling and are kept only as a record of the earlier stages of the pipeline.

# Download books
#mkdir -p download
#python3 /workspace/bookcorpus/download_files.py --list /workspace/bookcorpus/url_list.jsonl --out ${WORKING_DIR}/download --trash-bad-count

# Clean and prep (one book per line)
#mkdir -p ${WORKING_DIR}/intermediate_files
#python3 ${WORKING_DIR}/clean_and_merge_text.py

# Split books into one-sentence-per-line format for use with BERT scripts
#echo "Applying sentence segmentation to get one sentence per line"
#mkdir -p ${WORKING_DIR}/final_text_file_single
#python3 ${WORKING_DIR}/sentence_segmentation_nltk.py
# Note: NLTK can be replaced with Spacy, although it is slower (2 variations provided)

# Shard finalized text so that it has a chance of fitting in memory when
# creating pretraining data into tfrecords (choose an appropriate number of
# shards for distributed training).
echo "Shard text files - size is approximate to prevent splitting a book across shards"
mkdir -p "${WORKING_DIR}/final_text_files_sharded_gvocab_base"
python3 "${WORKING_DIR}/shard_text_input_file_gvocab_base.py"

# Convert sharded text files into tfrecords that are ready for BERT pretraining.
# The wrapper is sourced (not executed) so it runs in this shell and sees the
# variables defined by the config; its single argument is the process count.
echo "Creating tfrecords for each text shard"
mkdir -p "${WORKING_DIR}/final_tfrecords_sharded_gvocab_base"
. "${WORKING_DIR}/preprocessing_xargs_wrapper_gvocab_base.sh" "${N_PROCS_PREPROCESS}"

View file

@ -0,0 +1,43 @@
# NVIDIA
import os
from pdb import set_trace as bp
# Default number of lines per shard. Approximate: once this many lines are
# buffered, the shard is closed at the next blank line (article break).
shard_size = 396000


def _write_shard(path, lines):
    """Write the buffered ``lines`` to ``path``, replacing any existing file."""
    with open(path, "w") as ofile:
        ofile.writelines(lines)


def shard_text_file(input_path, output_prefix, max_lines=shard_size):
    """Split a text file into numbered shard files at blank-line boundaries.

    Lines are buffered until at least ``max_lines`` have been collected; the
    shard is then closed at the next blank line (``"\\n"``), so a document is
    never split across two shards. The blank line that closes a shard is not
    written to either shard. Blank lines seen before the threshold is reached
    are kept inside the shard.

    Whatever is still buffered when the input ends is written as a final
    shard, so no trailing lines are lost.

    Args:
        input_path: path of the text file to split.
        output_prefix: shard ``i`` is written to ``output_prefix + str(i) + ".txt"``.
        max_lines: minimum number of lines per shard before a split is allowed.

    Returns:
        A tuple ``(lines_read, shards_written)``.

    Raises:
        ValueError: if ``max_lines`` is smaller than 1.
    """
    if max_lines < 1:
        raise ValueError("max_lines must be >= 1, got %r" % (max_lines,))

    lines_read = 0
    shard_index = 0
    line_buffer = []

    with open(input_path) as ifile:
        for line in ifile:
            lines_read += 1
            if len(line_buffer) >= max_lines and line == "\n":
                # Shard is full and we are at an article break: close it here.
                _write_shard(output_prefix + str(shard_index) + ".txt", line_buffer)
                line_buffer = []
                shard_index += 1
            else:
                line_buffer.append(line)

    # Flush the remainder. Without this the last (partial) shard is lost.
    if line_buffer:
        _write_shard(output_prefix + str(shard_index) + ".txt", line_buffer)
        shard_index += 1

    return lines_read, shard_index


def main():
    """Shard the PubMed sentence file located under ``$WORKING_DIR``."""
    working_dir = os.environ['WORKING_DIR']
    input_file = working_dir + '/final_text_file_single/pubmed_sentence_nltk_uncased.txt'
    output_file = working_dir + '/final_text_files_sharded_gvocab_base/pubmed_sentence.part.'

    lines_read, _ = shard_text_file(input_file, output_file)
    print("Input file contains", lines_read, "lines.")


if __name__ == "__main__":
    main()

View file

@ -0,0 +1,108 @@
#! /bin/bash
# Launch BERT pretraining (run_pretraining.py) on the sharded PubMed tfrecords.
#
# Positional arguments (all optional):
#   $1 train_batch_size        (default 14)
#   $2 eval_batch_size         (default 8)
#   $3 learning_rate           (default 1e-4)
#   $4 precision               fp16 | fp16_xla | fp32 | amp | amp_xla (default fp16_xla)
#   $5 num_gpus                (default 8)
#   $6 warmup_steps            (default 10000)
#   $7 train_steps             (default 1144000)
#   $8 save_checkpoint_steps   (default 5000)
#   $9 create_logfile          "true" to tee output into /results (default true)

echo "Container nvidia build = " "$NVIDIA_BUILD_ID"

# WIKI_DIR=/workspace/bert/data/wikipedia_corpus/final_tfrecords_sharded
# BOOKS_DIR=/workspace/bert/data/bookcorpus/final_tfrecords_sharded
PUBMED_DIR=/workspace/bert/data/pubmed/final_tfrecords_sharded_gvocab_base
# NOTE(review): this points at the BERT-large (L-24_H-1024_A-16) config while
# the data directory is named "gvocab_base" — confirm this is intended.
BERT_CONFIG=/workspace/bert/data/pretrained_models_google/uncased_L-24_H-1024_A-16/bert_config.json
RESULTS_DIR=/results
CHECKPOINTS_DIR=/results/checkpoints

# if [ ! -d "$WIKI_DIR" ] ; then
#    echo "Error! $WIKI_DIR directory missing. Please mount wikipedia dataset."
#    exit -1
# else
#    SOURCES="$WIKI_DIR/*"
# fi
# if [ ! -d "$BOOKS_DIR" ] ; then
#    echo "Warning! $BOOKS_DIR directory missing. Training will proceed without book corpus."
# else
#    SOURCES+=" $BOOKS_DIR/*"
# fi

if [ ! -d "$PUBMED_DIR" ] ; then
   echo "Error! $PUBMED_DIR directory missing. Please mount pubmed dataset."
   exit -1
fi

# Collect the input shards with a glob instead of parsing `ls`; nullglob makes
# an empty directory yield an empty array rather than the literal pattern.
shopt -s nullglob
input_files=("$PUBMED_DIR"/*)
shopt -u nullglob
if [ "${#input_files[@]}" -eq 0 ] ; then
   echo "Error! No input files found in $PUBMED_DIR."
   exit -1
fi

if [ ! -d "$RESULTS_DIR" ] ; then
   echo "Error! $RESULTS_DIR directory missing."
   exit -1
fi
if [ ! -d "$CHECKPOINTS_DIR" ] ; then
   echo "Warning! $CHECKPOINTS_DIR directory missing."
   echo "Checkpoints will be written to $RESULTS_DIR instead."
   CHECKPOINTS_DIR=$RESULTS_DIR
fi
if [ ! -f "$BERT_CONFIG" ] ; then
   echo "Error! BERT large configuration file not found at $BERT_CONFIG"
   exit -1
fi

train_batch_size=${1:-14}
eval_batch_size=${2:-8}
learning_rate=${3:-"1e-4"}
precision=${4:-"fp16_xla"}
num_gpus=${5:-8}
warmup_steps=${6:-"10000"}
train_steps=${7:-1144000}
save_checkpoint_steps=${8:-5000}
create_logfile=${9:-"true"}

# Translate the precision keyword into run_pretraining.py flags.
PREC=()
case "$precision" in
   fp16)     PREC=(--use_fp16) ;;
   fp16_xla) PREC=(--use_fp16 --use_xla) ;;
   fp32)     PREC=() ;;
   amp)      PREC=(--amp) ;;
   amp_xla)  PREC=(--amp --use_xla) ;;
   *)
      echo "Unknown <precision> argument"
      exit -2
      ;;
esac

echo "${input_files[@]}"

# run_pretraining.py takes the shards as one comma-separated list.
INPUT_FILES=$(IFS=,; printf '%s' "${input_files[*]}")

# Build the command as an array so each argument stays a single word.
CMD=(python3 /workspace/bert/run_pretraining.py
   "--input_file=$INPUT_FILES"
   "--output_dir=$CHECKPOINTS_DIR"
   "--bert_config_file=$BERT_CONFIG"
   --do_train=True
   --do_eval=True
   "--train_batch_size=$train_batch_size"
   "--eval_batch_size=$eval_batch_size"
   --max_seq_length=512
   --max_predictions_per_seq=80
   "--num_train_steps=$train_steps"
   "--num_warmup_steps=$warmup_steps"
   "--save_checkpoint_steps=$save_checkpoint_steps"
   "--learning_rate=$learning_rate"
   --report_loss
   --horovod "${PREC[@]}")

# Numeric comparison: `[ $num_gpus > 1 ]` would redirect into a file named "1"
# and always succeed.
if [ "$num_gpus" -gt 1 ] ; then
   CMD=(mpiexec --allow-run-as-root -np "$num_gpus" --bind-to socket "${CMD[@]}")
fi

if [ "$create_logfile" = "true" ] ; then
  export GBS=$(( train_batch_size * num_gpus ))
  printf -v TAG "tf_bert_1n_%s_gbs%d" "$precision" "$GBS"
  DATESTAMP=$(date +'%y%m%d%H%M%S')
  LOGFILE=$RESULTS_DIR/$TAG.$DATESTAMP.log
  printf "Logs written to %s\n" "$LOGFILE"
fi

set -x
if [ -z "$LOGFILE" ] ; then
   "${CMD[@]}"
else
   (
     "${CMD[@]}"
   ) |& tee "$LOGFILE"
fi
set +x