initial commit on training PubMed-gvocab-base.
This commit is contained in:
parent
b7eff3da41
commit
65bf7dbe48
28
TensorFlow/LanguageModeling/BERT/data/pubmed/config_gvocab_base.sh
Executable file
28
TensorFlow/LanguageModeling/BERT/data/pubmed/config_gvocab_base.sh
Executable file
|
@ -0,0 +1,28 @@
|
|||
#! /bin/bash
#
# Shared settings for turning sharded PubMed text into BERT pretraining
# TFRecords using the Google uncased base vocabulary.
#
# This file is meant to be sourced. Because it is sourced, the `set -e` below
# also takes effect in the shell that sources it.

set -e

# NOTE(review): USE_BERT_LARGE is only consulted by the commented-out selection
# further down; VOCAB_FILE always points at the base model. Confirm that no
# other script reads this flag before changing its value.
USE_BERT_LARGE=true
MAX_SEQUENCE_LENGTH=512
MAX_PREDICTIONS_PER_SEQUENCE=80
MASKED_LM_PROB=0.15
SEED=12345
DUPE_FACTOR=5
DO_LOWER_CASE="True"
N_LINES_PER_SHARD_APPROX=396000 # Default=396000 creates 256 shards

N_PROCS_PREPROCESS=20 # Adjust this based on memory requirements and available number of cores

# Absolute directory of this file. The assignment is kept separate from the
# export: `export VAR="$(cmd)"` returns the status of `export`, which would
# hide a failing `cd` from `set -e`.
WORKING_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)"
export WORKING_DIR

# Earlier, WORKING_DIR-relative model locations (kept for reference):
#BERT_BASE_DIR="${WORKING_DIR}/pretrained_models_google/uncased_L-12_H-768_A-12"
#BERT_LARGE_DIR="${WORKING_DIR}/pretrained_models_google/uncased_L-24_H-1024_A-16"
BERT_BASE_DIR="/workspace/bert/data/pretrained_models_google/uncased_L-12_H-768_A-12"

# The base/large switch is disabled; the base vocabulary is always used.
#if [ "$USE_BERT_LARGE" = true ] ; then
#  VOCAB_FILE="${BERT_LARGE_DIR}/vocab.txt"
#else
VOCAB_FILE="${BERT_BASE_DIR}/vocab.txt"
#fi

# Output location encoding the sequence length and predictions-per-sequence.
OUTPUT_DIR="${WORKING_DIR}/final_tfrecords_sharded_gvocab_base/bert_pubmed_gvocab_base_seq_${MAX_SEQUENCE_LENGTH}_pred_${MAX_PREDICTIONS_PER_SEQUENCE}"
|
||||
|
23
TensorFlow/LanguageModeling/BERT/data/pubmed/preprocessing_gvocab_base.sh
Executable file
23
TensorFlow/LanguageModeling/BERT/data/pubmed/preprocessing_gvocab_base.sh
Executable file
|
@ -0,0 +1,23 @@
|
|||
#! /bin/bash
#
# Convert one sharded PubMed text file into a BERT pretraining TFRecord.
#
# Usage: preprocessing_gvocab_base.sh SHARD_INDEX
#   SHARD_INDEX  selects pubmed_sentence.part.<SHARD_INDEX>.txt as input.

SHARD_INDEX=${1}

# Load the shared settings first: the config defines WORKING_DIR (used by every
# path below) as well as the vocabulary and masking parameters. Building paths
# before this point only works when a parent process has already exported
# WORKING_DIR.
source /workspace/bert/data/pubmed/config_gvocab_base.sh

if [[ -z "${SHARD_INDEX}" ]]; then
  echo "usage: ${0##*/} SHARD_INDEX" >&2
  exit 1
fi

INPUT_FILE="${WORKING_DIR}/final_text_files_sharded_gvocab_base/pubmed_sentence.part.${SHARD_INDEX}.txt"

# Deliberately replaces the OUTPUT_DIR set by the config: all shards are
# written directly into the top-level TFRecord directory.
OUTPUT_DIR="${WORKING_DIR}/final_tfrecords_sharded_gvocab_base"
mkdir -p "${OUTPUT_DIR}"

# The "000" is a fixed prefix, not zero padding, so names vary in width
# (index 3 -> tfrecord0003, index 38 -> tfrecord00038). Kept unchanged so that
# existing consumers of these file names keep working.
OUTPUT_FILE="${OUTPUT_DIR}/tf_examples.tfrecord000${SHARD_INDEX}"

python /workspace/bert/create_pretraining_data.py \
  --input_file="${INPUT_FILE}" \
  --output_file="${OUTPUT_FILE}" \
  --vocab_file="${VOCAB_FILE}" \
  --do_lower_case="${DO_LOWER_CASE}" \
  --max_seq_length="${MAX_SEQUENCE_LENGTH}" \
  --max_predictions_per_seq="${MAX_PREDICTIONS_PER_SEQUENCE}" \
  --masked_lm_prob="${MASKED_LM_PROB}" \
  --random_seed="${SEED}" \
  --dupe_factor="${DUPE_FACTOR}"
|
||||
|
|
@ -0,0 +1,13 @@
|
|||
#! /bin/bash
#
# Run preprocessing_gvocab_base.sh once per text shard, in parallel.
#
# Writes one shard index per line to an argument file, then lets xargs start
# up to N_PROCS_PREPROCESS workers, each receiving a single index.

source /workspace/bert/data/pubmed/config_gvocab_base.sh

PUBMED_DIR="/workspace/bert/data/pubmed"
SHARD_DIR="${PUBMED_DIR}/final_text_files_sharded_gvocab_base"
XARG_LIST="${PUBMED_DIR}/xarg_list_gvocab_base.txt"

# Without nullglob an empty shard directory leaves the pattern unexpanded and
# the loop would run once, queuing a shard 0 that does not exist.
shopt -s nullglob

# Start from a fresh argument file on every run.
rm -rf -- "${XARG_LIST}"
touch "${XARG_LIST}"

# NOTE(review): indices are derived by counting directory entries, which
# assumes the shards are named pubmed_sentence.part.<0..N-1>.txt with no gaps
# and no unrelated files in the directory - confirm against the sharding step.
SHARD_COUNT=0
for _ in "${SHARD_DIR}"/*; do
  echo "${SHARD_COUNT}" >> "${XARG_LIST}"
  SHARD_COUNT=$((SHARD_COUNT+1))
done

if [[ "${SHARD_COUNT}" -eq 0 ]]; then
  echo "no input shards found in ${SHARD_DIR}" >&2
  exit 1
fi

xargs -n 1 --max-procs="${N_PROCS_PREPROCESS}" --arg-file="${XARG_LIST}" "${PUBMED_DIR}/preprocessing_gvocab_base.sh"
|
|
@ -0,0 +1,687 @@
|
|||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import glob, os, sys"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"files = glob.glob('final_tfrecords_sharded_gvocab_base/*tfrecord*')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"['final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000466',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000136',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000183',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000254',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00038',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000525',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00012',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00066',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000451',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000343',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000290',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000194',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000363',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000340',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000357',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000291',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000117',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000517',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000444',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000408',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000520',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000168',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000325',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000396',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000382',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000273',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000478',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000259',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000333',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000392',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000263',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000100',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000253',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000504',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000375',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000465',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000127',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000239',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00033',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000356',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00081',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000164',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000119',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000512',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00086',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00063',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000454',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000523',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00058',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000483',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00015',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000370',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000352',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000349',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000302',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00052',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000438',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000422',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00070',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000436',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00089',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000270',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000498',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000215',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000509',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000202',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00051',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000332',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000141',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000176',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00071',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000191',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000271',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00073',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000295',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000329',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000516',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000533',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000417',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000334',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000547',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000515',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00065',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000206',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000344',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000282',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00021',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000485',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000150',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000439',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000441',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000106',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000493',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00010',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000405',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord0003',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00025',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000249',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000310',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000157',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000233',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000505',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00042',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00096',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000135',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000435',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000321',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000537',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00040',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000330',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000530',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000427',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000256',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000383',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00097',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000182',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00064',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000303',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000139',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000223',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000146',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000534',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000286',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000374',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000430',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000114',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord0004',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000123',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000531',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00077',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000447',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000402',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000299',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00098',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000242',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00014',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000293',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000388',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000297',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000507',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000362',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000540',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000161',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000198',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000210',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000519',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000204',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000494',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000197',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000442',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000212',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00028',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00045',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00044',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000480',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000317',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000312',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000225',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000348',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000292',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000219',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000101',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000440',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000137',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000143',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000394',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000301',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000250',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000412',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000217',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000546',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000347',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00023',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000185',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000481',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000234',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000462',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000326',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000335',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00099',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000111',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000434',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000506',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000118',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000414',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000243',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00095',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000513',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000190',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000351',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000323',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000423',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00076',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000300',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000503',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000393',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000499',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000425',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000209',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000456',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000345',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000205',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000337',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000522',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000470',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000368',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000151',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000371',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000508',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000160',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000145',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000541',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000389',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000260',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000366',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00037',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000112',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000227',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00057',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000269',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000342',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000526',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000156',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000120',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000122',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000255',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000319',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000391',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000488',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000527',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000192',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000490',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000187',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000339',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00080',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000262',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00054',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000372',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000529',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000472',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000193',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000124',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000140',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000449',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000420',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000338',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00092',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00085',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000102',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00088',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000380',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000458',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000158',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000130',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000236',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00011',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00075',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000384',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000486',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00020',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000147',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000459',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000354',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00079',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00026',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000387',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000163',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000521',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000476',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00019',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000471',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord0001',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000395',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00087',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000229',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000220',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000518',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000410',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000409',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000257',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000195',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000174',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000132',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000110',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000320',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000252',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000431',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000247',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000154',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000433',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000235',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000179',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000180',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000341',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000178',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00061',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord0006',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000386',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000437',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00031',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000463',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00074',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000381',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000171',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000543',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000167',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00047',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00034',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000331',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00036',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000361',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000479',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000358',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000285',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000173',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000428',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00022',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000266',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000116',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000487',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000224',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00018',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000426',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000419',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000413',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord0007',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000500',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00016',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00053',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000222',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000201',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000109',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000105',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000492',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000353',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000221',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000199',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000376',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000496',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000445',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000153',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00029',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000208',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000131',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000181',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord0008',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000103',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000311',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000125',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00062',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000549',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000489',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000491',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000165',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000429',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000305',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00069',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000275',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000113',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000407',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00059',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord0002',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000406',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000203',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000328',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00056',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000152',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000246',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000467',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000398',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000360',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000322',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000377',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000464',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000280',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000264',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000294',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000172',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000501',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000121',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000245',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000373',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000468',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000548',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000142',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00043',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000484',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000211',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000364',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00082',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000267',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000162',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00041',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000355',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000308',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000144',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000502',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000514',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000457',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000367',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000460',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000390',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000283',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000289',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000532',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000418',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000274',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000453',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000365',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000482',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000378',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00067',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000544',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000284',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00078',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00091',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00030',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000216',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000241',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000251',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000279',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000307',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000237',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000475',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000400',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000169',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000277',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000379',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000138',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000276',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000214',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000495',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000196',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000129',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000248',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000126',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000314',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000281',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000397',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000261',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000304',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000133',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000207',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000228',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000477',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000148',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000411',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000539',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00013',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000416',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000511',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000403',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000524',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00024',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000401',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000336',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000188',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000149',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000399',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000107',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000226',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000538',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00060',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000432',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000369',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000545',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000528',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000309',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000272',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00068',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00027',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000296',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000115',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000450',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000346',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000268',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000108',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000385',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000359',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00084',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000230',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000448',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000415',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00093',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000189',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000536',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000177',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000306',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000298',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000315',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000316',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00083',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord0005',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00050',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00055',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000200',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000186',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000244',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00072',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000175',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000218',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000443',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000213',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000240',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000265',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000170',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00090',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000535',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00017',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000159',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000155',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000497',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000461',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000510',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000327',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000318',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00032',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000424',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00039',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000313',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000474',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000469',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000287',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000421',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000231',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000455',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000452',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000166',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000542',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00094',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000404',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000278',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000288',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000446',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000473',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00035',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00048',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000258',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000350',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000238',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000128',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000184',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000104',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000134',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00049',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000232',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord0000',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord000324',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord0009',\n",
|
||||
" 'final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord00046']"
|
||||
]
|
||||
},
|
||||
"execution_count": 10,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"files"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"550"
|
||||
]
|
||||
},
|
||||
"execution_count": 11,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"len(files)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 12,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord0003\n",
|
||||
"final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord0004\n",
|
||||
"final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord0001\n",
|
||||
"final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord0006\n",
|
||||
"final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord0007\n",
|
||||
"final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord0008\n",
|
||||
"final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord0002\n",
|
||||
"final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord0005\n",
|
||||
"final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord0000\n",
|
||||
"final_tfrecords_sharded_gvocab_base/tf_examples.tfrecord0009\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"prefix = 'final_tfrecords_sharded_gvocab_base/tf_examples.'\n",
|
||||
"for filei in files:\n",
|
||||
" fname = filei.split('.')[-1]\n",
|
||||
" numi = int(fname[8:])\n",
|
||||
" \n",
|
||||
" if numi < 10:\n",
|
||||
" print(filei)\n",
|
||||
" continue\n",
|
||||
" elif numi >= 10 and numi < 100:\n",
|
||||
" suffix = '00' + str(numi)\n",
|
||||
" elif numi >= 100 and numi < 1000:\n",
|
||||
" suffix = '0' + str(numi)\n",
|
||||
" elif numi >= 1000:\n",
|
||||
" suffix = str(numi)\n",
|
||||
" else:\n",
|
||||
" print(fname)\n",
|
||||
" print('!!!')\n",
|
||||
" break\n",
|
||||
" \n",
|
||||
" fname_ = fname[:8] + suffix\n",
|
||||
" newfname = prefix + fname_\n",
|
||||
" \n",
|
||||
" os.rename(filei, newfname)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.5.2"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
|
@ -0,0 +1,28 @@
|
|||
#! /bin/bash
#
# Builds the gvocab_base PubMed pretraining data: shards the single
# one-sentence-per-line text file, then converts every shard into tfrecords.
#
# The sourced config is expected to provide WORKING_DIR and
# N_PROCS_PREPROCESS.

source /workspace/bert/data/pubmed/config_gvocab_base.sh || exit 1

# Abort early instead of creating directories under "/" when the config did
# not define the working directory.
: "${WORKING_DIR:?WORKING_DIR must be set by config_gvocab_base.sh}"

# The download / clean / sentence-segmentation stages below are disabled
# leftovers from the BookCorpus pipeline; this script starts from an already
# prepared single text file.

# Download books
#mkdir -p download
#python3 /workspace/bookcorpus/download_files.py --list /workspace/bookcorpus/url_list.jsonl --out ${WORKING_DIR}/download --trash-bad-count

# Clean and prep (one book per line)
#mkdir -p ${WORKING_DIR}/intermediate_files
#python3 ${WORKING_DIR}/clean_and_merge_text.py

# Split books into one-sentence-per-line format for use with BERT scripts
#echo "Applying sentence segmentation to get one sentence per line"
#mkdir -p ${WORKING_DIR}/final_text_file_single
#python3 ${WORKING_DIR}/sentence_segmentation_nltk.py
# Note: NLTK can be replaced with Spacy, although it is slower (2 variations provided)

# Shard the finalized text so that each piece has a chance of fitting in
# memory when it is converted into tfrecords (choose an appropriate number of
# shards for distributed training).
echo "Shard text files - size is approximate to prevent splitting a book across shards"
mkdir -p "${WORKING_DIR}/final_text_files_sharded_gvocab_base"
python3 "${WORKING_DIR}/shard_text_input_file_gvocab_base.py"

# Convert the sharded text files into tfrecords that are ready for BERT
# pretraining. The wrapper is sourced (not executed) so that it sees the
# variables defined by the config above.
echo "Creating tfrecords for each text shard"
mkdir -p "${WORKING_DIR}/final_tfrecords_sharded_gvocab_base"
. "${WORKING_DIR}/preprocessing_xargs_wrapper_gvocab_base.sh" "${N_PROCS_PREPROCESS}"
|
||||
|
|
@ -0,0 +1,43 @@
|
|||
# NVIDIA
|
||||
|
||||
import os
|
||||
|
||||
from pdb import set_trace as bp
|
||||
|
||||
def _write_shard(output_prefix, shard_index, lines):
    """Write `lines` verbatim to '<output_prefix><shard_index>.txt'."""
    with open(output_prefix + str(shard_index) + ".txt", "w") as ofile:
        ofile.writelines(lines)


def shard_text_file(input_path, output_prefix, shard_size, doc_separator="\n"):
    """Split a text file into shards of roughly `shard_size` lines each.

    A shard is closed only at a document separator line, so a document is
    never split across two shards; the separator that closes a shard is not
    written to either side. Separators seen before the shard has reached
    `shard_size` lines are kept inside the shard.

    Unlike the previous two-pass loop, the lines remaining after the last
    separator are always flushed, so the tail of the input is no longer lost.

    Args:
        input_path: path of the text file to split.
        output_prefix: shards are written to '<output_prefix><index>.txt'.
        shard_size: minimum number of lines collected before a shard may be
            closed at the next separator.
        doc_separator: exact line content that marks a document boundary.

    Returns:
        A tuple (lines_read, shards_written).
    """
    line_buffer = []
    lines_read = 0
    shard_index = 0

    with open(input_path) as ifile:
        for line in ifile:
            lines_read += 1
            if len(line_buffer) >= shard_size and line == doc_separator:
                # Shard is full and we are at a document boundary: emit it.
                _write_shard(output_prefix, shard_index, line_buffer)
                line_buffer = []
                shard_index += 1
            else:
                line_buffer.append(line)

    # Flush the final, possibly short, shard.
    if line_buffer:
        _write_shard(output_prefix, shard_index, line_buffer)
        shard_index += 1

    return lines_read, shard_index


if __name__ == "__main__":
    input_file = os.environ['WORKING_DIR'] + '/final_text_file_single/pubmed_sentence_nltk_uncased.txt'
    output_file = os.environ['WORKING_DIR'] + '/final_text_files_sharded_gvocab_base/pubmed_sentence.part.'

    doc_seperator = "\n"

    shard_size = 396000  # Approximate, will split at next article break

    ifile_lines, shard_count = shard_text_file(
        input_file, output_file, shard_size, doc_seperator)

    print("Input file contains", ifile_lines, "lines.")
    print("Wrote", shard_count, "shards.")
|
||||
|
||||
|
108
TensorFlow/LanguageModeling/BERT/scripts/run_pretraining-pubmed_gvocab_base.sh
Executable file
108
TensorFlow/LanguageModeling/BERT/scripts/run_pretraining-pubmed_gvocab_base.sh
Executable file
|
@ -0,0 +1,108 @@
|
|||
#! /bin/bash
#
# Launches BERT pretraining on the sharded PubMed tfrecords.
#
# Usage:
#   run_pretraining-pubmed_gvocab_base.sh [train_batch_size] [eval_batch_size]
#       [learning_rate] [precision] [num_gpus] [warmup_steps] [train_steps]
#       [save_checkpoint_steps] [create_logfile]
#
# precision is one of: fp16, fp16_xla, fp32, amp, amp_xla.
# Exits 255 when a required path is missing, 254 on an unknown precision,
# otherwise with the exit status of the training command.

echo "Container nvidia build = " "$NVIDIA_BUILD_ID"

# WIKI_DIR=/workspace/bert/data/wikipedia_corpus/final_tfrecords_sharded
# BOOKS_DIR=/workspace/bert/data/bookcorpus/final_tfrecords_sharded
PUBMED_DIR=/workspace/bert/data/pubmed/final_tfrecords_sharded_gvocab_base
# NOTE(review): the script name says "base" but this is the L-24/H-1024
# config directory -- confirm this is the intended model size.
BERT_CONFIG=/workspace/bert/data/pretrained_models_google/uncased_L-24_H-1024_A-16/bert_config.json
RESULTS_DIR=/results
CHECKPOINTS_DIR=/results/checkpoints

# if [ ! -d "$WIKI_DIR" ] ; then
#    echo "Error! $WIKI_DIR directory missing. Please mount wikipedia dataset."
#    exit -1
# else
#    SOURCES="$WIKI_DIR/*"
# fi
# if [ ! -d "$BOOKS_DIR" ] ; then
#    echo "Warning! $BOOKS_DIR directory missing. Training will proceed without book corpus."
# else
#    SOURCES+=" $BOOKS_DIR/*"
# fi
if [ ! -d "$PUBMED_DIR" ] ; then
  # Report the directory that was actually checked (was $WIKI_DIR, which is
  # never set in this script).
  echo "Error! $PUBMED_DIR directory missing. Please mount pubmed dataset." >&2
  exit 255
fi
if [ ! -d "$RESULTS_DIR" ] ; then
  echo "Error! $RESULTS_DIR directory missing." >&2
  exit 255
fi
if [ ! -d "$CHECKPOINTS_DIR" ] ; then
  echo "Warning! $CHECKPOINTS_DIR directory missing."
  echo "Checkpoints will be written to $RESULTS_DIR instead."
  CHECKPOINTS_DIR=$RESULTS_DIR
fi
if [ ! -f "$BERT_CONFIG" ] ; then
  echo "Error! BERT large configuration file not found at $BERT_CONFIG" >&2
  exit 255
fi

# Positional arguments, all optional.
train_batch_size=${1:-14}
eval_batch_size=${2:-8}
learning_rate=${3:-"1e-4"}
precision=${4:-"fp16_xla"}
num_gpus=${5:-8}
warmup_steps=${6:-"10000"}
train_steps=${7:-1144000}
save_checkpoint_steps=${8:-5000}
create_logfile=${9:-"true"}

# Translate the precision keyword into run_pretraining.py flags.
prec_flags=()
case "$precision" in
  fp16)     prec_flags=(--use_fp16) ;;
  fp16_xla) prec_flags=(--use_fp16 --use_xla) ;;
  fp32)     prec_flags=() ;;
  amp)      prec_flags=(--amp) ;;
  amp_xla)  prec_flags=(--amp --use_xla) ;;
  *)
    echo "Unknown <precision> argument" >&2
    exit 254
    ;;
esac

# Collect the input shards with a glob instead of parsing `ls` output.
shopt -s nullglob
input_files=("$PUBMED_DIR"/*)
shopt -u nullglob
if [ "${#input_files[@]}" -eq 0 ] ; then
  echo "Error! No input files found in $PUBMED_DIR." >&2
  exit 255
fi
echo "${input_files[*]}"
# run_pretraining.py takes the shards as one comma-separated list.
INPUT_FILES=$(IFS=, ; printf '%s' "${input_files[*]}")

# Build the command as an array so that every argument stays one word.
cmd=(python3 /workspace/bert/run_pretraining.py
  "--input_file=$INPUT_FILES"
  "--output_dir=$CHECKPOINTS_DIR"
  "--bert_config_file=$BERT_CONFIG"
  --do_train=True
  --do_eval=True
  "--train_batch_size=$train_batch_size"
  "--eval_batch_size=$eval_batch_size"
  --max_seq_length=512
  --max_predictions_per_seq=80
  "--num_train_steps=$train_steps"
  "--num_warmup_steps=$warmup_steps"
  "--save_checkpoint_steps=$save_checkpoint_steps"
  "--learning_rate=$learning_rate"
  --report_loss
  --horovod "${prec_flags[@]}")

# Numeric comparison: the former `[ $num_gpus > 1 ]` was a redirection that
# always succeeded and left a stray file named "1" in the working directory.
if [ "$num_gpus" -gt 1 ] ; then
  cmd=(mpiexec --allow-run-as-root -np "$num_gpus" --bind-to socket "${cmd[@]}")
fi

if [ "$create_logfile" = "true" ] ; then
  # Global batch size, used only to tag the log file name.
  export GBS=$(( train_batch_size * num_gpus ))
  printf -v TAG "tf_bert_1n_%s_gbs%d" "$precision" "$GBS"
  DATESTAMP=$(date +'%y%m%d%H%M%S')
  LOGFILE=$RESULTS_DIR/$TAG.$DATESTAMP.log
  printf "Logs written to %s\n" "$LOGFILE"
fi

set -x
if [ -z "$LOGFILE" ] ; then
  "${cmd[@]}"
  status=$?
else
  "${cmd[@]}" |& tee "$LOGFILE"
  # Status of the training command, not of tee.
  status=${PIPESTATUS[0]}
fi
set +x

exit "$status"
|
Loading…
Reference in a new issue