25 lines
593 B
Bash
25 lines
593 B
Bash
|
#! /bin/bash
|
||
|
|
||
|
set -e
|
||
|
|
||
|
USE_BERT_LARGE=true
|
||
|
MAX_SEQUENCE_LENGTH=512
|
||
|
MAX_PREDICTIONS_PER_SEQUENCE=80
|
||
|
MASKED_LM_PROB=0.15
|
||
|
SEED=12345
|
||
|
DUPE_FACTOR=5
|
||
|
DO_LOWER_CASE="True"
|
||
|
N_LINES_PER_SHARD_APPROX=396000 # Default=396000 creates 256 shards
|
||
|
|
||
|
N_PROCS_PREPROCESS=4 # Adjust this based on memory requirements and available number of cores
|
||
|
|
||
|
BERT_BASE_DIR="/workspace/bert/vocab/uncased_L-12_H-768_A-12"
|
||
|
BERT_LARGE_DIR="/workspace/bert/vocab/uncased_L-24_H-1024_A-16"
|
||
|
|
||
|
if [ "$USE_BERT_LARGE" = true ] ; then
|
||
|
VOCAB_FILE="${BERT_LARGE_DIR}/vocab.txt"
|
||
|
else
|
||
|
VOCAB_FILE="${BERT_BASE_DIR}/vocab.txt"
|
||
|
fi
|
||
|
|