DeepLearningExamples/TensorFlow/Translation/GNMT/utils/vocab_utils.py

# Copyright 2017 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Utility to handle vocabularies."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import codecs
import os
import tensorflow as tf
from tensorflow.python.ops import lookup_ops
from utils import misc_utils as utils
# word level special token
UNK = "<unk>"
SOS = "<s>"
EOS = "</s>"
UNK_ID = 0
# char ids 0-255 come from utf-8 encoding bytes
# assign 256-300 to special chars
BOS_CHAR_ID = 256 # <begin sentence>
EOS_CHAR_ID = 257 # <end sentence>
BOW_CHAR_ID = 258 # <begin word>
EOW_CHAR_ID = 259 # <end word>
PAD_CHAR_ID = 260 # <padding>
DEFAULT_CHAR_MAXLEN = 50 # max number of chars for each word.
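
# For orientation: with this scheme a word such as "cat" is encoded as
#   [BOW_CHAR_ID, 99, 97, 116, EOW_CHAR_ID, PAD_CHAR_ID, ..., PAD_CHAR_ID]
# i.e. its UTF-8 bytes bracketed by begin/end-of-word markers and padded out
# to DEFAULT_CHAR_MAXLEN entries; _string_to_bytes below then shifts every
# id by +1.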


def _string_to_bytes(text, max_length):
  """Given string and length, convert to byte seq of at most max_length.

  This process mimics docqa/elmo's preprocessing:
  https://github.com/allenai/document-qa/blob/master/docqa/elmo/data.py

  Note that we make use of BOS_CHAR_ID and EOS_CHAR_ID in iterator_utils.py &
  our usage differs from docqa/elmo.

  Args:
    text: tf.string tensor of shape []
    max_length: max number of chars for each word.

  Returns:
    A tf.int32 tensor of the byte encoded text.
  """
  byte_ids = tf.to_int32(tf.decode_raw(text, tf.uint8))
  byte_ids = byte_ids[:max_length - 2]
  padding = tf.fill([max_length - tf.shape(byte_ids)[0] - 2], PAD_CHAR_ID)
  byte_ids = tf.concat(
      [[BOW_CHAR_ID], byte_ids, [EOW_CHAR_ID], padding], axis=0)
  tf.logging.info(byte_ids)

  byte_ids = tf.reshape(byte_ids, [max_length])
  tf.logging.info(byte_ids.get_shape().as_list())
  return byte_ids + 1
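
# Illustrative usage sketch (TF 1.x graph mode; the concrete word and the
# shapes noted below are assumptions made for the example):
#
#   word = tf.constant("cat")
#   ids = _string_to_bytes(word, max_length=DEFAULT_CHAR_MAXLEN)
#   # ids is a [50] int32 tensor holding BOW_CHAR_ID, the UTF-8 bytes of
#   # "cat", EOW_CHAR_ID and PAD_CHAR_ID fill, all offset by +1.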


def tokens_to_bytes(tokens):
  """Given a sequence of strings, map to sequence of bytes.

  Args:
    tokens: A tf.string tensor

  Returns:
    A tensor of shape words.shape + [bytes_per_word] containing byte versions
    of each word.
  """
  bytes_per_word = DEFAULT_CHAR_MAXLEN
  with tf.device("/cpu:0"):
    tf.assert_rank(tokens, 1)
    shape = tf.shape(tokens)
    tf.logging.info(tokens)
    tokens_flat = tf.reshape(tokens, [-1])
    as_bytes_flat = tf.map_fn(
        fn=lambda x: _string_to_bytes(x, max_length=bytes_per_word),
        elems=tokens_flat,
        dtype=tf.int32,
        back_prop=False)
    tf.logging.info(as_bytes_flat)
    as_bytes = tf.reshape(as_bytes_flat, [shape[0], bytes_per_word])
  return as_bytes
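
# Illustrative usage sketch (assumes a TF 1.x session; the token list is an
# arbitrary example):
#
#   tokens = tf.constant(["hello", "world"])
#   char_ids = tokens_to_bytes(tokens)  # shape [2, DEFAULT_CHAR_MAXLEN], int32
#   with tf.Session() as sess:
#     print(sess.run(char_ids))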


def load_vocab(vocab_file):
  """Load vocab_file (one token per line) and return (vocab, vocab_size)."""
  vocab = []
  with codecs.getreader("utf-8")(tf.gfile.GFile(vocab_file, "rb")) as f:
    vocab_size = 0
    for word in f:
      vocab_size += 1
      vocab.append(word.strip())
  return vocab, vocab_size
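
# Example (the path is hypothetical): for a vocab file whose lines are
# "<unk>", "<s>", "</s>", "the", load_vocab returns
# (["<unk>", "<s>", "</s>", "the"], 4):
#
#   vocab, vocab_size = load_vocab("/tmp/vocab.de")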


def check_vocab(vocab_file, output_dir, check_special_token=True, sos=None,
                eos=None, unk=None, pad_vocab=False):
  """Check that vocab_file exists and fix up special tokens and padding.

  Raises a ValueError if vocab_file does not exist. If the vocab does not
  start with unk, sos, eos (when check_special_token is True) or its size is
  not a multiple of 8 (when pad_vocab is True), a corrected copy is written
  to output_dir and used instead.
  """
  if tf.gfile.Exists(vocab_file):
    utils.print_out("# Vocab file %s exists" % vocab_file)
    vocab, vocab_size = load_vocab(vocab_file)
    if check_special_token:
      # Verify that the vocab starts with unk, sos, eos.
      # If not, prepend those tokens & generate a new vocab file.
      if not unk: unk = UNK
      if not sos: sos = SOS
      if not eos: eos = EOS
      assert len(vocab) >= 3
      if vocab[0] != unk or vocab[1] != sos or vocab[2] != eos:
        utils.print_out("The first 3 vocab words [%s, %s, %s]"
                        " are not [%s, %s, %s]" %
                        (vocab[0], vocab[1], vocab[2], unk, sos, eos))
        vocab = [unk, sos, eos] + vocab
        vocab_size += 3
        new_vocab_file = os.path.join(output_dir, os.path.basename(vocab_file))
        with codecs.getwriter("utf-8")(
            tf.gfile.GFile(new_vocab_file, "wb")) as f:
          for word in vocab:
            f.write("%s\n" % word)
        vocab_file = new_vocab_file
    if pad_vocab and vocab_size % 8 != 0:
      new_vocab_file = os.path.join(output_dir, os.path.basename(vocab_file))
      padded_vocab_size = ((vocab_size + 8 - 1) // 8) * 8
      for i in range(0, padded_vocab_size - vocab_size):
        token = "<madeupword" + str(i) + ">"
        vocab.append(token)
      with codecs.getwriter("utf-8")(
          tf.gfile.GFile(new_vocab_file, "wb")) as f:
        for word in vocab:
          f.write("%s\n" % word)
      vocab_file = new_vocab_file
  else:
    raise ValueError("vocab_file '%s' does not exist." % vocab_file)

  vocab_size = len(vocab)
  return vocab_size, vocab_file
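
# Usage sketch (paths are hypothetical). With pad_vocab=True the vocabulary is
# grown with "<madeupwordN>" entries until its size is a multiple of 8, which
# is typically done to keep embedding shapes friendly to mixed-precision
# hardware:
#
#   vocab_size, vocab_file = check_vocab(
#       "/tmp/vocab.de", output_dir="/tmp/out", pad_vocab=True)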


def create_vocab_tables(src_vocab_file, tgt_vocab_file, share_vocab):
  """Creates vocab tables for src_vocab_file and tgt_vocab_file."""
  src_vocab_table = lookup_ops.index_table_from_file(
      src_vocab_file, default_value=UNK_ID)
  if share_vocab:
    tgt_vocab_table = src_vocab_table
  else:
    tgt_vocab_table = lookup_ops.index_table_from_file(
        tgt_vocab_file, default_value=UNK_ID)
  return src_vocab_table, tgt_vocab_table
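
# Usage sketch (paths are hypothetical). The returned tables map token strings
# to integer ids, with out-of-vocabulary tokens mapped to UNK_ID; in TF 1.x
# the lookup tables must be initialized before use:
#
#   src_table, tgt_table = create_vocab_tables(
#       "/tmp/vocab.en", "/tmp/vocab.de", share_vocab=False)
#   ids = src_table.lookup(tf.constant(["hello", "world"]))
#   with tf.Session() as sess:
#     sess.run(tf.tables_initializer())
#     print(sess.run(ids))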


def load_embed_txt(embed_file):
  """Load embed_file into a python dictionary.

  Note: the embed_file should be a Glove/word2vec formatted txt file. Here is
  an example assuming embed_size=5:

  the -0.071549 0.093459 0.023738 -0.090339 0.056123
  to 0.57346 0.5417 -0.23477 -0.3624 0.4037
  and 0.20327 0.47348 0.050877 0.002103 0.060547

  For word2vec format, the first line will be: <num_words> <emb_size>.

  Args:
    embed_file: file path to the embedding file.

  Returns:
    a dictionary that maps word to vector, and the size of embedding dimensions.
  """
  emb_dict = dict()
  emb_size = None
  is_first_line = True
  with codecs.getreader("utf-8")(tf.gfile.GFile(embed_file, "rb")) as f:
    for line in f:
      tokens = line.rstrip().split(" ")
      if is_first_line:
        is_first_line = False
        if len(tokens) == 2:  # header line
          emb_size = int(tokens[1])
          continue
      word = tokens[0]
      vec = list(map(float, tokens[1:]))
      emb_dict[word] = vec
      if emb_size:
        if emb_size != len(vec):
          utils.print_out(
              "Ignoring %s since embedding size is inconsistent." % word)
          del emb_dict[word]
      else:
        emb_size = len(vec)
  return emb_dict, emb_size
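
# Usage sketch (the path is hypothetical). For a GloVe-style file like the
# docstring example above, this returns a dict mapping each word to a list of
# 5 floats and emb_size == 5:
#
#   emb_dict, emb_size = load_embed_txt("/tmp/glove.txt")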