A quick fix for issue #3094: index out of bounds when truncating long text to max_seq_length (#3131)

This commit is contained in:
bugface 2021-11-05 21:50:10 -04:00 committed by GitHub
parent 875f54464a
commit 9ec02280d0
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23

View file

@@ -221,7 +221,7 @@ class TextClassificationDataset(Dataset):
sent_subtokens.extend(word_tokens)
if max_seq_length > 0 and len(sent_subtokens) + 1 > max_seq_length:
- sent_subtokens = sent_subtokens[:max_seq_length]
+ sent_subtokens = sent_subtokens[: max_seq_length - 1]
too_long_count += 1
sent_subtokens.append(tokenizer.sep_token)