DeepLearningExamples/PyTorch/LanguageModeling/BERT/data/utils/sentence_segmentation_nltk.py
Przemek Strzelczyk 0663b67c1a Updating models
2019-07-08 22:51:28 +02:00

29 lines
618 B
Python

# NVIDIA
import argparse
import nltk
import os
# Written once after the sentences of every document; with the value "\n"
# this leaves one blank line between consecutive documents in the output.
DOC_SEPARATOR = "\n"


def segment_sentences(lines, ofile, tokenize):
    """Write one sentence per line to *ofile*, one document per input line.

    Each element of *lines* is treated as a single document. A line that is
    exactly "\\n" is skipped. Every other line is split with *tokenize* and
    each resulting sentence is written followed by a newline, then
    DOC_SEPARATOR is written to close the document.

    Args:
        lines: Iterable of text lines (for example an open text file).
        ofile: Object with ``write``/``writelines`` methods that accept str.
        tokenize: Callable mapping one line of text to an iterable of
            sentence strings.
    """
    for line in lines:
        if line == "\n":
            continue
        ofile.writelines(sent + "\n" for sent in tokenize(line))
        ofile.write(DOC_SEPARATOR)


def main():
    """Parse the command line and segment ``input_file`` into ``output_file``."""
    parser = argparse.ArgumentParser(description='Sentence Segmentation')
    parser.add_argument('input_file', type=str)
    parser.add_argument('output_file', type=str)
    args = parser.parse_args()

    # Fetch the Punkt model only after the arguments are known to be valid,
    # so `--help` and usage errors do not trigger a download.
    nltk.download('punkt')

    # Explicit UTF-8 keeps the result independent of the platform's locale
    # encoding. NOTE(review): assumes the corpus is UTF-8 -- confirm against
    # the scripts that produce input_file.
    with open(args.input_file, encoding="utf-8") as ifile, \
            open(args.output_file, "w", encoding="utf-8") as ofile:
        segment_sentences(ifile, ofile, nltk.tokenize.sent_tokenize)


if __name__ == "__main__":
    main()