DeepLearningExamples/PyTorch/LanguageModeling/BERT/data/wikipedia_corpus/remove_tags_and_clean.py
Przemek Strzelczyk 0663b67c1a Updating models
2019-07-08 22:51:28 +02:00

40 lines
1.1 KiB
Python

# NVIDIA
import glob
import os
import argparse
def iter_clean_articles(lines):
    """Yield the cleaned body text of every closed article in *lines*.

    *lines* is any iterable of text lines (for example an open file) using
    the ``<doc id=...>`` / ``</doc>`` markup this script consumes.

    One string is yielded per ``</doc>`` line. It is built from the lines
    collected since the article was opened, skipping the first collected
    line, dropping lines that are exactly ``"\\n"``, and right-stripping
    every remaining line before appending a single space to it.

    Lines seen while no article is open are ignored, and an article that
    is never closed yields nothing.
    """
    article_lines = []
    article_open = False
    for line in lines:
        if "<doc id=" in line:
            article_open = True
        elif "</doc>" in line:
            article_open = False
            # The first collected line is skipped on purpose; presumably it
            # is the article title repeated by the extractor - confirm
            # against the extractor's output format.
            yield "".join(
                oline.rstrip() + " "
                for oline in article_lines[1:]
                if oline != "\n"
            )
            article_lines = []
        elif article_open:
            article_lines.append(line)


def merge_articles(extracted_articles_path, ofile):
    """Append every article found under *extracted_articles_path* to *ofile*.

    Scans each immediate sub-directory of *extracted_articles_path* for
    files whose name starts with ``wiki_``, prints each file name as it is
    processed, and writes every cleaned article to the writable text
    stream *ofile* followed by a blank line.
    """
    # Neither pattern contains "**", so glob's ``recursive`` flag would have
    # no effect and is left at its default.
    for dirname in glob.glob('{}/*/'.format(extracted_articles_path)):
        for filename in glob.glob(dirname + 'wiki_*'):
            print(filename)
            # Explicit UTF-8: the platform default encoding may not be able
            # to decode non-ASCII article text.
            with open(filename, "r", encoding="utf-8") as file:
                for article in iter_clean_articles(file):
                    ofile.write(article)
                    ofile.write("\n\n")


def main(argv=None):
    """Parse the command line and write the merged corpus file.

    *argv* is the list of command-line arguments to parse; ``None`` means
    ``sys.argv[1:]``. Two positional arguments are expected: the directory
    holding the extracted articles and the path of the output file.
    """
    parser = argparse.ArgumentParser(
        description='Clean and merge extracted Wikipedia articles into a single text file')
    parser.add_argument('extracted_articles_path', type=str)
    parser.add_argument('output_file', type=str)
    args = parser.parse_args(argv)

    # The output file is opened (and therefore created or truncated) before
    # any input is scanned, so it exists even when no articles are found.
    with open(args.output_file, "w", encoding="utf-8") as ofile:
        merge_articles(args.extracted_articles_path, ofile)


if __name__ == "__main__":
    main()