DeepLearningExamples/PyTorch/Translation/GNMT/scripts/filter_dataset.py

import argparse
from collections import Counter


def parse_args():
    parser = argparse.ArgumentParser(description='Clean dataset')
    parser.add_argument('-f1', '--file1', help='first file of the parallel corpus')
    parser.add_argument('-f2', '--file2', help='second file of the parallel corpus')
    return parser.parse_args()


def save_output(fname, data):
    # overwrites fname with the filtered lines
    with open(fname, 'w') as f:
        f.writelines(data)
def main():
    """
    Discards all pairs of sentences which can't be decoded by latin-1 encoder.

    It aims to filter out sentences with rare unicode glyphs and pairs which
    are most likely not valid English-German sentences.

    Examples of discarded sentences:

    Hommage au king de la pop Que son âme repos...

    Для их осуществления нам, прежде всего, необходимо преодолеть
    возражения рыночных фундаменталистов, которые хотят ликвидировать или
    уменьшить роль МВФ.

    practised as a scientist in various medical departments of the Medical
    University of Hanover , the University of Ulm , and the RWTH Aachen
    (rheumatology, pharmacology, physiology, pathology, microbiology,
    immunology and electron-microscopy).

    The same shift and press alt out with a smaller diameter
    circle.

    Brought to you by ABMSUBS leira(Coordinator/Translator)
    chibichan93(Timer/Typesetter) ja...

    Some examples: &0u - &0U - &tel - &PI - &SU - &cH- - &M2=
    &sn - SGML maps SGML to unicode.
    """
    args = parse_args()

    c = Counter()
    skipped = 0
    valid = 0
    data1 = []
    data2 = []

    with open(args.file1) as f1, open(args.file2) as f2:
        for idx, lines in enumerate(zip(f1, f2)):
            line1, line2 = lines
            if idx % 100000 == 1:
                print(f'Processed {idx} lines')
            try:
                # keep the pair only if both sides are representable in latin-1
                line1.encode('latin1')
                line2.encode('latin1')
            except UnicodeEncodeError:
                skipped += 1
            else:
                data1.append(line1)
                data2.append(line2)
                valid += 1
                c.update(line1)

    ratio = valid / (skipped + valid)
    print(f'Skipped: {skipped}, Valid: {valid}, Valid ratio {ratio}')
    print('Character frequency:', c)

    # the input files are overwritten in place with the filtered lines
    save_output(args.file1, data1)
    save_output(args.file2, data2)


if __name__ == '__main__':
    main()
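
For reference, a minimal sketch of the filtering rule the script applies: a sentence pair is kept only if both lines encode to latin-1, so plain ASCII and Western European accented text pass, while Cyrillic or other rare glyphs raise UnicodeEncodeError and the pair is dropped. The sample strings below are hypothetical, not taken from any dataset; the script itself would be run on two parallel corpus files, e.g. python3 filter_dataset.py -f1 train.en -f2 train.de (file names are illustrative), and it rewrites both inputs in place with the surviving lines.

# Illustrative only, not part of the original script.
samples = [
    'A perfectly ordinary English sentence.',   # ASCII: encodes, kept
    'Straße, café, naïve',                      # Latin-1 accents: encode, kept
    'Для их осуществления нам необходимо',      # Cyrillic: fails to encode, skipped
    'Circled digits: ① ② ③',                    # rare glyphs: fail to encode, skipped
]

for s in samples:
    try:
        s.encode('latin1')
    except UnicodeEncodeError:
        print('skipped:', s)
    else:
        print('kept:   ', s)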