Itn fr (#2921)
* First commit. French ITN grammars for tagger and verbalizer. Test for French inverse_normalize added to tests. inverse_text_normalize updated to allow 'fr' tag. tools/text_processing/deployment/pynini_export.py updated to accept 'fr' tag. All CI tests for grammars passed. Signed-off-by: tbartley94 <tbartley@nvidia.com> * Ran style checker. Signed-off-by: tbartley94 <tbartley@nvidia.com> * Fixed bug causing ordinals to fail sparrowhawk test when verbalizing as roman numbers. Signed-off-by: tbartley94 <tbartley@nvidia.com> * style change for verbalizer/ordinal.py Signed-off-by: tbartley94 <tbartley@nvidia.com> * Delete test.py Signed-off-by: tbartley94 <tbartley@nvidia.com> * Cleaning up unused import spaces for lgtm check. Signed-off-by: tbartley94 <tbartley@nvidia.com> * taggers/time.py missed style checker Signed-off-by: tbartley94 <tbartley@nvidia.com> * inverse_text_normalization/fr lacked an __init__ file Signed-off-by: tbartley94 <tbartley@nvidia.com> * Fixing copyright wording and adding whitelisting for titles. Signed-off-by: tbartley94 <tbartley@nvidia.com> * Fixing copyright headers. Signed-off-by: tbartley94 <tbartley@nvidia.com> * copyright header change (missed whitelist) Signed-off-by: tbartley94 <tbartley@nvidia.com> * Edited export_grammars.sh notes to include 'fr'. Made verbalizer/decimal.py rewrite class part of main class instead. Signed-off-by: tbartley94 <tbartley@nvidia.com> * Adjusting copyright headers for tests. Signed-off-by: tbartley94 <tbartley@nvidia.com> * inverse_text_normalization/fr/__init__ copyright header Signed-off-by: tbartley94 <tbartley@nvidia.com> * addint __init__ file to fr/data Signed-off-by: tbartley94 <tbartley@nvidia.com> Co-authored-by: Yang Zhang <yzhang123@users.noreply.github.com>
This commit is contained in:
parent
6d7f1a5339
commit
d0c97aab6a
|
@ -0,0 +1,32 @@
|
|||
# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from nemo_text_processing.inverse_text_normalization.fr.taggers.tokenize_and_classify import ClassifyFst
|
||||
from nemo_text_processing.inverse_text_normalization.fr.verbalizers.verbalize import VerbalizeFst
|
||||
from nemo_text_processing.inverse_text_normalization.fr.verbalizers.verbalize_final import VerbalizeFinalFst
|
||||
|
||||
from nemo.utils import logging
|
||||
|
||||
# pynini is treated as optional at import time: record whether it could be
# imported and, if not, tell the user how to install it.
try:
    import pynini

    PYNINI_AVAILABLE = True
except (ModuleNotFoundError, ImportError):
    # Adjacent string literals are concatenated verbatim, so each fragment
    # must carry its own separating whitespace.
    logging.warning(
        "`pynini` is not installed ! \n"
        "Please run the `nemo_text_processing/setup.sh` script "
        "prior to usage of this toolkit."
    )

    PYNINI_AVAILABLE = False
|
|
@ -0,0 +1,13 @@
|
|||
# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
|
@ -0,0 +1,25 @@
|
|||
com
|
||||
es
|
||||
uk
|
||||
fr
|
||||
net
|
||||
br
|
||||
in
|
||||
ru
|
||||
de
|
||||
it
|
||||
edu
|
||||
co
|
||||
ar
|
||||
bo
|
||||
cl
|
||||
co
|
||||
ec
|
||||
fk
|
||||
gf
|
||||
gy
|
||||
pe
|
||||
py
|
||||
sr
|
||||
ve
|
||||
uy
|
|
|
@ -0,0 +1,17 @@
|
|||
g mail gmail
|
||||
gmail
|
||||
n vidia nvidia
|
||||
nvidia
|
||||
outlook
|
||||
hotmail
|
||||
yahoo
|
||||
aol
|
||||
gmx
|
||||
msn
|
||||
live
|
||||
yandex
|
||||
orange
|
||||
wanadoo
|
||||
web
|
||||
google
|
||||
comcast
|
Can't render this file because it has a wrong number of fields in line 2.
|
|
@ -0,0 +1,12 @@
|
|||
chez @
|
||||
at @
|
||||
à @
|
||||
arobase @
|
||||
point .
|
||||
barre oblique /
|
||||
tiret -
|
||||
tiret bas _
|
||||
souligné _
|
||||
sous-tiret _
|
||||
blanc souligné _
|
||||
underscore _
|
|
|
@ -0,0 +1,41 @@
|
|||
demie deux
|
||||
demies deux
|
||||
demi deux
|
||||
demis deux
|
||||
tiers trois
|
||||
quart quatre
|
||||
quarts quatre
|
||||
quatrièmes quatre
|
||||
quatrième quatre
|
||||
cinquième cinq
|
||||
cinquièmes cinq
|
||||
neuvième neuf
|
||||
neuvièmes neuf
|
||||
onzième onze
|
||||
onzièmes onze
|
||||
douzième douze
|
||||
douzièmes douze
|
||||
treizième treize
|
||||
treizièmes treize
|
||||
quatorzième quatorze
|
||||
quatorzièmes quatorze
|
||||
quinzième quinze
|
||||
quinzièmes quinze
|
||||
seizième seize
|
||||
seizièmes seize
|
||||
trentième trente
|
||||
trentièmes trente
|
||||
quarantième quarante
|
||||
quarantièmes quarante
|
||||
cinquantième cinquante
|
||||
cinquantièmes cinquante
|
||||
soixantième soixante
|
||||
soixantièmes soixante
|
||||
septantième septante
|
||||
septantièmes septante
|
||||
huitantième huitante
|
||||
huitantièmes huitante
|
||||
nonantième nonante
|
||||
nonantièmes nonante
|
||||
millième mille
|
||||
millièmes mille
|
|
|
@ -0,0 +1,16 @@
|
|||
exa E
|
||||
péta P
|
||||
téra T
|
||||
giga G
|
||||
méga M
|
||||
kilo k
|
||||
hecto h
|
||||
déca da
|
||||
déci d
|
||||
centi c
|
||||
milli m
|
||||
micro µ
|
||||
nano n
|
||||
pico p
|
||||
femto f
|
||||
atto a
|
|
|
@ -0,0 +1,27 @@
|
|||
mètre m
|
||||
mètre carré m²
|
||||
mètres carrés m²
|
||||
mètre cube m³
|
||||
mètres cubes m³
|
||||
seconde s
|
||||
minute min
|
||||
heure h
|
||||
degré °
|
||||
degrés °
|
||||
degré celsius °C
|
||||
degrés celsius °C
|
||||
gramme g
|
||||
litre l
|
||||
kilo kg
|
||||
pouce ''
|
||||
livre lb
|
||||
poid lb
|
||||
mile mi
|
||||
pour cent %
|
||||
pour mille ‰
|
||||
mètre heure m/h
|
||||
mètres heure m/h
|
||||
mètre à l’heure m/h
|
||||
mètres à l’heure m/h
|
||||
mètre par l’heure m/h
|
||||
mètres par l’heure m/h
|
|
|
@ -0,0 +1,40 @@
|
|||
dollar $
|
||||
dollars $
|
||||
dollar américain $ US
|
||||
dollars américains $ US
|
||||
dollar des États-Unis $ US
|
||||
dollars des États-Unis $ US
|
||||
dollar canadien $ CA
|
||||
dollars canadiens $ CA
|
||||
dollar australien $ AU
|
||||
dollars australiens $ AU
|
||||
dollar néo-zélandais $ NZ
|
||||
dollars néo-zélandais $ NZ
|
||||
dollar de Hong Kong $ HK
|
||||
dollars de Hong Kong $ HK
|
||||
euro €
|
||||
euros €
|
||||
livre sterling £
|
||||
livre £
|
||||
livres £
|
||||
livre britannique £
|
||||
livres britanniques £
|
||||
sterling £
|
||||
won ₩
|
||||
won sud-coréen ₩
|
||||
yen japonais ¥
|
||||
yens japonais ¥
|
||||
yen ¥
|
||||
yens ¥
|
||||
yuan ¥
|
||||
yuans ¥
|
||||
franc CHF
|
||||
franc suisse CHF
|
||||
dinar algérien DA
|
||||
dinars algériens DA
|
||||
dinar DA
|
||||
dinars DA
|
||||
dirham marocain DH
|
||||
dirhams marocains DH
|
||||
dirham DH
|
||||
dirhams DH
|
|
|
@ -0,0 +1,8 @@
|
|||
cent $
|
||||
cents $
|
||||
centime €
|
||||
centimes €
|
||||
eurocent €
|
||||
eurocents €
|
||||
pence £
|
||||
pesos $
|
|
|
@ -0,0 +1,12 @@
|
|||
janvier
|
||||
février
|
||||
mars
|
||||
avril
|
||||
mai
|
||||
juin
|
||||
juillet
|
||||
août
|
||||
septembre
|
||||
octobre
|
||||
novembre
|
||||
décembre
|
|
|
@ -0,0 +1,10 @@
|
|||
un 1
|
||||
une 1
|
||||
deux 2
|
||||
trois 3
|
||||
quatre 4
|
||||
cinq 5
|
||||
six 6
|
||||
sept 7
|
||||
huit 8
|
||||
neuf 9
|
|
|
@ -0,0 +1,2 @@
|
|||
cent
|
||||
cents
|
|
|
@ -0,0 +1,6 @@
|
|||
onze 11
|
||||
douze 12
|
||||
treize 13
|
||||
quatorze 14
|
||||
quinze 15
|
||||
seize 16
|
|
|
@ -0,0 +1,23 @@
|
|||
mille
|
||||
million
|
||||
millions
|
||||
milliard
|
||||
milliards
|
||||
billion
|
||||
billions
|
||||
trillion
|
||||
trillions
|
||||
quadrillion
|
||||
quadrillions
|
||||
quintillion
|
||||
quintillions
|
||||
sextillion
|
||||
sextillions
|
||||
septillion
|
||||
septillions
|
||||
octillion
|
||||
octillions
|
||||
nonillion
|
||||
nonillions
|
||||
decillion
|
||||
decillions
|
|
|
@ -0,0 +1,13 @@
|
|||
dix 1
|
||||
vingt 2
|
||||
vingts 2
|
||||
trente 3
|
||||
quarante 4
|
||||
cinquante 5
|
||||
soixante 6
|
||||
soixante-dix 7
|
||||
septante 7
|
||||
huitante 8
|
||||
quatre-vingt 8
|
||||
quatre-vingt-dix 9
|
||||
nonante 9
|
|
|
@ -0,0 +1,25 @@
|
|||
vingts 20
|
||||
vingt-et-un 21
|
||||
vingt-et-une 21
|
||||
trente-et-un 31
|
||||
trente-et-une 31
|
||||
quarante-et-un 41
|
||||
quarante-et-une 41
|
||||
cinquante-et-un 51
|
||||
cinquante-et-une 51
|
||||
soixante-et-un 61
|
||||
soixante-et-une 61
|
||||
soixante-et-onze 71
|
||||
soixante-douze 72
|
||||
soixante-treize 73
|
||||
soixante-quatorze 74
|
||||
soixante-quinze 75
|
||||
soixante-seize 76
|
||||
quatre-vingts 80
|
||||
quatre-vingt 80
|
||||
quatre-vingt-onze 91
|
||||
quatre-vingt-douze 92
|
||||
quatre-vingt-treize 93
|
||||
quatre-vingt-quatorze 94
|
||||
quatre-vingt-quinze 95
|
||||
quatre-vingt-seize 96
|
|
|
@ -0,0 +1 @@
|
|||
zéro 0
|
|
|
@ -0,0 +1,13 @@
|
|||
# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
|
@ -0,0 +1,17 @@
|
|||
quatrième quatre
|
||||
cinquième cinq
|
||||
neuvième neuf
|
||||
onzième onze
|
||||
douzième douze
|
||||
treizième treize
|
||||
quatorzième quatorze
|
||||
quinzième quinze
|
||||
seizième seize
|
||||
trentième trente
|
||||
quarantième quarante
|
||||
cinquantième cinquante
|
||||
soixantième soixante
|
||||
septantième septante
|
||||
huitantième huitante
|
||||
nonantième nonante
|
||||
millième mille
|
|
|
@ -0,0 +1,4 @@
|
|||
premier er
|
||||
premiers ers
|
||||
première re
|
||||
premières res
|
|
|
@ -0,0 +1 @@
|
|||
siècle
|
|
|
@ -0,0 +1,4 @@
|
|||
second d
|
||||
seconde de
|
||||
seconds ds
|
||||
secondes des
|
|
|
@ -0,0 +1,13 @@
|
|||
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
|
@ -0,0 +1,9 @@
|
|||
I 1
|
||||
II 2
|
||||
III 3
|
||||
IV 4
|
||||
V 5
|
||||
VI 6
|
||||
VII 7
|
||||
VIII 8
|
||||
IX 9
|
|
|
@ -0,0 +1,9 @@
|
|||
C 1
|
||||
CC 2
|
||||
CCC 3
|
||||
CD 4
|
||||
D 5
|
||||
DC 6
|
||||
DCC 7
|
||||
DCCC 8
|
||||
CM 9
|
|
|
@ -0,0 +1,9 @@
|
|||
X 1
|
||||
XX 2
|
||||
XXX 3
|
||||
XL 4
|
||||
L 5
|
||||
LX 6
|
||||
LXX 7
|
||||
LXXX 8
|
||||
XC 9
|
|
|
@ -0,0 +1,12 @@
|
|||
1 13
|
||||
2 14
|
||||
3 15
|
||||
4 16
|
||||
5 17
|
||||
6 18
|
||||
7 19
|
||||
8 20
|
||||
9 21
|
||||
10 22
|
||||
11 23
|
||||
12 0
|
|
|
@ -0,0 +1,26 @@
|
|||
zéro 0
|
||||
une 1
|
||||
deux 2
|
||||
trois 3
|
||||
quatre 4
|
||||
cinq 5
|
||||
six 6
|
||||
sept 7
|
||||
huit 8
|
||||
neuf 9
|
||||
dix 10
|
||||
onze 11
|
||||
douze 12
|
||||
treize 13
|
||||
quatorze 14
|
||||
quinze 15
|
||||
seize 16
|
||||
dix-sept 17
|
||||
dix-huit 18
|
||||
dix-neuf 19
|
||||
vingt 20
|
||||
vingt-et-une 21
|
||||
vingt et une 21
|
||||
vingt-deux 22
|
||||
vingt-trois 23
|
||||
vingt-quatre 24
|
Can't render this file because it has a wrong number of fields in line 23.
|
|
@ -0,0 +1,25 @@
|
|||
1 0
|
||||
2 1
|
||||
3 2
|
||||
4 3
|
||||
5 4
|
||||
6 5
|
||||
7 6
|
||||
8 7
|
||||
9 8
|
||||
10 9
|
||||
11 10
|
||||
12 11
|
||||
13 12
|
||||
14 13
|
||||
15 14
|
||||
16 15
|
||||
17 16
|
||||
18 17
|
||||
19 18
|
||||
20 19
|
||||
21 20
|
||||
22 21
|
||||
23 22
|
||||
24 23
|
||||
0 23
|
|
|
@ -0,0 +1,63 @@
|
|||
une 01
|
||||
deux 02
|
||||
trois 03
|
||||
quatre 04
|
||||
cinq 05
|
||||
six 06
|
||||
sept 07
|
||||
huit 08
|
||||
neuf 09
|
||||
dix 10
|
||||
onze 11
|
||||
douze 12
|
||||
treize 13
|
||||
quatorze 14
|
||||
quinze 15
|
||||
seize 16
|
||||
dix-sept 17
|
||||
dix-huit 18
|
||||
dix-neuf 19
|
||||
vingt 20
|
||||
vingt-et-une 21
|
||||
vingt et une 21
|
||||
vingt-deux 22
|
||||
vingt-trois 23
|
||||
vingt-quatre 24
|
||||
vingt-cinq 25
|
||||
vingt-six 26
|
||||
vingt-sept 27
|
||||
vingt-huit 28
|
||||
vingt-neuf 29
|
||||
trente 30
|
||||
trente-et-une 31
|
||||
trente et une 31
|
||||
trente-deux 32
|
||||
trente-trois 33
|
||||
trente-quatre 34
|
||||
trente-cinq 35
|
||||
trente-six 36
|
||||
trente-sept 37
|
||||
trente-huit 38
|
||||
trente-neuf 39
|
||||
quarante 40
|
||||
quarante-et-une 41
|
||||
quarante et une 41
|
||||
quarante-deux 42
|
||||
quarante-trois 43
|
||||
quarante-quatre 44
|
||||
quarante-cinq 45
|
||||
quarante-six 46
|
||||
quarante-sept 47
|
||||
quarante-huit 48
|
||||
quarante-neuf 49
|
||||
cinquante 50
|
||||
cinquante-et-une 51
|
||||
cinquante et une 51
|
||||
cinquante-deux 52
|
||||
cinquante-trois 53
|
||||
cinquante-quatre 54
|
||||
cinquante-cinq 55
|
||||
cinquante-six 56
|
||||
cinquante-sept 57
|
||||
cinquante-huit 58
|
||||
cinquante-neuf 59
|
|
|
@ -0,0 +1,59 @@
|
|||
01 59
|
||||
02 58
|
||||
03 57
|
||||
04 56
|
||||
05 55
|
||||
06 54
|
||||
07 53
|
||||
08 52
|
||||
09 51
|
||||
10 50
|
||||
11 49
|
||||
12 48
|
||||
13 47
|
||||
14 46
|
||||
15 45
|
||||
16 44
|
||||
17 43
|
||||
18 42
|
||||
19 41
|
||||
20 40
|
||||
21 39
|
||||
22 38
|
||||
23 37
|
||||
24 36
|
||||
25 35
|
||||
26 34
|
||||
27 33
|
||||
28 32
|
||||
29 31
|
||||
30 30
|
||||
31 29
|
||||
32 28
|
||||
33 27
|
||||
34 26
|
||||
35 25
|
||||
36 24
|
||||
37 23
|
||||
38 22
|
||||
39 21
|
||||
40 20
|
||||
41 19
|
||||
42 18
|
||||
43 17
|
||||
44 16
|
||||
45 15
|
||||
46 14
|
||||
47 13
|
||||
48 12
|
||||
49 11
|
||||
50 10
|
||||
51 09
|
||||
52 08
|
||||
53 07
|
||||
54 06
|
||||
55 05
|
||||
56 04
|
||||
57 03
|
||||
58 02
|
||||
59 01
|
|
|
@ -0,0 +1 @@
|
|||
du matin
|
|
|
@ -0,0 +1,2 @@
|
|||
de l'après-midi
|
||||
du soir
|
|
|
@ -0,0 +1,16 @@
|
|||
monsieur M.
|
||||
messieurs MM.
|
||||
madame Mᵐᵉ
|
||||
mesdames Mᵐᵉˢ
|
||||
mademoiselle Mˡˡᵉ
|
||||
mademoiselles Mˡˡᵉˢ
|
||||
docteur Dʳ
|
||||
docteurs Dʳˢ
|
||||
docteure Dʳᵉ
|
||||
docteures Dʳᵉˢ
|
||||
après jésus-christ apr. J.-C.
|
||||
avant Jésus-Christ av. J.-C.
|
||||
ca v.
|
||||
vers v.
|
||||
l’honorable le hon.
|
||||
le très honorable le très hon.
|
|
|
@ -0,0 +1,235 @@
|
|||
# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
# Copyright 2015 and onwards Google, Inc.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import os
|
||||
import string
|
||||
from pathlib import Path
|
||||
from typing import Dict
|
||||
|
||||
from nemo_text_processing.inverse_text_normalization.fr.utils import get_abs_path
|
||||
|
||||
# Plain-string constants do not need pynini, so they are defined once up front.
NEMO_NON_BREAKING_SPACE = "\u00A0"
NEMO_SPACE = " "

try:
    import pynini
    from pynini import Far
    from pynini.export import export
    from pynini.examples import plurals
    from pynini.lib import byte, pynutil, utf8

    # Single-character acceptors.
    NEMO_CHAR = utf8.VALID_UTF8_CHAR
    NEMO_DIGIT = byte.DIGIT
    NEMO_LOWER = pynini.union(*string.ascii_lowercase).optimize()
    NEMO_UPPER = pynini.union(*string.ascii_uppercase).optimize()
    NEMO_ALPHA = pynini.union(NEMO_LOWER, NEMO_UPPER).optimize()
    NEMO_ALNUM = pynini.union(NEMO_DIGIT, NEMO_ALPHA).optimize()
    NEMO_HEX = pynini.union(*string.hexdigits).optimize()
    NEMO_WHITE_SPACE = pynini.union(NEMO_SPACE, "\t", "\n", "\r", NEMO_NON_BREAKING_SPACE).optimize()
    NEMO_NOT_SPACE = pynini.difference(NEMO_CHAR, NEMO_WHITE_SPACE).optimize()
    NEMO_NOT_QUOTE = pynini.difference(NEMO_CHAR, r'"').optimize()
    NEMO_PUNCT = pynini.union(*(pynini.escape(symbol) for symbol in string.punctuation)).optimize()
    NEMO_GRAPH = pynini.union(NEMO_ALNUM, NEMO_PUNCT).optimize()

    # Any sequence of valid UTF-8 characters (including the empty string).
    NEMO_SIGMA = pynini.closure(NEMO_CHAR)

    # Whitespace helpers.
    delete_space = pynutil.delete(pynini.closure(NEMO_WHITE_SPACE))
    insert_space = pynutil.insert(NEMO_SPACE)
    delete_extra_space = pynini.cross(pynini.closure(NEMO_WHITE_SPACE, 1), NEMO_SPACE)

    # French frequently compounds numbers with hyphen.
    delete_hyphen = pynutil.delete(pynini.closure("-", 0, 1))
    insert_hyphen = pynutil.insert("-")

    # Pluralization: irregular forms come from a data file, the rest from suffix rules.
    suppletive = pynini.string_file(get_abs_path("data/suppletive.tsv"))
    _s = NEMO_SIGMA + pynutil.insert("s")  # default: append "s"
    _x = NEMO_SIGMA + pynini.string_map(["eau", "eu", "ou"]) + pynutil.insert("x")  # -eau/-eu/-ou take "x"
    _aux = NEMO_SIGMA + pynini.string_map([("al", "aux"), ("ail", "aux")])  # -al/-ail become -aux

    # NOTE(review): `_priority_union` is a private helper of pynini's example module; it
    # presumably prefers its first argument wherever both arguments apply -- confirm.
    graph_plural = plurals._priority_union(
        suppletive, plurals._priority_union(_s, pynini.union(_x, _aux), NEMO_SIGMA), NEMO_SIGMA
    ).optimize()

    SINGULAR_TO_PLURAL = graph_plural
    PLURAL_TO_SINGULAR = pynini.invert(graph_plural)

    # ASCII-only case mapping, one arc per letter pair.
    TO_LOWER = pynini.union(
        *(pynini.cross(upper, lower) for upper, lower in zip(string.ascii_uppercase, string.ascii_lowercase))
    )
    TO_UPPER = pynini.invert(TO_LOWER)

    PYNINI_AVAILABLE = True
except (ModuleNotFoundError, ImportError):
    # Create placeholders so that the names above exist even without pynini.
    NEMO_CHAR = NEMO_DIGIT = NEMO_LOWER = NEMO_UPPER = None
    NEMO_ALPHA = NEMO_ALNUM = NEMO_HEX = None
    NEMO_WHITE_SPACE = NEMO_NOT_SPACE = NEMO_NOT_QUOTE = None
    NEMO_PUNCT = NEMO_GRAPH = NEMO_SIGMA = None

    delete_space = insert_space = delete_extra_space = None
    delete_hyphen = insert_hyphen = None

    suppletive = _s = _x = _aux = None
    graph_plural = None

    SINGULAR_TO_PLURAL = PLURAL_TO_SINGULAR = None
    TO_LOWER = TO_UPPER = None

    PYNINI_AVAILABLE = False
|
||||
|
||||
|
||||
def generator_main(file_name: str, graphs: 'Dict[str, pynini.FstLike]'):
    """
    Exports graph as OpenFst finite state archive (FAR) file with given file name and rule name.

    Args:
        file_name: exported file name
        graphs: Mapping of a rule name and Pynini WFST graph to be exported
    """
    # The annotation above is a string on purpose: an unquoted `pynini.FstLike` is evaluated
    # when the function is defined and would make this module unimportable without pynini,
    # defeating the placeholder branch of the import guard.
    exporter = export.Exporter(file_name)
    try:
        for rule, graph in graphs.items():
            exporter[rule] = graph.optimize()
    finally:
        # Always release the archive handle, even if a graph fails to optimize or export.
        exporter.close()
    print(f'Created {file_name}')
|
||||
|
||||
|
||||
def get_plurals(fst):
    """
    Given singular returns plurals.

    Puts the SINGULAR_TO_PLURAL mapping in front of `fst`: the input is first
    rewritten by SINGULAR_TO_PLURAL and the result is then fed to `fst`.

    Args:
        fst: Fst to compose the pluralization mapping with

    Returns the composition SINGULAR_TO_PLURAL @ fst
    """
    pluralized = SINGULAR_TO_PLURAL @ fst
    return pluralized
|
||||
|
||||
|
||||
def get_singulars(fst):
    """
    Given plural returns singulars.

    Puts the PLURAL_TO_SINGULAR mapping (the inverse of the pluralization graph)
    in front of `fst`: the input is first rewritten by PLURAL_TO_SINGULAR and the
    result is then fed to `fst`.

    Args:
        fst: Fst to compose the singularization mapping with

    Returns the composition PLURAL_TO_SINGULAR @ fst
    """
    singularized = PLURAL_TO_SINGULAR @ fst
    return singularized
|
||||
|
||||
|
||||
def convert_space(fst) -> 'pynini.FstLike':
    """
    Replaces every regular space in the output of `fst` with a non-breaking space.

    Intended for tagger grammars whose token values sit inside quotes, e.g. name: "hello kitty".
    This makes the transducer significantly slower, so apply it only where spaces can
    actually occur within the quoted value.

    Args:
        fst: input fst

    Returns output fst where breaking spaces are converted to non breaking spaces
    """
    space_to_nbsp = pynini.cross(NEMO_SPACE, NEMO_NON_BREAKING_SPACE)
    # Empty left/right contexts: the rewrite applies to every space, wherever it occurs.
    rewrite_all_spaces = pynini.cdrewrite(space_to_nbsp, "", "", NEMO_SIGMA)
    return fst @ rewrite_all_spaces
|
||||
|
||||
|
||||
class GraphFst:
    """
    Base class for all grammar fsts.

    On construction the class looks for a pre-compiled archive at
    ``<directory of this module>/grammars/<kind>/<name>.far``. If that file exists its
    FST is loaded into ``fst``; otherwise ``fst`` stays ``None`` until it is assigned.

    Args:
        name: name of grammar class
        kind: either 'classify' or 'verbalize'
        deterministic: if True will provide a single transduction option,
            for False multiple transduction are generated (used for audio-based normalization)
    """

    def __init__(self, name: str, kind: str, deterministic: bool = True):
        self.name = name
        # Keep the kind that was passed in ('classify' or 'verbalize'), not the `str` type.
        self.kind = kind
        self._fst = None
        self.deterministic = deterministic

        # Build the path from components so an empty module directory yields a relative
        # path rather than one rooted at "/".
        self.far_path = Path(os.path.dirname(__file__)) / 'grammars' / kind / f'{name}.far'
        if self.far_exist():
            self._fst = Far(self.far_path, mode="r", arc_type="standard", far_type="default").get_fst()

    def far_exist(self) -> bool:
        """
        Returns true if FAR can be loaded, i.e. the archive file exists on disk
        """
        return self.far_path.exists()

    @property
    def fst(self) -> 'pynini.FstLike':
        """The grammar's FST, or None if none was loaded or assigned yet."""
        return self._fst

    @fst.setter
    def fst(self, fst):
        self._fst = fst

    def add_tokens(self, fst) -> 'pynini.FstLike':
        """
        Wraps class name around to given fst, producing ``<name> { ... }``

        Args:
            fst: input fst

        Returns:
            Fst: fst
        """
        return pynutil.insert(f"{self.name} {{ ") + fst + pynutil.insert(" }")

    def delete_tokens(self, fst) -> 'pynini.FstLike':
        """
        Deletes class name wrap around output of given fst

        Args:
            fst: input fst

        Returns:
            Fst: fst
        """
        res = (
            pynutil.delete(f"{self.name}")
            + delete_space
            + pynutil.delete("{")
            + delete_space
            + fst
            + delete_space
            + pynutil.delete("}")
        )
        # Turn non-breaking spaces in the output back into regular spaces.
        return res @ pynini.cdrewrite(pynini.cross(NEMO_NON_BREAKING_SPACE, NEMO_SPACE), "", "", NEMO_SIGMA)
|
|
@ -0,0 +1,13 @@
|
|||
# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
|
@ -0,0 +1,284 @@
|
|||
# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from nemo_text_processing.inverse_text_normalization.fr.graph_utils import (
|
||||
NEMO_CHAR,
|
||||
NEMO_DIGIT,
|
||||
NEMO_NOT_SPACE,
|
||||
NEMO_SIGMA,
|
||||
NEMO_SPACE,
|
||||
GraphFst,
|
||||
delete_hyphen,
|
||||
)
|
||||
from nemo_text_processing.inverse_text_normalization.fr.utils import get_abs_path
|
||||
|
||||
try:
|
||||
import pynini
|
||||
from pynini.lib import pynutil
|
||||
|
||||
PYNINI_AVAILABLE = True
|
||||
except (ModuleNotFoundError, ImportError):
|
||||
PYNINI_AVAILABLE = False
|
||||
|
||||
|
||||
def rewrite(cardinal: 'pynini.FstLike') -> 'pynini.FstLike':
    """
    Puts an orthography-normalizing step in front of a cardinal FST.

    Traditional orthography does not hyphenate numbers > 100, while current orthography
    puts '-' between all words of a number string. Spaces next to the structural number
    words are therefore turned into hyphens before the string reaches `cardinal`:
        e.g. deux mille cent vingt-trois -> deux-mille-cent-vingt-trois.
    Strings that contain no whitespace at all are passed to `cardinal` as they are.
    Strings that mix both orthographies (a structural word already attached to a hyphen
    while spaces remain, e.g. "deux-mille cent-vingt-trois") get a '#' mark and are
    rejected, so they never reach `cardinal`.

    Args:
        cardinal: cardinal FST

    Returns the normalization step composed with `cardinal`
    """
    # Words around which traditional orthography writes a space instead of a hyphen,
    # each with an optional plural "s".
    structural_words = pynini.string_map(
        [
            "et",  # for 'et un/onze'
            "cent",
            "mille",
            "million",
            "milliard",
            "billion",
            "billiard",
            "trillion",
            "trilliard",
        ]
    )
    targets = structural_words + pynini.accep("s").ques

    # Valid numbers in reformed orthography will have no spaces.
    already_hyphenated = pynini.closure(NEMO_NOT_SPACE)

    # A target touching a hyphen cannot occur in old orthography: replace it with '#'.
    hyphenated_targets = pynini.union("-" + targets, "-" + targets + "-", targets + "-")
    mark_hyphenated = pynini.cdrewrite(pynini.cross(hyphenated_targets, "#"), "", "", NEMO_SIGMA)

    # Any string that does not contain the '#' mark.
    unmarked_sigma = pynini.closure(pynini.difference(NEMO_CHAR, "#"))

    # Only accept strings that occur in old orthography. (This avoids tying two non-related numbers together.)
    # e.g. mille cent-une -> mille-cent-une
    old_orthography_only = mark_hyphenated @ unmarked_sigma

    # Past the filter, spaces are only replaced directly before or after a target.
    space_before_target = pynini.cdrewrite(pynini.cross(" ", "-"), "", targets, NEMO_SIGMA)
    space_after_target = pynini.cdrewrite(pynini.cross(" ", "-"), targets, "", NEMO_SIGMA)
    hyphenate = space_before_target @ space_after_target

    normalizer = already_hyphenated | (old_orthography_only @ hyphenate)
    return normalizer @ cardinal
|
||||
|
||||
|
||||
class CardinalFst(GraphFst):
    """
    Finite state transducer for classifying cardinals
        e.g. moins vingt-trois -> cardinal { negative: "-" integer: "23"}
    This class converts cardinals up to (but not including) "un-quatrillion",
    i.e up to "one septillion" in English (10^{24}).
    Cardinals listed in digit.tsv and zero.tsv (single digits and zero) are not converted
    when they stand alone (in order to avoid "j'ai une pomme." --> "j'ai 1 pomme" and any
    other odd conversions.)
    This transducer accommodates both traditional hyphenation of numbers ('-' for most numbers <100)
    and current hyphenation (all elements of number are hyphenated), prioritizing the latter.
        e.g cent cinquante et un -> cardinal { integer: "151"}
            cent-cinquante-et-un -> cardinal { integer: "151"}
    This is done through a context dependent rewrite that attempts to map old spelling to new.
        e.g. cent cinquante et un -> cent-cinquante-et-un

    Attributes set by the constructor (all are pynini FSTs):
        graph_hundreds_component_at_least_one_none_zero_digit: three-digit group that is not "000"
        graph_no_exception: full cardinal graph, including single digits and zero
        numbers_up_to_thousand: graph_no_exception restricted to outputs of 1 to 3 digits
        numbers_up_to_million: graph_no_exception restricted to outputs of 1 to 6 digits
        graph: graph_no_exception minus the inputs of digit.tsv and zero.tsv
        fst: final tagger wrapped with add_tokens
    """

    def __init__(self):
        super().__init__(name="cardinal", kind="classify")
        graph_zero = pynini.string_file(get_abs_path("data/numbers/zero.tsv"))
        graph_digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
        graph_teens = pynini.string_file(get_abs_path("data/numbers/teen.tsv"))
        graph_ties = pynini.string_file(get_abs_path("data/numbers/ties.tsv"))
        graph_ties_unique = pynini.string_file(get_abs_path("data/numbers/ties_unique.tsv"))

        # Tens components: a tie followed by either a hyphenated digit or an inserted "0"
        graph_tens_component = graph_ties + ((delete_hyphen + graph_digit) | pynutil.insert("0"))
        graph_tens_component = pynini.union(graph_tens_component, graph_teens, graph_ties_unique)

        # Always two output digits: a real tens component, "0" + digit, or (at a small cost) "00"
        graph_tens_component_with_leading_zeros = pynini.union(
            graph_tens_component, (pynutil.insert("0") + (graph_digit | pynutil.insert("0", weight=0.01)))
        )

        # Hundreds components
        graph_cent_singular = pynutil.delete("cent")  # Used in hundreds place
        graph_cent_plural = pynini.cross(
            "cents", "00"
        )  # Only used as terminus of hundred sequence. deux cents -> 200, deux cent un -> 201

        # Digits other than "un"/"une": plain "cent" already stands for one hundred
        graph_digit_no_one = pynini.project(pynini.union("un", "une"), 'input')
        graph_digit_no_one = (pynini.project(graph_digit, "input") - graph_digit_no_one.arcsort()) @ graph_digit

        graph_hundreds_component_singular = (
            graph_digit_no_one + delete_hyphen + graph_cent_singular
        )  # Regular way: digit (other than un/une) * 100

        graph_hundreds_component_singular = pynini.union(graph_hundreds_component_singular, pynini.cross("cent", "1"))
        graph_hundreds_component_singular += delete_hyphen
        graph_hundreds_component_singular += graph_tens_component_with_leading_zeros

        graph_hundreds_component_plural = graph_digit_no_one + delete_hyphen + graph_cent_plural

        graph_hundreds_component = pynini.union(
            graph_hundreds_component_singular,
            graph_hundreds_component_plural,
            pynutil.insert("0") + graph_tens_component_with_leading_zeros,
        )

        # Same three-digit group, but rejecting the all-zero output
        graph_hundreds_component_at_least_one_none_zero_digit = graph_hundreds_component @ (
            pynini.closure(NEMO_DIGIT) + (NEMO_DIGIT - "0") + pynini.closure(NEMO_DIGIT)
        )
        self.graph_hundreds_component_at_least_one_none_zero_digit = rewrite(
            graph_hundreds_component_at_least_one_none_zero_digit
        ).optimize()

        # Tens of hundreds. e.g. 1900 = nineteen hundred / 'dix-neuf cents'
        graph_tens_of_hundreds_component_singular = graph_tens_component + delete_hyphen + graph_cent_singular
        graph_tens_of_hundreds_component_singular += delete_hyphen + graph_tens_component_with_leading_zeros
        graph_tens_of_hundreds_component_plural = graph_tens_component + delete_hyphen + graph_cent_plural
        graph_tens_of_hundred_component = (
            graph_tens_of_hundreds_component_plural | graph_tens_of_hundreds_component_singular
        )

        # Each magnitude group below emits exactly three digits; an absent group inserts "000" at a small cost
        graph_thousands = pynini.union(
            graph_hundreds_component_at_least_one_none_zero_digit + delete_hyphen + pynutil.delete("mille"),
            pynutil.insert("001") + pynutil.delete("mille"),  # because 'mille', not 'un mille'
            pynutil.insert("000", weight=0.1),
        )

        # All other large amounts
        graph_millions = pynini.union(
            graph_hundreds_component_at_least_one_none_zero_digit
            + delete_hyphen
            + (pynutil.delete("million") | pynutil.delete("millions")),
            pynutil.insert("000", weight=0.1),
        )

        graph_milliards = pynini.union(  # French for English 'billion'
            graph_hundreds_component_at_least_one_none_zero_digit
            + delete_hyphen
            + (pynutil.delete("milliard") | pynutil.delete("milliards")),
            pynutil.insert("000", weight=0.1),
        )

        graph_billions = pynini.union(  # NOTE: this is English 'trillion.'
            graph_hundreds_component_at_least_one_none_zero_digit
            + delete_hyphen
            + (pynutil.delete("billions") | pynutil.delete("billion")),
            pynutil.insert("000", weight=0.1),
        )

        graph_billiards = pynini.union(
            graph_hundreds_component_at_least_one_none_zero_digit
            + delete_hyphen
            + (pynutil.delete("billiards") | pynutil.delete("billiard")),
            pynutil.insert("000", weight=0.1),
        )

        graph_trillions = pynini.union(  # One thousand English trillions.
            graph_hundreds_component_at_least_one_none_zero_digit
            + delete_hyphen
            + (pynutil.delete("trillions") | pynutil.delete("trillion")),
            pynutil.insert("000", weight=0.1),
        )

        graph_trilliards = pynini.union(
            graph_hundreds_component_at_least_one_none_zero_digit
            + delete_hyphen
            + (pynutil.delete("trilliards") | pynutil.delete("trilliard")),
            pynutil.insert("000", weight=0.1),
        )

        # Eight three-digit groups from trilliards down to units, or a "tens of hundreds" form, or zero
        graph = pynini.union(
            graph_trilliards
            + delete_hyphen
            + graph_trillions
            + delete_hyphen
            + graph_billiards
            + delete_hyphen
            + graph_billions
            + delete_hyphen
            + graph_milliards
            + delete_hyphen
            + graph_millions
            + delete_hyphen
            + graph_thousands
            + delete_hyphen
            + graph_hundreds_component,
            graph_tens_of_hundred_component,
            graph_zero,
        )

        # Strip the leading zeros produced by absent groups; a lone "0" is kept as is
        graph = graph @ pynini.union(
            pynutil.delete(pynini.closure("0")) + pynini.difference(NEMO_DIGIT, "0") + pynini.closure(NEMO_DIGIT), "0"
        )

        # Pre-compose the spelling rewrite so that space-separated (traditional) spelling is accepted too
        graph = rewrite(graph)

        self.graph_no_exception = graph.optimize()

        # save self.numbers_up_to_thousand for use in DecimalFst
        digits_up_to_thousand = NEMO_DIGIT | (NEMO_DIGIT ** 2) | (NEMO_DIGIT ** 3)
        numbers_up_to_thousand = pynini.compose(graph, digits_up_to_thousand).optimize()
        self.numbers_up_to_thousand = numbers_up_to_thousand

        # save self.numbers_up_to_million for use in DecimalFst
        digits_up_to_million = (
            NEMO_DIGIT
            | (NEMO_DIGIT ** 2)
            | (NEMO_DIGIT ** 3)
            | (NEMO_DIGIT ** 4)
            | (NEMO_DIGIT ** 5)
            | (NEMO_DIGIT ** 6)
        )
        numbers_up_to_million = pynini.compose(graph, digits_up_to_million).optimize()
        self.numbers_up_to_million = numbers_up_to_million

        # don't convert cardinals from zero to nine inclusive
        graph_exception = pynini.project(pynini.union(graph_digit, graph_zero), 'input')

        self.graph = (pynini.project(graph, "input") - graph_exception.arcsort()) @ graph

        # Optional leading "moins" becomes the field negative: "-"
        optional_minus_graph = pynini.closure(
            pynutil.insert("negative: ") + pynini.cross("moins", "\"-\"") + NEMO_SPACE, 0, 1
        )

        final_graph = optional_minus_graph + pynutil.insert("integer: \"") + self.graph + pynutil.insert("\"")

        final_graph = self.add_tokens(final_graph)
        self.fst = final_graph.optimize()
|
|
@ -0,0 +1,62 @@
|
|||
# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from nemo_text_processing.inverse_text_normalization.fr.graph_utils import GraphFst, delete_extra_space
|
||||
from nemo_text_processing.inverse_text_normalization.fr.utils import get_abs_path
|
||||
|
||||
try:
|
||||
import pynini
|
||||
from pynini.lib import pynutil
|
||||
|
||||
PYNINI_AVAILABLE = True
|
||||
except (ModuleNotFoundError, ImportError):
|
||||
|
||||
PYNINI_AVAILABLE = False
|
||||
|
||||
|
||||
class DateFst(GraphFst):
    """
    Finite state transducer for classifying dates written as: day month (year).
    The day is either a cardinal or "premier"; the year is optional.
        e.g. le vingt-quatre juillet deux-mille-treize -> date { day: "24" month: "juillet" year: "2013" preserve_order: true }
        e.g. le premier janvier -> date { day: "1" month: "janvier" preserve_order: true }

    Years spoken as tens of hundreds are handled by the cardinal graph as well.
        e.g. le vingt mai dix-neuf-cent-quatre -> date { day: "20" month: "mai" year: "1904" preserve_order: true }

    NOTE(review): the month value is whatever data/months.tsv maps the spoken month to; the examples
    assume it maps month names to themselves - confirm against the data file.

    Args:
        cardinal: CardinalFst
    """

    def __init__(self, cardinal: GraphFst):
        super().__init__(name="date", kind="classify")

        # Days and years may be single digits, so use the cardinal graph without exceptions.
        numbers = cardinal.graph_no_exception
        self.cardinal = numbers

        # "premier" is the only ordinal used for dates
        day_value = pynini.union(numbers, pynini.cross("premier", "1"))
        day_field = pynutil.insert("day: \"") + day_value + pynutil.insert("\"")

        month_value = pynini.string_file(get_abs_path("data/months.tsv"))
        month_field = pynutil.insert("month: \"") + month_value + pynutil.insert("\"")

        year_field = pynutil.insert("year: \"") + numbers + pynutil.insert("\"")
        optional_year = pynini.closure(delete_extra_space + year_field, 0, 1)

        tagged_date = (
            day_field
            + delete_extra_space
            + month_field
            + optional_year
            + pynutil.insert(" preserve_order: true")
        )
        self.fst = self.add_tokens(tagged_date).optimize()
|
|
@ -0,0 +1,134 @@
|
|||
# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from nemo_text_processing.inverse_text_normalization.fr.graph_utils import (
|
||||
NEMO_DIGIT,
|
||||
GraphFst,
|
||||
delete_extra_space,
|
||||
delete_hyphen,
|
||||
delete_space,
|
||||
)
|
||||
from nemo_text_processing.inverse_text_normalization.fr.utils import get_abs_path
|
||||
|
||||
try:
|
||||
import pynini
|
||||
from pynini.lib import pynutil
|
||||
|
||||
PYNINI_AVAILABLE = True
|
||||
except (ModuleNotFoundError, ImportError):
|
||||
PYNINI_AVAILABLE = False
|
||||
|
||||
|
||||
def get_quantity(decimal: 'pynini.FstLike', cardinal_up_to_thousand: 'pynini.FstLike') -> 'pynini.FstLike':
    """
    Returns FST that tags a cardinal or a decimal followed by a large-number word
    (million(s), milliard(s), billion(s), billiard(s), trillion(s), trilliard(s)).
    The large-number word is copied unchanged into the quantity field.
        e.g. deux millions -> integer_part: "2" quantity: "millions"
        e.g. un virgule cinq millions -> integer_part: "1" fractional_part: "5" quantity: "millions"

    Args:
        decimal: decimal FST (already emits its own integer_part / fractional_part fields)
        cardinal_up_to_thousand: cardinal FST used for the integer in front of the quantity word
    """
    # Drop any leading zeros of the cardinal output; at least one non-zero digit must remain.
    without_leading_zeros = (
        pynutil.delete(pynini.closure("0")) + pynini.difference(NEMO_DIGIT, "0") + pynini.closure(NEMO_DIGIT)
    )
    integer_value = cardinal_up_to_thousand @ without_leading_zeros

    magnitude = pynini.union(
        "million",
        "millions",
        "milliard",
        "milliards",
        "billion",
        "billions",
        "billiard",
        "billiards",
        "trillion",
        "trillions",
        "trilliard",
        "trilliards",
    )
    quantity_field = pynutil.insert(" quantity: \"") + magnitude + pynutil.insert("\"")

    integer_field = pynutil.insert("integer_part: \"") + integer_value + pynutil.insert("\"")
    # Can be written either as 'deux-millions' or 'deux millions' depending on whether
    # it registers as a noun or part of cardinal.
    joiner = pynini.union(delete_hyphen, delete_extra_space)

    cardinal_with_quantity = integer_field + joiner + quantity_field
    decimal_with_quantity = decimal + delete_extra_space + quantity_field
    return cardinal_with_quantity | decimal_with_quantity
|
||||
|
||||
|
||||
class DecimalFst(GraphFst):
    """
    Finite state transducer for classifying decimal
    Decimal point is "," (virgule).
        e.g. moins un virgule deux six -> decimal { negative: "true" integer_part: "1" fractional_part: "26" }

    This decimal rule assumes that decimals can be pronounced as:
    (a cardinal) + ('virgule') plus (any sequence of cardinals <1 million, including 'zero')
    The integer part in front of 'virgule' is optional. The negative field is only emitted
    when the input starts with 'moins'.

    Also writes large numbers in shortened form, e.g.
        e.g. un virgule deux six million -> decimal { integer_part: "1" fractional_part: "26" quantity: "million" }
        e.g. deux-millions -> decimal { integer_part: "2" quantity: "millions" }
        e.g. moins cent-vingt-quatre-millions -> decimal { negative: "true" integer_part: "124" quantity: "millions" }

    Attributes set by the constructor:
        graph: the fractional digit sequence graph (one or more cardinals <1 million)
        final_graph_wo_negative: unsigned decimal graph, with or without a quantity word
        fst: final tagger wrapped with add_tokens

    Args:
        cardinal: CardinalFst
    """

    def __init__(self, cardinal: GraphFst):
        super().__init__(name="decimal", kind="classify")

        # number after decimal point can be any series of cardinals <1 million, including 'zero'
        graph_decimal = cardinal.numbers_up_to_million
        graph_decimal = pynini.closure(graph_decimal + delete_space) + graph_decimal
        self.graph = graph_decimal

        # decimal point is denoted by virgule
        graph_fractional_separator = pynutil.delete("virgule")

        # Possible negatives
        optional_graph_negative = pynutil.insert("negative: ") + pynini.cross("moins", "\"true\"") + delete_extra_space
        optional_graph_negative = optional_graph_negative.ques

        # Fractional portion
        graph_fractional = pynutil.insert("fractional_part: \"") + graph_decimal + pynutil.insert("\"")

        # Integers
        cardinal_graph = cardinal.graph_no_exception | pynini.string_file(get_abs_path("data/numbers/zero.tsv"))
        graph_integer = pynutil.insert("integer_part: \"") + cardinal_graph + pynutil.insert("\"")

        # Final graphs
        final_graph_wo_sign = (
            pynini.closure(graph_integer + delete_extra_space, 0, 1)
            + graph_fractional_separator
            + delete_extra_space
            + graph_fractional
        )
        final_graph = optional_graph_negative + final_graph_wo_sign

        # Build the quantity graph once and reuse it for both the unsigned and the signed variants.
        graph_quantity = get_quantity(
            final_graph_wo_sign, cardinal.graph_hundreds_component_at_least_one_none_zero_digit
        )

        self.final_graph_wo_negative = final_graph_wo_sign | graph_quantity
        final_graph |= optional_graph_negative + graph_quantity
        final_graph = self.add_tokens(final_graph)
        self.fst = final_graph.optimize()
|
|
@ -0,0 +1,116 @@
|
|||
# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from nemo_text_processing.inverse_text_normalization.fr.graph_utils import NEMO_ALPHA, GraphFst, insert_space
|
||||
from nemo_text_processing.inverse_text_normalization.fr.utils import get_abs_path
|
||||
|
||||
try:
|
||||
import pynini
|
||||
from pynini.lib import pynutil
|
||||
|
||||
PYNINI_AVAILABLE = True
|
||||
except (ModuleNotFoundError, ImportError):
|
||||
PYNINI_AVAILABLE = False
|
||||
|
||||
|
||||
class ElectronicFst(GraphFst):
    """
    Finite state transducer for classifying 'electronic' semiotic classes, i.e.
    email addresses (tagged with "username" and "domain" fields)
    and URLs (tagged with a single "protocol" field).
        e.g. c d f une arobase a b c point e d u -> tokens { electronic { username: "cdf1" domain: "abc.edu" } }
        e.g. double vé double vé double vé a b c point e d u -> tokens { electronic { protocol: "www.abc.edu" } }
    """

    def __init__(self):
        super().__init__(name="electronic", kind="classify")

        # Exactly one space is consumed between spelled-out characters.
        drop_space = pynutil.delete(" ")

        spoken_digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
        spoken_zero = pynini.string_file(get_abs_path("data/numbers/zero.tsv"))
        alpha_num = NEMO_ALPHA | spoken_digit | spoken_zero

        symbols = pynini.string_file(get_abs_path("data/electronic/symbols.tsv"))
        username_char = alpha_num | symbols
        dot = pynini.cross("point", ".")

        # Spoken forms of the "@" sign; they are deleted, not written to the output.
        at_sign = pynini.string_map(["arobase", "chez", "at", "à"])

        ############# email ###
        # A username has at least two characters and must start and end with a letter or digit.
        username_body = (
            alpha_num + drop_space + pynini.closure(username_char + drop_space) + alpha_num
        )
        username_field = pynutil.insert("username: \"") + username_body + pynutil.insert("\"")

        spelled_out = pynini.closure(alpha_num + drop_space) + alpha_num
        server = spelled_out | pynini.string_file(get_abs_path("data/electronic/server_name.tsv"))
        domain = spelled_out | pynini.string_file(get_abs_path("data/electronic/domain.tsv"))

        domain_body = server + drop_space + dot + drop_space + domain
        domain_field = pynutil.insert("domain: \"") + domain_body + pynutil.insert("\"")

        email = (
            username_field
            + drop_space
            + pynutil.delete(at_sign)
            + insert_space
            + drop_space
            + domain_field
        )

        ############# url ###
        www = pynini.cross(pynini.union("www", "w w w", "double vé double vé double vé"), "www")

        http = pynini.cross(pynini.union("http", "h t t p", "ache té té pé"), "http")
        https = pynini.cross(pynini.union("https", "h t t p s", "ache té té pé esse"), "https")
        colon_slashes = pynini.cross(
            pynini.union(
                " deux-points barre oblique barre oblique ",
                " deux-points barre barre ",
                " deux-points double barre ",
                " deux-points slash slash ",
            ),
            "://",
        )
        scheme = (http | https) + colon_slashes

        # e.g. .com, .es
        ending_tail = domain | (pynini.closure(username_char + drop_space) + username_char)
        ending = drop_space + symbols + drop_space + ending_tail

        # NOTE(review): the first alternative deletes another space right after the one deleted
        # just before it, so it looks like it only matches when two spaces follow "point" - confirm intent.
        host = pynini.closure(drop_space + username_char, 1) | server

        url_body = (
            pynini.closure(scheme, 0, 1)
            + www
            + drop_space
            + dot
            + drop_space
            + host
            + pynini.closure(ending, 1)
        )
        url = pynutil.insert("protocol: \"") + url_body + pynutil.insert("\"")
        ########

        self.fst = self.add_tokens(email | url).optimize()
|
|
@ -0,0 +1,81 @@
|
|||
# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from nemo_text_processing.inverse_text_normalization.fr.graph_utils import (
|
||||
NEMO_CHAR,
|
||||
GraphFst,
|
||||
delete_extra_space,
|
||||
delete_space,
|
||||
)
|
||||
from nemo_text_processing.inverse_text_normalization.fr.utils import get_abs_path
|
||||
|
||||
try:
|
||||
import pynini
|
||||
from pynini.lib import pynutil
|
||||
|
||||
PYNINI_AVAILABLE = True
|
||||
except (ModuleNotFoundError, ImportError):
|
||||
PYNINI_AVAILABLE = False
|
||||
|
||||
|
||||
class FractionFst(GraphFst):
    """
    Finite state transducer for classifying fraction
        e.g. demi -> tokens { fraction { numerator: "1" denominator: "2" } }
        e.g. un et demi -> tokens { fraction { integer_part: "1" numerator: "1" denominator: "2" } }
        e.g. trois et deux centième -> tokens { fraction { integer_part: "3" numerator: "2" denominator: "100" } }

    Attributes set by the constructor:
        fractional: maps a denominator word (cardinal root + ordinal ending) to its digits
        graph_fraction_component: numerator + denominator, or a bare "demi"
        final_graph_wo_negative: optional integer part + fraction, without sign handling
        fst: final tagger wrapped with add_tokens

    Args:
        cardinal: CardinalFst
    """

    def __init__(self, cardinal: GraphFst):
        super().__init__(name="fraction", kind="classify")
        # Output fields, in order: integer_part, numerator, denominator

        numbers = cardinal.graph_no_exception

        # Denominators: strip the ordinal ending, then read what is left as a cardinal.
        strip_plain_suffix = pynutil.delete("ième") + pynutil.delete("s").ques  # root unchanged, optional plural
        strip_and_restore_root = pynini.string_file(get_abs_path("data/fractions.tsv"))  # endings listed in the data file
        strip_suffix = strip_plain_suffix | strip_and_restore_root
        self.fractional = ((pynini.closure(NEMO_CHAR) + strip_suffix) @ numbers).optimize()

        # Integer part; "et" demarcates integer and fractional parts
        integer_field = (
            pynutil.insert("integer_part: \"")
            + numbers
            + pynutil.insert("\" ")
            + delete_space
            + pynutil.delete("et")
        )

        numerator_field = pynutil.insert("numerator: \"") + numbers + pynutil.insert("\"")
        denominator_field = pynutil.insert(" denominator: \"") + self.fractional + pynutil.insert("\"")

        # Demi (half) can occur alone without explicit numerator.
        demi = (
            pynutil.delete("demi")
            + pynutil.delete("e").ques
            + pynutil.delete("s").ques
            + pynutil.insert("numerator: \"1\" denominator: \"2\"")
        )

        fraction_component = (numerator_field + delete_space + denominator_field) | demi
        self.graph_fraction_component = fraction_component

        unsigned = pynini.closure(integer_field + delete_space, 0, 1) + fraction_component
        self.final_graph_wo_negative = unsigned.optimize()

        optional_negative = pynini.closure(
            pynutil.insert("negative: ") + pynini.cross("moins", "\"true\"") + delete_extra_space, 0, 1
        )
        signed = optional_negative + self.final_graph_wo_negative
        self.fst = self.add_tokens(signed).optimize()
|
|
@ -0,0 +1,98 @@
|
|||
# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from nemo_text_processing.inverse_text_normalization.fr.graph_utils import (
|
||||
GraphFst,
|
||||
delete_extra_space,
|
||||
delete_space,
|
||||
get_singulars,
|
||||
)
|
||||
from nemo_text_processing.inverse_text_normalization.fr.utils import get_abs_path
|
||||
|
||||
try:
|
||||
import pynini
|
||||
from pynini.lib import pynutil
|
||||
|
||||
PYNINI_AVAILABLE = True
|
||||
except (ModuleNotFoundError, ImportError):
|
||||
PYNINI_AVAILABLE = False
|
||||
|
||||
|
||||
class MeasureFst(GraphFst):
    """
    Finite state transducer for classifying measure. Allows for plural form for unit.
    The optional "negative" field is written inside the inner cardinal / decimal / fraction class.
        e.g. moins onze kilogramme -> measure { cardinal { negative: "true" integer: "11" } units: "kg" }
        e.g. trois heures -> measure { cardinal { integer: "3" } units: "h" }
        e.g. demi gramme -> measure { fraction { numerator: "1" denominator: "2" } units: "g" }

    Args:
        cardinal: CardinalFst
        decimal: DecimalFst
        fraction: FractionFst
    """

    def __init__(self, cardinal: GraphFst, decimal: GraphFst, fraction: GraphFst):
        super().__init__(name="measure", kind="classify")

        # Units: optional magnitude prefix + unit name (as listed, or as returned by get_singulars)
        magnitude_prefix = pynini.string_file(get_abs_path("data/measurements/magnitudes.tsv"))
        listed_units = pynini.string_file(get_abs_path("data/measurements/measurements.tsv"))
        base_unit = magnitude_prefix.ques + (get_singulars(listed_units) | listed_units)

        # "par <unit>" / "à <unit>" is written as "/<unit>"
        per_unit = pynutil.insert("/") + (pynutil.delete("par") | pynutil.delete("à")) + delete_space + base_unit
        compound_unit = pynutil.add_weight(base_unit + delete_space + per_unit, 0.01)
        units_field = pynutil.insert("units: \"") + (base_unit | per_unit | compound_unit) + pynutil.insert("\"")

        optional_negative = pynini.closure(
            pynutil.insert("negative: ") + pynini.cross("moins", "\"true\"") + delete_extra_space, 0, 1
        )

        def with_units(opening, value):
            # <opening> [negative] <value> " }" followed by the units field
            return (
                pynutil.insert(opening)
                + optional_negative
                + value
                + pynutil.insert(" }")
                + delete_extra_space
                + units_field
            )

        integer_field = pynutil.insert("integer: \"") + cardinal.graph_no_exception + pynutil.insert("\"")

        measure_decimal = with_units("decimal { ", decimal.final_graph_wo_negative)
        measure_cardinal = with_units("cardinal { ", integer_field)
        measure_fraction = with_units("fraction { ", fraction.final_graph_wo_negative)

        tagged = pynini.union(measure_decimal, measure_cardinal, measure_fraction)
        self.fst = self.add_tokens(tagged).optimize()
|
|
@ -0,0 +1,140 @@
|
|||
# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from nemo_text_processing.inverse_text_normalization.fr.graph_utils import (
|
||||
NEMO_DIGIT,
|
||||
GraphFst,
|
||||
delete_extra_space,
|
||||
delete_space,
|
||||
)
|
||||
from nemo_text_processing.inverse_text_normalization.fr.utils import get_abs_path
|
||||
|
||||
try:
|
||||
import pynini
|
||||
from pynini.lib import pynutil
|
||||
|
||||
PYNINI_AVAILABLE = True
|
||||
except (ModuleNotFoundError, ImportError):
|
||||
PYNINI_AVAILABLE = False
|
||||
|
||||
|
||||
class MoneyFst(GraphFst):
    """
    Finite state transducer for classifying money
        e.g. douze euro cinq -> money { integer_part: "12" currency: "€" fractional_part: "05" }
        e.g. zéro euro cinq -> money { integer_part: "0" currency: "€" fractional_part: "05" }
        e.g. cinq centimes -> money { integer_part: "0" fractional_part: "05" currency: "€" }

    The currency symbols themselves come from data/money/currency_major.tsv and
    data/money/currency_minor.tsv.

    Note, the currency symbol seems more common for exact amounts and quantities less than 'un million'.
    For 'round' quantities of >=million (milliard, billion), the symbol is dropped. This allows
    use of the 'de' preposition: the spoken currency name is kept verbatim, preposition included,
    and is appended after whatever tags the decimal grammar produces for the quantity.
        e.g. cinq millions d'euros -> money { <decimal tags for "cinq millions"> currency: "d'euros" }
        e.g. un milliard d'euro -> money { <decimal tags for "un milliard"> currency: "d'euro" }

    Currency is included for uniform tagging.

    Args:
        cardinal: CardinalFst
        decimal: DecimalFst
    """

    def __init__(self, cardinal: GraphFst, decimal: GraphFst):
        super().__init__(name="money", kind="classify")

        def quoted(opening, content):
            # Emits `opening` (a field name plus opening quote), then `content`, then the closing quote.
            return pynutil.insert(opening) + content + pynutil.insert("\"")

        numbers = cardinal.graph_no_exception
        tagged_decimal = decimal.final_graph_wo_negative  # already carries its own tags and quantity

        # Spoken currency name -> symbol, for major and minor denominations.
        major_to_symbol = pynini.string_file(get_abs_path("data/money/currency_major.tsv"))
        minor_to_symbol = pynini.string_file(get_abs_path("data/money/currency_minor.tsv"))

        # Acceptor over every spoken currency name (input side of both tables).
        spoken_currency = pynini.project(pynini.union(major_to_symbol, minor_to_symbol), "input")

        # Large round amounts ('deux billiards d'euros', 'un milliard de dollars'):
        # de/des/d' only occurs with these, and the currency is passed through as spoken.
        preposition = pynini.union("de ", "des ", "d'")
        large_round_currency = quoted(" currency: \"", preposition + spoken_currency)
        large_round_amounts = tagged_decimal + delete_space + large_round_currency

        # Standard amounts: the minor part is always rendered with two digits ("5" -> "05").
        two_digits = pynini.union(NEMO_DIGIT + NEMO_DIGIT, pynutil.insert("0") + NEMO_DIGIT)
        padded_minor_amount = numbers @ two_digits

        # <number> <major currency> [et] [<number>] [<minor currency>]
        major_integer = quoted("integer_part: \"", numbers) + delete_space
        major_currency = quoted(" currency: \"", major_to_symbol)
        major_fraction = delete_space + quoted(" fractional_part: \"", padded_minor_amount)

        # Rare cases where 'et' separates the major and minor denominations.
        optional_et = pynini.closure(delete_extra_space + pynutil.delete("et"), 0, 1)
        optional_minor_name = pynini.closure(
            delete_extra_space + pynutil.delete(pynini.project(minor_to_symbol, "input")), 0, 1
        )

        money_major = (
            major_integer
            + major_currency
            + optional_et
            + pynini.closure(major_fraction, 0, 1)
            + optional_minor_name
        )

        # Only a minor denomination was spoken: the integer part is forced to "0".
        money_minor = (
            pynutil.insert("integer_part: \"0\"")
            + quoted(" fractional_part: \"", padded_minor_amount)
            + delete_extra_space
            + quoted(" currency: \"", minor_to_symbol)
        )

        all_amounts = pynini.union(large_round_amounts, money_major, money_minor)
        self.fst = self.add_tokens(all_amounts).optimize()
|
|
@ -0,0 +1,83 @@
|
|||
# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from nemo_text_processing.inverse_text_normalization.fr.graph_utils import NEMO_SIGMA, GraphFst, delete_space
|
||||
from nemo_text_processing.inverse_text_normalization.fr.utils import get_abs_path
|
||||
|
||||
try:
|
||||
import pynini
|
||||
from pynini.lib import pynutil
|
||||
|
||||
PYNINI_AVAILABLE = True
|
||||
except (ImportError, ModuleNotFoundError):
|
||||
PYNINI_AVAILABLE = False
|
||||
|
||||
|
||||
class OrdinalFst(GraphFst):
    """
    Finite state transducer for classifying ordinal
        vingt-deuxième -> ordinal { integer: "22" morphosyntactic_features: "e" }

    Also notes specific nouns that have unique normalization conventions.
    For instance, 'siècles' are rendered in roman numerals when given an ordinal adjective.
        e.g. dix-neuvième siècle -> XIXe
    Such nouns (listed in data/ordinals/key_nouns.tsv) are appended to the
    morphosyntactic_features value after a "/" so the verbalizer can see them.

    Args:
        cardinal: CardinalFst
    """

    def __init__(self, cardinal: GraphFst):
        super().__init__(name="ordinal", kind="classify")

        cardinal_numbers = cardinal.graph_no_exception

        # Data tables.
        undo_root_change = pynini.string_file(
            get_abs_path("data/ordinals/digits_root_change.tsv")
        )  # Returns base number to normal after root change.
        firsts = pynini.string_file(get_abs_path("data/ordinals/firsts.tsv"))
        second = pynini.string_file(get_abs_path("data/ordinals/second.tsv"))
        key_nouns = pynini.string_file(get_abs_path("data/ordinals/key_nouns.tsv"))

        # Strip the ordinal ending: either drop a plain "ième" (root unchanged) or
        # rewrite a changed root back to its cardinal form; anything may precede it.
        ending = pynini.union(pynutil.delete("ième"), undo_root_change)
        to_cardinal_words = NEMO_SIGMA + ending

        # Cardinal words -> digits.
        integer_value = to_cardinal_words @ cardinal_numbers

        # Superscript "e", keeping a spoken plural "s" when present.
        superscript = pynutil.insert("e") + pynini.closure(pynini.accep("s"), 0, 1)

        # The features quote is left open on purpose: more may be appended below.
        regular = (
            pynutil.insert("integer: \"")
            + integer_value
            + pynutil.insert("\"")
            + pynutil.insert(" morphosyntactic_features: \"")
            + superscript
        )

        # 'premier' / 'première' carry different superscripts depending on gender.
        first_forms = pynutil.insert("integer: \"1\" morphosyntactic_features: \"") + firsts

        # 'second' uses 'd' as a superscript.
        second_forms = pynutil.insert("integer: \"2\" morphosyntactic_features: \"") + second

        ordinals = pynini.union(first_forms, second_forms, regular)

        # For roman numerals. Carries over designation to verbalizer.
        roman_marker = pynutil.insert("/") + delete_space + key_nouns

        # Optional key noun, then finally close the features quote.
        ordinals = ordinals + pynini.closure(roman_marker, 0, 1) + pynutil.insert("\"")

        self.fst = self.add_tokens(ordinals).optimize()
|
|
@ -0,0 +1,42 @@
|
|||
# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from nemo_text_processing.inverse_text_normalization.fr.graph_utils import GraphFst
|
||||
|
||||
# Optional dependency guard: PYNINI_AVAILABLE records whether pynini (and its
# pynutil helpers) could be imported, so the flag must be True on success.
try:
    import pynini
    from pynini.lib import pynutil

    PYNINI_AVAILABLE = True
except (ModuleNotFoundError, ImportError):
    PYNINI_AVAILABLE = False
|
||||
|
||||
|
||||
class PunctuationFst(GraphFst):
    """
    Finite state transducer for classifying punctuation
        e.g. a, -> tokens { name: "a" } tokens { name: "," }

    This grammar only emits the `name: "<mark>"` field for a single punctuation
    character; it does not add the surrounding token wrapper itself.
    """

    def __init__(self):
        super().__init__(name="punctuation", kind="classify")

        # ASCII marks handled here (note: no double quote, square brackets or backslash).
        ascii_marks = "!#$%&\'()*+,-./:;<=>?@^_`{|}~"
        # Quotation marks in French: « and ».
        opening_guillemet = "\u00AB"
        closing_guillemet = "\u00BB"

        all_marks = ascii_marks + opening_guillemet + closing_guillemet

        # One acceptor matching exactly one of the characters above.
        any_mark = pynini.union(*all_marks)

        tagged_mark = pynutil.insert("name: \"") + any_mark + pynutil.insert("\"")
        self.fst = tagged_mark.optimize()
|
|
@ -0,0 +1,88 @@
|
|||
# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from nemo_text_processing.inverse_text_normalization.fr.graph_utils import (
|
||||
GraphFst,
|
||||
delete_hyphen,
|
||||
delete_space,
|
||||
insert_space,
|
||||
)
|
||||
from nemo_text_processing.inverse_text_normalization.fr.utils import get_abs_path
|
||||
|
||||
try:
|
||||
import pynini
|
||||
from pynini.lib import pynutil
|
||||
|
||||
PYNINI_AVAILABLE = True
|
||||
except (ModuleNotFoundError, ImportError):
|
||||
PYNINI_AVAILABLE = False
|
||||
|
||||
|
||||
class TelephoneFst(GraphFst):
    """
    Finite state transducer for classifying telephone numbers. Assumes conventional grouping for
    Metropolitan France (and overseas departments): five pairs of digits, where each pair is spoken
    either as a two digit cardinal or digit by digit (chiffre-par-chiffre) e.g.
        "zero un quarante-deux zero deux vingt-deux cinquante" -> { number_part: "01 42 02 22 50" }
        "zero un quatre deux zero deux deux deux cinq zero" -> { number_part: "01 42 02 22 50" }

    In cases where only one digit of the first pairing is admitted, assumes that the 0 was skipped.
        "une vingt-trois quarante zero six dix-sept" -> { number_part: "01 23 40 06 17" }
    """

    def __init__(self):
        super().__init__(name="telephone", kind="classify")

        # Number tables: the building blocks of possible telephone numbers.
        digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
        ties = pynini.string_file(get_abs_path("data/numbers/ties.tsv"))
        ties_unique = pynini.string_file((get_abs_path("data/numbers/ties_unique.tsv")))
        teen = pynini.string_file(get_abs_path("data/numbers/teen.tsv"))
        zero = pynini.string_file(get_abs_path("data/numbers/zero.tsv"))

        # A pair spoken as one two digit number.
        spoken_as_number = pynini.union(
            teen,
            ties_unique,
            ties + pynutil.insert("0"),  # bare tie: fill in the missing units digit
            ties + delete_hyphen + digit,
        )

        # A pair spoken digit by digit.
        any_digit = pynini.union(digit, zero)
        spoken_as_digits = any_digit + delete_space + any_digit

        # Either reading of a pair.
        either_reading = pynini.union(spoken_as_number, spoken_as_digits)

        # First pair: "zero <digit>", or a lone digit if the zero is omitted.
        first_pair = pynini.union(zero + delete_space + digit, pynutil.insert("0") + digit)
        # delete_space since closure allows possible gaps to be removed
        first_pair = first_pair + delete_space + insert_space

        def five_pairs(pair):
            # first pair + exactly three space separated middle pairs + a final pair
            middle = pynini.closure(pair + delete_space + insert_space, 3, 3)
            return first_pair + middle + pair

        digits_only_number = five_pairs(spoken_as_digits)
        mixed_number = five_pairs(either_reading)

        number = pynini.union(digits_only_number, mixed_number)
        tagged_number = pynutil.insert("number_part: \"") + number + pynutil.insert("\"")

        self.fst = self.add_tokens(tagged_number).optimize()
|
|
@ -0,0 +1,121 @@
|
|||
# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from nemo_text_processing.inverse_text_normalization.fr.graph_utils import GraphFst, delete_space
|
||||
from nemo_text_processing.inverse_text_normalization.fr.utils import get_abs_path
|
||||
|
||||
try:
|
||||
import pynini
|
||||
from pynini.lib import pynutil
|
||||
|
||||
PYNINI_AVAILABLE = True
|
||||
except (ModuleNotFoundError, ImportError):
|
||||
PYNINI_AVAILABLE = False
|
||||
|
||||
|
||||
class TimeFst(GraphFst):
    """
    Finite state transducer for classifying time
        e.g. treize heures dix -> time { hours: "13" minutes: "10" }
        e.g. douze heures et quart -> time { hours: "12" minutes: "15" }
        e.g. midi et quart -> time { hours: "12" minutes: "15" }
        e.g. minuit et demi -> time { hours: "0" minutes: "30" }
        e.g. douze heures moins quart -> time { hours: "11" minutes: "45" }
        e.g. douze heures moins demi -> time { hours: "11" minutes: "30" }
        e.g. douze heures moins trois -> time { hours: "11" minutes: "57" }

    The minutes field is only produced by this tagger when minutes (or a fraction of the hour)
    are spoken. An optional trailing time-of-day expression, as listed in
    data/time/time_suffix_am.tsv and data/time/time_suffix_pm.tsv, is tagged as
    suffix: "am" or suffix: "pm".
    """

    def __init__(self):
        super().__init__(name="time", kind="classify")
        # hours, minutes, seconds, suffix, zone, style, speak_period

        # time_zone = pynini.invert(pynini.string_file(get_abs_path("data/time/time_zone.tsv")))
        hours_to = pynini.string_file(get_abs_path("data/time/hours_to.tsv"))
        minutes_to = pynini.string_file(get_abs_path("data/time/minutes_to.tsv"))
        hours = pynini.string_file(get_abs_path("data/time/hours.tsv"))
        minutes = pynini.string_file(get_abs_path("data/time/minutes.tsv"))
        suffix_am = pynini.string_file(get_abs_path("data/time/time_suffix_am.tsv"))
        suffix_pm = pynini.string_file(get_abs_path("data/time/time_suffix_pm.tsv"))

        suffix = pynini.union(pynini.cross(suffix_am, "am"), pynini.cross(suffix_pm, "pm"))

        # An hour number must be followed by 'heure' / 'heures', which is dropped.
        heures = pynutil.delete(pynini.accep("heure") + pynini.closure(pynini.accep("s"), 0, 1))
        hours = hours + delete_space + heures

        # Midi and minuit stand alone (no 'heures').
        midi = pynini.cross("midi", "12")
        minuit = pynini.cross("minuit", "0")
        spoken_hour = pynini.union(hours, midi, minuit)

        # Fractions of the hour.
        demi = pynini.accep("demi") + pynini.closure(pynini.accep("e"), 0, 1)  # people vary on feminine or masculine form
        half = pynini.cross(demi, "30")
        le_quart = pynini.closure(pynini.accep("le "), 0, 1) + pynini.accep('quart')  # 'le' is sometimes used
        quarter = pynini.cross(le_quart, '15')
        three_quarters = pynini.cross("trois quarts", "45")
        fractions = pynini.union(half, quarter, three_quarters)

        # After the hour, fractions are introduced by the conjunction 'et'.
        et_fractions = pynutil.delete("et") + delete_space + fractions

        # Standard reading: hour, then optionally minutes or 'et' + fraction.
        hours_field = pynutil.insert("hours: \"") + spoken_hour + pynutil.insert("\"") + delete_space
        minutes_field = (
            pynutil.insert(" minutes: \"") + pynini.union(minutes, et_fractions) + pynutil.insert("\"")
        )
        time_standard = hours_field + pynini.closure(minutes_field, 0, 1)

        # Time until the hour. "quatre heures moins quart" -> 4 h 00 - 0 h 15 = 3 h 45.
        # Both values are remapped through the *_to tables.
        moins = pynutil.delete("moins") + delete_space
        hours_to_field = (
            pynutil.insert("hours: \"") + (spoken_hour @ hours_to) + pynutil.insert("\"") + delete_space
        )
        minutes_to_field = (
            pynutil.insert(" minutes: \"")
            + (pynini.union(minutes, fractions) @ minutes_to)
            + pynutil.insert("\"")
        )
        time_to = hours_to_field + moins + minutes_to_field

        time_no_suffix = pynini.union(time_standard, time_to)

        # Optional am/pm suffix.
        suffix_field = delete_space + pynutil.insert(" suffix: \"") + suffix + pynutil.insert("\"")
        tagged_time = time_no_suffix + pynini.closure(suffix_field, 0, 1)

        self.fst = self.add_tokens(tagged_time).optimize()
|
|
@ -0,0 +1,122 @@
|
|||
# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import os
|
||||
|
||||
from nemo_text_processing.inverse_text_normalization.fr.graph_utils import (
|
||||
GraphFst,
|
||||
delete_extra_space,
|
||||
delete_space,
|
||||
generator_main,
|
||||
)
|
||||
from nemo_text_processing.inverse_text_normalization.fr.taggers.cardinal import CardinalFst
|
||||
from nemo_text_processing.inverse_text_normalization.fr.taggers.date import DateFst
|
||||
from nemo_text_processing.inverse_text_normalization.fr.taggers.decimal import DecimalFst
|
||||
from nemo_text_processing.inverse_text_normalization.fr.taggers.electronic import ElectronicFst
|
||||
from nemo_text_processing.inverse_text_normalization.fr.taggers.fraction import FractionFst
|
||||
from nemo_text_processing.inverse_text_normalization.fr.taggers.measure import MeasureFst
|
||||
from nemo_text_processing.inverse_text_normalization.fr.taggers.money import MoneyFst
|
||||
from nemo_text_processing.inverse_text_normalization.fr.taggers.ordinal import OrdinalFst
|
||||
from nemo_text_processing.inverse_text_normalization.fr.taggers.punctuation import PunctuationFst
|
||||
from nemo_text_processing.inverse_text_normalization.fr.taggers.telephone import TelephoneFst
|
||||
from nemo_text_processing.inverse_text_normalization.fr.taggers.time import TimeFst
|
||||
from nemo_text_processing.inverse_text_normalization.fr.taggers.whitelist import WhiteListFst
|
||||
from nemo_text_processing.inverse_text_normalization.fr.taggers.word import WordFst
|
||||
|
||||
from nemo.utils import logging
|
||||
|
||||
try:
|
||||
import pynini
|
||||
from pynini.lib import pynutil
|
||||
|
||||
PYNINI_AVAILABLE = True
|
||||
except (ModuleNotFoundError, ImportError):
|
||||
PYNINI_AVAILABLE = False
|
||||
|
||||
|
||||
class ClassifyFst(GraphFst):
    """
    Final class that composes all other classification grammars. This class can process an entire sentence, that is lower cased.
    For deployment, this grammar will be compiled and exported to an OpenFst Finite State Archive (FAR) file.
    More details on deployment at NeMo/tools/text_processing_deployment.

    Args:
        cache_dir: path to a dir with .far grammar file. Set to None to avoid using cache.
        overwrite_cache: set to True to overwrite .far files
    """

    def __init__(self, cache_dir: str = None, overwrite_cache: bool = False):
        super().__init__(name="tokenize_and_classify", kind="classify")

        # The literal string "None" is treated like None, i.e. no cache is used.
        far_file = None
        if cache_dir is not None and cache_dir != "None":
            os.makedirs(cache_dir, exist_ok=True)
            far_file = os.path.join(cache_dir, "_fr_itn.far")

        if not overwrite_cache and far_file and os.path.exists(far_file):
            # Reuse the previously compiled grammar.
            self.fst = pynini.Far(far_file, mode="r")["tokenize_and_classify"]
            logging.info(f"ClassifyFst.fst was restored from {far_file}.")
        else:
            logging.info("Creating ClassifyFst grammars.")

            # Grammars that other grammars are built from are kept as objects.
            cardinal = CardinalFst()
            cardinal_graph = cardinal.fst

            fraction = FractionFst(cardinal)
            fraction_graph = fraction.fst

            ordinal = OrdinalFst(cardinal)
            ordinal_graph = ordinal.fst

            decimal = DecimalFst(cardinal)
            decimal_graph = decimal.fst

            measure_graph = MeasureFst(cardinal=cardinal, decimal=decimal, fraction=fraction).fst
            date_graph = DateFst(cardinal).fst
            word_graph = WordFst().fst
            time_graph = TimeFst().fst
            money_graph = MoneyFst(cardinal, decimal).fst
            whitelist_graph = WhiteListFst().fst
            punct_graph = PunctuationFst().fst
            electronic_graph = ElectronicFst().fst
            telephone_graph = TelephoneFst().fst

            # Lower weight wins: whitelist is preferred, plain words are the fallback.
            classify = (
                pynutil.add_weight(whitelist_graph, 1.01)
                | pynutil.add_weight(time_graph, 1.05)
                | pynutil.add_weight(date_graph, 1.09)
                | pynutil.add_weight(decimal_graph, 1.08)
                | pynutil.add_weight(measure_graph, 1.1)
                | pynutil.add_weight(cardinal_graph, 1.1)
                | pynutil.add_weight(ordinal_graph, 1.1)
                | pynutil.add_weight(fraction_graph, 1.09)
                | pynutil.add_weight(money_graph, 1.07)
                | pynutil.add_weight(telephone_graph, 1.1)
                | pynutil.add_weight(electronic_graph, 1.1)
                | pynutil.add_weight(word_graph, 100)
            )

            # Each classified item and each punctuation mark becomes its own `tokens { ... }`.
            punct = pynutil.insert("tokens { ") + pynutil.add_weight(punct_graph, weight=1.1) + pynutil.insert(" }")
            token = pynutil.insert("tokens { ") + classify + pynutil.insert(" }")
            # A token may have any number of punctuation marks attached before and after it.
            token_plus_punct = (
                pynini.closure(punct + pynutil.insert(" ")) + token + pynini.closure(pynutil.insert(" ") + punct)
            )

            # A sentence is one or more such tokens separated by whitespace.
            graph = token_plus_punct + pynini.closure(delete_extra_space + token_plus_punct)
            graph = delete_space + graph + delete_space

            self.fst = graph.optimize()

            if far_file:
                generator_main(far_file, {"tokenize_and_classify": self.fst})
                logging.info(f"ClassifyFst grammars are saved to {far_file}.")
|
|
@ -0,0 +1,39 @@
|
|||
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from nemo_text_processing.inverse_text_normalization.fr.graph_utils import GraphFst, convert_space
|
||||
from nemo_text_processing.inverse_text_normalization.fr.utils import get_abs_path
|
||||
|
||||
try:
|
||||
import pynini
|
||||
from pynini.lib import pynutil
|
||||
|
||||
PYNINI_AVAILABLE = True
|
||||
except (ModuleNotFoundError, ImportError):
|
||||
PYNINI_AVAILABLE = False
|
||||
|
||||
|
||||
class WhiteListFst(GraphFst):
    """
    Finite state transducer for classifying whitelisted tokens
        e.g. misses -> tokens { name: "mrs." }
    This class has highest priority among all classifier grammars.
    Whitelisted tokens are defined and loaded from "data/whitelist.tsv".
    """

    def __init__(self):
        super().__init__(name="whitelist", kind="classify")

        # Spoken form -> written form table; the result is passed through convert_space.
        entries = convert_space(pynini.string_file(get_abs_path("data/whitelist.tsv")))

        tagged_entry = pynutil.insert("name: \"") + entries + pynutil.insert("\"")
        self.fst = tagged_entry.optimize()
|
|
@ -0,0 +1,35 @@
|
|||
# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from nemo_text_processing.inverse_text_normalization.fr.graph_utils import NEMO_NOT_SPACE, GraphFst
|
||||
|
||||
try:
|
||||
import pynini
|
||||
from pynini.lib import pynutil
|
||||
|
||||
PYNINI_AVAILABLE = True
|
||||
except (ModuleNotFoundError, ImportError):
|
||||
PYNINI_AVAILABLE = False
|
||||
|
||||
|
||||
class WordFst(GraphFst):
    """
    Finite state transducer for classifying plain tokens, that do not belong to any special class.
    This can be considered as the default class.
        e.g. sleep -> tokens { name: "sleep" }
    """

    def __init__(self):
        super().__init__(name="word", kind="classify")

        # A word is any run of one or more non-space characters, passed through unchanged.
        non_space_run = pynini.closure(NEMO_NOT_SPACE, 1)
        tagged_word = pynutil.insert("name: \"") + non_space_run + pynutil.insert("\"")

        self.fst = tagged_word.optimize()
|
27
nemo_text_processing/inverse_text_normalization/fr/utils.py
Normal file
27
nemo_text_processing/inverse_text_normalization/fr/utils.py
Normal file
|
@ -0,0 +1,27 @@
|
|||
# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import os
|
||||
|
||||
|
||||
def get_abs_path(rel_path):
    """
    Resolve a path given relative to this module's directory.

    Args:
        rel_path: path relative to the directory containing this file

    Returns the directory of this file, a '/' and `rel_path`, joined as one string
    """
    module_dir = os.path.dirname(os.path.abspath(__file__))
    return '/'.join((module_dir, rel_path))
|
|
@ -0,0 +1,13 @@
|
|||
# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
|
@ -0,0 +1,54 @@
|
|||
# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from nemo_text_processing.inverse_text_normalization.fr.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space
|
||||
|
||||
try:
|
||||
import pynini
|
||||
from pynini.lib import pynutil
|
||||
|
||||
PYNINI_AVAILABLE = True
|
||||
except (ModuleNotFoundError, ImportError):
|
||||
PYNINI_AVAILABLE = False
|
||||
|
||||
|
||||
class CardinalFst(GraphFst):
    """
    Finite state transducer for verbalizing cardinal
        e.g. cardinal { negative: "-" integer: "23" } -> -23

    The sign-less integer grammar is also exposed as `self.numbers`.
    """

    def __init__(self):
        super().__init__(name="cardinal", kind="verbalize")

        open_quote = pynutil.delete("\"")
        close_quote = pynutil.delete("\"")

        # integer: "<value>" -> <value>  (value is one or more non-quote characters)
        self.numbers = (
            pynutil.delete("integer:")
            + delete_space
            + open_quote
            + pynini.closure(NEMO_NOT_QUOTE, 1)
            + close_quote
        )

        # negative: "<c>" -> <c>  (exactly one non-quote character); the whole field is optional
        sign = (
            pynutil.delete("negative:") + delete_space + open_quote + NEMO_NOT_QUOTE + close_quote + delete_space
        )
        signed_number = sign.ques + self.numbers

        self.fst = self.delete_tokens(signed_number).optimize()
|
|
@ -0,0 +1,82 @@
|
|||
# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from nemo_text_processing.inverse_text_normalization.fr.graph_utils import (
|
||||
NEMO_NOT_QUOTE,
|
||||
GraphFst,
|
||||
delete_extra_space,
|
||||
delete_space,
|
||||
)
|
||||
|
||||
try:
|
||||
import pynini
|
||||
from pynini.lib import pynutil
|
||||
|
||||
PYNINI_AVAILABLE = True
|
||||
except (ModuleNotFoundError, ImportError):
|
||||
PYNINI_AVAILABLE = False
|
||||
|
||||
|
||||
class DateFst(GraphFst):
    """
    Finite state transducer for verbalizing date, e.g.
        date { day: "1" month: "janvier" preserve_order: true } -> 1ᵉʳ janvier
        date { day: "14" month: "juillet" year: "1789" } -> 14 juillet 1789

    The day "1" is rendered as the ordinal "1ᵉʳ"; every other day, the month
    and the year are copied through unchanged.
    """

    def __init__(self):
        super().__init__(name="date", kind="verbalize")

        def quoted_field(label, content):
            # Delete the label, optional spaces and both quotes; keep `content`.
            return pynutil.delete(label) + delete_space + pynutil.delete("\"") + content + pynutil.delete("\"")

        any_text = pynini.closure(NEMO_NOT_QUOTE, 1)

        # First of the month is ordinal. The negative weight makes this path
        # cheaper than the plain pass-through, which also accepts "1".
        convert_primer = pynini.cross('1', '1ᵉʳ')
        day = quoted_field("day:", any_text | pynutil.add_weight(convert_primer, -1))
        month = quoted_field("month:", any_text)
        year = quoted_field("year:", any_text)

        # day month [year]; delete_extra_space is presumably what leaves the
        # single space between the fields -- defined in graph_utils, not shown here.
        graph_dm = day + delete_extra_space + month
        graph_dmy = graph_dm + delete_extra_space + year

        # Trailing bookkeeping fields, repeated any number of times.
        preserve_order = pynutil.delete("preserve_order:") + delete_space + pynutil.delete("true") + delete_space
        # NOTE(review): the single character inside field_order's quotes is not
        # wrapped in a delete, so it looks like it is copied to the output -- confirm intended.
        field_order = quoted_field("field_order:", NEMO_NOT_QUOTE) + delete_space
        optional_preserve_order = pynini.closure(preserve_order | field_order)

        final_graph = (graph_dm | graph_dmy) + delete_space + optional_preserve_order

        delete_tokens = self.delete_tokens(final_graph)
        self.fst = delete_tokens.optimize()
|
|
@ -0,0 +1,100 @@
|
|||
# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from nemo_text_processing.inverse_text_normalization.fr.graph_utils import (
|
||||
NEMO_DIGIT,
|
||||
NEMO_NON_BREAKING_SPACE,
|
||||
NEMO_NOT_QUOTE,
|
||||
GraphFst,
|
||||
delete_space,
|
||||
)
|
||||
|
||||
try:
|
||||
import pynini
|
||||
from pynini.lib import pynutil
|
||||
|
||||
PYNINI_AVAILABLE = True
|
||||
except (ModuleNotFoundError, ImportError):
|
||||
PYNINI_AVAILABLE = False
|
||||
|
||||
|
||||
class NumberParser(GraphFst):
    """
    Finite state transducer for parsing strings of digits. Breaks up digit strings into groups of three for
    strings of digits of four or more (inclusive). Groupings are separated by non-breaking space.
        e.g. '1000' -> '1 000'
        e.g. '1000,33333' -> '1 000,333 33'

    NOTE: this class only registers its name and kind with GraphFst; it builds
    no grammar of its own. The grouping transducer described above is
    constructed in DecimalFst and exposed there as `group_by_threes`.
    """

    def __init__(self):
        super().__init__(name="parser", kind="verbalize")
|
||||
|
||||
|
||||
class DecimalFst(GraphFst):
    """
    Finite state transducer for verbalizing decimal, e.g.
        decimal { negative: "true" integer_part: "12" fractional_part: "5006" quantity: "milliards" }
            -> -12,500 6 milliards

    The decimal separator is a comma and digits on both sides of it are
    grouped by threes, the groups being separated by NEMO_NON_BREAKING_SPACE.

    Attributes:
        group_by_threes: transducer that inserts the group separators into a
            digit string (integer form) or into a ","-prefixed digit string
            (fractional form).
        numbers: the unsigned decimal grammar (integer, fraction and quantity,
            each optional), without the surrounding token deletion.
    """

    def __init__(self):
        super().__init__(name="decimal", kind="verbalize")

        def quoted_field(label):
            # Delete the label, optional spaces and both quotes; keep the value.
            return (
                pynutil.delete(label)
                + delete_space
                + pynutil.delete("\"")
                + pynini.closure(NEMO_NOT_QUOTE, 1)
                + pynutil.delete("\"")
            )

        # Need parser to group digits by threes
        exactly_three_digits = NEMO_DIGIT ** 3
        at_most_three_digits = pynini.closure(NEMO_DIGIT, 1, 3)

        # Integer part: the short group comes first, e.g. 1000 -> 1 000
        space_every_three_integer = (
            at_most_three_digits + (pynutil.insert(NEMO_NON_BREAKING_SPACE) + exactly_three_digits).closure()
        )
        # Fractional part: the short group comes last, e.g. ,33333 -> ,333 33
        space_every_three_decimal = (
            pynini.accep(",")
            + (exactly_three_digits + pynutil.insert(NEMO_NON_BREAKING_SPACE)).closure()
            + at_most_three_digits
        )
        group_by_threes = space_every_three_integer | space_every_three_decimal
        self.group_by_threes = group_by_threes

        optional_sign = pynini.closure(pynini.cross("negative: \"true\"", "-") + delete_space, 0, 1)

        integer = quoted_field("integer_part:") @ group_by_threes
        optional_integer = pynini.closure(integer + delete_space, 0, 1)

        # The comma is inserted before grouping so that the fractional form of
        # group_by_threes (which starts with ",") applies.
        fractional = (pynutil.insert(",") + quoted_field("fractional_part:")) @ group_by_threes
        optional_fractional = pynini.closure(fractional + delete_space, 0, 1)

        quantity = quoted_field("quantity:")
        optional_quantity = pynini.closure(pynutil.insert(" ") + quantity + delete_space, 0, 1)

        graph = (optional_integer + optional_fractional + optional_quantity).optimize()
        self.numbers = graph
        graph = optional_sign + graph
        delete_tokens = self.delete_tokens(graph)
        self.fst = delete_tokens.optimize()
|
|
@ -0,0 +1,51 @@
|
|||
# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from nemo_text_processing.inverse_text_normalization.fr.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space
|
||||
|
||||
try:
|
||||
import pynini
|
||||
from pynini.lib import pynutil
|
||||
|
||||
PYNINI_AVAILABLE = True
|
||||
except (ModuleNotFoundError, ImportError):
|
||||
PYNINI_AVAILABLE = False
|
||||
|
||||
|
||||
class ElectronicFst(GraphFst):
    """
    Verbalizer for electronic tokens: joins the username and the domain
    with "@".
        e.g. tokens { electronic { username: "cdf1" domain: "abc.edu" } } -> cdf1@abc.edu
    """

    def __init__(self):
        super().__init__(name="electronic", kind="verbalize")

        def unquote(label):
            # Drop the field label, optional spaces and both quotes; keep the value.
            return (
                pynutil.delete(label)
                + delete_space
                + pynutil.delete("\"")
                + pynini.closure(NEMO_NOT_QUOTE, 1)
                + pynutil.delete("\"")
            )

        address = unquote("username:") + delete_space + pynutil.insert("@") + unquote("domain:")
        self.fst = self.delete_tokens(address).optimize()
|
|
@ -0,0 +1,59 @@
|
|||
# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from nemo_text_processing.inverse_text_normalization.fr.graph_utils import (
|
||||
NEMO_NOT_QUOTE,
|
||||
GraphFst,
|
||||
delete_space,
|
||||
insert_space,
|
||||
)
|
||||
|
||||
try:
|
||||
import pynini
|
||||
from pynini.lib import pynutil
|
||||
|
||||
PYNINI_AVAILABLE = True
|
||||
except (ModuleNotFoundError, ImportError):
|
||||
PYNINI_AVAILABLE = False
|
||||
|
||||
|
||||
class FractionFst(GraphFst):
    """
    Verbalizer for fraction tokens: an optional whole part, then
    numerator "/" denominator.
        e.g. fraction { integer_part: "1" numerator: "2" denominator: "3" } -> 1 2/3

    Attributes:
        numbers: the unsigned fraction grammar, without token deletion.
    """

    def __init__(self):
        super().__init__(name="fraction", kind="verbalize")
        value = pynini.closure(NEMO_NOT_QUOTE, 1)
        closing_quote = pynutil.delete("\"")

        sign = pynini.cross("negative: \"true\"", "-") + delete_space

        # The whole part is followed by an inserted space: "1" -> "1 "
        whole = pynutil.delete("integer_part: \"") + value + closing_quote + insert_space
        numerator = pynutil.delete("numerator: \"") + value + closing_quote
        denominator = pynutil.insert('/') + pynutil.delete("denominator: \"") + value + closing_quote

        mixed_number = pynini.closure(whole + delete_space, 0, 1) + numerator + delete_space + denominator
        self.numbers = mixed_number.optimize()

        signed = pynini.closure(sign, 0, 1) + self.numbers
        self.fst = self.delete_tokens(signed).optimize()
|
|
@ -0,0 +1,79 @@
|
|||
# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from nemo_text_processing.inverse_text_normalization.fr.graph_utils import NEMO_CHAR, GraphFst, delete_space
|
||||
|
||||
try:
|
||||
import pynini
|
||||
from pynini.lib import pynutil
|
||||
|
||||
PYNINI_AVAILABLE = True
|
||||
except (ModuleNotFoundError, ImportError):
|
||||
|
||||
PYNINI_AVAILABLE = False
|
||||
|
||||
|
||||
class MeasureFst(GraphFst):
    """
    Finite state transducer for verbalizing measure, e.g.
        measure { cardinal { negative: "true" integer: "12" } units: "kg" } -> -12 kg

    The optional `negative: "true"` field is read inside the
    cardinal / decimal / fraction group, directly after its opening brace.

    Args:
        decimal: DecimalFst (its `numbers` and `group_by_threes` attributes are used)
        cardinal: CardinalFst (its `numbers` attribute is used)
        fraction: FractionFst (its `numbers` attribute is used)
    """

    def __init__(self, decimal: GraphFst, cardinal: GraphFst, fraction: GraphFst):
        super().__init__(name="measure", kind="verbalize")
        optional_sign = pynini.closure(pynini.cross("negative: \"true\"", "-"), 0, 1)
        unit = (
            pynutil.delete("units:")
            + delete_space
            + pynutil.delete("\"")
            + pynini.closure(NEMO_CHAR - " ", 1)
            + pynutil.delete("\"")
            + delete_space
        )

        def number_group(opening, numbers):
            # `<opening> [negative: "true"] <numbers> }` with the braces and label removed.
            return (
                pynutil.delete(opening)
                + delete_space
                + optional_sign
                + delete_space
                + numbers
                + delete_space
                + pynutil.delete("}")
            )

        graph_decimal = number_group("decimal {", decimal.numbers)
        # measurements must obey three by three spacing
        graph_cardinal = number_group("cardinal {", cardinal.numbers @ decimal.group_by_threes)
        graph_fraction = number_group("fraction {", fraction.numbers)

        graph = (graph_cardinal | graph_decimal | graph_fraction) + delete_space + pynutil.insert(" ") + unit
        delete_tokens = self.delete_tokens(graph)
        self.fst = delete_tokens.optimize()
|
|
@ -0,0 +1,52 @@
|
|||
# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from nemo_text_processing.inverse_text_normalization.fr.graph_utils import (
|
||||
NEMO_NOT_QUOTE,
|
||||
GraphFst,
|
||||
delete_extra_space,
|
||||
delete_space,
|
||||
)
|
||||
|
||||
try:
|
||||
import pynini
|
||||
from pynini.lib import pynutil
|
||||
|
||||
PYNINI_AVAILABLE = True
|
||||
except (ModuleNotFoundError, ImportError):
|
||||
|
||||
PYNINI_AVAILABLE = False
|
||||
|
||||
|
||||
class MoneyFst(GraphFst):
    """
    Finite state transducer for verbalizing money, e.g.
        money { integer_part: "12" fractional_part: "05" currency: "$" } -> 12,05 $

    The amount is verbalized by the decimal grammar (comma as decimal
    separator) and the currency symbol follows it.

    Args:
        decimal: DecimalFst (its `numbers` attribute is used)
    """

    def __init__(self, decimal: GraphFst):
        super().__init__(name="money", kind="verbalize")
        unit = (
            pynutil.delete("currency:")
            # presumably leaves the single space between amount and symbol;
            # delete_extra_space is defined in graph_utils, not shown here
            + delete_extra_space
            + pynutil.delete("\"")
            + pynini.closure(NEMO_NOT_QUOTE, 1)
            + pynutil.delete("\"")
        )
        graph = decimal.numbers + delete_space + unit
        delete_tokens = self.delete_tokens(graph)
        self.fst = delete_tokens.optimize()
|
|
@ -0,0 +1,87 @@
|
|||
# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from nemo_text_processing.inverse_text_normalization.fr.graph_utils import (
|
||||
NEMO_DIGIT,
|
||||
NEMO_NOT_QUOTE,
|
||||
GraphFst,
|
||||
delete_space,
|
||||
)
|
||||
from nemo_text_processing.inverse_text_normalization.fr.utils import get_abs_path
|
||||
|
||||
try:
|
||||
import pynini
|
||||
from pynini.lib import pynutil
|
||||
|
||||
PYNINI_AVAILABLE = True
|
||||
except (ModuleNotFoundError, ImportError):
|
||||
PYNINI_AVAILABLE = False
|
||||
|
||||
|
||||
class OrdinalFst(GraphFst):
    """
    Finite state transducer for verbalizing ordinal, e.g.
        ordinal { integer: "13" morphosyntactic_features: "e" } -> 13ᵉ

    Given 'special' terms for ordinals (e.g. siècle), renders
    amount in conventional format (Roman numerals followed by the term). e.g.

        ordinal { integer: "13" morphosyntactic_features: "e/siècle" } -> XIIIᵉ siècle
    """

    def __init__(self):
        super().__init__(name="ordinal", kind="verbalize")
        graph_integer = (
            pynutil.delete("integer:")
            + delete_space
            + pynutil.delete("\"")
            + pynini.closure(NEMO_NOT_QUOTE, 1)
            + pynutil.delete("\"")
        )

        # Suffix letters are rendered as superscripts.
        superscript = pynini.union(
            pynini.cross("e", "ᵉ"), pynini.cross("d", "ᵈ"), pynini.cross("r", "ʳ"), pynini.cross("s", "ˢ"),
        )
        # Only delete first quote since there may be more features; the closing
        # quote is removed once, when the final graph is assembled below.
        replace_suffix = pynutil.delete(" morphosyntactic_features: \"") + superscript.plus

        # The suffix grammar already repeats the superscript letters, so it is
        # used as-is here (a second `.plus` would also repeat the field label).
        graph_arabic = graph_integer + replace_suffix

        # For roman.
        # NOTE(review): the .tsv files are presumably roman -> arabic tables,
        # inverted here to map digits to numerals -- confirm against the data.
        graph_roman_digits = pynini.string_file(get_abs_path("data/roman/digits_large.tsv")).invert()
        graph_roman_ties = pynini.string_file(get_abs_path("data/roman/ties_large.tsv")).invert()
        graph_roman_hundreds = pynini.string_file(get_abs_path("data/roman/hundreds_large.tsv")).invert()
        graph_roman_zero_digit = pynutil.delete("0")

        # A zero in a lower position contributes no numeral and is dropped.
        graph_roman_hundreds = NEMO_DIGIT ** 3 @ (
            graph_roman_hundreds
            + pynini.union(graph_roman_ties, graph_roman_zero_digit)
            + pynini.union(graph_roman_digits, graph_roman_zero_digit)
        )
        graph_roman_ties = NEMO_DIGIT ** 2 @ (
            graph_roman_ties + pynini.union(graph_roman_digits, graph_roman_zero_digit)
        )
        graph_roman_digits = NEMO_DIGIT @ graph_roman_digits

        graph_roman_integers = graph_roman_hundreds | graph_roman_ties | graph_roman_digits

        graph_roman = (graph_integer @ graph_roman_integers) + replace_suffix
        # "/siècle" in the features becomes " siècle" in the output.
        graph_roman += pynini.cross("/", " ") + "siècle"

        graph = (graph_roman | graph_arabic) + pynutil.delete("\"")

        delete_tokens = self.delete_tokens(graph)
        self.fst = delete_tokens.optimize()
|
|
@ -0,0 +1,38 @@
|
|||
# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from nemo_text_processing.inverse_text_normalization.fr.graph_utils import NEMO_NOT_QUOTE, GraphFst
|
||||
|
||||
try:
|
||||
import pynini
|
||||
from pynini.lib import pynutil
|
||||
|
||||
PYNINI_AVAILABLE = True
|
||||
except (ModuleNotFoundError, ImportError):
|
||||
PYNINI_AVAILABLE = False
|
||||
|
||||
|
||||
class TelephoneFst(GraphFst):
    """
    Verbalizer for telephone tokens: the quoted number is emitted as-is, e.g.
        telephone { number_part: "02 33 43 53 22" }
        -> 02 33 43 53 22
    """

    def __init__(self):
        super().__init__(name="telephone", kind="verbalize")

        opening = pynutil.delete("number_part: \"")
        number_text = pynini.closure(NEMO_NOT_QUOTE, 1)
        closing = pynutil.delete("\"")

        self.fst = self.delete_tokens(opening + number_text + closing).optimize()
|
|
@ -0,0 +1,74 @@
|
|||
# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from nemo_text_processing.inverse_text_normalization.fr.graph_utils import (
|
||||
NEMO_DIGIT,
|
||||
GraphFst,
|
||||
delete_extra_space,
|
||||
delete_space,
|
||||
)
|
||||
from nemo_text_processing.inverse_text_normalization.fr.utils import get_abs_path
|
||||
|
||||
try:
|
||||
import pynini
|
||||
from pynini.lib import pynutil
|
||||
|
||||
PYNINI_AVAILABLE = True
|
||||
except (ModuleNotFoundError, ImportError):
|
||||
PYNINI_AVAILABLE = False
|
||||
|
||||
|
||||
class TimeFst(GraphFst):
    """
    Finite state transducer for verbalizing time, e.g.
        time { hours: "8" minutes: "30" suffix: "am" } -> 8 h 30
        time { hours: "8" minutes: "30" } -> 8 h 30
        time { hours: "8" minutes: "30" suffix: "pm" } -> 20 h 30

    The suffix field is expected to hold exactly "am" or "pm". With "pm" the
    hour is rewritten through data/time/hour_to_night.tsv (presumably the
    12-hour to 24-hour mapping -- the table is not shown here).
    """

    def __init__(self):
        super().__init__(name="time", kind="verbalize")

        hour_to_night = pynini.string_file(get_abs_path("data/time/hour_to_night.tsv"))

        day_suffixes = pynutil.delete("suffix: \"am\"")
        night_suffixes = pynutil.delete("suffix: \"pm\"")

        hour = (
            pynutil.delete("hours:")
            + delete_space
            + pynutil.delete("\"")
            + pynini.closure(NEMO_DIGIT, 1, 2)
            + pynutil.delete("\"")
        )
        minute = (
            pynutil.delete("minutes:")
            + delete_extra_space
            + pynutil.delete("\"")
            + pynini.closure(NEMO_DIGIT, 1, 2)
            + pynutil.delete("\"")
        )

        # Morning or unmarked times: the hour is kept; the "am" suffix is optional.
        graph = hour + delete_extra_space + pynutil.insert("h") + minute.ques + delete_space + day_suffixes.ques

        # Evening times: the "pm" suffix is mandatory and the hour is converted.
        graph |= (
            (hour @ hour_to_night)
            + delete_extra_space
            + pynutil.insert("h")
            + minute.ques
            + delete_space
            + night_suffixes
        )
        delete_tokens = self.delete_tokens(graph)
        self.fst = delete_tokens.optimize()
|
|
@ -0,0 +1,64 @@
|
|||
# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
from nemo_text_processing.inverse_text_normalization.fr.graph_utils import GraphFst
|
||||
from nemo_text_processing.inverse_text_normalization.fr.verbalizers.cardinal import CardinalFst
|
||||
from nemo_text_processing.inverse_text_normalization.fr.verbalizers.date import DateFst
|
||||
from nemo_text_processing.inverse_text_normalization.fr.verbalizers.decimal import DecimalFst
|
||||
from nemo_text_processing.inverse_text_normalization.fr.verbalizers.electronic import ElectronicFst
|
||||
from nemo_text_processing.inverse_text_normalization.fr.verbalizers.fraction import FractionFst
|
||||
from nemo_text_processing.inverse_text_normalization.fr.verbalizers.measure import MeasureFst
|
||||
from nemo_text_processing.inverse_text_normalization.fr.verbalizers.money import MoneyFst
|
||||
from nemo_text_processing.inverse_text_normalization.fr.verbalizers.ordinal import OrdinalFst
|
||||
from nemo_text_processing.inverse_text_normalization.fr.verbalizers.telephone import TelephoneFst
|
||||
from nemo_text_processing.inverse_text_normalization.fr.verbalizers.time import TimeFst
|
||||
from nemo_text_processing.inverse_text_normalization.fr.verbalizers.whitelist import WhiteListFst
|
||||
|
||||
|
||||
class VerbalizeFst(GraphFst):
    """
    Union of all the semiotic-class verbalizer grammars.
    For deployment, this grammar will be compiled and exported to an OpenFst Finite State Archive (FAR) file.
    More details on deployment at NeMo/tools/text_processing_deployment.
    """

    def __init__(self):
        super().__init__(name="verbalize", kind="verbalize")

        # These three are kept as objects because MeasureFst / MoneyFst reuse them.
        cardinal = CardinalFst()
        ordinal_graph = OrdinalFst().fst
        decimal = DecimalFst()
        fraction = FractionFst()

        measure_graph = MeasureFst(decimal=decimal, cardinal=cardinal, fraction=fraction).fst
        money_graph = MoneyFst(decimal=decimal).fst
        time_graph = TimeFst().fst
        date_graph = DateFst().fst
        whitelist_graph = WhiteListFst().fst
        telephone_graph = TelephoneFst().fst
        electronic_graph = ElectronicFst().fst

        # Same union order as the alternatives are listed here.
        alternatives = [
            time_graph,
            date_graph,
            money_graph,
            measure_graph,
            fraction.fst,
            ordinal_graph,
            decimal.fst,
            cardinal.fst,
            whitelist_graph,
            telephone_graph,
            electronic_graph,
        ]
        combined = alternatives[0]
        for alternative in alternatives[1:]:
            combined = combined | alternative
        self.fst = combined
|
|
@ -0,0 +1,49 @@
|
|||
# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from nemo_text_processing.inverse_text_normalization.fr.graph_utils import GraphFst, delete_extra_space, delete_space
|
||||
from nemo_text_processing.inverse_text_normalization.fr.verbalizers.verbalize import VerbalizeFst
|
||||
from nemo_text_processing.inverse_text_normalization.fr.verbalizers.word import WordFst
|
||||
|
||||
try:
|
||||
import pynini
|
||||
from pynini.lib import pynutil
|
||||
|
||||
PYNINI_AVAILABLE = True
|
||||
except (ModuleNotFoundError, ImportError):
|
||||
PYNINI_AVAILABLE = False
|
||||
|
||||
|
||||
class VerbalizeFinalFst(GraphFst):
    """
    Finite state transducer that verbalizes an entire sentence, i.e. a
    sequence of one or more `tokens { ... }` groups, e.g.
        tokens { name: "il" } tokens { name: "est" } tokens { time { hours: "12" minutes: "30" } } -> il est 12 h 30
    """

    def __init__(self):
        super().__init__(name="verbalize_final", kind="verbalize")

        # A token is either one of the semiotic classes or a plain word.
        token_content = VerbalizeFst().fst | WordFst().fst

        opening = pynutil.delete("tokens") + delete_space + pynutil.delete("{") + delete_space
        closing = delete_space + pynutil.delete("}")
        token = opening + token_content + closing

        # Every token except the last is followed by delete_extra_space.
        leading_tokens = pynini.closure(token + delete_extra_space)
        self.fst = delete_space + leading_tokens + token + delete_space
|
|
@ -0,0 +1,48 @@
|
|||
# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
|
||||
from nemo_text_processing.inverse_text_normalization.fr.graph_utils import (
|
||||
NEMO_CHAR,
|
||||
NEMO_SIGMA,
|
||||
GraphFst,
|
||||
delete_space,
|
||||
)
|
||||
|
||||
try:
|
||||
import pynini
|
||||
from pynini.lib import pynutil
|
||||
|
||||
PYNINI_AVAILABLE = True
|
||||
except (ModuleNotFoundError, ImportError):
|
||||
PYNINI_AVAILABLE = False
|
||||
|
||||
|
||||
class WhiteListFst(GraphFst):
    """
    Verbalizer for whitelist tokens: emits the quoted name, with any
    non-breaking space (U+00A0) turned into a regular space.
        e.g. tokens { name: "mrs." } -> mrs.
    """

    def __init__(self):
        super().__init__(name="whitelist", kind="verbalize")

        # The name itself may not contain a regular space.
        name_text = pynini.closure(NEMO_CHAR - " ", 1)
        unquoted_name = pynutil.delete("name:") + delete_space + pynutil.delete("\"") + name_text + pynutil.delete("\"")

        nbsp_to_space = pynini.cdrewrite(pynini.cross(u"\u00A0", " "), "", "", NEMO_SIGMA)
        self.fst = (unquoted_name @ nbsp_to_space).optimize()
|
|
@ -0,0 +1,43 @@
|
|||
# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from nemo_text_processing.inverse_text_normalization.fr.graph_utils import (
|
||||
NEMO_CHAR,
|
||||
NEMO_SIGMA,
|
||||
GraphFst,
|
||||
delete_space,
|
||||
)
|
||||
|
||||
try:
|
||||
import pynini
|
||||
from pynini.lib import pynutil
|
||||
|
||||
PYNINI_AVAILABLE = True
|
||||
except (ModuleNotFoundError, ImportError):
|
||||
PYNINI_AVAILABLE = False
|
||||
|
||||
|
||||
class WordFst(GraphFst):
    """
    Verbalizer for plain tokens: emits the quoted name, with any
    non-breaking space (U+00A0) turned into a regular space.
        e.g. tokens { name: "sleep" } -> sleep
    """

    def __init__(self):
        super().__init__(name="word", kind="verbalize")

        # One or more characters, none of which is a regular space.
        word_text = pynini.closure(NEMO_CHAR - " ", 1)
        unquoted_word = (
            pynutil.delete("name:")
            + delete_space
            + pynutil.delete("\"")
            + word_text
            + pynutil.delete("\"")
        )

        nbsp_to_space = pynini.cdrewrite(pynini.cross(u"\u00A0", " "), "", "", NEMO_SIGMA)
        self.fst = (unquoted_word @ nbsp_to_space).optimize()
|
|
@ -23,7 +23,7 @@ from nemo_text_processing.text_normalization.token_parser import TokenParser
|
|||
class InverseNormalizer(Normalizer):
|
||||
"""
|
||||
Inverse normalizer that converts text from spoken to written form. Useful for ASR postprocessing.
|
||||
Input is expected to have no punctuation and be lower cased.
|
||||
Input is expected to have no punctuation other than apostrophe (') and dash (-) and be lower cased.
|
||||
|
||||
Args:
|
||||
lang: language specifying the ITN
|
||||
|
@ -55,6 +55,11 @@ class InverseNormalizer(Normalizer):
|
|||
from nemo_text_processing.inverse_text_normalization.de.verbalizers.verbalize_final import (
|
||||
VerbalizeFinalFst,
|
||||
)
|
||||
elif lang == 'fr':
|
||||
from nemo_text_processing.inverse_text_normalization.fr.taggers.tokenize_and_classify import ClassifyFst
|
||||
from nemo_text_processing.inverse_text_normalization.fr.verbalizers.verbalize_final import (
|
||||
VerbalizeFinalFst,
|
||||
)
|
||||
|
||||
self.tagger = ClassifyFst(cache_dir=cache_dir, overwrite_cache=overwrite_cache)
|
||||
self.verbalizer = VerbalizeFinalFst()
|
||||
|
@ -89,7 +94,7 @@ class InverseNormalizer(Normalizer):
|
|||
def parse_args():
|
||||
parser = ArgumentParser()
|
||||
parser.add_argument("input_string", help="input string", type=str)
|
||||
parser.add_argument("--language", help="language", choices=['en', 'de', 'es', 'ru'], default="en", type=str)
|
||||
parser.add_argument("--language", help="language", choices=['en', 'de', 'es', 'ru', 'fr'], default="en", type=str)
|
||||
parser.add_argument("--verbose", help="print info for debugging", action='store_true')
|
||||
parser.add_argument("--overwrite_cache", help="set to True to re-create .far grammar files", action="store_true")
|
||||
parser.add_argument(
|
||||
|
|
13
tests/nemo_text_processing/fr/__init__.py
Normal file
13
tests/nemo_text_processing/fr/__init__.py
Normal file
|
@ -0,0 +1,13 @@
|
|||
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
|
@ -0,0 +1,106 @@
|
|||
cent~100
|
||||
dix-huit~18
|
||||
vingt et un~21
|
||||
vingt-et-un~21
|
||||
trente et un~31
|
||||
trente-et-un~31
|
||||
quarante-trois~43
|
||||
quarante trois~40 trois
|
||||
cinquante et un~51
|
||||
cinquante-et-un~51
|
||||
soixante et un~61
|
||||
soixante-et-un~61
|
||||
soixante-dix~70
|
||||
soixante-douze~72
|
||||
quatre-vingts~80
|
||||
quatre-vingt-dix-huit~98
|
||||
cent~100
|
||||
cent deux~102
|
||||
cent-deux~102
|
||||
cent vingt~120
|
||||
cent-vingt~120
|
||||
deux-cents~200
|
||||
deux cent neuf~209
|
||||
deux-cent-neuf~209
|
||||
cent onze~111
|
||||
cent-onze~111
|
||||
mille~1000
|
||||
cent vingt~120
|
||||
cent-vingt~120
|
||||
mille vingt~1020
|
||||
mille-vingt~1020
|
||||
neuf billion sept cent quatre-vingt-neuf milliard trois cent quatre-vingt-deux million cinq cent trente-six mille cent trente~9789382536130
|
||||
neuf-billion-sept-cent-quatre-vingt-neuf-milliard-trois-cent-quatre-vingt-deux-million-cinq-cent-trente-six-mille-cent-trente~9789382536130
|
||||
deux cent cinquante-quatre~254
|
||||
deux-cent-cinquante-quatre~254
|
||||
cent quarante-sept mille quatre cent cinquante et une~147451
|
||||
cent-quarante-sept-mille-quatre-cent-cinquante-et-une~147451
|
||||
un million cent cinquante-six mille cent soixante-treize~1156173
|
||||
un-million-cent-cinquante-six-mille-cent-soixante-treize~1156173
|
||||
un milliard cinq cent quatre-vingt-treize million soixante-douze mille neuf cent soixante et un~1593072961
|
||||
un-milliard-cinq-cent-quatre-vingt-treize-million-soixante-douze-mille-neuf-cent-soixante-et-un~1593072961
|
||||
un milliard cinq cent quatre-vingt-treize million septante-deux mille neuf cent soixante et un~1593072961
|
||||
un-milliard-cinq-cent-quatre-vingt-treize-million-septante-deux-mille-neuf-cent-soixante-et-un~1593072961
|
||||
quatre-vingt-dix-sept billiard huit cent huit billion deux cent soixante-quatre milliard sept cent soixante-douze million sept cent quatre-vingt-douze mille cinq~97808264772792005
|
||||
quatre-vingt-dix-sept-billiard-huit-cent-huit-billion-deux-cent-soixante-quatre-milliard-sept-cent-soixante-douze-million-sept-cent-quatre-vingt-douze-mille-cinq~97808264772792005
|
||||
dix billiard dix billion dix million cent mille dix~10010000010100010
|
||||
dix-billiard-dix-billion-dix-million-cent-mille-dix~10010000010100010
|
||||
moins vingt-cinq mille trente-sept~-25037
|
||||
moins vingt-cinq-mille-trente-sept~-25037
|
||||
moins dix-neuf cent trente-sept~-1937
|
||||
moins dix-neuf-cent-trente-sept~-1937
|
||||
un billiard deux cent soixante-quatre billion trois cent un milliard neuf cent trente-huit million cent quatre~1264301938000104
|
||||
un-billiard-deux-cent-soixante-quatre-billion-trois-cent-un-milliard-neuf-cent-trente-huit-million-cent-quatre~1264301938000104
|
||||
moins soixante~-60
|
||||
quarante-six mille six cent soixante-quatre~46664
|
||||
quarante-six-mille-six-cent-soixante-quatre~46664
|
||||
soixante~60
|
||||
zéro~zéro
|
||||
un~un
|
||||
une~une
|
||||
deux~deux
|
||||
neuf~neuf
|
||||
dix~10
|
||||
onze~11
|
||||
douze~12
|
||||
treize~13
|
||||
quatorze~14
|
||||
quinze~15
|
||||
seize~16
|
||||
dix-sept~17
|
||||
dix-huit~18
|
||||
vingt~20
|
||||
trente~30
|
||||
quarante~40
|
||||
cinquante~50
|
||||
soixante~60
|
||||
soixante-dix~70
|
||||
septante~70
|
||||
quatre-vingts~80
|
||||
huitante~80
|
||||
quatre-vingt-dix~90
|
||||
deux million dix~2000010
|
||||
deux-million-dix~2000010
|
||||
mille treize~1013
|
||||
mille-treize~1013
|
||||
mille un~1001
|
||||
mille-un~1001
|
||||
mille cent~1100
|
||||
mille-cent~1100
|
||||
onze cents~1100
|
||||
onze-cents~1100
|
||||
dix-huit mille treize~18013
|
||||
dix-huit-mille-treize~18013
|
||||
mille vingt-six~1026
|
||||
mille-vingt-six~1026
|
||||
mille cent vingt-six~1126
|
||||
mille-cent-vingt-six~1126
|
||||
onze cent vingt-six~1126
|
||||
onze-cent-vingt-six~1126
|
||||
dix-huit million quatre cent cinquante mille neuf cent quatre-vingt-dix~18450990
|
||||
dix-huit-million-quatre-cent-cinquante-mille-neuf-cent-quatre-vingt-dix~18450990
|
||||
dix-huit-million-quatre-cent-cinquante-mille-neuf-cent-nonante~18450990
|
||||
dix-huit mille huit cent quatre-vingts~18880
|
||||
dix-huit-mille-huit-cent-quatre-vingts~18880
|
||||
dix-huit mille huit cent huitante~18880
|
||||
dix-huit-mille-huit-cent-huitante~18880
|
|
@ -0,0 +1,6 @@
|
|||
vingt-quatre juillet deux-mille-treize~24 juillet 2013
|
||||
vingt-quatre juillet~24 juillet
|
||||
quatorze janvier~14 janvier
|
||||
premier janvier~1ᵉʳ janvier
|
||||
trente juin~30 juin
|
||||
dix-huit mai dix-neuf cent trente~18 mai 1930
|
|
@ -0,0 +1,15 @@
|
|||
zéro virgule deux million~0,2 million
|
||||
dix-huit milliards~18 milliards
|
||||
quatre cent soixante millions~460 millions
|
||||
quatre-cent-soixante millions~460 millions
|
||||
quatre-cent-soixante-millions~460 millions
|
||||
cent vingt millions~120 millions
|
||||
cent-vingt-millions~120 millions
|
||||
cent vingt millions~120 millions
|
||||
dix billions~10 billions
|
||||
dix-billions~10 billions
|
||||
moins soixante virgule deux quatre zéro zéro~-60,240 0
|
||||
huit cent dix-huit virgule trois zéro trois~818,303
|
||||
huit-cent-dix-huit virgule trois zéro trois~818,303
|
||||
huit-cent-dix-huit virgule trente trois~818,303
|
||||
mille-huit-cent-dix-huit virgule trois zéro trois trois quatre~1 818,303 34
|
|
@ -0,0 +1,10 @@
|
|||
a point b c arobase g mail point com~a.bc@gmail.com
|
||||
a point b c at g mail point com~a.bc@gmail.com
|
||||
c d f at a b c point e d u~cdf@abc.edu
|
||||
a b c at g mail point a b c~abc@gmail.abc
|
||||
a b c arobase g mail point a b c~abc@gmail.abc
|
||||
a b c at a b c point com~abc@abc.com
|
||||
a s d f un deux trois at a b c point com~asdf123@abc.com
|
||||
a un b deux arobase a b c point com~a1b2@abc.com
|
||||
a b trois point s d d point trois at g mail point com~ab3.sdd.3@gmail.com
|
||||
a b trois point s d d point trois arobase g mail point com~ab3.sdd.3@gmail.com
|
|
@ -0,0 +1,30 @@
|
|||
demi~1/2
|
||||
un tiers~1/3
|
||||
un quart~1/4
|
||||
un cinquième~1/5
|
||||
un sixième~1/6
|
||||
un septième~1/7
|
||||
un huitième~1/8
|
||||
deux neuvième~2/9
|
||||
un et demi~1 1/2
|
||||
un dixième~1/10
|
||||
un onzième~1/11
|
||||
un douzième~1/12
|
||||
un treizième~1/13
|
||||
un quatrième~1/4
|
||||
un quatorzième~1/14
|
||||
un quinzième~1/15
|
||||
un seizième~1/16
|
||||
un dix-septième~1/17
|
||||
un dix-huitième~1/18
|
||||
un dix-neuvième~1/19
|
||||
un vingtième~1/20
|
||||
un trentième~1/30
|
||||
un quarantième~1/40
|
||||
un cinquantième~1/50
|
||||
un soixantième~1/60
|
||||
un soixante-dixième~1/70
|
||||
un quatre-vingtième~1/80
|
||||
un quatre-vingt-dixième~1/90
|
||||
un centième~1/100
|
||||
quatre et deux quatrièmes~4 2/4
|
|
@ -0,0 +1,15 @@
|
|||
deux cents mètres~200 m
|
||||
cinquante-six virgule trois par kilomètre carré~56,3 /km²
|
||||
deux-cents kilomètres par heure~200 km/h
|
||||
deux-cents kilomètres heure~200 km/h
|
||||
quarante-deux-mille-deux-cent-cinquante-neuf par mètre carré~42 259 /m²
|
||||
moins soixante-six kilogrammes~-66 kg
|
||||
un virgule zéro zéro zéro zéro vingt-huit centimètre cube~1,000 028 cm³
|
||||
cinquante minutes~50 min
|
||||
deux mètres cubes~2 m³
|
||||
quatre-vingt-dix grammes~90 g
|
||||
quatre-cent-quarante millilitres~440 ml
|
||||
trois cents micromètres~300 µm
|
||||
soixante-cinq kilomètres carrés~65 km²
|
||||
deux kilomètres par heure~2 km/h
|
||||
soixante virgule vingt-quatre zéro zéro kilogrammes~60,240 0 kg
|
|
@ -0,0 +1,24 @@
|
|||
deux dollars~2 $
|
||||
un centime~0,01 €
|
||||
vingt centimes~0,20 €
|
||||
vingt-deux centimes~0,22 €
|
||||
deux dollars vingt~2,20 $
|
||||
deux dollars et vingt cents~2,20 $
|
||||
deux euros et vingt centimes~2,20 €
|
||||
vingt euros~20 €
|
||||
un franc suisse~1 CHF
|
||||
vingt euro cinq~20,05 €
|
||||
un euro~1 €
|
||||
deux euro~2 €
|
||||
cinq euro et soixante~5,60 €
|
||||
cinquante centimes~0,50 €
|
||||
cinq dollars et deux cents~5,02 $
|
||||
quatre-vingt mille won~80 000 ₩
|
||||
quatre-vingt-mille won~80 000 ₩
|
||||
quatre-vingt-millions de wons~80 millions de wons
|
||||
trois livre~3 £
|
||||
trois pence~0,03 £
|
||||
zéro euro~0 €
|
||||
zéro euro quatre-vingt~0,80 €
|
||||
deux-millions de dollars~2 millions de dollars
|
||||
quatre virgule quatre-vingt milliards d'euros~4,80 milliards d'euros
|
|
@ -0,0 +1,23 @@
|
|||
centième~100ᵉ
|
||||
centièmes~100ᵉˢ
|
||||
vingt-cinq-mille-cent-onzième~25111ᵉ
|
||||
première~1ʳᵉ
|
||||
premières~1ʳᵉˢ
|
||||
premier~1ᵉʳ
|
||||
premiers~1ᵉʳˢ
|
||||
second~2ᵈ
|
||||
seconds~2ᵈˢ
|
||||
seconde~2ᵈᵉ
|
||||
secondes~2ᵈᵉˢ
|
||||
deuxième~2ᵉ
|
||||
troisième~3ᵉ
|
||||
quatrième~4ᵉ
|
||||
onzièmes~11ᵉˢ
|
||||
treizième~13ᵉ
|
||||
vingt-et-unième~21ᵉ
|
||||
vingt-troisièmes~23ᵉˢ
|
||||
cent-onzième~111ᵉ
|
||||
cent onzième~111ᵉ
|
||||
millième~1000ᵉ
|
||||
dix-neuvième siècle~XIXᵉ siècle
|
||||
vingtième siècle~XXᵉ siècle
|
|
@ -0,0 +1,4 @@
|
|||
zéro deux douze trente-deux trente trente~02 12 32 30 30
|
||||
zéro deux une deux trois deux trois zéro trois zéro~02 12 32 30 30
|
||||
deux douze trente-deux trente trente~02 12 32 30 30
|
||||
deux une deux trois deux trois zéro trois zéro~02 12 32 30 30
|
|
@ -0,0 +1,18 @@
|
|||
huit heures~8 h
|
||||
huit heures du matin~8 h
|
||||
huit heures du soir~20 h
|
||||
minuit~0 h
|
||||
deux heures de l'après-midi~14 h
|
||||
quatorze heures~14 h
|
||||
midi~12 h
|
||||
dix-huit heures~18 h
|
||||
huit heures sept~8 h 07
|
||||
minuit dix-sept~0 h 17
|
||||
douze heures~12 h
|
||||
onze heures et demie~11 h 30
|
||||
midi moins le quart~11 h 45
|
||||
onze heures et trois quarts~11 h 45
|
||||
midi moins trois~11 h 57
|
||||
onze heures cinquante-sept~11 h 57
|
||||
onze heures trente-huit~11 h 38
|
||||
midi moins vingt-deux~11 h 38
|
|
@ -0,0 +1,8 @@
|
|||
docteur~Dʳ
|
||||
docteures~Dʳᵉˢ
|
||||
monsieur~M.
|
||||
messieurs~MM.
|
||||
madame~Mᵐᵉ
|
||||
mesdames~Mᵐᵉˢ
|
||||
mademoiselle~Mˡˡᵉ
|
||||
mademoiselles~Mˡˡᵉˢ
|
|
@ -0,0 +1,49 @@
|
|||
~
|
||||
yahoo!~yahoo!
|
||||
vingt!~20 !
|
||||
x ~x
|
||||
—~—
|
||||
aaa~aaa
|
||||
aabach~aabach
|
||||
aabenraa~aabenraa
|
||||
aabye~aabye
|
||||
aaccessed~aaccessed
|
||||
aach~aach
|
||||
aachen's~aachen's
|
||||
aadri~aadri
|
||||
aafia~aafia
|
||||
aagaard~aagaard
|
||||
aagadu~aagadu
|
||||
aagard~aagard
|
||||
aagathadi~aagathadi
|
||||
aaghart's~aaghart's
|
||||
aagnes~aagnes
|
||||
aagomoni~aagomoni
|
||||
aagon~aagon
|
||||
aagoo~aagoo
|
||||
aagot~aagot
|
||||
aahar~aahar
|
||||
aahh~aahh
|
||||
aahperd~aahperd
|
||||
aaibinterstate~aaibinterstate
|
||||
aajab~aajab
|
||||
aakasa~aakasa
|
||||
aakervik~aakervik
|
||||
aakirkeby~aakirkeby
|
||||
aalam~aalam
|
||||
aalbaek~aalbaek
|
||||
aaldiu~aaldiu
|
||||
aalem~aalem
|
||||
a'ali~a'ali
|
||||
aalilaassamthey~aalilaassamthey
|
||||
aalin~aalin
|
||||
aaliyan~aaliyan
|
||||
aaliyan's~aaliyan's
|
||||
aamadu~aamadu
|
||||
aamara~aamara
|
||||
aambala~aambala
|
||||
aamera~aamera
|
||||
aamer's~aamer's
|
||||
aamina~aamina
|
||||
aaminah~aaminah
|
||||
aamjiwnaang~aamjiwnaang
|
36
tests/nemo_text_processing/fr/test_cardinal.py
Normal file
36
tests/nemo_text_processing/fr/test_cardinal.py
Normal file
|
@ -0,0 +1,36 @@
|
|||
# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import pytest
|
||||
from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer
|
||||
from parameterized import parameterized
|
||||
|
||||
from ..utils import CACHE_DIR, PYNINI_AVAILABLE, parse_test_case_file
|
||||
|
||||
|
||||
class TestCardinal:
    """French inverse text normalization tests driven by test_cases_cardinal.txt."""

    # Built once when the class body runs; stays None when pynini is missing
    # (test_denorm is skipped in that situation).
    inverse_normalizer = None
    if PYNINI_AVAILABLE:
        inverse_normalizer = InverseNormalizer(lang='fr', cache_dir=CACHE_DIR, overwrite_cache=False)

    @parameterized.expand(parse_test_case_file('fr/data_inverse_text_normalization/test_cases_cardinal.txt'))
    @pytest.mark.skipif(
        not PYNINI_AVAILABLE, reason="`pynini` not installed, please install via nemo_text_processing/setup.sh"
    )
    @pytest.mark.run_only_on('CPU')
    @pytest.mark.unit
    def test_denorm(self, test_input, expected):
        # The inverse-normalized form of the input must equal the expected text exactly.
        written = self.inverse_normalizer.inverse_normalize(test_input, verbose=False)
        assert written == expected
|
35
tests/nemo_text_processing/fr/test_date.py
Normal file
35
tests/nemo_text_processing/fr/test_date.py
Normal file
|
@ -0,0 +1,35 @@
|
|||
# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import pytest
|
||||
from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer
|
||||
from parameterized import parameterized
|
||||
|
||||
from ..utils import CACHE_DIR, PYNINI_AVAILABLE, parse_test_case_file
|
||||
|
||||
|
||||
class TestDate:
    """French inverse text normalization tests driven by test_cases_date.txt."""

    # Built once when the class body runs; stays None when pynini is missing
    # (test_denorm is skipped in that situation).
    inverse_normalizer = None
    if PYNINI_AVAILABLE:
        inverse_normalizer = InverseNormalizer(lang='fr', cache_dir=CACHE_DIR, overwrite_cache=False)

    @parameterized.expand(parse_test_case_file('fr/data_inverse_text_normalization/test_cases_date.txt'))
    @pytest.mark.skipif(
        not PYNINI_AVAILABLE, reason="`pynini` not installed, please install via nemo_text_processing/setup.sh"
    )
    @pytest.mark.run_only_on('CPU')
    @pytest.mark.unit
    def test_denorm(self, test_input, expected):
        # The inverse-normalized form of the input must equal the expected text exactly.
        written = self.inverse_normalizer.inverse_normalize(test_input, verbose=False)
        assert written == expected
|
35
tests/nemo_text_processing/fr/test_decimal.py
Normal file
35
tests/nemo_text_processing/fr/test_decimal.py
Normal file
|
@ -0,0 +1,35 @@
|
|||
# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import pytest
|
||||
from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer
|
||||
from parameterized import parameterized
|
||||
|
||||
from ..utils import CACHE_DIR, PYNINI_AVAILABLE, parse_test_case_file
|
||||
|
||||
|
||||
class TestDecimal:
    """French inverse text normalization tests driven by test_cases_decimal.txt."""

    # Built once when the class body runs; stays None when pynini is missing
    # (test_denorm is skipped in that situation).
    inverse_normalizer = None
    if PYNINI_AVAILABLE:
        inverse_normalizer = InverseNormalizer(lang='fr', cache_dir=CACHE_DIR, overwrite_cache=False)

    @parameterized.expand(parse_test_case_file('fr/data_inverse_text_normalization/test_cases_decimal.txt'))
    @pytest.mark.skipif(
        not PYNINI_AVAILABLE, reason="`pynini` not installed, please install via nemo_text_processing/setup.sh"
    )
    @pytest.mark.run_only_on('CPU')
    @pytest.mark.unit
    def test_denorm(self, test_input, expected):
        # The inverse-normalized form of the input must equal the expected text exactly.
        written = self.inverse_normalizer.inverse_normalize(test_input, verbose=False)
        assert written == expected
|
35
tests/nemo_text_processing/fr/test_electronic.py
Normal file
35
tests/nemo_text_processing/fr/test_electronic.py
Normal file
|
@ -0,0 +1,35 @@
|
|||
# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import pytest
|
||||
from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer
|
||||
from parameterized import parameterized
|
||||
|
||||
from ..utils import CACHE_DIR, PYNINI_AVAILABLE, parse_test_case_file
|
||||
|
||||
|
||||
class TestElectronic:
    """French inverse text normalization tests driven by test_cases_electronic.txt."""

    # Built once when the class body runs; stays None when pynini is missing
    # (test_denorm is skipped in that situation).
    inverse_normalizer = None
    if PYNINI_AVAILABLE:
        inverse_normalizer = InverseNormalizer(lang='fr', cache_dir=CACHE_DIR, overwrite_cache=False)

    @parameterized.expand(parse_test_case_file('fr/data_inverse_text_normalization/test_cases_electronic.txt'))
    @pytest.mark.skipif(
        not PYNINI_AVAILABLE, reason="`pynini` not installed, please install via nemo_text_processing/setup.sh"
    )
    @pytest.mark.run_only_on('CPU')
    @pytest.mark.unit
    def test_denorm(self, test_input, expected):
        # The inverse-normalized form of the input must equal the expected text exactly.
        written = self.inverse_normalizer.inverse_normalize(test_input, verbose=False)
        assert written == expected
|
36
tests/nemo_text_processing/fr/test_fraction.py
Normal file
36
tests/nemo_text_processing/fr/test_fraction.py
Normal file
|
@ -0,0 +1,36 @@
|
|||
# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
|
||||
import pytest
|
||||
from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer
|
||||
from parameterized import parameterized
|
||||
|
||||
from ..utils import CACHE_DIR, PYNINI_AVAILABLE, parse_test_case_file
|
||||
|
||||
|
||||
class TestFraction:
    """French inverse text normalization tests driven by test_cases_fraction.txt."""

    # Built once when the class body runs; stays None when pynini is missing
    # (test_denorm is skipped in that situation).
    inverse_normalizer = None
    if PYNINI_AVAILABLE:
        inverse_normalizer = InverseNormalizer(lang='fr', cache_dir=CACHE_DIR, overwrite_cache=False)

    @parameterized.expand(parse_test_case_file('fr/data_inverse_text_normalization/test_cases_fraction.txt'))
    @pytest.mark.skipif(
        not PYNINI_AVAILABLE, reason="`pynini` not installed, please install via nemo_text_processing/setup.sh"
    )
    @pytest.mark.run_only_on('CPU')
    @pytest.mark.unit
    def test_denorm(self, test_input, expected):
        # The inverse-normalized form of the input must equal the expected text exactly.
        written = self.inverse_normalizer.inverse_normalize(test_input, verbose=False)
        assert written == expected
|
36
tests/nemo_text_processing/fr/test_measure.py
Normal file
36
tests/nemo_text_processing/fr/test_measure.py
Normal file
|
@ -0,0 +1,36 @@
|
|||
# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
|
||||
import pytest
|
||||
from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer
|
||||
from parameterized import parameterized
|
||||
|
||||
from ..utils import CACHE_DIR, PYNINI_AVAILABLE, parse_test_case_file
|
||||
|
||||
|
||||
class TestMeasure:
    """French inverse text normalization tests driven by test_cases_measure.txt."""

    # Built once when the class body runs; stays None when pynini is missing
    # (test_denorm is skipped in that situation).
    inverse_normalizer = None
    if PYNINI_AVAILABLE:
        inverse_normalizer = InverseNormalizer(lang='fr', cache_dir=CACHE_DIR, overwrite_cache=False)

    @parameterized.expand(parse_test_case_file('fr/data_inverse_text_normalization/test_cases_measure.txt'))
    @pytest.mark.skipif(
        not PYNINI_AVAILABLE, reason="`pynini` not installed, please install via nemo_text_processing/setup.sh"
    )
    @pytest.mark.run_only_on('CPU')
    @pytest.mark.unit
    def test_denorm(self, test_input, expected):
        # The inverse-normalized form of the input must equal the expected text exactly.
        written = self.inverse_normalizer.inverse_normalize(test_input, verbose=False)
        assert written == expected
|
36
tests/nemo_text_processing/fr/test_money.py
Normal file
36
tests/nemo_text_processing/fr/test_money.py
Normal file
|
@ -0,0 +1,36 @@
|
|||
# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
|
||||
import pytest
|
||||
from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer
|
||||
from parameterized import parameterized
|
||||
|
||||
from ..utils import CACHE_DIR, PYNINI_AVAILABLE, parse_test_case_file
|
||||
|
||||
|
||||
class TestMoney:
    """French inverse text normalization tests driven by test_cases_money.txt."""

    # Built once when the class body runs; stays None when pynini is missing
    # (test_denorm is skipped in that situation).
    inverse_normalizer = None
    if PYNINI_AVAILABLE:
        inverse_normalizer = InverseNormalizer(lang='fr', cache_dir=CACHE_DIR, overwrite_cache=False)

    @parameterized.expand(parse_test_case_file('fr/data_inverse_text_normalization/test_cases_money.txt'))
    @pytest.mark.skipif(
        not PYNINI_AVAILABLE, reason="`pynini` not installed, please install via nemo_text_processing/setup.sh"
    )
    @pytest.mark.run_only_on('CPU')
    @pytest.mark.unit
    def test_denorm(self, test_input, expected):
        # The inverse-normalized form of the input must equal the expected text exactly.
        written = self.inverse_normalizer.inverse_normalize(test_input, verbose=False)
        assert written == expected
|
36
tests/nemo_text_processing/fr/test_ordinal.py
Normal file
36
tests/nemo_text_processing/fr/test_ordinal.py
Normal file
|
@ -0,0 +1,36 @@
|
|||
# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
|
||||
import pytest
|
||||
from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer
|
||||
from parameterized import parameterized
|
||||
|
||||
from ..utils import CACHE_DIR, PYNINI_AVAILABLE, parse_test_case_file
|
||||
|
||||
|
||||
class TestOrdinal:
    """French inverse text normalization tests driven by test_cases_ordinal.txt."""

    # Built once when the class body runs; stays None when pynini is missing
    # (test_denorm is skipped in that situation).
    inverse_normalizer = None
    if PYNINI_AVAILABLE:
        inverse_normalizer = InverseNormalizer(lang='fr', cache_dir=CACHE_DIR, overwrite_cache=False)

    @parameterized.expand(parse_test_case_file('fr/data_inverse_text_normalization/test_cases_ordinal.txt'))
    @pytest.mark.skipif(
        not PYNINI_AVAILABLE, reason="`pynini` not installed, please install via nemo_text_processing/setup.sh"
    )
    @pytest.mark.run_only_on('CPU')
    @pytest.mark.unit
    def test_denorm(self, test_input, expected):
        # The inverse-normalized form of the input must equal the expected text exactly.
        written = self.inverse_normalizer.inverse_normalize(test_input, verbose=False)
        assert written == expected
|
|
@ -0,0 +1,84 @@
|
|||
#! /bin/bash
# NOTE: this script requires bash rather than a plain POSIX sh, because
# runtest splits each test case with a here-string (<<<).

PROJECT_DIR=/workspace/tests

# Run every "spoken~written" test case of the file given as $1 through the
# Sparrowhawk normalizer and assert that the last line of its output equals
# the written form.
runtest () {
  input=$1
  # The normalizer is given a relative config path, so it has to run from here.
  cd /workspace/sparrowhawk/documentation/grammars || {
    fail "cannot cd to /workspace/sparrowhawk/documentation/grammars"
    return
  }

  # read test file
  # -r keeps backslashes literal; the `|| [ -n ... ]` guard also handles a
  # final line that is not terminated by a newline.
  while read -r testcase || [ -n "$testcase" ]; do
    IFS='~' read -r spoken written <<< "$testcase"
    # Quoted so the spoken text reaches the normalizer verbatim (no word
    # splitting, no pathname expansion).
    denorm_pred=$(printf '%s\n' "$spoken" | normalizer_main --config=sparrowhawk_configuration.ascii_proto 2>&1 | tail -n 1)

    # trim white space
    written="$(printf '%s\n' "${written}" | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//')"
    denorm_pred="$(printf '%s\n' "${denorm_pred}" | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//')"

    # input     expected     actual
    assertEquals "$spoken" "$written" "$denorm_pred"
  done < "$input"
}

# One shUnit2 test per semiotic class; each feeds its data file to runtest.

testITNCardinal() {
  input=$PROJECT_DIR/fr/data_inverse_text_normalization/test_cases_cardinal.txt
  runtest "$input"
}

testITNDate() {
  input=$PROJECT_DIR/fr/data_inverse_text_normalization/test_cases_date.txt
  runtest "$input"
}

testITNDecimal() {
  input=$PROJECT_DIR/fr/data_inverse_text_normalization/test_cases_decimal.txt
  runtest "$input"
}

testITNOrdinal() {
  input=$PROJECT_DIR/fr/data_inverse_text_normalization/test_cases_ordinal.txt
  runtest "$input"
}

testITNFraction() {
  input=$PROJECT_DIR/fr/data_inverse_text_normalization/test_cases_fraction.txt
  runtest "$input"
}

testITNTime() {
  input=$PROJECT_DIR/fr/data_inverse_text_normalization/test_cases_time.txt
  runtest "$input"
}

testITNMeasure() {
  input=$PROJECT_DIR/fr/data_inverse_text_normalization/test_cases_measure.txt
  runtest "$input"
}

testITNMoney() {
  input=$PROJECT_DIR/fr/data_inverse_text_normalization/test_cases_money.txt
  runtest "$input"
}

testITNWhitelist() {
  input=$PROJECT_DIR/fr/data_inverse_text_normalization/test_cases_whitelist.txt
  runtest "$input"
}

testITNTelephone() {
  input=$PROJECT_DIR/fr/data_inverse_text_normalization/test_cases_telephone.txt
  runtest "$input"
}

testITNElectronic() {
  input=$PROJECT_DIR/fr/data_inverse_text_normalization/test_cases_electronic.txt
  runtest "$input"
}

testITNWord() {
  input=$PROJECT_DIR/fr/data_inverse_text_normalization/test_cases_word.txt
  runtest "$input"
}

# Load shUnit2
. "$PROJECT_DIR/../shunit2/shunit2"
|
36
tests/nemo_text_processing/fr/test_telephone.py
Normal file
36
tests/nemo_text_processing/fr/test_telephone.py
Normal file
|
@ -0,0 +1,36 @@
|
|||
# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
|
||||
import pytest
|
||||
from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer
|
||||
from parameterized import parameterized
|
||||
|
||||
from ..utils import CACHE_DIR, PYNINI_AVAILABLE, parse_test_case_file
|
||||
|
||||
|
||||
class TestTelephone:
    """French inverse text normalization tests driven by test_cases_telephone.txt."""

    # Built once when the class body runs; stays None when pynini is missing
    # (test_denorm is skipped in that situation).
    inverse_normalizer = None
    if PYNINI_AVAILABLE:
        inverse_normalizer = InverseNormalizer(lang='fr', cache_dir=CACHE_DIR, overwrite_cache=False)

    @parameterized.expand(parse_test_case_file('fr/data_inverse_text_normalization/test_cases_telephone.txt'))
    @pytest.mark.skipif(
        not PYNINI_AVAILABLE, reason="`pynini` not installed, please install via nemo_text_processing/setup.sh"
    )
    @pytest.mark.run_only_on('CPU')
    @pytest.mark.unit
    def test_denorm(self, test_input, expected):
        # The inverse-normalized form of the input must equal the expected text exactly.
        written = self.inverse_normalizer.inverse_normalize(test_input, verbose=False)
        assert written == expected
|
35
tests/nemo_text_processing/fr/test_time.py
Normal file
35
tests/nemo_text_processing/fr/test_time.py
Normal file
|
@ -0,0 +1,35 @@
|
|||
# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import pytest
|
||||
from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer
|
||||
from parameterized import parameterized
|
||||
|
||||
from ..utils import CACHE_DIR, PYNINI_AVAILABLE, parse_test_case_file
|
||||
|
||||
|
||||
class TestTime:
    """French inverse-normalization checks driven by the time test-case file."""

    # Construct the French normalizer only when pynini can be used; otherwise
    # keep a None placeholder (the test below is skipped in that case).
    if PYNINI_AVAILABLE:
        inverse_normalizer = InverseNormalizer(lang='fr', cache_dir=CACHE_DIR, overwrite_cache=False)
    else:
        inverse_normalizer = None

    @parameterized.expand(parse_test_case_file('fr/data_inverse_text_normalization/test_cases_time.txt'))
    @pytest.mark.skipif(
        not PYNINI_AVAILABLE, reason="`pynini` not installed, please install via nemo_text_processing/setup.sh"
    )
    @pytest.mark.run_only_on('CPU')
    @pytest.mark.unit
    def test_denorm(self, test_input, expected):
        # Inverse-normalizing `test_input` must reproduce `expected` exactly.
        assert self.inverse_normalizer.inverse_normalize(test_input, verbose=False) == expected
|
36
tests/nemo_text_processing/fr/test_whitelist.py
Normal file
36
tests/nemo_text_processing/fr/test_whitelist.py
Normal file
|
@ -0,0 +1,36 @@
|
|||
# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
|
||||
import pytest
|
||||
from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer
|
||||
from parameterized import parameterized
|
||||
|
||||
from ..utils import CACHE_DIR, PYNINI_AVAILABLE, parse_test_case_file
|
||||
|
||||
|
||||
class TestWhitelist:
    """French inverse-normalization checks driven by the whitelist test-case file."""

    # Construct the French normalizer only when pynini can be used; otherwise
    # keep a None placeholder (the test below is skipped in that case).
    if PYNINI_AVAILABLE:
        inverse_normalizer = InverseNormalizer(lang='fr', cache_dir=CACHE_DIR, overwrite_cache=False)
    else:
        inverse_normalizer = None

    @parameterized.expand(parse_test_case_file('fr/data_inverse_text_normalization/test_cases_whitelist.txt'))
    @pytest.mark.skipif(
        not PYNINI_AVAILABLE, reason="`pynini` not installed, please install via nemo_text_processing/setup.sh"
    )
    @pytest.mark.run_only_on('CPU')
    @pytest.mark.unit
    def test_denorm(self, test_input, expected):
        # Inverse-normalizing `test_input` must reproduce `expected` exactly.
        assert self.inverse_normalizer.inverse_normalize(test_input, verbose=False) == expected
|
36
tests/nemo_text_processing/fr/test_word.py
Normal file
36
tests/nemo_text_processing/fr/test_word.py
Normal file
|
@ -0,0 +1,36 @@
|
|||
# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
|
||||
import pytest
|
||||
from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer
|
||||
from parameterized import parameterized
|
||||
|
||||
from ..utils import CACHE_DIR, PYNINI_AVAILABLE, parse_test_case_file
|
||||
|
||||
|
||||
class TestWord:
    """French inverse-normalization checks driven by the word test-case file."""

    # Construct the French normalizer only when pynini can be used; otherwise
    # keep a None placeholder (the test below is skipped in that case).
    if PYNINI_AVAILABLE:
        inverse_normalizer = InverseNormalizer(lang='fr', cache_dir=CACHE_DIR, overwrite_cache=False)
    else:
        inverse_normalizer = None

    @parameterized.expand(parse_test_case_file('fr/data_inverse_text_normalization/test_cases_word.txt'))
    @pytest.mark.skipif(
        not PYNINI_AVAILABLE, reason="`pynini` not installed, please install via nemo_text_processing/setup.sh"
    )
    @pytest.mark.run_only_on('CPU')
    @pytest.mark.unit
    def test_denorm(self, test_input, expected):
        # Inverse-normalizing `test_input` must reproduce `expected` exactly.
        assert self.inverse_normalizer.inverse_normalize(test_input, verbose=False) == expected
|
|
@ -32,7 +32,7 @@
|
|||
|
||||
GRAMMARS="itn_grammars" # tn_grammars
|
||||
INPUT_CASE="cased" # lower_cased, only for tn_grammars
|
||||
LANGUAGE="en" # language, 'en' supports both TN and ITN, {'de', 'ru', 'es'} supports ITN only
|
||||
LANGUAGE="en" # language, 'en' supports both TN and ITN, {'de', 'ru', 'es', 'fr'} supports ITN only
|
||||
MODE="export"
|
||||
CACHE_DIR="None" # path to cache dir with .far files (to speed the export)
|
||||
OVERWRITE_CACHE="True" # Set to False to re-use .far files
|
||||
|
@ -60,7 +60,7 @@ if [[ ${OVERWRITE_CACHE,,} == "true" ]]; then
|
|||
else OVERWRITE_CACHE=""
|
||||
fi
|
||||
|
||||
python pynini_export.py --output_dir=. --grammars=${GRAMMARS} --input_case=${INPUT_CASE} --language=${LANGUAGE} --cache_dir=${CACHE_DIR} ${OVERWRITE_CACHE}|| exit 1
|
||||
python3 pynini_export.py --output_dir=. --grammars=${GRAMMARS} --input_case=${INPUT_CASE} --language=${LANGUAGE} --cache_dir=${CACHE_DIR} ${OVERWRITE_CACHE}|| exit 1
|
||||
find . -name "Makefile" -type f -delete
|
||||
bash docker/build.sh $FORCE
|
||||
|
||||
|
|
|
@ -87,7 +87,7 @@ def export_grammars(output_dir, grammars):
|
|||
def parse_args():
|
||||
parser = ArgumentParser()
|
||||
parser.add_argument("--output_dir", help="output directory for grammars", required=True, type=str)
|
||||
parser.add_argument("--language", help="language", choices=["en", "de", "es", "ru"], type=str, default='en')
|
||||
parser.add_argument("--language", help="language", choices=["en", "de", "es", "ru", 'fr'], type=str, default='en')
|
||||
parser.add_argument(
|
||||
"--grammars", help="grammars to be exported", choices=["tn_grammars", "itn_grammars"], type=str, required=True
|
||||
)
|
||||
|
@ -146,6 +146,16 @@ if __name__ == '__main__':
|
|||
VerbalizeFst as ITNVerbalizeFst,
|
||||
)
|
||||
|
||||
if args.grammars == 'tn_grammars':
|
||||
raise ValueError(f'"{args.language}" only supports Inverse Text Normalization task.')
|
||||
elif args.language == 'fr':
|
||||
from nemo_text_processing.inverse_text_normalization.fr.taggers.tokenize_and_classify import (
|
||||
ClassifyFst as ITNClassifyFst,
|
||||
)
|
||||
from nemo_text_processing.inverse_text_normalization.fr.verbalizers.verbalize import (
|
||||
VerbalizeFst as ITNVerbalizeFst,
|
||||
)
|
||||
|
||||
if args.grammars == 'tn_grammars':
|
||||
raise ValueError(f'"{args.language}" only supports Inverse Text Normalization task.')
|
||||
|
||||
|
|
Loading…
Reference in a new issue