mirror of
https://github.com/NixOS/nixpkgs.git
synced 2024-11-16 23:03:40 +01:00
dictdDbs.wiktionary: use python3
In pkgs/servers/dict: * wiktionary/default.nix: change python2 -> python3 * wiktionary/latest_version.py: decode('utf-8') no longer needed * wiktionary/wiktionary2dict.py: 'import codecs' no longer needed
This commit is contained in:
parent
01d76ca6c1
commit
9287d37501
3 changed files with 34 additions and 38 deletions
|
@ -1,4 +1,4 @@
|
||||||
{ lib, stdenv, fetchurl, python2, dict, glibcLocales }:
|
{ lib, stdenv, fetchurl, python3, dict, glibcLocales }:
|
||||||
|
|
||||||
stdenv.mkDerivation rec {
|
stdenv.mkDerivation rec {
|
||||||
pname = "dict-db-wiktionary";
|
pname = "dict-db-wiktionary";
|
||||||
|
@ -9,8 +9,7 @@ stdenv.mkDerivation rec {
|
||||||
sha256 = "qsha26LL2513SDtriE/0zdPX1zlnpzk1KKk+R9dSdew=";
|
sha256 = "qsha26LL2513SDtriE/0zdPX1zlnpzk1KKk+R9dSdew=";
|
||||||
};
|
};
|
||||||
|
|
||||||
# script in nixpkgs does not support python2
|
nativeBuildInputs = [ python3 dict glibcLocales ];
|
||||||
nativeBuildInputs = [ python2 dict glibcLocales ];
|
|
||||||
|
|
||||||
dontUnpack = true;
|
dontUnpack = true;
|
||||||
|
|
||||||
|
@ -18,7 +17,7 @@ stdenv.mkDerivation rec {
|
||||||
mkdir -p $out/share/dictd/
|
mkdir -p $out/share/dictd/
|
||||||
cd $out/share/dictd
|
cd $out/share/dictd
|
||||||
|
|
||||||
${python2.interpreter} -O ${./wiktionary2dict.py} "${src}"
|
${python3.interpreter} -O ${./wiktionary2dict.py} "${src}"
|
||||||
dictzip wiktionary-en.dict
|
dictzip wiktionary-en.dict
|
||||||
echo en_US.UTF-8 > locale
|
echo en_US.UTF-8 > locale
|
||||||
'';
|
'';
|
||||||
|
|
|
@ -25,18 +25,18 @@ def nix_prefetch_url(url, algo='sha256'):
|
||||||
"""Prefetches the content of the given URL."""
|
"""Prefetches the content of the given URL."""
|
||||||
print(f'nix-prefetch-url {url}')
|
print(f'nix-prefetch-url {url}')
|
||||||
out = subprocess.check_output(['nix-prefetch-url', '--type', algo, url])
|
out = subprocess.check_output(['nix-prefetch-url', '--type', algo, url])
|
||||||
return out.decode('utf-8').rstrip()
|
return out.rstrip()
|
||||||
|
|
||||||
|
|
||||||
current_version = subprocess.check_output([
|
current_version = subprocess.check_output([
|
||||||
'nix', 'eval', '--raw',
|
'nix', 'eval', '--raw',
|
||||||
'-f', dirname(abspath(__file__)) + '/../../../..',
|
'-f', dirname(abspath(__file__)) + '/../../../..',
|
||||||
'dictdDBs.wiktionary.version',
|
'dictdDBs.wiktionary.version',
|
||||||
]).decode('utf-8')
|
])
|
||||||
|
|
||||||
parser = WiktionaryLatestVersionParser(current_version)
|
parser = WiktionaryLatestVersionParser(current_version)
|
||||||
|
|
||||||
with urlopen('https://dumps.wikimedia.org/enwiktionary/') as resp:
|
with urlopen('https://dumps.wikimedia.org/enwiktionary/') as resp:
|
||||||
parser.feed(resp.read().decode('utf-8'))
|
parser.feed(resp.read())
|
||||||
|
|
||||||
print(parser.latest_version)
|
print(parser.latest_version)
|
||||||
|
|
|
@ -2,7 +2,6 @@
|
||||||
# Based on code from wiktiondict by Greg Hewgill
|
# Based on code from wiktiondict by Greg Hewgill
|
||||||
import re
|
import re
|
||||||
import sys
|
import sys
|
||||||
import codecs
|
|
||||||
import os
|
import os
|
||||||
import textwrap
|
import textwrap
|
||||||
import time
|
import time
|
||||||
|
@ -48,7 +47,7 @@ class Delimiter:
|
||||||
return self.c
|
return self.c
|
||||||
|
|
||||||
def Tokenise(s):
|
def Tokenise(s):
|
||||||
s = unicode(s)
|
s = str(s)
|
||||||
stack = []
|
stack = []
|
||||||
last = 0
|
last = 0
|
||||||
i = 0
|
i = 0
|
||||||
|
@ -109,17 +108,17 @@ def Tokenise(s):
|
||||||
yield s[last:i]
|
yield s[last:i]
|
||||||
|
|
||||||
def processSub(templates, tokens, args):
|
def processSub(templates, tokens, args):
|
||||||
t = tokens.next()
|
t = next(tokens)
|
||||||
if not isinstance(t, unicode):
|
if not isinstance(t, str):
|
||||||
raise SyntaxError
|
raise SyntaxError
|
||||||
name = t
|
name = t
|
||||||
t = tokens.next()
|
t = next(tokens)
|
||||||
default = None
|
default = None
|
||||||
if isinstance(t, Delimiter) and t.c == '|':
|
if isinstance(t, Delimiter) and t.c == '|':
|
||||||
default = ""
|
default = ""
|
||||||
while True:
|
while True:
|
||||||
t = tokens.next()
|
t = next(tokens)
|
||||||
if isinstance(t, unicode):
|
if isinstance(t, str):
|
||||||
default += t
|
default += t
|
||||||
elif isinstance(t, OpenDouble):
|
elif isinstance(t, OpenDouble):
|
||||||
default += processTemplateCall(templates, tokens, args)
|
default += processTemplateCall(templates, tokens, args)
|
||||||
|
@ -128,7 +127,7 @@ def processSub(templates, tokens, args):
|
||||||
elif isinstance(t, CloseTriple):
|
elif isinstance(t, CloseTriple):
|
||||||
break
|
break
|
||||||
else:
|
else:
|
||||||
print "Unexpected:", t
|
print("Unexpected:", t)
|
||||||
raise SyntaxError()
|
raise SyntaxError()
|
||||||
if name in args:
|
if name in args:
|
||||||
return args[name]
|
return args[name]
|
||||||
|
@ -142,14 +141,14 @@ def processTemplateCall(templates, tokens, args):
|
||||||
template = tokens.next().strip().lower()
|
template = tokens.next().strip().lower()
|
||||||
args = {}
|
args = {}
|
||||||
a = 1
|
a = 1
|
||||||
t = tokens.next()
|
t = next(tokens)
|
||||||
while True:
|
while True:
|
||||||
if isinstance(t, Delimiter):
|
if isinstance(t, Delimiter):
|
||||||
name = unicode(a)
|
name = str(a)
|
||||||
arg = ""
|
arg = ""
|
||||||
while True:
|
while True:
|
||||||
t = tokens.next()
|
t = next(tokens)
|
||||||
if isinstance(t, unicode):
|
if isinstance(t, str):
|
||||||
arg += t
|
arg += t
|
||||||
elif isinstance(t, OpenDouble):
|
elif isinstance(t, OpenDouble):
|
||||||
arg += processTemplateCall(templates, tokens, args)
|
arg += processTemplateCall(templates, tokens, args)
|
||||||
|
@ -163,9 +162,9 @@ def processTemplateCall(templates, tokens, args):
|
||||||
name = arg.strip()
|
name = arg.strip()
|
||||||
arg = ""
|
arg = ""
|
||||||
while True:
|
while True:
|
||||||
t = tokens.next()
|
t = next(tokens)
|
||||||
if isinstance(t, (unicode, Equals)):
|
if isinstance(t, (str, Equals)):
|
||||||
arg += unicode(t)
|
arg += str(t)
|
||||||
elif isinstance(t, OpenDouble):
|
elif isinstance(t, OpenDouble):
|
||||||
arg += processTemplateCall(templates, tokens, args)
|
arg += processTemplateCall(templates, tokens, args)
|
||||||
elif isinstance(t, OpenTriple):
|
elif isinstance(t, OpenTriple):
|
||||||
|
@ -181,7 +180,7 @@ def processTemplateCall(templates, tokens, args):
|
||||||
elif isinstance(t, CloseDouble):
|
elif isinstance(t, CloseDouble):
|
||||||
break
|
break
|
||||||
else:
|
else:
|
||||||
print "Unexpected:", t
|
print("Unexpected:", t)
|
||||||
raise SyntaxError
|
raise SyntaxError
|
||||||
#print template, args
|
#print template, args
|
||||||
if template[0] == '#':
|
if template[0] == '#':
|
||||||
|
@ -208,7 +207,7 @@ def processTemplateCall(templates, tokens, args):
|
||||||
else:
|
else:
|
||||||
return ""
|
return ""
|
||||||
else:
|
else:
|
||||||
print "Unknown ParserFunction:", template
|
print("Unknown ParserFunction:", template)
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
if template not in templates:
|
if template not in templates:
|
||||||
return "{{%s}}" % template
|
return "{{%s}}" % template
|
||||||
|
@ -225,13 +224,13 @@ def process(templates, s, args = {}):
|
||||||
tokens = Tokenise(s)
|
tokens = Tokenise(s)
|
||||||
try:
|
try:
|
||||||
while True:
|
while True:
|
||||||
t = tokens.next()
|
t = next(tokens)
|
||||||
if isinstance(t, OpenDouble):
|
if isinstance(t, OpenDouble):
|
||||||
r += processTemplateCall(templates, tokens, args)
|
r += processTemplateCall(templates, tokens, args)
|
||||||
elif isinstance(t, OpenTriple):
|
elif isinstance(t, OpenTriple):
|
||||||
r += processSub(templates, tokens, args)
|
r += processSub(templates, tokens, args)
|
||||||
else:
|
else:
|
||||||
r += unicode(t)
|
r += str(t)
|
||||||
except StopIteration:
|
except StopIteration:
|
||||||
pass
|
pass
|
||||||
return r
|
return r
|
||||||
|
@ -250,11 +249,11 @@ def test():
|
||||||
't6': "t2demo|a",
|
't6': "t2demo|a",
|
||||||
}
|
}
|
||||||
def t(text, expected):
|
def t(text, expected):
|
||||||
print "text:", text
|
print("text:", text)
|
||||||
s = process(templates, text)
|
s = process(templates, text)
|
||||||
if s != expected:
|
if s != expected:
|
||||||
print "got:", s
|
print("got:", s)
|
||||||
print "expected:", expected
|
print("expected:", expected)
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
t("{{Name-example}}", "I am a template example, my first name is '''{{{firstName}}}''' and my last name is '''{{{lastName}}}'''. You can reference my page at [[{{{lastName}}}, {{{firstName}}}]].")
|
t("{{Name-example}}", "I am a template example, my first name is '''{{{firstName}}}''' and my last name is '''{{{lastName}}}'''. You can reference my page at [[{{{lastName}}}, {{{firstName}}}]].")
|
||||||
t("{{Name-example | firstName=John | lastName=Smith }}", "I am a template example, my first name is '''John''' and my last name is '''Smith'''. You can reference my page at [[Smith, John]].")
|
t("{{Name-example | firstName=John | lastName=Smith }}", "I am a template example, my first name is '''John''' and my last name is '''Smith'''. You can reference my page at [[Smith, John]].")
|
||||||
|
@ -463,7 +462,7 @@ Parts = {
|
||||||
'Verbal noun': "v.n.",
|
'Verbal noun': "v.n.",
|
||||||
}
|
}
|
||||||
PartsUsed = {}
|
PartsUsed = {}
|
||||||
for p in Parts.keys():
|
for p in list(Parts.keys()):
|
||||||
PartsUsed[p] = 0
|
PartsUsed[p] = 0
|
||||||
|
|
||||||
def encode(s):
|
def encode(s):
|
||||||
|
@ -641,7 +640,7 @@ def formatNormal(word, doc):
|
||||||
# r += " "*(depth-1) + word + " (" + p + ")\n\n"
|
# r += " "*(depth-1) + word + " (" + p + ")\n\n"
|
||||||
r += " "*(depth-1) + section.heading + "\n\n"
|
r += " "*(depth-1) + section.heading + "\n\n"
|
||||||
else:
|
else:
|
||||||
print >>errors, "Unknown part: (%s) %s" % (word, section.heading)
|
print("Unknown part: (%s) %s" % (word, section.heading), file=errors)
|
||||||
return ""
|
return ""
|
||||||
elif depth > posdepth:
|
elif depth > posdepth:
|
||||||
return ""
|
return ""
|
||||||
|
@ -709,8 +708,8 @@ class WikiHandler(xml.sax.ContentHandler):
|
||||||
if self.element == "text":
|
if self.element == "text":
|
||||||
if self.page:
|
if self.page:
|
||||||
if self.page in self.long:
|
if self.page in self.long:
|
||||||
print self.page, len(self.text)
|
print(self.page, len(self.text))
|
||||||
print
|
print()
|
||||||
self.doPage(self.page, self.text)
|
self.doPage(self.page, self.text)
|
||||||
self.page = None
|
self.page = None
|
||||||
self.text = ""
|
self.text = ""
|
||||||
|
@ -760,8 +759,7 @@ info = """ This file was converted from the original database on:
|
||||||
Wiktionary is available under the GNU Free Documentation License.
|
Wiktionary is available under the GNU Free Documentation License.
|
||||||
""" % (time.ctime(), os.path.basename(fn))
|
""" % (time.ctime(), os.path.basename(fn))
|
||||||
|
|
||||||
errors = codecs.open("mkdict.err", "w", "utf_8")
|
errors = open("mkdict.err", "w")
|
||||||
e = codecs.getencoder("utf_8")
|
|
||||||
|
|
||||||
Templates = {}
|
Templates = {}
|
||||||
f = os.popen("bunzip2 -c %s" % fn, "r")
|
f = os.popen("bunzip2 -c %s" % fn, "r")
|
||||||
|
@ -769,10 +767,9 @@ xml.sax.parse(f, TemplateHandler())
|
||||||
f.close()
|
f.close()
|
||||||
|
|
||||||
f = os.popen("bunzip2 -c %s" % fn, "r")
|
f = os.popen("bunzip2 -c %s" % fn, "r")
|
||||||
out = codecs.getwriter("utf_8")(
|
out = os.popen("dictfmt -p wiktionary-en --locale en_US.UTF-8 --columns 0 -u http://en.wiktionary.org", "w")
|
||||||
os.popen("dictfmt -p wiktionary-en --locale en_US.UTF-8 --columns 0 -u http://en.wiktionary.org", "w"))
|
|
||||||
|
|
||||||
out.write(("%%h English Wiktionary\n%s" % info).encode('utf-8'))
|
out.write("%%h English Wiktionary\n%s" % info)
|
||||||
xml.sax.parse(f, WordHandler())
|
xml.sax.parse(f, WordHandler())
|
||||||
f.close()
|
f.close()
|
||||||
out.close()
|
out.close()
|
||||||
|
|
Loading…
Reference in a new issue