dictdDBs.wiktionary: use python3

In pkgs/servers/dict:
  * wiktionary/default.nix: change python2 -> python3
  * wiktionary/latest_version.py: decode('utf-8') no longer needed
  * wiktionary/wiktionary2dict.py: 'import codecs' no longer needed
This commit is contained in:
Barry Fishman 2023-01-09 15:35:23 -05:00
parent 01d76ca6c1
commit 9287d37501
3 changed files with 34 additions and 38 deletions

View file

@ -1,4 +1,4 @@
{ lib, stdenv, fetchurl, python2, dict, glibcLocales }: { lib, stdenv, fetchurl, python3, dict, glibcLocales }:
stdenv.mkDerivation rec { stdenv.mkDerivation rec {
pname = "dict-db-wiktionary"; pname = "dict-db-wiktionary";
@ -9,8 +9,7 @@ stdenv.mkDerivation rec {
sha256 = "qsha26LL2513SDtriE/0zdPX1zlnpzk1KKk+R9dSdew="; sha256 = "qsha26LL2513SDtriE/0zdPX1zlnpzk1KKk+R9dSdew=";
}; };
# script in nixpkgs does not support python2 nativeBuildInputs = [ python3 dict glibcLocales ];
nativeBuildInputs = [ python2 dict glibcLocales ];
dontUnpack = true; dontUnpack = true;
@ -18,7 +17,7 @@ stdenv.mkDerivation rec {
mkdir -p $out/share/dictd/ mkdir -p $out/share/dictd/
cd $out/share/dictd cd $out/share/dictd
${python2.interpreter} -O ${./wiktionary2dict.py} "${src}" ${python3.interpreter} -O ${./wiktionary2dict.py} "${src}"
dictzip wiktionary-en.dict dictzip wiktionary-en.dict
echo en_US.UTF-8 > locale echo en_US.UTF-8 > locale
''; '';

View file

@ -25,18 +25,18 @@ def nix_prefetch_url(url, algo='sha256'):
"""Prefetches the content of the given URL.""" """Prefetches the content of the given URL."""
print(f'nix-prefetch-url {url}') print(f'nix-prefetch-url {url}')
out = subprocess.check_output(['nix-prefetch-url', '--type', algo, url]) out = subprocess.check_output(['nix-prefetch-url', '--type', algo, url])
return out.decode('utf-8').rstrip() return out.rstrip()
current_version = subprocess.check_output([ current_version = subprocess.check_output([
'nix', 'eval', '--raw', 'nix', 'eval', '--raw',
'-f', dirname(abspath(__file__)) + '/../../../..', '-f', dirname(abspath(__file__)) + '/../../../..',
'dictdDBs.wiktionary.version', 'dictdDBs.wiktionary.version',
]).decode('utf-8') ])
parser = WiktionaryLatestVersionParser(current_version) parser = WiktionaryLatestVersionParser(current_version)
with urlopen('https://dumps.wikimedia.org/enwiktionary/') as resp: with urlopen('https://dumps.wikimedia.org/enwiktionary/') as resp:
parser.feed(resp.read().decode('utf-8')) parser.feed(resp.read())
print(parser.latest_version) print(parser.latest_version)

View file

@ -2,7 +2,6 @@
# Based on code from wiktiondict by Greg Hewgill # Based on code from wiktiondict by Greg Hewgill
import re import re
import sys import sys
import codecs
import os import os
import textwrap import textwrap
import time import time
@ -48,7 +47,7 @@ class Delimiter:
return self.c return self.c
def Tokenise(s): def Tokenise(s):
s = unicode(s) s = str(s)
stack = [] stack = []
last = 0 last = 0
i = 0 i = 0
@ -109,17 +108,17 @@ def Tokenise(s):
yield s[last:i] yield s[last:i]
def processSub(templates, tokens, args): def processSub(templates, tokens, args):
t = tokens.next() t = next(tokens)
if not isinstance(t, unicode): if not isinstance(t, str):
raise SyntaxError raise SyntaxError
name = t name = t
t = tokens.next() t = next(tokens)
default = None default = None
if isinstance(t, Delimiter) and t.c == '|': if isinstance(t, Delimiter) and t.c == '|':
default = "" default = ""
while True: while True:
t = tokens.next() t = next(tokens)
if isinstance(t, unicode): if isinstance(t, str):
default += t default += t
elif isinstance(t, OpenDouble): elif isinstance(t, OpenDouble):
default += processTemplateCall(templates, tokens, args) default += processTemplateCall(templates, tokens, args)
@ -128,7 +127,7 @@ def processSub(templates, tokens, args):
elif isinstance(t, CloseTriple): elif isinstance(t, CloseTriple):
break break
else: else:
print "Unexpected:", t print("Unexpected:", t)
raise SyntaxError() raise SyntaxError()
if name in args: if name in args:
return args[name] return args[name]
@ -142,14 +141,14 @@ def processTemplateCall(templates, tokens, args):
template = tokens.next().strip().lower() template = tokens.next().strip().lower()
args = {} args = {}
a = 1 a = 1
t = tokens.next() t = next(tokens)
while True: while True:
if isinstance(t, Delimiter): if isinstance(t, Delimiter):
name = unicode(a) name = str(a)
arg = "" arg = ""
while True: while True:
t = tokens.next() t = next(tokens)
if isinstance(t, unicode): if isinstance(t, str):
arg += t arg += t
elif isinstance(t, OpenDouble): elif isinstance(t, OpenDouble):
arg += processTemplateCall(templates, tokens, args) arg += processTemplateCall(templates, tokens, args)
@ -163,9 +162,9 @@ def processTemplateCall(templates, tokens, args):
name = arg.strip() name = arg.strip()
arg = "" arg = ""
while True: while True:
t = tokens.next() t = next(tokens)
if isinstance(t, (unicode, Equals)): if isinstance(t, (str, Equals)):
arg += unicode(t) arg += str(t)
elif isinstance(t, OpenDouble): elif isinstance(t, OpenDouble):
arg += processTemplateCall(templates, tokens, args) arg += processTemplateCall(templates, tokens, args)
elif isinstance(t, OpenTriple): elif isinstance(t, OpenTriple):
@ -181,7 +180,7 @@ def processTemplateCall(templates, tokens, args):
elif isinstance(t, CloseDouble): elif isinstance(t, CloseDouble):
break break
else: else:
print "Unexpected:", t print("Unexpected:", t)
raise SyntaxError raise SyntaxError
#print template, args #print template, args
if template[0] == '#': if template[0] == '#':
@ -208,7 +207,7 @@ def processTemplateCall(templates, tokens, args):
else: else:
return "" return ""
else: else:
print "Unknown ParserFunction:", template print("Unknown ParserFunction:", template)
sys.exit(1) sys.exit(1)
if template not in templates: if template not in templates:
return "{{%s}}" % template return "{{%s}}" % template
@ -225,13 +224,13 @@ def process(templates, s, args = {}):
tokens = Tokenise(s) tokens = Tokenise(s)
try: try:
while True: while True:
t = tokens.next() t = next(tokens)
if isinstance(t, OpenDouble): if isinstance(t, OpenDouble):
r += processTemplateCall(templates, tokens, args) r += processTemplateCall(templates, tokens, args)
elif isinstance(t, OpenTriple): elif isinstance(t, OpenTriple):
r += processSub(templates, tokens, args) r += processSub(templates, tokens, args)
else: else:
r += unicode(t) r += str(t)
except StopIteration: except StopIteration:
pass pass
return r return r
@ -250,11 +249,11 @@ def test():
't6': "t2demo|a", 't6': "t2demo|a",
} }
def t(text, expected): def t(text, expected):
print "text:", text print("text:", text)
s = process(templates, text) s = process(templates, text)
if s != expected: if s != expected:
print "got:", s print("got:", s)
print "expected:", expected print("expected:", expected)
sys.exit(1) sys.exit(1)
t("{{Name-example}}", "I am a template example, my first name is '''{{{firstName}}}''' and my last name is '''{{{lastName}}}'''. You can reference my page at [[{{{lastName}}}, {{{firstName}}}]].") t("{{Name-example}}", "I am a template example, my first name is '''{{{firstName}}}''' and my last name is '''{{{lastName}}}'''. You can reference my page at [[{{{lastName}}}, {{{firstName}}}]].")
t("{{Name-example | firstName=John | lastName=Smith }}", "I am a template example, my first name is '''John''' and my last name is '''Smith'''. You can reference my page at [[Smith, John]].") t("{{Name-example | firstName=John | lastName=Smith }}", "I am a template example, my first name is '''John''' and my last name is '''Smith'''. You can reference my page at [[Smith, John]].")
@ -463,7 +462,7 @@ Parts = {
'Verbal noun': "v.n.", 'Verbal noun': "v.n.",
} }
PartsUsed = {} PartsUsed = {}
for p in Parts.keys(): for p in list(Parts.keys()):
PartsUsed[p] = 0 PartsUsed[p] = 0
def encode(s): def encode(s):
@ -641,7 +640,7 @@ def formatNormal(word, doc):
# r += " "*(depth-1) + word + " (" + p + ")\n\n" # r += " "*(depth-1) + word + " (" + p + ")\n\n"
r += " "*(depth-1) + section.heading + "\n\n" r += " "*(depth-1) + section.heading + "\n\n"
else: else:
print >>errors, "Unknown part: (%s) %s" % (word, section.heading) print("Unknown part: (%s) %s" % (word, section.heading), file=errors)
return "" return ""
elif depth > posdepth: elif depth > posdepth:
return "" return ""
@ -709,8 +708,8 @@ class WikiHandler(xml.sax.ContentHandler):
if self.element == "text": if self.element == "text":
if self.page: if self.page:
if self.page in self.long: if self.page in self.long:
print self.page, len(self.text) print(self.page, len(self.text))
print print()
self.doPage(self.page, self.text) self.doPage(self.page, self.text)
self.page = None self.page = None
self.text = "" self.text = ""
@ -760,8 +759,7 @@ info = """ This file was converted from the original database on:
Wiktionary is available under the GNU Free Documentation License. Wiktionary is available under the GNU Free Documentation License.
""" % (time.ctime(), os.path.basename(fn)) """ % (time.ctime(), os.path.basename(fn))
errors = codecs.open("mkdict.err", "w", "utf_8") errors = open("mkdict.err", "w")
e = codecs.getencoder("utf_8")
Templates = {} Templates = {}
f = os.popen("bunzip2 -c %s" % fn, "r") f = os.popen("bunzip2 -c %s" % fn, "r")
@ -769,10 +767,9 @@ xml.sax.parse(f, TemplateHandler())
f.close() f.close()
f = os.popen("bunzip2 -c %s" % fn, "r") f = os.popen("bunzip2 -c %s" % fn, "r")
out = codecs.getwriter("utf_8")( out = os.popen("dictfmt -p wiktionary-en --locale en_US.UTF-8 --columns 0 -u http://en.wiktionary.org", "w")
os.popen("dictfmt -p wiktionary-en --locale en_US.UTF-8 --columns 0 -u http://en.wiktionary.org", "w"))
out.write(("%%h English Wiktionary\n%s" % info).encode('utf-8')) out.write("%%h English Wiktionary\n%s" % info)
xml.sax.parse(f, WordHandler()) xml.sax.parse(f, WordHandler())
f.close() f.close()
out.close() out.close()