mirror of https://github.com/NixOS/nixpkgs.git

Merge pull request #209961 from barryfm/fix/dictd-python3-update

    Fix/dictd python3 update

commit 2cab4c5d8b
5 changed files with 75 additions and 78 deletions

pkgs/servers/dict/dictd-wordnet.nix

@@ -1,10 +1,10 @@
-{lib, stdenv, python2, wordnet, writeScript}:
+{lib, stdenv, python3, wordnet, writeScript}:
 
 stdenv.mkDerivation rec {
   version = "542";
   pname = "dict-db-wordnet";
 
-  buildInputs = [python2 wordnet];
+  buildInputs = [python3 wordnet];
   convert = ./wordnet_structures.py;
 
   builder = writeScript "builder.sh" ''

pkgs/servers/dict/dictd-wiktionary.nix

@@ -1,4 +1,4 @@
-{ lib, stdenv, fetchurl, python2, dict, glibcLocales }:
+{ lib, stdenv, fetchurl, python3, dict, glibcLocales }:
 
 stdenv.mkDerivation rec {
   pname = "dict-db-wiktionary";
@@ -9,8 +9,7 @@ stdenv.mkDerivation rec {
     sha256 = "qsha26LL2513SDtriE/0zdPX1zlnpzk1KKk+R9dSdew=";
   };
 
-  # script in nixpkgs does not support python2
-  nativeBuildInputs = [ python2 dict glibcLocales ];
+  nativeBuildInputs = [ python3 dict glibcLocales ];
 
   dontUnpack = true;
 
@@ -18,7 +17,7 @@ stdenv.mkDerivation rec {
     mkdir -p $out/share/dictd/
     cd $out/share/dictd
 
-    ${python2.interpreter} -O ${./wiktionary2dict.py} "${src}"
+    ${python3.interpreter} -O ${./wiktionary2dict.py} "${src}"
     dictzip wiktionary-en.dict
     echo en_US.UTF-8 > locale
   '';

@@ -25,18 +25,18 @@ def nix_prefetch_url(url, algo='sha256'):
     """Prefetches the content of the given URL."""
     print(f'nix-prefetch-url {url}')
     out = subprocess.check_output(['nix-prefetch-url', '--type', algo, url])
-    return out.decode('utf-8').rstrip()
+    return out.rstrip()
 
 
 current_version = subprocess.check_output([
     'nix', 'eval', '--raw',
     '-f', dirname(abspath(__file__)) + '/../../../..',
     'dictdDBs.wiktionary.version',
-]).decode('utf-8')
+])
 
 parser = WiktionaryLatestVersionParser(current_version)
 
 with urlopen('https://dumps.wikimedia.org/enwiktionary/') as resp:
-    parser.feed(resp.read().decode('utf-8'))
+    parser.feed(resp.read())
 
 print(parser.latest_version)
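
The lines touched in this hunk are exactly the points where Python 3's bytes-versus-str split matters: subprocess.check_output and urlopen().read() both hand back bytes, and any string work on the result needs an explicit decode (or text mode requested up front). A minimal illustration, separate from the patch:

    import subprocess

    # Under Python 3, check_output returns raw bytes by default
    # (the command here is just a stand-in that echoes a string).
    out = subprocess.check_output(['echo', 'nixpkgs'])
    print(type(out))                       # <class 'bytes'>
    print(out.decode('utf-8').rstrip())    # nixpkgs

    # Passing text=True makes the call decode for you and return str.
    out = subprocess.check_output(['echo', 'nixpkgs'], text=True)
    print(out.rstrip())                    # nixpkgs
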
pkgs/servers/dict/wiktionary2dict.py

@@ -2,7 +2,6 @@
 # Based on code from wiktiondict by Greg Hewgill
 import re
 import sys
-import codecs
 import os
 import textwrap
 import time
@@ -48,7 +47,7 @@ class Delimiter:
         return self.c
 
 def Tokenise(s):
-    s = unicode(s)
+    s = str(s)
     stack = []
     last = 0
     i = 0
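
Replacing unicode(s) with str(s) is the standard 2to3 move: Python 3 has no unicode builtin because str itself is the unicode text type. A quick illustration, not taken from the patch:

    s = "München"          # a str is already unicode text in Python 3
    assert isinstance(s, str)
    assert str(s) == s     # str() on a str is effectively a no-op
    # In Python 2, unicode(s) decoded a byte string; that distinction is gone.
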
@@ -109,17 +108,17 @@ def Tokenise(s):
             yield s[last:i]
 
 def processSub(templates, tokens, args):
-    t = tokens.next()
-    if not isinstance(t, unicode):
+    t = next(tokens)
+    if not isinstance(t, str):
         raise SyntaxError
     name = t
-    t = tokens.next()
+    t = next(tokens)
     default = None
     if isinstance(t, Delimiter) and t.c == '|':
         default = ""
         while True:
-            t = tokens.next()
-            if isinstance(t, unicode):
+            t = next(tokens)
+            if isinstance(t, str):
                 default += t
             elif isinstance(t, OpenDouble):
                 default += processTemplateCall(templates, tokens, args)
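
The tokens.next() to next(tokens) rewrite is required because Python 3 renamed the iterator method to __next__ and expects callers to go through the next() builtin. A small sketch, independent of the patch:

    def tokenise():
        yield 'a'
        yield 'b'

    tokens = tokenise()
    print(next(tokens))         # 'a'  (Python 2 spelled this tokens.next())
    print(next(tokens))         # 'b'
    print(next(tokens, None))   # None: a default avoids StopIteration
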
@@ -128,7 +127,7 @@ def processSub(templates, tokens, args):
         elif isinstance(t, CloseTriple):
             break
         else:
-            print "Unexpected:", t
+            print("Unexpected:", t)
             raise SyntaxError()
     if name in args:
         return args[name]
@@ -142,14 +141,14 @@ def processTemplateCall(templates, tokens, args):
     template = tokens.next().strip().lower()
     args = {}
     a = 1
-    t = tokens.next()
+    t = next(tokens)
     while True:
         if isinstance(t, Delimiter):
-            name = unicode(a)
+            name = str(a)
             arg = ""
             while True:
-                t = tokens.next()
-                if isinstance(t, unicode):
+                t = next(tokens)
+                if isinstance(t, str):
                     arg += t
                 elif isinstance(t, OpenDouble):
                     arg += processTemplateCall(templates, tokens, args)
@@ -163,9 +162,9 @@ def processTemplateCall(templates, tokens, args):
                 name = arg.strip()
                 arg = ""
                 while True:
-                    t = tokens.next()
-                    if isinstance(t, (unicode, Equals)):
-                        arg += unicode(t)
+                    t = next(tokens)
+                    if isinstance(t, (str, Equals)):
+                        arg += str(t)
                     elif isinstance(t, OpenDouble):
                         arg += processTemplateCall(templates, tokens, args)
                     elif isinstance(t, OpenTriple):
@@ -181,7 +180,7 @@ def processTemplateCall(templates, tokens, args):
         elif isinstance(t, CloseDouble):
             break
         else:
-            print "Unexpected:", t
+            print("Unexpected:", t)
             raise SyntaxError
     #print template, args
     if template[0] == '#':
@@ -208,7 +207,7 @@ def processTemplateCall(templates, tokens, args):
         else:
             return ""
     else:
-        print "Unknown ParserFunction:", template
+        print("Unknown ParserFunction:", template)
         sys.exit(1)
     if template not in templates:
         return "{{%s}}" % template
@@ -225,13 +224,13 @@ def process(templates, s, args = {}):
     tokens = Tokenise(s)
     try:
         while True:
-            t = tokens.next()
+            t = next(tokens)
             if isinstance(t, OpenDouble):
                 r += processTemplateCall(templates, tokens, args)
             elif isinstance(t, OpenTriple):
                 r += processSub(templates, tokens, args)
             else:
-                r += unicode(t)
+                r += str(t)
     except StopIteration:
         pass
     return r
@@ -250,11 +249,11 @@ def test():
         't6': "t2demo|a",
     }
     def t(text, expected):
-        print "text:", text
+        print("text:", text)
         s = process(templates, text)
         if s != expected:
-            print "got:", s
-            print "expected:", expected
+            print("got:", s)
+            print("expected:", expected)
             sys.exit(1)
     t("{{Name-example}}", "I am a template example, my first name is '''{{{firstName}}}''' and my last name is '''{{{lastName}}}'''. You can reference my page at [[{{{lastName}}}, {{{firstName}}}]].")
     t("{{Name-example | firstName=John | lastName=Smith }}", "I am a template example, my first name is '''John''' and my last name is '''Smith'''. You can reference my page at [[Smith, John]].")
@@ -463,7 +462,7 @@ Parts = {
     'Verbal noun': "v.n.",
 }
 PartsUsed = {}
-for p in Parts.keys():
+for p in list(Parts.keys()):
     PartsUsed[p] = 0
 
 def encode(s):
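
Wrapping Parts.keys() in list() is what 2to3 emits because Python 3's dict.keys() returns a view rather than a list. For a read-only loop like this one the wrapper is harmless but not strictly needed; it only matters when the dict is mutated mid-iteration or list methods are called on the result. A small sketch with hypothetical names:

    parts = {'Noun': 'n.', 'Verb': 'v.'}
    used = {}
    for p in list(parts.keys()):   # explicit snapshot of the keys
        used[p] = 0
    for p in parts:                # iterating the dict directly also works here
        used[p] = 0
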
@@ -641,7 +640,7 @@ def formatNormal(word, doc):
 #                r += " "*(depth-1) + word + " (" + p + ")\n\n"
                 r += " "*(depth-1) + section.heading + "\n\n"
             else:
-                print >>errors, "Unknown part: (%s) %s" % (word, section.heading)
+                print("Unknown part: (%s) %s" % (word, section.heading), file=errors)
                 return ""
         elif depth > posdepth:
             return ""
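
The print >>errors syntax was Python 2's way of redirecting a print statement to a stream; Python 3's print function takes a file= keyword instead. For example (not from the patch):

    import sys

    # Python 2: print >>sys.stderr, "warning: unknown part of speech"
    print("warning: unknown part of speech", file=sys.stderr)
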
@@ -709,8 +708,8 @@ class WikiHandler(xml.sax.ContentHandler):
         if self.element == "text":
             if self.page:
                 if self.page in self.long:
-                    print self.page, len(self.text)
-                    print
+                    print(self.page, len(self.text))
+                    print()
                 self.doPage(self.page, self.text)
                 self.page = None
             self.text = ""
@@ -760,8 +759,7 @@ info = """ This file was converted from the original database on:
           Wiktionary is available under the GNU Free Documentation License.
 """ % (time.ctime(), os.path.basename(fn))
 
-errors = codecs.open("mkdict.err", "w", "utf_8")
-e = codecs.getencoder("utf_8")
+errors = open("mkdict.err", "w")
 
 Templates = {}
 f = os.popen("bunzip2 -c %s" % fn, "r")
@@ -769,10 +767,9 @@ xml.sax.parse(f, TemplateHandler())
 f.close()
 
 f = os.popen("bunzip2 -c %s" % fn, "r")
-out = codecs.getwriter("utf_8")(
-    os.popen("dictfmt -p wiktionary-en --locale en_US.UTF-8 --columns 0 -u http://en.wiktionary.org", "w"))
+out = os.popen("dictfmt -p wiktionary-en --locale en_US.UTF-8 --columns 0 -u http://en.wiktionary.org", "w")
 
-out.write(("%%h English Wiktionary\n%s" % info).encode('utf-8'))
+out.write("%%h English Wiktionary\n%s" % info)
 xml.sax.parse(f, WordHandler())
 f.close()
 out.close()
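
Dropping the codecs wrappers follows from Python 3's I/O model: the built-in open returns text-mode files that encode and decode for you, so codecs.open and codecs.getwriter are no longer needed, and already-decoded str values can be written as-is. A minimal sketch, assuming a UTF-8 error log like the one above:

    # Python 2 needed: errors = codecs.open("mkdict.err", "w", "utf_8")
    with open("mkdict.err", "w", encoding="utf-8") as errors:
        errors.write("Unknown part: Äpfel\n")   # str in, UTF-8 bytes on disk
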

pkgs/servers/dict/wordnet_structures.py  (77 lines changed; normal file → executable file)

@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
 #Copyright 2007 Sebastian Hagen
 # This file is part of wordnet_tools.
 
@@ -26,6 +26,7 @@
 # written.
 
 import datetime
+import math
 from textwrap import TextWrapper
 
 CAT_ADJECTIVE = 0
@@ -49,7 +50,7 @@ class WordIndex:
         self.ptrs = ptrs
         self.synsets = synsets
         self.tagsense_count = tagsense_count
 
     @classmethod
     def build_from_line(cls, line_data, synset_map):
         line_split = line_data.split()
@@ -61,14 +62,14 @@ class WordIndex:
         tagsense_count = int(line_split[5 + ptr_count],10)
         synsets = [synset_map[int(line_split[i],10)] for i in range(6 + ptr_count, 6 + ptr_count + synset_count)]
         return cls(lemma, category, ptrs, synsets, tagsense_count)
 
     @classmethod
     def build_from_file(cls, f, synset_map, rv_base=None):
         if (rv_base is None):
             rv = {}
         else:
             rv = rv_base
 
         for line in f:
             if (line.startswith(' ')):
                 continue
@@ -81,8 +82,8 @@ class WordIndex:
 
     def __repr__(self):
         return '%s%s' % (self.__class__.__name__, (self.lemma, self.category, self.ptrs, self.synsets, self.tagsense_count))
 
 
 class WordIndexDictFormatter(WordIndex):
     category_map_rev = {
         CAT_NOUN: 'n',
@@ -96,12 +97,12 @@ class WordIndexDictFormatter(WordIndex):
     prefix_fmtn_line_first = ' '
     prefix_fmtf_line_nonfirst = '%5d: '
     prefix_fmtn_line_nonfirst = ' '
 
     def dict_str(self):
         tw = TextWrapper(width=self.LINE_WIDTH_MAX,
             initial_indent=(self.prefix_fmtf_line_first % self.category_map_rev[self.category]),
             subsequent_indent=self.prefix_fmtn_line_first)
 
         lines = (tw.wrap(self.synsets[0].dict_str()))
         i = 2
         for synset in self.synsets[1:]:
@@ -122,7 +123,7 @@ class Synset:
         self.gloss = gloss
         self.frames = frames
         self.comments = []
 
     @classmethod
     def build_from_line(cls, line_data):
         line_split = line_data.split()
@@ -132,7 +133,7 @@ class Synset:
         words = [line_split[i] for i in range(4, 4 + word_count*2,2)]
         ptr_count = int(line_split[4 + word_count*2],10)
         ptrs = [(line_split[i], line_split[i+1], line_split[i+2], line_split[i+3]) for i in range(5 + word_count*2,4 + word_count*2 + ptr_count*4,4)]
 
         tok = line_split[5 + word_count*2 + ptr_count*4]
         base = 6 + word_count*2 + ptr_count*4
         if (tok != '|'):
@@ -141,20 +142,20 @@ class Synset:
             base += frame_count*3 + 1
         else:
             frames = []
 
         line_split2 = line_data.split(None, base)
         if (len(line_split2) < base):
             gloss = None
         else:
             gloss = line_split2[-1]
 
         return cls(synset_offset, ss_type, words, ptrs, gloss, frames)
 
     @classmethod
     def build_from_file(cls, f):
         rv = {}
         comments = []
 
         for line in f:
             if (line.startswith(' ')):
                 line_s = line.lstrip().rstrip('\n')
@@ -197,14 +198,14 @@ original version.\n\n
     datetime_fmt = '%Y-%m-%dT%H:%M:%S'
     base64_map = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/'
 
     def __init__(self, wn_url, desc_short, desc_long):
         self.word_data = {}
         self.wn_url = wn_url
         self.desc_short = desc_short
         self.desc_long = desc_long
         self.wn_license = None
 
 
     def wn_dict_add(self, file_index, file_data):
         file_data.seek(0)
         file_index.seek(0)
@@ -212,7 +213,7 @@ original version.\n\n
         WordIndexDictFormatter.build_from_file(file_index, synsets, self.word_data)
         if (license_lines):
             self.wn_license = '\n'.join(license_lines) + '\n'
 
     @classmethod
     def base64_encode(cls, i):
         """Encode a non-negative integer into a dictd compatible base64 string"""
@@ -223,15 +224,15 @@ original version.\n\n
         while (r < i):
             e += 1
             r = 64**e - 1
 
         rv = ''
         while (e > 0):
             e -= 1
-            d = (i / 64**e)
+            d = math.floor(i / 64**e)
             rv += cls.base64_map[d]
             i = i % (64**e)
         return rv
 
     @classmethod
     def dict_entry_write(cls, file_index, file_data, key, entry, linesep='\n'):
         """Write a single dict entry for <key> to index and data files"""
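
The math.floor change (and the new math import above) deals with Python 3's true division: in Python 2, i / 64**e between integers floored automatically, while Python 3 yields a float, which cannot index base64_map. Either math.floor or the floor-division operator restores an int:

    import math

    print(7 / 2)              # 3.5 in Python 3 (was 3 in Python 2)
    print(7 // 2)             # 3
    print(math.floor(7 / 2))  # 3
    'ABCD'[7 // 2]            # fine; 'ABCD'[7 / 2] would raise TypeError

Writing i // 64**e would have avoided the new import, but both spellings are correct here.
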
@@ -240,7 +241,7 @@ original version.\n\n
         entry_len = len(entry)
         file_index.write('%s\t%s\t%s%s' % (key, cls.base64_encode(entry_start),
             cls.base64_encode(entry_len), linesep))
 
     def dict_generate(self, file_index, file_data):
         file_index.seek(0)
         file_data.seek(0)
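
For context, dict_entry_write above emits the dictd index format: one tab-separated line per headword carrying the entry's byte offset and length in the data file, each encoded in dictd's unpadded base64 alphabet. A rough sketch of that encoding (a hypothetical helper, not the class method itself):

    B64 = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/'

    def b64(n):
        # dictd-style base64: most significant digit first, no padding, 'A' == 0
        digits = ''
        while True:
            digits = B64[n % 64] + digits
            n //= 64
            if n == 0:
                return digits

    print('%s\t%s\t%s' % ('wordnet', b64(1024), b64(87)))  # wordnet, QA, BX
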
@@ -261,8 +262,8 @@ original version.\n\n
         self.dict_entry_write(file_index, file_data, '00-database-short', '00-database-short\n%s\n' % self.desc_short)
         self.dict_entry_write(file_index, file_data, '00-database-url', '00-database-url\n%s\n' % self.wn_url)
 
-        words = self.word_data.keys()
+        words = list(self.word_data.keys())
         words.sort()
         for word in words:
             for wi in self.word_data[word]:
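
Unlike the read-only Parts loop earlier, this keys() call genuinely needs the list() wrapper: the code sorts in place, and a Python 3 view object has no .sort() method. Sketch:

    word_data = {'b': 1, 'a': 2}
    words = list(word_data.keys())
    words.sort()
    print(words)   # ['a', 'b']
    # sorted(word_data) would do both steps at once.
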
@@ -280,14 +281,14 @@ original version.\n\n
             else:
                 continue
             break
 
         outstr = ''
         for wi in self.word_data[word]:
             outstr += wi.dict_str() + '\n'
 
         outstr = '%s%s%s' % (word_cs, wi.linesep, outstr)
         self.dict_entry_write(file_index, file_data, word_cs, outstr, wi.linesep)
 
 
         file_index.truncate()
         file_data.truncate()
@@ -300,20 +301,20 @@ if (__name__ == '__main__'):
     op.add_option('--wn_url', dest='wn_url', default='ftp://ftp.cogsci.princeton.edu/pub/wordnet/2.0', help='URL for wordnet sources')
     op.add_option('--db_desc_short', dest='desc_short', default=' WordNet (r) 2.1 (2005)', help='short dict DB description')
     op.add_option('--db_desc_long', dest='desc_long', default=' WordNet (r): A Lexical Database for English from the\n Cognitive Science Laboratory at Princeton University', help='long dict DB description')
 
     (options, args) = op.parse_args()
 
 
     wnd = WordnetDict(wn_url=options.wn_url, desc_short=options.desc_short, desc_long=options.desc_long)
 
     for i in range(0,len(args),2):
-        print 'Opening index file %r...' % args[i]
-        file_index = file(args[i])
-        print 'Opening data file %r...' % args[i+1]
-        file_data = file(args[i+1])
-        print 'Parsing index file and data file...'
+        print('Opening index file %r...' % args[i])
+        file_index = open(args[i])
+        print('Opening data file %r...' % args[i+1])
+        file_data = open(args[i+1])
+        print('Parsing index file and data file...')
         wnd.wn_dict_add(file_index, file_data)
 
-    print 'All input files parsed. Writing output to index file %r and data file %r.' % (options.oi, options.od)
-
-    wnd.dict_generate(file(options.oi, 'w'),file(options.od, 'w'))
-    print 'All done.'
+    print('All input files parsed. Writing output to index file %r and data file %r.' % (options.oi, options.od))
+
+    wnd.dict_generate(open(options.oi, 'w'),open(options.od, 'w'))
+    print('All done.')
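
This last hunk retires two more Python 2 spellings at once: the print statement and the file() builtin, which Python 3 removed in favour of open(). A final sketch, not from the patch:

    # Python 2 accepted file(path); Python 3 only has open().
    with open('wn.index', 'w') as file_index:
        file_index.write('example\tA\tBX\n')
    print('All done.')
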