Merge pull request #28036 from roberth/frog

frog: init at v0.13.7
This commit is contained in:
Jörg Thalheim 2017-08-26 16:06:43 +01:00 committed by GitHub
commit 58dc4a8569
23 changed files with 516 additions and 0 deletions

View file

@ -0,0 +1,53 @@
# Frog: the tagger/lemmatizer/morphological analyzer/dependency parser for
# Dutch, built from the release tarball described in ./release-info.
{ stdenv, fetchurl
, automake, autoconf, libtool, pkgconfig, autoconf-archive
, libxml2, icu
, languageMachines
}:

let
  # Version, tarball URL and sha256 are read from a JSON file rather than
  # being hard-coded in this expression.
  release = builtins.fromJSON (builtins.readFile ./release-info/LanguageMachines-frog.json);
in

stdenv.mkDerivation {
  # The derivation name carries the version so that store paths and package
  # listings identify the release (the bare "frog" did not).
  name = "frog-${release.version}";
  version = release.version;

  src = fetchurl {
    inherit (release) url sha256;
    name = "frog-${release.version}.tar.gz";
  };

  # Tools that only run at build time (used by bootstrap.sh / configure).
  nativeBuildInputs = [ automake autoconf libtool pkgconfig autoconf-archive ];

  # Libraries and data the package is built against.
  buildInputs = [
    libxml2 icu
    languageMachines.ticcutils
    languageMachines.timbl
    languageMachines.mbt
    languageMachines.libfolia
    languageMachines.ucto
    languageMachines.frogdata
  ];

  # The tarball ships without a generated configure script.
  preConfigure = ''
    sh bootstrap.sh
  '';

  postInstall = ''
    # frog expects the data files installed in the same prefix
    mkdir -p "$out/share/frog/"
    for f in ${languageMachines.frogdata}/share/frog/*; do
      ln -s "$f" "$out/share/frog/"
    done

    # The test suite runs here, after the data files have been linked.
    # NOTE(review): presumably the tests need the installed data; confirm
    # before moving this to a regular check phase.
    make check
  '';

  meta = with stdenv.lib; {
    description = "A Tagger-Lemmatizer-Morphological-Analyzer-Dependency-Parser for Dutch";
    homepage = "https://languagemachines.github.io/frog";
    license = licenses.gpl3;
    platforms = platforms.all;
    maintainers = with maintainers; [ roberth ];
    longDescription = ''
      Frog is an integration of memory-based natural language processing (NLP) modules developed for Dutch. All NLP modules are based on Timbl, the Tilburg memory-based learning software package. Most modules were created in the 1990s at the ILK Research Group (Tilburg University, the Netherlands) and the CLiPS Research Centre (University of Antwerp, Belgium). Over the years they have been integrated into a single text processing tool, which is currently maintained and developed by the Language Machines Research Group and the Centre for Language and Speech Technology at Radboud University Nijmegen. A dependency parser, a base phrase chunker, and a named-entity recognizer module were added more recently. Where possible, Frog makes use of multi-processor support to run subtasks in parallel.
      Various (re)programming rounds have been made possible through funding by NWO, the Netherlands Organisation for Scientific Research, particularly under the CGN project, the IMIX programme, the Implicit Linguistics project, the CLARIN-NL programme and the CLARIAH programme.
    '';
  };
}

View file

@ -0,0 +1,31 @@
# Data files for Frog, built from the release tarball described in
# ./release-info.
#
# libxml2, icu and languageMachines are accepted as arguments but are not
# used by this expression; they are kept so existing callers and overrides
# keep working.
{ stdenv, fetchurl
, automake, autoconf, libtool, pkgconfig, autoconf-archive
, libxml2, icu
, languageMachines
}:

let
  # Version, tarball URL and sha256 are read from a JSON file rather than
  # being hard-coded in this expression.
  release = builtins.fromJSON (builtins.readFile ./release-info/LanguageMachines-frogdata.json);
in

stdenv.mkDerivation {
  # The derivation name carries the version so that store paths and package
  # listings identify the release (the bare "frogdata" did not).
  name = "frogdata-${release.version}";
  version = release.version;

  src = fetchurl {
    inherit (release) url sha256;
    name = "frogdata-${release.version}.tar.gz";
  };

  # Tools that only run at build time (used by bootstrap.sh / configure).
  nativeBuildInputs = [ automake autoconf libtool pkgconfig autoconf-archive ];

  # The tarball ships without a generated configure script.
  preConfigure = ''
    sh bootstrap.sh
  '';

  meta = with stdenv.lib; {
    description = "Data for Frog, a Tagger-Lemmatizer-Morphological-Analyzer-Dependency-Parser for Dutch";
    homepage = "https://languagemachines.github.io/frog";
    license = licenses.gpl3;
    platforms = platforms.all;
    maintainers = with maintainers; [ roberth ];
  };
}

View file

@ -0,0 +1,30 @@
# libfolia: C++ library for FoLiA documents, built from the release tarball
# described in ./release-info.
{ stdenv, fetchurl
, automake, autoconf, libtool, pkgconfig, autoconf-archive
, libxml2, icu
, languageMachines }:

let
  # Version, tarball URL and sha256 are read from a JSON file rather than
  # being hard-coded in this expression.
  release = builtins.fromJSON (builtins.readFile ./release-info/LanguageMachines-libfolia.json);
in

stdenv.mkDerivation {
  # The derivation name carries the version so that store paths and package
  # listings identify the release (the bare "libfolia" did not).
  name = "libfolia-${release.version}";
  version = release.version;

  src = fetchurl {
    inherit (release) url sha256;
    name = "libfolia-${release.version}.tar.gz";
  };

  # Tools that only run at build time (used by bootstrap.sh / configure).
  nativeBuildInputs = [ automake autoconf libtool pkgconfig autoconf-archive ];

  # Libraries the package is built against.
  buildInputs = [ libxml2 icu languageMachines.ticcutils ];

  # The tarball ships without a generated configure script.
  preConfigure = "sh bootstrap.sh";

  meta = with stdenv.lib; {
    # Short descriptions are a phrase without a trailing period.
    description = "A C++ API for FoLiA documents, an XML-based linguistic annotation format";
    homepage = "https://proycon.github.io/folia/";
    license = licenses.gpl3;
    platforms = platforms.all;
    maintainers = with maintainers; [ roberth ];
    longDescription = ''
      A high-level C++ API to read, manipulate, and create FoLiA documents. FoLiA is an XML-based annotation format, suitable for the representation of linguistically annotated language resources. FoLiA's intended use is as a format for storing and/or exchanging language resources, including corpora.
    '';
  };
}

View file

@ -0,0 +1,13 @@
Add a pkg-config check for libxml2 (libxml-2.0 >= 2.6.16) to configure.ac
and append the flags it reports to CXXFLAGS and LIBS, next to the existing
ticcutils flags.

--- a/configure.ac 2017-06-12 06:48:15.000000000 +0200
+++ b/configure.ac 2017-06-12 06:50:06.000000000 +0200
@@ -76,6 +76,10 @@
CXXFLAGS="$CXXFLAGS $ticcutils_CFLAGS"
LIBS="$LIBS $ticcutils_LIBS"
+PKG_CHECK_MODULES([libxml2], [libxml-2.0 >= 2.6.16] )
+CXXFLAGS="$CXXFLAGS $libxml2_CFLAGS"
+LIBS="$LIBS $libxml2_LIBS"
+
AC_CONFIG_FILES([
Makefile
mbt.pc

View file

@ -0,0 +1,40 @@
# MBT, the memory-based tagger, built from the release tarball described in
# ./release-info.
{ stdenv, fetchurl
, automake, autoconf, libtool, pkgconfig, autoconf-archive
, libxml2
, languageMachines
}:

let
  # Version, tarball URL and sha256 are read from a JSON file rather than
  # being hard-coded in this expression.
  release = builtins.fromJSON (builtins.readFile ./release-info/LanguageMachines-mbt.json);
in

stdenv.mkDerivation {
  # The derivation name carries the version so that store paths and package
  # listings identify the release (the bare "mbt" did not).
  name = "mbt-${release.version}";
  version = release.version;

  src = fetchurl {
    inherit (release) url sha256;
    name = "mbt-${release.version}.tar.gz";
  };

  # Tools that only run at build time (used by bootstrap.sh / configure).
  nativeBuildInputs = [ automake autoconf libtool pkgconfig autoconf-archive ];

  # Libraries the package is built against.
  buildInputs = [
    libxml2
    languageMachines.ticcutils
    languageMachines.timbl
  ];

  # Makes configure.ac look up libxml2 through pkg-config.
  patches = [ ./mbt-add-libxml2-dep.patch ];

  # The tarball ships without a generated configure script.
  preConfigure = ''
    sh bootstrap.sh
  '';

  meta = with stdenv.lib; {
    description = "Memory Based Tagger";
    homepage = "https://languagemachines.github.io/mbt/";
    license = licenses.gpl3;
    platforms = platforms.all;
    maintainers = with maintainers; [ roberth ];
    longDescription = ''
      MBT is a memory-based tagger-generator and tagger in one. The tagger-generator part can generate a sequence tagger on the basis of a training set of tagged sequences; the tagger part can tag new sequences. MBT can, for instance, be used to generate part-of-speech taggers or chunkers for natural language processing. It has also been used for named-entity recognition, information extraction in domain-specific texts, and disfluency chunking in transcribed speech.
      Mbt is used by Frog for Dutch tagging.
    '';
  };
}

View file

@ -0,0 +1,14 @@
# Package set for the LanguageMachines software: one attribute per
# expression file in this directory.
{ callPackage }:

let
  # Instantiate an expression file with no argument overrides.
  build = file: callPackage file { };
in

{
  ticcutils   = build ./ticcutils.nix;
  libfolia    = build ./libfolia.nix;
  ucto        = build ./ucto.nix;
  uctodata    = build ./uctodata.nix;
  timbl       = build ./timbl.nix;
  timblserver = build ./timblserver.nix;
  mbt         = build ./mbt.nix;
  frog        = build ./frog.nix;
  frogdata    = build ./frogdata.nix;
  test        = build ./test.nix;
}

View file

@ -0,0 +1,5 @@
{
"version": "v0.13.7",
"url": "https://api.github.com/repos/LanguageMachines/frog/tarball/v0.13.7",
"sha256": "0swyfi3g862n888qj8v8kd18745hasy0vnc70i9qlv0ji0321bnf"
}

View file

@ -0,0 +1,5 @@
{
"version": "v0.13",
"url": "https://api.github.com/repos/LanguageMachines/frogdata/tarball/v0.13",
"sha256": "13mhv8qacl0n20ddl1ay49xi6h2m0a149ya3rrsmaah3x4adb4sg"
}

View file

@ -0,0 +1,5 @@
{
"version": "v1.7",
"url": "https://api.github.com/repos/LanguageMachines/libfolia/tarball/v1.7",
"sha256": "0hpxdry7n2887klryc587xv46p6z6jp6hz9x7k2pk5v7jb0z4s65"
}

View file

@ -0,0 +1,5 @@
{
"version": "v3.2.16",
"url": "https://api.github.com/repos/LanguageMachines/mbt/tarball/v3.2.16",
"sha256": "0f9f5l84m0lmmv4km9myn3yhy67jbmk3qn2fi40dy025gx4l0x3x"
}

View file

@ -0,0 +1,5 @@
{
"version": "v0.15",
"url": "https://api.github.com/repos/LanguageMachines/ticcutils/tarball/v0.15",
"sha256": "0lssb1klx2flmr6fy78j37i5lbq3gfhzjx24j6n72ndm2rvprvcn"
}

View file

@ -0,0 +1,5 @@
{
"version": "v6.4.9",
"url": "https://api.github.com/repos/LanguageMachines/timbl/tarball/v6.4.9",
"sha256": "1279npc3xlq05hnkylpbkgg941gjhvl6sd5fw4vgwcx2rwmmlaay"
}

View file

@ -0,0 +1,5 @@
{
"version": "v1.11",
"url": "https://api.github.com/repos/LanguageMachines/timblserver/tarball/v1.11",
"sha256": "02k8c704wr5miy82w6zj0imm7sdfnxf3db34qiaa8l3myhn17qlw"
}

View file

@ -0,0 +1,5 @@
{
"version": "v0.9.6",
"url": "https://api.github.com/repos/LanguageMachines/ucto/tarball/v0.9.6",
"sha256": "0fxq4j32g7kp6789xz23651c4v2j7zlz87cshfv9g1xjs7jxns3f"
}

View file

@ -0,0 +1,5 @@
{
"version": "v0.4",
"url": "https://api.github.com/repos/LanguageMachines/uctodata/tarball/v0.4",
"sha256": "02c78qmwi9ijpk5wila3p62fmfdy1rpmlvvzbxs3wg0rdb0nwvd2"
}

View file

@ -0,0 +1,25 @@
# Smoke test for frog: feeds one Dutch sentence to the frog binary and
# checks the line count, lemma, tag and dependency columns of its output.
{ runCommand
, languageMachines
}:

runCommand "frog-test" { } ''
  # Report which expectation was not met and abort the build.
  fail () {
    echo "Test expectation failed: $@"
    exit 1
  }

  # Run frog on a single sentence; its output becomes the build result.
  ${languageMachines.frog}/bin/frog >$out <<EOF
  Dit is een test
  EOF

  echo "Frog output:"
  cat $out

  # Exactly five output lines are expected.
  lines="$(wc -l <$out)"
  if ! test 5 = $lines; then
    fail "Five lines of output"
  fi

  # The line for "is" must mention "zijn".
  if ! grep "is" $out | grep "zijn" >/dev/null; then
    fail "Stemming works"
  fi

  # The line for "een" must mention "onbep".
  if ! grep "een" $out | grep "onbep" >/dev/null; then
    fail "Tagging works"
  fi

  # Columns 1 and 9 of every line, flattened onto a single line.
  edges="$(echo $(awk 'BEGIN { FS = "\t*" } ; {print $1 " -> " $9 "; "}' <$out))"
  if ! test "1 -> 2; 2 -> 0; 3 -> 4; 4 -> 2; -> ;" = "$edges"; then
    fail "Dependency parsing works"
  fi
''

View file

@ -0,0 +1,29 @@
# ticcutils: shared utility library of the TiCC software stack, built from
# the release tarball described in ./release-info.
{ stdenv, fetchurl
, automake, autoconf, libtool, pkgconfig, autoconf-archive
, libxml2, zlib, bzip2, libtar }:

let
  # Version, tarball URL and sha256 are read from a JSON file rather than
  # being hard-coded in this expression.
  release = builtins.fromJSON (builtins.readFile ./release-info/LanguageMachines-ticcutils.json);
in

stdenv.mkDerivation {
  # The derivation name carries the version so that store paths and package
  # listings identify the release (the bare "ticcutils" did not).
  name = "ticcutils-${release.version}";
  version = release.version;

  src = fetchurl {
    inherit (release) url sha256;
    name = "ticcutils-${release.version}.tar.gz";
  };

  # Tools that only run at build time (used by bootstrap.sh / configure).
  nativeBuildInputs = [ automake autoconf libtool pkgconfig autoconf-archive ];

  # Libraries the package is built against.
  buildInputs = [
    libxml2
    # optional:
    zlib bzip2 libtar
    # broken but optional: boost
  ];

  # The tarball ships without a generated configure script.
  preConfigure = "sh bootstrap.sh";

  meta = with stdenv.lib; {
    # Short descriptions are a phrase without a trailing period.
    description = "Utility functions for general use in the TiCC software stack and beyond";
    homepage = "https://github.com/LanguageMachines/ticcutils";
    license = licenses.gpl3;
    platforms = platforms.all;
    maintainers = with maintainers; [ roberth ];
  };
}

View file

@ -0,0 +1,36 @@
# TiMBL, the Tilburg memory-based learner, built from the release tarball
# described in ./release-info.
{ stdenv, fetchurl
, automake, autoconf, libtool, pkgconfig, autoconf-archive
, libxml2
, languageMachines
}:

let
  # Version, tarball URL and sha256 are read from a JSON file rather than
  # being hard-coded in this expression.
  release = builtins.fromJSON (builtins.readFile ./release-info/LanguageMachines-timbl.json);
in

stdenv.mkDerivation {
  # The derivation name carries the version so that store paths and package
  # listings identify the release (the bare "timbl" did not).
  name = "timbl-${release.version}";
  version = release.version;

  src = fetchurl {
    inherit (release) url sha256;
    name = "timbl-${release.version}.tar.gz";
  };

  # Tools that only run at build time (used by bootstrap.sh / configure).
  nativeBuildInputs = [ automake autoconf libtool pkgconfig autoconf-archive ];

  # Libraries the package is built against.
  buildInputs = [
    libxml2
    languageMachines.ticcutils
  ];

  # The tarball ships without a generated configure script.
  preConfigure = "sh bootstrap.sh";

  meta = with stdenv.lib; {
    description = "TiMBL implements several memory-based learning algorithms";
    homepage = "https://github.com/LanguageMachines/timbl/";
    license = licenses.gpl3;
    platforms = platforms.all;
    maintainers = with maintainers; [ roberth ];
    longDescription = ''
      TiMBL is an open source software package implementing several memory-based learning algorithms, among which IB1-IG, an implementation of k-nearest neighbor classification with feature weighting suitable for symbolic feature spaces, and IGTree, a decision-tree approximation of IB1-IG. All implemented algorithms have in common that they store some representation of the training set explicitly in memory. During testing, new cases are classified by extrapolation from the most similar stored cases.
      For over fifteen years TiMBL has been mostly used in natural language processing as a machine learning classifier component, but its use extends to virtually any supervised machine learning domain. Due to its particular decision-tree-based implementation, TiMBL is in many cases far more efficient in classification than a standard k-nearest neighbor algorithm would be.
    '';
  };
}

View file

@ -0,0 +1,37 @@
# Server front end for TiMBL, built from the release tarball described in
# ./release-info.
{ stdenv, fetchurl
, automake, autoconf, libtool, pkgconfig, autoconf-archive
, libxml2
, languageMachines
}:

let
  # Version, tarball URL and sha256 are read from a JSON file rather than
  # being hard-coded in this expression.
  release = builtins.fromJSON (builtins.readFile ./release-info/LanguageMachines-timblserver.json);
in

stdenv.mkDerivation {
  # The derivation name carries the version so that store paths and package
  # listings identify the release (the bare "timblserver" did not).
  name = "timblserver-${release.version}";
  version = release.version;

  src = fetchurl {
    inherit (release) url sha256;
    name = "timblserver-${release.version}.tar.gz";
  };

  # Tools that only run at build time (used by bootstrap.sh / configure).
  nativeBuildInputs = [ automake autoconf libtool pkgconfig autoconf-archive ];

  # Libraries the package is built against.
  buildInputs = [
    libxml2
    languageMachines.ticcutils
    languageMachines.timbl
  ];

  # The tarball ships without a generated configure script.
  preConfigure = "sh bootstrap.sh";

  meta = with stdenv.lib; {
    # The server wraps TiMBL; the learning algorithms live in TiMBL itself.
    description = "Server for TiMBL, a package implementing several memory-based learning algorithms";
    homepage = "https://github.com/LanguageMachines/timblserver/";
    license = licenses.gpl3;
    platforms = platforms.all;
    maintainers = with maintainers; [ roberth ];
    longDescription = ''
      This implements a server for TiMBL. TiMBL is an open source software package implementing several memory-based learning algorithms, among which IB1-IG, an implementation of k-nearest neighbor classification with feature weighting suitable for symbolic feature spaces, and IGTree, a decision-tree approximation of IB1-IG. All implemented algorithms have in common that they store some representation of the training set explicitly in memory. During testing, new cases are classified by extrapolation from the most similar stored cases.
      For over fifteen years TiMBL has been mostly used in natural language processing as a machine learning classifier component, but its use extends to virtually any supervised machine learning domain. Due to its particular decision-tree-based implementation, TiMBL is in many cases far more efficient in classification than a standard k-nearest neighbor algorithm would be.
    '';
  };
}

View file

@ -0,0 +1,48 @@
# ucto: rule-based tokenizer, built from the release tarball described in
# ./release-info.
{ stdenv, fetchurl
, automake, autoconf, libtool, pkgconfig, autoconf-archive
, libxml2, icu
, languageMachines
}:

let
  # Version, tarball URL and sha256 are read from a JSON file rather than
  # being hard-coded in this expression.
  release = builtins.fromJSON (builtins.readFile ./release-info/LanguageMachines-ucto.json);
in

stdenv.mkDerivation {
  # The derivation name carries the version so that store paths and package
  # listings identify the release (the bare "ucto" did not).
  name = "ucto-${release.version}";
  version = release.version;

  src = fetchurl {
    inherit (release) url sha256;
    name = "ucto-${release.version}.tar.gz";
  };

  # Tools that only run at build time (used by bootstrap.sh / configure).
  nativeBuildInputs = [ automake autoconf libtool pkgconfig autoconf-archive ];

  # Libraries and data the package is built against.
  buildInputs = [
    icu libxml2
    languageMachines.ticcutils
    languageMachines.libfolia
    languageMachines.uctodata
    # TODO textcat from libreoffice? Pulls in X11 dependencies?
  ];

  # The tarball ships without a generated configure script.
  preConfigure = "sh bootstrap.sh;";

  postInstall = ''
    # ucto expects the data files installed in the same prefix
    mkdir -p "$out/share/ucto/"
    for f in ${languageMachines.uctodata}/share/ucto/*; do
      echo "Linking $f"
      ln -s "$f" "$out/share/ucto/"
    done
  '';

  meta = with stdenv.lib; {
    description = "A rule-based tokenizer for natural language";
    homepage = "https://languagemachines.github.io/ucto/";
    license = licenses.gpl3;
    platforms = platforms.all;
    maintainers = with maintainers; [ roberth ];
    longDescription = ''
      Ucto tokenizes text files: it separates words from punctuation, and splits sentences. It offers several other basic preprocessing steps such as changing case that you can all use to make your text suited for further processing such as indexing, part-of-speech tagging, or machine translation.
      Ucto comes with tokenisation rules for several languages and can be easily extended to suit other languages. It has been incorporated for tokenizing Dutch text in Frog, a Dutch morpho-syntactic processor.
    '';
  };
}

View file

@ -0,0 +1,32 @@
# Data files for ucto, built from the release tarball described in
# ./release-info.
#
# libxml2, icu and languageMachines are accepted as arguments but are not
# used by this expression; they are kept so existing callers and overrides
# keep working.
{ stdenv, fetchurl
, automake, autoconf, libtool, pkgconfig, autoconf-archive
, libxml2, icu
, languageMachines }:

let
  # Version, tarball URL and sha256 are read from a JSON file rather than
  # being hard-coded in this expression.
  release = builtins.fromJSON (builtins.readFile ./release-info/LanguageMachines-uctodata.json);
in

stdenv.mkDerivation {
  # The derivation name carries the version so that store paths and package
  # listings identify the release (the bare "uctodata" did not).
  name = "uctodata-${release.version}";
  version = release.version;

  src = fetchurl {
    inherit (release) url sha256;
    name = "uctodata-${release.version}.tar.gz";
  };

  # Tools that only run at build time (used by bootstrap.sh / configure).
  nativeBuildInputs = [ automake autoconf libtool pkgconfig autoconf-archive ];

  # The tarball ships without a generated configure script.
  preConfigure = "sh bootstrap.sh";

  meta = with stdenv.lib; {
    # This package holds the data only; the tokenizer itself is ucto.
    description = "Data for ucto, a rule-based tokenizer for natural language";
    homepage = "https://languagemachines.github.io/ucto/";
    license = licenses.gpl3;
    platforms = platforms.all;
    maintainers = with maintainers; [ roberth ];
    longDescription = ''
      Ucto tokenizes text files: it separates words from punctuation, and splits sentences. It offers several other basic preprocessing steps such as changing case that you can all use to make your text suited for further processing such as indexing, part-of-speech tagging, or machine translation.
      Ucto comes with tokenisation rules for several languages and can be easily extended to suit other languages. It has been incorporated for tokenizing Dutch text in Frog, a Dutch morpho-syntactic processor.
    '';
  };
}

View file

@ -0,0 +1,79 @@
#!/usr/bin/env nix-shell
#!nix-shell --packages curl
#!nix-shell --packages jq
#!nix-shell --packages parallel
#!nix-shell -i bash

# Rewrites the release-info/*.json files from the latest GitHub release of
# each project listed at the bottom of this script.

# Exit immediately if a command exits with a non-zero status.
# Exit when a producer fails in a pipe
# Treat undefined variable references as errors
set -e -o pipefail -u

# Check if working directory is (probably) right.
# "$0" is quoted so an invocation path containing whitespace cannot split
# into several words and break the comparison.
test "./update" = "$0" || {
  echo "The working directory ought to be the same as the update script location. Please invoke as ./update" 1>&2
  exit 1
}

# Create temporary directory with automatic cleanup.
# The assignment is kept separate from `readonly`: combined, the exit status
# of `readonly` would replace that of mktemp and hide a failure from set -e.
MY_TMP="$(mktemp -d)"
readonly MY_TMP

cleanup () {
  rm -rf "$MY_TMP"
}
trap cleanup EXIT
# Download the metadata of the latest GitHub release of $owner/$repo and
# store it as a release info file with a placeholder for the hash.
#
# $1: repository owner
# $2: repository name
# stdout: file containing release info and a convenient placeholder
#         for the sha256 attribute
# returns: non-zero when the download or the JSON conversion fails
getRelease () {
  local owner="$1"
  local repo="$2"
  local out="$MY_TMP/$owner--$repo-release"

  # Fail explicitly: this function is called inside $(...), where bash does
  # not apply errexit by default, and without the check the trailing `echo`
  # would turn a failed download into a success.
  # NOTE(review): the version is taken from the release's `name` field;
  # confirm it always equals the tag used in the tarball URL.
  curl -fSs https://api.github.com/repos/"$owner"/"$repo"/releases/latest \
    | jq '{ version: .name, url: .tarball_url, sha256: "__SHA256__" }' \
    > "$out" \
    || return 1

  echo "$out"
}
# 'getters' for the release info file
# Read the tarball url from a release info file.
# $1: path of the release info file
# stdout: unquoted tarball url
releaseUrl () {
  jq -r '.url' < "$1"
}
# Read the version from a release info file.
# $1: path of the release info file
# stdout: unquoted version
releaseVersion () {
  jq -r '.version' < "$1"
}
# Prefetch the tarball named in a release info file and report its hash.
# $1: path of the release info file
# $2: package name, used as prefix of the stored tarball's name
# stdout: base32 sha256 to be used in fetchurl
getReleaseHash () {
  local info="$1"
  local pkg="$2"
  nix-prefetch-url \
    "$(releaseUrl "$info")" \
    --name "$pkg-$(releaseVersion "$info").tar.gz"
}
# Write a release info file to release-info/$owner-$repo.json
# $1: repository owner
# $2: repository name
updateRelease () {
  local owner="$1"
  local repo="$2"

  # Declare first, assign afterwards: `local x="$(cmd)"` reports the status
  # of `local`, so a failing command would go unnoticed by set -e and a JSON
  # file with an empty hash could be written.
  local r hash
  r="$(getRelease "$owner" "$repo")"
  hash="$(getReleaseHash "$r" "$repo")"

  # Replace the placeholder with the computed hash.
  sed -e "s/__SHA256__/$hash/" <"$r" >"release-info/$owner-$repo.json"
}
# Refresh the release info of every LanguageMachines repository, in order.
for repo in \
  frogdata \
  frog \
  libfolia \
  mbt \
  ticcutils \
  timbl \
  timblserver \
  ucto \
  uctodata
do
  updateRelease LanguageMachines "$repo"
done

View file

@ -7891,6 +7891,8 @@ with pkgs;
freetts = callPackage ../development/libraries/freetts { };
frog = self.languageMachines.frog;
fstrm = callPackage ../development/libraries/fstrm { };
cfitsio = callPackage ../development/libraries/cfitsio { };
@ -8493,6 +8495,8 @@ with pkgs;
};
libkrb5 = krb5Full.override { type = "lib"; };
languageMachines = recurseIntoAttrs (import ../development/libraries/languagemachines/packages.nix { inherit callPackage; });
lasso = callPackage ../development/libraries/lasso { };
LASzip = callPackage ../development/libraries/LASzip { };