Daniël de Kok 2020-11-10 18:16:13 +01:00 committed by Jonathan Ringer
parent 5c737382f3
commit c67382b02f

View file

@ -32,6 +32,14 @@ let
url = "https://norvig.com/big.txt";
sha256 = "0yz80icdly7na03cfpl0nfk5h3j3cam55rj486n03wph81ynq1ps";
};
docPipelineTokenizer = fetchurl {
url = "https://s3.amazonaws.com/models.huggingface.co/bert/anthony/doc-pipeline/tokenizer.json";
hash = "sha256-i533xC8J5CDMNxBjo+p6avIM8UOcui8RmGAmK0GmfBc=";
};
docQuicktourTokenizer = fetchurl {
url = "https://s3.amazonaws.com/models.huggingface.co/bert/anthony/doc-quicktour/tokenizer.json";
hash = "sha256-ipY9d5DR5nxoO6kj7rItueZ9AO5wq9+Nzr6GuEIfIBI=";
};
openaiVocab = fetchurl {
url = "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-vocab.json";
sha256 = "0y40gc9bixj5rxv674br1rxmxkd3ly29p80x1596h8yywwcrpx7x";
@ -42,16 +50,16 @@ let
};
in rustPlatform.buildRustPackage rec {
pname = "tokenizers";
version = "0.9.2";
version = "0.9.4";
src = fetchFromGitHub {
owner = "huggingface";
repo = pname;
rev = "python-v${version}";
sha256 = "0rsm1g5zfq3ygdb3s8v9xqqpgfzvvkc4n5ik3ahy8sw7pyjljb4m";
hash = "sha256-JXoH9yfhMIFg5qDY5zrF6iWb7XKugjMfk1NxSizfaWg=";
};
cargoSha256 = "0yn699dq9hdjh7fyci99ni8mmd5qdhzrsi80grzgf5cch8g38rbi";
cargoSha256 = "sha256-u9qitrOxJSABs0VjwHUZgmw7VTQXNbp6l8fKKE/RQ7M=";
sourceRoot = "source/bindings/python";
@ -82,6 +90,8 @@ in rustPlatform.buildRustPackage rec {
ln -s ${robertaMerges} roberta-base-merges.txt
ln -s ${albertVocab} albert-base-v1-tokenizer.json
ln -s ${bertVocab} bert-base-uncased-vocab.txt
ln -s ${docPipelineTokenizer} bert-wiki.json
ln -s ${docQuicktourTokenizer} tokenizer-wiki.json
ln -s ${norvigBig} big.txt
ln -s ${openaiVocab} openai-gpt-vocab.json
ln -s ${openaiMerges} openai-gpt-merges.txt )