nixpkgs/pkgs/development/python-modules/tokenizers/default.nix
2020-08-05 08:23:11 +02:00

130 lines
3.8 KiB
Nix

{ stdenv
, rustPlatform
, fetchFromGitHub
, fetchurl
, maturin
, pipInstallHook
, pytest
, python
, requests
}:
let
# Pretrained vocabulary/merges files used as fixtures by the Python test
# suite. They are fetched here as fixed-output derivations and linked into
# tests/data by postUnpack, so the tests do not need network access.

# Fetch a single fixture `file` (with the given `sha256`) from the
# Hugging Face model bucket.
fetchModelFile = file: sha256: fetchurl {
  url = "https://s3.amazonaws.com/models.huggingface.co/bert/${file}";
  inherit sha256;
};

robertaVocab = fetchModelFile "roberta-base-vocab.json"
  "0m86wpkfb2gdh9x9i9ng2fvwk1rva4p0s98xw996nrjxs7166zwy";

robertaMerges = fetchModelFile "roberta-base-merges.txt"
  "1idd4rvkpqqbks51i2vjbd928inw7slij9l4r063w3y5fd3ndq8w";

bertVocab = fetchModelFile "bert-base-uncased-vocab.txt"
  "18rq42cmqa8zanydsbzrb34xwy4l6cz1y900r4kls57cbhvyvv07";

openaiVocab = fetchModelFile "openai-gpt-vocab.json"
  "0y40gc9bixj5rxv674br1rxmxkd3ly29p80x1596h8yywwcrpx7x";

openaiMerges = fetchModelFile "openai-gpt-merges.txt"
  "09a754pm4djjglv3x5pkgwd6f79i2rq8ydg0f7c3q1wmwqdbba8f";
in rustPlatform.buildRustPackage rec {
  # Python bindings for Hugging Face tokenizers: a Rust crate that is turned
  # into a wheel with maturin, installed through the pip install hook and
  # then exercised with pytest in the install-check phase.
  pname = "tokenizers";
  version = "0.8.1";

  src = fetchFromGitHub {
    owner = "huggingface";
    repo = pname;
    # Releases of the Python bindings are tagged "python-v<version>".
    rev = "python-v${version}";
    sha256 = "0sxdwx05hr87j2z32rk4rgwn6a26w9r7m5fgj6ah1sgagiiyxbjw";
  };

  # Update parking_lot to be compatible with recent Rust versions, which
  # replace asm! with llvm_asm!:
  #
  # https://github.com/Amanieu/parking_lot/pull/223
  #
  # Remove once upstream updates this dependency.
  cargoPatches = [ ./update-parking-lot.diff ];

  cargoSha256 = "0cdkxmj8z2wdspn6r62lqlpvd0sj1z0cmb1zpqaajxvr0b2kjlj8";

  # Build from the Python bindings subdirectory of the repository.
  sourceRoot = "source/bindings/python";

  nativeBuildInputs = [
    maturin        # invoked in buildPhase to produce the wheel
    pipInstallHook # used in installPhase via pipInstallPhase
  ];

  propagatedBuildInputs = [
    python
  ];

  # tokenizers uses pyo3, which requires Rust nightly.
  RUSTC_BOOTSTRAP = 1;

  # Skip the builder's default check phase; the Python test suite is run
  # in installCheckPhase instead, once the package has been installed.
  doCheck = false;
  doInstallCheck = true;

  postUnpack = ''
    # Add data files for tests, otherwise tests attempt network access.
    mkdir -p "$sourceRoot/tests/data"
    ( cd "$sourceRoot/tests/data"
    ln -s ${robertaVocab} roberta-base-vocab.json
    ln -s ${robertaMerges} roberta-base-merges.txt
    ln -s ${bertVocab} bert-base-uncased-vocab.txt
    ln -s ${openaiVocab} openai-gpt-vocab.json
    ln -s ${openaiMerges} openai-gpt-merges.txt )
  '';

  postPatch = ''
    # pyo3's build check verifies that Rust is a nightly
    # version. Disable this check.
    substituteInPlace $NIX_BUILD_TOP/$cargoDepsCopy/pyo3/build.rs \
    --replace "check_rustc_version()?;" ""
    # Patching the vendored dependency invalidates the file
    # checksums, so remove them. This should be safe, since
    # this is just a copy of the vendored dependencies and
    # the integrity of the vendored dependencies is validated
    # by cargoSha256.
    sed -r -i 's|"files":\{[^}]+\}|"files":{}|' \
    $NIX_BUILD_TOP/$cargoDepsCopy/pyo3/.cargo-checksum.json
    # Maturin uses the crate name as the wheel name.
    substituteInPlace Cargo.toml \
    --replace "tokenizers-python" "tokenizers"
  '';

  # The phase is overridden, so run the pre/post hooks explicitly;
  # otherwise preBuild/postBuild added through overrideAttrs would be
  # silently ignored.
  buildPhase = ''
    runHook preBuild
    maturin build --release --manylinux off
    runHook postBuild
  '';

  # NOTE(review): pipInstallPhase is expected to run the preInstall and
  # postInstall hooks itself, so they are deliberately not invoked here
  # (doing so would run them twice) -- confirm against the pip install hook.
  installPhase = ''
    # Put the wheels where the pip install hook can find them.
    install -Dm644 -t dist target/wheels/*.whl
    pipInstallPhase
  '';

  installCheckInputs = [
    pytest
    requests
  ];

  # As with buildPhase, call the hooks explicitly because the phase is
  # overridden.
  installCheckPhase = ''
    runHook preInstallCheck
    # Append paths, or the binding's tokenizer module will be
    # used, since the test directories have __init__.py
    pytest --import-mode=append
    runHook postInstallCheck
  '';

  meta = with stdenv.lib; {
    homepage = "https://github.com/huggingface/tokenizers";
    description = "Fast State-of-the-Art Tokenizers optimized for Research and Production";
    license = licenses.asl20;
    platforms = platforms.unix;
    maintainers = with maintainers; [ danieldk ];
  };
}