mirror of
https://github.com/SebastianWendel/nixpkgs.git
synced 2024-11-06 18:26:45 +01:00
ca1ea69972
Upstream changes:
* Tesseract 4.00.00alpha:
* Version parsing: Ignore suffix (so '4.00.00alpha' == (4, 0, 0))
* Libtesseract: Load libtesseract.so.4 instead of libtesseract.so.3
if available
* Support for Tesseract 3.05.00:
* Builders: Split field 'tess_conf' into 'tess_flags' and 'tess_conf'
* Libtesseract: If available, use
TessBaseAPIDetectOrientationScript() instead of
TessBaseAPIDetectOS
* Libtesseract:
* Workaround: Prevents possible segfault in image_to_string() when
the target language is not available
Full upstream change log can be found at:
https://github.com/openpaperwork/pyocr/blob/b006123d1d002711b9/ChangeLog
The tesseract.patch for supporting Tesseract version 3.05.00 has been
applied upstream and we can safely drop it.
We now use substituteInPlace in conjunction with a patch to insert the
relevant store paths instead of sed, so it's less fragile whenever we
have upstream changes in handling of these paths.
I've tested this by reverting 48a941e29f and applying a build fix patch
for Cuneiform 1.1.0 from Arch Linux, because right now Cuneiform is an
experimental version that can't be fixed on behalf of pyocr (the reason
is that pyocr needs to get a list of languages, which doesn't work in
that version anymore).
In addition to that I've successfully built paperwork-backend, which is
currently the only package that depends on pyocr. However, I didn't do
runtime tests of Paperwork.
Signed-off-by: aszlig <aszlig@redmoonstudios.org>
Cc: @7c6f434c
70 lines
2.1 KiB
Nix
70 lines
2.1 KiB
Nix
# pyocr: a Python wrapper for the Tesseract and Cuneiform OCR engines.
#
# Arguments:
#   lib, fetchFromGitHub, buildPythonPackage - packaging helpers supplied by
#     the caller.
#   pillow, six          - Python libraries propagated to dependents.
#   tesseract, cuneiform - OCR packages whose store paths are written into
#     the pyocr sources during postPatch.
{ lib, fetchFromGitHub, buildPythonPackage, pillow, six
, tesseract, cuneiform
}:

buildPythonPackage rec {
  name = "pyocr-${version}";
  version = "0.4.7";

  # Don't fetch from PYPI because it doesn't contain tests.
  src = fetchFromGitHub {
    owner = "jflesch";
    repo = "pyocr";
    rev = version;
    sha256 = "1iw73r8yrgjf8g00yzpz62ymqbf89cqhyhl9g430srmsrq7mn2yd";
  };

  # Store paths exported to the build environment so that the
  # `--subst-var` calls in postPatch can pick them up by name.
  NIX_CUNEIFORM_CMD = "${cuneiform}/bin/cuneiform";
  NIX_CUNEIFORM_DATA = "${cuneiform}/share/cuneiform";
  NIX_LIBTESSERACT_PATH = "${tesseract}/lib/libtesseract.so";
  NIX_TESSDATA_PREFIX = "${tesseract}/share/tessdata";
  NIX_TESSERACT_CMD = "${tesseract}/bin/tesseract";

  # NOTE(review): paths.patch is not visible here; it is presumed to insert
  # the @NIX_...@ placeholders that postPatch fills in below - confirm that
  # cuneiform.py contains an @NIX_CUNEIFORM_DATA@ placeholder.
  patches = [ ./paths.patch ];

  postPatch = ''
    # Substitute both the Cuneiform command and its data directory. The
    # data variable was previously never substituted because the command
    # variable was listed twice.
    substituteInPlace src/pyocr/cuneiform.py \
      --subst-var NIX_CUNEIFORM_CMD \
      --subst-var NIX_CUNEIFORM_DATA

    substituteInPlace src/pyocr/tesseract.py \
      --subst-var NIX_TESSERACT_CMD

    substituteInPlace src/pyocr/libtesseract/tesseract_raw.py \
      --subst-var NIX_TESSDATA_PREFIX \
      --subst-var NIX_LIBTESSERACT_PATH

    # Disable specific tests that are probably failing because of this issue:
    # https://github.com/jflesch/pyocr/issues/52
    #
    # Each entry has the form "<file>:<Class.method>"; the named test is
    # rebound to unittest.skip(...) at the end of tests/tests_<file>.py.
    for test in $disabledTests; do
      file="''${test%%:*}"
      fun="''${test#*:}"
      echo "$fun = unittest.skip($fun)" >> "tests/tests_$file.py"
    done
  '';

  # Consumed by the loop in postPatch via the $disabledTests environment
  # variable.
  disabledTests = [
    "cuneiform:TestTxt.test_basic"
    "cuneiform:TestTxt.test_european"
    "cuneiform:TestTxt.test_french"
    "cuneiform:TestWordBox.test_basic"
    "cuneiform:TestWordBox.test_european"
    "cuneiform:TestWordBox.test_french"
    "libtesseract:TestBasicDoc.test_basic"
    "libtesseract:TestDigitLineBox.test_digits"
    "libtesseract:TestLineBox.test_japanese"
    "libtesseract:TestTxt.test_japanese"
    "libtesseract:TestWordBox.test_japanese"
    "tesseract:TestDigitLineBox.test_digits"
    "tesseract:TestTxt.test_japanese"
  ];

  propagatedBuildInputs = [ pillow six ];

  meta = {
    homepage = "https://github.com/jflesch/pyocr";
    description = "A Python wrapper for Tesseract and Cuneiform";
    license = lib.licenses.gpl3Plus;
  };
}