From 315002db4e19d66b7460f4baa6c0d111682a03fc Mon Sep 17 00:00:00 2001 From: Phillip Cloud <417981+cpcloud@users.noreply.github.com> Date: Fri, 6 May 2022 18:48:22 -0400 Subject: [PATCH 1/5] arrow-cpp: 7.0.0 -> 8.0.0 --- .../libraries/arrow-cpp/default.nix | 35 ++++++++++++++----- 1 file changed, 26 insertions(+), 9 deletions(-) diff --git a/pkgs/development/libraries/arrow-cpp/default.nix b/pkgs/development/libraries/arrow-cpp/default.nix index 806df81a7080..5fb8746efae6 100644 --- a/pkgs/development/libraries/arrow-cpp/default.nix +++ b/pkgs/development/libraries/arrow-cpp/default.nix @@ -19,6 +19,7 @@ , grpc , gtest , jemalloc +, libbacktrace , lz4 , minio , ninja @@ -69,21 +70,20 @@ let in stdenv.mkDerivation rec { pname = "arrow-cpp"; - version = "7.0.0"; + version = "8.0.0"; src = fetchurl { - url = - "mirror://apache/arrow/arrow-${version}/apache-arrow-${version}.tar.gz"; - hash = "sha256-6PSbFJoV7O9OQPz6sbh8ETxrHuGGAFwWnlzfldMamd4="; + url = "mirror://apache/arrow/arrow-${version}/apache-arrow-${version}.tar.gz"; + hash = "sha256-rZoFcFEXyYnBFrrprHBJL+AVBQ4bgPsOOP3ktdhjqqM="; }; sourceRoot = "apache-arrow-${version}/cpp"; ${if enableJemalloc then "ARROW_JEMALLOC_URL" else null} = jemalloc.src; + # versions are all taken from + # https://github.com/apache/arrow/blob/apache-arrow-8.0.0/cpp/thirdparty/versions.txt + ARROW_MIMALLOC_URL = fetchFromGitHub { - # From - # ./cpp/cmake_modules/ThirdpartyToolchain.cmake - # ./cpp/thirdparty/versions.txt owner = "microsoft"; repo = "mimalloc"; rev = "v1.7.3"; @@ -93,8 +93,15 @@ stdenv.mkDerivation rec { ARROW_XSIMD_URL = fetchFromGitHub { owner = "xtensor-stack"; repo = "xsimd"; - rev = "aeec9c872c8b475dedd7781336710f2dd2666cb2"; - hash = "sha256-vWKdJkieKhaxyAJhijXUmD7NmNvMWd79PskQojulA1w="; + rev = "7d1778c3b38d63db7cec7145d939f40bc5d859d1"; + hash = "sha256-89AysBUVnTdWyMPazeJegnQ6WEH90Ns7qQInZLMSXY4="; + }; + + ARROW_SUBSTRAIT_URL = fetchFromGitHub { + owner = "substrait-io"; + repo = "substrait"; + rev = "e1b4c04a1b518912f4c4065b16a1b2c0ac8e14cf"; + hash = "sha256-56FSjDngsROSHLjMv+OYAIYqphEu3GzgIMHbgh/ZQw0="; }; patches = [ @@ -115,7 +122,10 @@ stdenv.mkDerivation rec { gflags glog gtest + libbacktrace lz4 + nlohmann_json # alternative JSON parser to rapidjson + protobuf # substrait requires protobuf rapidjson re2 snappy @@ -150,6 +160,9 @@ stdenv.mkDerivation rec { "-DARROW_BUILD_SHARED=${if enableShared then "ON" else "OFF"}" "-DARROW_BUILD_STATIC=${if enableShared then "OFF" else "ON"}" "-DARROW_BUILD_TESTS=ON" + "-DARROW_BUILD_INTEGRATION=ON" + "-DARROW_BUILD_UTILITIES=ON" + "-DARROW_EXTRA_ERROR_CONTEXT=ON" "-DARROW_VERBOSE_THIRDPARTY_BUILD=ON" "-DARROW_DEPENDENCY_SOURCE=SYSTEM" "-DThrift_SOURCE=AUTO" # search for Thrift using pkg-config (ThriftConfig.cmake requires OpenSSL and libevent) @@ -168,8 +181,10 @@ stdenv.mkDerivation rec { # Disable Python for static mode because openblas is currently broken there. "-DARROW_PYTHON=${if enableShared then "ON" else "OFF"}" "-DARROW_USE_GLOG=ON" + "-DARROW_WITH_BACKTRACE=ON" "-DARROW_WITH_BROTLI=ON" "-DARROW_WITH_LZ4=ON" + "-DARROW_WITH_NLOHMANN_JSON=ON" "-DARROW_WITH_SNAPPY=ON" "-DARROW_WITH_UTF8PROC=ON" "-DARROW_WITH_ZLIB=ON" @@ -177,8 +192,10 @@ stdenv.mkDerivation rec { "-DARROW_MIMALLOC=ON" # Parquet options: "-DARROW_PARQUET=ON" + "-DARROW_SUBSTRAIT=ON" "-DPARQUET_BUILD_EXECUTABLES=ON" "-DARROW_FLIGHT=${if enableFlight then "ON" else "OFF"}" + "-DARROW_FLIGHT_TESTING=${if enableFlight then "ON" else "OFF"}" "-DARROW_S3=${if enableS3 then "ON" else "OFF"}" "-DARROW_GCS=${if enableGcs then "ON" else "OFF"}" ] ++ lib.optionals (!enableShared) [ From e0f70681f9ee98371cc74cb28ec906fcf357b96c Mon Sep 17 00:00:00 2001 From: Phillip Cloud <417981+cpcloud@users.noreply.github.com> Date: Sat, 7 May 2022 08:13:26 -0400 Subject: [PATCH 2/5] python3Packages.pyarrow: enable s3 --- .../development/python-modules/pyarrow/default.nix | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/pkgs/development/python-modules/pyarrow/default.nix b/pkgs/development/python-modules/pyarrow/default.nix index cfe63a34d85a..141d90a5a971 100644 --- a/pkgs/development/python-modules/pyarrow/default.nix +++ b/pkgs/development/python-modules/pyarrow/default.nix @@ -47,8 +47,10 @@ buildPythonPackage rec { PYARROW_WITH_DATASET = zero_or_one true; PYARROW_WITH_FLIGHT = zero_or_one _arrow-cpp.enableFlight; - PYARROW_WITH_PARQUET = zero_or_one true; PYARROW_WITH_HDFS = zero_or_one true; + PYARROW_WITH_PARQUET = zero_or_one true; + PYARROW_WITH_PLASMA = zero_or_one (!stdenv.isDarwin); + PYARROW_WITH_S3 = zero_or_one _arrow-cpp.enableS3; PYARROW_CMAKE_OPTIONS = [ "-DCMAKE_INSTALL_RPATH=${ARROW_HOME}/lib" @@ -73,6 +75,11 @@ buildPythonPackage rec { # enabled in nixpkgs. # Upstream Issue: https://issues.apache.org/jira/browse/ARROW-11393 "--deselect=pyarrow/tests/test_memory.py::test_env_var" + # these tests require access to s3 via the internet + "--deselect=pyarrow/tests/test_fs.py::test_resolve_s3_region" + "--deselect=pyarrow/tests/test_fs.py::test_s3_real_aws" + "--deselect=pyarrow/tests/test_fs.py::test_s3_real_aws_region_selection" + "--deselect=pyarrow/tests/test_fs.py::test_s3_options" ] ++ lib.optionals stdenv.isDarwin [ # Requires loopback networking "--deselect=pyarrow/tests/test_ipc.py::test_socket_" @@ -84,16 +91,17 @@ buildPythonPackage rec { rm -r pyarrow/!(tests) ''; - pythonImportsCheck = map (module: "pyarrow.${module}") [ + pythonImportsCheck = [ "pyarrow" ] ++ map (module: "pyarrow.${module}") ([ "compute" "csv" "dataset" + "feather" "flight" "fs" "hdfs" "json" "parquet" - ]; + ] ++ lib.optionals (!stdenv.isDarwin) [ "plasma" ]); meta = with lib; { description = "A cross-language development platform for in-memory data"; From 3bf217495b166a0513788c16aa96efba5425d6bc Mon Sep 17 00:00:00 2001 From: Phillip Cloud <417981+cpcloud@users.noreply.github.com> Date: Sat, 7 May 2022 13:04:51 -0400 Subject: [PATCH 3/5] python3Packages.db-dtypes: patch to support arrow 8 --- .../python-modules/db-dtypes/default.nix | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/pkgs/development/python-modules/db-dtypes/default.nix b/pkgs/development/python-modules/db-dtypes/default.nix index 883439aba462..636882e4edff 100644 --- a/pkgs/development/python-modules/db-dtypes/default.nix +++ b/pkgs/development/python-modules/db-dtypes/default.nix @@ -1,6 +1,7 @@ { lib , buildPythonPackage -, fetchPypi +, fetchpatch +, fetchFromGitHub , numpy , packaging , pandas @@ -12,11 +13,20 @@ buildPythonPackage rec { pname = "db-dtypes"; version = "1.0.0"; - src = fetchPypi { - inherit pname version; - sha256 = "3070d1a8d86ff0b5d9b16f15c5fab9c18893c6b3d5723cd95ee397b169049454"; + src = fetchFromGitHub { + owner = "googleapis"; + repo = "python-db-dtypes-pandas"; + rev = "v${version}"; + hash = "sha256-7u/E0ICiz7LQfuplm/mkGlWrgGEPqeMwM3CUhfH6868="; }; + patches = [ + (fetchpatch { + url = "https://github.com/googleapis/python-db-dtypes-pandas/commit/fb30adfd427d3df9919df00b096210ba1eb1b91d.patch"; + sha256 = "sha256-39kZtYGbn3U1WXiDTczki5EM6SjUlSRXz8UMcdTU20g="; + }) + ]; + propagatedBuildInputs = [ numpy packaging From 15751e9f727b7557ace90a03d52cfb0464dbdc12 Mon Sep 17 00:00:00 2001 From: Phillip Cloud <417981+cpcloud@users.noreply.github.com> Date: Sat, 7 May 2022 13:13:35 -0400 Subject: [PATCH 4/5] python3Packages.google-cloud-bigquery: patch for arrow 8 and parallelize tests --- .../python-modules/google-cloud-bigquery/default.nix | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/pkgs/development/python-modules/google-cloud-bigquery/default.nix b/pkgs/development/python-modules/google-cloud-bigquery/default.nix index 3a5142a9ceff..a98bac95963d 100644 --- a/pkgs/development/python-modules/google-cloud-bigquery/default.nix +++ b/pkgs/development/python-modules/google-cloud-bigquery/default.nix @@ -16,6 +16,7 @@ , proto-plus , psutil , pyarrow +, pytest-xdist }: buildPythonPackage rec { @@ -28,6 +29,11 @@ buildPythonPackage rec { sha256 = "sha256-UmW6BEV44Ucdg/hUGSQk/kyDnB+Hsyx4q3AXTQe89hI="; }; + postPatch = '' + substituteInPlace setup.py \ + --replace 'pyarrow >= 3.0.0, < 8.0dev' 'pyarrow >= 3.0.0, < 9.0dev' + ''; + propagatedBuildInputs = [ google-cloud-core google-cloud-bigquery-storage @@ -47,6 +53,7 @@ buildPythonPackage rec { google-cloud-datacatalog google-cloud-storage pytestCheckHook + pytest-xdist ]; # prevent google directory from shadowing google imports From e4fb3cb6894368f2bd472df7d4f56e957fa4206d Mon Sep 17 00:00:00 2001 From: Phillip Cloud <417981+cpcloud@users.noreply.github.com> Date: Sun, 8 May 2022 09:23:15 -0400 Subject: [PATCH 5/5] python3Packages.apache-beam: patch in upstream deprecation fix --- .../development/python-modules/apache-beam/default.nix | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/pkgs/development/python-modules/apache-beam/default.nix b/pkgs/development/python-modules/apache-beam/default.nix index 8d774410ca35..e3545ee0e6dd 100644 --- a/pkgs/development/python-modules/apache-beam/default.nix +++ b/pkgs/development/python-modules/apache-beam/default.nix @@ -5,6 +5,7 @@ , dill , fastavro , fetchFromGitHub +, fetchpatch , freezegun , grpcio , grpcio-tools @@ -51,6 +52,15 @@ buildPythonPackage rec { sha256 = "sha256-FmfTxRLqXUHhhAZIxCRx2+phX0bmU5rIHaftBU4yBJY="; }; + patches = [ + # patch in the pyarrow.Table.to_batches(max_chunksize=...) argument fix + (fetchpatch { + url = "https://github.com/apache/beam/commit/2418a14ee99ff490d1c82944043f97f37ec97a85.patch"; + sha256 = "sha256-G8ARBBf7nmF46P2ncnlteGFnPWq5iCqZDfuaosre9jY="; + stripLen = 2; + }) + ]; + # See https://github.com/NixOS/nixpkgs/issues/156957. postPatch = '' substituteInPlace setup.py \