From 9d8831c8fe31ce26777274af68054c55342f48dc Mon Sep 17 00:00:00 2001 From: Greg Price Date: Sun, 29 Mar 2020 15:32:53 -0700 Subject: [PATCH 1/4] cpython: Drop unrecognized --with-threads configure flag. The ./configure script prints a warning when passed this flag, starting with 3.7: configure: WARNING: unrecognized options: --with-threads The reason is that there's no longer such a thing as a build without threads. Eliminate the warning, by only passing the flag on the older releases that accept it. Upstream change and discussion: https://github.com/python/cpython/commit/a6a4dc816 https://bugs.python.org/issue31370 --- pkgs/development/interpreters/python/cpython/default.nix | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pkgs/development/interpreters/python/cpython/default.nix b/pkgs/development/interpreters/python/cpython/default.nix index b778b62f908d..1df70fb99ccb 100644 --- a/pkgs/development/interpreters/python/cpython/default.nix +++ b/pkgs/development/interpreters/python/cpython/default.nix @@ -136,10 +136,12 @@ in with passthru; stdenv.mkDerivation { configureFlags = [ "--enable-shared" - "--with-threads" "--without-ensurepip" "--with-system-expat" "--with-system-ffi" + ] ++ optionals (pythonOlder "3.7") [ + # This is unconditionally true starting in CPython 3.7. + "--with-threads" ] ++ optionals (sqlite != null && isPy3k) [ "--enable-loadable-sqlite-extensions" ] ++ optionals (openssl != null) [ From f8a8243bd3127ba9c6090d57f43d979389c1653c Mon Sep 17 00:00:00 2001 From: Greg Price Date: Sun, 29 Mar 2020 14:05:04 -0700 Subject: [PATCH 2/4] cpython: Use --enable-optimizations, for a 16% speedup. Without this flag, the configure script prints a warning at the end, like this (reformatted): If you want a release build with all stable optimizations active (PGO, etc), please run ./configure --enable-optimizations We're doing a build to distribute to people for day-to-day use, doing things other than developing the Python interpreter. So that's certainly a release build -- we're the target audience for this recommendation. --- And, trying it out, upstream isn't kidding! I ran the standard benchmark suite that the CPython developers use for performance work, "pyperformance". Following its usage instructions: https://pyperformance.readthedocs.io/usage.html I ran the whole suite, like so: $ nix-shell -p ./result."$variant" --run ' cd $(mktemp -d); python -m venv venv; . venv/bin/activate pip install pyperformance pyperformance run -o ~/tmp/result.'"$variant"'.json ' and then examined the results with commands like: $ python -m pyperf compare_to --table -G \ ~/tmp/result.{$before,$after}.json Across all the benchmarks in the suite, the median speedup was 16%. (Meaning 1.16x faster; 14% less time). The middle half of them ranged from a 13% to a 22% speedup. Each of the 60 benchmarks in the suite got faster, by speedups ranging from 3% to 53%. --- One reason this isn't just the default to begin with is that, until recently, it made the build a lot slower. What it does is turn on profile-guided optimization, which means first build for profiling, then run some task to get a profile, then build again using the profile. And, short of further customization, the task it would use would be nearly the full test suite, which includes a lot of expensive and slow tests, and can easily take half an hour to run. Happily, in 2019 an upstream developer did the work to carefully select a more appropriate set of tests to use for the profile: https://github.com/python/cpython/commit/4e16a4a31 https://bugs.python.org/issue36044 This suite takes just 2 minutes to run. And the resulting final build is actually slightly faster than with the much longer suite, at least as measured by those standard "pyperformance" benchmarks. That work went into the 3.8 release, but the same list works great if used on older releases too. So, start passing that --enable-optimizations flag; and backport that good-for-PGO set of tests, so that we use it on all releases. --- .../python/cpython/2.7/default.nix | 4 ++++ .../python/cpython/2.7/profile-task.patch | 21 +++++++++++++++++++ .../python/cpython/3.5/profile-task.patch | 21 +++++++++++++++++++ .../python/cpython/3.6/profile-task.patch | 21 +++++++++++++++++++ .../interpreters/python/cpython/default.nix | 9 ++++++++ 5 files changed, 76 insertions(+) create mode 100644 pkgs/development/interpreters/python/cpython/2.7/profile-task.patch create mode 100644 pkgs/development/interpreters/python/cpython/3.5/profile-task.patch create mode 100644 pkgs/development/interpreters/python/cpython/3.6/profile-task.patch diff --git a/pkgs/development/interpreters/python/cpython/2.7/default.nix b/pkgs/development/interpreters/python/cpython/2.7/default.nix index 77b37c5f5c39..0df49213f5a3 100644 --- a/pkgs/development/interpreters/python/cpython/2.7/default.nix +++ b/pkgs/development/interpreters/python/cpython/2.7/default.nix @@ -85,6 +85,9 @@ let # backported in debian since 2013. # https://bugs.python.org/issue13146 ./atomic_pyc.patch + + # Backport from CPython 3.8 of a good list of tests to run for PGO. + ./profile-task.patch ] ++ optionals (x11Support && stdenv.isDarwin) [ ./use-correct-tcl-tk-on-darwin.patch ] ++ optionals stdenv.isLinux [ @@ -135,6 +138,7 @@ let ''; configureFlags = [ + "--enable-optimizations" "--enable-shared" "--with-threads" "--enable-unicode=ucs${toString ucsEncoding}" diff --git a/pkgs/development/interpreters/python/cpython/2.7/profile-task.patch b/pkgs/development/interpreters/python/cpython/2.7/profile-task.patch new file mode 100644 index 000000000000..9c085657ac9d --- /dev/null +++ b/pkgs/development/interpreters/python/cpython/2.7/profile-task.patch @@ -0,0 +1,21 @@ +Backport from CPython 3.8 of a good list of tests to run for PGO. + +Upstream commit: + https://github.com/python/cpython/commit/4e16a4a31 + +Upstream discussion: + https://bugs.python.org/issue36044 + +diff --git a/Makefile.pre.in b/Makefile.pre.in +index 00fdd21ce..713dc1e53 100644 +--- a/Makefile.pre.in ++++ b/Makefile.pre.in +@@ -259,7 +259,7 @@ TCLTK_LIBS= + # The task to run while instrumented when building the profile-opt target. + # We exclude unittests with -x that take a rediculious amount of time to + # run in the instrumented training build or do not provide much value. +-PROFILE_TASK=-m test.regrtest --pgo -x test_asyncore test_gdb test_multiprocessing test_subprocess ++PROFILE_TASK=-m test.regrtest --pgo test_array test_base64 test_binascii test_binop test_bisect test_bytes test_bz2 test_cmath test_codecs test_collections test_complex test_dataclasses test_datetime test_decimal test_difflib test_embed test_float test_fstring test_functools test_generators test_hashlib test_heapq test_int test_itertools test_json test_long test_lzma test_math test_memoryview test_operator test_ordered_dict test_pickle test_pprint test_re test_set test_sqlite test_statistics test_struct test_tabnanny test_time test_unicode test_xml_etree test_xml_etree_c + + # report files for gcov / lcov coverage report + COVERAGE_INFO= $(abs_builddir)/coverage.info diff --git a/pkgs/development/interpreters/python/cpython/3.5/profile-task.patch b/pkgs/development/interpreters/python/cpython/3.5/profile-task.patch new file mode 100644 index 000000000000..39d5587379ca --- /dev/null +++ b/pkgs/development/interpreters/python/cpython/3.5/profile-task.patch @@ -0,0 +1,21 @@ +Backport from CPython 3.8 of a good list of tests to run for PGO. + +Upstream commit: + https://github.com/python/cpython/commit/4e16a4a31 + +Upstream discussion: + https://bugs.python.org/issue36044 + +diff --git a/Makefile.pre.in b/Makefile.pre.in +index 00fdd21ce..713dc1e53 100644 +--- a/Makefile.pre.in ++++ b/Makefile.pre.in +@@ -259,7 +259,7 @@ TCLTK_LIBS= + # The task to run while instrumented when building the profile-opt target. + # We exclude unittests with -x that take a rediculious amount of time to + # run in the instrumented training build or do not provide much value. +-PROFILE_TASK=-m test.regrtest --pgo -x test_asyncore test_gdb test_multiprocessing_fork test_multiprocessing_forkserver test_multiprocessing_main_handling test_multiprocessing_spawn test_subprocess ++PROFILE_TASK=-m test.regrtest --pgo test_array test_base64 test_binascii test_binop test_bisect test_bytes test_bz2 test_cmath test_codecs test_collections test_complex test_dataclasses test_datetime test_decimal test_difflib test_embed test_float test_fstring test_functools test_generators test_hashlib test_heapq test_int test_itertools test_json test_long test_lzma test_math test_memoryview test_operator test_ordered_dict test_pickle test_pprint test_re test_set test_sqlite test_statistics test_struct test_tabnanny test_time test_unicode test_xml_etree test_xml_etree_c + + # report files for gcov / lcov coverage report + COVERAGE_INFO= $(abs_builddir)/coverage.info diff --git a/pkgs/development/interpreters/python/cpython/3.6/profile-task.patch b/pkgs/development/interpreters/python/cpython/3.6/profile-task.patch new file mode 100644 index 000000000000..df55da3a4132 --- /dev/null +++ b/pkgs/development/interpreters/python/cpython/3.6/profile-task.patch @@ -0,0 +1,21 @@ +Backport from CPython 3.8 of a good list of tests to run for PGO. + +Upstream commit: + https://github.com/python/cpython/commit/4e16a4a31 + +Upstream discussion: + https://bugs.python.org/issue36044 + +diff --git a/Makefile.pre.in b/Makefile.pre.in +index 00fdd21ce..713dc1e53 100644 +--- a/Makefile.pre.in ++++ b/Makefile.pre.in +@@ -259,7 +259,7 @@ TCLTK_LIBS= + # The task to run while instrumented when building the profile-opt target. + # We exclude unittests with -x that take a rediculious amount of time to + # run in the instrumented training build or do not provide much value. +-PROFILE_TASK=-m test.regrtest --pgo ++PROFILE_TASK=-m test.regrtest --pgo test_array test_base64 test_binascii test_binop test_bisect test_bytes test_bz2 test_cmath test_codecs test_collections test_complex test_dataclasses test_datetime test_decimal test_difflib test_embed test_float test_fstring test_functools test_generators test_hashlib test_heapq test_int test_itertools test_json test_long test_lzma test_math test_memoryview test_operator test_ordered_dict test_pickle test_pprint test_re test_set test_sqlite test_statistics test_struct test_tabnanny test_time test_unicode test_xml_etree test_xml_etree_c + + # report files for gcov / lcov coverage report + COVERAGE_INFO= $(abs_builddir)/coverage.info diff --git a/pkgs/development/interpreters/python/cpython/default.nix b/pkgs/development/interpreters/python/cpython/default.nix index 1df70fb99ccb..7dbe6216e728 100644 --- a/pkgs/development/interpreters/python/cpython/default.nix +++ b/pkgs/development/interpreters/python/cpython/default.nix @@ -104,6 +104,14 @@ in with passthru; stdenv.mkDerivation { ] ++ optionals (isPy37 || isPy38) [ # Fix darwin build https://bugs.python.org/issue34027 ./3.7/darwin-libutil.patch + ] ++ optionals (pythonOlder "3.8") [ + # Backport from CPython 3.8 of a good list of tests to run for PGO. + ( + if isPy36 || isPy37 then + ./3.6/profile-task.patch + else + ./3.5/profile-task.patch + ) ] ++ optionals (isPy3k && hasDistutilsCxxPatch) [ # Fix for http://bugs.python.org/issue1222585 # Upstream distutils is calling C compiler to compile C++ code, which @@ -135,6 +143,7 @@ in with passthru; stdenv.mkDerivation { PYTHONHASHSEED=0; configureFlags = [ + "--enable-optimizations" "--enable-shared" "--without-ensurepip" "--with-system-expat" From 52c04b0347d60064f11ae8430e5a6c57a30ec831 Mon Sep 17 00:00:00 2001 From: Greg Price Date: Wed, 13 May 2020 19:52:36 -0700 Subject: [PATCH 3/4] cpython: Use autoreconfHook to rebuild configure script. In particular this will let us use patches that apply to configure.ac. --- .../development/interpreters/python/cpython/2.7/default.nix | 6 ++++-- pkgs/development/interpreters/python/cpython/default.nix | 2 ++ 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/pkgs/development/interpreters/python/cpython/2.7/default.nix b/pkgs/development/interpreters/python/cpython/2.7/default.nix index 0df49213f5a3..e9d6c97cfd45 100644 --- a/pkgs/development/interpreters/python/cpython/2.7/default.nix +++ b/pkgs/development/interpreters/python/cpython/2.7/default.nix @@ -12,6 +12,7 @@ , zlib , self , configd, coreutils +, autoreconfHook , python-setup-hook # Some proprietary libs assume UCS2 unicode, especially on darwin :( , ucsEncoding ? 4 @@ -186,8 +187,9 @@ let ++ optionals x11Support [ tcl tk xlibsWrapper libX11 ] ++ optional (stdenv.isDarwin && configd != null) configd; nativeBuildInputs = - optionals (stdenv.hostPlatform != stdenv.buildPlatform) - [ buildPackages.stdenv.cc buildPackages.python ]; + [ autoreconfHook ] + ++ optionals (stdenv.hostPlatform != stdenv.buildPlatform) + [ buildPackages.stdenv.cc buildPackages.python ]; mkPaths = paths: { C_INCLUDE_PATH = makeSearchPathOutput "dev" "include" paths; diff --git a/pkgs/development/interpreters/python/cpython/default.nix b/pkgs/development/interpreters/python/cpython/default.nix index 7dbe6216e728..b860e357b04b 100644 --- a/pkgs/development/interpreters/python/cpython/default.nix +++ b/pkgs/development/interpreters/python/cpython/default.nix @@ -12,6 +12,7 @@ , zlib , self , configd +, autoreconfHook , python-setup-hook , nukeReferences # For the Python package set @@ -51,6 +52,7 @@ let version = with sourceVersion; "${major}.${minor}.${patch}${suffix}"; nativeBuildInputs = [ + autoreconfHook nukeReferences ] ++ optionals (stdenv.hostPlatform != stdenv.buildPlatform) [ buildPackages.stdenv.cc From 480c8d199166b2f8cd20e6e245d8a019329ec466 Mon Sep 17 00:00:00 2001 From: Greg Price Date: Sun, 29 Mar 2020 15:46:12 -0700 Subject: [PATCH 4/4] cpython: Optimize dynamic symbol tables, for a 6% speedup. I took a close look at how Debian builds the Python interpreter, because I noticed it ran substantially faster than the one in nixpkgs and I was curious why. One thing that I found made a material difference in performance was this pair of linker flags (passed to the compiler): -Wl,-O1 -Wl,-Bsymbolic-functions In other words, effectively the linker gets passed the flags: -O1 -Bsymbolic-functions Doing the same thing in nixpkgs turns out to make the interpreter run about 6% faster, which is quite a big win for such an easy change. So, let's apply it. --- I had not known there was a `-O1` flag for the *linker*! But indeed there is. These flags are unrelated to "link-time optimization" (LTO), despite the latter's name. LTO means doing classic compiler optimizations on the actual code, at the linking step when it becomes possible to do them with cross-object-file information. These two flags, by contrast, cause the linker to make certain optimizations within the scope of its job as the linker. Documentation is here, though sparse: https://sourceware.org/binutils/docs-2.31/ld/Options.html The meaning of -O1 was explained in more detail in this LWN article: https://lwn.net/Articles/192624/ Apparently it makes the resulting symbol table use a bigger hash table, so the load factor is smaller and lookups are faster. Cool. As for -Bsymbolic-functions, the documentation indicates that it's a way of saving lookups through the symbol table entirely. There can apparently be situations where it changes the behavior of a program, specifically if the program relies on linker tricks to provide customization features: https://bugs.launchpad.net/ubuntu/+source/xfe/+bug/644645 https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=637184#35 But I'm pretty sure CPython doesn't permit that kind of trick: you don't load a shared object that tries to redefine some symbol found in the interpreter core. The stronger reason I'm confident using -Bsymbolic-functions is safe, though, is empirical. Both Debian and Ubuntu have been shipping a Python built this way since forever -- it was introduced for the Python 2.4 and 2.5 in Ubuntu "hardy", and Debian "lenny", released in 2008 and 2009. In those 12 years they haven't seen a need to drop this flag; and I've been unable to locate any reports of trouble related to it, either on the Web in general or on the Debian bug tracker. (There are reports of a handful of other programs breaking with it, but not Python/CPython.) So that seems like about as thorough testing as one could hope for. --- As for the performance impact: I ran CPython upstream's preferred benchmark suite, "pyperformance", in the same way as described in the previous commit. On top of that commit's change, the results across the 60 benchmarks in the suite are: The median is 6% faster. The middle half (aka interquartile range) is from 4% to 8% faster. Out of 60 benchmarks, 3 come out slower, by 1-4%. At the other end, 5 are at least 10% faster, and one is 17% faster. So, that's quite a material speedup! I don't know how big the effect of these flags is for other software; but certainly CPython tends to do plenty of dynamic linking, as that's how it loads extension modules, which are ubiquitous in the stdlib as well as popular third-party libraries. So perhaps that helps explain why optimizing the dynamic linker has such an impact. --- .../python/cpython/2.7/default.nix | 7 ++++++ .../interpreters/python/cpython/default.nix | 25 +++++++++++++++++++ 2 files changed, 32 insertions(+) diff --git a/pkgs/development/interpreters/python/cpython/2.7/default.nix b/pkgs/development/interpreters/python/cpython/2.7/default.nix index e9d6c97cfd45..ca4fae51269c 100644 --- a/pkgs/development/interpreters/python/cpython/2.7/default.nix +++ b/pkgs/development/interpreters/python/cpython/2.7/default.nix @@ -100,6 +100,13 @@ let # libuuid, slowing down program startup a lot). ./no-ldconfig.patch + # Optimize symbol tables for the sake of dynamic linking. + # Significant for Python because of extension modules. + (fetchpatch { + url = "https://salsa.debian.org/cpython-team/python3/-/raw/27103a32e/debian/patches/link-opt.diff"; + sha256 = "0vp36276ndbrwr7882vg7vjd61c8mv7bqgal6bbh2fimp6zlkdhv"; + }) + ] ++ optionals stdenv.hostPlatform.isCygwin [ ./2.5.2-ctypes-util-find_library.patch ./2.5.2-tkinter-x11.patch diff --git a/pkgs/development/interpreters/python/cpython/default.nix b/pkgs/development/interpreters/python/cpython/default.nix index b860e357b04b..dc3997481be3 100644 --- a/pkgs/development/interpreters/python/cpython/default.nix +++ b/pkgs/development/interpreters/python/cpython/default.nix @@ -97,6 +97,31 @@ in with passthru; stdenv.mkDerivation { # (since it will do a futile invocation of gcc (!) to find # libuuid, slowing down program startup a lot). (./. + "/${sourceVersion.major}.${sourceVersion.minor}/no-ldconfig.patch") + ] ++ optionals stdenv.isLinux [ + # Optimize symbol tables for the sake of dynamic linking. + # Significant for Python because of extension modules. + ( + if pythonAtLeast "3.8" then + fetchpatch { + url = "https://salsa.debian.org/cpython-team/python3/-/raw/3.8.3rc1-1/debian/patches/link-opt.diff"; + sha256 = "0va85318nahnqgydwjs7723h8gx41inbdawdy6v4hiykzgc8s7vs"; + } + else if isPy37 then + fetchurl { + url = "https://salsa.debian.org/cpython-team/python3/-/raw/3.7.6-1/debian/patches/link-opt.diff"; + sha256 = "1aqvsc0p3sxnfsi8jz7537wl6v95v26ba4nflwvmn5lxlc3y3g13"; + } + else if isPy36 then + fetchpatch { + url = "https://salsa.debian.org/cpython-team/python3/-/raw/3.6.8-1/debian/patches/link-opt.diff"; + sha256 = "1nhdrgla75ily9gk7xx0crxa7ynqzks0djxk36sa3lgg5w8vjvyr"; + } + else + fetchpatch { + url = "https://salsa.debian.org/cpython-team/python3/-/raw/27103a32e/debian/patches/link-opt.diff"; + sha256 = "0vp36276ndbrwr7882vg7vjd61c8mv7bqgal6bbh2fimp6zlkdhv"; + } + ) ] ++ optionals (isPy35 || isPy36) [ # Determinism: Write null timestamps when compiling python files. ./3.5/force_bytecode_determinism.patch