From 4ecf0258149f4e197e9dea4d71ab22756ffc5ece Mon Sep 17 00:00:00 2001 From: Silvan Mosberger Date: Tue, 26 Sep 2023 02:10:46 +0200 Subject: [PATCH] lib.fileset.intersection: init --- lib/fileset/README.md | 29 +++++++++++- lib/fileset/default.nix | 41 +++++++++++++++++ lib/fileset/internal.nix | 93 +++++++++++++++++++++++++++++++++++++++ lib/fileset/tests.sh | 95 ++++++++++++++++++++++++++++++++++++++++ 4 files changed, 257 insertions(+), 1 deletion(-) diff --git a/lib/fileset/README.md b/lib/fileset/README.md index 1aed7efec4ca..393cafb42e39 100644 --- a/lib/fileset/README.md +++ b/lib/fileset/README.md @@ -143,6 +143,33 @@ Arguments: - (-) Leaves us with no identity element for `union` and no reasonable return value for `unions []`. From a set theory perspective, which has a well-known notion of empty sets, this is unintuitive. +### No intersection for lists + +While there is `intersection a b`, there is no function `intersections [ a b c ]`. + +Arguments: +- (+) There is no known use case for such a function, it can be added later if a use case arises +- (+) There is no suitable return value for `intersections [ ]`, see also "Nullary intersections" [here](https://en.wikipedia.org/w/index.php?title=List_of_set_identities_and_relations&oldid=1177174035#Definitions) + - (-) Could throw an error for that case + - (-) Create a special value to represent "all the files" and return that + - (+) Such a value could then not be used with `fileFilter` unless the internal representation is changed considerably + - (-) Could return the empty file set + - (+) This would be wrong in set theory +- (-) Inconsistent with `union` and `unions` + +### Intersection base path + +The base path of the result of an `intersection` is the longest base path of the arguments. +E.g. the base path of `intersection ./foo ./foo/bar` is `./foo/bar`. +Meanwhile `intersection ./foo ./bar` returns the empty file set without a base path. 
+ +Arguments: +- Alternative: Use the common prefix of all base paths as the resulting base path + - (-) This is unnecessarily strict, because the purpose of the base path is to track the directory under which files _could_ be in the file set. It should be as long as possible. + All files contained in `intersection ./foo ./foo/bar` will be under `./foo/bar` (never just under `./foo`), and `intersection ./foo ./bar` will never contain any files (never under `./.`). + This would lead to `toSource` having to unexpectedly throw errors for cases such as `toSource { root = ./foo; fileset = intersection ./foo base; }`, where `base` may be `./bar` or `./.`. + - (-) There is no benefit to the user, since the base path is not directly exposed in the interface + ### Empty directories File sets can only represent a _set_ of local files, directories on their own are not representable. @@ -161,7 +188,7 @@ Arguments: - `./.` represents all files in `./.` _and_ the directory itself, but not its subdirectories, meaning that at least `./.` will be preserved even if it's empty. - In that case, `intersect ./. ./foo` should only include files and no directories themselves, since `./.` includes only `./.` as a directory, and same for `./foo`, so there's no overlap in directories. + In that case, `intersection ./. ./foo` should only include files and no directories themselves, since `./.` includes only `./.` as a directory, and same for `./foo`, so there's no overlap in directories. But intuitively this operation should result in the same as `./foo` – everything else is just confusing. - (+) This matches how Git only supports files, so developers should already be used to it. - (-) Empty directories (even if they contain nested directories) are neither representable nor preserved when coercing from paths. 
diff --git a/lib/fileset/default.nix b/lib/fileset/default.nix index 93a552262b0c..7bd701670386 100644 --- a/lib/fileset/default.nix +++ b/lib/fileset/default.nix @@ -7,6 +7,7 @@ let _toSourceFilter _unionMany _printFileset + _intersection ; inherit (builtins) @@ -18,6 +19,7 @@ let ; inherit (lib.lists) + elemAt imap0 ; @@ -276,6 +278,45 @@ If a directory does not recursively contain any file, it is omitted from the sto _unionMany ]; + /* + The file set containing all files that are in both of two given file sets. + See also [Intersection (set theory)](https://en.wikipedia.org/wiki/Intersection_(set_theory)). + + The given file sets are evaluated as lazily as possible, + with the first argument being evaluated first if needed. + + Type: + intersection :: FileSet -> FileSet -> FileSet + + Example: + # Limit the selected files to the ones in ./., so only ./src and ./Makefile + intersection ./. (unions [ ../LICENSE ./src ./Makefile ]) + */ + intersection = + # The first file set. + # This argument can also be a path, + # which gets [implicitly coerced to a file set](#sec-fileset-path-coercion). + fileset1: + # The second file set. + # This argument can also be a path, + # which gets [implicitly coerced to a file set](#sec-fileset-path-coercion). + fileset2: + let + filesets = _coerceMany "lib.fileset.intersection" [ + { + context = "first argument"; + value = fileset1; + } + { + context = "second argument"; + value = fileset2; + } + ]; + in + _intersection + (elemAt filesets 0) + (elemAt filesets 1); + /* Incrementally evaluate and trace a file set in a pretty way. This function is only intended for debugging purposes. diff --git a/lib/fileset/internal.nix b/lib/fileset/internal.nix index 6d8165691e13..546b93f158a1 100644 --- a/lib/fileset/internal.nix +++ b/lib/fileset/internal.nix @@ -477,6 +477,27 @@ rec { in recurse (length targetBaseComponents); + # Transforms the filesetTree of a file set to a longer base path, e.g. 
+ # _lengthenTreeBase [ "foo" "bar" ] (_create /foo { bar.baz = "regular"; }) + # => { baz = "regular"; } + _lengthenTreeBase = targetBaseComponents: fileset: + let + recurse = index: tree: + # If the filesetTree is an attribute set and we haven't reached the required depth yet + if isAttrs tree && index < length targetBaseComponents then + # Recurse with the tree under the right component (which might not exist) + recurse (index + 1) (tree.${elemAt targetBaseComponents index} or null) + else + # For all values here we can just return the tree itself: + # tree == null -> the result is also null, everything is excluded + # tree == "directory" -> the result is also "directory", + # because the base path is always a directory and everything is included + # isAttrs tree -> the result is `tree` + # because we don't need to recurse any more since `index == length targetBaseComponents` + tree; + in + recurse (length fileset._internalBaseComponents) fileset._internalTree; + # Computes the union of a list of filesets. # The filesets must already be coerced and validated to be in the same filesystem root # Type: [ Fileset ] -> Fileset @@ -545,4 +566,76 @@ rec { # The non-null elements have to be attribute sets representing partial trees # We need to recurse into those zipAttrsWith (name: _unionTrees) withoutNull; + + # Computes the intersection of two filesets. + # The filesets must already be coerced and validated to be in the same filesystem root + # Type: Fileset -> Fileset -> Fileset + _intersection = fileset1: fileset2: + let + # The common base components prefix, e.g. 
+ # (/foo/bar, /foo/bar/baz) -> /foo/bar + # (/foo/bar, /foo/baz) -> /foo + commonBaseComponentsLength = + # TODO: Have a `lib.lists.commonPrefixLength` function such that we don't need the list allocation from commonPrefix here + length ( + commonPrefix + fileset1._internalBaseComponents + fileset2._internalBaseComponents + ); + + # To be able to intersect filesetTree's together, they need to have the same base path. + # Base paths can be intersected by taking the longest one (if any) + + # The fileset with the longest base, if any, e.g. + # (/foo/bar, /foo/bar/baz) -> /foo/bar/baz + # (/foo/bar, /foo/baz) -> null + longestBaseFileset = + if commonBaseComponentsLength == length fileset1._internalBaseComponents then + # The common prefix is the same as the first path, so the second path is equal or longer + fileset2 + else if commonBaseComponentsLength == length fileset2._internalBaseComponents then + # The common prefix is the same as the second path, so the first path is longer + fileset1 + else + # The common prefix is neither the first nor the second path + # This means there's no overlap between the two sets + null; + + # Whether the result should be the empty value without a base + resultIsEmptyWithoutBase = + # If either fileset is the empty fileset without a base, the intersection is too + fileset1._internalIsEmptyWithoutBase + || fileset2._internalIsEmptyWithoutBase + # If there is no overlap between the base paths + || longestBaseFileset == null; + + # Lengthen each fileset's tree to the longest base prefix + tree1 = _lengthenTreeBase longestBaseFileset._internalBaseComponents fileset1; + tree2 = _lengthenTreeBase longestBaseFileset._internalBaseComponents fileset2; + + # With two filesetTree's with the same base, we can compute their intersection + resultTree = _intersectTree tree1 tree2; + in + if resultIsEmptyWithoutBase then + _emptyWithoutBase + else + _create longestBaseFileset._internalBase resultTree; + + # The intersection of two filesetTree's 
with the same base path + # The second element is only evaluated as much as necessary. + # Type: filesetTree -> filesetTree -> filesetTree + _intersectTree = lhs: rhs: + if isAttrs lhs && isAttrs rhs then + # Both sides are attribute sets, we can recurse for the attributes existing on both sides + mapAttrs + (name: _intersectTree lhs.${name}) + (builtins.intersectAttrs lhs rhs) + else if lhs == null || isString rhs then + # If the lhs is null, the result should also be null + # And if the rhs is the identity element + # (a string, aka it includes everything), then it's also the lhs + lhs + else + # In all other cases it's the rhs + rhs; } diff --git a/lib/fileset/tests.sh b/lib/fileset/tests.sh index 9e09da809246..7a104654983f 100755 --- a/lib/fileset/tests.sh +++ b/lib/fileset/tests.sh @@ -587,6 +587,97 @@ done # So, just using 1000 files for now. checkFileset 'unions (mapAttrsToList (name: _: ./. + "/${name}/a") (builtins.readDir ./.))' + +## lib.fileset.intersection + + +# Different filesystem roots in root and fileset are not supported +mkdir -p {foo,bar}/mock-root +expectFailure 'with ((import <nixpkgs/lib>).extend (import <nixpkgs/lib/fileset/mock-splitRoot.nix>)).fileset; + toSource { root = ./.; fileset = intersection ./foo/mock-root ./bar/mock-root; } +' 'lib.fileset.intersection: Filesystem roots are not the same: +\s*first argument: root "'"$work"'/foo/mock-root" +\s*second argument: root "'"$work"'/bar/mock-root" +\s*Different roots are not supported.' +rm -rf -- * + +# Coercion errors show the correct context +expectFailure 'toSource { root = ./.; fileset = intersection ./a ./.; }' 'lib.fileset.intersection: first argument \('"$work"'/a\) does not exist.' +expectFailure 'toSource { root = ./.; fileset = intersection ./. ./b; }' 'lib.fileset.intersection: second argument \('"$work"'/b\) does not exist.' + +# The tree of later arguments should not be evaluated if a former argument already excludes all files +tree=( + [a]=0 +) +checkFileset 'intersection _emptyWithoutBase (_create ./. 
(abort "This should not be used!"))' +# We don't have any combinators that can explicitly remove files yet, so we need to rely on internal functions to test this for now +checkFileset 'intersection (_create ./. { a = null; }) (_create ./. { a = abort "This should not be used!"; })' + +# If either side is empty, the result is empty +tree=( + [a]=0 +) +checkFileset 'intersection _emptyWithoutBase _emptyWithoutBase' +checkFileset 'intersection _emptyWithoutBase (_create ./. null)' +checkFileset 'intersection (_create ./. null) _emptyWithoutBase' +checkFileset 'intersection (_create ./. null) (_create ./. null)' + +# If the intersection base paths are not overlapping, the result is empty and has no base path +mkdir a b c +touch {a,b,c}/x +expectEqual 'toSource { root = ./c; fileset = intersection ./a ./b; }' 'toSource { root = ./c; fileset = _emptyWithoutBase; }' +rm -rf -- * + +# If the intersection exists, the resulting base path is the longest of them +mkdir a +touch x a/b +expectEqual 'toSource { root = ./a; fileset = intersection ./a ./.; }' 'toSource { root = ./a; fileset = ./a; }' +expectEqual 'toSource { root = ./a; fileset = intersection ./. ./a; }' 'toSource { root = ./a; fileset = ./a; }' +rm -rf -- * + +# Also finds the intersection with null'd filesetTree's +tree=( + [a]=0 + [b]=1 + [c]=0 +) +checkFileset 'intersection (_create ./. { a = "regular"; b = "regular"; c = null; }) (_create ./. { a = null; b = "regular"; c = "regular"; })' + +# Actually computes the intersection between files +tree=( + [a]=0 + [b]=0 + [c]=1 + [d]=1 + [e]=0 + [f]=0 +) +checkFileset 'intersection (unions [ ./a ./b ./c ./d ]) (unions [ ./c ./d ./e ./f ])' + +tree=( + [a/x]=0 + [a/y]=0 + [b/x]=1 + [b/y]=1 + [c/x]=0 + [c/y]=0 +) +checkFileset 'intersection ./b ./.' 
+checkFileset 'intersection ./b (unions [ ./a/x ./a/y ./b/x ./b/y ./c/x ./c/y ])' + +# Complicated case +tree=( + [a/x]=0 + [a/b/i]=1 + [c/d/x]=0 + [c/d/f]=1 + [c/x]=0 + [c/e/i]=1 + [c/e/j]=1 +) +checkFileset 'intersection (unions [ ./a/b ./c/d ./c/e ]) (unions [ ./a ./c/d/f ./c/e ])' + + ## Tracing # The second trace argument is returned @@ -609,6 +700,10 @@ rm -rf -- * # The empty file set without a base also prints as empty expectTrace '_emptyWithoutBase' '(empty)' expectTrace 'unions [ ]' '(empty)' +mkdir foo bar +touch {foo,bar}/x +expectTrace 'intersection ./foo ./bar' '(empty)' +rm -rf -- * # If a directory is fully included, print it as such touch a