1010#include " nix/store/filetransfer.hh"
1111#include " nix/util/cgroup.hh"
1212#include " nix/util/linux-namespaces.hh"
13+ #include " nix/util/file-system-at.hh"
1314#include " nix/util/logging.hh"
1415#include " nix/util/serialise.hh"
1516#include " linux/fchmodat2-compat.hh"
1617
1718#include < algorithm>
1819#include < string_view>
1920#include < cstdint>
20- #include < fcntl.h >
21+ #include < atomic >
2122
23+ #include < fcntl.h>
2224#include < sys/ioctl.h>
2325#include < net/if.h>
2426#include < netinet/ip.h>
@@ -205,35 +207,148 @@ static void setupLandlock()
205207
206208#endif
207209
208- static void doBind (const std::filesystem::path & source, const std::filesystem::path & target, bool optional = false )
210+ static void doBind (
211+ const std::filesystem::path & source,
212+ Descriptor chrootRootDirFd,
213+ const std::filesystem::path & target,
214+ const std::filesystem::path & chrootRootDirPath,
215+ bool optional = false )
209216{
210- debug (" bind mounting %1% to %2%" , PathFmt (source), PathFmt (target));
217+ /* `target` denotes a relative path inside the chroot directory. All operations happen
218+ relative to chrootRootDirFd. Bail out if the path is not what we expect. */
219+ if (target.empty () || target.is_absolute ())
220+ throw Error (" invalid path to bind mount in the chroot: %s" , PathFmt (target));
221+
222+ /* Sanity check against insane setups. This would never be passed in
223+ sandbox paths for dependencies, but the user might specify it in
224+ extra-sandbox-paths. */
225+ if (const auto filename = target.filename ().native (); filename == " ." || filename == " .." )
226+ throw Error (" sandbox path to bind mount in the chroot has an invalid filename: %s" , PathFmt (target));
227+
228+ debug (" bind mounting %1% to %2%" , PathFmt (source), PathFmt (chrootRootDirPath / target));
229+
230+ auto fallbackBindMount = [&](Descriptor sourceFd, Descriptor destFd) {
231+ auto selfProcSourcePath = std::filesystem::path (" /proc/self/fd" ) / std::to_string (sourceFd);
232+ auto selfProcDestPath = std::filesystem::path (" /proc/self/fd" ) / std::to_string (destFd);
233+
234+ if (mount (selfProcSourcePath.c_str (), selfProcDestPath.c_str (), " " , MS_BIND | MS_REC, 0 ) == -1 )
235+ throw SysError (
236+ " bind mount from %1% (%3%) to %2% (%4%) failed" ,
237+ PathFmt (source),
238+ PathFmt (chrootRootDirPath / target),
239+ PathFmt (selfProcSourcePath),
240+ PathFmt (selfProcDestPath));
241+ };
242+
243+ auto bindMount = [&](Descriptor sourceFd, Descriptor destFd) {
244+ static std::atomic_flag openTreeUnsupported{};
245+ if (openTreeUnsupported.test ())
246+ return fallbackBindMount (sourceFd, destFd);
247+
248+ AutoCloseFD treeFd =
249+ ::open_tree (sourceFd, " " , OPEN_TREE_CLONE | AT_RECURSIVE | OPEN_TREE_CLOEXEC | AT_EMPTY_PATH);
250+
251+ if (!treeFd) {
252+ if (errno != ENOSYS)
253+ throw SysError (" opening bind mount source %1%" , PathFmt (source));
211254
212- auto bindMount = [&]() {
213- if (mount (source.c_str (), target.c_str (), " " , MS_BIND | MS_REC, 0 ) == -1 )
214- throw SysError (" bind mount from %1% to %2% failed" , PathFmt (source), PathFmt (target));
255+ /* Cache and try the old fallback via procfs. */
256+ openTreeUnsupported.test_and_set ();
257+ return fallbackBindMount (sourceFd, destFd);
258+ }
259+
260+ if (::move_mount (treeFd.get (), " " , destFd, " " , MOVE_MOUNT_T_EMPTY_PATH | MOVE_MOUNT_F_EMPTY_PATH) == -1 )
261+ throw SysError (" bind mount from %1% to %2% failed" , PathFmt (source), PathFmt (chrootRootDirPath / target));
215262 };
216263
217- auto maybeSt = maybeLstat (source);
218- if (!maybeSt) {
219- if (optional)
264+ /* TODO: Replace the sucky implementation of createDirs and use that instead. */
265+ auto createDirsAndOpen = [&](const std::filesystem::path & path) -> std::pair<AutoCloseFD, Descriptor> {
266+ Descriptor parentFd = chrootRootDirFd;
267+ AutoCloseFD maybeParentFdOwned;
268+ /* How many directories deep we are. */
269+ unsigned nestedDirCount = 0 ;
270+
271+ for (const auto & p : path) {
272+ /* Trailing `/`, we don't care. It might matter for symlink resolution, so better skip it.
273+ Also skip `.` as a no-op. */
274+ if (p.native ().empty () || p.native () == " ." )
275+ continue ;
276+
277+ /* TODO: Add additional validation in config parsing in case the
278+ user specified a borked `extra-sandbox-paths`. Maybe we should just
279+ reject `..` components completely? */
280+ if (p.native () == " .." ) {
281+ if (nestedDirCount == 0 )
282+ throw Error (" sandbox path %s escapes the chroot" , PathFmt (path));
283+ --nestedDirCount;
284+ } else {
285+ ++nestedDirCount;
286+ }
287+
288+ assert (!p.native ().starts_with (" /" )); /* Already check that the path is relative, can't happen. */
289+
290+ /* No symlinks shall be followed. */
291+ AutoCloseFD nextComponent = ::openat (parentFd, p.c_str (), O_DIRECTORY | O_NOFOLLOW | O_CLOEXEC);
292+ if (!nextComponent) {
293+ if (errno != ENOENT) /* We might have to create it if it doesn't exist. */
294+ throw SysError (" creating directory component %s in the sandbox path %s" , PathFmt (p), PathFmt (path));
295+
296+ /* Let the umask restrict the permissions. TODO: Do we want
297+ this? We don't handle EEXIST here because that would mean
298+ somebody is already racing us. Fail closed. */
299+ if (::mkdirat (parentFd, p.c_str (), 0777 ) == -1 )
300+ throw SysError (" creating directory component %s in the sandbox path %s" , PathFmt (p), PathFmt (path));
301+
302+ /* Try again after creating the directory. Should succeed. */
303+ nextComponent = ::openat (parentFd, p.c_str (), O_DIRECTORY | O_NOFOLLOW | O_CLOEXEC);
304+ if (!nextComponent) /* Something is terribly wrong, bail out. */
305+ throw SysError (" creating directory component %s in the sandbox path %s" , PathFmt (p), PathFmt (path));
306+ }
307+
308+ /* Proceed to the next iteration. */
309+ maybeParentFdOwned = std::move (nextComponent);
310+ parentFd = maybeParentFdOwned.get ();
311+ }
312+
313+ return {std::move (maybeParentFdOwned), parentFd};
314+ };
315+
316+ AutoCloseFD sourceFd = ::open (source.c_str (), O_PATH | O_NOFOLLOW | O_CLOEXEC);
317+
318+ /* Skip non-existent files source paths if .optional is set. */
319+ if (!sourceFd) {
320+ if (optional && (errno == ENOENT || errno == ENOTDIR))
220321 return ;
221322 else
222323 throw SysError (" getting attributes of path %1%" , PathFmt (source));
223324 }
224- auto st = *maybeSt;
325+
326+ auto st = nix::fstat (sourceFd.get ());
225327
226328 if (S_ISDIR (st.st_mode )) {
227- createDirs (target);
228- bindMount ();
329+ auto [maybeDirFdOwned, dirFd] = createDirsAndOpen (target);
330+ /* We must always open a fresh directory - i.e. the path must not resolve to chrootRootDir itself. */
331+ if (!maybeDirFdOwned)
332+ throw Error (" sandbox path %s resolves to the chroot root directory" , PathFmt (chrootRootDirPath / target));
333+ assert (maybeDirFdOwned.get () == dirFd); /* See the comment above. */
334+ bindMount (sourceFd.get (), dirFd);
229335 } else if (S_ISLNK (st.st_mode )) {
230- // Symlinks can (apparently) not be bind-mounted, so just copy it
231- createDirs (target.parent_path ());
232- copyFile (source, target, false );
336+ /* Symlinks can't be bind-mounted, so copy the contents.
337+ The kernel implies AT_EMPTY_PATH for readlinkat
338+ since https://github.com/torvalds/linux/commit/65cfc6722361 (v2.6.39).
339+ See also: https://github.com/cyphar/libpathrs/issues/18 */
340+ auto symlinkTarget = readLinkAt (sourceFd.get (), CanonPath::root);
341+ auto [maybeParentFdOwned, parentFd] = createDirsAndOpen (target.parent_path ());
342+ if (::symlinkat (symlinkTarget.data (), parentFd, target.filename ().c_str ()) == -1 )
343+ throw SysError (" creating symlink %s" , PathFmt (chrootRootDirPath / target));
233344 } else {
234- createDirs (target.parent_path ());
235- writeFile (target, " " );
236- bindMount ();
345+ auto [maybeParentFdOwned, parentFd] = createDirsAndOpen (target.parent_path ());
346+ /* Strictly speaking, O_NOFOLLOW is redundant because O_CREAT | O_EXCL would never follow links anyway. */
347+ AutoCloseFD destFd =
348+ ::openat (parentFd, target.filename().c_str(), O_CREAT | O_EXCL | O_WRONLY | O_NOFOLLOW | O_CLOEXEC, 0666);
349+ if (!destFd)
350+ throw SysError (" creating regular file %s" , PathFmt (chrootRootDirPath / target));
351+ bindMount (sourceFd.get (), destFd.get ());
237352 }
238353}
239354
@@ -647,6 +762,11 @@ void ChrootLinuxDerivationBuilder::enterChroot()
647762 pathsInChroot.emplace (i, canonicalPath);
648763 }
649764
765+ /* We really do have to reopen the dirfd in the child, otherwise move_mount dies with EINVAL. */
766+ AutoCloseFD chrootRootDirFd = openDirectory (chrootRootDir, FinalSymlink::DontFollow);
767+ if (!chrootRootDirFd)
768+ throw SysError (" opening directory %s" , PathFmt (chrootRootDir));
769+
650770 /* Bind-mount all the directories from the "host"
651771 filesystem that we want in the chroot
652772 environment. */
@@ -666,7 +786,7 @@ void ChrootLinuxDerivationBuilder::enterChroot()
666786 } else
667787#endif
668788 {
669- doBind (i.second .source , chrootRootDir / i.first .relative_path (), i.second .optional );
789+ doBind (i.second .source , chrootRootDirFd. get (), i.first .relative_path (), chrootRootDir , i.second .optional );
670790 }
671791 }
672792
@@ -709,8 +829,8 @@ void ChrootLinuxDerivationBuilder::enterChroot()
709829 } else {
710830 if (errno != EINVAL)
711831 throw SysError (" mounting /dev/pts" );
712- doBind (" /dev/pts" , chrootRootDir / " dev" / " pts" );
713- doBind (" /dev/ptmx" , chrootRootDir / " dev" / " ptmx " );
832+ doBind (" /dev/pts" , chrootRootDirFd. get (), " dev/ pts" , chrootRootDir );
833+ doBind (" /dev/ptmx" , chrootRootDirFd. get (), " dev/pts " , chrootRootDir );
714834 }
715835 }
716836
@@ -796,7 +916,8 @@ void ChrootLinuxDerivationBuilder::killSandbox(bool getStats)
796916
797917void ChrootLinuxDerivationBuilder::addDependencyImpl (const StorePath & path)
798918{
799- auto [source, target] = ChrootDerivationBuilder::addDependencyPrep (path);
919+ auto [source, targetRelPath] = ChrootDerivationBuilder::addDependencyPrep (path);
920+ assert (targetRelPath.is_relative ()); /* The path is relative to the chroot. */
800921
801922 /* Bind-mount the path into the sandbox. This requires
802923 entering its mount namespace, which is not possible
@@ -809,7 +930,11 @@ void ChrootLinuxDerivationBuilder::addDependencyImpl(const StorePath & path)
809930 if (setns (sandboxMountNamespace.get (), CLONE_NEWNS) == -1 )
810931 throw SysError (" entering sandbox mount namespace" );
811932
812- doBind (source, target);
933+ /* We really do have to reopen the dirfd in the child, otherwise move_mount dies with EINVAL. */
934+ AutoCloseFD chrootRootDirFd = openDirectory (chrootRootDir, FinalSymlink::DontFollow);
935+ if (!chrootRootDirFd)
936+ throw SysError (" opening directory %s" , PathFmt (chrootRootDir));
937+ doBind (source, chrootRootDirFd.get (), targetRelPath, chrootRootDir);
813938
814939 _exit (0 );
815940 }));
0 commit comments