diff --git a/libcontainer/container_linux.go b/libcontainer/container_linux.go index 7eb3a99a9d6..0c07ae6c875 100644 --- a/libcontainer/container_linux.go +++ b/libcontainer/container_linux.go @@ -1276,8 +1276,7 @@ func (c *linuxContainer) restoreNetwork(req *criurpc.CriuReq, criuOpts *CriuOpts // restore using CRIU. This function is inspired from the code in // rootfs_linux.go func (c *linuxContainer) makeCriuRestoreMountpoints(m *configs.Mount) error { - switch m.Device { - case "cgroup": + if m.Device == "cgroup" { // No mount point(s) need to be created: // // * for v1, mount points are saved by CRIU because @@ -1286,26 +1285,11 @@ func (c *linuxContainer) makeCriuRestoreMountpoints(m *configs.Mount) error { // * for v2, /sys/fs/cgroup is a real mount, but // the mountpoint appears as soon as /sys is mounted return nil - case "bind": - // The prepareBindMount() function checks if source - // exists. So it cannot be used for other filesystem types. - // TODO: pass something else than nil? Not sure if criu is - // impacted by issue #2484 - if err := prepareBindMount(m, c.config.Rootfs, nil); err != nil { - return err - } - default: - // for all other filesystems just create the mountpoints - dest, err := securejoin.SecureJoin(c.config.Rootfs, m.Destination) - if err != nil { - return err - } - if err := checkProcMount(c.config.Rootfs, dest, m, ""); err != nil { - return err - } - if err := os.MkdirAll(dest, 0o755); err != nil { - return err - } + } + // TODO: pass something else than nil? Not sure if criu is + // impacted by issue #2484 + if _, err := createMountpoint(c.config.Rootfs, m, nil, ""); err != nil { + return fmt.Errorf("create criu restore mount for %s mount: %w", m.Destination, err) } return nil } diff --git a/libcontainer/rootfs_linux.go b/libcontainer/rootfs_linux.go index f66267307e5..78b6998c38f 100644 --- a/libcontainer/rootfs_linux.go +++ b/libcontainer/rootfs_linux.go @@ -224,36 +224,6 @@ func mountCmd(cmd configs.Command) error { return nil } -func prepareBindMount(m *configs.Mount, rootfs string, mountFd *int) error { - source := m.Source - if mountFd != nil { - source = "/proc/self/fd/" + strconv.Itoa(*mountFd) - } - - stat, err := os.Stat(source) - if err != nil { - // error out if the source of a bind mount does not exist as we will be - // unable to bind anything to it. - return err - } - // ensure that the destination of the bind mount is resolved of symlinks at mount time because - // any previous mounts can invalidate the next mount's destination. - // this can happen when a user specifies mounts within other mounts to cause breakouts or other - // evil stuff to try to escape the container's rootfs. - var dest string - if dest, err = securejoin.SecureJoin(rootfs, m.Destination); err != nil { - return err - } - if err := checkProcMount(rootfs, dest, m, source); err != nil { - return err - } - if err := createIfNotExists(dest, stat.IsDir()); err != nil { - return err - } - - return nil -} - func mountCgroupV1(m *configs.Mount, c *mountConfig) error { binds, err := getCgroupMounts(m) if err != nil { @@ -282,7 +252,8 @@ func mountCgroupV1(m *configs.Mount, c *mountConfig) error { for _, b := range binds { if c.cgroupns { subsystemPath := filepath.Join(c.root, b.Destination) - if err := os.MkdirAll(subsystemPath, 0o755); err != nil { + subsystemName := filepath.Base(b.Destination) + if err := utils.MkdirAllInRoot(c.root, subsystemPath, 0o755); err != nil { return err } if err := utils.WithProcfd(c.root, b.Destination, func(procfd string) error { @@ -292,7 +263,7 @@ func mountCgroupV1(m *configs.Mount, c *mountConfig) error { } var ( source = "cgroup" - data = filepath.Base(subsystemPath) + data = subsystemName ) if data == "systemd" { data = cgroups.CgroupNamePrefix + data @@ -322,14 +293,7 @@ func mountCgroupV1(m *configs.Mount, c *mountConfig) error { } func mountCgroupV2(m *configs.Mount, c *mountConfig) error { - dest, err := securejoin.SecureJoin(c.root, m.Destination) - if err != nil { - return err - } - if err := os.MkdirAll(dest, 0o755); err != nil { - return err - } - err = utils.WithProcfd(c.root, m.Destination, func(procfd string) error { + err := utils.WithProcfd(c.root, m.Destination, func(procfd string) error { return mount(m.Source, m.Destination, procfd, "cgroup2", uintptr(m.Flags), m.Data) }) if err == nil || !(errors.Is(err, unix.EPERM) || errors.Is(err, unix.EBUSY)) { @@ -411,6 +375,81 @@ func doTmpfsCopyUp(m *configs.Mount, rootfs, mountLabel string) (Err error) { }) } +var errRootfsToFile = errors.New("config tries to change rootfs to file") + +func createMountpoint(rootfs string, m *configs.Mount, mountFd *int, source string) (string, error) { + dest, err := securejoin.SecureJoin(rootfs, m.Destination) + if err != nil { + return "", err + } + if err := checkProcMount(rootfs, dest, m, source); err != nil { + return "", fmt.Errorf("check proc-safety of %s mount: %w", m.Destination, err) + } + + switch m.Device { + case "bind": + source := m.Source + if mountFd != nil { + source = "/proc/self/fd/" + strconv.Itoa(*mountFd) + } + + fi, err := os.Stat(source) + if err != nil { + // Error out if the source of a bind mount does not exist as we + // will be unable to bind anything to it. + return "", fmt.Errorf("bind mount source stat: %w", err) + } + // If the original source is not a directory, make the target a file. + if !fi.IsDir() { + // Make sure we aren't tricked into trying to make the root a file. + if rootfs == dest { + return "", fmt.Errorf("%w: file bind mount over rootfs", errRootfsToFile) + } + // Make the parent directory. + destDir, destBase := filepath.Split(dest) + destDirFd, err := utils.MkdirAllInRootOpen(rootfs, destDir, 0o755) + if err != nil { + return "", fmt.Errorf("make parent dir of file bind-mount: %w", err) + } + defer destDirFd.Close() + // Make the target file. We want to avoid opening any file that is + // already there because it could be a "bad" file like an invalid + // device or hung tty that might cause a DoS, so we use mknodat. + // destBase does not contain any "/" components, and mknodat does + // not follow trailing symlinks, so we can safely just call mknodat + // here. + if err := unix.Mknodat(int(destDirFd.Fd()), destBase, unix.S_IFREG|0o644, 0); err != nil { + // If we get EEXIST, there was already an inode there and + // we can consider that a success. + if !errors.Is(err, unix.EEXIST) { + err = &os.PathError{Op: "mknod regular file", Path: dest, Err: err} + return "", fmt.Errorf("create target of file bind-mount: %w", err) + } + } + // Nothing left to do. + return dest, nil + } + + case "tmpfs": + // If the original target exists, copy the mode for the tmpfs mount. + if stat, err := os.Stat(dest); err == nil { + dt := fmt.Sprintf("mode=%04o", syscallMode(stat.Mode())) + if m.Data != "" { + dt = dt + "," + m.Data + } + m.Data = dt + + // Nothing left to do. + return dest, nil + } + } + + if err := utils.MkdirAllInRoot(rootfs, dest, 0o755); err != nil { + return "", err + } + return dest, nil +} + func mountToRootfs(m *configs.Mount, c *mountConfig) error { rootfs := c.root @@ -435,53 +474,34 @@ func mountToRootfs(m *configs.Mount, c *mountConfig) error { } else if !fi.IsDir() { return fmt.Errorf("filesystem %q must be mounted on ordinary directory", m.Device) } - if err := os.MkdirAll(dest, 0o755); err != nil { + if err := utils.MkdirAllInRoot(rootfs, dest, 0o755); err != nil { return err } // Selinux kernels do not support labeling of /proc or /sys. return mountPropagate(m, rootfs, "", nil) } - mountLabel := c.label mountFd := c.fd - dest, err := securejoin.SecureJoin(rootfs, m.Destination) + dest, err := createMountpoint(rootfs, m, mountFd, m.Source) if err != nil { - return err + return fmt.Errorf("create mount destination for %s mount: %w", m.Destination, err) } + mountLabel := c.label switch m.Device { case "mqueue": - if err := os.MkdirAll(dest, 0o755); err != nil { - return err - } if err := mountPropagate(m, rootfs, "", nil); err != nil { return err } return label.SetFileLabel(dest, mountLabel) case "tmpfs": - if stat, err := os.Stat(dest); err != nil { - if err := os.MkdirAll(dest, 0o755); err != nil { - return err - } - } else { - dt := fmt.Sprintf("mode=%04o", syscallMode(stat.Mode())) - if m.Data != "" { - dt = dt + "," + m.Data - } - m.Data = dt - } - if m.Extensions&configs.EXT_COPYUP == configs.EXT_COPYUP { err = doTmpfsCopyUp(m, rootfs, mountLabel) } else { err = mountPropagate(m, rootfs, mountLabel, nil) } - return err case "bind": - if err := prepareBindMount(m, rootfs, mountFd); err != nil { - return err - } if err := mountPropagate(m, rootfs, mountLabel, mountFd); err != nil { return err } @@ -509,12 +529,6 @@ func mountToRootfs(m *configs.Mount, c *mountConfig) error { } return mountCgroupV1(m, c) default: - if err := checkProcMount(rootfs, dest, m, m.Source); err != nil { - return err - } - if err := os.MkdirAll(dest, 0o755); err != nil { - return err - } return mountPropagate(m, rootfs, mountLabel, mountFd) } if err := setRecAttr(m, rootfs); err != nil { @@ -745,7 +759,10 @@ func createDeviceNode(rootfs string, node *devices.Device, bind bool) error { if err != nil { return err } - if err := os.MkdirAll(filepath.Dir(dest), 0o755); err != nil { + if dest == rootfs { + return fmt.Errorf("%w: mknod over rootfs", errRootfsToFile) + } + if err := utils.MkdirAllInRoot(rootfs, filepath.Dir(dest), 0o755); err != nil { return err } if bind { @@ -1011,26 +1028,6 @@ func chroot() error { return nil } -// createIfNotExists creates a file or a directory only if it does not already exist. -func createIfNotExists(path string, isDir bool) error { - if _, err := os.Stat(path); err != nil { - if os.IsNotExist(err) { - if isDir { - return os.MkdirAll(path, 0o755) - } - if err := os.MkdirAll(filepath.Dir(path), 0o755); err != nil { - return err - } - f, err := os.OpenFile(path, os.O_CREATE, 0o755) - if err != nil { - return err - } - _ = f.Close() - } - } - return nil -} - // readonlyPath will make a path read only. func readonlyPath(path string) error { if err := mount(path, path, "", "", unix.MS_BIND|unix.MS_REC, ""); err != nil { diff --git a/libcontainer/system/linux.go b/libcontainer/system/linux.go index 16edc6ba6d9..32bab6922bd 100644 --- a/libcontainer/system/linux.go +++ b/libcontainer/system/linux.go @@ -6,6 +6,8 @@ package system import ( "os" "os/exec" + "runtime" + "strings" "unsafe" "golang.org/x/sys/unix" @@ -102,3 +104,42 @@ func GetSubreaper() (int, error) { return int(i), nil } + +func prepareAt(dir *os.File, path string) (int, string) { + if dir == nil { + return unix.AT_FDCWD, path + } + + // Rather than just filepath.Join-ing path here, do it manually so the + // error and handle correctly indicate cases like path=".." as being + // relative to the correct directory. The handle.Name() might end up being + // wrong but because this is (currently) only used in MkdirAllInRoot, that + // isn't a problem. + dirName := dir.Name() + if !strings.HasSuffix(dirName, "/") { + dirName += "/" + } + fullPath := dirName + path + + return int(dir.Fd()), fullPath +} + +func Openat(dir *os.File, path string, flags int, mode uint32) (*os.File, error) { + dirFd, fullPath := prepareAt(dir, path) + fd, err := unix.Openat(dirFd, path, flags, mode) + if err != nil { + return nil, &os.PathError{Op: "openat", Path: fullPath, Err: err} + } + runtime.KeepAlive(dir) + return os.NewFile(uintptr(fd), fullPath), nil +} + +func Mkdirat(dir *os.File, path string, mode uint32) error { + dirFd, fullPath := prepareAt(dir, path) + err := unix.Mkdirat(dirFd, path, mode) + if err != nil { + err = &os.PathError{Op: "mkdirat", Path: fullPath, Err: err} + } + runtime.KeepAlive(dir) + return err +} diff --git a/libcontainer/utils/utils_unix.go b/libcontainer/utils/utils_unix.go index bf3237a2911..460b94cef3f 100644 --- a/libcontainer/utils/utils_unix.go +++ b/libcontainer/utils/utils_unix.go @@ -4,11 +4,17 @@ package utils import ( + "errors" "fmt" "os" + "path/filepath" "strconv" + "strings" _ "unsafe" // for go:linkname + "github.com/opencontainers/runc/libcontainer/system" + + securejoin "github.com/cyphar/filepath-securejoin" "golang.org/x/sys/unix" ) @@ -115,3 +121,126 @@ func NewSockPair(name string) (parent *os.File, child *os.File, err error) { } return os.NewFile(uintptr(fds[1]), name+"-p"), os.NewFile(uintptr(fds[0]), name+"-c"), nil } + +// IsLexicallyInRoot is shorthand for strings.HasPrefix(path+"/", root+"/"), +// but properly handling the case where path or root are "/". +// +// NOTE: The return value only make sense if the path doesn't contain "..". +func IsLexicallyInRoot(root, path string) bool { + if root != "/" { + root += "/" + } + if path != "/" { + path += "/" + } + return strings.HasPrefix(path, root) +} + +// MkdirAllInRootOpen attempts to make +// +// path, _ := securejoin.SecureJoin(root, unsafePath) +// os.MkdirAll(path, mode) +// os.Open(path) +// +// safer against attacks where components in the path are changed between +// SecureJoin returning and MkdirAll (or Open) being called. In particular, we +// try to detect any symlink components in the path while we are doing the +// MkdirAll. +// +// NOTE: Unlike os.MkdirAll, mode is not Go's os.FileMode, it is the unix mode +// (the suid/sgid/sticky bits are not the same as for os.FileMode). +// +// NOTE: If unsafePath is a subpath of root, we assume that you have already +// called SecureJoin and so we use the provided path verbatim without resolving +// any symlinks (this is done in a way that avoids symlink-exchange races). +// This means that the path also must not contain ".." elements, otherwise an +// error will occur. +// +// This is a somewhat less safe alternative to +// , but it should +// detect attempts to trick us into creating directories outside of the root. +// We should migrate to securejoin.MkdirAll once it is merged. +func MkdirAllInRootOpen(root, unsafePath string, mode uint32) (_ *os.File, Err error) { + // If the path is already "within" the root, use it verbatim. + fullPath := unsafePath + if !IsLexicallyInRoot(root, unsafePath) { + var err error + fullPath, err = securejoin.SecureJoin(root, unsafePath) + if err != nil { + return nil, err + } + } + subPath, err := filepath.Rel(root, fullPath) + if err != nil { + return nil, err + } + + // Check for any silly mode bits. + if mode&^0o7777 != 0 { + return nil, fmt.Errorf("tried to include non-mode bits in MkdirAll mode: 0o%.3o", mode) + } + + currentDir, err := os.OpenFile(root, unix.O_DIRECTORY|unix.O_CLOEXEC, 0) + if err != nil { + return nil, fmt.Errorf("open root handle: %w", err) + } + defer func() { + if Err != nil { + currentDir.Close() + } + }() + + for _, part := range strings.Split(subPath, string(filepath.Separator)) { + switch part { + case "", ".": + // Skip over no-op components. + continue + case "..": + return nil, fmt.Errorf("possible breakout detected: found %q component in SecureJoin subpath %s", part, subPath) + } + + nextDir, err := system.Openat(currentDir, part, unix.O_DIRECTORY|unix.O_NOFOLLOW|unix.O_CLOEXEC, 0) + switch { + case err == nil: + // Update the currentDir. + _ = currentDir.Close() + currentDir = nextDir + + case errors.Is(err, unix.ENOTDIR): + // This might be a symlink or some other random file. Either way, + // error out. + return nil, fmt.Errorf("cannot mkdir in %s/%s: %w", currentDir.Name(), part, unix.ENOTDIR) + + case errors.Is(err, os.ErrNotExist): + // Luckily, mkdirat will not follow trailing symlinks, so this is + // safe to do as-is. + if err := system.Mkdirat(currentDir, part, mode); err != nil { + return nil, err + } + // Open the new directory. There is a race here where an attacker + // could swap the directory with a different directory, but + // MkdirAll's fuzzy semantics mean we don't care about that. + nextDir, err := system.Openat(currentDir, part, unix.O_DIRECTORY|unix.O_NOFOLLOW|unix.O_CLOEXEC, 0) + if err != nil { + return nil, fmt.Errorf("open newly created directory: %w", err) + } + // Update the currentDir. + _ = currentDir.Close() + currentDir = nextDir + + default: + return nil, err + } + } + return currentDir, nil +} + +// MkdirAllInRoot is a wrapper around MkdirAllInRootOpen which closes the +// returned handle, for callers that don't need to use it. +func MkdirAllInRoot(root, unsafePath string, mode uint32) error { + f, err := MkdirAllInRootOpen(root, unsafePath, mode) + if err == nil { + _ = f.Close() + } + return err +}