From e81938064402940ca8176d6f3145f65b1d455996 Mon Sep 17 00:00:00 2001 From: zhongjiawei Date: Thu, 1 Feb 2024 18:25:16 +0800 Subject: [PATCH] runc:fix CVE-2024-21626 --- libcontainer/container_linux.go | 50 +- libcontainer/container_linux.go.orig | 1660 ----------------- libcontainer/factory_linux.go | 15 +- libcontainer/init_linux.go | 39 +- libcontainer/process_linux.go | 3 +- libcontainer/setns_init_linux.go | 19 + libcontainer/standard_init_linux.go | 28 +- libcontainer/standard_init_linux.go.orig | 223 --- libcontainer/utils/utils.go | 38 - libcontainer/utils/utils_unix.go | 253 ++- vendor/golang.org/x/sys/unix/flock.go | 5 + .../x/sys/unix/zerrors_linux_amd64.go | 1 + .../x/sys/unix/zerrors_linux_arm64.go | 1 + .../x/sys/unix/zsyscall_linux_amd64.go | 10 + .../x/sys/unix/zsyscall_linux_arm64.go | 10 + .../x/sys/unix/zsysnum_linux_amd64.go | 1 + .../x/sys/unix/zsysnum_linux_arm64.go | 1 + .../x/sys/unix/ztypes_linux_amd64.go | 5 + .../x/sys/unix/ztypes_linux_arm64.go | 5 + 19 files changed, 403 insertions(+), 1964 deletions(-) delete mode 100644 libcontainer/container_linux.go.orig delete mode 100644 libcontainer/standard_init_linux.go.orig diff --git a/libcontainer/container_linux.go b/libcontainer/container_linux.go index a4859ca..c757d71 100644 --- a/libcontainer/container_linux.go +++ b/libcontainer/container_linux.go @@ -1,3 +1,4 @@ +//go:build linux // +build linux package libcontainer @@ -28,6 +29,7 @@ import ( "github.com/opencontainers/runc/libcontainer/utils" "github.com/syndtr/gocapability/capability" "github.com/vishvananda/netlink/nl" + "golang.org/x/sys/unix" ) const stdioFdCount = 3 @@ -321,6 +323,15 @@ func (c *linuxContainer) start(process *Process) error { }() } + // Before starting "runc init", mark all non-stdio open files as O_CLOEXEC + // to make sure we don't leak any files into "runc init". Any files to be + // passed to "runc init" through ExtraFiles will get dup2'd by the Go + // runtime and thus their O_CLOEXEC flag will be cleared. 
This is some + // additional protection against attacks like CVE-2024-21626, by making + // sure we never leak files to "runc init" we didn't intend to. + if err := utils.CloseExecFrom(3); err != nil { + return fmt.Errorf("unable to mark non-stdio fds as cloexec: %w", err) + } if err := parent.start(); err != nil { // terminate the process to ensure that it properly is reaped. if err := parent.terminate(); err != nil { @@ -414,6 +425,23 @@ func (c *linuxContainer) deleteExecFifo() { os.Remove(fifoName) } +// includeExecFifo opens the container's execfifo as a pathfd, so that the +// container cannot access the statedir (and the FIFO itself remains +// un-opened). It then adds the FifoFd to the given exec.Cmd as an inherited +// fd, with _LIBCONTAINER_FIFOFD set to its fd number. +func (c *linuxContainer) includeExecFifo(cmd *exec.Cmd) error { + fifoName := filepath.Join(c.root, execFifoFilename) + fifoFd, err := unix.Open(fifoName, unix.O_PATH|unix.O_CLOEXEC, 0) + if err != nil { + return err + } + + cmd.ExtraFiles = append(cmd.ExtraFiles, os.NewFile(uintptr(fifoFd), fifoName)) + cmd.Env = append(cmd.Env, + fmt.Sprintf("_LIBCONTAINER_FIFOFD=%d", stdioFdCount+len(cmd.ExtraFiles)-1)) + return nil +} + func (c *linuxContainer) newParentProcess(p *Process) (parentProcess, error) { parentPipe, childPipe, err := utils.NewSockPair("init") if err != nil { @@ -430,18 +458,15 @@ func (c *linuxContainer) newParentProcess(p *Process) (parentProcess, error) { return c.newSetnsProcess(p, cmd, parentPipe, childPipe) } - // We only set up rootDir if we're not doing a `runc exec`. The reason for - // this is to avoid cases where a racing, unprivileged process inside the - // container can get access to the statedir file descriptor (which would - // allow for container rootfs escape). - rootDir, err := os.Open(c.root) - if err != nil { - return nil, err + // We only set up fifoFd if we're not doing a `runc exec`. 
The historic + // reason for this is that previously we would pass a dirfd that allowed + // for container rootfs escape (and not doing it in `runc exec` avoided + // that problem), but we no longer do that. However, there's no need to do + // this for `runc exec` so we just keep it this way to be safe. + if err := c.includeExecFifo(cmd); err != nil { + return nil, fmt.Errorf("unable to setup exec fifo: %w", err) } - cmd.ExtraFiles = append(cmd.ExtraFiles, rootDir) - cmd.Env = append(cmd.Env, - fmt.Sprintf("_LIBCONTAINER_STATEDIR=%d", stdioFdCount+len(cmd.ExtraFiles)-1)) - return c.newInitProcess(p, cmd, parentPipe, childPipe, rootDir) + return c.newInitProcess(p, cmd, parentPipe, childPipe) } func (c *linuxContainer) commandTemplate(p *Process, childPipe *os.File) (*exec.Cmd, error) { @@ -479,7 +504,7 @@ func (c *linuxContainer) commandTemplate(p *Process, childPipe *os.File) (*exec. return cmd, nil } -func (c *linuxContainer) newInitProcess(p *Process, cmd *exec.Cmd, parentPipe, childPipe, rootDir *os.File) (*initProcess, error) { +func (c *linuxContainer) newInitProcess(p *Process, cmd *exec.Cmd, parentPipe, childPipe *os.File) (*initProcess, error) { cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initStandard)) nsMaps := make(map[configs.NamespaceType]string) for _, ns := range c.config.Namespaces { @@ -501,7 +526,6 @@ func (c *linuxContainer) newInitProcess(p *Process, cmd *exec.Cmd, parentPipe, c process: p, bootstrapData: data, sharePidns: !c.config.Namespaces.IsPrivate(configs.NEWPID), - rootDir: rootDir, }, nil } diff --git a/libcontainer/container_linux.go.orig b/libcontainer/container_linux.go.orig deleted file mode 100644 index d678407..0000000 --- a/libcontainer/container_linux.go.orig +++ /dev/null @@ -1,1660 +0,0 @@ -// +build linux - -package libcontainer - -import ( - "bytes" - "encoding/json" - "errors" - "fmt" - "io" - "io/ioutil" - "os" - "os/exec" - "path/filepath" - "reflect" - "strings" - "sync" - "syscall" - "time" - - 
"github.com/Sirupsen/logrus" - "github.com/golang/protobuf/proto" - "github.com/opencontainers/runc/libcontainer/cgroups" - "github.com/opencontainers/runc/libcontainer/configs" - "github.com/opencontainers/runc/libcontainer/criurpc" - "github.com/opencontainers/runc/libcontainer/logs" - "github.com/opencontainers/runc/libcontainer/system" - "github.com/opencontainers/runc/libcontainer/utils" - "github.com/syndtr/gocapability/capability" - "github.com/vishvananda/netlink/nl" -) - -const stdioFdCount = 3 - -type linuxContainer struct { - id string - root string - config *configs.Config - cgroupManager cgroups.Manager - initArgs []string - initProcess parentProcess - initProcessStartTime string - criuPath string - m sync.Mutex - criuVersion int - state containerState - created time.Time -} - -// State represents a running container's state -type State struct { - BaseState - - // Platform specific fields below here - - // Specifies if the container was started under the rootless mode. - Rootless bool `json:"rootless"` - - // Path to all the cgroups setup for a container. Key is cgroup subsystem name - // with the value as the path. - CgroupPaths map[string]string `json:"cgroup_paths"` - - // NamespacePaths are filepaths to the container's namespaces. Key is the namespace type - // with the value as the path. - NamespacePaths map[configs.NamespaceType]string `json:"namespace_paths"` - - // Container's standard descriptors (std{in,out,err}), needed for checkpoint and restore - ExternalDescriptors []string `json:"external_descriptors,omitempty"` -} - -// CompatState -type CompatState struct { - State - Config configs.CompatConfig `json:"config"` -} - -// Container is a libcontainer container object. -// -// Each container is thread-safe within the same process. Since a container can -// be destroyed by a separate process, any function may return that the container -// was not found. 
-type Container interface { - BaseContainer - - // Methods below here are platform specific - - // Checkpoint checkpoints the running container's state to disk using the criu(8) utility. - // - // errors: - // Systemerror - System error. - Checkpoint(criuOpts *CriuOpts) error - - // Restore restores the checkpointed container to a running state using the criu(8) utility. - // - // errors: - // Systemerror - System error. - Restore(process *Process, criuOpts *CriuOpts) error - - // If the Container state is RUNNING or CREATED, sets the Container state to PAUSING and pauses - // the execution of any user processes. Asynchronously, when the container finished being paused the - // state is changed to PAUSED. - // If the Container state is PAUSED, do nothing. - // - // errors: - // ContainerNotExists - Container no longer exists, - // ContainerNotRunning - Container not running or created, - // Systemerror - System error. - Pause() error - - // If the Container state is PAUSED, resumes the execution of any user processes in the - // Container before setting the Container state to RUNNING. - // If the Container state is RUNNING, do nothing. - // - // errors: - // ContainerNotExists - Container no longer exists, - // ContainerNotPaused - Container is not paused, - // Systemerror - System error. - Resume() error - - // NotifyOOM returns a read-only channel signaling when the container receives an OOM notification. - // - // errors: - // Systemerror - System error. - NotifyOOM() (<-chan struct{}, error) - - // NotifyMemoryPressure returns a read-only channel signaling when the container reaches a given pressure level - // - // errors: - // Systemerror - System error. 
- NotifyMemoryPressure(level PressureLevel) (<-chan struct{}, error) -} - -// ID returns the container's unique ID -func (c *linuxContainer) ID() string { - return c.id -} - -// Config returns the container's configuration -func (c *linuxContainer) Config() configs.Config { - return *c.config -} - -func (c *linuxContainer) Status() (Status, error) { - c.m.Lock() - defer c.m.Unlock() - return c.currentStatus() -} - -func (c *linuxContainer) State() (*State, error) { - c.m.Lock() - defer c.m.Unlock() - return c.currentState() -} - -func (c *linuxContainer) Processes() ([]int, error) { - pids, err := c.cgroupManager.GetAllPids() - if err != nil { - return nil, newSystemErrorWithCause(err, "getting all container pids from cgroups") - } - return pids, nil -} - -func (c *linuxContainer) Stats() (*Stats, error) { - var ( - err error - stats = &Stats{} - ) - if stats.CgroupStats, err = c.cgroupManager.GetStats(); err != nil { - return stats, newSystemErrorWithCause(err, "getting container stats from cgroups") - } - for _, iface := range c.config.Networks { - switch iface.Type { - case "veth": - istats, err := getNetworkInterfaceStats(iface.HostInterfaceName) - if err != nil { - return stats, newSystemErrorWithCausef(err, "getting network stats for interface %q", iface.HostInterfaceName) - } - stats.Interfaces = append(stats.Interfaces, istats) - } - } - return stats, nil -} - -func (c *linuxContainer) Set(config configs.Config) error { - c.m.Lock() - defer c.m.Unlock() - status, err := c.currentStatus() - if err != nil { - return err - } - if status == Stopped { - return newGenericError(fmt.Errorf("container not running"), ContainerNotRunning) - } - c.config = &config - return c.cgroupManager.Set(c.config) -} - -func (c *linuxContainer) Start(process *Process) error { - c.m.Lock() - defer c.m.Unlock() - if process.Init { - if err := c.createExecFifo(); err != nil { - return err - } - } - if err := c.start(process); err != nil { - if process.Init { - c.deleteExecFifo() - } 
- return err - } - return nil -} - -func (c *linuxContainer) Run(process *Process) error { - if err := c.Start(process); err != nil { - return err - } - if process.Init { - return c.exec() - } - return nil -} - -func (c *linuxContainer) Exec() error { - c.m.Lock() - defer c.m.Unlock() - return c.exec() -} - -func (c *linuxContainer) exec() error { - path := filepath.Join(c.root, execFifoFilename) - - fifoOpen := make(chan struct{}) - select { - case <-awaitProcessExit(c.initProcess.pid(), fifoOpen): - return errors.New("container process is already dead") - case result := <-awaitFifoOpen(path, fifoOpen): - if result.err != nil { - return result.err - } - f := result.file - defer f.Close() - if err := readFromExecFifo(f); err != nil { - return err - } - if err := os.Remove(path); !os.IsNotExist(err) { - return err - } - return nil - } -} - -func readFromExecFifo(execFifo io.Reader) error { - data, err := ioutil.ReadAll(execFifo) - if err != nil { - return err - } - if len(data) <= 0 { - return fmt.Errorf("cannot start an already running container") - } - return nil -} - -func awaitProcessExit(pid int, exit <-chan struct{}) <-chan struct{} { - isDead := make(chan struct{}) - go func() { - for { - select { - case <-exit: - return - case <-time.After(time.Millisecond * 100): - stat, err := system.GetProcessState(pid) - if err != nil || stat == system.Zombie { - select { - case <-exit: - return - default: - close(isDead) - } - return - } - } - } - }() - return isDead -} - -func awaitFifoOpen(path string, fifoOpen chan struct{}) <-chan openResult { - fifoOpened := make(chan openResult) - go func() { - f, err := os.OpenFile(path, os.O_RDONLY, 0) - close(fifoOpen) - if err != nil { - fifoOpened <- openResult{err: newSystemErrorWithCause(err, "open exec fifo for reading")} - return - } - fifoOpened <- openResult{file: f} - }() - return fifoOpened -} - -type openResult struct { - file *os.File - err error -} - -func (c *linuxContainer) start(process *Process) error { - 
parent, err := c.newParentProcess(process) - if err != nil { - return newSystemErrorWithCause(err, "creating new parent process") - } - - if logsDone := logs.ForwardLogs(); logsDone != nil { - defer func() { - select { - case <-logsDone: - case <-time.After(3 * time.Second): - logrus.Warnf("wait child close logfd timeout") - } - }() - } - - if err := parent.start(); err != nil { - // terminate the process to ensure that it properly is reaped. - if err := parent.terminate(); err != nil { - logrus.Warnf("parent process terminate error: %v", err) - } - return newSystemErrorWithCause(err, "starting container process") - } - // generate a timestamp indicating when the container was started - c.created = time.Now().UTC() - if process.Init { - c.state = &createdState{ - c: c, - } - state, err := c.updateState(parent) - if err != nil { - return err - } - c.initProcessStartTime = state.InitProcessStartTime - - if c.config.Hooks != nil { - s := configs.HookState{ - SpecState: configs.SpecState{ - Version: c.config.Version, - ID: c.id, - Pid: parent.pid(), - Bundle: utils.SearchLabels(c.config.Labels, "bundle"), - }, - Root: c.config.Rootfs, - } - for i, hook := range c.config.Hooks.Poststart { - logrus.Infof("run poststart hook %d:%s, ContainerID: %s", i, hook.Info(), s.ID) - if err := hook.Run(s); err != nil { - logrus.Warnf("running poststart hook %d:%s failed: %s, ContainerId: %s", i, hook.Info(), err, s.ID) - } - } - } - } else { - c.state = &runningState{ - c: c, - } - } - return nil -} - -func (c *linuxContainer) Signal(s os.Signal, all bool) error { - if all { - return signalAllProcesses(c.cgroupManager, s) - } - status, err := c.currentStatus() - if err != nil { - return err - } - // to avoid a PID reuse attack - if status == Running || status == Created { - if err := c.initProcess.signal(s); err != nil { - return newSystemErrorWithCause(err, "signaling init process") - } - return nil - } - return newGenericError(fmt.Errorf("container not running"), 
ContainerNotRunning) -} - -func (c *linuxContainer) createExecFifo() error { - rootuid, err := c.Config().HostRootUID() - if err != nil { - return err - } - rootgid, err := c.Config().HostRootGID() - if err != nil { - return err - } - - fifoName := filepath.Join(c.root, execFifoFilename) - if _, err := os.Stat(fifoName); err == nil { - return fmt.Errorf("exec fifo %s already exists", fifoName) - } - oldMask := syscall.Umask(0000) - if err := syscall.Mkfifo(fifoName, 0622); err != nil { - syscall.Umask(oldMask) - return err - } - syscall.Umask(oldMask) - if err := os.Chown(fifoName, rootuid, rootgid); err != nil { - return err - } - return nil -} - -func (c *linuxContainer) deleteExecFifo() { - fifoName := filepath.Join(c.root, execFifoFilename) - os.Remove(fifoName) -} - -func (c *linuxContainer) newParentProcess(p *Process) (parentProcess, error) { - parentPipe, childPipe, err := utils.NewSockPair("init") - if err != nil { - return nil, newSystemErrorWithCause(err, "creating new init pipe") - } - if err := logs.InitLogPipe(); err != nil { - return nil, fmt.Errorf("Unable to create the log pipe: %s", err) - } - cmd, err := c.commandTemplate(p, childPipe) - if err != nil { - return nil, newSystemErrorWithCause(err, "creating new command template") - } - if !p.Init { - return c.newSetnsProcess(p, cmd, parentPipe, childPipe) - } - - // We only set up rootDir if we're not doing a `runc exec`. The reason for - // this is to avoid cases where a racing, unprivileged process inside the - // container can get access to the statedir file descriptor (which would - // allow for container rootfs escape). 
- rootDir, err := os.Open(c.root) - if err != nil { - return nil, err - } - cmd.ExtraFiles = append(cmd.ExtraFiles, rootDir) - cmd.Env = append(cmd.Env, - fmt.Sprintf("_LIBCONTAINER_STATEDIR=%d", stdioFdCount+len(cmd.ExtraFiles)-1)) - return c.newInitProcess(p, cmd, parentPipe, childPipe, rootDir) -} - -func (c *linuxContainer) commandTemplate(p *Process, childPipe *os.File) (*exec.Cmd, error) { - cmd := exec.Command(c.initArgs[0], c.initArgs[1:]...) - cmd.Stdin = p.Stdin - cmd.Stdout = p.Stdout - cmd.Stderr = p.Stderr - cmd.Dir = c.config.Rootfs - if cmd.SysProcAttr == nil { - cmd.SysProcAttr = &syscall.SysProcAttr{} - } - cmd.ExtraFiles = append(cmd.ExtraFiles, p.ExtraFiles...) - if p.ConsoleSocket != nil { - cmd.ExtraFiles = append(cmd.ExtraFiles, p.ConsoleSocket) - cmd.Env = append(cmd.Env, - fmt.Sprintf("_LIBCONTAINER_CONSOLE=%d", stdioFdCount+len(cmd.ExtraFiles)-1), - ) - } - cmd.ExtraFiles = append(cmd.ExtraFiles, childPipe) - cmd.Env = append(cmd.Env, - fmt.Sprintf("_LIBCONTAINER_INITPIPE=%d", stdioFdCount+len(cmd.ExtraFiles)-1), - ) - - cmd.ExtraFiles = append(cmd.ExtraFiles, logs.ChildLogPipe) - cmd.Env = append(cmd.Env, - fmt.Sprintf("_LIBCONTAINER_LOGPIPE=%d", stdioFdCount+len(cmd.ExtraFiles)-1), - ) - - // NOTE: when running a container with no PID namespace and the parent process spawning the container is - // PID1 the pdeathsig is being delivered to the container's init process by the kernel for some reason - // even with the parent still running. 
- if c.config.ParentDeathSignal > 0 { - cmd.SysProcAttr.Pdeathsig = syscall.Signal(c.config.ParentDeathSignal) - } - return cmd, nil -} - -func (c *linuxContainer) newInitProcess(p *Process, cmd *exec.Cmd, parentPipe, childPipe, rootDir *os.File) (*initProcess, error) { - cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initStandard)) - nsMaps := make(map[configs.NamespaceType]string) - for _, ns := range c.config.Namespaces { - if ns.Path != "" { - nsMaps[ns.Type] = ns.Path - } - } - data, err := c.bootstrapData(c.config.Namespaces.CloneFlags(), nsMaps) - if err != nil { - return nil, err - } - return &initProcess{ - cmd: cmd, - childPipe: childPipe, - parentPipe: parentPipe, - manager: c.cgroupManager, - config: c.newInitConfig(p), - container: c, - process: p, - bootstrapData: data, - sharePidns: !c.config.Namespaces.IsPrivate(configs.NEWPID), - rootDir: rootDir, - }, nil -} - -func (c *linuxContainer) newSetnsProcess(p *Process, cmd *exec.Cmd, parentPipe, childPipe *os.File) (*setnsProcess, error) { - cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initSetns)) - state, err := c.currentState() - if err != nil { - return nil, newSystemErrorWithCause(err, "getting container's current state") - } - // for setns process, we don't have to set cloneflags as the process namespaces - // will only be set via setns syscall - data, err := c.bootstrapData(0, state.NamespacePaths) - if err != nil { - return nil, err - } - return &setnsProcess{ - cmd: cmd, - cgroupPaths: c.cgroupManager.GetPaths(), - childPipe: childPipe, - parentPipe: parentPipe, - config: c.newInitConfig(p), - process: p, - bootstrapData: data, - }, nil -} - -func (c *linuxContainer) newInitConfig(process *Process) *initConfig { - cfg := &initConfig{ - Config: c.config, - Args: process.Args, - Env: process.Env, - User: process.User, - AdditionalGroups: process.AdditionalGroups, - Cwd: process.Cwd, - Capabilities: process.Capabilities, - PassedFilesCount: len(process.ExtraFiles), - 
ContainerId: c.ID(), - NoNewPrivileges: c.config.NoNewPrivileges, - Rootless: c.config.Rootless, - AppArmorProfile: c.config.AppArmorProfile, - ProcessLabel: c.config.ProcessLabel, - Rlimits: c.config.Rlimits, - } - if process.NoNewPrivileges != nil { - cfg.NoNewPrivileges = *process.NoNewPrivileges - } - if process.AppArmorProfile != "" { - cfg.AppArmorProfile = process.AppArmorProfile - } - if process.Label != "" { - cfg.ProcessLabel = process.Label - } - if len(process.Rlimits) > 0 { - cfg.Rlimits = process.Rlimits - } - cfg.CreateConsole = process.ConsoleSocket != nil - return cfg -} - -func (c *linuxContainer) Destroy() error { - c.m.Lock() - defer c.m.Unlock() - return c.state.destroy() -} - -func (c *linuxContainer) Pause() error { - c.m.Lock() - defer c.m.Unlock() - status, err := c.currentStatus() - if err != nil { - return err - } - switch status { - case Running, Created: - if err := c.cgroupManager.Freeze(configs.Frozen); err != nil { - return err - } - return c.state.transition(&pausedState{ - c: c, - }) - } - return newGenericError(fmt.Errorf("container not running or created: %s", status), ContainerNotRunning) -} - -func (c *linuxContainer) Resume() error { - c.m.Lock() - defer c.m.Unlock() - status, err := c.currentStatus() - if err != nil { - return err - } - if status != Paused { - return newGenericError(fmt.Errorf("container not paused"), ContainerNotPaused) - } - if err := c.cgroupManager.Freeze(configs.Thawed); err != nil { - return err - } - return c.state.transition(&runningState{ - c: c, - }) -} - -func (c *linuxContainer) NotifyOOM() (<-chan struct{}, error) { - // XXX(cyphar): This requires cgroups. - if c.config.Rootless { - return nil, fmt.Errorf("cannot get OOM notifications from rootless container") - } - return notifyOnOOM(c.cgroupManager.GetPaths()) -} - -func (c *linuxContainer) NotifyMemoryPressure(level PressureLevel) (<-chan struct{}, error) { - // XXX(cyphar): This requires cgroups. 
- if c.config.Rootless { - return nil, fmt.Errorf("cannot get memory pressure notifications from rootless container") - } - return notifyMemoryPressure(c.cgroupManager.GetPaths(), level) -} - -var criuFeatures *criurpc.CriuFeatures - -func (c *linuxContainer) checkCriuFeatures(criuOpts *CriuOpts, rpcOpts *criurpc.CriuOpts, criuFeat *criurpc.CriuFeatures) error { - - var t criurpc.CriuReqType - t = criurpc.CriuReqType_FEATURE_CHECK - - if err := c.checkCriuVersion("1.8"); err != nil { - // Feature checking was introduced with CRIU 1.8. - // Ignore the feature check if an older CRIU version is used - // and just act as before. - // As all automated PR testing is done using CRIU 1.7 this - // code will not be tested by automated PR testing. - return nil - } - - // make sure the features we are looking for are really not from - // some previous check - criuFeatures = nil - - req := &criurpc.CriuReq{ - Type: &t, - // Theoretically this should not be necessary but CRIU - // segfaults if Opts is empty. 
- // Fixed in CRIU 2.12 - Opts: rpcOpts, - Features: criuFeat, - } - - err := c.criuSwrk(nil, req, criuOpts, false) - if err != nil { - logrus.Debugf("%s", err) - return fmt.Errorf("CRIU feature check failed") - } - - logrus.Debugf("Feature check says: %s", criuFeatures) - missingFeatures := false - - if *criuFeat.MemTrack && !*criuFeatures.MemTrack { - missingFeatures = true - logrus.Debugf("CRIU does not support MemTrack") - } - - if missingFeatures { - return fmt.Errorf("CRIU is missing features") - } - - return nil -} - -// checkCriuVersion checks Criu version greater than or equal to minVersion -func (c *linuxContainer) checkCriuVersion(minVersion string) error { - var x, y, z, versionReq int - - _, err := fmt.Sscanf(minVersion, "%d.%d.%d\n", &x, &y, &z) // 1.5.2 - if err != nil { - _, err = fmt.Sscanf(minVersion, "Version: %d.%d\n", &x, &y) // 1.6 - } - versionReq = x*10000 + y*100 + z - - out, err := exec.Command(c.criuPath, "-V").Output() - if err != nil { - return fmt.Errorf("Unable to execute CRIU command: %s", c.criuPath) - } - - x = 0 - y = 0 - z = 0 - if ep := strings.Index(string(out), "-"); ep >= 0 { - // criu Git version format - var version string - if sp := strings.Index(string(out), "GitID"); sp > 0 { - version = string(out)[sp:ep] - } else { - return fmt.Errorf("Unable to parse the CRIU version: %s", c.criuPath) - } - - n, err := fmt.Sscanf(string(version), "GitID: v%d.%d.%d", &x, &y, &z) // 1.5.2 - if err != nil { - n, err = fmt.Sscanf(string(version), "GitID: v%d.%d", &x, &y) // 1.6 - y++ - } else { - z++ - } - if n < 2 || err != nil { - return fmt.Errorf("Unable to parse the CRIU version: %s %d %s", version, n, err) - } - } else { - // criu release version format - n, err := fmt.Sscanf(string(out), "Version: %d.%d.%d\n", &x, &y, &z) // 1.5.2 - if err != nil { - n, err = fmt.Sscanf(string(out), "Version: %d.%d\n", &x, &y) // 1.6 - } - if n < 2 || err != nil { - return fmt.Errorf("Unable to parse the CRIU version: %s %d %s", out, n, err) - } - 
} - - c.criuVersion = x*10000 + y*100 + z - - if c.criuVersion < versionReq { - return fmt.Errorf("CRIU version must be %s or higher", minVersion) - } - - return nil -} - -const descriptorsFilename = "descriptors.json" - -func (c *linuxContainer) addCriuDumpMount(req *criurpc.CriuReq, m *configs.Mount) { - mountDest := m.Destination - if strings.HasPrefix(mountDest, c.config.Rootfs) { - mountDest = mountDest[len(c.config.Rootfs):] - } - - extMnt := &criurpc.ExtMountMap{ - Key: proto.String(mountDest), - Val: proto.String(mountDest), - } - req.Opts.ExtMnt = append(req.Opts.ExtMnt, extMnt) -} - -func (c *linuxContainer) addMaskPaths(req *criurpc.CriuReq) error { - for _, path := range c.config.MaskPaths { - fi, err := os.Stat(fmt.Sprintf("/proc/%d/root/%s", c.initProcess.pid(), path)) - if err != nil { - if os.IsNotExist(err) { - continue - } - return err - } - if fi.IsDir() { - continue - } - - extMnt := &criurpc.ExtMountMap{ - Key: proto.String(path), - Val: proto.String("/dev/null"), - } - req.Opts.ExtMnt = append(req.Opts.ExtMnt, extMnt) - } - - return nil -} - -func (c *linuxContainer) Checkpoint(criuOpts *CriuOpts) error { - c.m.Lock() - defer c.m.Unlock() - - // TODO(avagin): Figure out how to make this work nicely. CRIU 2.0 has - // support for doing unprivileged dumps, but the setup of - // rootless containers might make this complicated. - if c.config.Rootless { - return fmt.Errorf("cannot checkpoint a rootless container") - } - - if err := c.checkCriuVersion("1.5.2"); err != nil { - return err - } - - if criuOpts.ImagesDirectory == "" { - return fmt.Errorf("invalid directory to save checkpoint") - } - - // Since a container can be C/R'ed multiple times, - // the checkpoint directory may already exist. 
- if err := os.Mkdir(criuOpts.ImagesDirectory, 0755); err != nil && !os.IsExist(err) { - return err - } - - if criuOpts.WorkDirectory == "" { - criuOpts.WorkDirectory = filepath.Join(c.root, "criu.work") - } - - if err := os.Mkdir(criuOpts.WorkDirectory, 0755); err != nil && !os.IsExist(err) { - return err - } - - workDir, err := os.Open(criuOpts.WorkDirectory) - if err != nil { - return err - } - defer workDir.Close() - - imageDir, err := os.Open(criuOpts.ImagesDirectory) - if err != nil { - return err - } - defer imageDir.Close() - - rpcOpts := criurpc.CriuOpts{ - ImagesDirFd: proto.Int32(int32(imageDir.Fd())), - WorkDirFd: proto.Int32(int32(workDir.Fd())), - LogLevel: proto.Int32(4), - LogFile: proto.String("dump.log"), - Root: proto.String(c.config.Rootfs), - ManageCgroups: proto.Bool(true), - NotifyScripts: proto.Bool(true), - Pid: proto.Int32(int32(c.initProcess.pid())), - ShellJob: proto.Bool(criuOpts.ShellJob), - LeaveRunning: proto.Bool(criuOpts.LeaveRunning), - TcpEstablished: proto.Bool(criuOpts.TcpEstablished), - ExtUnixSk: proto.Bool(criuOpts.ExternalUnixConnections), - FileLocks: proto.Bool(criuOpts.FileLocks), - EmptyNs: proto.Uint32(criuOpts.EmptyNs), - } - - // append optional criu opts, e.g., page-server and port - if criuOpts.PageServer.Address != "" && criuOpts.PageServer.Port != 0 { - rpcOpts.Ps = &criurpc.CriuPageServerInfo{ - Address: proto.String(criuOpts.PageServer.Address), - Port: proto.Int32(criuOpts.PageServer.Port), - } - } - - //pre-dump may need parentImage param to complete iterative migration - if criuOpts.ParentImage != "" { - rpcOpts.ParentImg = proto.String(criuOpts.ParentImage) - rpcOpts.TrackMem = proto.Bool(true) - } - - // append optional manage cgroups mode - if criuOpts.ManageCgroupsMode != 0 { - if err := c.checkCriuVersion("1.7"); err != nil { - return err - } - mode := criurpc.CriuCgMode(criuOpts.ManageCgroupsMode) - rpcOpts.ManageCgroupsMode = &mode - } - - var t criurpc.CriuReqType - if criuOpts.PreDump { - feat := 
criurpc.CriuFeatures{ - MemTrack: proto.Bool(true), - } - - if err := c.checkCriuFeatures(criuOpts, &rpcOpts, &feat); err != nil { - return err - } - - t = criurpc.CriuReqType_PRE_DUMP - } else { - t = criurpc.CriuReqType_DUMP - } - req := &criurpc.CriuReq{ - Type: &t, - Opts: &rpcOpts, - } - - //no need to dump these information in pre-dump - if !criuOpts.PreDump { - for _, m := range c.config.Mounts { - switch m.Device { - case "bind": - c.addCriuDumpMount(req, m) - break - case "cgroup": - binds, err := getCgroupMounts(m) - if err != nil { - return err - } - for _, b := range binds { - c.addCriuDumpMount(req, b) - } - break - } - } - - if err := c.addMaskPaths(req); err != nil { - return err - } - - for _, node := range c.config.Devices { - m := &configs.Mount{Destination: node.Path, Source: node.Path} - c.addCriuDumpMount(req, m) - } - - // Write the FD info to a file in the image directory - fdsJSON, err := json.Marshal(c.initProcess.externalDescriptors()) - if err != nil { - return err - } - - err = ioutil.WriteFile(filepath.Join(criuOpts.ImagesDirectory, descriptorsFilename), fdsJSON, 0655) - if err != nil { - return err - } - } - - err = c.criuSwrk(nil, req, criuOpts, false) - if err != nil { - return err - } - return nil -} - -func (c *linuxContainer) addCriuRestoreMount(req *criurpc.CriuReq, m *configs.Mount) { - mountDest := m.Destination - if strings.HasPrefix(mountDest, c.config.Rootfs) { - mountDest = mountDest[len(c.config.Rootfs):] - } - - extMnt := &criurpc.ExtMountMap{ - Key: proto.String(mountDest), - Val: proto.String(m.Source), - } - req.Opts.ExtMnt = append(req.Opts.ExtMnt, extMnt) -} - -func (c *linuxContainer) restoreNetwork(req *criurpc.CriuReq, criuOpts *CriuOpts) { - for _, iface := range c.config.Networks { - switch iface.Type { - case "veth": - veth := new(criurpc.CriuVethPair) - veth.IfOut = proto.String(iface.HostInterfaceName) - veth.IfIn = proto.String(iface.Name) - req.Opts.Veths = append(req.Opts.Veths, veth) - break - case 
"loopback": - break - } - } - for _, i := range criuOpts.VethPairs { - veth := new(criurpc.CriuVethPair) - veth.IfOut = proto.String(i.HostInterfaceName) - veth.IfIn = proto.String(i.ContainerInterfaceName) - req.Opts.Veths = append(req.Opts.Veths, veth) - } -} - -func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error { - c.m.Lock() - defer c.m.Unlock() - - // TODO(avagin): Figure out how to make this work nicely. CRIU doesn't have - // support for unprivileged restore at the moment. - if c.config.Rootless { - return fmt.Errorf("cannot restore a rootless container") - } - - if err := c.checkCriuVersion("1.5.2"); err != nil { - return err - } - if criuOpts.WorkDirectory == "" { - criuOpts.WorkDirectory = filepath.Join(c.root, "criu.work") - } - // Since a container can be C/R'ed multiple times, - // the work directory may already exist. - if err := os.Mkdir(criuOpts.WorkDirectory, 0655); err != nil && !os.IsExist(err) { - return err - } - workDir, err := os.Open(criuOpts.WorkDirectory) - if err != nil { - return err - } - defer workDir.Close() - if criuOpts.ImagesDirectory == "" { - return fmt.Errorf("invalid directory to restore checkpoint") - } - imageDir, err := os.Open(criuOpts.ImagesDirectory) - if err != nil { - return err - } - defer imageDir.Close() - // CRIU has a few requirements for a root directory: - // * it must be a mount point - // * its parent must not be overmounted - // c.config.Rootfs is bind-mounted to a temporary directory - // to satisfy these requirements. 
- root := filepath.Join(c.root, "criu-root") - if err := os.Mkdir(root, 0755); err != nil { - return err - } - defer os.Remove(root) - root, err = filepath.EvalSymlinks(root) - if err != nil { - return err - } - err = syscall.Mount(c.config.Rootfs, root, "", syscall.MS_BIND|syscall.MS_REC, "") - if err != nil { - return err - } - defer syscall.Unmount(root, syscall.MNT_DETACH) - t := criurpc.CriuReqType_RESTORE - req := &criurpc.CriuReq{ - Type: &t, - Opts: &criurpc.CriuOpts{ - ImagesDirFd: proto.Int32(int32(imageDir.Fd())), - WorkDirFd: proto.Int32(int32(workDir.Fd())), - EvasiveDevices: proto.Bool(true), - LogLevel: proto.Int32(4), - LogFile: proto.String("restore.log"), - RstSibling: proto.Bool(true), - Root: proto.String(root), - ManageCgroups: proto.Bool(true), - NotifyScripts: proto.Bool(true), - ShellJob: proto.Bool(criuOpts.ShellJob), - ExtUnixSk: proto.Bool(criuOpts.ExternalUnixConnections), - TcpEstablished: proto.Bool(criuOpts.TcpEstablished), - FileLocks: proto.Bool(criuOpts.FileLocks), - EmptyNs: proto.Uint32(criuOpts.EmptyNs), - }, - } - - for _, m := range c.config.Mounts { - switch m.Device { - case "bind": - c.addCriuRestoreMount(req, m) - break - case "cgroup": - binds, err := getCgroupMounts(m) - if err != nil { - return err - } - for _, b := range binds { - c.addCriuRestoreMount(req, b) - } - break - } - } - - if len(c.config.MaskPaths) > 0 { - m := &configs.Mount{Destination: "/dev/null", Source: "/dev/null"} - c.addCriuRestoreMount(req, m) - } - - for _, node := range c.config.Devices { - m := &configs.Mount{Destination: node.Path, Source: node.Path} - c.addCriuRestoreMount(req, m) - } - - if criuOpts.EmptyNs&syscall.CLONE_NEWNET == 0 { - c.restoreNetwork(req, criuOpts) - } - - // append optional manage cgroups mode - if criuOpts.ManageCgroupsMode != 0 { - if err := c.checkCriuVersion("1.7"); err != nil { - return err - } - mode := criurpc.CriuCgMode(criuOpts.ManageCgroupsMode) - req.Opts.ManageCgroupsMode = &mode - } - - var ( - fds []string 
- fdJSON []byte - ) - if fdJSON, err = ioutil.ReadFile(filepath.Join(criuOpts.ImagesDirectory, descriptorsFilename)); err != nil { - return err - } - - if err := json.Unmarshal(fdJSON, &fds); err != nil { - return err - } - for i := range fds { - if s := fds[i]; strings.Contains(s, "pipe:") { - inheritFd := new(criurpc.InheritFd) - inheritFd.Key = proto.String(s) - inheritFd.Fd = proto.Int32(int32(i)) - req.Opts.InheritFd = append(req.Opts.InheritFd, inheritFd) - } - } - return c.criuSwrk(process, req, criuOpts, true) -} - -func (c *linuxContainer) criuApplyCgroups(pid int, req *criurpc.CriuReq) error { - // XXX: Do we need to deal with this case? AFAIK criu still requires root. - if err := c.cgroupManager.Apply(pid); err != nil { - return err - } - - if err := c.cgroupManager.Set(c.config); err != nil { - return newSystemError(err) - } - - path := fmt.Sprintf("/proc/%d/cgroup", pid) - cgroupsPaths, err := cgroups.ParseCgroupFile(path) - if err != nil { - return err - } - - for c, p := range cgroupsPaths { - cgroupRoot := &criurpc.CgroupRoot{ - Ctrl: proto.String(c), - Path: proto.String(p), - } - req.Opts.CgRoot = append(req.Opts.CgRoot, cgroupRoot) - } - - return nil -} - -func (c *linuxContainer) criuSwrk(process *Process, req *criurpc.CriuReq, opts *CriuOpts, applyCgroups bool) error { - fds, err := syscall.Socketpair(syscall.AF_LOCAL, syscall.SOCK_SEQPACKET|syscall.SOCK_CLOEXEC, 0) - if err != nil { - return err - } - - logPath := filepath.Join(opts.WorkDirectory, req.GetOpts().GetLogFile()) - criuClient := os.NewFile(uintptr(fds[0]), "criu-transport-client") - criuServer := os.NewFile(uintptr(fds[1]), "criu-transport-server") - defer criuClient.Close() - defer criuServer.Close() - - args := []string{"swrk", "3"} - logrus.Debugf("Using CRIU %d at: %s", c.criuVersion, c.criuPath) - logrus.Debugf("Using CRIU with following args: %s", args) - cmd := exec.Command(c.criuPath, args...) 
- if process != nil { - cmd.Stdin = process.Stdin - cmd.Stdout = process.Stdout - cmd.Stderr = process.Stderr - } - cmd.ExtraFiles = append(cmd.ExtraFiles, criuServer) - - if err := cmd.Start(); err != nil { - return err - } - criuServer.Close() - - defer func() { - criuClient.Close() - _, err := cmd.Process.Wait() - if err != nil { - return - } - }() - - if applyCgroups { - err := c.criuApplyCgroups(cmd.Process.Pid, req) - if err != nil { - return err - } - } - - var extFds []string - if process != nil { - extFds, err = getPipeFds(cmd.Process.Pid) - if err != nil { - return err - } - } - - logrus.Debugf("Using CRIU in %s mode", req.GetType().String()) - // In the case of criurpc.CriuReqType_FEATURE_CHECK req.GetOpts() - // should be empty. For older CRIU versions it still will be - // available but empty. - if req.GetType() != criurpc.CriuReqType_FEATURE_CHECK { - val := reflect.ValueOf(req.GetOpts()) - v := reflect.Indirect(val) - for i := 0; i < v.NumField(); i++ { - st := v.Type() - name := st.Field(i).Name - if strings.HasPrefix(name, "XXX_") { - continue - } - value := val.MethodByName("Get" + name).Call([]reflect.Value{}) - logrus.Debugf("CRIU option %s with value %v", name, value[0]) - } - } - data, err := proto.Marshal(req) - if err != nil { - return err - } - _, err = criuClient.Write(data) - if err != nil { - return err - } - - buf := make([]byte, 10*4096) - for true { - n, err := criuClient.Read(buf) - if err != nil { - return err - } - if n == 0 { - return fmt.Errorf("unexpected EOF") - } - if n == len(buf) { - return fmt.Errorf("buffer is too small") - } - - resp := new(criurpc.CriuResp) - err = proto.Unmarshal(buf[:n], resp) - if err != nil { - return err - } - if !resp.GetSuccess() { - typeString := req.GetType().String() - return fmt.Errorf("criu failed: type %s errno %d\nlog file: %s", typeString, resp.GetCrErrno(), logPath) - } - - t := resp.GetType() - switch { - case t == criurpc.CriuReqType_FEATURE_CHECK: - logrus.Debugf("Feature check says: 
%s", resp) - criuFeatures = resp.GetFeatures() - break - case t == criurpc.CriuReqType_NOTIFY: - if err := c.criuNotifications(resp, process, opts, extFds); err != nil { - return err - } - t = criurpc.CriuReqType_NOTIFY - req = &criurpc.CriuReq{ - Type: &t, - NotifySuccess: proto.Bool(true), - } - data, err = proto.Marshal(req) - if err != nil { - return err - } - _, err = criuClient.Write(data) - if err != nil { - return err - } - continue - case t == criurpc.CriuReqType_RESTORE: - case t == criurpc.CriuReqType_DUMP: - break - case t == criurpc.CriuReqType_PRE_DUMP: - // In pre-dump mode CRIU is in a loop and waits for - // the final DUMP command. - // The current runc pre-dump approach, however, is - // start criu in PRE_DUMP once for a single pre-dump - // and not the whole series of pre-dump, pre-dump, ...m, dump - // If we got the message CriuReqType_PRE_DUMP it means - // CRIU was successful and we need to forcefully stop CRIU - logrus.Debugf("PRE_DUMP finished. Send close signal to CRIU service") - criuClient.Close() - // Process status won't be success, because one end of sockets is closed - _, err := cmd.Process.Wait() - if err != nil { - logrus.Debugf("After PRE_DUMP CRIU exiting failed") - return err - } - return nil - default: - return fmt.Errorf("unable to parse the response %s", resp.String()) - } - - break - } - - // cmd.Wait() waits cmd.goroutines which are used for proxying file descriptors. - // Here we want to wait only the CRIU process. 
- st, err := cmd.Process.Wait() - if err != nil { - return err - } - if !st.Success() { - return fmt.Errorf("criu failed: %s\nlog file: %s", st.String(), logPath) - } - return nil -} - -// block any external network activity -func lockNetwork(config *configs.Config) error { - for _, config := range config.Networks { - strategy, err := getStrategy(config.Type) - if err != nil { - return err - } - - if err := strategy.detach(config); err != nil { - return err - } - } - return nil -} - -func unlockNetwork(config *configs.Config) error { - for _, config := range config.Networks { - strategy, err := getStrategy(config.Type) - if err != nil { - return err - } - if err = strategy.attach(config); err != nil { - return err - } - } - return nil -} - -func (c *linuxContainer) criuNotifications(resp *criurpc.CriuResp, process *Process, opts *CriuOpts, fds []string) error { - notify := resp.GetNotify() - if notify == nil { - return fmt.Errorf("invalid response: %s", resp.String()) - } - switch { - case notify.GetScript() == "post-dump": - f, err := os.Create(filepath.Join(c.root, "checkpoint")) - if err != nil { - return err - } - f.Close() - case notify.GetScript() == "network-unlock": - if err := unlockNetwork(c.config); err != nil { - return err - } - case notify.GetScript() == "network-lock": - if err := lockNetwork(c.config); err != nil { - return err - } - case notify.GetScript() == "setup-namespaces": - if c.config.Hooks != nil { - s := configs.HookState{ - SpecState: configs.SpecState{ - Version: c.config.Version, - ID: c.id, - Pid: int(notify.GetPid()), - Bundle: utils.SearchLabels(c.config.Labels, "bundle"), - }, - Root: c.config.Rootfs, - } - for i, hook := range c.config.Hooks.Prestart { - logrus.Infof("run prestart hook: %d:%s, ContainerID: %s", i, hook.Info(), s.ID) - if err := hook.Run(s); err != nil { - return newSystemErrorWithCausef(err, "running prestart hook %d:%s, ContainerID: %s", i, hook.Info(), s.ID) - } - logrus.Infof("prestart hook: %d:%s done", i, 
hook.Info()) - } - } - case notify.GetScript() == "post-restore": - pid := notify.GetPid() - r, err := newRestoredProcess(int(pid), fds) - if err != nil { - return err - } - process.ops = r - if err := c.state.transition(&restoredState{ - imageDir: opts.ImagesDirectory, - c: c, - }); err != nil { - return err - } - // create a timestamp indicating when the restored checkpoint was started - c.created = time.Now().UTC() - if _, err := c.updateState(r); err != nil { - return err - } - if err := os.Remove(filepath.Join(c.root, "checkpoint")); err != nil { - if !os.IsNotExist(err) { - logrus.Error(err) - } - } - } - return nil -} - -func (c *linuxContainer) updateState(process parentProcess) (*State, error) { - c.initProcess = process - state, err := c.currentState() - if err != nil { - return nil, err - } - err = c.saveState(state) - if err != nil { - return nil, err - } - return state, nil -} - -func (c *linuxContainer) saveState(s *State) error { - f, err := os.Create(filepath.Join(c.root, stateFilename)) - if err != nil { - return err - } - defer f.Close() - return utils.WriteJSON(f, s) -} - -func (c *linuxContainer) deleteState() error { - return os.Remove(filepath.Join(c.root, stateFilename)) -} - -func (c *linuxContainer) currentStatus() (Status, error) { - if err := c.refreshState(); err != nil { - return -1, err - } - return c.state.status(), nil -} - -// refreshState needs to be called to verify that the current state on the -// container is what is true. Because consumers of libcontainer can use it -// out of process we need to verify the container's status based on runtime -// information and not rely on our in process info. 
-func (c *linuxContainer) refreshState() error { - paused, err := c.isPaused() - if err != nil { - return err - } - if paused { - return c.state.transition(&pausedState{c: c}) - } - t, err := c.runType() - if err != nil { - return err - } - switch t { - case Created: - return c.state.transition(&createdState{c: c}) - case Running: - return c.state.transition(&runningState{c: c}) - } - return c.state.transition(&stoppedState{c: c}) -} - -// doesInitProcessExist checks if the init process is still the same process -// as the initial one, it could happen that the original process has exited -// and a new process has been created with the same pid, in this case, the -// container would already be stopped. -func (c *linuxContainer) doesInitProcessExist(initPid int) (bool, error) { - startTime, err := system.GetProcessStartTime(initPid) - if err != nil { - return false, nil - } - if c.initProcessStartTime != startTime { - return false, nil - } - return true, nil -} - -func (c *linuxContainer) runType() (Status, error) { - if c.initProcess == nil { - return Stopped, nil - } - pid := c.initProcess.pid() - // return Running if the init process is alive - if err := syscall.Kill(pid, 0); err != nil { - if err == syscall.ESRCH { - // It means the process does not exist anymore, could happen when the - // process exited just when we call the function, we should not return - // error in this case. - return Stopped, nil - } - return Stopped, newSystemErrorWithCausef(err, "sending signal 0 to pid %d", pid) - } - // check if the process is still the original init process. - exist, err := c.doesInitProcessExist(pid) - if !exist || err != nil { - return Stopped, err - } - // We'll create exec fifo and blocking on it after container is created, - // and delete it after start container. 
- if _, err := os.Stat(filepath.Join(c.root, execFifoFilename)); err == nil { - return Created, nil - } - return Running, nil -} - -func (c *linuxContainer) isPaused() (bool, error) { - fcg := c.cgroupManager.GetPaths()["freezer"] - if fcg == "" { - // A container doesn't have a freezer cgroup - return false, nil - } - data, err := ioutil.ReadFile(filepath.Join(fcg, "freezer.state")) - if err != nil { - // If freezer cgroup is not mounted, the container would just be not paused. - if os.IsNotExist(err) { - return false, nil - } - return false, newSystemErrorWithCause(err, "checking if container is paused") - } - return bytes.Equal(bytes.TrimSpace(data), []byte("FROZEN")), nil -} - -func (c *linuxContainer) currentState() (*State, error) { - var ( - startTime string - externalDescriptors []string - pid = -1 - ) - if c.initProcess != nil { - pid = c.initProcess.pid() - startTime, _ = c.initProcess.startTime() - externalDescriptors = c.initProcess.externalDescriptors() - } - state := &State{ - BaseState: BaseState{ - ID: c.ID(), - Config: *c.config, - InitProcessPid: pid, - InitProcessStartTime: startTime, - Created: c.created, - }, - Rootless: c.config.Rootless, - CgroupPaths: c.cgroupManager.GetPaths(), - NamespacePaths: make(map[configs.NamespaceType]string), - ExternalDescriptors: externalDescriptors, - } - if pid > 0 { - for _, ns := range c.config.Namespaces { - state.NamespacePaths[ns.Type] = ns.GetPath(pid) - } - for _, nsType := range configs.NamespaceTypes() { - if !configs.IsNamespaceSupported(nsType) { - continue - } - if _, ok := state.NamespacePaths[nsType]; !ok { - ns := configs.Namespace{Type: nsType} - state.NamespacePaths[ns.Type] = ns.GetPath(pid) - } - } - } - return state, nil -} - -// orderNamespacePaths sorts namespace paths into a list of paths that we -// can setns in order. 
-func (c *linuxContainer) orderNamespacePaths(namespaces map[configs.NamespaceType]string) ([]string, error) { - paths := []string{} - - for _, ns := range configs.NamespaceTypes() { - - // Remove namespaces that we don't need to join. - if !c.config.Namespaces.Contains(ns) { - continue - } - - if p, ok := namespaces[ns]; ok && p != "" { - // check if the requested namespace is supported - if !configs.IsNamespaceSupported(ns) { - return nil, newSystemError(fmt.Errorf("namespace %s is not supported", ns)) - } - // only set to join this namespace if it exists - if _, err := os.Lstat(p); err != nil { - return nil, newSystemErrorWithCausef(err, "running lstat on namespace path %q", p) - } - // do not allow namespace path with comma as we use it to separate - // the namespace paths - if strings.ContainsRune(p, ',') { - return nil, newSystemError(fmt.Errorf("invalid path %s", p)) - } - paths = append(paths, fmt.Sprintf("%s:%s", configs.NsName(ns), p)) - } - - } - - return paths, nil -} - -func encodeIDMapping(idMap []configs.IDMap) ([]byte, error) { - data := bytes.NewBuffer(nil) - for _, im := range idMap { - line := fmt.Sprintf("%d %d %d\n", im.ContainerID, im.HostID, im.Size) - if _, err := data.WriteString(line); err != nil { - return nil, err - } - } - return data.Bytes(), nil -} - -// bootstrapData encodes the necessary data in netlink binary format -// as a io.Reader. -// Consumer can write the data to a bootstrap program -// such as one that uses nsenter package to bootstrap the container's -// init process correctly, i.e. with correct namespaces, uid/gid -// mapping etc. 
-func (c *linuxContainer) bootstrapData(cloneFlags uintptr, nsMaps map[configs.NamespaceType]string) (io.Reader, error) { - // create the netlink message - r := nl.NewNetlinkRequest(int(InitMsg), 0) - - // write cloneFlags - r.AddData(&Int32msg{ - Type: CloneFlagsAttr, - Value: uint32(cloneFlags), - }) - - // write custom namespace paths - if len(nsMaps) > 0 { - nsPaths, err := c.orderNamespacePaths(nsMaps) - if err != nil { - return nil, err - } - r.AddData(&Bytemsg{ - Type: NsPathsAttr, - Value: []byte(strings.Join(nsPaths, ",")), - }) - } - - // write namespace paths only when we are not joining an existing user ns - _, joinExistingUser := nsMaps[configs.NEWUSER] - if !joinExistingUser { - // write uid mappings - if len(c.config.UidMappings) > 0 { - b, err := encodeIDMapping(c.config.UidMappings) - if err != nil { - return nil, err - } - r.AddData(&Bytemsg{ - Type: UidmapAttr, - Value: b, - }) - } - - // write gid mappings - if len(c.config.GidMappings) > 0 { - b, err := encodeIDMapping(c.config.GidMappings) - if err != nil { - return nil, err - } - r.AddData(&Bytemsg{ - Type: GidmapAttr, - Value: b, - }) - // The following only applies if we are root. 
- if !c.config.Rootless { - // check if we have CAP_SETGID to setgroup properly - pid, err := capability.NewPid(os.Getpid()) - if err != nil { - return nil, err - } - if !pid.Get(capability.EFFECTIVE, capability.CAP_SETGID) { - r.AddData(&Boolmsg{ - Type: SetgroupAttr, - Value: true, - }) - } - } - } - } - - // write oom_score_adj - r.AddData(&Bytemsg{ - Type: OomScoreAdjAttr, - Value: []byte(fmt.Sprintf("%d", c.config.OomScoreAdj)), - }) - - // write rootless - r.AddData(&Boolmsg{ - Type: RootlessAttr, - Value: c.config.Rootless, - }) - - return bytes.NewReader(r.Serialize()), nil -} diff --git a/libcontainer/factory_linux.go b/libcontainer/factory_linux.go index 0b2aa74..15ba017 100644 --- a/libcontainer/factory_linux.go +++ b/libcontainer/factory_linux.go @@ -1,3 +1,4 @@ +//go:build linux // +build linux package libcontainer @@ -245,10 +246,10 @@ func (l *LinuxFactory) Type() string { // This is a low level implementation detail of the reexec and should not be consumed externally func (l *LinuxFactory) StartInitialization() (err error) { var ( - pipefd, rootfd int + pipefd, fifofd int consoleSocket *os.File envInitPipe = os.Getenv("_LIBCONTAINER_INITPIPE") - envStateDir = os.Getenv("_LIBCONTAINER_STATEDIR") + envFifoFd = os.Getenv("_LIBCONTAINER_FIFOFD") envConsole = os.Getenv("_LIBCONTAINER_CONSOLE") ) @@ -264,11 +265,11 @@ func (l *LinuxFactory) StartInitialization() (err error) { ) defer pipe.Close() - // Only init processes have STATEDIR. - rootfd = -1 + // Only init processes have FIFOFD. 
+ fifofd = -1 if it == initStandard { - if rootfd, err = strconv.Atoi(envStateDir); err != nil { - return fmt.Errorf("unable to convert _LIBCONTAINER_STATEDIR=%s to int: %s", envStateDir, err) + if fifofd, err = strconv.Atoi(envFifoFd); err != nil { + return fmt.Errorf("unable to convert _LIBCONTAINER_FIFOFD=%s to int: %s", envFifoFd, err) } } @@ -309,7 +310,7 @@ func (l *LinuxFactory) StartInitialization() (err error) { } }() - i, err := newContainerInit(it, pipe, consoleSocket, rootfd, logPipeFd) + i, err := newContainerInit(it, pipe, consoleSocket, fifofd, logPipeFd) if err != nil { return err } diff --git a/libcontainer/init_linux.go b/libcontainer/init_linux.go index e9a83e9..fd417ca 100644 --- a/libcontainer/init_linux.go +++ b/libcontainer/init_linux.go @@ -1,18 +1,23 @@ +//go:build linux // +build linux package libcontainer import ( "encoding/json" + "errors" "fmt" "io" "net" "os" + "path/filepath" "strings" "syscall" "unsafe" "github.com/Sirupsen/logrus" + "golang.org/x/sys/unix" + "github.com/opencontainers/runc/libcontainer/cgroups" "github.com/opencontainers/runc/libcontainer/configs" "github.com/opencontainers/runc/libcontainer/system" @@ -66,7 +71,7 @@ type initer interface { Init() error } -func newContainerInit(t initType, pipe *os.File, consoleSocket *os.File, stateDirFD, logFd int) (initer, error) { +func newContainerInit(t initType, pipe *os.File, consoleSocket *os.File, fifoFd, logFd int) (initer, error) { var config *initConfig if err := json.NewDecoder(pipe).Decode(&config); err != nil { return nil, err @@ -89,7 +94,7 @@ func newContainerInit(t initType, pipe *os.File, consoleSocket *os.File, stateDi consoleSocket: consoleSocket, parentPid: syscall.Getppid(), config: config, - stateDirFD: stateDirFD, + fifoFd: fifoFd, logFd: logFd, }, nil } @@ -111,6 +116,32 @@ func populateProcessEnvironment(env []string) error { return nil } +// verifyCwd ensures that the current directory is actually inside the mount +// namespace root of the current 
process. +func verifyCwd() error { + // getcwd(2) on Linux detects if cwd is outside of the rootfs of the + // current mount namespace root, and in that case prefixes "(unreachable)" + // to the returned string. glibc's getcwd(3) and Go's Getwd() both detect + // when this happens and return ENOENT rather than returning a non-absolute + // path. In both cases we can therefore easily detect if we have an invalid + // cwd by checking the return value of getcwd(3). See getcwd(3) for more + // details, and CVE-2024-21626 for the security issue that motivated this + // check. + // + // We have to use unix.Getwd() here because os.Getwd() has a workaround for + // $PWD which involves doing stat(.), which can fail if the current + // directory is inaccessible to the container process. + if wd, err := unix.Getwd(); errors.Is(err, unix.ENOENT) { + return errors.New("current working directory is outside of container mount namespace root -- possible container breakout detected") + } else if err != nil { + return fmt.Errorf("failed to verify if current working directory is safe: %w", err) + } else if !filepath.IsAbs(wd) { + // We shouldn't ever hit this, but check just in case. + return fmt.Errorf("current working directory is not absolute -- possible container breakout detected: cwd is %q", wd) + } + return nil +} + // finalizeNamespace drops the caps, sets the correct user // and working dir, and closes any leaked file descriptors // before executing the command inside the namespace @@ -148,6 +179,10 @@ func finalizeNamespace(config *initConfig) error { if err := setupUser(config); err != nil { return err } + // Make sure our final working directory is inside the container. 
+ if err := verifyCwd(); err != nil { + return err + } if err := system.ClearKeepCaps(); err != nil { return err } diff --git a/libcontainer/process_linux.go b/libcontainer/process_linux.go index 5cdc30c..e786419 100644 --- a/libcontainer/process_linux.go +++ b/libcontainer/process_linux.go @@ -1,3 +1,4 @@ +//go:build linux // +build linux package libcontainer @@ -204,7 +205,6 @@ type initProcess struct { process *Process bootstrapData io.Reader sharePidns bool - rootDir *os.File } func (p *initProcess) pid() int { @@ -257,7 +257,6 @@ func (p *initProcess) start() error { err := p.cmd.Start() p.process.ops = p p.childPipe.Close() - p.rootDir.Close() logs.CloseChild() if err != nil { p.process.ops = nil diff --git a/libcontainer/setns_init_linux.go b/libcontainer/setns_init_linux.go index 1f7ec98..e38165d 100644 --- a/libcontainer/setns_init_linux.go +++ b/libcontainer/setns_init_linux.go @@ -1,3 +1,4 @@ +//go:build linux // +build linux package libcontainer @@ -73,5 +74,23 @@ func (l *linuxSetnsInit) Init() error { syscall.Close(l.logFd) } + // Close all file descriptors we are not passing to the container. This is + // necessary because the execve target could use internal runc fds as the + // execve path, potentially giving access to binary files from the host + // (which can then be opened by container processes, leading to container + // escapes). Note that because this operation will close any open file + // descriptors that are referenced by (*os.File) handles from underneath + // the Go runtime, we must not do any file operations after this point + // (otherwise the (*os.File) finaliser could close the wrong file). See + // CVE-2024-21626 for more information as to why this protection is + // necessary. + // + // This is not needed for runc-dmz, because the extra execve(2) step means + // that all O_CLOEXEC file descriptors have already been closed and thus + // the second execve(2) from runc-dmz cannot access internal file + // descriptors from runc. 
+ if err := utils.UnsafeCloseFrom(l.config.PassedFilesCount + 3); err != nil { + return err + } return system.Execv(l.config.Args[0], l.config.Args[0:], os.Environ()) } diff --git a/libcontainer/standard_init_linux.go b/libcontainer/standard_init_linux.go index 6236593..7ebf1a2 100644 --- a/libcontainer/standard_init_linux.go +++ b/libcontainer/standard_init_linux.go @@ -1,3 +1,4 @@ +//go:build linux // +build linux package libcontainer @@ -15,14 +16,17 @@ import ( "github.com/opencontainers/runc/libcontainer/keys" "github.com/opencontainers/runc/libcontainer/seccomp" "github.com/opencontainers/runc/libcontainer/system" + "github.com/opencontainers/runc/libcontainer/utils" "github.com/opencontainers/selinux/go-selinux/label" + + "golang.org/x/sys/unix" ) type linuxStandardInit struct { pipe *os.File consoleSocket *os.File parentPid int - stateDirFD int + fifoFd int config *initConfig logFd int } @@ -187,7 +191,7 @@ func (l *linuxStandardInit) Init() error { // exec'ing the users process. ch := make(chan Error, 1) go func() { - fd, err := syscall.Openat(l.stateDirFD, execFifoFilename, os.O_WRONLY|syscall.O_CLOEXEC, 0) + fd, err := unix.Open(fmt.Sprintf("/proc/self/fd/%d", l.fifoFd), unix.O_WRONLY|unix.O_CLOEXEC, 0) if err != nil { ch <- newSystemErrorWithCause(err, "openat exec fifo") return @@ -215,7 +219,25 @@ func (l *linuxStandardInit) Init() error { } // close the statedir fd before exec because the kernel resets dumpable in the wrong order // https://github.com/torvalds/linux/blob/v4.9/fs/exec.c#L1290-L1318 - syscall.Close(l.stateDirFD) + unix.Close(l.fifoFd) + // Close all file descriptors we are not passing to the container. This is + // necessary because the execve target could use internal runc fds as the + // execve path, potentially giving access to binary files from the host + // (which can then be opened by container processes, leading to container + // escapes). 
Note that because this operation will close any open file + // descriptors that are referenced by (*os.File) handles from underneath + // the Go runtime, we must not do any file operations after this point + // (otherwise the (*os.File) finaliser could close the wrong file). See + // CVE-2024-21626 for more information as to why this protection is + // necessary. + // + // This is not needed for runc-dmz, because the extra execve(2) step means + // that all O_CLOEXEC file descriptors have already been closed and thus + // the second execve(2) from runc-dmz cannot access internal file + // descriptors from runc. + if err := utils.UnsafeCloseFrom(l.config.PassedFilesCount + 3); err != nil { + return err + } if err := syscall.Exec(name, l.config.Args[0:], os.Environ()); err != nil { return newSystemErrorWithCause(err, "exec user process") } diff --git a/libcontainer/standard_init_linux.go.orig b/libcontainer/standard_init_linux.go.orig deleted file mode 100644 index 611b91d..0000000 --- a/libcontainer/standard_init_linux.go.orig +++ /dev/null @@ -1,223 +0,0 @@ -// +build linux - -package libcontainer - -import ( - "fmt" - "os" - "os/exec" - "strings" - "syscall" - "time" - - "github.com/opencontainers/runc/libcontainer/apparmor" - "github.com/opencontainers/runc/libcontainer/configs" - "github.com/opencontainers/runc/libcontainer/keys" - "github.com/opencontainers/runc/libcontainer/seccomp" - "github.com/opencontainers/runc/libcontainer/system" - "github.com/opencontainers/selinux/go-selinux/label" -) - -type linuxStandardInit struct { - pipe *os.File - consoleSocket *os.File - parentPid int - stateDirFD int - config *initConfig - logFd int -} - -func (l *linuxStandardInit) getSessionRingParams() (string, uint32, uint32) { - var newperms uint32 - - if l.config.Config.Namespaces.Contains(configs.NEWUSER) { - // with user ns we need 'other' search permissions - newperms = 0x8 - } else { - // without user ns we need 'UID' search permissions - newperms = 0x80000 - } - - 
// create a unique per session container name that we can - // join in setns; however, other containers can also join it - return fmt.Sprintf("_ses.%s", l.config.ContainerId), 0xffffffff, newperms -} - -// PR_SET_NO_NEW_PRIVS isn't exposed in Golang so we define it ourselves copying the value -// the kernel -const PR_SET_NO_NEW_PRIVS = 0x26 - -func (l *linuxStandardInit) Init() error { - if !l.config.Config.NoNewKeyring { - ringname, keepperms, newperms := l.getSessionRingParams() - - // do not inherit the parent's session keyring - sessKeyId, err := keys.JoinSessionKeyring(ringname) - if err != nil { - return err - } - // make session keyring searcheable - if err := keys.ModKeyringPerm(sessKeyId, keepperms, newperms); err != nil { - return err - } - } - - if err := setupNetwork(l.config); err != nil { - return err - } - if err := setupRoute(l.config.Config); err != nil { - return err - } - - label.Init() - - // prepareRootfs() can be executed only for a new mount namespace. - if l.config.Config.Namespaces.Contains(configs.NEWNS) { - if err := prepareRootfs(l.pipe, l.config.Config); err != nil { - return err - } - } - - // Set up the console. This has to be done *before* we finalize the rootfs, - // but *after* we've given the user the chance to set up all of the mounts - // they wanted. - if l.config.CreateConsole { - if err := setupConsole(l.consoleSocket, l.config, true); err != nil { - return err - } - if err := system.Setctty(); err != nil { - return err - } - } - - // Finish the rootfs setup. 
- if l.config.Config.Namespaces.Contains(configs.NEWNS) { - if err := finalizeRootfs(l.config.Config); err != nil { - return err - } - } - - if hostname := l.config.Config.Hostname; hostname != "" { - if err := syscall.Sethostname([]byte(hostname)); err != nil { - return err - } - } - if err := apparmor.ApplyProfile(l.config.AppArmorProfile); err != nil { - return err - } - if err := label.SetProcessLabel(l.config.ProcessLabel); err != nil { - return err - } - // when userns enabled, write to sysctl will fail, let docker-hooks do this job - if len(l.config.Config.UidMappings) == 0 && len(l.config.Config.GidMappings) == 0 { - for key, value := range l.config.Config.Sysctl { - if err := writeSystemProperty(key, value); err != nil { - return err - } - } - } - for _, path := range l.config.Config.ReadonlyPaths { - if err := readonlyPath(path); err != nil { - return err - } - } - for _, m := range l.config.Config.Mounts { - if m.Flags&syscall.MS_RDONLY == 0 && m.Device == "proc" && strings.HasPrefix(m.Destination, "/proc/sys/") { - if err := remountReadWrite(m.Destination); err != nil { - return err - } - } - } - for _, path := range l.config.Config.MaskPaths { - if err := maskPath(path); err != nil { - return err - } - } - pdeath, err := system.GetParentDeathSignal() - if err != nil { - return err - } - if l.config.NoNewPrivileges { - if err := system.Prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); err != nil { - return err - } - } - // Tell our parent that we're ready to Execv. This must be done before the - // Seccomp rules have been applied, because we need to be able to read and - // write to a socket. - if err := syncParentReady(l.pipe); err != nil { - return err - } - // Without NoNewPrivileges seccomp is a privileged operation, so we need to - // do this before dropping capabilities; otherwise do it as late as possible - // just before execve so as few syscalls take place after it as possible. 
- if l.config.Config.Seccomp != nil && !l.config.NoNewPrivileges { - if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil { - return err - } - } - if err := finalizeNamespace(l.config); err != nil { - return err - } - // finalizeNamespace can change user/group which clears the parent death - // signal, so we restore it here. - if err := pdeath.Restore(); err != nil { - return err - } - // compare the parent from the initial start of the init process and make sure that it did not change. - // if the parent changes that means it died and we were reparented to something else so we should - // just kill ourself and not cause problems for someone else. - if syscall.Getppid() != l.parentPid { - return syscall.Kill(syscall.Getpid(), syscall.SIGKILL) - } - // check for the arg before waiting to make sure it exists and it is returned - // as a create time error. - name, err := exec.LookPath(l.config.Args[0]) - if err != nil { - return err - } - // close the pipe to signal that we have completed our init. - l.pipe.Close() - - if l.logFd != 0 { - syscall.Close(l.logFd) - } - - // wait for the fifo to be opened on the other side before - // exec'ing the users process. 
- ch := make(chan Error, 1) - go func() { - fd, err := syscall.Openat(l.stateDirFD, execFifoFilename, os.O_WRONLY|syscall.O_CLOEXEC, 0) - if err != nil { - ch <- newSystemErrorWithCause(err, "openat exec fifo") - return - } - if _, err := syscall.Write(fd, []byte("0")); err != nil { - ch <- newSystemErrorWithCause(err, "write 0 exec fifo") - return - } - ch <- nil - }() - - select { - case chErr := <-ch: - if chErr != nil { - return chErr - } - case <-time.After(120 * time.Second): - return newSystemErrorWithCause(fmt.Errorf("timeout"), "wait for the fifo to be opened on the other side ") - } - - if l.config.Config.Seccomp != nil && l.config.NoNewPrivileges { - if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil { - return newSystemErrorWithCause(err, "init seccomp") - } - } - // close the statedir fd before exec because the kernel resets dumpable in the wrong order - // https://github.com/torvalds/linux/blob/v4.9/fs/exec.c#L1290-L1318 - syscall.Close(l.stateDirFD) - if err := syscall.Exec(name, l.config.Args[0:], os.Environ()); err != nil { - return newSystemErrorWithCause(err, "exec user process") - } - return nil -} diff --git a/libcontainer/utils/utils.go b/libcontainer/utils/utils.go index cd04ace..922cffb 100644 --- a/libcontainer/utils/utils.go +++ b/libcontainer/utils/utils.go @@ -5,17 +5,12 @@ import ( "encoding/binary" "encoding/hex" "encoding/json" - "fmt" "io" "os" "path/filepath" - "strconv" "strings" "syscall" "unsafe" - - securejoin "github.com/cyphar/filepath-securejoin" - "golang.org/x/sys/unix" ) const ( @@ -175,36 +170,3 @@ func stripRoot(root, path string) string { } return CleanPath("/" + path) } - -// WithProcfd runs the passed closure with a procfd path (/proc/self/fd/...) -// corresponding to the unsafePath resolved within the root. Before passing the -// fd, this path is verified to have been inside the root -- so operating on it -// through the passed fdpath should be safe. 
Do not access this path through -// the original path strings, and do not attempt to use the pathname outside of -// the passed closure (the file handle will be freed once the closure returns). -func WithProcfd(root, unsafePath string, fn func(procfd string) error) error { - // Remove the root then forcefully resolve inside the root. - unsafePath = stripRoot(root, unsafePath) - path, err := securejoin.SecureJoin(root, unsafePath) - if err != nil { - return fmt.Errorf("resolving path inside rootfs failed: %v", err) - } - - // Open the target path. - fh, err := os.OpenFile(path, unix.O_PATH|unix.O_CLOEXEC, 0) - if err != nil { - return fmt.Errorf("open o_path procfd: %w", err) - } - defer fh.Close() - - // Double-check the path is the one we expected. - procfd := "/proc/self/fd/" + strconv.Itoa(int(fh.Fd())) - if realpath, err := os.Readlink(procfd); err != nil { - return fmt.Errorf("procfd verification failed: %w", err) - } else if realpath != path { - return fmt.Errorf("possibly malicious path detected -- refusing to operate on %s", realpath) - } - - // Run the closure. - return fn(procfd) -} diff --git a/libcontainer/utils/utils_unix.go b/libcontainer/utils/utils_unix.go index 7b798cc..cfacfc2 100644 --- a/libcontainer/utils/utils_unix.go +++ b/libcontainer/utils/utils_unix.go @@ -1,43 +1,264 @@ +///go:build !windows +//go:build !windows // +build !windows package utils import ( - "io/ioutil" + "fmt" + "math" "os" + "path/filepath" + "runtime" "strconv" - "syscall" + "sync" + _ "unsafe" // for go:linkname + + securejoin "github.com/cyphar/filepath-securejoin" + "github.com/Sirupsen/logrus" + "golang.org/x/sys/unix" ) -func CloseExecFrom(minFd int) error { - fdList, err := ioutil.ReadDir("/proc/self/fd") +// EnsureProcHandle returns whether or not the given file handle is on procfs. 
+func EnsureProcHandle(fh *os.File) error { + var buf unix.Statfs_t + if err := unix.Fstatfs(int(fh.Fd()), &buf); err != nil { + return fmt.Errorf("ensure %s is on procfs: %w", fh.Name(), err) + } + if buf.Type != unix.PROC_SUPER_MAGIC { + return fmt.Errorf("%s is not on procfs", fh.Name()) + } + return nil +} + +var ( + haveCloseRangeCloexecBool bool + haveCloseRangeCloexecOnce sync.Once +) + +func haveCloseRangeCloexec() bool { + haveCloseRangeCloexecOnce.Do(func() { + // Make sure we're not closing a random file descriptor. + tmpFd, err := unix.FcntlInt(0, unix.F_DUPFD_CLOEXEC, 0) + if err != nil { + return + } + defer unix.Close(tmpFd) + + err = unix.CloseRange(uint(tmpFd), uint(tmpFd), unix.CLOSE_RANGE_CLOEXEC) + // Any error means we cannot use close_range(CLOSE_RANGE_CLOEXEC). + // -ENOSYS and -EINVAL ultimately mean we don't have support, but any + // other potential error would imply that even the most basic close + // operation wouldn't work. + haveCloseRangeCloexecBool = err == nil + }) + return haveCloseRangeCloexecBool +} + +type fdFunc func(fd int) + +// fdRangeFrom calls the passed fdFunc for each file descriptor greater than or +// equal to minFd that is open in the current process. +func fdRangeFrom(minFd int, fn fdFunc) error { + procSelfFd, closer := ProcThreadSelf("fd") + defer closer() + + fdDir, err := os.Open(procSelfFd) + if err != nil { + return err + } + defer fdDir.Close() + + if err := EnsureProcHandle(fdDir); err != nil { + return err + } + + fdList, err := fdDir.Readdirnames(-1) if err != nil { return err } - for _, fi := range fdList { - fd, err := strconv.Atoi(fi.Name()) + for _, fdStr := range fdList { + fd, err := strconv.Atoi(fdStr) + // Ignore non-numeric file names. if err != nil { - // ignore non-numeric file names continue } - + // Ignore descriptors lower than our specified minimum.
if fd < minFd { - // ignore descriptors lower than our specified minimum continue } - - // intentionally ignore errors from syscall.CloseOnExec - syscall.CloseOnExec(fd) - // the cases where this might fail are basically file descriptors that have already been closed (including and especially the one that was created when ioutil.ReadDir did the "opendir" syscall) + // Ignore the file descriptor we used for readdir, as it will be closed + // when we return. + if uintptr(fd) == fdDir.Fd() { + continue + } + // Run the closure. + fn(fd) } return nil } -// NewSockPair returns a new unix socket pair -func NewSockPair(name string) (parent *os.File, child *os.File, err error) { - fds, err := syscall.Socketpair(syscall.AF_LOCAL, syscall.SOCK_STREAM|syscall.SOCK_CLOEXEC, 0) +// CloseExecFrom sets the O_CLOEXEC flag on all file descriptors greater or +// equal to minFd in the current process. +func CloseExecFrom(minFd int) error { + // Use close_range(CLOSE_RANGE_CLOEXEC) if possible. + if haveCloseRangeCloexec() { + err := unix.CloseRange(uint(minFd), math.MaxUint64, unix.CLOSE_RANGE_CLOEXEC) + return os.NewSyscallError("close_range", err) + } + // Otherwise, fall back to the standard loop. + return fdRangeFrom(minFd, unix.CloseOnExec) +} + +//go:linkname runtime_IsPollDescriptor internal/poll.IsPollDescriptor + +// In order to make sure we do not close the internal epoll descriptors the Go +// runtime uses, we need to ensure that we skip descriptors that match +// "internal/poll".IsPollDescriptor. Yes, this is a Go runtime internal thing, +// unfortunately there's no other way to be sure we're only keeping the file +// descriptors the Go runtime needs. Hopefully nothing blows up doing this... +func runtime_IsPollDescriptor(fd uintptr) bool //nolint:revive + +// UnsafeCloseFrom closes all file descriptors greater or equal to minFd in the +// current process, except for those critical to Go's runtime (such as the +// netpoll management descriptors). 
+// +// NOTE: This function is incredibly dangerous to use in most Go code, as +// closing file descriptors from underneath *os.File handles can lead to very +// bad behaviour (the closed file descriptor can be re-used and then any +// *os.File operations would apply to the wrong file). This function is only +// intended to be called from the last stage of runc init. +func UnsafeCloseFrom(minFd int) error { + // We cannot use close_range(2) even if it is available, because we must + // not close some file descriptors. + return fdRangeFrom(minFd, func(fd int) { + if runtime_IsPollDescriptor(uintptr(fd)) { + // These are the Go runtime's internal netpoll file descriptors. + // These file descriptors are operated on deep in the Go scheduler, + // and closing those files from underneath Go can result in panics. + // There is no issue with keeping them because they are not + // executable and are not useful to an attacker anyway. Also we + // don't have any choice. + return + } + // There's nothing we can do about errors from close(2), and the + // only likely error to be seen is EBADF which indicates the fd was + // already closed (in which case, we got what we wanted). + _ = unix.Close(fd) + }) +} + +// NewSockPair returns a new SOCK_STREAM unix socket pair. +func NewSockPair(name string) (parent, child *os.File, err error) { + fds, err := unix.Socketpair(unix.AF_LOCAL, unix.SOCK_STREAM|unix.SOCK_CLOEXEC, 0) if err != nil { return nil, nil, err } return os.NewFile(uintptr(fds[1]), name+"-p"), os.NewFile(uintptr(fds[0]), name+"-c"), nil } + +// WithProcfd runs the passed closure with a procfd path (/proc/self/fd/...) +// corresponding to the unsafePath resolved within the root. Before passing the +// fd, this path is verified to have been inside the root -- so operating on it +// through the passed fdpath should be safe.
Do not access this path through +// the original path strings, and do not attempt to use the pathname outside of +// the passed closure (the file handle will be freed once the closure returns). +func WithProcfd(root, unsafePath string, fn func(procfd string) error) error { + // Remove the root then forcefully resolve inside the root. + unsafePath = stripRoot(root, unsafePath) + path, err := securejoin.SecureJoin(root, unsafePath) + if err != nil { + return fmt.Errorf("resolving path inside rootfs failed: %w", err) + } + + procSelfFd, closer := ProcThreadSelf("fd/") + defer closer() + + // Open the target path. + fh, err := os.OpenFile(path, unix.O_PATH|unix.O_CLOEXEC, 0) + if err != nil { + return fmt.Errorf("open o_path procfd: %w", err) + } + defer fh.Close() + + procfd := filepath.Join(procSelfFd, strconv.Itoa(int(fh.Fd()))) + // Double-check the path is the one we expected. + if realpath, err := os.Readlink(procfd); err != nil { + return fmt.Errorf("procfd verification failed: %w", err) + } else if realpath != path { + return fmt.Errorf("possibly malicious path detected -- refusing to operate on %s", realpath) + } + + return fn(procfd) +} + +type ProcThreadSelfCloser func() + +var ( + haveProcThreadSelf bool + haveProcThreadSelfOnce sync.Once +) + +// ProcThreadSelf returns a string that is equivalent to +// /proc/thread-self/<subpath>, with a graceful fallback on older kernels where +// /proc/thread-self doesn't exist. This method DOES NOT use SecureJoin, +// meaning that the passed string needs to be trusted. The caller _must_ call +// the returned ProcThreadSelfCloser function (which is runtime.UnlockOSThread) +// *only once* after it has finished using the returned path string.
+func ProcThreadSelf(subpath string) (string, ProcThreadSelfCloser) { + haveProcThreadSelfOnce.Do(func() { + if _, err := os.Stat("/proc/thread-self/"); err == nil { + haveProcThreadSelf = true + } else { + logrus.Debugf("cannot stat /proc/thread-self (%v), falling back to /proc/self/task/", err) + } + }) + + // We need to lock our thread until the caller is done with the path string + // because any non-atomic operation on the path (such as opening a file, + // then reading it) could be interrupted by the Go runtime where the + // underlying thread is swapped out and the original thread is killed, + // resulting in pull-your-hair-out-hard-to-debug issues in the caller. In + // addition, the pre-3.17 fallback makes everything non-atomic because the + // same thing could happen between unix.Gettid() and the path operations. + // + // In theory, we don't need to lock in the atomic user case when using + // /proc/thread-self/, but it's better to be safe than sorry (and there are + // only one or two truly atomic users of /proc/thread-self/). + runtime.LockOSThread() + + threadSelf := "/proc/thread-self/" + if !haveProcThreadSelf { + // Pre-3.17 kernels did not have /proc/thread-self, so do it manually. + threadSelf = "/proc/self/task/" + strconv.Itoa(unix.Gettid()) + "/" + if _, err := os.Stat(threadSelf); err != nil { + // Unfortunately, this code is called from rootfs_linux.go where we + // are running inside the pid namespace of the container but /proc + // is the host's procfs. Unfortunately there is no real way to get + // the correct tid to use here (the kernel age means we cannot do + // things like set up a private fsopen("proc") -- even scanning + // NSpid in all of the tasks in /proc/self/task/*/status requires + // Linux 4.1). + // + // So, we just have to assume that /proc/self is acceptable in this + // one specific case. 
+ if os.Getpid() == 1 { + logrus.Debugf("/proc/thread-self (tid=%d) cannot be emulated inside the initial container setup -- using /proc/self instead: %v", unix.Gettid(), err) + } else { + // This should never happen, but the fallback should work in most cases... + logrus.Warnf("/proc/thread-self could not be emulated for pid=%d (tid=%d) -- using more buggy /proc/self fallback instead: %v", os.Getpid(), unix.Gettid(), err) + } + threadSelf = "/proc/self/" + } + } + return threadSelf + subpath, runtime.UnlockOSThread +} + +// ProcThreadSelfFd is a small wrapper around ProcThreadSelf to make it easier to +// create a /proc/thread-self handle for a given file descriptor. +// +// It is basically equivalent to ProcThreadSelf(fmt.Sprintf("fd/%d", fd)), but +// without using fmt.Sprintf to avoid unneeded overhead. +func ProcThreadSelfFd(fd uintptr) (string, ProcThreadSelfCloser) { + return ProcThreadSelf("fd/" + strconv.FormatUint(uint64(fd), 10)) +} diff --git a/vendor/golang.org/x/sys/unix/flock.go b/vendor/golang.org/x/sys/unix/flock.go index ce67a59..e8d1081 100644 --- a/vendor/golang.org/x/sys/unix/flock.go +++ b/vendor/golang.org/x/sys/unix/flock.go @@ -14,6 +14,11 @@ import "unsafe" // systems by flock_linux_32bit.go to be SYS_FCNTL64. var fcntl64Syscall uintptr = SYS_FCNTL +// FcntlInt performs a fcntl syscall on fd with the provided command and argument. +func FcntlInt(fd uintptr, cmd, arg int) (int, error) { + return fcntl(int(fd), cmd, arg) +} + // FcntlFlock performs a fcntl syscall for the F_GETLK, F_SETLK or F_SETLKW command.
func FcntlFlock(fd uintptr, cmd int, lk *Flock_t) error { _, _, errno := Syscall(fcntl64Syscall, fd, uintptr(cmd), uintptr(unsafe.Pointer(lk))) diff --git a/vendor/golang.org/x/sys/unix/zerrors_linux_amd64.go b/vendor/golang.org/x/sys/unix/zerrors_linux_amd64.go index f21dcd9..e1bde81 100644 --- a/vendor/golang.org/x/sys/unix/zerrors_linux_amd64.go +++ b/vendor/golang.org/x/sys/unix/zerrors_linux_amd64.go @@ -934,6 +934,7 @@ const ( PRIO_PGRP = 0x1 PRIO_PROCESS = 0x0 PRIO_USER = 0x2 + PROC_SUPER_MAGIC = 0x9fa0 PROT_EXEC = 0x4 PROT_GROWSDOWN = 0x1000000 PROT_GROWSUP = 0x2000000 diff --git a/vendor/golang.org/x/sys/unix/zerrors_linux_arm64.go b/vendor/golang.org/x/sys/unix/zerrors_linux_arm64.go index 16a18f5..388d1fc 100644 --- a/vendor/golang.org/x/sys/unix/zerrors_linux_arm64.go +++ b/vendor/golang.org/x/sys/unix/zerrors_linux_arm64.go @@ -966,6 +966,7 @@ const ( PRIO_PGRP = 0x1 PRIO_PROCESS = 0x0 PRIO_USER = 0x2 + PROC_SUPER_MAGIC = 0x9fa0 PROT_EXEC = 0x4 PROT_GROWSDOWN = 0x1000000 PROT_GROWSUP = 0x2000000 diff --git a/vendor/golang.org/x/sys/unix/zsyscall_linux_amd64.go b/vendor/golang.org/x/sys/unix/zsyscall_linux_amd64.go index 8b2e87d..fe21f83 100644 --- a/vendor/golang.org/x/sys/unix/zsyscall_linux_amd64.go +++ b/vendor/golang.org/x/sys/unix/zsyscall_linux_amd64.go @@ -312,6 +312,16 @@ func Close(fd int) (err error) { // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT +func CloseRange(first uint, last uint, flags uint) (err error) { + _, _, e1 := Syscall(SYS_CLOSE_RANGE, uintptr(first), uintptr(last), uintptr(flags)) + if e1 != 0 { + err = errnoErr(e1) + } + return +} + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + func Dup(oldfd int) (fd int, err error) { r0, _, e1 := Syscall(SYS_DUP, uintptr(oldfd), 0, 0) fd = int(r0) diff --git a/vendor/golang.org/x/sys/unix/zsyscall_linux_arm64.go b/vendor/golang.org/x/sys/unix/zsyscall_linux_arm64.go index f6cc320..395e2de 100644 --- 
a/vendor/golang.org/x/sys/unix/zsyscall_linux_arm64.go +++ b/vendor/golang.org/x/sys/unix/zsyscall_linux_arm64.go @@ -312,6 +312,16 @@ func Close(fd int) (err error) { // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT +func CloseRange(first uint, last uint, flags uint) (err error) { + _, _, e1 := Syscall(SYS_CLOSE_RANGE, uintptr(first), uintptr(last), uintptr(flags)) + if e1 != 0 { + err = errnoErr(e1) + } + return +} + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + func Dup(oldfd int) (fd int, err error) { r0, _, e1 := Syscall(SYS_DUP, uintptr(oldfd), 0, 0) fd = int(r0) diff --git a/vendor/golang.org/x/sys/unix/zsysnum_linux_amd64.go b/vendor/golang.org/x/sys/unix/zsysnum_linux_amd64.go index 9042317..f7c427c 100644 --- a/vendor/golang.org/x/sys/unix/zsysnum_linux_amd64.go +++ b/vendor/golang.org/x/sys/unix/zsysnum_linux_amd64.go @@ -338,4 +338,5 @@ const ( SYS_PKEY_MPROTECT = 329 SYS_PKEY_ALLOC = 330 SYS_PKEY_FREE = 331 + SYS_CLOSE_RANGE = 436 ) diff --git a/vendor/golang.org/x/sys/unix/zsysnum_linux_arm64.go b/vendor/golang.org/x/sys/unix/zsysnum_linux_arm64.go index 90e43d0..530563a 100644 --- a/vendor/golang.org/x/sys/unix/zsysnum_linux_arm64.go +++ b/vendor/golang.org/x/sys/unix/zsysnum_linux_arm64.go @@ -282,4 +282,5 @@ const ( SYS_PKEY_MPROTECT = 288 SYS_PKEY_ALLOC = 289 SYS_PKEY_FREE = 290 + SYS_CLOSE_RANGE = 436 ) diff --git a/vendor/golang.org/x/sys/unix/ztypes_linux_amd64.go b/vendor/golang.org/x/sys/unix/ztypes_linux_amd64.go index c9e1e64..2f12811 100644 --- a/vendor/golang.org/x/sys/unix/ztypes_linux_amd64.go +++ b/vendor/golang.org/x/sys/unix/ztypes_linux_amd64.go @@ -345,6 +345,11 @@ type TCPInfo struct { Total_retrans uint32 } +const ( + CLOSE_RANGE_UNSHARE = 0x2 + CLOSE_RANGE_CLOEXEC = 0x4 +) + const ( SizeofSockaddrInet4 = 0x10 SizeofSockaddrInet6 = 0x1c diff --git a/vendor/golang.org/x/sys/unix/ztypes_linux_arm64.go b/vendor/golang.org/x/sys/unix/ztypes_linux_arm64.go index e58c500..b77eceb 100644 --- 
a/vendor/golang.org/x/sys/unix/ztypes_linux_arm64.go +++ b/vendor/golang.org/x/sys/unix/ztypes_linux_arm64.go @@ -30,6 +30,11 @@ type Timeval struct { Usec int64 } +const ( + CLOSE_RANGE_UNSHARE = 0x2 + CLOSE_RANGE_CLOEXEC = 0x4 +) + type Timex struct { Modes uint32 Pad_cgo_0 [4]byte -- 2.33.0