runc/patch/0148-runc-fix-CVE-2024-21626.patch
zhongjiawei 1a6ece7204 runc:fix CVE-2024-21626
(cherry picked from commit 6e9b77988428e4184978084eccfa08612f3c5b0f)
2024-02-01 19:02:05 +08:00

2766 lines
86 KiB
Diff

From e81938064402940ca8176d6f3145f65b1d455996 Mon Sep 17 00:00:00 2001
From: zhongjiawei <zhongjiawei1@huawei.com>
Date: Thu, 1 Feb 2024 18:25:16 +0800
Subject: [PATCH] runc:fix CVE-2024-21626
---
libcontainer/container_linux.go | 50 +-
libcontainer/container_linux.go.orig | 1660 -----------------
libcontainer/factory_linux.go | 15 +-
libcontainer/init_linux.go | 39 +-
libcontainer/process_linux.go | 3 +-
libcontainer/setns_init_linux.go | 19 +
libcontainer/standard_init_linux.go | 28 +-
libcontainer/standard_init_linux.go.orig | 223 ---
libcontainer/utils/utils.go | 38 -
libcontainer/utils/utils_unix.go | 253 ++-
vendor/golang.org/x/sys/unix/flock.go | 5 +
.../x/sys/unix/zerrors_linux_amd64.go | 1 +
.../x/sys/unix/zerrors_linux_arm64.go | 1 +
.../x/sys/unix/zsyscall_linux_amd64.go | 10 +
.../x/sys/unix/zsyscall_linux_arm64.go | 10 +
.../x/sys/unix/zsysnum_linux_amd64.go | 1 +
.../x/sys/unix/zsysnum_linux_arm64.go | 1 +
.../x/sys/unix/ztypes_linux_amd64.go | 5 +
.../x/sys/unix/ztypes_linux_arm64.go | 5 +
19 files changed, 403 insertions(+), 1964 deletions(-)
delete mode 100644 libcontainer/container_linux.go.orig
delete mode 100644 libcontainer/standard_init_linux.go.orig
diff --git a/libcontainer/container_linux.go b/libcontainer/container_linux.go
index a4859ca..c757d71 100644
--- a/libcontainer/container_linux.go
+++ b/libcontainer/container_linux.go
@@ -1,3 +1,4 @@
+//go:build linux
// +build linux
package libcontainer
@@ -28,6 +29,7 @@ import (
"github.com/opencontainers/runc/libcontainer/utils"
"github.com/syndtr/gocapability/capability"
"github.com/vishvananda/netlink/nl"
+ "golang.org/x/sys/unix"
)
const stdioFdCount = 3
@@ -321,6 +323,15 @@ func (c *linuxContainer) start(process *Process) error {
}()
}
+ // Before starting "runc init", mark all non-stdio open files as O_CLOEXEC
+ // to make sure we don't leak any files into "runc init". Any files to be
+ // passed to "runc init" through ExtraFiles will get dup2'd by the Go
+ // runtime and thus their O_CLOEXEC flag will be cleared. This is some
+ // additional protection against attacks like CVE-2024-21626, by making
+ // sure we never leak files to "runc init" we didn't intend to.
+ if err := utils.CloseExecFrom(3); err != nil {
+ return fmt.Errorf("unable to mark non-stdio fds as cloexec: %w", err)
+ }
if err := parent.start(); err != nil {
// terminate the process to ensure that it properly is reaped.
if err := parent.terminate(); err != nil {
@@ -414,6 +425,23 @@ func (c *linuxContainer) deleteExecFifo() {
os.Remove(fifoName)
}
+// includeExecFifo opens the container's execfifo as a pathfd, so that the
+// container cannot access the statedir (and the FIFO itself remains
+// un-opened). It then adds the FifoFd to the given exec.Cmd as an inherited
+// fd, with _LIBCONTAINER_FIFOFD set to its fd number.
+func (c *linuxContainer) includeExecFifo(cmd *exec.Cmd) error {
+ fifoName := filepath.Join(c.root, execFifoFilename)
+ fifoFd, err := unix.Open(fifoName, unix.O_PATH|unix.O_CLOEXEC, 0)
+ if err != nil {
+ return err
+ }
+
+ cmd.ExtraFiles = append(cmd.ExtraFiles, os.NewFile(uintptr(fifoFd), fifoName))
+ cmd.Env = append(cmd.Env,
+ fmt.Sprintf("_LIBCONTAINER_FIFOFD=%d", stdioFdCount+len(cmd.ExtraFiles)-1))
+ return nil
+}
+
func (c *linuxContainer) newParentProcess(p *Process) (parentProcess, error) {
parentPipe, childPipe, err := utils.NewSockPair("init")
if err != nil {
@@ -430,18 +458,15 @@ func (c *linuxContainer) newParentProcess(p *Process) (parentProcess, error) {
return c.newSetnsProcess(p, cmd, parentPipe, childPipe)
}
- // We only set up rootDir if we're not doing a `runc exec`. The reason for
- // this is to avoid cases where a racing, unprivileged process inside the
- // container can get access to the statedir file descriptor (which would
- // allow for container rootfs escape).
- rootDir, err := os.Open(c.root)
- if err != nil {
- return nil, err
+ // We only set up fifoFd if we're not doing a `runc exec`. The historic
+ // reason for this is that previously we would pass a dirfd that allowed
+ // for container rootfs escape (and not doing it in `runc exec` avoided
+ // that problem), but we no longer do that. However, there's no need to do
+ // this for `runc exec` so we just keep it this way to be safe.
+ if err := c.includeExecFifo(cmd); err != nil {
+ return nil, fmt.Errorf("unable to setup exec fifo: %w", err)
}
- cmd.ExtraFiles = append(cmd.ExtraFiles, rootDir)
- cmd.Env = append(cmd.Env,
- fmt.Sprintf("_LIBCONTAINER_STATEDIR=%d", stdioFdCount+len(cmd.ExtraFiles)-1))
- return c.newInitProcess(p, cmd, parentPipe, childPipe, rootDir)
+ return c.newInitProcess(p, cmd, parentPipe, childPipe)
}
func (c *linuxContainer) commandTemplate(p *Process, childPipe *os.File) (*exec.Cmd, error) {
@@ -479,7 +504,7 @@ func (c *linuxContainer) commandTemplate(p *Process, childPipe *os.File) (*exec.
return cmd, nil
}
-func (c *linuxContainer) newInitProcess(p *Process, cmd *exec.Cmd, parentPipe, childPipe, rootDir *os.File) (*initProcess, error) {
+func (c *linuxContainer) newInitProcess(p *Process, cmd *exec.Cmd, parentPipe, childPipe *os.File) (*initProcess, error) {
cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initStandard))
nsMaps := make(map[configs.NamespaceType]string)
for _, ns := range c.config.Namespaces {
@@ -501,7 +526,6 @@ func (c *linuxContainer) newInitProcess(p *Process, cmd *exec.Cmd, parentPipe, c
process: p,
bootstrapData: data,
sharePidns: !c.config.Namespaces.IsPrivate(configs.NEWPID),
- rootDir: rootDir,
}, nil
}
diff --git a/libcontainer/container_linux.go.orig b/libcontainer/container_linux.go.orig
deleted file mode 100644
index d678407..0000000
--- a/libcontainer/container_linux.go.orig
+++ /dev/null
@@ -1,1660 +0,0 @@
-// +build linux
-
-package libcontainer
-
-import (
- "bytes"
- "encoding/json"
- "errors"
- "fmt"
- "io"
- "io/ioutil"
- "os"
- "os/exec"
- "path/filepath"
- "reflect"
- "strings"
- "sync"
- "syscall"
- "time"
-
- "github.com/Sirupsen/logrus"
- "github.com/golang/protobuf/proto"
- "github.com/opencontainers/runc/libcontainer/cgroups"
- "github.com/opencontainers/runc/libcontainer/configs"
- "github.com/opencontainers/runc/libcontainer/criurpc"
- "github.com/opencontainers/runc/libcontainer/logs"
- "github.com/opencontainers/runc/libcontainer/system"
- "github.com/opencontainers/runc/libcontainer/utils"
- "github.com/syndtr/gocapability/capability"
- "github.com/vishvananda/netlink/nl"
-)
-
-const stdioFdCount = 3
-
-type linuxContainer struct {
- id string
- root string
- config *configs.Config
- cgroupManager cgroups.Manager
- initArgs []string
- initProcess parentProcess
- initProcessStartTime string
- criuPath string
- m sync.Mutex
- criuVersion int
- state containerState
- created time.Time
-}
-
-// State represents a running container's state
-type State struct {
- BaseState
-
- // Platform specific fields below here
-
- // Specifies if the container was started under the rootless mode.
- Rootless bool `json:"rootless"`
-
- // Path to all the cgroups setup for a container. Key is cgroup subsystem name
- // with the value as the path.
- CgroupPaths map[string]string `json:"cgroup_paths"`
-
- // NamespacePaths are filepaths to the container's namespaces. Key is the namespace type
- // with the value as the path.
- NamespacePaths map[configs.NamespaceType]string `json:"namespace_paths"`
-
- // Container's standard descriptors (std{in,out,err}), needed for checkpoint and restore
- ExternalDescriptors []string `json:"external_descriptors,omitempty"`
-}
-
-// CompatState
-type CompatState struct {
- State
- Config configs.CompatConfig `json:"config"`
-}
-
-// Container is a libcontainer container object.
-//
-// Each container is thread-safe within the same process. Since a container can
-// be destroyed by a separate process, any function may return that the container
-// was not found.
-type Container interface {
- BaseContainer
-
- // Methods below here are platform specific
-
- // Checkpoint checkpoints the running container's state to disk using the criu(8) utility.
- //
- // errors:
- // Systemerror - System error.
- Checkpoint(criuOpts *CriuOpts) error
-
- // Restore restores the checkpointed container to a running state using the criu(8) utility.
- //
- // errors:
- // Systemerror - System error.
- Restore(process *Process, criuOpts *CriuOpts) error
-
- // If the Container state is RUNNING or CREATED, sets the Container state to PAUSING and pauses
- // the execution of any user processes. Asynchronously, when the container finished being paused the
- // state is changed to PAUSED.
- // If the Container state is PAUSED, do nothing.
- //
- // errors:
- // ContainerNotExists - Container no longer exists,
- // ContainerNotRunning - Container not running or created,
- // Systemerror - System error.
- Pause() error
-
- // If the Container state is PAUSED, resumes the execution of any user processes in the
- // Container before setting the Container state to RUNNING.
- // If the Container state is RUNNING, do nothing.
- //
- // errors:
- // ContainerNotExists - Container no longer exists,
- // ContainerNotPaused - Container is not paused,
- // Systemerror - System error.
- Resume() error
-
- // NotifyOOM returns a read-only channel signaling when the container receives an OOM notification.
- //
- // errors:
- // Systemerror - System error.
- NotifyOOM() (<-chan struct{}, error)
-
- // NotifyMemoryPressure returns a read-only channel signaling when the container reaches a given pressure level
- //
- // errors:
- // Systemerror - System error.
- NotifyMemoryPressure(level PressureLevel) (<-chan struct{}, error)
-}
-
-// ID returns the container's unique ID
-func (c *linuxContainer) ID() string {
- return c.id
-}
-
-// Config returns the container's configuration
-func (c *linuxContainer) Config() configs.Config {
- return *c.config
-}
-
-func (c *linuxContainer) Status() (Status, error) {
- c.m.Lock()
- defer c.m.Unlock()
- return c.currentStatus()
-}
-
-func (c *linuxContainer) State() (*State, error) {
- c.m.Lock()
- defer c.m.Unlock()
- return c.currentState()
-}
-
-func (c *linuxContainer) Processes() ([]int, error) {
- pids, err := c.cgroupManager.GetAllPids()
- if err != nil {
- return nil, newSystemErrorWithCause(err, "getting all container pids from cgroups")
- }
- return pids, nil
-}
-
-func (c *linuxContainer) Stats() (*Stats, error) {
- var (
- err error
- stats = &Stats{}
- )
- if stats.CgroupStats, err = c.cgroupManager.GetStats(); err != nil {
- return stats, newSystemErrorWithCause(err, "getting container stats from cgroups")
- }
- for _, iface := range c.config.Networks {
- switch iface.Type {
- case "veth":
- istats, err := getNetworkInterfaceStats(iface.HostInterfaceName)
- if err != nil {
- return stats, newSystemErrorWithCausef(err, "getting network stats for interface %q", iface.HostInterfaceName)
- }
- stats.Interfaces = append(stats.Interfaces, istats)
- }
- }
- return stats, nil
-}
-
-func (c *linuxContainer) Set(config configs.Config) error {
- c.m.Lock()
- defer c.m.Unlock()
- status, err := c.currentStatus()
- if err != nil {
- return err
- }
- if status == Stopped {
- return newGenericError(fmt.Errorf("container not running"), ContainerNotRunning)
- }
- c.config = &config
- return c.cgroupManager.Set(c.config)
-}
-
-func (c *linuxContainer) Start(process *Process) error {
- c.m.Lock()
- defer c.m.Unlock()
- if process.Init {
- if err := c.createExecFifo(); err != nil {
- return err
- }
- }
- if err := c.start(process); err != nil {
- if process.Init {
- c.deleteExecFifo()
- }
- return err
- }
- return nil
-}
-
-func (c *linuxContainer) Run(process *Process) error {
- if err := c.Start(process); err != nil {
- return err
- }
- if process.Init {
- return c.exec()
- }
- return nil
-}
-
-func (c *linuxContainer) Exec() error {
- c.m.Lock()
- defer c.m.Unlock()
- return c.exec()
-}
-
-func (c *linuxContainer) exec() error {
- path := filepath.Join(c.root, execFifoFilename)
-
- fifoOpen := make(chan struct{})
- select {
- case <-awaitProcessExit(c.initProcess.pid(), fifoOpen):
- return errors.New("container process is already dead")
- case result := <-awaitFifoOpen(path, fifoOpen):
- if result.err != nil {
- return result.err
- }
- f := result.file
- defer f.Close()
- if err := readFromExecFifo(f); err != nil {
- return err
- }
- if err := os.Remove(path); !os.IsNotExist(err) {
- return err
- }
- return nil
- }
-}
-
-func readFromExecFifo(execFifo io.Reader) error {
- data, err := ioutil.ReadAll(execFifo)
- if err != nil {
- return err
- }
- if len(data) <= 0 {
- return fmt.Errorf("cannot start an already running container")
- }
- return nil
-}
-
-func awaitProcessExit(pid int, exit <-chan struct{}) <-chan struct{} {
- isDead := make(chan struct{})
- go func() {
- for {
- select {
- case <-exit:
- return
- case <-time.After(time.Millisecond * 100):
- stat, err := system.GetProcessState(pid)
- if err != nil || stat == system.Zombie {
- select {
- case <-exit:
- return
- default:
- close(isDead)
- }
- return
- }
- }
- }
- }()
- return isDead
-}
-
-func awaitFifoOpen(path string, fifoOpen chan struct{}) <-chan openResult {
- fifoOpened := make(chan openResult)
- go func() {
- f, err := os.OpenFile(path, os.O_RDONLY, 0)
- close(fifoOpen)
- if err != nil {
- fifoOpened <- openResult{err: newSystemErrorWithCause(err, "open exec fifo for reading")}
- return
- }
- fifoOpened <- openResult{file: f}
- }()
- return fifoOpened
-}
-
-type openResult struct {
- file *os.File
- err error
-}
-
-func (c *linuxContainer) start(process *Process) error {
- parent, err := c.newParentProcess(process)
- if err != nil {
- return newSystemErrorWithCause(err, "creating new parent process")
- }
-
- if logsDone := logs.ForwardLogs(); logsDone != nil {
- defer func() {
- select {
- case <-logsDone:
- case <-time.After(3 * time.Second):
- logrus.Warnf("wait child close logfd timeout")
- }
- }()
- }
-
- if err := parent.start(); err != nil {
- // terminate the process to ensure that it properly is reaped.
- if err := parent.terminate(); err != nil {
- logrus.Warnf("parent process terminate error: %v", err)
- }
- return newSystemErrorWithCause(err, "starting container process")
- }
- // generate a timestamp indicating when the container was started
- c.created = time.Now().UTC()
- if process.Init {
- c.state = &createdState{
- c: c,
- }
- state, err := c.updateState(parent)
- if err != nil {
- return err
- }
- c.initProcessStartTime = state.InitProcessStartTime
-
- if c.config.Hooks != nil {
- s := configs.HookState{
- SpecState: configs.SpecState{
- Version: c.config.Version,
- ID: c.id,
- Pid: parent.pid(),
- Bundle: utils.SearchLabels(c.config.Labels, "bundle"),
- },
- Root: c.config.Rootfs,
- }
- for i, hook := range c.config.Hooks.Poststart {
- logrus.Infof("run poststart hook %d:%s, ContainerID: %s", i, hook.Info(), s.ID)
- if err := hook.Run(s); err != nil {
- logrus.Warnf("running poststart hook %d:%s failed: %s, ContainerId: %s", i, hook.Info(), err, s.ID)
- }
- }
- }
- } else {
- c.state = &runningState{
- c: c,
- }
- }
- return nil
-}
-
-func (c *linuxContainer) Signal(s os.Signal, all bool) error {
- if all {
- return signalAllProcesses(c.cgroupManager, s)
- }
- status, err := c.currentStatus()
- if err != nil {
- return err
- }
- // to avoid a PID reuse attack
- if status == Running || status == Created {
- if err := c.initProcess.signal(s); err != nil {
- return newSystemErrorWithCause(err, "signaling init process")
- }
- return nil
- }
- return newGenericError(fmt.Errorf("container not running"), ContainerNotRunning)
-}
-
-func (c *linuxContainer) createExecFifo() error {
- rootuid, err := c.Config().HostRootUID()
- if err != nil {
- return err
- }
- rootgid, err := c.Config().HostRootGID()
- if err != nil {
- return err
- }
-
- fifoName := filepath.Join(c.root, execFifoFilename)
- if _, err := os.Stat(fifoName); err == nil {
- return fmt.Errorf("exec fifo %s already exists", fifoName)
- }
- oldMask := syscall.Umask(0000)
- if err := syscall.Mkfifo(fifoName, 0622); err != nil {
- syscall.Umask(oldMask)
- return err
- }
- syscall.Umask(oldMask)
- if err := os.Chown(fifoName, rootuid, rootgid); err != nil {
- return err
- }
- return nil
-}
-
-func (c *linuxContainer) deleteExecFifo() {
- fifoName := filepath.Join(c.root, execFifoFilename)
- os.Remove(fifoName)
-}
-
-func (c *linuxContainer) newParentProcess(p *Process) (parentProcess, error) {
- parentPipe, childPipe, err := utils.NewSockPair("init")
- if err != nil {
- return nil, newSystemErrorWithCause(err, "creating new init pipe")
- }
- if err := logs.InitLogPipe(); err != nil {
- return nil, fmt.Errorf("Unable to create the log pipe: %s", err)
- }
- cmd, err := c.commandTemplate(p, childPipe)
- if err != nil {
- return nil, newSystemErrorWithCause(err, "creating new command template")
- }
- if !p.Init {
- return c.newSetnsProcess(p, cmd, parentPipe, childPipe)
- }
-
- // We only set up rootDir if we're not doing a `runc exec`. The reason for
- // this is to avoid cases where a racing, unprivileged process inside the
- // container can get access to the statedir file descriptor (which would
- // allow for container rootfs escape).
- rootDir, err := os.Open(c.root)
- if err != nil {
- return nil, err
- }
- cmd.ExtraFiles = append(cmd.ExtraFiles, rootDir)
- cmd.Env = append(cmd.Env,
- fmt.Sprintf("_LIBCONTAINER_STATEDIR=%d", stdioFdCount+len(cmd.ExtraFiles)-1))
- return c.newInitProcess(p, cmd, parentPipe, childPipe, rootDir)
-}
-
-func (c *linuxContainer) commandTemplate(p *Process, childPipe *os.File) (*exec.Cmd, error) {
- cmd := exec.Command(c.initArgs[0], c.initArgs[1:]...)
- cmd.Stdin = p.Stdin
- cmd.Stdout = p.Stdout
- cmd.Stderr = p.Stderr
- cmd.Dir = c.config.Rootfs
- if cmd.SysProcAttr == nil {
- cmd.SysProcAttr = &syscall.SysProcAttr{}
- }
- cmd.ExtraFiles = append(cmd.ExtraFiles, p.ExtraFiles...)
- if p.ConsoleSocket != nil {
- cmd.ExtraFiles = append(cmd.ExtraFiles, p.ConsoleSocket)
- cmd.Env = append(cmd.Env,
- fmt.Sprintf("_LIBCONTAINER_CONSOLE=%d", stdioFdCount+len(cmd.ExtraFiles)-1),
- )
- }
- cmd.ExtraFiles = append(cmd.ExtraFiles, childPipe)
- cmd.Env = append(cmd.Env,
- fmt.Sprintf("_LIBCONTAINER_INITPIPE=%d", stdioFdCount+len(cmd.ExtraFiles)-1),
- )
-
- cmd.ExtraFiles = append(cmd.ExtraFiles, logs.ChildLogPipe)
- cmd.Env = append(cmd.Env,
- fmt.Sprintf("_LIBCONTAINER_LOGPIPE=%d", stdioFdCount+len(cmd.ExtraFiles)-1),
- )
-
- // NOTE: when running a container with no PID namespace and the parent process spawning the container is
- // PID1 the pdeathsig is being delivered to the container's init process by the kernel for some reason
- // even with the parent still running.
- if c.config.ParentDeathSignal > 0 {
- cmd.SysProcAttr.Pdeathsig = syscall.Signal(c.config.ParentDeathSignal)
- }
- return cmd, nil
-}
-
-func (c *linuxContainer) newInitProcess(p *Process, cmd *exec.Cmd, parentPipe, childPipe, rootDir *os.File) (*initProcess, error) {
- cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initStandard))
- nsMaps := make(map[configs.NamespaceType]string)
- for _, ns := range c.config.Namespaces {
- if ns.Path != "" {
- nsMaps[ns.Type] = ns.Path
- }
- }
- data, err := c.bootstrapData(c.config.Namespaces.CloneFlags(), nsMaps)
- if err != nil {
- return nil, err
- }
- return &initProcess{
- cmd: cmd,
- childPipe: childPipe,
- parentPipe: parentPipe,
- manager: c.cgroupManager,
- config: c.newInitConfig(p),
- container: c,
- process: p,
- bootstrapData: data,
- sharePidns: !c.config.Namespaces.IsPrivate(configs.NEWPID),
- rootDir: rootDir,
- }, nil
-}
-
-func (c *linuxContainer) newSetnsProcess(p *Process, cmd *exec.Cmd, parentPipe, childPipe *os.File) (*setnsProcess, error) {
- cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initSetns))
- state, err := c.currentState()
- if err != nil {
- return nil, newSystemErrorWithCause(err, "getting container's current state")
- }
- // for setns process, we don't have to set cloneflags as the process namespaces
- // will only be set via setns syscall
- data, err := c.bootstrapData(0, state.NamespacePaths)
- if err != nil {
- return nil, err
- }
- return &setnsProcess{
- cmd: cmd,
- cgroupPaths: c.cgroupManager.GetPaths(),
- childPipe: childPipe,
- parentPipe: parentPipe,
- config: c.newInitConfig(p),
- process: p,
- bootstrapData: data,
- }, nil
-}
-
-func (c *linuxContainer) newInitConfig(process *Process) *initConfig {
- cfg := &initConfig{
- Config: c.config,
- Args: process.Args,
- Env: process.Env,
- User: process.User,
- AdditionalGroups: process.AdditionalGroups,
- Cwd: process.Cwd,
- Capabilities: process.Capabilities,
- PassedFilesCount: len(process.ExtraFiles),
- ContainerId: c.ID(),
- NoNewPrivileges: c.config.NoNewPrivileges,
- Rootless: c.config.Rootless,
- AppArmorProfile: c.config.AppArmorProfile,
- ProcessLabel: c.config.ProcessLabel,
- Rlimits: c.config.Rlimits,
- }
- if process.NoNewPrivileges != nil {
- cfg.NoNewPrivileges = *process.NoNewPrivileges
- }
- if process.AppArmorProfile != "" {
- cfg.AppArmorProfile = process.AppArmorProfile
- }
- if process.Label != "" {
- cfg.ProcessLabel = process.Label
- }
- if len(process.Rlimits) > 0 {
- cfg.Rlimits = process.Rlimits
- }
- cfg.CreateConsole = process.ConsoleSocket != nil
- return cfg
-}
-
-func (c *linuxContainer) Destroy() error {
- c.m.Lock()
- defer c.m.Unlock()
- return c.state.destroy()
-}
-
-func (c *linuxContainer) Pause() error {
- c.m.Lock()
- defer c.m.Unlock()
- status, err := c.currentStatus()
- if err != nil {
- return err
- }
- switch status {
- case Running, Created:
- if err := c.cgroupManager.Freeze(configs.Frozen); err != nil {
- return err
- }
- return c.state.transition(&pausedState{
- c: c,
- })
- }
- return newGenericError(fmt.Errorf("container not running or created: %s", status), ContainerNotRunning)
-}
-
-func (c *linuxContainer) Resume() error {
- c.m.Lock()
- defer c.m.Unlock()
- status, err := c.currentStatus()
- if err != nil {
- return err
- }
- if status != Paused {
- return newGenericError(fmt.Errorf("container not paused"), ContainerNotPaused)
- }
- if err := c.cgroupManager.Freeze(configs.Thawed); err != nil {
- return err
- }
- return c.state.transition(&runningState{
- c: c,
- })
-}
-
-func (c *linuxContainer) NotifyOOM() (<-chan struct{}, error) {
- // XXX(cyphar): This requires cgroups.
- if c.config.Rootless {
- return nil, fmt.Errorf("cannot get OOM notifications from rootless container")
- }
- return notifyOnOOM(c.cgroupManager.GetPaths())
-}
-
-func (c *linuxContainer) NotifyMemoryPressure(level PressureLevel) (<-chan struct{}, error) {
- // XXX(cyphar): This requires cgroups.
- if c.config.Rootless {
- return nil, fmt.Errorf("cannot get memory pressure notifications from rootless container")
- }
- return notifyMemoryPressure(c.cgroupManager.GetPaths(), level)
-}
-
-var criuFeatures *criurpc.CriuFeatures
-
-func (c *linuxContainer) checkCriuFeatures(criuOpts *CriuOpts, rpcOpts *criurpc.CriuOpts, criuFeat *criurpc.CriuFeatures) error {
-
- var t criurpc.CriuReqType
- t = criurpc.CriuReqType_FEATURE_CHECK
-
- if err := c.checkCriuVersion("1.8"); err != nil {
- // Feature checking was introduced with CRIU 1.8.
- // Ignore the feature check if an older CRIU version is used
- // and just act as before.
- // As all automated PR testing is done using CRIU 1.7 this
- // code will not be tested by automated PR testing.
- return nil
- }
-
- // make sure the features we are looking for are really not from
- // some previous check
- criuFeatures = nil
-
- req := &criurpc.CriuReq{
- Type: &t,
- // Theoretically this should not be necessary but CRIU
- // segfaults if Opts is empty.
- // Fixed in CRIU 2.12
- Opts: rpcOpts,
- Features: criuFeat,
- }
-
- err := c.criuSwrk(nil, req, criuOpts, false)
- if err != nil {
- logrus.Debugf("%s", err)
- return fmt.Errorf("CRIU feature check failed")
- }
-
- logrus.Debugf("Feature check says: %s", criuFeatures)
- missingFeatures := false
-
- if *criuFeat.MemTrack && !*criuFeatures.MemTrack {
- missingFeatures = true
- logrus.Debugf("CRIU does not support MemTrack")
- }
-
- if missingFeatures {
- return fmt.Errorf("CRIU is missing features")
- }
-
- return nil
-}
-
-// checkCriuVersion checks Criu version greater than or equal to minVersion
-func (c *linuxContainer) checkCriuVersion(minVersion string) error {
- var x, y, z, versionReq int
-
- _, err := fmt.Sscanf(minVersion, "%d.%d.%d\n", &x, &y, &z) // 1.5.2
- if err != nil {
- _, err = fmt.Sscanf(minVersion, "Version: %d.%d\n", &x, &y) // 1.6
- }
- versionReq = x*10000 + y*100 + z
-
- out, err := exec.Command(c.criuPath, "-V").Output()
- if err != nil {
- return fmt.Errorf("Unable to execute CRIU command: %s", c.criuPath)
- }
-
- x = 0
- y = 0
- z = 0
- if ep := strings.Index(string(out), "-"); ep >= 0 {
- // criu Git version format
- var version string
- if sp := strings.Index(string(out), "GitID"); sp > 0 {
- version = string(out)[sp:ep]
- } else {
- return fmt.Errorf("Unable to parse the CRIU version: %s", c.criuPath)
- }
-
- n, err := fmt.Sscanf(string(version), "GitID: v%d.%d.%d", &x, &y, &z) // 1.5.2
- if err != nil {
- n, err = fmt.Sscanf(string(version), "GitID: v%d.%d", &x, &y) // 1.6
- y++
- } else {
- z++
- }
- if n < 2 || err != nil {
- return fmt.Errorf("Unable to parse the CRIU version: %s %d %s", version, n, err)
- }
- } else {
- // criu release version format
- n, err := fmt.Sscanf(string(out), "Version: %d.%d.%d\n", &x, &y, &z) // 1.5.2
- if err != nil {
- n, err = fmt.Sscanf(string(out), "Version: %d.%d\n", &x, &y) // 1.6
- }
- if n < 2 || err != nil {
- return fmt.Errorf("Unable to parse the CRIU version: %s %d %s", out, n, err)
- }
- }
-
- c.criuVersion = x*10000 + y*100 + z
-
- if c.criuVersion < versionReq {
- return fmt.Errorf("CRIU version must be %s or higher", minVersion)
- }
-
- return nil
-}
-
-const descriptorsFilename = "descriptors.json"
-
-func (c *linuxContainer) addCriuDumpMount(req *criurpc.CriuReq, m *configs.Mount) {
- mountDest := m.Destination
- if strings.HasPrefix(mountDest, c.config.Rootfs) {
- mountDest = mountDest[len(c.config.Rootfs):]
- }
-
- extMnt := &criurpc.ExtMountMap{
- Key: proto.String(mountDest),
- Val: proto.String(mountDest),
- }
- req.Opts.ExtMnt = append(req.Opts.ExtMnt, extMnt)
-}
-
-func (c *linuxContainer) addMaskPaths(req *criurpc.CriuReq) error {
- for _, path := range c.config.MaskPaths {
- fi, err := os.Stat(fmt.Sprintf("/proc/%d/root/%s", c.initProcess.pid(), path))
- if err != nil {
- if os.IsNotExist(err) {
- continue
- }
- return err
- }
- if fi.IsDir() {
- continue
- }
-
- extMnt := &criurpc.ExtMountMap{
- Key: proto.String(path),
- Val: proto.String("/dev/null"),
- }
- req.Opts.ExtMnt = append(req.Opts.ExtMnt, extMnt)
- }
-
- return nil
-}
-
-func (c *linuxContainer) Checkpoint(criuOpts *CriuOpts) error {
- c.m.Lock()
- defer c.m.Unlock()
-
- // TODO(avagin): Figure out how to make this work nicely. CRIU 2.0 has
- // support for doing unprivileged dumps, but the setup of
- // rootless containers might make this complicated.
- if c.config.Rootless {
- return fmt.Errorf("cannot checkpoint a rootless container")
- }
-
- if err := c.checkCriuVersion("1.5.2"); err != nil {
- return err
- }
-
- if criuOpts.ImagesDirectory == "" {
- return fmt.Errorf("invalid directory to save checkpoint")
- }
-
- // Since a container can be C/R'ed multiple times,
- // the checkpoint directory may already exist.
- if err := os.Mkdir(criuOpts.ImagesDirectory, 0755); err != nil && !os.IsExist(err) {
- return err
- }
-
- if criuOpts.WorkDirectory == "" {
- criuOpts.WorkDirectory = filepath.Join(c.root, "criu.work")
- }
-
- if err := os.Mkdir(criuOpts.WorkDirectory, 0755); err != nil && !os.IsExist(err) {
- return err
- }
-
- workDir, err := os.Open(criuOpts.WorkDirectory)
- if err != nil {
- return err
- }
- defer workDir.Close()
-
- imageDir, err := os.Open(criuOpts.ImagesDirectory)
- if err != nil {
- return err
- }
- defer imageDir.Close()
-
- rpcOpts := criurpc.CriuOpts{
- ImagesDirFd: proto.Int32(int32(imageDir.Fd())),
- WorkDirFd: proto.Int32(int32(workDir.Fd())),
- LogLevel: proto.Int32(4),
- LogFile: proto.String("dump.log"),
- Root: proto.String(c.config.Rootfs),
- ManageCgroups: proto.Bool(true),
- NotifyScripts: proto.Bool(true),
- Pid: proto.Int32(int32(c.initProcess.pid())),
- ShellJob: proto.Bool(criuOpts.ShellJob),
- LeaveRunning: proto.Bool(criuOpts.LeaveRunning),
- TcpEstablished: proto.Bool(criuOpts.TcpEstablished),
- ExtUnixSk: proto.Bool(criuOpts.ExternalUnixConnections),
- FileLocks: proto.Bool(criuOpts.FileLocks),
- EmptyNs: proto.Uint32(criuOpts.EmptyNs),
- }
-
- // append optional criu opts, e.g., page-server and port
- if criuOpts.PageServer.Address != "" && criuOpts.PageServer.Port != 0 {
- rpcOpts.Ps = &criurpc.CriuPageServerInfo{
- Address: proto.String(criuOpts.PageServer.Address),
- Port: proto.Int32(criuOpts.PageServer.Port),
- }
- }
-
- //pre-dump may need parentImage param to complete iterative migration
- if criuOpts.ParentImage != "" {
- rpcOpts.ParentImg = proto.String(criuOpts.ParentImage)
- rpcOpts.TrackMem = proto.Bool(true)
- }
-
- // append optional manage cgroups mode
- if criuOpts.ManageCgroupsMode != 0 {
- if err := c.checkCriuVersion("1.7"); err != nil {
- return err
- }
- mode := criurpc.CriuCgMode(criuOpts.ManageCgroupsMode)
- rpcOpts.ManageCgroupsMode = &mode
- }
-
- var t criurpc.CriuReqType
- if criuOpts.PreDump {
- feat := criurpc.CriuFeatures{
- MemTrack: proto.Bool(true),
- }
-
- if err := c.checkCriuFeatures(criuOpts, &rpcOpts, &feat); err != nil {
- return err
- }
-
- t = criurpc.CriuReqType_PRE_DUMP
- } else {
- t = criurpc.CriuReqType_DUMP
- }
- req := &criurpc.CriuReq{
- Type: &t,
- Opts: &rpcOpts,
- }
-
- //no need to dump these information in pre-dump
- if !criuOpts.PreDump {
- for _, m := range c.config.Mounts {
- switch m.Device {
- case "bind":
- c.addCriuDumpMount(req, m)
- break
- case "cgroup":
- binds, err := getCgroupMounts(m)
- if err != nil {
- return err
- }
- for _, b := range binds {
- c.addCriuDumpMount(req, b)
- }
- break
- }
- }
-
- if err := c.addMaskPaths(req); err != nil {
- return err
- }
-
- for _, node := range c.config.Devices {
- m := &configs.Mount{Destination: node.Path, Source: node.Path}
- c.addCriuDumpMount(req, m)
- }
-
- // Write the FD info to a file in the image directory
- fdsJSON, err := json.Marshal(c.initProcess.externalDescriptors())
- if err != nil {
- return err
- }
-
- err = ioutil.WriteFile(filepath.Join(criuOpts.ImagesDirectory, descriptorsFilename), fdsJSON, 0655)
- if err != nil {
- return err
- }
- }
-
- err = c.criuSwrk(nil, req, criuOpts, false)
- if err != nil {
- return err
- }
- return nil
-}
-
-func (c *linuxContainer) addCriuRestoreMount(req *criurpc.CriuReq, m *configs.Mount) {
- mountDest := m.Destination
- if strings.HasPrefix(mountDest, c.config.Rootfs) {
- mountDest = mountDest[len(c.config.Rootfs):]
- }
-
- extMnt := &criurpc.ExtMountMap{
- Key: proto.String(mountDest),
- Val: proto.String(m.Source),
- }
- req.Opts.ExtMnt = append(req.Opts.ExtMnt, extMnt)
-}
-
-func (c *linuxContainer) restoreNetwork(req *criurpc.CriuReq, criuOpts *CriuOpts) {
- for _, iface := range c.config.Networks {
- switch iface.Type {
- case "veth":
- veth := new(criurpc.CriuVethPair)
- veth.IfOut = proto.String(iface.HostInterfaceName)
- veth.IfIn = proto.String(iface.Name)
- req.Opts.Veths = append(req.Opts.Veths, veth)
- break
- case "loopback":
- break
- }
- }
- for _, i := range criuOpts.VethPairs {
- veth := new(criurpc.CriuVethPair)
- veth.IfOut = proto.String(i.HostInterfaceName)
- veth.IfIn = proto.String(i.ContainerInterfaceName)
- req.Opts.Veths = append(req.Opts.Veths, veth)
- }
-}
-
-func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error {
- c.m.Lock()
- defer c.m.Unlock()
-
- // TODO(avagin): Figure out how to make this work nicely. CRIU doesn't have
- // support for unprivileged restore at the moment.
- if c.config.Rootless {
- return fmt.Errorf("cannot restore a rootless container")
- }
-
- if err := c.checkCriuVersion("1.5.2"); err != nil {
- return err
- }
- if criuOpts.WorkDirectory == "" {
- criuOpts.WorkDirectory = filepath.Join(c.root, "criu.work")
- }
- // Since a container can be C/R'ed multiple times,
- // the work directory may already exist.
- if err := os.Mkdir(criuOpts.WorkDirectory, 0655); err != nil && !os.IsExist(err) {
- return err
- }
- workDir, err := os.Open(criuOpts.WorkDirectory)
- if err != nil {
- return err
- }
- defer workDir.Close()
- if criuOpts.ImagesDirectory == "" {
- return fmt.Errorf("invalid directory to restore checkpoint")
- }
- imageDir, err := os.Open(criuOpts.ImagesDirectory)
- if err != nil {
- return err
- }
- defer imageDir.Close()
- // CRIU has a few requirements for a root directory:
- // * it must be a mount point
- // * its parent must not be overmounted
- // c.config.Rootfs is bind-mounted to a temporary directory
- // to satisfy these requirements.
- root := filepath.Join(c.root, "criu-root")
- if err := os.Mkdir(root, 0755); err != nil {
- return err
- }
- defer os.Remove(root)
- root, err = filepath.EvalSymlinks(root)
- if err != nil {
- return err
- }
- err = syscall.Mount(c.config.Rootfs, root, "", syscall.MS_BIND|syscall.MS_REC, "")
- if err != nil {
- return err
- }
- defer syscall.Unmount(root, syscall.MNT_DETACH)
- t := criurpc.CriuReqType_RESTORE
- req := &criurpc.CriuReq{
- Type: &t,
- Opts: &criurpc.CriuOpts{
- ImagesDirFd: proto.Int32(int32(imageDir.Fd())),
- WorkDirFd: proto.Int32(int32(workDir.Fd())),
- EvasiveDevices: proto.Bool(true),
- LogLevel: proto.Int32(4),
- LogFile: proto.String("restore.log"),
- RstSibling: proto.Bool(true),
- Root: proto.String(root),
- ManageCgroups: proto.Bool(true),
- NotifyScripts: proto.Bool(true),
- ShellJob: proto.Bool(criuOpts.ShellJob),
- ExtUnixSk: proto.Bool(criuOpts.ExternalUnixConnections),
- TcpEstablished: proto.Bool(criuOpts.TcpEstablished),
- FileLocks: proto.Bool(criuOpts.FileLocks),
- EmptyNs: proto.Uint32(criuOpts.EmptyNs),
- },
- }
-
- for _, m := range c.config.Mounts {
- switch m.Device {
- case "bind":
- c.addCriuRestoreMount(req, m)
- break
- case "cgroup":
- binds, err := getCgroupMounts(m)
- if err != nil {
- return err
- }
- for _, b := range binds {
- c.addCriuRestoreMount(req, b)
- }
- break
- }
- }
-
- if len(c.config.MaskPaths) > 0 {
- m := &configs.Mount{Destination: "/dev/null", Source: "/dev/null"}
- c.addCriuRestoreMount(req, m)
- }
-
- for _, node := range c.config.Devices {
- m := &configs.Mount{Destination: node.Path, Source: node.Path}
- c.addCriuRestoreMount(req, m)
- }
-
- if criuOpts.EmptyNs&syscall.CLONE_NEWNET == 0 {
- c.restoreNetwork(req, criuOpts)
- }
-
- // append optional manage cgroups mode
- if criuOpts.ManageCgroupsMode != 0 {
- if err := c.checkCriuVersion("1.7"); err != nil {
- return err
- }
- mode := criurpc.CriuCgMode(criuOpts.ManageCgroupsMode)
- req.Opts.ManageCgroupsMode = &mode
- }
-
- var (
- fds []string
- fdJSON []byte
- )
- if fdJSON, err = ioutil.ReadFile(filepath.Join(criuOpts.ImagesDirectory, descriptorsFilename)); err != nil {
- return err
- }
-
- if err := json.Unmarshal(fdJSON, &fds); err != nil {
- return err
- }
- for i := range fds {
- if s := fds[i]; strings.Contains(s, "pipe:") {
- inheritFd := new(criurpc.InheritFd)
- inheritFd.Key = proto.String(s)
- inheritFd.Fd = proto.Int32(int32(i))
- req.Opts.InheritFd = append(req.Opts.InheritFd, inheritFd)
- }
- }
- return c.criuSwrk(process, req, criuOpts, true)
-}
-
-func (c *linuxContainer) criuApplyCgroups(pid int, req *criurpc.CriuReq) error {
- // XXX: Do we need to deal with this case? AFAIK criu still requires root.
- if err := c.cgroupManager.Apply(pid); err != nil {
- return err
- }
-
- if err := c.cgroupManager.Set(c.config); err != nil {
- return newSystemError(err)
- }
-
- path := fmt.Sprintf("/proc/%d/cgroup", pid)
- cgroupsPaths, err := cgroups.ParseCgroupFile(path)
- if err != nil {
- return err
- }
-
- for c, p := range cgroupsPaths {
- cgroupRoot := &criurpc.CgroupRoot{
- Ctrl: proto.String(c),
- Path: proto.String(p),
- }
- req.Opts.CgRoot = append(req.Opts.CgRoot, cgroupRoot)
- }
-
- return nil
-}
-
-func (c *linuxContainer) criuSwrk(process *Process, req *criurpc.CriuReq, opts *CriuOpts, applyCgroups bool) error {
- fds, err := syscall.Socketpair(syscall.AF_LOCAL, syscall.SOCK_SEQPACKET|syscall.SOCK_CLOEXEC, 0)
- if err != nil {
- return err
- }
-
- logPath := filepath.Join(opts.WorkDirectory, req.GetOpts().GetLogFile())
- criuClient := os.NewFile(uintptr(fds[0]), "criu-transport-client")
- criuServer := os.NewFile(uintptr(fds[1]), "criu-transport-server")
- defer criuClient.Close()
- defer criuServer.Close()
-
- args := []string{"swrk", "3"}
- logrus.Debugf("Using CRIU %d at: %s", c.criuVersion, c.criuPath)
- logrus.Debugf("Using CRIU with following args: %s", args)
- cmd := exec.Command(c.criuPath, args...)
- if process != nil {
- cmd.Stdin = process.Stdin
- cmd.Stdout = process.Stdout
- cmd.Stderr = process.Stderr
- }
- cmd.ExtraFiles = append(cmd.ExtraFiles, criuServer)
-
- if err := cmd.Start(); err != nil {
- return err
- }
- criuServer.Close()
-
- defer func() {
- criuClient.Close()
- _, err := cmd.Process.Wait()
- if err != nil {
- return
- }
- }()
-
- if applyCgroups {
- err := c.criuApplyCgroups(cmd.Process.Pid, req)
- if err != nil {
- return err
- }
- }
-
- var extFds []string
- if process != nil {
- extFds, err = getPipeFds(cmd.Process.Pid)
- if err != nil {
- return err
- }
- }
-
- logrus.Debugf("Using CRIU in %s mode", req.GetType().String())
- // In the case of criurpc.CriuReqType_FEATURE_CHECK req.GetOpts()
- // should be empty. For older CRIU versions it still will be
- // available but empty.
- if req.GetType() != criurpc.CriuReqType_FEATURE_CHECK {
- val := reflect.ValueOf(req.GetOpts())
- v := reflect.Indirect(val)
- for i := 0; i < v.NumField(); i++ {
- st := v.Type()
- name := st.Field(i).Name
- if strings.HasPrefix(name, "XXX_") {
- continue
- }
- value := val.MethodByName("Get" + name).Call([]reflect.Value{})
- logrus.Debugf("CRIU option %s with value %v", name, value[0])
- }
- }
- data, err := proto.Marshal(req)
- if err != nil {
- return err
- }
- _, err = criuClient.Write(data)
- if err != nil {
- return err
- }
-
- buf := make([]byte, 10*4096)
- for true {
- n, err := criuClient.Read(buf)
- if err != nil {
- return err
- }
- if n == 0 {
- return fmt.Errorf("unexpected EOF")
- }
- if n == len(buf) {
- return fmt.Errorf("buffer is too small")
- }
-
- resp := new(criurpc.CriuResp)
- err = proto.Unmarshal(buf[:n], resp)
- if err != nil {
- return err
- }
- if !resp.GetSuccess() {
- typeString := req.GetType().String()
- return fmt.Errorf("criu failed: type %s errno %d\nlog file: %s", typeString, resp.GetCrErrno(), logPath)
- }
-
- t := resp.GetType()
- switch {
- case t == criurpc.CriuReqType_FEATURE_CHECK:
- logrus.Debugf("Feature check says: %s", resp)
- criuFeatures = resp.GetFeatures()
- break
- case t == criurpc.CriuReqType_NOTIFY:
- if err := c.criuNotifications(resp, process, opts, extFds); err != nil {
- return err
- }
- t = criurpc.CriuReqType_NOTIFY
- req = &criurpc.CriuReq{
- Type: &t,
- NotifySuccess: proto.Bool(true),
- }
- data, err = proto.Marshal(req)
- if err != nil {
- return err
- }
- _, err = criuClient.Write(data)
- if err != nil {
- return err
- }
- continue
- case t == criurpc.CriuReqType_RESTORE:
- case t == criurpc.CriuReqType_DUMP:
- break
- case t == criurpc.CriuReqType_PRE_DUMP:
- // In pre-dump mode CRIU is in a loop and waits for
- // the final DUMP command.
- // The current runc pre-dump approach, however, is
- // start criu in PRE_DUMP once for a single pre-dump
- // and not the whole series of pre-dump, pre-dump, ...m, dump
- // If we got the message CriuReqType_PRE_DUMP it means
- // CRIU was successful and we need to forcefully stop CRIU
- logrus.Debugf("PRE_DUMP finished. Send close signal to CRIU service")
- criuClient.Close()
- // Process status won't be success, because one end of sockets is closed
- _, err := cmd.Process.Wait()
- if err != nil {
- logrus.Debugf("After PRE_DUMP CRIU exiting failed")
- return err
- }
- return nil
- default:
- return fmt.Errorf("unable to parse the response %s", resp.String())
- }
-
- break
- }
-
- // cmd.Wait() waits cmd.goroutines which are used for proxying file descriptors.
- // Here we want to wait only the CRIU process.
- st, err := cmd.Process.Wait()
- if err != nil {
- return err
- }
- if !st.Success() {
- return fmt.Errorf("criu failed: %s\nlog file: %s", st.String(), logPath)
- }
- return nil
-}
-
-// block any external network activity
-func lockNetwork(config *configs.Config) error {
- for _, config := range config.Networks {
- strategy, err := getStrategy(config.Type)
- if err != nil {
- return err
- }
-
- if err := strategy.detach(config); err != nil {
- return err
- }
- }
- return nil
-}
-
-func unlockNetwork(config *configs.Config) error {
- for _, config := range config.Networks {
- strategy, err := getStrategy(config.Type)
- if err != nil {
- return err
- }
- if err = strategy.attach(config); err != nil {
- return err
- }
- }
- return nil
-}
-
-func (c *linuxContainer) criuNotifications(resp *criurpc.CriuResp, process *Process, opts *CriuOpts, fds []string) error {
- notify := resp.GetNotify()
- if notify == nil {
- return fmt.Errorf("invalid response: %s", resp.String())
- }
- switch {
- case notify.GetScript() == "post-dump":
- f, err := os.Create(filepath.Join(c.root, "checkpoint"))
- if err != nil {
- return err
- }
- f.Close()
- case notify.GetScript() == "network-unlock":
- if err := unlockNetwork(c.config); err != nil {
- return err
- }
- case notify.GetScript() == "network-lock":
- if err := lockNetwork(c.config); err != nil {
- return err
- }
- case notify.GetScript() == "setup-namespaces":
- if c.config.Hooks != nil {
- s := configs.HookState{
- SpecState: configs.SpecState{
- Version: c.config.Version,
- ID: c.id,
- Pid: int(notify.GetPid()),
- Bundle: utils.SearchLabels(c.config.Labels, "bundle"),
- },
- Root: c.config.Rootfs,
- }
- for i, hook := range c.config.Hooks.Prestart {
- logrus.Infof("run prestart hook: %d:%s, ContainerID: %s", i, hook.Info(), s.ID)
- if err := hook.Run(s); err != nil {
- return newSystemErrorWithCausef(err, "running prestart hook %d:%s, ContainerID: %s", i, hook.Info(), s.ID)
- }
- logrus.Infof("prestart hook: %d:%s done", i, hook.Info())
- }
- }
- case notify.GetScript() == "post-restore":
- pid := notify.GetPid()
- r, err := newRestoredProcess(int(pid), fds)
- if err != nil {
- return err
- }
- process.ops = r
- if err := c.state.transition(&restoredState{
- imageDir: opts.ImagesDirectory,
- c: c,
- }); err != nil {
- return err
- }
- // create a timestamp indicating when the restored checkpoint was started
- c.created = time.Now().UTC()
- if _, err := c.updateState(r); err != nil {
- return err
- }
- if err := os.Remove(filepath.Join(c.root, "checkpoint")); err != nil {
- if !os.IsNotExist(err) {
- logrus.Error(err)
- }
- }
- }
- return nil
-}
-
-func (c *linuxContainer) updateState(process parentProcess) (*State, error) {
- c.initProcess = process
- state, err := c.currentState()
- if err != nil {
- return nil, err
- }
- err = c.saveState(state)
- if err != nil {
- return nil, err
- }
- return state, nil
-}
-
-func (c *linuxContainer) saveState(s *State) error {
- f, err := os.Create(filepath.Join(c.root, stateFilename))
- if err != nil {
- return err
- }
- defer f.Close()
- return utils.WriteJSON(f, s)
-}
-
-func (c *linuxContainer) deleteState() error {
- return os.Remove(filepath.Join(c.root, stateFilename))
-}
-
-func (c *linuxContainer) currentStatus() (Status, error) {
- if err := c.refreshState(); err != nil {
- return -1, err
- }
- return c.state.status(), nil
-}
-
-// refreshState needs to be called to verify that the current state on the
-// container is what is true. Because consumers of libcontainer can use it
-// out of process we need to verify the container's status based on runtime
-// information and not rely on our in process info.
-func (c *linuxContainer) refreshState() error {
- paused, err := c.isPaused()
- if err != nil {
- return err
- }
- if paused {
- return c.state.transition(&pausedState{c: c})
- }
- t, err := c.runType()
- if err != nil {
- return err
- }
- switch t {
- case Created:
- return c.state.transition(&createdState{c: c})
- case Running:
- return c.state.transition(&runningState{c: c})
- }
- return c.state.transition(&stoppedState{c: c})
-}
-
-// doesInitProcessExist checks if the init process is still the same process
-// as the initial one, it could happen that the original process has exited
-// and a new process has been created with the same pid, in this case, the
-// container would already be stopped.
-func (c *linuxContainer) doesInitProcessExist(initPid int) (bool, error) {
- startTime, err := system.GetProcessStartTime(initPid)
- if err != nil {
- return false, nil
- }
- if c.initProcessStartTime != startTime {
- return false, nil
- }
- return true, nil
-}
-
-func (c *linuxContainer) runType() (Status, error) {
- if c.initProcess == nil {
- return Stopped, nil
- }
- pid := c.initProcess.pid()
- // return Running if the init process is alive
- if err := syscall.Kill(pid, 0); err != nil {
- if err == syscall.ESRCH {
- // It means the process does not exist anymore, could happen when the
- // process exited just when we call the function, we should not return
- // error in this case.
- return Stopped, nil
- }
- return Stopped, newSystemErrorWithCausef(err, "sending signal 0 to pid %d", pid)
- }
- // check if the process is still the original init process.
- exist, err := c.doesInitProcessExist(pid)
- if !exist || err != nil {
- return Stopped, err
- }
- // We'll create exec fifo and blocking on it after container is created,
- // and delete it after start container.
- if _, err := os.Stat(filepath.Join(c.root, execFifoFilename)); err == nil {
- return Created, nil
- }
- return Running, nil
-}
-
-func (c *linuxContainer) isPaused() (bool, error) {
- fcg := c.cgroupManager.GetPaths()["freezer"]
- if fcg == "" {
- // A container doesn't have a freezer cgroup
- return false, nil
- }
- data, err := ioutil.ReadFile(filepath.Join(fcg, "freezer.state"))
- if err != nil {
- // If freezer cgroup is not mounted, the container would just be not paused.
- if os.IsNotExist(err) {
- return false, nil
- }
- return false, newSystemErrorWithCause(err, "checking if container is paused")
- }
- return bytes.Equal(bytes.TrimSpace(data), []byte("FROZEN")), nil
-}
-
-func (c *linuxContainer) currentState() (*State, error) {
- var (
- startTime string
- externalDescriptors []string
- pid = -1
- )
- if c.initProcess != nil {
- pid = c.initProcess.pid()
- startTime, _ = c.initProcess.startTime()
- externalDescriptors = c.initProcess.externalDescriptors()
- }
- state := &State{
- BaseState: BaseState{
- ID: c.ID(),
- Config: *c.config,
- InitProcessPid: pid,
- InitProcessStartTime: startTime,
- Created: c.created,
- },
- Rootless: c.config.Rootless,
- CgroupPaths: c.cgroupManager.GetPaths(),
- NamespacePaths: make(map[configs.NamespaceType]string),
- ExternalDescriptors: externalDescriptors,
- }
- if pid > 0 {
- for _, ns := range c.config.Namespaces {
- state.NamespacePaths[ns.Type] = ns.GetPath(pid)
- }
- for _, nsType := range configs.NamespaceTypes() {
- if !configs.IsNamespaceSupported(nsType) {
- continue
- }
- if _, ok := state.NamespacePaths[nsType]; !ok {
- ns := configs.Namespace{Type: nsType}
- state.NamespacePaths[ns.Type] = ns.GetPath(pid)
- }
- }
- }
- return state, nil
-}
-
-// orderNamespacePaths sorts namespace paths into a list of paths that we
-// can setns in order.
-func (c *linuxContainer) orderNamespacePaths(namespaces map[configs.NamespaceType]string) ([]string, error) {
- paths := []string{}
-
- for _, ns := range configs.NamespaceTypes() {
-
- // Remove namespaces that we don't need to join.
- if !c.config.Namespaces.Contains(ns) {
- continue
- }
-
- if p, ok := namespaces[ns]; ok && p != "" {
- // check if the requested namespace is supported
- if !configs.IsNamespaceSupported(ns) {
- return nil, newSystemError(fmt.Errorf("namespace %s is not supported", ns))
- }
- // only set to join this namespace if it exists
- if _, err := os.Lstat(p); err != nil {
- return nil, newSystemErrorWithCausef(err, "running lstat on namespace path %q", p)
- }
- // do not allow namespace path with comma as we use it to separate
- // the namespace paths
- if strings.ContainsRune(p, ',') {
- return nil, newSystemError(fmt.Errorf("invalid path %s", p))
- }
- paths = append(paths, fmt.Sprintf("%s:%s", configs.NsName(ns), p))
- }
-
- }
-
- return paths, nil
-}
-
-func encodeIDMapping(idMap []configs.IDMap) ([]byte, error) {
- data := bytes.NewBuffer(nil)
- for _, im := range idMap {
- line := fmt.Sprintf("%d %d %d\n", im.ContainerID, im.HostID, im.Size)
- if _, err := data.WriteString(line); err != nil {
- return nil, err
- }
- }
- return data.Bytes(), nil
-}
-
-// bootstrapData encodes the necessary data in netlink binary format
-// as a io.Reader.
-// Consumer can write the data to a bootstrap program
-// such as one that uses nsenter package to bootstrap the container's
-// init process correctly, i.e. with correct namespaces, uid/gid
-// mapping etc.
-func (c *linuxContainer) bootstrapData(cloneFlags uintptr, nsMaps map[configs.NamespaceType]string) (io.Reader, error) {
- // create the netlink message
- r := nl.NewNetlinkRequest(int(InitMsg), 0)
-
- // write cloneFlags
- r.AddData(&Int32msg{
- Type: CloneFlagsAttr,
- Value: uint32(cloneFlags),
- })
-
- // write custom namespace paths
- if len(nsMaps) > 0 {
- nsPaths, err := c.orderNamespacePaths(nsMaps)
- if err != nil {
- return nil, err
- }
- r.AddData(&Bytemsg{
- Type: NsPathsAttr,
- Value: []byte(strings.Join(nsPaths, ",")),
- })
- }
-
- // write namespace paths only when we are not joining an existing user ns
- _, joinExistingUser := nsMaps[configs.NEWUSER]
- if !joinExistingUser {
- // write uid mappings
- if len(c.config.UidMappings) > 0 {
- b, err := encodeIDMapping(c.config.UidMappings)
- if err != nil {
- return nil, err
- }
- r.AddData(&Bytemsg{
- Type: UidmapAttr,
- Value: b,
- })
- }
-
- // write gid mappings
- if len(c.config.GidMappings) > 0 {
- b, err := encodeIDMapping(c.config.GidMappings)
- if err != nil {
- return nil, err
- }
- r.AddData(&Bytemsg{
- Type: GidmapAttr,
- Value: b,
- })
- // The following only applies if we are root.
- if !c.config.Rootless {
- // check if we have CAP_SETGID to setgroup properly
- pid, err := capability.NewPid(os.Getpid())
- if err != nil {
- return nil, err
- }
- if !pid.Get(capability.EFFECTIVE, capability.CAP_SETGID) {
- r.AddData(&Boolmsg{
- Type: SetgroupAttr,
- Value: true,
- })
- }
- }
- }
- }
-
- // write oom_score_adj
- r.AddData(&Bytemsg{
- Type: OomScoreAdjAttr,
- Value: []byte(fmt.Sprintf("%d", c.config.OomScoreAdj)),
- })
-
- // write rootless
- r.AddData(&Boolmsg{
- Type: RootlessAttr,
- Value: c.config.Rootless,
- })
-
- return bytes.NewReader(r.Serialize()), nil
-}
diff --git a/libcontainer/factory_linux.go b/libcontainer/factory_linux.go
index 0b2aa74..15ba017 100644
--- a/libcontainer/factory_linux.go
+++ b/libcontainer/factory_linux.go
@@ -1,3 +1,4 @@
+//go:build linux
// +build linux
package libcontainer
@@ -245,10 +246,10 @@ func (l *LinuxFactory) Type() string {
// This is a low level implementation detail of the reexec and should not be consumed externally
func (l *LinuxFactory) StartInitialization() (err error) {
var (
- pipefd, rootfd int
+ pipefd, fifofd int
consoleSocket *os.File
envInitPipe = os.Getenv("_LIBCONTAINER_INITPIPE")
- envStateDir = os.Getenv("_LIBCONTAINER_STATEDIR")
+ envFifoFd = os.Getenv("_LIBCONTAINER_FIFOFD")
envConsole = os.Getenv("_LIBCONTAINER_CONSOLE")
)
@@ -264,11 +265,11 @@ func (l *LinuxFactory) StartInitialization() (err error) {
)
defer pipe.Close()
- // Only init processes have STATEDIR.
- rootfd = -1
+ // Only init processes have FIFOFD.
+ fifofd = -1
if it == initStandard {
- if rootfd, err = strconv.Atoi(envStateDir); err != nil {
- return fmt.Errorf("unable to convert _LIBCONTAINER_STATEDIR=%s to int: %s", envStateDir, err)
+ if fifofd, err = strconv.Atoi(envFifoFd); err != nil {
+ return fmt.Errorf("unable to convert _LIBCONTAINER_FIFOFD=%s to int: %s", envFifoFd, err)
}
}
@@ -309,7 +310,7 @@ func (l *LinuxFactory) StartInitialization() (err error) {
}
}()
- i, err := newContainerInit(it, pipe, consoleSocket, rootfd, logPipeFd)
+ i, err := newContainerInit(it, pipe, consoleSocket, fifofd, logPipeFd)
if err != nil {
return err
}
diff --git a/libcontainer/init_linux.go b/libcontainer/init_linux.go
index e9a83e9..fd417ca 100644
--- a/libcontainer/init_linux.go
+++ b/libcontainer/init_linux.go
@@ -1,18 +1,23 @@
+//go:build linux
// +build linux
package libcontainer
import (
"encoding/json"
+ "errors"
"fmt"
"io"
"net"
"os"
+ "path/filepath"
"strings"
"syscall"
"unsafe"
"github.com/Sirupsen/logrus"
+ "golang.org/x/sys/unix"
+
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/system"
@@ -66,7 +71,7 @@ type initer interface {
Init() error
}
-func newContainerInit(t initType, pipe *os.File, consoleSocket *os.File, stateDirFD, logFd int) (initer, error) {
+func newContainerInit(t initType, pipe *os.File, consoleSocket *os.File, fifoFd, logFd int) (initer, error) {
var config *initConfig
if err := json.NewDecoder(pipe).Decode(&config); err != nil {
return nil, err
@@ -89,7 +94,7 @@ func newContainerInit(t initType, pipe *os.File, consoleSocket *os.File, stateDi
consoleSocket: consoleSocket,
parentPid: syscall.Getppid(),
config: config,
- stateDirFD: stateDirFD,
+ fifoFd: fifoFd,
logFd: logFd,
}, nil
}
@@ -111,6 +116,32 @@ func populateProcessEnvironment(env []string) error {
return nil
}
+// verifyCwd ensures that the current directory is actually inside the mount
+// namespace root of the current process.
+func verifyCwd() error {
+ // getcwd(2) on Linux detects if cwd is outside of the rootfs of the
+ // current mount namespace root, and in that case prefixes "(unreachable)"
+ // to the returned string. glibc's getcwd(3) and Go's Getwd() both detect
+ // when this happens and return ENOENT rather than returning a non-absolute
+ // path. In both cases we can therefore easily detect if we have an invalid
+ // cwd by checking the return value of getcwd(3). See getcwd(3) for more
+ // details, and CVE-2024-21626 for the security issue that motivated this
+ // check.
+ //
+ // We have to use unix.Getwd() here because os.Getwd() has a workaround for
+ // $PWD which involves doing stat(.), which can fail if the current
+ // directory is inaccessible to the container process.
+ if wd, err := unix.Getwd(); errors.Is(err, unix.ENOENT) {
+ return errors.New("current working directory is outside of container mount namespace root -- possible container breakout detected")
+ } else if err != nil {
+ return fmt.Errorf("failed to verify if current working directory is safe: %w", err)
+ } else if !filepath.IsAbs(wd) {
+ // We shouldn't ever hit this, but check just in case.
+ return fmt.Errorf("current working directory is not absolute -- possible container breakout detected: cwd is %q", wd)
+ }
+ return nil
+}
+
// finalizeNamespace drops the caps, sets the correct user
// and working dir, and closes any leaked file descriptors
// before executing the command inside the namespace
@@ -148,6 +179,10 @@ func finalizeNamespace(config *initConfig) error {
if err := setupUser(config); err != nil {
return err
}
+ // Make sure our final working directory is inside the container.
+ if err := verifyCwd(); err != nil {
+ return err
+ }
if err := system.ClearKeepCaps(); err != nil {
return err
}
diff --git a/libcontainer/process_linux.go b/libcontainer/process_linux.go
index 5cdc30c..e786419 100644
--- a/libcontainer/process_linux.go
+++ b/libcontainer/process_linux.go
@@ -1,3 +1,4 @@
+//go:build linux
// +build linux
package libcontainer
@@ -204,7 +205,6 @@ type initProcess struct {
process *Process
bootstrapData io.Reader
sharePidns bool
- rootDir *os.File
}
func (p *initProcess) pid() int {
@@ -257,7 +257,6 @@ func (p *initProcess) start() error {
err := p.cmd.Start()
p.process.ops = p
p.childPipe.Close()
- p.rootDir.Close()
logs.CloseChild()
if err != nil {
p.process.ops = nil
diff --git a/libcontainer/setns_init_linux.go b/libcontainer/setns_init_linux.go
index 1f7ec98..e38165d 100644
--- a/libcontainer/setns_init_linux.go
+++ b/libcontainer/setns_init_linux.go
@@ -1,3 +1,4 @@
+//go:build linux
// +build linux
package libcontainer
@@ -73,5 +74,23 @@ func (l *linuxSetnsInit) Init() error {
syscall.Close(l.logFd)
}
+ // Close all file descriptors we are not passing to the container. This is
+ // necessary because the execve target could use internal runc fds as the
+ // execve path, potentially giving access to binary files from the host
+ // (which can then be opened by container processes, leading to container
+ // escapes). Note that because this operation will close any open file
+ // descriptors that are referenced by (*os.File) handles from underneath
+ // the Go runtime, we must not do any file operations after this point
+ // (otherwise the (*os.File) finaliser could close the wrong file). See
+ // CVE-2024-21626 for more information as to why this protection is
+ // necessary.
+ //
+ // This is not needed for runc-dmz, because the extra execve(2) step means
+ // that all O_CLOEXEC file descriptors have already been closed and thus
+ // the second execve(2) from runc-dmz cannot access internal file
+ // descriptors from runc.
+ if err := utils.UnsafeCloseFrom(l.config.PassedFilesCount + 3); err != nil {
+ return err
+ }
return system.Execv(l.config.Args[0], l.config.Args[0:], os.Environ())
}
diff --git a/libcontainer/standard_init_linux.go b/libcontainer/standard_init_linux.go
index 6236593..7ebf1a2 100644
--- a/libcontainer/standard_init_linux.go
+++ b/libcontainer/standard_init_linux.go
@@ -1,3 +1,4 @@
+//go:build linux
// +build linux
package libcontainer
@@ -15,14 +16,17 @@ import (
"github.com/opencontainers/runc/libcontainer/keys"
"github.com/opencontainers/runc/libcontainer/seccomp"
"github.com/opencontainers/runc/libcontainer/system"
+ "github.com/opencontainers/runc/libcontainer/utils"
"github.com/opencontainers/selinux/go-selinux/label"
+
+ "golang.org/x/sys/unix"
)
type linuxStandardInit struct {
pipe *os.File
consoleSocket *os.File
parentPid int
- stateDirFD int
+ fifoFd int
config *initConfig
logFd int
}
@@ -187,7 +191,7 @@ func (l *linuxStandardInit) Init() error {
// exec'ing the users process.
ch := make(chan Error, 1)
go func() {
- fd, err := syscall.Openat(l.stateDirFD, execFifoFilename, os.O_WRONLY|syscall.O_CLOEXEC, 0)
+ fd, err := unix.Open(fmt.Sprintf("/proc/self/fd/%d", l.fifoFd), unix.O_WRONLY|unix.O_CLOEXEC, 0)
if err != nil {
ch <- newSystemErrorWithCause(err, "openat exec fifo")
return
@@ -215,7 +219,25 @@ func (l *linuxStandardInit) Init() error {
}
// close the statedir fd before exec because the kernel resets dumpable in the wrong order
// https://github.com/torvalds/linux/blob/v4.9/fs/exec.c#L1290-L1318
- syscall.Close(l.stateDirFD)
+ unix.Close(l.fifoFd)
+ // Close all file descriptors we are not passing to the container. This is
+ // necessary because the execve target could use internal runc fds as the
+ // execve path, potentially giving access to binary files from the host
+ // (which can then be opened by container processes, leading to container
+ // escapes). Note that because this operation will close any open file
+ // descriptors that are referenced by (*os.File) handles from underneath
+ // the Go runtime, we must not do any file operations after this point
+ // (otherwise the (*os.File) finaliser could close the wrong file). See
+ // CVE-2024-21626 for more information as to why this protection is
+ // necessary.
+ //
+ // This is not needed for runc-dmz, because the extra execve(2) step means
+ // that all O_CLOEXEC file descriptors have already been closed and thus
+ // the second execve(2) from runc-dmz cannot access internal file
+ // descriptors from runc.
+ if err := utils.UnsafeCloseFrom(l.config.PassedFilesCount + 3); err != nil {
+ return err
+ }
if err := syscall.Exec(name, l.config.Args[0:], os.Environ()); err != nil {
return newSystemErrorWithCause(err, "exec user process")
}
diff --git a/libcontainer/standard_init_linux.go.orig b/libcontainer/standard_init_linux.go.orig
deleted file mode 100644
index 611b91d..0000000
--- a/libcontainer/standard_init_linux.go.orig
+++ /dev/null
@@ -1,223 +0,0 @@
-// +build linux
-
-package libcontainer
-
-import (
- "fmt"
- "os"
- "os/exec"
- "strings"
- "syscall"
- "time"
-
- "github.com/opencontainers/runc/libcontainer/apparmor"
- "github.com/opencontainers/runc/libcontainer/configs"
- "github.com/opencontainers/runc/libcontainer/keys"
- "github.com/opencontainers/runc/libcontainer/seccomp"
- "github.com/opencontainers/runc/libcontainer/system"
- "github.com/opencontainers/selinux/go-selinux/label"
-)
-
-type linuxStandardInit struct {
- pipe *os.File
- consoleSocket *os.File
- parentPid int
- stateDirFD int
- config *initConfig
- logFd int
-}
-
-func (l *linuxStandardInit) getSessionRingParams() (string, uint32, uint32) {
- var newperms uint32
-
- if l.config.Config.Namespaces.Contains(configs.NEWUSER) {
- // with user ns we need 'other' search permissions
- newperms = 0x8
- } else {
- // without user ns we need 'UID' search permissions
- newperms = 0x80000
- }
-
- // create a unique per session container name that we can
- // join in setns; however, other containers can also join it
- return fmt.Sprintf("_ses.%s", l.config.ContainerId), 0xffffffff, newperms
-}
-
-// PR_SET_NO_NEW_PRIVS isn't exposed in Golang so we define it ourselves copying the value
-// the kernel
-const PR_SET_NO_NEW_PRIVS = 0x26
-
-func (l *linuxStandardInit) Init() error {
- if !l.config.Config.NoNewKeyring {
- ringname, keepperms, newperms := l.getSessionRingParams()
-
- // do not inherit the parent's session keyring
- sessKeyId, err := keys.JoinSessionKeyring(ringname)
- if err != nil {
- return err
- }
- // make session keyring searcheable
- if err := keys.ModKeyringPerm(sessKeyId, keepperms, newperms); err != nil {
- return err
- }
- }
-
- if err := setupNetwork(l.config); err != nil {
- return err
- }
- if err := setupRoute(l.config.Config); err != nil {
- return err
- }
-
- label.Init()
-
- // prepareRootfs() can be executed only for a new mount namespace.
- if l.config.Config.Namespaces.Contains(configs.NEWNS) {
- if err := prepareRootfs(l.pipe, l.config.Config); err != nil {
- return err
- }
- }
-
- // Set up the console. This has to be done *before* we finalize the rootfs,
- // but *after* we've given the user the chance to set up all of the mounts
- // they wanted.
- if l.config.CreateConsole {
- if err := setupConsole(l.consoleSocket, l.config, true); err != nil {
- return err
- }
- if err := system.Setctty(); err != nil {
- return err
- }
- }
-
- // Finish the rootfs setup.
- if l.config.Config.Namespaces.Contains(configs.NEWNS) {
- if err := finalizeRootfs(l.config.Config); err != nil {
- return err
- }
- }
-
- if hostname := l.config.Config.Hostname; hostname != "" {
- if err := syscall.Sethostname([]byte(hostname)); err != nil {
- return err
- }
- }
- if err := apparmor.ApplyProfile(l.config.AppArmorProfile); err != nil {
- return err
- }
- if err := label.SetProcessLabel(l.config.ProcessLabel); err != nil {
- return err
- }
- // when userns enabled, write to sysctl will fail, let docker-hooks do this job
- if len(l.config.Config.UidMappings) == 0 && len(l.config.Config.GidMappings) == 0 {
- for key, value := range l.config.Config.Sysctl {
- if err := writeSystemProperty(key, value); err != nil {
- return err
- }
- }
- }
- for _, path := range l.config.Config.ReadonlyPaths {
- if err := readonlyPath(path); err != nil {
- return err
- }
- }
- for _, m := range l.config.Config.Mounts {
- if m.Flags&syscall.MS_RDONLY == 0 && m.Device == "proc" && strings.HasPrefix(m.Destination, "/proc/sys/") {
- if err := remountReadWrite(m.Destination); err != nil {
- return err
- }
- }
- }
- for _, path := range l.config.Config.MaskPaths {
- if err := maskPath(path); err != nil {
- return err
- }
- }
- pdeath, err := system.GetParentDeathSignal()
- if err != nil {
- return err
- }
- if l.config.NoNewPrivileges {
- if err := system.Prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); err != nil {
- return err
- }
- }
- // Tell our parent that we're ready to Execv. This must be done before the
- // Seccomp rules have been applied, because we need to be able to read and
- // write to a socket.
- if err := syncParentReady(l.pipe); err != nil {
- return err
- }
- // Without NoNewPrivileges seccomp is a privileged operation, so we need to
- // do this before dropping capabilities; otherwise do it as late as possible
- // just before execve so as few syscalls take place after it as possible.
- if l.config.Config.Seccomp != nil && !l.config.NoNewPrivileges {
- if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil {
- return err
- }
- }
- if err := finalizeNamespace(l.config); err != nil {
- return err
- }
- // finalizeNamespace can change user/group which clears the parent death
- // signal, so we restore it here.
- if err := pdeath.Restore(); err != nil {
- return err
- }
- // compare the parent from the initial start of the init process and make sure that it did not change.
- // if the parent changes that means it died and we were reparented to something else so we should
- // just kill ourself and not cause problems for someone else.
- if syscall.Getppid() != l.parentPid {
- return syscall.Kill(syscall.Getpid(), syscall.SIGKILL)
- }
- // check for the arg before waiting to make sure it exists and it is returned
- // as a create time error.
- name, err := exec.LookPath(l.config.Args[0])
- if err != nil {
- return err
- }
- // close the pipe to signal that we have completed our init.
- l.pipe.Close()
-
- if l.logFd != 0 {
- syscall.Close(l.logFd)
- }
-
- // wait for the fifo to be opened on the other side before
- // exec'ing the users process.
- ch := make(chan Error, 1)
- go func() {
- fd, err := syscall.Openat(l.stateDirFD, execFifoFilename, os.O_WRONLY|syscall.O_CLOEXEC, 0)
- if err != nil {
- ch <- newSystemErrorWithCause(err, "openat exec fifo")
- return
- }
- if _, err := syscall.Write(fd, []byte("0")); err != nil {
- ch <- newSystemErrorWithCause(err, "write 0 exec fifo")
- return
- }
- ch <- nil
- }()
-
- select {
- case chErr := <-ch:
- if chErr != nil {
- return chErr
- }
- case <-time.After(120 * time.Second):
- return newSystemErrorWithCause(fmt.Errorf("timeout"), "wait for the fifo to be opened on the other side ")
- }
-
- if l.config.Config.Seccomp != nil && l.config.NoNewPrivileges {
- if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil {
- return newSystemErrorWithCause(err, "init seccomp")
- }
- }
- // close the statedir fd before exec because the kernel resets dumpable in the wrong order
- // https://github.com/torvalds/linux/blob/v4.9/fs/exec.c#L1290-L1318
- syscall.Close(l.stateDirFD)
- if err := syscall.Exec(name, l.config.Args[0:], os.Environ()); err != nil {
- return newSystemErrorWithCause(err, "exec user process")
- }
- return nil
-}
diff --git a/libcontainer/utils/utils.go b/libcontainer/utils/utils.go
index cd04ace..922cffb 100644
--- a/libcontainer/utils/utils.go
+++ b/libcontainer/utils/utils.go
@@ -5,17 +5,12 @@ import (
"encoding/binary"
"encoding/hex"
"encoding/json"
- "fmt"
"io"
"os"
"path/filepath"
- "strconv"
"strings"
"syscall"
"unsafe"
-
- securejoin "github.com/cyphar/filepath-securejoin"
- "golang.org/x/sys/unix"
)
const (
@@ -175,36 +170,3 @@ func stripRoot(root, path string) string {
}
return CleanPath("/" + path)
}
-
-// WithProcfd runs the passed closure with a procfd path (/proc/self/fd/...)
-// corresponding to the unsafePath resolved within the root. Before passing the
-// fd, this path is verified to have been inside the root -- so operating on it
-// through the passed fdpath should be safe. Do not access this path through
-// the original path strings, and do not attempt to use the pathname outside of
-// the passed closure (the file handle will be freed once the closure returns).
-func WithProcfd(root, unsafePath string, fn func(procfd string) error) error {
- // Remove the root then forcefully resolve inside the root.
- unsafePath = stripRoot(root, unsafePath)
- path, err := securejoin.SecureJoin(root, unsafePath)
- if err != nil {
- return fmt.Errorf("resolving path inside rootfs failed: %v", err)
- }
-
- // Open the target path.
- fh, err := os.OpenFile(path, unix.O_PATH|unix.O_CLOEXEC, 0)
- if err != nil {
- return fmt.Errorf("open o_path procfd: %w", err)
- }
- defer fh.Close()
-
- // Double-check the path is the one we expected.
- procfd := "/proc/self/fd/" + strconv.Itoa(int(fh.Fd()))
- if realpath, err := os.Readlink(procfd); err != nil {
- return fmt.Errorf("procfd verification failed: %w", err)
- } else if realpath != path {
- return fmt.Errorf("possibly malicious path detected -- refusing to operate on %s", realpath)
- }
-
- // Run the closure.
- return fn(procfd)
-}
diff --git a/libcontainer/utils/utils_unix.go b/libcontainer/utils/utils_unix.go
index 7b798cc..cfacfc2 100644
--- a/libcontainer/utils/utils_unix.go
+++ b/libcontainer/utils/utils_unix.go
@@ -1,43 +1,263 @@
+//go:build !windows
// +build !windows
package utils
import (
- "io/ioutil"
+ "fmt"
+ "math"
"os"
+ "path/filepath"
+ "runtime"
"strconv"
- "syscall"
+ "sync"
+ _ "unsafe" // for go:linkname
+
+ securejoin "github.com/cyphar/filepath-securejoin"
+ "github.com/Sirupsen/logrus"
+ "golang.org/x/sys/unix"
)
-func CloseExecFrom(minFd int) error {
- fdList, err := ioutil.ReadDir("/proc/self/fd")
+// EnsureProcHandle returns nil if fh is on procfs and a non-nil error otherwise.
+func EnsureProcHandle(fh *os.File) error {
+ var buf unix.Statfs_t
+ if err := unix.Fstatfs(int(fh.Fd()), &buf); err != nil {
+ return fmt.Errorf("ensure %s is on procfs: %w", fh.Name(), err)
+ }
+ if buf.Type != unix.PROC_SUPER_MAGIC {
+ return fmt.Errorf("%s is not on procfs", fh.Name())
+ }
+ return nil
+}
+
+var (
+ haveCloseRangeCloexecBool bool
+ haveCloseRangeCloexecOnce sync.Once
+)
+
+func haveCloseRangeCloexec() bool {
+ haveCloseRangeCloexecOnce.Do(func() {
+ // Probe a throwaway duplicate of fd 0 so no in-use descriptor is affected.
+ tmpFd, err := unix.FcntlInt(0, unix.F_DUPFD_CLOEXEC, 0)
+ if err != nil {
+ return
+ }
+ defer unix.Close(tmpFd)
+
+ err = unix.CloseRange(uint(tmpFd), uint(tmpFd), unix.CLOSE_RANGE_CLOEXEC)
+ // Any error means we cannot use close_range(CLOSE_RANGE_CLOEXEC).
+ // -ENOSYS and -EINVAL ultimately mean we don't have support, but any
+ // other potential error would imply that even the most basic close
+ // operation wouldn't work.
+ haveCloseRangeCloexecBool = err == nil
+ })
+ return haveCloseRangeCloexecBool
+}
+
+type fdFunc func(fd int)
+
+// fdRangeFrom calls the passed fdFunc for each open file descriptor >= minFd
+// in the current process, skipping the descriptor used to list them.
+func fdRangeFrom(minFd int, fn fdFunc) error {
+ procSelfFd, closer := ProcThreadSelf("fd")
+ defer closer()
+
+ fdDir, err := os.Open(procSelfFd)
+ if err != nil {
+ return err
+ }
+ defer fdDir.Close()
+
+ if err := EnsureProcHandle(fdDir); err != nil {
+ return err
+ }
+
+ fdList, err := fdDir.Readdirnames(-1)
if err != nil {
return err
}
- for _, fi := range fdList {
- fd, err := strconv.Atoi(fi.Name())
+ for _, fdStr := range fdList {
+ fd, err := strconv.Atoi(fdStr)
+ // Ignore non-numeric file names.
if err != nil {
- // ignore non-numeric file names
continue
}
-
+ // Ignore descriptors lower than our specified minimum.
if fd < minFd {
- // ignore descriptors lower than our specified minimum
continue
}
-
- // intentionally ignore errors from syscall.CloseOnExec
- syscall.CloseOnExec(fd)
- // the cases where this might fail are basically file descriptors that have already been closed (including and especially the one that was created when ioutil.ReadDir did the "opendir" syscall)
+ // Ignore the file descriptor we used for readdir, as it will be closed
+ // when we return.
+ if uintptr(fd) == fdDir.Fd() {
+ continue
+ }
+ // Run the closure.
+ fn(fd)
}
return nil
}
-// NewSockPair returns a new unix socket pair
-func NewSockPair(name string) (parent *os.File, child *os.File, err error) {
- fds, err := syscall.Socketpair(syscall.AF_LOCAL, syscall.SOCK_STREAM|syscall.SOCK_CLOEXEC, 0)
+// CloseExecFrom marks every file descriptor greater than or equal to minFd in
+// the current process as close-on-exec.
+func CloseExecFrom(minFd int) error {
+ // Use close_range(CLOSE_RANGE_CLOEXEC) if possible.
+ if haveCloseRangeCloexec() {
+ err := unix.CloseRange(uint(minFd), math.MaxUint64, unix.CLOSE_RANGE_CLOEXEC)
+ return os.NewSyscallError("close_range", err)
+ }
+ // Otherwise, fall back to the standard loop.
+ return fdRangeFrom(minFd, unix.CloseOnExec)
+}
+
+//go:linkname runtime_IsPollDescriptor internal/poll.IsPollDescriptor
+
+// In order to make sure we do not close the internal epoll descriptors the Go
+// runtime uses, we need to ensure that we skip descriptors that match
+// "internal/poll".IsPollDescriptor. Yes, this is a Go runtime internal thing;
+// unfortunately there's no other way to be sure we're only keeping the file
+// descriptors the Go runtime needs. Hopefully nothing blows up doing this...
+func runtime_IsPollDescriptor(fd uintptr) bool //nolint:revive
+
+// UnsafeCloseFrom closes all file descriptors greater than or equal to minFd in
+// the current process, except for those critical to Go's runtime (such as the
+// netpoll management descriptors).
+//
+// NOTE: This function is incredibly dangerous to use in most Go code, as
+// closing file descriptors from underneath *os.File handles can lead to very
+// bad behaviour (the closed file descriptor can be re-used and then any
+// *os.File operations would apply to the wrong file). This function is only
+// intended to be called from the last stage of runc init.
+func UnsafeCloseFrom(minFd int) error {
+ // We cannot use close_range(2) even if it is available, because we must
+ // not close some file descriptors.
+ return fdRangeFrom(minFd, func(fd int) {
+ if runtime_IsPollDescriptor(uintptr(fd)) {
+ // These are the Go runtime's internal netpoll file descriptors.
+ // These file descriptors are operated on deep in the Go scheduler,
+ // and closing those files from underneath Go can result in panics.
+ // There is no issue with keeping them because they are not
+ // executable and are not useful to an attacker anyway. Also we
+ // don't have any choice.
+ return
+ }
+ // There's nothing we can do about errors from close(2), and the
+ // only likely error to be seen is EBADF which indicates the fd was
+ // already closed (in which case, we got what we wanted).
+ _ = unix.Close(fd)
+ })
+}
+
+// NewSockPair returns a new SOCK_STREAM unix socket pair.
+func NewSockPair(name string) (parent, child *os.File, err error) {
+ fds, err := unix.Socketpair(unix.AF_LOCAL, unix.SOCK_STREAM|unix.SOCK_CLOEXEC, 0)
if err != nil {
return nil, nil, err
}
return os.NewFile(uintptr(fds[1]), name+"-p"), os.NewFile(uintptr(fds[0]), name+"-c"), nil
}
+
+// WithProcfd runs the passed closure with a procfd path (built via
+// ProcThreadSelf, e.g. /proc/thread-self/fd/...) for unsafePath resolved within
+// the root. The path is verified to be inside the root before it is passed, so
+// operating on it through the passed procfd should be safe. Do not access it
+// through the original path strings, and do not use the pathname outside of
+// the passed closure (the file handle is closed once the closure returns).
+func WithProcfd(root, unsafePath string, fn func(procfd string) error) error {
+ // Remove the root then forcefully resolve inside the root.
+ unsafePath = stripRoot(root, unsafePath)
+ path, err := securejoin.SecureJoin(root, unsafePath)
+ if err != nil {
+ return fmt.Errorf("resolving path inside rootfs failed: %w", err)
+ }
+
+ procSelfFd, closer := ProcThreadSelf("fd/")
+ defer closer()
+
+ // Open the target path.
+ fh, err := os.OpenFile(path, unix.O_PATH|unix.O_CLOEXEC, 0)
+ if err != nil {
+ return fmt.Errorf("open o_path procfd: %w", err)
+ }
+ defer fh.Close()
+
+ procfd := filepath.Join(procSelfFd, strconv.Itoa(int(fh.Fd())))
+ // Double-check the path is the one we expected.
+ if realpath, err := os.Readlink(procfd); err != nil {
+ return fmt.Errorf("procfd verification failed: %w", err)
+ } else if realpath != path {
+ return fmt.Errorf("possibly malicious path detected -- refusing to operate on %s", realpath)
+ }
+
+ return fn(procfd)
+}
+
+type ProcThreadSelfCloser func()
+
+var (
+ haveProcThreadSelf bool
+ haveProcThreadSelfOnce sync.Once
+)
+
+// ProcThreadSelf returns a string that is equivalent to
+// /proc/thread-self/<subpath>, with a graceful fallback on older kernels where
+// /proc/thread-self doesn't exist. This function DOES NOT use SecureJoin,
+// meaning that the passed string needs to be trusted. The caller _must_ call
+// the returned ProcThreadSelfCloser function (which is runtime.UnlockOSThread)
+// *only once* after it has finished using the returned path string.
+func ProcThreadSelf(subpath string) (string, ProcThreadSelfCloser) {
+ haveProcThreadSelfOnce.Do(func() {
+ if _, err := os.Stat("/proc/thread-self/"); err == nil {
+ haveProcThreadSelf = true
+ } else {
+ logrus.Debugf("cannot stat /proc/thread-self (%v), falling back to /proc/self/task/<tid>", err)
+ }
+ })
+
+ // We need to lock our thread until the caller is done with the path string
+ // because any non-atomic operation on the path (such as opening a file,
+ // then reading it) could be interrupted by the Go runtime where the
+ // underlying thread is swapped out and the original thread is killed,
+ // resulting in pull-your-hair-out-hard-to-debug issues in the caller. In
+ // addition, the pre-3.17 fallback makes everything non-atomic because the
+ // same thing could happen between unix.Gettid() and the path operations.
+ //
+ // In theory, we don't need to lock in the atomic user case when using
+ // /proc/thread-self/, but it's better to be safe than sorry (and there are
+ // only one or two truly atomic users of /proc/thread-self/).
+ runtime.LockOSThread()
+
+ threadSelf := "/proc/thread-self/"
+ if !haveProcThreadSelf {
+ // Pre-3.17 kernels did not have /proc/thread-self, so do it manually.
+ threadSelf = "/proc/self/task/" + strconv.Itoa(unix.Gettid()) + "/"
+ if _, err := os.Stat(threadSelf); err != nil {
+ // Unfortunately, this code is called from rootfs_linux.go where we
+ // are running inside the pid namespace of the container but /proc
+ // is the host's procfs. Unfortunately there is no real way to get
+ // the correct tid to use here (the kernel age means we cannot do
+ // things like set up a private fsopen("proc") -- even scanning
+ // NSpid in all of the tasks in /proc/self/task/*/status requires
+ // Linux 4.1).
+ //
+ // So, we just have to assume that /proc/self is acceptable in this
+ // one specific case.
+ if os.Getpid() == 1 {
+ logrus.Debugf("/proc/thread-self (tid=%d) cannot be emulated inside the initial container setup -- using /proc/self instead: %v", unix.Gettid(), err)
+ } else {
+ // This should never happen, but the fallback should work in most cases...
+ logrus.Warnf("/proc/thread-self could not be emulated for pid=%d (tid=%d) -- using more buggy /proc/self fallback instead: %v", os.Getpid(), unix.Gettid(), err)
+ }
+ threadSelf = "/proc/self/"
+ }
+ }
+ return threadSelf + subpath, runtime.UnlockOSThread
+}
+
+// ProcThreadSelfFd is a small wrapper around ProcThreadSelf to make it easier
+// to create a /proc/thread-self handle for the given file descriptor.
+//
+// It is basically equivalent to ProcThreadSelf(fmt.Sprintf("fd/%d", fd)), but
+// without using fmt.Sprintf to avoid unneeded overhead.
+func ProcThreadSelfFd(fd uintptr) (string, ProcThreadSelfCloser) {
+ return ProcThreadSelf("fd/" + strconv.FormatUint(uint64(fd), 10))
+}
diff --git a/vendor/golang.org/x/sys/unix/flock.go b/vendor/golang.org/x/sys/unix/flock.go
index ce67a59..e8d1081 100644
--- a/vendor/golang.org/x/sys/unix/flock.go
+++ b/vendor/golang.org/x/sys/unix/flock.go
@@ -14,6 +14,11 @@ import "unsafe"
// systems by flock_linux_32bit.go to be SYS_FCNTL64.
var fcntl64Syscall uintptr = SYS_FCNTL
+// FcntlInt performs a fcntl syscall on fd with the provided command and argument.
+func FcntlInt(fd uintptr, cmd, arg int) (int, error) {
+ return fcntl(int(fd), cmd, arg)
+}
+
// FcntlFlock performs a fcntl syscall for the F_GETLK, F_SETLK or F_SETLKW command.
func FcntlFlock(fd uintptr, cmd int, lk *Flock_t) error {
_, _, errno := Syscall(fcntl64Syscall, fd, uintptr(cmd), uintptr(unsafe.Pointer(lk)))
diff --git a/vendor/golang.org/x/sys/unix/zerrors_linux_amd64.go b/vendor/golang.org/x/sys/unix/zerrors_linux_amd64.go
index f21dcd9..e1bde81 100644
--- a/vendor/golang.org/x/sys/unix/zerrors_linux_amd64.go
+++ b/vendor/golang.org/x/sys/unix/zerrors_linux_amd64.go
@@ -934,6 +934,7 @@ const (
PRIO_PGRP = 0x1
PRIO_PROCESS = 0x0
PRIO_USER = 0x2
+ PROC_SUPER_MAGIC = 0x9fa0
PROT_EXEC = 0x4
PROT_GROWSDOWN = 0x1000000
PROT_GROWSUP = 0x2000000
diff --git a/vendor/golang.org/x/sys/unix/zerrors_linux_arm64.go b/vendor/golang.org/x/sys/unix/zerrors_linux_arm64.go
index 16a18f5..388d1fc 100644
--- a/vendor/golang.org/x/sys/unix/zerrors_linux_arm64.go
+++ b/vendor/golang.org/x/sys/unix/zerrors_linux_arm64.go
@@ -966,6 +966,7 @@ const (
PRIO_PGRP = 0x1
PRIO_PROCESS = 0x0
PRIO_USER = 0x2
+ PROC_SUPER_MAGIC = 0x9fa0
PROT_EXEC = 0x4
PROT_GROWSDOWN = 0x1000000
PROT_GROWSUP = 0x2000000
diff --git a/vendor/golang.org/x/sys/unix/zsyscall_linux_amd64.go b/vendor/golang.org/x/sys/unix/zsyscall_linux_amd64.go
index 8b2e87d..fe21f83 100644
--- a/vendor/golang.org/x/sys/unix/zsyscall_linux_amd64.go
+++ b/vendor/golang.org/x/sys/unix/zsyscall_linux_amd64.go
@@ -312,6 +312,16 @@ func Close(fd int) (err error) {
// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
+func CloseRange(first uint, last uint, flags uint) (err error) {
+ _, _, e1 := Syscall(SYS_CLOSE_RANGE, uintptr(first), uintptr(last), uintptr(flags))
+ if e1 != 0 {
+ err = errnoErr(e1)
+ }
+ return
+}
+
+// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
+
func Dup(oldfd int) (fd int, err error) {
r0, _, e1 := Syscall(SYS_DUP, uintptr(oldfd), 0, 0)
fd = int(r0)
diff --git a/vendor/golang.org/x/sys/unix/zsyscall_linux_arm64.go b/vendor/golang.org/x/sys/unix/zsyscall_linux_arm64.go
index f6cc320..395e2de 100644
--- a/vendor/golang.org/x/sys/unix/zsyscall_linux_arm64.go
+++ b/vendor/golang.org/x/sys/unix/zsyscall_linux_arm64.go
@@ -312,6 +312,16 @@ func Close(fd int) (err error) {
// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
+func CloseRange(first uint, last uint, flags uint) (err error) {
+ _, _, e1 := Syscall(SYS_CLOSE_RANGE, uintptr(first), uintptr(last), uintptr(flags))
+ if e1 != 0 {
+ err = errnoErr(e1)
+ }
+ return
+}
+
+// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
+
func Dup(oldfd int) (fd int, err error) {
r0, _, e1 := Syscall(SYS_DUP, uintptr(oldfd), 0, 0)
fd = int(r0)
diff --git a/vendor/golang.org/x/sys/unix/zsysnum_linux_amd64.go b/vendor/golang.org/x/sys/unix/zsysnum_linux_amd64.go
index 9042317..f7c427c 100644
--- a/vendor/golang.org/x/sys/unix/zsysnum_linux_amd64.go
+++ b/vendor/golang.org/x/sys/unix/zsysnum_linux_amd64.go
@@ -338,4 +338,5 @@ const (
SYS_PKEY_MPROTECT = 329
SYS_PKEY_ALLOC = 330
SYS_PKEY_FREE = 331
+ SYS_CLOSE_RANGE = 436
)
diff --git a/vendor/golang.org/x/sys/unix/zsysnum_linux_arm64.go b/vendor/golang.org/x/sys/unix/zsysnum_linux_arm64.go
index 90e43d0..530563a 100644
--- a/vendor/golang.org/x/sys/unix/zsysnum_linux_arm64.go
+++ b/vendor/golang.org/x/sys/unix/zsysnum_linux_arm64.go
@@ -282,4 +282,5 @@ const (
SYS_PKEY_MPROTECT = 288
SYS_PKEY_ALLOC = 289
SYS_PKEY_FREE = 290
+ SYS_CLOSE_RANGE = 436
)
diff --git a/vendor/golang.org/x/sys/unix/ztypes_linux_amd64.go b/vendor/golang.org/x/sys/unix/ztypes_linux_amd64.go
index c9e1e64..2f12811 100644
--- a/vendor/golang.org/x/sys/unix/ztypes_linux_amd64.go
+++ b/vendor/golang.org/x/sys/unix/ztypes_linux_amd64.go
@@ -345,6 +345,11 @@ type TCPInfo struct {
Total_retrans uint32
}
+const (
+ CLOSE_RANGE_UNSHARE = 0x2
+ CLOSE_RANGE_CLOEXEC = 0x4
+)
+
const (
SizeofSockaddrInet4 = 0x10
SizeofSockaddrInet6 = 0x1c
diff --git a/vendor/golang.org/x/sys/unix/ztypes_linux_arm64.go b/vendor/golang.org/x/sys/unix/ztypes_linux_arm64.go
index e58c500..b77eceb 100644
--- a/vendor/golang.org/x/sys/unix/ztypes_linux_arm64.go
+++ b/vendor/golang.org/x/sys/unix/ztypes_linux_arm64.go
@@ -30,6 +30,11 @@ type Timeval struct {
Usec int64
}
+const (
+ CLOSE_RANGE_UNSHARE = 0x2
+ CLOSE_RANGE_CLOEXEC = 0x4
+)
+
type Timex struct {
Modes uint32
Pad_cgo_0 [4]byte
--
2.33.0