2766 lines
86 KiB
Diff
2766 lines
86 KiB
Diff
From e81938064402940ca8176d6f3145f65b1d455996 Mon Sep 17 00:00:00 2001
|
|
From: zhongjiawei <zhongjiawei1@huawei.com>
|
|
Date: Thu, 1 Feb 2024 18:25:16 +0800
|
|
Subject: [PATCH] runc:fix CVE-2024-21626
|
|
|
|
---
|
|
libcontainer/container_linux.go | 50 +-
|
|
libcontainer/container_linux.go.orig | 1660 -----------------
|
|
libcontainer/factory_linux.go | 15 +-
|
|
libcontainer/init_linux.go | 39 +-
|
|
libcontainer/process_linux.go | 3 +-
|
|
libcontainer/setns_init_linux.go | 19 +
|
|
libcontainer/standard_init_linux.go | 28 +-
|
|
libcontainer/standard_init_linux.go.orig | 223 ---
|
|
libcontainer/utils/utils.go | 38 -
|
|
libcontainer/utils/utils_unix.go | 253 ++-
|
|
vendor/golang.org/x/sys/unix/flock.go | 5 +
|
|
.../x/sys/unix/zerrors_linux_amd64.go | 1 +
|
|
.../x/sys/unix/zerrors_linux_arm64.go | 1 +
|
|
.../x/sys/unix/zsyscall_linux_amd64.go | 10 +
|
|
.../x/sys/unix/zsyscall_linux_arm64.go | 10 +
|
|
.../x/sys/unix/zsysnum_linux_amd64.go | 1 +
|
|
.../x/sys/unix/zsysnum_linux_arm64.go | 1 +
|
|
.../x/sys/unix/ztypes_linux_amd64.go | 5 +
|
|
.../x/sys/unix/ztypes_linux_arm64.go | 5 +
|
|
19 files changed, 403 insertions(+), 1964 deletions(-)
|
|
delete mode 100644 libcontainer/container_linux.go.orig
|
|
delete mode 100644 libcontainer/standard_init_linux.go.orig
|
|
|
|
diff --git a/libcontainer/container_linux.go b/libcontainer/container_linux.go
|
|
index a4859ca..c757d71 100644
|
|
--- a/libcontainer/container_linux.go
|
|
+++ b/libcontainer/container_linux.go
|
|
@@ -1,3 +1,4 @@
|
|
+//go:build linux
|
|
// +build linux
|
|
|
|
package libcontainer
|
|
@@ -28,6 +29,7 @@ import (
|
|
"github.com/opencontainers/runc/libcontainer/utils"
|
|
"github.com/syndtr/gocapability/capability"
|
|
"github.com/vishvananda/netlink/nl"
|
|
+ "golang.org/x/sys/unix"
|
|
)
|
|
|
|
const stdioFdCount = 3
|
|
@@ -321,6 +323,15 @@ func (c *linuxContainer) start(process *Process) error {
|
|
}()
|
|
}
|
|
|
|
+ // Before starting "runc init", mark all non-stdio open files as O_CLOEXEC
|
|
+ // to make sure we don't leak any files into "runc init". Any files to be
|
|
+ // passed to "runc init" through ExtraFiles will get dup2'd by the Go
|
|
+ // runtime and thus their O_CLOEXEC flag will be cleared. This is some
|
|
+ // additional protection against attacks like CVE-2024-21626, by making
|
|
+ // sure we never leak files to "runc init" we didn't intend to.
|
|
+ if err := utils.CloseExecFrom(3); err != nil {
|
|
+ return fmt.Errorf("unable to mark non-stdio fds as cloexec: %w", err)
|
|
+ }
|
|
if err := parent.start(); err != nil {
|
|
// terminate the process to ensure that it properly is reaped.
|
|
if err := parent.terminate(); err != nil {
|
|
@@ -414,6 +425,23 @@ func (c *linuxContainer) deleteExecFifo() {
|
|
os.Remove(fifoName)
|
|
}
|
|
|
|
+// includeExecFifo opens the container's execfifo as a pathfd, so that the
|
|
+// container cannot access the statedir (and the FIFO itself remains
|
|
+// un-opened). It then adds the FifoFd to the given exec.Cmd as an inherited
|
|
+// fd, with _LIBCONTAINER_FIFOFD set to its fd number.
|
|
+func (c *linuxContainer) includeExecFifo(cmd *exec.Cmd) error {
|
|
+ fifoName := filepath.Join(c.root, execFifoFilename)
|
|
+ fifoFd, err := unix.Open(fifoName, unix.O_PATH|unix.O_CLOEXEC, 0)
|
|
+ if err != nil {
|
|
+ return err
|
|
+ }
|
|
+
|
|
+ cmd.ExtraFiles = append(cmd.ExtraFiles, os.NewFile(uintptr(fifoFd), fifoName))
|
|
+ cmd.Env = append(cmd.Env,
|
|
+ fmt.Sprintf("_LIBCONTAINER_FIFOFD=%d", stdioFdCount+len(cmd.ExtraFiles)-1))
|
|
+ return nil
|
|
+}
|
|
+
|
|
func (c *linuxContainer) newParentProcess(p *Process) (parentProcess, error) {
|
|
parentPipe, childPipe, err := utils.NewSockPair("init")
|
|
if err != nil {
|
|
@@ -430,18 +458,15 @@ func (c *linuxContainer) newParentProcess(p *Process) (parentProcess, error) {
|
|
return c.newSetnsProcess(p, cmd, parentPipe, childPipe)
|
|
}
|
|
|
|
- // We only set up rootDir if we're not doing a `runc exec`. The reason for
|
|
- // this is to avoid cases where a racing, unprivileged process inside the
|
|
- // container can get access to the statedir file descriptor (which would
|
|
- // allow for container rootfs escape).
|
|
- rootDir, err := os.Open(c.root)
|
|
- if err != nil {
|
|
- return nil, err
|
|
+ // We only set up fifoFd if we're not doing a `runc exec`. The historic
|
|
+ // reason for this is that previously we would pass a dirfd that allowed
|
|
+ // for container rootfs escape (and not doing it in `runc exec` avoided
|
|
+ // that problem), but we no longer do that. However, there's no need to do
|
|
+ // this for `runc exec` so we just keep it this way to be safe.
|
|
+ if err := c.includeExecFifo(cmd); err != nil {
|
|
+ return nil, fmt.Errorf("unable to setup exec fifo: %w", err)
|
|
}
|
|
- cmd.ExtraFiles = append(cmd.ExtraFiles, rootDir)
|
|
- cmd.Env = append(cmd.Env,
|
|
- fmt.Sprintf("_LIBCONTAINER_STATEDIR=%d", stdioFdCount+len(cmd.ExtraFiles)-1))
|
|
- return c.newInitProcess(p, cmd, parentPipe, childPipe, rootDir)
|
|
+ return c.newInitProcess(p, cmd, parentPipe, childPipe)
|
|
}
|
|
|
|
func (c *linuxContainer) commandTemplate(p *Process, childPipe *os.File) (*exec.Cmd, error) {
|
|
@@ -479,7 +504,7 @@ func (c *linuxContainer) commandTemplate(p *Process, childPipe *os.File) (*exec.
|
|
return cmd, nil
|
|
}
|
|
|
|
-func (c *linuxContainer) newInitProcess(p *Process, cmd *exec.Cmd, parentPipe, childPipe, rootDir *os.File) (*initProcess, error) {
|
|
+func (c *linuxContainer) newInitProcess(p *Process, cmd *exec.Cmd, parentPipe, childPipe *os.File) (*initProcess, error) {
|
|
cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initStandard))
|
|
nsMaps := make(map[configs.NamespaceType]string)
|
|
for _, ns := range c.config.Namespaces {
|
|
@@ -501,7 +526,6 @@ func (c *linuxContainer) newInitProcess(p *Process, cmd *exec.Cmd, parentPipe, c
|
|
process: p,
|
|
bootstrapData: data,
|
|
sharePidns: !c.config.Namespaces.IsPrivate(configs.NEWPID),
|
|
- rootDir: rootDir,
|
|
}, nil
|
|
}
|
|
|
|
diff --git a/libcontainer/container_linux.go.orig b/libcontainer/container_linux.go.orig
|
|
deleted file mode 100644
|
|
index d678407..0000000
|
|
--- a/libcontainer/container_linux.go.orig
|
|
+++ /dev/null
|
|
@@ -1,1660 +0,0 @@
|
|
-// +build linux
|
|
-
|
|
-package libcontainer
|
|
-
|
|
-import (
|
|
- "bytes"
|
|
- "encoding/json"
|
|
- "errors"
|
|
- "fmt"
|
|
- "io"
|
|
- "io/ioutil"
|
|
- "os"
|
|
- "os/exec"
|
|
- "path/filepath"
|
|
- "reflect"
|
|
- "strings"
|
|
- "sync"
|
|
- "syscall"
|
|
- "time"
|
|
-
|
|
- "github.com/Sirupsen/logrus"
|
|
- "github.com/golang/protobuf/proto"
|
|
- "github.com/opencontainers/runc/libcontainer/cgroups"
|
|
- "github.com/opencontainers/runc/libcontainer/configs"
|
|
- "github.com/opencontainers/runc/libcontainer/criurpc"
|
|
- "github.com/opencontainers/runc/libcontainer/logs"
|
|
- "github.com/opencontainers/runc/libcontainer/system"
|
|
- "github.com/opencontainers/runc/libcontainer/utils"
|
|
- "github.com/syndtr/gocapability/capability"
|
|
- "github.com/vishvananda/netlink/nl"
|
|
-)
|
|
-
|
|
-const stdioFdCount = 3
|
|
-
|
|
-type linuxContainer struct {
|
|
- id string
|
|
- root string
|
|
- config *configs.Config
|
|
- cgroupManager cgroups.Manager
|
|
- initArgs []string
|
|
- initProcess parentProcess
|
|
- initProcessStartTime string
|
|
- criuPath string
|
|
- m sync.Mutex
|
|
- criuVersion int
|
|
- state containerState
|
|
- created time.Time
|
|
-}
|
|
-
|
|
-// State represents a running container's state
|
|
-type State struct {
|
|
- BaseState
|
|
-
|
|
- // Platform specific fields below here
|
|
-
|
|
- // Specifies if the container was started under the rootless mode.
|
|
- Rootless bool `json:"rootless"`
|
|
-
|
|
- // Path to all the cgroups setup for a container. Key is cgroup subsystem name
|
|
- // with the value as the path.
|
|
- CgroupPaths map[string]string `json:"cgroup_paths"`
|
|
-
|
|
- // NamespacePaths are filepaths to the container's namespaces. Key is the namespace type
|
|
- // with the value as the path.
|
|
- NamespacePaths map[configs.NamespaceType]string `json:"namespace_paths"`
|
|
-
|
|
- // Container's standard descriptors (std{in,out,err}), needed for checkpoint and restore
|
|
- ExternalDescriptors []string `json:"external_descriptors,omitempty"`
|
|
-}
|
|
-
|
|
-// CompatState
|
|
-type CompatState struct {
|
|
- State
|
|
- Config configs.CompatConfig `json:"config"`
|
|
-}
|
|
-
|
|
-// Container is a libcontainer container object.
|
|
-//
|
|
-// Each container is thread-safe within the same process. Since a container can
|
|
-// be destroyed by a separate process, any function may return that the container
|
|
-// was not found.
|
|
-type Container interface {
|
|
- BaseContainer
|
|
-
|
|
- // Methods below here are platform specific
|
|
-
|
|
- // Checkpoint checkpoints the running container's state to disk using the criu(8) utility.
|
|
- //
|
|
- // errors:
|
|
- // Systemerror - System error.
|
|
- Checkpoint(criuOpts *CriuOpts) error
|
|
-
|
|
- // Restore restores the checkpointed container to a running state using the criu(8) utility.
|
|
- //
|
|
- // errors:
|
|
- // Systemerror - System error.
|
|
- Restore(process *Process, criuOpts *CriuOpts) error
|
|
-
|
|
- // If the Container state is RUNNING or CREATED, sets the Container state to PAUSING and pauses
|
|
- // the execution of any user processes. Asynchronously, when the container finished being paused the
|
|
- // state is changed to PAUSED.
|
|
- // If the Container state is PAUSED, do nothing.
|
|
- //
|
|
- // errors:
|
|
- // ContainerNotExists - Container no longer exists,
|
|
- // ContainerNotRunning - Container not running or created,
|
|
- // Systemerror - System error.
|
|
- Pause() error
|
|
-
|
|
- // If the Container state is PAUSED, resumes the execution of any user processes in the
|
|
- // Container before setting the Container state to RUNNING.
|
|
- // If the Container state is RUNNING, do nothing.
|
|
- //
|
|
- // errors:
|
|
- // ContainerNotExists - Container no longer exists,
|
|
- // ContainerNotPaused - Container is not paused,
|
|
- // Systemerror - System error.
|
|
- Resume() error
|
|
-
|
|
- // NotifyOOM returns a read-only channel signaling when the container receives an OOM notification.
|
|
- //
|
|
- // errors:
|
|
- // Systemerror - System error.
|
|
- NotifyOOM() (<-chan struct{}, error)
|
|
-
|
|
- // NotifyMemoryPressure returns a read-only channel signaling when the container reaches a given pressure level
|
|
- //
|
|
- // errors:
|
|
- // Systemerror - System error.
|
|
- NotifyMemoryPressure(level PressureLevel) (<-chan struct{}, error)
|
|
-}
|
|
-
|
|
-// ID returns the container's unique ID
|
|
-func (c *linuxContainer) ID() string {
|
|
- return c.id
|
|
-}
|
|
-
|
|
-// Config returns the container's configuration
|
|
-func (c *linuxContainer) Config() configs.Config {
|
|
- return *c.config
|
|
-}
|
|
-
|
|
-func (c *linuxContainer) Status() (Status, error) {
|
|
- c.m.Lock()
|
|
- defer c.m.Unlock()
|
|
- return c.currentStatus()
|
|
-}
|
|
-
|
|
-func (c *linuxContainer) State() (*State, error) {
|
|
- c.m.Lock()
|
|
- defer c.m.Unlock()
|
|
- return c.currentState()
|
|
-}
|
|
-
|
|
-func (c *linuxContainer) Processes() ([]int, error) {
|
|
- pids, err := c.cgroupManager.GetAllPids()
|
|
- if err != nil {
|
|
- return nil, newSystemErrorWithCause(err, "getting all container pids from cgroups")
|
|
- }
|
|
- return pids, nil
|
|
-}
|
|
-
|
|
-func (c *linuxContainer) Stats() (*Stats, error) {
|
|
- var (
|
|
- err error
|
|
- stats = &Stats{}
|
|
- )
|
|
- if stats.CgroupStats, err = c.cgroupManager.GetStats(); err != nil {
|
|
- return stats, newSystemErrorWithCause(err, "getting container stats from cgroups")
|
|
- }
|
|
- for _, iface := range c.config.Networks {
|
|
- switch iface.Type {
|
|
- case "veth":
|
|
- istats, err := getNetworkInterfaceStats(iface.HostInterfaceName)
|
|
- if err != nil {
|
|
- return stats, newSystemErrorWithCausef(err, "getting network stats for interface %q", iface.HostInterfaceName)
|
|
- }
|
|
- stats.Interfaces = append(stats.Interfaces, istats)
|
|
- }
|
|
- }
|
|
- return stats, nil
|
|
-}
|
|
-
|
|
-func (c *linuxContainer) Set(config configs.Config) error {
|
|
- c.m.Lock()
|
|
- defer c.m.Unlock()
|
|
- status, err := c.currentStatus()
|
|
- if err != nil {
|
|
- return err
|
|
- }
|
|
- if status == Stopped {
|
|
- return newGenericError(fmt.Errorf("container not running"), ContainerNotRunning)
|
|
- }
|
|
- c.config = &config
|
|
- return c.cgroupManager.Set(c.config)
|
|
-}
|
|
-
|
|
-func (c *linuxContainer) Start(process *Process) error {
|
|
- c.m.Lock()
|
|
- defer c.m.Unlock()
|
|
- if process.Init {
|
|
- if err := c.createExecFifo(); err != nil {
|
|
- return err
|
|
- }
|
|
- }
|
|
- if err := c.start(process); err != nil {
|
|
- if process.Init {
|
|
- c.deleteExecFifo()
|
|
- }
|
|
- return err
|
|
- }
|
|
- return nil
|
|
-}
|
|
-
|
|
-func (c *linuxContainer) Run(process *Process) error {
|
|
- if err := c.Start(process); err != nil {
|
|
- return err
|
|
- }
|
|
- if process.Init {
|
|
- return c.exec()
|
|
- }
|
|
- return nil
|
|
-}
|
|
-
|
|
-func (c *linuxContainer) Exec() error {
|
|
- c.m.Lock()
|
|
- defer c.m.Unlock()
|
|
- return c.exec()
|
|
-}
|
|
-
|
|
-func (c *linuxContainer) exec() error {
|
|
- path := filepath.Join(c.root, execFifoFilename)
|
|
-
|
|
- fifoOpen := make(chan struct{})
|
|
- select {
|
|
- case <-awaitProcessExit(c.initProcess.pid(), fifoOpen):
|
|
- return errors.New("container process is already dead")
|
|
- case result := <-awaitFifoOpen(path, fifoOpen):
|
|
- if result.err != nil {
|
|
- return result.err
|
|
- }
|
|
- f := result.file
|
|
- defer f.Close()
|
|
- if err := readFromExecFifo(f); err != nil {
|
|
- return err
|
|
- }
|
|
- if err := os.Remove(path); !os.IsNotExist(err) {
|
|
- return err
|
|
- }
|
|
- return nil
|
|
- }
|
|
-}
|
|
-
|
|
-func readFromExecFifo(execFifo io.Reader) error {
|
|
- data, err := ioutil.ReadAll(execFifo)
|
|
- if err != nil {
|
|
- return err
|
|
- }
|
|
- if len(data) <= 0 {
|
|
- return fmt.Errorf("cannot start an already running container")
|
|
- }
|
|
- return nil
|
|
-}
|
|
-
|
|
-func awaitProcessExit(pid int, exit <-chan struct{}) <-chan struct{} {
|
|
- isDead := make(chan struct{})
|
|
- go func() {
|
|
- for {
|
|
- select {
|
|
- case <-exit:
|
|
- return
|
|
- case <-time.After(time.Millisecond * 100):
|
|
- stat, err := system.GetProcessState(pid)
|
|
- if err != nil || stat == system.Zombie {
|
|
- select {
|
|
- case <-exit:
|
|
- return
|
|
- default:
|
|
- close(isDead)
|
|
- }
|
|
- return
|
|
- }
|
|
- }
|
|
- }
|
|
- }()
|
|
- return isDead
|
|
-}
|
|
-
|
|
-func awaitFifoOpen(path string, fifoOpen chan struct{}) <-chan openResult {
|
|
- fifoOpened := make(chan openResult)
|
|
- go func() {
|
|
- f, err := os.OpenFile(path, os.O_RDONLY, 0)
|
|
- close(fifoOpen)
|
|
- if err != nil {
|
|
- fifoOpened <- openResult{err: newSystemErrorWithCause(err, "open exec fifo for reading")}
|
|
- return
|
|
- }
|
|
- fifoOpened <- openResult{file: f}
|
|
- }()
|
|
- return fifoOpened
|
|
-}
|
|
-
|
|
-type openResult struct {
|
|
- file *os.File
|
|
- err error
|
|
-}
|
|
-
|
|
-func (c *linuxContainer) start(process *Process) error {
|
|
- parent, err := c.newParentProcess(process)
|
|
- if err != nil {
|
|
- return newSystemErrorWithCause(err, "creating new parent process")
|
|
- }
|
|
-
|
|
- if logsDone := logs.ForwardLogs(); logsDone != nil {
|
|
- defer func() {
|
|
- select {
|
|
- case <-logsDone:
|
|
- case <-time.After(3 * time.Second):
|
|
- logrus.Warnf("wait child close logfd timeout")
|
|
- }
|
|
- }()
|
|
- }
|
|
-
|
|
- if err := parent.start(); err != nil {
|
|
- // terminate the process to ensure that it properly is reaped.
|
|
- if err := parent.terminate(); err != nil {
|
|
- logrus.Warnf("parent process terminate error: %v", err)
|
|
- }
|
|
- return newSystemErrorWithCause(err, "starting container process")
|
|
- }
|
|
- // generate a timestamp indicating when the container was started
|
|
- c.created = time.Now().UTC()
|
|
- if process.Init {
|
|
- c.state = &createdState{
|
|
- c: c,
|
|
- }
|
|
- state, err := c.updateState(parent)
|
|
- if err != nil {
|
|
- return err
|
|
- }
|
|
- c.initProcessStartTime = state.InitProcessStartTime
|
|
-
|
|
- if c.config.Hooks != nil {
|
|
- s := configs.HookState{
|
|
- SpecState: configs.SpecState{
|
|
- Version: c.config.Version,
|
|
- ID: c.id,
|
|
- Pid: parent.pid(),
|
|
- Bundle: utils.SearchLabels(c.config.Labels, "bundle"),
|
|
- },
|
|
- Root: c.config.Rootfs,
|
|
- }
|
|
- for i, hook := range c.config.Hooks.Poststart {
|
|
- logrus.Infof("run poststart hook %d:%s, ContainerID: %s", i, hook.Info(), s.ID)
|
|
- if err := hook.Run(s); err != nil {
|
|
- logrus.Warnf("running poststart hook %d:%s failed: %s, ContainerId: %s", i, hook.Info(), err, s.ID)
|
|
- }
|
|
- }
|
|
- }
|
|
- } else {
|
|
- c.state = &runningState{
|
|
- c: c,
|
|
- }
|
|
- }
|
|
- return nil
|
|
-}
|
|
-
|
|
-func (c *linuxContainer) Signal(s os.Signal, all bool) error {
|
|
- if all {
|
|
- return signalAllProcesses(c.cgroupManager, s)
|
|
- }
|
|
- status, err := c.currentStatus()
|
|
- if err != nil {
|
|
- return err
|
|
- }
|
|
- // to avoid a PID reuse attack
|
|
- if status == Running || status == Created {
|
|
- if err := c.initProcess.signal(s); err != nil {
|
|
- return newSystemErrorWithCause(err, "signaling init process")
|
|
- }
|
|
- return nil
|
|
- }
|
|
- return newGenericError(fmt.Errorf("container not running"), ContainerNotRunning)
|
|
-}
|
|
-
|
|
-func (c *linuxContainer) createExecFifo() error {
|
|
- rootuid, err := c.Config().HostRootUID()
|
|
- if err != nil {
|
|
- return err
|
|
- }
|
|
- rootgid, err := c.Config().HostRootGID()
|
|
- if err != nil {
|
|
- return err
|
|
- }
|
|
-
|
|
- fifoName := filepath.Join(c.root, execFifoFilename)
|
|
- if _, err := os.Stat(fifoName); err == nil {
|
|
- return fmt.Errorf("exec fifo %s already exists", fifoName)
|
|
- }
|
|
- oldMask := syscall.Umask(0000)
|
|
- if err := syscall.Mkfifo(fifoName, 0622); err != nil {
|
|
- syscall.Umask(oldMask)
|
|
- return err
|
|
- }
|
|
- syscall.Umask(oldMask)
|
|
- if err := os.Chown(fifoName, rootuid, rootgid); err != nil {
|
|
- return err
|
|
- }
|
|
- return nil
|
|
-}
|
|
-
|
|
-func (c *linuxContainer) deleteExecFifo() {
|
|
- fifoName := filepath.Join(c.root, execFifoFilename)
|
|
- os.Remove(fifoName)
|
|
-}
|
|
-
|
|
-func (c *linuxContainer) newParentProcess(p *Process) (parentProcess, error) {
|
|
- parentPipe, childPipe, err := utils.NewSockPair("init")
|
|
- if err != nil {
|
|
- return nil, newSystemErrorWithCause(err, "creating new init pipe")
|
|
- }
|
|
- if err := logs.InitLogPipe(); err != nil {
|
|
- return nil, fmt.Errorf("Unable to create the log pipe: %s", err)
|
|
- }
|
|
- cmd, err := c.commandTemplate(p, childPipe)
|
|
- if err != nil {
|
|
- return nil, newSystemErrorWithCause(err, "creating new command template")
|
|
- }
|
|
- if !p.Init {
|
|
- return c.newSetnsProcess(p, cmd, parentPipe, childPipe)
|
|
- }
|
|
-
|
|
- // We only set up rootDir if we're not doing a `runc exec`. The reason for
|
|
- // this is to avoid cases where a racing, unprivileged process inside the
|
|
- // container can get access to the statedir file descriptor (which would
|
|
- // allow for container rootfs escape).
|
|
- rootDir, err := os.Open(c.root)
|
|
- if err != nil {
|
|
- return nil, err
|
|
- }
|
|
- cmd.ExtraFiles = append(cmd.ExtraFiles, rootDir)
|
|
- cmd.Env = append(cmd.Env,
|
|
- fmt.Sprintf("_LIBCONTAINER_STATEDIR=%d", stdioFdCount+len(cmd.ExtraFiles)-1))
|
|
- return c.newInitProcess(p, cmd, parentPipe, childPipe, rootDir)
|
|
-}
|
|
-
|
|
-func (c *linuxContainer) commandTemplate(p *Process, childPipe *os.File) (*exec.Cmd, error) {
|
|
- cmd := exec.Command(c.initArgs[0], c.initArgs[1:]...)
|
|
- cmd.Stdin = p.Stdin
|
|
- cmd.Stdout = p.Stdout
|
|
- cmd.Stderr = p.Stderr
|
|
- cmd.Dir = c.config.Rootfs
|
|
- if cmd.SysProcAttr == nil {
|
|
- cmd.SysProcAttr = &syscall.SysProcAttr{}
|
|
- }
|
|
- cmd.ExtraFiles = append(cmd.ExtraFiles, p.ExtraFiles...)
|
|
- if p.ConsoleSocket != nil {
|
|
- cmd.ExtraFiles = append(cmd.ExtraFiles, p.ConsoleSocket)
|
|
- cmd.Env = append(cmd.Env,
|
|
- fmt.Sprintf("_LIBCONTAINER_CONSOLE=%d", stdioFdCount+len(cmd.ExtraFiles)-1),
|
|
- )
|
|
- }
|
|
- cmd.ExtraFiles = append(cmd.ExtraFiles, childPipe)
|
|
- cmd.Env = append(cmd.Env,
|
|
- fmt.Sprintf("_LIBCONTAINER_INITPIPE=%d", stdioFdCount+len(cmd.ExtraFiles)-1),
|
|
- )
|
|
-
|
|
- cmd.ExtraFiles = append(cmd.ExtraFiles, logs.ChildLogPipe)
|
|
- cmd.Env = append(cmd.Env,
|
|
- fmt.Sprintf("_LIBCONTAINER_LOGPIPE=%d", stdioFdCount+len(cmd.ExtraFiles)-1),
|
|
- )
|
|
-
|
|
- // NOTE: when running a container with no PID namespace and the parent process spawning the container is
|
|
- // PID1 the pdeathsig is being delivered to the container's init process by the kernel for some reason
|
|
- // even with the parent still running.
|
|
- if c.config.ParentDeathSignal > 0 {
|
|
- cmd.SysProcAttr.Pdeathsig = syscall.Signal(c.config.ParentDeathSignal)
|
|
- }
|
|
- return cmd, nil
|
|
-}
|
|
-
|
|
-func (c *linuxContainer) newInitProcess(p *Process, cmd *exec.Cmd, parentPipe, childPipe, rootDir *os.File) (*initProcess, error) {
|
|
- cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initStandard))
|
|
- nsMaps := make(map[configs.NamespaceType]string)
|
|
- for _, ns := range c.config.Namespaces {
|
|
- if ns.Path != "" {
|
|
- nsMaps[ns.Type] = ns.Path
|
|
- }
|
|
- }
|
|
- data, err := c.bootstrapData(c.config.Namespaces.CloneFlags(), nsMaps)
|
|
- if err != nil {
|
|
- return nil, err
|
|
- }
|
|
- return &initProcess{
|
|
- cmd: cmd,
|
|
- childPipe: childPipe,
|
|
- parentPipe: parentPipe,
|
|
- manager: c.cgroupManager,
|
|
- config: c.newInitConfig(p),
|
|
- container: c,
|
|
- process: p,
|
|
- bootstrapData: data,
|
|
- sharePidns: !c.config.Namespaces.IsPrivate(configs.NEWPID),
|
|
- rootDir: rootDir,
|
|
- }, nil
|
|
-}
|
|
-
|
|
-func (c *linuxContainer) newSetnsProcess(p *Process, cmd *exec.Cmd, parentPipe, childPipe *os.File) (*setnsProcess, error) {
|
|
- cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initSetns))
|
|
- state, err := c.currentState()
|
|
- if err != nil {
|
|
- return nil, newSystemErrorWithCause(err, "getting container's current state")
|
|
- }
|
|
- // for setns process, we don't have to set cloneflags as the process namespaces
|
|
- // will only be set via setns syscall
|
|
- data, err := c.bootstrapData(0, state.NamespacePaths)
|
|
- if err != nil {
|
|
- return nil, err
|
|
- }
|
|
- return &setnsProcess{
|
|
- cmd: cmd,
|
|
- cgroupPaths: c.cgroupManager.GetPaths(),
|
|
- childPipe: childPipe,
|
|
- parentPipe: parentPipe,
|
|
- config: c.newInitConfig(p),
|
|
- process: p,
|
|
- bootstrapData: data,
|
|
- }, nil
|
|
-}
|
|
-
|
|
-func (c *linuxContainer) newInitConfig(process *Process) *initConfig {
|
|
- cfg := &initConfig{
|
|
- Config: c.config,
|
|
- Args: process.Args,
|
|
- Env: process.Env,
|
|
- User: process.User,
|
|
- AdditionalGroups: process.AdditionalGroups,
|
|
- Cwd: process.Cwd,
|
|
- Capabilities: process.Capabilities,
|
|
- PassedFilesCount: len(process.ExtraFiles),
|
|
- ContainerId: c.ID(),
|
|
- NoNewPrivileges: c.config.NoNewPrivileges,
|
|
- Rootless: c.config.Rootless,
|
|
- AppArmorProfile: c.config.AppArmorProfile,
|
|
- ProcessLabel: c.config.ProcessLabel,
|
|
- Rlimits: c.config.Rlimits,
|
|
- }
|
|
- if process.NoNewPrivileges != nil {
|
|
- cfg.NoNewPrivileges = *process.NoNewPrivileges
|
|
- }
|
|
- if process.AppArmorProfile != "" {
|
|
- cfg.AppArmorProfile = process.AppArmorProfile
|
|
- }
|
|
- if process.Label != "" {
|
|
- cfg.ProcessLabel = process.Label
|
|
- }
|
|
- if len(process.Rlimits) > 0 {
|
|
- cfg.Rlimits = process.Rlimits
|
|
- }
|
|
- cfg.CreateConsole = process.ConsoleSocket != nil
|
|
- return cfg
|
|
-}
|
|
-
|
|
-func (c *linuxContainer) Destroy() error {
|
|
- c.m.Lock()
|
|
- defer c.m.Unlock()
|
|
- return c.state.destroy()
|
|
-}
|
|
-
|
|
-func (c *linuxContainer) Pause() error {
|
|
- c.m.Lock()
|
|
- defer c.m.Unlock()
|
|
- status, err := c.currentStatus()
|
|
- if err != nil {
|
|
- return err
|
|
- }
|
|
- switch status {
|
|
- case Running, Created:
|
|
- if err := c.cgroupManager.Freeze(configs.Frozen); err != nil {
|
|
- return err
|
|
- }
|
|
- return c.state.transition(&pausedState{
|
|
- c: c,
|
|
- })
|
|
- }
|
|
- return newGenericError(fmt.Errorf("container not running or created: %s", status), ContainerNotRunning)
|
|
-}
|
|
-
|
|
-func (c *linuxContainer) Resume() error {
|
|
- c.m.Lock()
|
|
- defer c.m.Unlock()
|
|
- status, err := c.currentStatus()
|
|
- if err != nil {
|
|
- return err
|
|
- }
|
|
- if status != Paused {
|
|
- return newGenericError(fmt.Errorf("container not paused"), ContainerNotPaused)
|
|
- }
|
|
- if err := c.cgroupManager.Freeze(configs.Thawed); err != nil {
|
|
- return err
|
|
- }
|
|
- return c.state.transition(&runningState{
|
|
- c: c,
|
|
- })
|
|
-}
|
|
-
|
|
-func (c *linuxContainer) NotifyOOM() (<-chan struct{}, error) {
|
|
- // XXX(cyphar): This requires cgroups.
|
|
- if c.config.Rootless {
|
|
- return nil, fmt.Errorf("cannot get OOM notifications from rootless container")
|
|
- }
|
|
- return notifyOnOOM(c.cgroupManager.GetPaths())
|
|
-}
|
|
-
|
|
-func (c *linuxContainer) NotifyMemoryPressure(level PressureLevel) (<-chan struct{}, error) {
|
|
- // XXX(cyphar): This requires cgroups.
|
|
- if c.config.Rootless {
|
|
- return nil, fmt.Errorf("cannot get memory pressure notifications from rootless container")
|
|
- }
|
|
- return notifyMemoryPressure(c.cgroupManager.GetPaths(), level)
|
|
-}
|
|
-
|
|
-var criuFeatures *criurpc.CriuFeatures
|
|
-
|
|
-func (c *linuxContainer) checkCriuFeatures(criuOpts *CriuOpts, rpcOpts *criurpc.CriuOpts, criuFeat *criurpc.CriuFeatures) error {
|
|
-
|
|
- var t criurpc.CriuReqType
|
|
- t = criurpc.CriuReqType_FEATURE_CHECK
|
|
-
|
|
- if err := c.checkCriuVersion("1.8"); err != nil {
|
|
- // Feature checking was introduced with CRIU 1.8.
|
|
- // Ignore the feature check if an older CRIU version is used
|
|
- // and just act as before.
|
|
- // As all automated PR testing is done using CRIU 1.7 this
|
|
- // code will not be tested by automated PR testing.
|
|
- return nil
|
|
- }
|
|
-
|
|
- // make sure the features we are looking for are really not from
|
|
- // some previous check
|
|
- criuFeatures = nil
|
|
-
|
|
- req := &criurpc.CriuReq{
|
|
- Type: &t,
|
|
- // Theoretically this should not be necessary but CRIU
|
|
- // segfaults if Opts is empty.
|
|
- // Fixed in CRIU 2.12
|
|
- Opts: rpcOpts,
|
|
- Features: criuFeat,
|
|
- }
|
|
-
|
|
- err := c.criuSwrk(nil, req, criuOpts, false)
|
|
- if err != nil {
|
|
- logrus.Debugf("%s", err)
|
|
- return fmt.Errorf("CRIU feature check failed")
|
|
- }
|
|
-
|
|
- logrus.Debugf("Feature check says: %s", criuFeatures)
|
|
- missingFeatures := false
|
|
-
|
|
- if *criuFeat.MemTrack && !*criuFeatures.MemTrack {
|
|
- missingFeatures = true
|
|
- logrus.Debugf("CRIU does not support MemTrack")
|
|
- }
|
|
-
|
|
- if missingFeatures {
|
|
- return fmt.Errorf("CRIU is missing features")
|
|
- }
|
|
-
|
|
- return nil
|
|
-}
|
|
-
|
|
-// checkCriuVersion checks Criu version greater than or equal to minVersion
|
|
-func (c *linuxContainer) checkCriuVersion(minVersion string) error {
|
|
- var x, y, z, versionReq int
|
|
-
|
|
- _, err := fmt.Sscanf(minVersion, "%d.%d.%d\n", &x, &y, &z) // 1.5.2
|
|
- if err != nil {
|
|
- _, err = fmt.Sscanf(minVersion, "Version: %d.%d\n", &x, &y) // 1.6
|
|
- }
|
|
- versionReq = x*10000 + y*100 + z
|
|
-
|
|
- out, err := exec.Command(c.criuPath, "-V").Output()
|
|
- if err != nil {
|
|
- return fmt.Errorf("Unable to execute CRIU command: %s", c.criuPath)
|
|
- }
|
|
-
|
|
- x = 0
|
|
- y = 0
|
|
- z = 0
|
|
- if ep := strings.Index(string(out), "-"); ep >= 0 {
|
|
- // criu Git version format
|
|
- var version string
|
|
- if sp := strings.Index(string(out), "GitID"); sp > 0 {
|
|
- version = string(out)[sp:ep]
|
|
- } else {
|
|
- return fmt.Errorf("Unable to parse the CRIU version: %s", c.criuPath)
|
|
- }
|
|
-
|
|
- n, err := fmt.Sscanf(string(version), "GitID: v%d.%d.%d", &x, &y, &z) // 1.5.2
|
|
- if err != nil {
|
|
- n, err = fmt.Sscanf(string(version), "GitID: v%d.%d", &x, &y) // 1.6
|
|
- y++
|
|
- } else {
|
|
- z++
|
|
- }
|
|
- if n < 2 || err != nil {
|
|
- return fmt.Errorf("Unable to parse the CRIU version: %s %d %s", version, n, err)
|
|
- }
|
|
- } else {
|
|
- // criu release version format
|
|
- n, err := fmt.Sscanf(string(out), "Version: %d.%d.%d\n", &x, &y, &z) // 1.5.2
|
|
- if err != nil {
|
|
- n, err = fmt.Sscanf(string(out), "Version: %d.%d\n", &x, &y) // 1.6
|
|
- }
|
|
- if n < 2 || err != nil {
|
|
- return fmt.Errorf("Unable to parse the CRIU version: %s %d %s", out, n, err)
|
|
- }
|
|
- }
|
|
-
|
|
- c.criuVersion = x*10000 + y*100 + z
|
|
-
|
|
- if c.criuVersion < versionReq {
|
|
- return fmt.Errorf("CRIU version must be %s or higher", minVersion)
|
|
- }
|
|
-
|
|
- return nil
|
|
-}
|
|
-
|
|
-const descriptorsFilename = "descriptors.json"
|
|
-
|
|
-func (c *linuxContainer) addCriuDumpMount(req *criurpc.CriuReq, m *configs.Mount) {
|
|
- mountDest := m.Destination
|
|
- if strings.HasPrefix(mountDest, c.config.Rootfs) {
|
|
- mountDest = mountDest[len(c.config.Rootfs):]
|
|
- }
|
|
-
|
|
- extMnt := &criurpc.ExtMountMap{
|
|
- Key: proto.String(mountDest),
|
|
- Val: proto.String(mountDest),
|
|
- }
|
|
- req.Opts.ExtMnt = append(req.Opts.ExtMnt, extMnt)
|
|
-}
|
|
-
|
|
-func (c *linuxContainer) addMaskPaths(req *criurpc.CriuReq) error {
|
|
- for _, path := range c.config.MaskPaths {
|
|
- fi, err := os.Stat(fmt.Sprintf("/proc/%d/root/%s", c.initProcess.pid(), path))
|
|
- if err != nil {
|
|
- if os.IsNotExist(err) {
|
|
- continue
|
|
- }
|
|
- return err
|
|
- }
|
|
- if fi.IsDir() {
|
|
- continue
|
|
- }
|
|
-
|
|
- extMnt := &criurpc.ExtMountMap{
|
|
- Key: proto.String(path),
|
|
- Val: proto.String("/dev/null"),
|
|
- }
|
|
- req.Opts.ExtMnt = append(req.Opts.ExtMnt, extMnt)
|
|
- }
|
|
-
|
|
- return nil
|
|
-}
|
|
-
|
|
-func (c *linuxContainer) Checkpoint(criuOpts *CriuOpts) error {
|
|
- c.m.Lock()
|
|
- defer c.m.Unlock()
|
|
-
|
|
- // TODO(avagin): Figure out how to make this work nicely. CRIU 2.0 has
|
|
- // support for doing unprivileged dumps, but the setup of
|
|
- // rootless containers might make this complicated.
|
|
- if c.config.Rootless {
|
|
- return fmt.Errorf("cannot checkpoint a rootless container")
|
|
- }
|
|
-
|
|
- if err := c.checkCriuVersion("1.5.2"); err != nil {
|
|
- return err
|
|
- }
|
|
-
|
|
- if criuOpts.ImagesDirectory == "" {
|
|
- return fmt.Errorf("invalid directory to save checkpoint")
|
|
- }
|
|
-
|
|
- // Since a container can be C/R'ed multiple times,
|
|
- // the checkpoint directory may already exist.
|
|
- if err := os.Mkdir(criuOpts.ImagesDirectory, 0755); err != nil && !os.IsExist(err) {
|
|
- return err
|
|
- }
|
|
-
|
|
- if criuOpts.WorkDirectory == "" {
|
|
- criuOpts.WorkDirectory = filepath.Join(c.root, "criu.work")
|
|
- }
|
|
-
|
|
- if err := os.Mkdir(criuOpts.WorkDirectory, 0755); err != nil && !os.IsExist(err) {
|
|
- return err
|
|
- }
|
|
-
|
|
- workDir, err := os.Open(criuOpts.WorkDirectory)
|
|
- if err != nil {
|
|
- return err
|
|
- }
|
|
- defer workDir.Close()
|
|
-
|
|
- imageDir, err := os.Open(criuOpts.ImagesDirectory)
|
|
- if err != nil {
|
|
- return err
|
|
- }
|
|
- defer imageDir.Close()
|
|
-
|
|
- rpcOpts := criurpc.CriuOpts{
|
|
- ImagesDirFd: proto.Int32(int32(imageDir.Fd())),
|
|
- WorkDirFd: proto.Int32(int32(workDir.Fd())),
|
|
- LogLevel: proto.Int32(4),
|
|
- LogFile: proto.String("dump.log"),
|
|
- Root: proto.String(c.config.Rootfs),
|
|
- ManageCgroups: proto.Bool(true),
|
|
- NotifyScripts: proto.Bool(true),
|
|
- Pid: proto.Int32(int32(c.initProcess.pid())),
|
|
- ShellJob: proto.Bool(criuOpts.ShellJob),
|
|
- LeaveRunning: proto.Bool(criuOpts.LeaveRunning),
|
|
- TcpEstablished: proto.Bool(criuOpts.TcpEstablished),
|
|
- ExtUnixSk: proto.Bool(criuOpts.ExternalUnixConnections),
|
|
- FileLocks: proto.Bool(criuOpts.FileLocks),
|
|
- EmptyNs: proto.Uint32(criuOpts.EmptyNs),
|
|
- }
|
|
-
|
|
- // append optional criu opts, e.g., page-server and port
|
|
- if criuOpts.PageServer.Address != "" && criuOpts.PageServer.Port != 0 {
|
|
- rpcOpts.Ps = &criurpc.CriuPageServerInfo{
|
|
- Address: proto.String(criuOpts.PageServer.Address),
|
|
- Port: proto.Int32(criuOpts.PageServer.Port),
|
|
- }
|
|
- }
|
|
-
|
|
- //pre-dump may need parentImage param to complete iterative migration
|
|
- if criuOpts.ParentImage != "" {
|
|
- rpcOpts.ParentImg = proto.String(criuOpts.ParentImage)
|
|
- rpcOpts.TrackMem = proto.Bool(true)
|
|
- }
|
|
-
|
|
- // append optional manage cgroups mode
|
|
- if criuOpts.ManageCgroupsMode != 0 {
|
|
- if err := c.checkCriuVersion("1.7"); err != nil {
|
|
- return err
|
|
- }
|
|
- mode := criurpc.CriuCgMode(criuOpts.ManageCgroupsMode)
|
|
- rpcOpts.ManageCgroupsMode = &mode
|
|
- }
|
|
-
|
|
- var t criurpc.CriuReqType
|
|
- if criuOpts.PreDump {
|
|
- feat := criurpc.CriuFeatures{
|
|
- MemTrack: proto.Bool(true),
|
|
- }
|
|
-
|
|
- if err := c.checkCriuFeatures(criuOpts, &rpcOpts, &feat); err != nil {
|
|
- return err
|
|
- }
|
|
-
|
|
- t = criurpc.CriuReqType_PRE_DUMP
|
|
- } else {
|
|
- t = criurpc.CriuReqType_DUMP
|
|
- }
|
|
- req := &criurpc.CriuReq{
|
|
- Type: &t,
|
|
- Opts: &rpcOpts,
|
|
- }
|
|
-
|
|
- //no need to dump these information in pre-dump
|
|
- if !criuOpts.PreDump {
|
|
- for _, m := range c.config.Mounts {
|
|
- switch m.Device {
|
|
- case "bind":
|
|
- c.addCriuDumpMount(req, m)
|
|
- break
|
|
- case "cgroup":
|
|
- binds, err := getCgroupMounts(m)
|
|
- if err != nil {
|
|
- return err
|
|
- }
|
|
- for _, b := range binds {
|
|
- c.addCriuDumpMount(req, b)
|
|
- }
|
|
- break
|
|
- }
|
|
- }
|
|
-
|
|
- if err := c.addMaskPaths(req); err != nil {
|
|
- return err
|
|
- }
|
|
-
|
|
- for _, node := range c.config.Devices {
|
|
- m := &configs.Mount{Destination: node.Path, Source: node.Path}
|
|
- c.addCriuDumpMount(req, m)
|
|
- }
|
|
-
|
|
- // Write the FD info to a file in the image directory
|
|
- fdsJSON, err := json.Marshal(c.initProcess.externalDescriptors())
|
|
- if err != nil {
|
|
- return err
|
|
- }
|
|
-
|
|
- err = ioutil.WriteFile(filepath.Join(criuOpts.ImagesDirectory, descriptorsFilename), fdsJSON, 0655)
|
|
- if err != nil {
|
|
- return err
|
|
- }
|
|
- }
|
|
-
|
|
- err = c.criuSwrk(nil, req, criuOpts, false)
|
|
- if err != nil {
|
|
- return err
|
|
- }
|
|
- return nil
|
|
-}
|
|
-
|
|
-func (c *linuxContainer) addCriuRestoreMount(req *criurpc.CriuReq, m *configs.Mount) {
|
|
- mountDest := m.Destination
|
|
- if strings.HasPrefix(mountDest, c.config.Rootfs) {
|
|
- mountDest = mountDest[len(c.config.Rootfs):]
|
|
- }
|
|
-
|
|
- extMnt := &criurpc.ExtMountMap{
|
|
- Key: proto.String(mountDest),
|
|
- Val: proto.String(m.Source),
|
|
- }
|
|
- req.Opts.ExtMnt = append(req.Opts.ExtMnt, extMnt)
|
|
-}
|
|
-
|
|
-func (c *linuxContainer) restoreNetwork(req *criurpc.CriuReq, criuOpts *CriuOpts) {
|
|
- for _, iface := range c.config.Networks {
|
|
- switch iface.Type {
|
|
- case "veth":
|
|
- veth := new(criurpc.CriuVethPair)
|
|
- veth.IfOut = proto.String(iface.HostInterfaceName)
|
|
- veth.IfIn = proto.String(iface.Name)
|
|
- req.Opts.Veths = append(req.Opts.Veths, veth)
|
|
- break
|
|
- case "loopback":
|
|
- break
|
|
- }
|
|
- }
|
|
- for _, i := range criuOpts.VethPairs {
|
|
- veth := new(criurpc.CriuVethPair)
|
|
- veth.IfOut = proto.String(i.HostInterfaceName)
|
|
- veth.IfIn = proto.String(i.ContainerInterfaceName)
|
|
- req.Opts.Veths = append(req.Opts.Veths, veth)
|
|
- }
|
|
-}
|
|
-
|
|
-func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error {
|
|
- c.m.Lock()
|
|
- defer c.m.Unlock()
|
|
-
|
|
- // TODO(avagin): Figure out how to make this work nicely. CRIU doesn't have
|
|
- // support for unprivileged restore at the moment.
|
|
- if c.config.Rootless {
|
|
- return fmt.Errorf("cannot restore a rootless container")
|
|
- }
|
|
-
|
|
- if err := c.checkCriuVersion("1.5.2"); err != nil {
|
|
- return err
|
|
- }
|
|
- if criuOpts.WorkDirectory == "" {
|
|
- criuOpts.WorkDirectory = filepath.Join(c.root, "criu.work")
|
|
- }
|
|
- // Since a container can be C/R'ed multiple times,
|
|
- // the work directory may already exist.
|
|
- if err := os.Mkdir(criuOpts.WorkDirectory, 0655); err != nil && !os.IsExist(err) {
|
|
- return err
|
|
- }
|
|
- workDir, err := os.Open(criuOpts.WorkDirectory)
|
|
- if err != nil {
|
|
- return err
|
|
- }
|
|
- defer workDir.Close()
|
|
- if criuOpts.ImagesDirectory == "" {
|
|
- return fmt.Errorf("invalid directory to restore checkpoint")
|
|
- }
|
|
- imageDir, err := os.Open(criuOpts.ImagesDirectory)
|
|
- if err != nil {
|
|
- return err
|
|
- }
|
|
- defer imageDir.Close()
|
|
- // CRIU has a few requirements for a root directory:
|
|
- // * it must be a mount point
|
|
- // * its parent must not be overmounted
|
|
- // c.config.Rootfs is bind-mounted to a temporary directory
|
|
- // to satisfy these requirements.
|
|
- root := filepath.Join(c.root, "criu-root")
|
|
- if err := os.Mkdir(root, 0755); err != nil {
|
|
- return err
|
|
- }
|
|
- defer os.Remove(root)
|
|
- root, err = filepath.EvalSymlinks(root)
|
|
- if err != nil {
|
|
- return err
|
|
- }
|
|
- err = syscall.Mount(c.config.Rootfs, root, "", syscall.MS_BIND|syscall.MS_REC, "")
|
|
- if err != nil {
|
|
- return err
|
|
- }
|
|
- defer syscall.Unmount(root, syscall.MNT_DETACH)
|
|
- t := criurpc.CriuReqType_RESTORE
|
|
- req := &criurpc.CriuReq{
|
|
- Type: &t,
|
|
- Opts: &criurpc.CriuOpts{
|
|
- ImagesDirFd: proto.Int32(int32(imageDir.Fd())),
|
|
- WorkDirFd: proto.Int32(int32(workDir.Fd())),
|
|
- EvasiveDevices: proto.Bool(true),
|
|
- LogLevel: proto.Int32(4),
|
|
- LogFile: proto.String("restore.log"),
|
|
- RstSibling: proto.Bool(true),
|
|
- Root: proto.String(root),
|
|
- ManageCgroups: proto.Bool(true),
|
|
- NotifyScripts: proto.Bool(true),
|
|
- ShellJob: proto.Bool(criuOpts.ShellJob),
|
|
- ExtUnixSk: proto.Bool(criuOpts.ExternalUnixConnections),
|
|
- TcpEstablished: proto.Bool(criuOpts.TcpEstablished),
|
|
- FileLocks: proto.Bool(criuOpts.FileLocks),
|
|
- EmptyNs: proto.Uint32(criuOpts.EmptyNs),
|
|
- },
|
|
- }
|
|
-
|
|
- for _, m := range c.config.Mounts {
|
|
- switch m.Device {
|
|
- case "bind":
|
|
- c.addCriuRestoreMount(req, m)
|
|
- break
|
|
- case "cgroup":
|
|
- binds, err := getCgroupMounts(m)
|
|
- if err != nil {
|
|
- return err
|
|
- }
|
|
- for _, b := range binds {
|
|
- c.addCriuRestoreMount(req, b)
|
|
- }
|
|
- break
|
|
- }
|
|
- }
|
|
-
|
|
- if len(c.config.MaskPaths) > 0 {
|
|
- m := &configs.Mount{Destination: "/dev/null", Source: "/dev/null"}
|
|
- c.addCriuRestoreMount(req, m)
|
|
- }
|
|
-
|
|
- for _, node := range c.config.Devices {
|
|
- m := &configs.Mount{Destination: node.Path, Source: node.Path}
|
|
- c.addCriuRestoreMount(req, m)
|
|
- }
|
|
-
|
|
- if criuOpts.EmptyNs&syscall.CLONE_NEWNET == 0 {
|
|
- c.restoreNetwork(req, criuOpts)
|
|
- }
|
|
-
|
|
- // append optional manage cgroups mode
|
|
- if criuOpts.ManageCgroupsMode != 0 {
|
|
- if err := c.checkCriuVersion("1.7"); err != nil {
|
|
- return err
|
|
- }
|
|
- mode := criurpc.CriuCgMode(criuOpts.ManageCgroupsMode)
|
|
- req.Opts.ManageCgroupsMode = &mode
|
|
- }
|
|
-
|
|
- var (
|
|
- fds []string
|
|
- fdJSON []byte
|
|
- )
|
|
- if fdJSON, err = ioutil.ReadFile(filepath.Join(criuOpts.ImagesDirectory, descriptorsFilename)); err != nil {
|
|
- return err
|
|
- }
|
|
-
|
|
- if err := json.Unmarshal(fdJSON, &fds); err != nil {
|
|
- return err
|
|
- }
|
|
- for i := range fds {
|
|
- if s := fds[i]; strings.Contains(s, "pipe:") {
|
|
- inheritFd := new(criurpc.InheritFd)
|
|
- inheritFd.Key = proto.String(s)
|
|
- inheritFd.Fd = proto.Int32(int32(i))
|
|
- req.Opts.InheritFd = append(req.Opts.InheritFd, inheritFd)
|
|
- }
|
|
- }
|
|
- return c.criuSwrk(process, req, criuOpts, true)
|
|
-}
|
|
-
|
|
-func (c *linuxContainer) criuApplyCgroups(pid int, req *criurpc.CriuReq) error {
|
|
- // XXX: Do we need to deal with this case? AFAIK criu still requires root.
|
|
- if err := c.cgroupManager.Apply(pid); err != nil {
|
|
- return err
|
|
- }
|
|
-
|
|
- if err := c.cgroupManager.Set(c.config); err != nil {
|
|
- return newSystemError(err)
|
|
- }
|
|
-
|
|
- path := fmt.Sprintf("/proc/%d/cgroup", pid)
|
|
- cgroupsPaths, err := cgroups.ParseCgroupFile(path)
|
|
- if err != nil {
|
|
- return err
|
|
- }
|
|
-
|
|
- for c, p := range cgroupsPaths {
|
|
- cgroupRoot := &criurpc.CgroupRoot{
|
|
- Ctrl: proto.String(c),
|
|
- Path: proto.String(p),
|
|
- }
|
|
- req.Opts.CgRoot = append(req.Opts.CgRoot, cgroupRoot)
|
|
- }
|
|
-
|
|
- return nil
|
|
-}
|
|
-
|
|
-func (c *linuxContainer) criuSwrk(process *Process, req *criurpc.CriuReq, opts *CriuOpts, applyCgroups bool) error {
|
|
- fds, err := syscall.Socketpair(syscall.AF_LOCAL, syscall.SOCK_SEQPACKET|syscall.SOCK_CLOEXEC, 0)
|
|
- if err != nil {
|
|
- return err
|
|
- }
|
|
-
|
|
- logPath := filepath.Join(opts.WorkDirectory, req.GetOpts().GetLogFile())
|
|
- criuClient := os.NewFile(uintptr(fds[0]), "criu-transport-client")
|
|
- criuServer := os.NewFile(uintptr(fds[1]), "criu-transport-server")
|
|
- defer criuClient.Close()
|
|
- defer criuServer.Close()
|
|
-
|
|
- args := []string{"swrk", "3"}
|
|
- logrus.Debugf("Using CRIU %d at: %s", c.criuVersion, c.criuPath)
|
|
- logrus.Debugf("Using CRIU with following args: %s", args)
|
|
- cmd := exec.Command(c.criuPath, args...)
|
|
- if process != nil {
|
|
- cmd.Stdin = process.Stdin
|
|
- cmd.Stdout = process.Stdout
|
|
- cmd.Stderr = process.Stderr
|
|
- }
|
|
- cmd.ExtraFiles = append(cmd.ExtraFiles, criuServer)
|
|
-
|
|
- if err := cmd.Start(); err != nil {
|
|
- return err
|
|
- }
|
|
- criuServer.Close()
|
|
-
|
|
- defer func() {
|
|
- criuClient.Close()
|
|
- _, err := cmd.Process.Wait()
|
|
- if err != nil {
|
|
- return
|
|
- }
|
|
- }()
|
|
-
|
|
- if applyCgroups {
|
|
- err := c.criuApplyCgroups(cmd.Process.Pid, req)
|
|
- if err != nil {
|
|
- return err
|
|
- }
|
|
- }
|
|
-
|
|
- var extFds []string
|
|
- if process != nil {
|
|
- extFds, err = getPipeFds(cmd.Process.Pid)
|
|
- if err != nil {
|
|
- return err
|
|
- }
|
|
- }
|
|
-
|
|
- logrus.Debugf("Using CRIU in %s mode", req.GetType().String())
|
|
- // In the case of criurpc.CriuReqType_FEATURE_CHECK req.GetOpts()
|
|
- // should be empty. For older CRIU versions it still will be
|
|
- // available but empty.
|
|
- if req.GetType() != criurpc.CriuReqType_FEATURE_CHECK {
|
|
- val := reflect.ValueOf(req.GetOpts())
|
|
- v := reflect.Indirect(val)
|
|
- for i := 0; i < v.NumField(); i++ {
|
|
- st := v.Type()
|
|
- name := st.Field(i).Name
|
|
- if strings.HasPrefix(name, "XXX_") {
|
|
- continue
|
|
- }
|
|
- value := val.MethodByName("Get" + name).Call([]reflect.Value{})
|
|
- logrus.Debugf("CRIU option %s with value %v", name, value[0])
|
|
- }
|
|
- }
|
|
- data, err := proto.Marshal(req)
|
|
- if err != nil {
|
|
- return err
|
|
- }
|
|
- _, err = criuClient.Write(data)
|
|
- if err != nil {
|
|
- return err
|
|
- }
|
|
-
|
|
- buf := make([]byte, 10*4096)
|
|
- for true {
|
|
- n, err := criuClient.Read(buf)
|
|
- if err != nil {
|
|
- return err
|
|
- }
|
|
- if n == 0 {
|
|
- return fmt.Errorf("unexpected EOF")
|
|
- }
|
|
- if n == len(buf) {
|
|
- return fmt.Errorf("buffer is too small")
|
|
- }
|
|
-
|
|
- resp := new(criurpc.CriuResp)
|
|
- err = proto.Unmarshal(buf[:n], resp)
|
|
- if err != nil {
|
|
- return err
|
|
- }
|
|
- if !resp.GetSuccess() {
|
|
- typeString := req.GetType().String()
|
|
- return fmt.Errorf("criu failed: type %s errno %d\nlog file: %s", typeString, resp.GetCrErrno(), logPath)
|
|
- }
|
|
-
|
|
- t := resp.GetType()
|
|
- switch {
|
|
- case t == criurpc.CriuReqType_FEATURE_CHECK:
|
|
- logrus.Debugf("Feature check says: %s", resp)
|
|
- criuFeatures = resp.GetFeatures()
|
|
- break
|
|
- case t == criurpc.CriuReqType_NOTIFY:
|
|
- if err := c.criuNotifications(resp, process, opts, extFds); err != nil {
|
|
- return err
|
|
- }
|
|
- t = criurpc.CriuReqType_NOTIFY
|
|
- req = &criurpc.CriuReq{
|
|
- Type: &t,
|
|
- NotifySuccess: proto.Bool(true),
|
|
- }
|
|
- data, err = proto.Marshal(req)
|
|
- if err != nil {
|
|
- return err
|
|
- }
|
|
- _, err = criuClient.Write(data)
|
|
- if err != nil {
|
|
- return err
|
|
- }
|
|
- continue
|
|
- case t == criurpc.CriuReqType_RESTORE:
|
|
- case t == criurpc.CriuReqType_DUMP:
|
|
- break
|
|
- case t == criurpc.CriuReqType_PRE_DUMP:
|
|
- // In pre-dump mode CRIU is in a loop and waits for
|
|
- // the final DUMP command.
|
|
- // The current runc pre-dump approach, however, is
|
|
- // start criu in PRE_DUMP once for a single pre-dump
|
|
- // and not the whole series of pre-dump, pre-dump, ...m, dump
|
|
- // If we got the message CriuReqType_PRE_DUMP it means
|
|
- // CRIU was successful and we need to forcefully stop CRIU
|
|
- logrus.Debugf("PRE_DUMP finished. Send close signal to CRIU service")
|
|
- criuClient.Close()
|
|
- // Process status won't be success, because one end of sockets is closed
|
|
- _, err := cmd.Process.Wait()
|
|
- if err != nil {
|
|
- logrus.Debugf("After PRE_DUMP CRIU exiting failed")
|
|
- return err
|
|
- }
|
|
- return nil
|
|
- default:
|
|
- return fmt.Errorf("unable to parse the response %s", resp.String())
|
|
- }
|
|
-
|
|
- break
|
|
- }
|
|
-
|
|
- // cmd.Wait() waits cmd.goroutines which are used for proxying file descriptors.
|
|
- // Here we want to wait only the CRIU process.
|
|
- st, err := cmd.Process.Wait()
|
|
- if err != nil {
|
|
- return err
|
|
- }
|
|
- if !st.Success() {
|
|
- return fmt.Errorf("criu failed: %s\nlog file: %s", st.String(), logPath)
|
|
- }
|
|
- return nil
|
|
-}
|
|
-
|
|
-// block any external network activity
|
|
-func lockNetwork(config *configs.Config) error {
|
|
- for _, config := range config.Networks {
|
|
- strategy, err := getStrategy(config.Type)
|
|
- if err != nil {
|
|
- return err
|
|
- }
|
|
-
|
|
- if err := strategy.detach(config); err != nil {
|
|
- return err
|
|
- }
|
|
- }
|
|
- return nil
|
|
-}
|
|
-
|
|
-func unlockNetwork(config *configs.Config) error {
|
|
- for _, config := range config.Networks {
|
|
- strategy, err := getStrategy(config.Type)
|
|
- if err != nil {
|
|
- return err
|
|
- }
|
|
- if err = strategy.attach(config); err != nil {
|
|
- return err
|
|
- }
|
|
- }
|
|
- return nil
|
|
-}
|
|
-
|
|
-func (c *linuxContainer) criuNotifications(resp *criurpc.CriuResp, process *Process, opts *CriuOpts, fds []string) error {
|
|
- notify := resp.GetNotify()
|
|
- if notify == nil {
|
|
- return fmt.Errorf("invalid response: %s", resp.String())
|
|
- }
|
|
- switch {
|
|
- case notify.GetScript() == "post-dump":
|
|
- f, err := os.Create(filepath.Join(c.root, "checkpoint"))
|
|
- if err != nil {
|
|
- return err
|
|
- }
|
|
- f.Close()
|
|
- case notify.GetScript() == "network-unlock":
|
|
- if err := unlockNetwork(c.config); err != nil {
|
|
- return err
|
|
- }
|
|
- case notify.GetScript() == "network-lock":
|
|
- if err := lockNetwork(c.config); err != nil {
|
|
- return err
|
|
- }
|
|
- case notify.GetScript() == "setup-namespaces":
|
|
- if c.config.Hooks != nil {
|
|
- s := configs.HookState{
|
|
- SpecState: configs.SpecState{
|
|
- Version: c.config.Version,
|
|
- ID: c.id,
|
|
- Pid: int(notify.GetPid()),
|
|
- Bundle: utils.SearchLabels(c.config.Labels, "bundle"),
|
|
- },
|
|
- Root: c.config.Rootfs,
|
|
- }
|
|
- for i, hook := range c.config.Hooks.Prestart {
|
|
- logrus.Infof("run prestart hook: %d:%s, ContainerID: %s", i, hook.Info(), s.ID)
|
|
- if err := hook.Run(s); err != nil {
|
|
- return newSystemErrorWithCausef(err, "running prestart hook %d:%s, ContainerID: %s", i, hook.Info(), s.ID)
|
|
- }
|
|
- logrus.Infof("prestart hook: %d:%s done", i, hook.Info())
|
|
- }
|
|
- }
|
|
- case notify.GetScript() == "post-restore":
|
|
- pid := notify.GetPid()
|
|
- r, err := newRestoredProcess(int(pid), fds)
|
|
- if err != nil {
|
|
- return err
|
|
- }
|
|
- process.ops = r
|
|
- if err := c.state.transition(&restoredState{
|
|
- imageDir: opts.ImagesDirectory,
|
|
- c: c,
|
|
- }); err != nil {
|
|
- return err
|
|
- }
|
|
- // create a timestamp indicating when the restored checkpoint was started
|
|
- c.created = time.Now().UTC()
|
|
- if _, err := c.updateState(r); err != nil {
|
|
- return err
|
|
- }
|
|
- if err := os.Remove(filepath.Join(c.root, "checkpoint")); err != nil {
|
|
- if !os.IsNotExist(err) {
|
|
- logrus.Error(err)
|
|
- }
|
|
- }
|
|
- }
|
|
- return nil
|
|
-}
|
|
-
|
|
-func (c *linuxContainer) updateState(process parentProcess) (*State, error) {
|
|
- c.initProcess = process
|
|
- state, err := c.currentState()
|
|
- if err != nil {
|
|
- return nil, err
|
|
- }
|
|
- err = c.saveState(state)
|
|
- if err != nil {
|
|
- return nil, err
|
|
- }
|
|
- return state, nil
|
|
-}
|
|
-
|
|
-func (c *linuxContainer) saveState(s *State) error {
|
|
- f, err := os.Create(filepath.Join(c.root, stateFilename))
|
|
- if err != nil {
|
|
- return err
|
|
- }
|
|
- defer f.Close()
|
|
- return utils.WriteJSON(f, s)
|
|
-}
|
|
-
|
|
-func (c *linuxContainer) deleteState() error {
|
|
- return os.Remove(filepath.Join(c.root, stateFilename))
|
|
-}
|
|
-
|
|
-func (c *linuxContainer) currentStatus() (Status, error) {
|
|
- if err := c.refreshState(); err != nil {
|
|
- return -1, err
|
|
- }
|
|
- return c.state.status(), nil
|
|
-}
|
|
-
|
|
-// refreshState needs to be called to verify that the current state on the
|
|
-// container is what is true. Because consumers of libcontainer can use it
|
|
-// out of process we need to verify the container's status based on runtime
|
|
-// information and not rely on our in process info.
|
|
-func (c *linuxContainer) refreshState() error {
|
|
- paused, err := c.isPaused()
|
|
- if err != nil {
|
|
- return err
|
|
- }
|
|
- if paused {
|
|
- return c.state.transition(&pausedState{c: c})
|
|
- }
|
|
- t, err := c.runType()
|
|
- if err != nil {
|
|
- return err
|
|
- }
|
|
- switch t {
|
|
- case Created:
|
|
- return c.state.transition(&createdState{c: c})
|
|
- case Running:
|
|
- return c.state.transition(&runningState{c: c})
|
|
- }
|
|
- return c.state.transition(&stoppedState{c: c})
|
|
-}
|
|
-
|
|
-// doesInitProcessExist checks if the init process is still the same process
|
|
-// as the initial one, it could happen that the original process has exited
|
|
-// and a new process has been created with the same pid, in this case, the
|
|
-// container would already be stopped.
|
|
-func (c *linuxContainer) doesInitProcessExist(initPid int) (bool, error) {
|
|
- startTime, err := system.GetProcessStartTime(initPid)
|
|
- if err != nil {
|
|
- return false, nil
|
|
- }
|
|
- if c.initProcessStartTime != startTime {
|
|
- return false, nil
|
|
- }
|
|
- return true, nil
|
|
-}
|
|
-
|
|
-func (c *linuxContainer) runType() (Status, error) {
|
|
- if c.initProcess == nil {
|
|
- return Stopped, nil
|
|
- }
|
|
- pid := c.initProcess.pid()
|
|
- // return Running if the init process is alive
|
|
- if err := syscall.Kill(pid, 0); err != nil {
|
|
- if err == syscall.ESRCH {
|
|
- // It means the process does not exist anymore, could happen when the
|
|
- // process exited just when we call the function, we should not return
|
|
- // error in this case.
|
|
- return Stopped, nil
|
|
- }
|
|
- return Stopped, newSystemErrorWithCausef(err, "sending signal 0 to pid %d", pid)
|
|
- }
|
|
- // check if the process is still the original init process.
|
|
- exist, err := c.doesInitProcessExist(pid)
|
|
- if !exist || err != nil {
|
|
- return Stopped, err
|
|
- }
|
|
- // We'll create exec fifo and blocking on it after container is created,
|
|
- // and delete it after start container.
|
|
- if _, err := os.Stat(filepath.Join(c.root, execFifoFilename)); err == nil {
|
|
- return Created, nil
|
|
- }
|
|
- return Running, nil
|
|
-}
|
|
-
|
|
-func (c *linuxContainer) isPaused() (bool, error) {
|
|
- fcg := c.cgroupManager.GetPaths()["freezer"]
|
|
- if fcg == "" {
|
|
- // A container doesn't have a freezer cgroup
|
|
- return false, nil
|
|
- }
|
|
- data, err := ioutil.ReadFile(filepath.Join(fcg, "freezer.state"))
|
|
- if err != nil {
|
|
- // If freezer cgroup is not mounted, the container would just be not paused.
|
|
- if os.IsNotExist(err) {
|
|
- return false, nil
|
|
- }
|
|
- return false, newSystemErrorWithCause(err, "checking if container is paused")
|
|
- }
|
|
- return bytes.Equal(bytes.TrimSpace(data), []byte("FROZEN")), nil
|
|
-}
|
|
-
|
|
-func (c *linuxContainer) currentState() (*State, error) {
|
|
- var (
|
|
- startTime string
|
|
- externalDescriptors []string
|
|
- pid = -1
|
|
- )
|
|
- if c.initProcess != nil {
|
|
- pid = c.initProcess.pid()
|
|
- startTime, _ = c.initProcess.startTime()
|
|
- externalDescriptors = c.initProcess.externalDescriptors()
|
|
- }
|
|
- state := &State{
|
|
- BaseState: BaseState{
|
|
- ID: c.ID(),
|
|
- Config: *c.config,
|
|
- InitProcessPid: pid,
|
|
- InitProcessStartTime: startTime,
|
|
- Created: c.created,
|
|
- },
|
|
- Rootless: c.config.Rootless,
|
|
- CgroupPaths: c.cgroupManager.GetPaths(),
|
|
- NamespacePaths: make(map[configs.NamespaceType]string),
|
|
- ExternalDescriptors: externalDescriptors,
|
|
- }
|
|
- if pid > 0 {
|
|
- for _, ns := range c.config.Namespaces {
|
|
- state.NamespacePaths[ns.Type] = ns.GetPath(pid)
|
|
- }
|
|
- for _, nsType := range configs.NamespaceTypes() {
|
|
- if !configs.IsNamespaceSupported(nsType) {
|
|
- continue
|
|
- }
|
|
- if _, ok := state.NamespacePaths[nsType]; !ok {
|
|
- ns := configs.Namespace{Type: nsType}
|
|
- state.NamespacePaths[ns.Type] = ns.GetPath(pid)
|
|
- }
|
|
- }
|
|
- }
|
|
- return state, nil
|
|
-}
|
|
-
|
|
-// orderNamespacePaths sorts namespace paths into a list of paths that we
|
|
-// can setns in order.
|
|
-func (c *linuxContainer) orderNamespacePaths(namespaces map[configs.NamespaceType]string) ([]string, error) {
|
|
- paths := []string{}
|
|
-
|
|
- for _, ns := range configs.NamespaceTypes() {
|
|
-
|
|
- // Remove namespaces that we don't need to join.
|
|
- if !c.config.Namespaces.Contains(ns) {
|
|
- continue
|
|
- }
|
|
-
|
|
- if p, ok := namespaces[ns]; ok && p != "" {
|
|
- // check if the requested namespace is supported
|
|
- if !configs.IsNamespaceSupported(ns) {
|
|
- return nil, newSystemError(fmt.Errorf("namespace %s is not supported", ns))
|
|
- }
|
|
- // only set to join this namespace if it exists
|
|
- if _, err := os.Lstat(p); err != nil {
|
|
- return nil, newSystemErrorWithCausef(err, "running lstat on namespace path %q", p)
|
|
- }
|
|
- // do not allow namespace path with comma as we use it to separate
|
|
- // the namespace paths
|
|
- if strings.ContainsRune(p, ',') {
|
|
- return nil, newSystemError(fmt.Errorf("invalid path %s", p))
|
|
- }
|
|
- paths = append(paths, fmt.Sprintf("%s:%s", configs.NsName(ns), p))
|
|
- }
|
|
-
|
|
- }
|
|
-
|
|
- return paths, nil
|
|
-}
|
|
-
|
|
-func encodeIDMapping(idMap []configs.IDMap) ([]byte, error) {
|
|
- data := bytes.NewBuffer(nil)
|
|
- for _, im := range idMap {
|
|
- line := fmt.Sprintf("%d %d %d\n", im.ContainerID, im.HostID, im.Size)
|
|
- if _, err := data.WriteString(line); err != nil {
|
|
- return nil, err
|
|
- }
|
|
- }
|
|
- return data.Bytes(), nil
|
|
-}
|
|
-
|
|
-// bootstrapData encodes the necessary data in netlink binary format
|
|
-// as a io.Reader.
|
|
-// Consumer can write the data to a bootstrap program
|
|
-// such as one that uses nsenter package to bootstrap the container's
|
|
-// init process correctly, i.e. with correct namespaces, uid/gid
|
|
-// mapping etc.
|
|
-func (c *linuxContainer) bootstrapData(cloneFlags uintptr, nsMaps map[configs.NamespaceType]string) (io.Reader, error) {
|
|
- // create the netlink message
|
|
- r := nl.NewNetlinkRequest(int(InitMsg), 0)
|
|
-
|
|
- // write cloneFlags
|
|
- r.AddData(&Int32msg{
|
|
- Type: CloneFlagsAttr,
|
|
- Value: uint32(cloneFlags),
|
|
- })
|
|
-
|
|
- // write custom namespace paths
|
|
- if len(nsMaps) > 0 {
|
|
- nsPaths, err := c.orderNamespacePaths(nsMaps)
|
|
- if err != nil {
|
|
- return nil, err
|
|
- }
|
|
- r.AddData(&Bytemsg{
|
|
- Type: NsPathsAttr,
|
|
- Value: []byte(strings.Join(nsPaths, ",")),
|
|
- })
|
|
- }
|
|
-
|
|
- // write namespace paths only when we are not joining an existing user ns
|
|
- _, joinExistingUser := nsMaps[configs.NEWUSER]
|
|
- if !joinExistingUser {
|
|
- // write uid mappings
|
|
- if len(c.config.UidMappings) > 0 {
|
|
- b, err := encodeIDMapping(c.config.UidMappings)
|
|
- if err != nil {
|
|
- return nil, err
|
|
- }
|
|
- r.AddData(&Bytemsg{
|
|
- Type: UidmapAttr,
|
|
- Value: b,
|
|
- })
|
|
- }
|
|
-
|
|
- // write gid mappings
|
|
- if len(c.config.GidMappings) > 0 {
|
|
- b, err := encodeIDMapping(c.config.GidMappings)
|
|
- if err != nil {
|
|
- return nil, err
|
|
- }
|
|
- r.AddData(&Bytemsg{
|
|
- Type: GidmapAttr,
|
|
- Value: b,
|
|
- })
|
|
- // The following only applies if we are root.
|
|
- if !c.config.Rootless {
|
|
- // check if we have CAP_SETGID to setgroup properly
|
|
- pid, err := capability.NewPid(os.Getpid())
|
|
- if err != nil {
|
|
- return nil, err
|
|
- }
|
|
- if !pid.Get(capability.EFFECTIVE, capability.CAP_SETGID) {
|
|
- r.AddData(&Boolmsg{
|
|
- Type: SetgroupAttr,
|
|
- Value: true,
|
|
- })
|
|
- }
|
|
- }
|
|
- }
|
|
- }
|
|
-
|
|
- // write oom_score_adj
|
|
- r.AddData(&Bytemsg{
|
|
- Type: OomScoreAdjAttr,
|
|
- Value: []byte(fmt.Sprintf("%d", c.config.OomScoreAdj)),
|
|
- })
|
|
-
|
|
- // write rootless
|
|
- r.AddData(&Boolmsg{
|
|
- Type: RootlessAttr,
|
|
- Value: c.config.Rootless,
|
|
- })
|
|
-
|
|
- return bytes.NewReader(r.Serialize()), nil
|
|
-}
|
|
diff --git a/libcontainer/factory_linux.go b/libcontainer/factory_linux.go
|
|
index 0b2aa74..15ba017 100644
|
|
--- a/libcontainer/factory_linux.go
|
|
+++ b/libcontainer/factory_linux.go
|
|
@@ -1,3 +1,4 @@
|
|
+//go:build linux
|
|
// +build linux
|
|
|
|
package libcontainer
|
|
@@ -245,10 +246,10 @@ func (l *LinuxFactory) Type() string {
|
|
// This is a low level implementation detail of the reexec and should not be consumed externally
|
|
func (l *LinuxFactory) StartInitialization() (err error) {
|
|
var (
|
|
- pipefd, rootfd int
|
|
+ pipefd, fifofd int
|
|
consoleSocket *os.File
|
|
envInitPipe = os.Getenv("_LIBCONTAINER_INITPIPE")
|
|
- envStateDir = os.Getenv("_LIBCONTAINER_STATEDIR")
|
|
+ envFifoFd = os.Getenv("_LIBCONTAINER_FIFOFD")
|
|
envConsole = os.Getenv("_LIBCONTAINER_CONSOLE")
|
|
)
|
|
|
|
@@ -264,11 +265,11 @@ func (l *LinuxFactory) StartInitialization() (err error) {
|
|
)
|
|
defer pipe.Close()
|
|
|
|
- // Only init processes have STATEDIR.
|
|
- rootfd = -1
|
|
+ // Only init processes have FIFOFD.
|
|
+ fifofd = -1
|
|
if it == initStandard {
|
|
- if rootfd, err = strconv.Atoi(envStateDir); err != nil {
|
|
- return fmt.Errorf("unable to convert _LIBCONTAINER_STATEDIR=%s to int: %s", envStateDir, err)
|
|
+ if fifofd, err = strconv.Atoi(envFifoFd); err != nil {
|
|
+ return fmt.Errorf("unable to convert _LIBCONTAINER_FIFOFD=%s to int: %s", envFifoFd, err)
|
|
}
|
|
}
|
|
|
|
@@ -309,7 +310,7 @@ func (l *LinuxFactory) StartInitialization() (err error) {
|
|
}
|
|
}()
|
|
|
|
- i, err := newContainerInit(it, pipe, consoleSocket, rootfd, logPipeFd)
|
|
+ i, err := newContainerInit(it, pipe, consoleSocket, fifofd, logPipeFd)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
diff --git a/libcontainer/init_linux.go b/libcontainer/init_linux.go
|
|
index e9a83e9..fd417ca 100644
|
|
--- a/libcontainer/init_linux.go
|
|
+++ b/libcontainer/init_linux.go
|
|
@@ -1,18 +1,23 @@
|
|
+//go:build linux
|
|
// +build linux
|
|
|
|
package libcontainer
|
|
|
|
import (
|
|
"encoding/json"
|
|
+ "errors"
|
|
"fmt"
|
|
"io"
|
|
"net"
|
|
"os"
|
|
+ "path/filepath"
|
|
"strings"
|
|
"syscall"
|
|
"unsafe"
|
|
|
|
"github.com/Sirupsen/logrus"
|
|
+ "golang.org/x/sys/unix"
|
|
+
|
|
"github.com/opencontainers/runc/libcontainer/cgroups"
|
|
"github.com/opencontainers/runc/libcontainer/configs"
|
|
"github.com/opencontainers/runc/libcontainer/system"
|
|
@@ -66,7 +71,7 @@ type initer interface {
|
|
Init() error
|
|
}
|
|
|
|
-func newContainerInit(t initType, pipe *os.File, consoleSocket *os.File, stateDirFD, logFd int) (initer, error) {
|
|
+func newContainerInit(t initType, pipe *os.File, consoleSocket *os.File, fifoFd, logFd int) (initer, error) {
|
|
var config *initConfig
|
|
if err := json.NewDecoder(pipe).Decode(&config); err != nil {
|
|
return nil, err
|
|
@@ -89,7 +94,7 @@ func newContainerInit(t initType, pipe *os.File, consoleSocket *os.File, stateDi
|
|
consoleSocket: consoleSocket,
|
|
parentPid: syscall.Getppid(),
|
|
config: config,
|
|
- stateDirFD: stateDirFD,
|
|
+ fifoFd: fifoFd,
|
|
logFd: logFd,
|
|
}, nil
|
|
}
|
|
@@ -111,6 +116,32 @@ func populateProcessEnvironment(env []string) error {
|
|
return nil
|
|
}
|
|
|
|
+// verifyCwd ensures that the current directory is actually inside the mount
|
|
+// namespace root of the current process.
|
|
+func verifyCwd() error {
|
|
+ // getcwd(2) on Linux detects if cwd is outside of the rootfs of the
|
|
+ // current mount namespace root, and in that case prefixes "(unreachable)"
|
|
+ // to the returned string. glibc's getcwd(3) and Go's Getwd() both detect
|
|
+ // when this happens and return ENOENT rather than returning a non-absolute
|
|
+ // path. In both cases we can therefore easily detect if we have an invalid
|
|
+ // cwd by checking the return value of getcwd(3). See getcwd(3) for more
|
|
+ // details, and CVE-2024-21626 for the security issue that motivated this
|
|
+ // check.
|
|
+ //
|
|
+ // We have to use unix.Getwd() here because os.Getwd() has a workaround for
|
|
+ // $PWD which involves doing stat(.), which can fail if the current
|
|
+ // directory is inaccessible to the container process.
|
|
+ if wd, err := unix.Getwd(); errors.Is(err, unix.ENOENT) {
|
|
+ return errors.New("current working directory is outside of container mount namespace root -- possible container breakout detected")
|
|
+ } else if err != nil {
|
|
+ return fmt.Errorf("failed to verify if current working directory is safe: %w", err)
|
|
+ } else if !filepath.IsAbs(wd) {
|
|
+ // We shouldn't ever hit this, but check just in case.
|
|
+ return fmt.Errorf("current working directory is not absolute -- possible container breakout detected: cwd is %q", wd)
|
|
+ }
|
|
+ return nil
|
|
+}
|
|
+
|
|
// finalizeNamespace drops the caps, sets the correct user
|
|
// and working dir, and closes any leaked file descriptors
|
|
// before executing the command inside the namespace
|
|
@@ -148,6 +179,10 @@ func finalizeNamespace(config *initConfig) error {
|
|
if err := setupUser(config); err != nil {
|
|
return err
|
|
}
|
|
+ // Make sure our final working directory is inside the container.
|
|
+ if err := verifyCwd(); err != nil {
|
|
+ return err
|
|
+ }
|
|
if err := system.ClearKeepCaps(); err != nil {
|
|
return err
|
|
}
|
|
diff --git a/libcontainer/process_linux.go b/libcontainer/process_linux.go
|
|
index 5cdc30c..e786419 100644
|
|
--- a/libcontainer/process_linux.go
|
|
+++ b/libcontainer/process_linux.go
|
|
@@ -1,3 +1,4 @@
|
|
+//go:build linux
|
|
// +build linux
|
|
|
|
package libcontainer
|
|
@@ -204,7 +205,6 @@ type initProcess struct {
|
|
process *Process
|
|
bootstrapData io.Reader
|
|
sharePidns bool
|
|
- rootDir *os.File
|
|
}
|
|
|
|
func (p *initProcess) pid() int {
|
|
@@ -257,7 +257,6 @@ func (p *initProcess) start() error {
|
|
err := p.cmd.Start()
|
|
p.process.ops = p
|
|
p.childPipe.Close()
|
|
- p.rootDir.Close()
|
|
logs.CloseChild()
|
|
if err != nil {
|
|
p.process.ops = nil
|
|
diff --git a/libcontainer/setns_init_linux.go b/libcontainer/setns_init_linux.go
|
|
index 1f7ec98..e38165d 100644
|
|
--- a/libcontainer/setns_init_linux.go
|
|
+++ b/libcontainer/setns_init_linux.go
|
|
@@ -1,3 +1,4 @@
|
|
+//go:build linux
|
|
// +build linux
|
|
|
|
package libcontainer
|
|
@@ -73,5 +74,23 @@ func (l *linuxSetnsInit) Init() error {
|
|
syscall.Close(l.logFd)
|
|
}
|
|
|
|
+ // Close all file descriptors we are not passing to the container. This is
|
|
+ // necessary because the execve target could use internal runc fds as the
|
|
+ // execve path, potentially giving access to binary files from the host
|
|
+ // (which can then be opened by container processes, leading to container
|
|
+ // escapes). Note that because this operation will close any open file
|
|
+ // descriptors that are referenced by (*os.File) handles from underneath
|
|
+ // the Go runtime, we must not do any file operations after this point
|
|
+ // (otherwise the (*os.File) finaliser could close the wrong file). See
|
|
+ // CVE-2024-21626 for more information as to why this protection is
|
|
+ // necessary.
|
|
+ //
|
|
+ // This is not needed for runc-dmz, because the extra execve(2) step means
|
|
+ // that all O_CLOEXEC file descriptors have already been closed and thus
|
|
+ // the second execve(2) from runc-dmz cannot access internal file
|
|
+ // descriptors from runc.
|
|
+ if err := utils.UnsafeCloseFrom(l.config.PassedFilesCount + 3); err != nil {
|
|
+ return err
|
|
+ }
|
|
return system.Execv(l.config.Args[0], l.config.Args[0:], os.Environ())
|
|
}
|
|
diff --git a/libcontainer/standard_init_linux.go b/libcontainer/standard_init_linux.go
|
|
index 6236593..7ebf1a2 100644
|
|
--- a/libcontainer/standard_init_linux.go
|
|
+++ b/libcontainer/standard_init_linux.go
|
|
@@ -1,3 +1,4 @@
|
|
+//go:build linux
|
|
// +build linux
|
|
|
|
package libcontainer
|
|
@@ -15,14 +16,17 @@ import (
|
|
"github.com/opencontainers/runc/libcontainer/keys"
|
|
"github.com/opencontainers/runc/libcontainer/seccomp"
|
|
"github.com/opencontainers/runc/libcontainer/system"
|
|
+ "github.com/opencontainers/runc/libcontainer/utils"
|
|
"github.com/opencontainers/selinux/go-selinux/label"
|
|
+
|
|
+ "golang.org/x/sys/unix"
|
|
)
|
|
|
|
type linuxStandardInit struct {
|
|
pipe *os.File
|
|
consoleSocket *os.File
|
|
parentPid int
|
|
- stateDirFD int
|
|
+ fifoFd int
|
|
config *initConfig
|
|
logFd int
|
|
}
|
|
@@ -187,7 +191,7 @@ func (l *linuxStandardInit) Init() error {
|
|
// exec'ing the users process.
|
|
ch := make(chan Error, 1)
|
|
go func() {
|
|
- fd, err := syscall.Openat(l.stateDirFD, execFifoFilename, os.O_WRONLY|syscall.O_CLOEXEC, 0)
|
|
+ fd, err := unix.Open(fmt.Sprintf("/proc/self/fd/%d", l.fifoFd), unix.O_WRONLY|unix.O_CLOEXEC, 0)
|
|
if err != nil {
|
|
ch <- newSystemErrorWithCause(err, "openat exec fifo")
|
|
return
|
|
@@ -215,7 +219,25 @@ func (l *linuxStandardInit) Init() error {
|
|
}
|
|
// close the statedir fd before exec because the kernel resets dumpable in the wrong order
|
|
// https://github.com/torvalds/linux/blob/v4.9/fs/exec.c#L1290-L1318
|
|
- syscall.Close(l.stateDirFD)
|
|
+ unix.Close(l.fifoFd)
|
|
+ // Close all file descriptors we are not passing to the container. This is
|
|
+ // necessary because the execve target could use internal runc fds as the
|
|
+ // execve path, potentially giving access to binary files from the host
|
|
+ // (which can then be opened by container processes, leading to container
|
|
+ // escapes). Note that because this operation will close any open file
|
|
+ // descriptors that are referenced by (*os.File) handles from underneath
|
|
+ // the Go runtime, we must not do any file operations after this point
|
|
+ // (otherwise the (*os.File) finaliser could close the wrong file). See
|
|
+ // CVE-2024-21626 for more information as to why this protection is
|
|
+ // necessary.
|
|
+ //
|
|
+ // This is not needed for runc-dmz, because the extra execve(2) step means
|
|
+ // that all O_CLOEXEC file descriptors have already been closed and thus
|
|
+ // the second execve(2) from runc-dmz cannot access internal file
|
|
+ // descriptors from runc.
|
|
+ if err := utils.UnsafeCloseFrom(l.config.PassedFilesCount + 3); err != nil {
|
|
+ return err
|
|
+ }
|
|
if err := syscall.Exec(name, l.config.Args[0:], os.Environ()); err != nil {
|
|
return newSystemErrorWithCause(err, "exec user process")
|
|
}
|
|
diff --git a/libcontainer/standard_init_linux.go.orig b/libcontainer/standard_init_linux.go.orig
|
|
deleted file mode 100644
|
|
index 611b91d..0000000
|
|
--- a/libcontainer/standard_init_linux.go.orig
|
|
+++ /dev/null
|
|
@@ -1,223 +0,0 @@
|
|
-// +build linux
|
|
-
|
|
-package libcontainer
|
|
-
|
|
-import (
|
|
- "fmt"
|
|
- "os"
|
|
- "os/exec"
|
|
- "strings"
|
|
- "syscall"
|
|
- "time"
|
|
-
|
|
- "github.com/opencontainers/runc/libcontainer/apparmor"
|
|
- "github.com/opencontainers/runc/libcontainer/configs"
|
|
- "github.com/opencontainers/runc/libcontainer/keys"
|
|
- "github.com/opencontainers/runc/libcontainer/seccomp"
|
|
- "github.com/opencontainers/runc/libcontainer/system"
|
|
- "github.com/opencontainers/selinux/go-selinux/label"
|
|
-)
|
|
-
|
|
-type linuxStandardInit struct {
|
|
- pipe *os.File
|
|
- consoleSocket *os.File
|
|
- parentPid int
|
|
- stateDirFD int
|
|
- config *initConfig
|
|
- logFd int
|
|
-}
|
|
-
|
|
-func (l *linuxStandardInit) getSessionRingParams() (string, uint32, uint32) {
|
|
- var newperms uint32
|
|
-
|
|
- if l.config.Config.Namespaces.Contains(configs.NEWUSER) {
|
|
- // with user ns we need 'other' search permissions
|
|
- newperms = 0x8
|
|
- } else {
|
|
- // without user ns we need 'UID' search permissions
|
|
- newperms = 0x80000
|
|
- }
|
|
-
|
|
- // create a unique per session container name that we can
|
|
- // join in setns; however, other containers can also join it
|
|
- return fmt.Sprintf("_ses.%s", l.config.ContainerId), 0xffffffff, newperms
|
|
-}
|
|
-
|
|
-// PR_SET_NO_NEW_PRIVS isn't exposed in Golang so we define it ourselves copying the value
|
|
-// the kernel
|
|
-const PR_SET_NO_NEW_PRIVS = 0x26
|
|
-
|
|
-func (l *linuxStandardInit) Init() error {
|
|
- if !l.config.Config.NoNewKeyring {
|
|
- ringname, keepperms, newperms := l.getSessionRingParams()
|
|
-
|
|
- // do not inherit the parent's session keyring
|
|
- sessKeyId, err := keys.JoinSessionKeyring(ringname)
|
|
- if err != nil {
|
|
- return err
|
|
- }
|
|
- // make session keyring searcheable
|
|
- if err := keys.ModKeyringPerm(sessKeyId, keepperms, newperms); err != nil {
|
|
- return err
|
|
- }
|
|
- }
|
|
-
|
|
- if err := setupNetwork(l.config); err != nil {
|
|
- return err
|
|
- }
|
|
- if err := setupRoute(l.config.Config); err != nil {
|
|
- return err
|
|
- }
|
|
-
|
|
- label.Init()
|
|
-
|
|
- // prepareRootfs() can be executed only for a new mount namespace.
|
|
- if l.config.Config.Namespaces.Contains(configs.NEWNS) {
|
|
- if err := prepareRootfs(l.pipe, l.config.Config); err != nil {
|
|
- return err
|
|
- }
|
|
- }
|
|
-
|
|
- // Set up the console. This has to be done *before* we finalize the rootfs,
|
|
- // but *after* we've given the user the chance to set up all of the mounts
|
|
- // they wanted.
|
|
- if l.config.CreateConsole {
|
|
- if err := setupConsole(l.consoleSocket, l.config, true); err != nil {
|
|
- return err
|
|
- }
|
|
- if err := system.Setctty(); err != nil {
|
|
- return err
|
|
- }
|
|
- }
|
|
-
|
|
- // Finish the rootfs setup.
|
|
- if l.config.Config.Namespaces.Contains(configs.NEWNS) {
|
|
- if err := finalizeRootfs(l.config.Config); err != nil {
|
|
- return err
|
|
- }
|
|
- }
|
|
-
|
|
- if hostname := l.config.Config.Hostname; hostname != "" {
|
|
- if err := syscall.Sethostname([]byte(hostname)); err != nil {
|
|
- return err
|
|
- }
|
|
- }
|
|
- if err := apparmor.ApplyProfile(l.config.AppArmorProfile); err != nil {
|
|
- return err
|
|
- }
|
|
- if err := label.SetProcessLabel(l.config.ProcessLabel); err != nil {
|
|
- return err
|
|
- }
|
|
- // when userns enabled, write to sysctl will fail, let docker-hooks do this job
|
|
- if len(l.config.Config.UidMappings) == 0 && len(l.config.Config.GidMappings) == 0 {
|
|
- for key, value := range l.config.Config.Sysctl {
|
|
- if err := writeSystemProperty(key, value); err != nil {
|
|
- return err
|
|
- }
|
|
- }
|
|
- }
|
|
- for _, path := range l.config.Config.ReadonlyPaths {
|
|
- if err := readonlyPath(path); err != nil {
|
|
- return err
|
|
- }
|
|
- }
|
|
- for _, m := range l.config.Config.Mounts {
|
|
- if m.Flags&syscall.MS_RDONLY == 0 && m.Device == "proc" && strings.HasPrefix(m.Destination, "/proc/sys/") {
|
|
- if err := remountReadWrite(m.Destination); err != nil {
|
|
- return err
|
|
- }
|
|
- }
|
|
- }
|
|
- for _, path := range l.config.Config.MaskPaths {
|
|
- if err := maskPath(path); err != nil {
|
|
- return err
|
|
- }
|
|
- }
|
|
- pdeath, err := system.GetParentDeathSignal()
|
|
- if err != nil {
|
|
- return err
|
|
- }
|
|
- if l.config.NoNewPrivileges {
|
|
- if err := system.Prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); err != nil {
|
|
- return err
|
|
- }
|
|
- }
|
|
- // Tell our parent that we're ready to Execv. This must be done before the
|
|
- // Seccomp rules have been applied, because we need to be able to read and
|
|
- // write to a socket.
|
|
- if err := syncParentReady(l.pipe); err != nil {
|
|
- return err
|
|
- }
|
|
- // Without NoNewPrivileges seccomp is a privileged operation, so we need to
|
|
- // do this before dropping capabilities; otherwise do it as late as possible
|
|
- // just before execve so as few syscalls take place after it as possible.
|
|
- if l.config.Config.Seccomp != nil && !l.config.NoNewPrivileges {
|
|
- if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil {
|
|
- return err
|
|
- }
|
|
- }
|
|
- if err := finalizeNamespace(l.config); err != nil {
|
|
- return err
|
|
- }
|
|
- // finalizeNamespace can change user/group which clears the parent death
|
|
- // signal, so we restore it here.
|
|
- if err := pdeath.Restore(); err != nil {
|
|
- return err
|
|
- }
|
|
- // compare the parent from the initial start of the init process and make sure that it did not change.
|
|
- // if the parent changes that means it died and we were reparented to something else so we should
|
|
- // just kill ourself and not cause problems for someone else.
|
|
- if syscall.Getppid() != l.parentPid {
|
|
- return syscall.Kill(syscall.Getpid(), syscall.SIGKILL)
|
|
- }
|
|
- // check for the arg before waiting to make sure it exists and it is returned
|
|
- // as a create time error.
|
|
- name, err := exec.LookPath(l.config.Args[0])
|
|
- if err != nil {
|
|
- return err
|
|
- }
|
|
- // close the pipe to signal that we have completed our init.
|
|
- l.pipe.Close()
|
|
-
|
|
- if l.logFd != 0 {
|
|
- syscall.Close(l.logFd)
|
|
- }
|
|
-
|
|
- // wait for the fifo to be opened on the other side before
|
|
- // exec'ing the users process.
|
|
- ch := make(chan Error, 1)
|
|
- go func() {
|
|
- fd, err := syscall.Openat(l.stateDirFD, execFifoFilename, os.O_WRONLY|syscall.O_CLOEXEC, 0)
|
|
- if err != nil {
|
|
- ch <- newSystemErrorWithCause(err, "openat exec fifo")
|
|
- return
|
|
- }
|
|
- if _, err := syscall.Write(fd, []byte("0")); err != nil {
|
|
- ch <- newSystemErrorWithCause(err, "write 0 exec fifo")
|
|
- return
|
|
- }
|
|
- ch <- nil
|
|
- }()
|
|
-
|
|
- select {
|
|
- case chErr := <-ch:
|
|
- if chErr != nil {
|
|
- return chErr
|
|
- }
|
|
- case <-time.After(120 * time.Second):
|
|
- return newSystemErrorWithCause(fmt.Errorf("timeout"), "wait for the fifo to be opened on the other side ")
|
|
- }
|
|
-
|
|
- if l.config.Config.Seccomp != nil && l.config.NoNewPrivileges {
|
|
- if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil {
|
|
- return newSystemErrorWithCause(err, "init seccomp")
|
|
- }
|
|
- }
|
|
- // close the statedir fd before exec because the kernel resets dumpable in the wrong order
|
|
- // https://github.com/torvalds/linux/blob/v4.9/fs/exec.c#L1290-L1318
|
|
- syscall.Close(l.stateDirFD)
|
|
- if err := syscall.Exec(name, l.config.Args[0:], os.Environ()); err != nil {
|
|
- return newSystemErrorWithCause(err, "exec user process")
|
|
- }
|
|
- return nil
|
|
-}
|
|
diff --git a/libcontainer/utils/utils.go b/libcontainer/utils/utils.go
|
|
index cd04ace..922cffb 100644
|
|
--- a/libcontainer/utils/utils.go
|
|
+++ b/libcontainer/utils/utils.go
|
|
@@ -5,17 +5,12 @@ import (
|
|
"encoding/binary"
|
|
"encoding/hex"
|
|
"encoding/json"
|
|
- "fmt"
|
|
"io"
|
|
"os"
|
|
"path/filepath"
|
|
- "strconv"
|
|
"strings"
|
|
"syscall"
|
|
"unsafe"
|
|
-
|
|
- securejoin "github.com/cyphar/filepath-securejoin"
|
|
- "golang.org/x/sys/unix"
|
|
)
|
|
|
|
const (
|
|
@@ -175,36 +170,3 @@ func stripRoot(root, path string) string {
|
|
}
|
|
return CleanPath("/" + path)
|
|
}
|
|
-
|
|
-// WithProcfd runs the passed closure with a procfd path (/proc/self/fd/...)
|
|
-// corresponding to the unsafePath resolved within the root. Before passing the
|
|
-// fd, this path is verified to have been inside the root -- so operating on it
|
|
-// through the passed fdpath should be safe. Do not access this path through
|
|
-// the original path strings, and do not attempt to use the pathname outside of
|
|
-// the passed closure (the file handle will be freed once the closure returns).
|
|
-func WithProcfd(root, unsafePath string, fn func(procfd string) error) error {
|
|
- // Remove the root then forcefully resolve inside the root.
|
|
- unsafePath = stripRoot(root, unsafePath)
|
|
- path, err := securejoin.SecureJoin(root, unsafePath)
|
|
- if err != nil {
|
|
- return fmt.Errorf("resolving path inside rootfs failed: %v", err)
|
|
- }
|
|
-
|
|
- // Open the target path.
|
|
- fh, err := os.OpenFile(path, unix.O_PATH|unix.O_CLOEXEC, 0)
|
|
- if err != nil {
|
|
- return fmt.Errorf("open o_path procfd: %w", err)
|
|
- }
|
|
- defer fh.Close()
|
|
-
|
|
- // Double-check the path is the one we expected.
|
|
- procfd := "/proc/self/fd/" + strconv.Itoa(int(fh.Fd()))
|
|
- if realpath, err := os.Readlink(procfd); err != nil {
|
|
- return fmt.Errorf("procfd verification failed: %w", err)
|
|
- } else if realpath != path {
|
|
- return fmt.Errorf("possibly malicious path detected -- refusing to operate on %s", realpath)
|
|
- }
|
|
-
|
|
- // Run the closure.
|
|
- return fn(procfd)
|
|
-}
|
|
diff --git a/libcontainer/utils/utils_unix.go b/libcontainer/utils/utils_unix.go
|
|
index 7b798cc..cfacfc2 100644
|
|
--- a/libcontainer/utils/utils_unix.go
|
|
+++ b/libcontainer/utils/utils_unix.go
|
|
@@ -1,43 +1,264 @@
|
|
+///go:build !windows
|
|
+//go:build !windows
|
|
// +build !windows
|
|
|
|
package utils
|
|
|
|
import (
|
|
- "io/ioutil"
|
|
+ "fmt"
|
|
+ "math"
|
|
"os"
|
|
+ "path/filepath"
|
|
+ "runtime"
|
|
"strconv"
|
|
- "syscall"
|
|
+ "sync"
|
|
+ _ "unsafe" // for go:linkname
|
|
+
|
|
+ securejoin "github.com/cyphar/filepath-securejoin"
|
|
+ "github.com/Sirupsen/logrus"
|
|
+ "golang.org/x/sys/unix"
|
|
)
|
|
|
|
-func CloseExecFrom(minFd int) error {
|
|
- fdList, err := ioutil.ReadDir("/proc/self/fd")
|
|
+// EnsureProcHandle returns nil if the given file handle is on procfs, and an error otherwise.
|
|
+func EnsureProcHandle(fh *os.File) error {
|
|
+ var buf unix.Statfs_t
|
|
+ if err := unix.Fstatfs(int(fh.Fd()), &buf); err != nil {
|
|
+ return fmt.Errorf("ensure %s is on procfs: %w", fh.Name(), err)
|
|
+ }
|
|
+ if buf.Type != unix.PROC_SUPER_MAGIC {
|
|
+ return fmt.Errorf("%s is not on procfs", fh.Name())
|
|
+ }
|
|
+ return nil
|
|
+}
|
|
+
|
|
+var (
|
|
+ haveCloseRangeCloexecBool bool
|
|
+ haveCloseRangeCloexecOnce sync.Once
|
|
+)
|
|
+
|
|
+func haveCloseRangeCloexec() bool {
|
|
+ haveCloseRangeCloexecOnce.Do(func() {
|
|
+ // Make sure we're not closing a random file descriptor.
|
|
+ tmpFd, err := unix.FcntlInt(0, unix.F_DUPFD_CLOEXEC, 0)
|
|
+ if err != nil {
|
|
+ return
|
|
+ }
|
|
+ defer unix.Close(tmpFd)
|
|
+
|
|
+ err = unix.CloseRange(uint(tmpFd), uint(tmpFd), unix.CLOSE_RANGE_CLOEXEC)
|
|
+ // Any error means we cannot use close_range(CLOSE_RANGE_CLOEXEC).
|
|
+ // -ENOSYS and -EINVAL ultimately mean we don't have support, but any
|
|
+ // other potential error would imply that even the most basic close
|
|
+ // operation wouldn't work.
|
|
+ haveCloseRangeCloexecBool = err == nil
|
|
+ })
|
|
+ return haveCloseRangeCloexecBool
|
|
+}
|
|
+
|
|
+type fdFunc func(fd int)
|
|
+
|
|
+// fdRangeFrom calls the passed fdFunc for each file descriptor that is open in
|
|
+// the current process and is greater than or equal to minFd.
|
|
+func fdRangeFrom(minFd int, fn fdFunc) error {
|
|
+ procSelfFd, closer := ProcThreadSelf("fd")
|
|
+ defer closer()
|
|
+
|
|
+ fdDir, err := os.Open(procSelfFd)
|
|
+ if err != nil {
|
|
+ return err
|
|
+ }
|
|
+ defer fdDir.Close()
|
|
+
|
|
+ if err := EnsureProcHandle(fdDir); err != nil {
|
|
+ return err
|
|
+ }
|
|
+
|
|
+ fdList, err := fdDir.Readdirnames(-1)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
- for _, fi := range fdList {
|
|
- fd, err := strconv.Atoi(fi.Name())
|
|
+ for _, fdStr := range fdList {
|
|
+ fd, err := strconv.Atoi(fdStr)
|
|
+ // Ignore non-numeric file names.
|
|
if err != nil {
|
|
- // ignore non-numeric file names
|
|
continue
|
|
}
|
|
-
|
|
+ // Ignore descriptors lower than our specified minimum.
|
|
if fd < minFd {
|
|
- // ignore descriptors lower than our specified minimum
|
|
continue
|
|
}
|
|
-
|
|
- // intentionally ignore errors from syscall.CloseOnExec
|
|
- syscall.CloseOnExec(fd)
|
|
- // the cases where this might fail are basically file descriptors that have already been closed (including and especially the one that was created when ioutil.ReadDir did the "opendir" syscall)
|
|
+ // Ignore the file descriptor we used for readdir, as it will be closed
|
|
+ // when we return.
|
|
+ if uintptr(fd) == fdDir.Fd() {
|
|
+ continue
|
|
+ }
|
|
+ // Run the closure.
|
|
+ fn(fd)
|
|
}
|
|
return nil
|
|
}
|
|
|
|
-// NewSockPair returns a new unix socket pair
|
|
-func NewSockPair(name string) (parent *os.File, child *os.File, err error) {
|
|
- fds, err := syscall.Socketpair(syscall.AF_LOCAL, syscall.SOCK_STREAM|syscall.SOCK_CLOEXEC, 0)
|
|
+// CloseExecFrom sets the O_CLOEXEC flag on all file descriptors greater or
|
|
+// equal to minFd in the current process.
|
|
+func CloseExecFrom(minFd int) error {
|
|
+ // Use close_range(CLOSE_RANGE_CLOEXEC) if possible.
|
|
+ if haveCloseRangeCloexec() {
|
|
+ err := unix.CloseRange(uint(minFd), math.MaxUint64, unix.CLOSE_RANGE_CLOEXEC)
|
|
+ return os.NewSyscallError("close_range", err)
|
|
+ }
|
|
+ // Otherwise, fall back to the standard loop.
|
|
+ return fdRangeFrom(minFd, unix.CloseOnExec)
|
|
+}
|
|
+
|
|
+//go:linkname runtime_IsPollDescriptor internal/poll.IsPollDescriptor
|
|
+
|
|
+// In order to make sure we do not close the internal epoll descriptors the Go
|
|
+// runtime uses, we need to ensure that we skip descriptors that match
|
|
+// "internal/poll".IsPollDescriptor. Yes, this is a Go runtime internal thing,
|
|
+// unfortunately there's no other way to be sure we're only keeping the file
|
|
+// descriptors the Go runtime needs. Hopefully nothing blows up doing this...
|
|
+func runtime_IsPollDescriptor(fd uintptr) bool //nolint:revive
|
|
+
|
|
+// UnsafeCloseFrom closes all file descriptors greater or equal to minFd in the
|
|
+// current process, except for those critical to Go's runtime (such as the
|
|
+// netpoll management descriptors).
|
|
+//
|
|
+// NOTE: This function is incredibly dangerous to use in most Go code, as
|
|
+// closing file descriptors from underneath *os.File handles can lead to very
|
|
+// bad behaviour (the closed file descriptor can be re-used and then any
|
|
+// *os.File operations would apply to the wrong file). This function is only
|
|
+// intended to be called from the last stage of runc init.
|
|
+func UnsafeCloseFrom(minFd int) error {
|
|
+ // We cannot use close_range(2) even if it is available, because we must
|
|
+ // not close some file descriptors.
|
|
+ return fdRangeFrom(minFd, func(fd int) {
|
|
+ if runtime_IsPollDescriptor(uintptr(fd)) {
|
|
+ // These are the Go runtime's internal netpoll file descriptors.
|
|
+ // These file descriptors are operated on deep in the Go scheduler,
|
|
+ // and closing those files from underneath Go can result in panics.
|
|
+ // There is no issue with keeping them because they are not
|
|
+ // executable and are not useful to an attacker anyway. Also we
|
|
+ // don't have any choice.
|
|
+ return
|
|
+ }
|
|
+ // There's nothing we can do about errors from close(2), and the
|
|
+ // only likely error to be seen is EBADF which indicates the fd was
|
|
+ // already closed (in which case, we got what we wanted).
|
|
+ _ = unix.Close(fd)
|
|
+ })
|
|
+}
|
|
+
|
|
+// NewSockPair returns a new SOCK_STREAM unix socket pair.
|
|
+func NewSockPair(name string) (parent, child *os.File, err error) {
|
|
+ fds, err := unix.Socketpair(unix.AF_LOCAL, unix.SOCK_STREAM|unix.SOCK_CLOEXEC, 0)
|
|
if err != nil {
|
|
return nil, nil, err
|
|
}
|
|
return os.NewFile(uintptr(fds[1]), name+"-p"), os.NewFile(uintptr(fds[0]), name+"-c"), nil
|
|
}
|
|
+
|
|
+// WithProcfd runs the passed closure with a procfd path (/proc/self/fd/...)
|
|
+// corresponding to the unsafePath resolved within the root. Before passing the
|
|
+// fd, this path is verified to have been inside the root -- so operating on it
|
|
+// through the passed fdpath should be safe. Do not access this path through
|
|
+// the original path strings, and do not attempt to use the pathname outside of
|
|
+// the passed closure (the file handle will be freed once the closure returns).
|
|
+func WithProcfd(root, unsafePath string, fn func(procfd string) error) error {
|
|
+ // Remove the root then forcefully resolve inside the root.
|
|
+ unsafePath = stripRoot(root, unsafePath)
|
|
+ path, err := securejoin.SecureJoin(root, unsafePath)
|
|
+ if err != nil {
|
|
+ return fmt.Errorf("resolving path inside rootfs failed: %w", err)
|
|
+ }
|
|
+
|
|
+ procSelfFd, closer := ProcThreadSelf("fd/")
|
|
+ defer closer()
|
|
+
|
|
+ // Open the target path.
|
|
+ fh, err := os.OpenFile(path, unix.O_PATH|unix.O_CLOEXEC, 0)
|
|
+ if err != nil {
|
|
+ return fmt.Errorf("open o_path procfd: %w", err)
|
|
+ }
|
|
+ defer fh.Close()
|
|
+
|
|
+ procfd := filepath.Join(procSelfFd, strconv.Itoa(int(fh.Fd())))
|
|
+ // Double-check the path is the one we expected.
|
|
+ if realpath, err := os.Readlink(procfd); err != nil {
|
|
+ return fmt.Errorf("procfd verification failed: %w", err)
|
|
+ } else if realpath != path {
|
|
+ return fmt.Errorf("possibly malicious path detected -- refusing to operate on %s", realpath)
|
|
+ }
|
|
+
|
|
+ return fn(procfd)
|
|
+}
|
|
+
|
|
+type ProcThreadSelfCloser func()
|
|
+
|
|
+var (
|
|
+ haveProcThreadSelf bool
|
|
+ haveProcThreadSelfOnce sync.Once
|
|
+)
|
|
+
|
|
+// ProcThreadSelf returns a string that is equivalent to
|
|
+// /proc/thread-self/<subpath>, with a graceful fallback on older kernels where
|
|
+// /proc/thread-self doesn't exist. This method DOES NOT use SecureJoin,
|
|
+// meaning that the passed string needs to be trusted. The caller _must_ call
|
|
+// the returned ProcThreadSelfCloser function (which is runtime.UnlockOSThread)
|
|
+// *only once* after it has finished using the returned path string.
|
|
+func ProcThreadSelf(subpath string) (string, ProcThreadSelfCloser) {
|
|
+ haveProcThreadSelfOnce.Do(func() {
|
|
+ if _, err := os.Stat("/proc/thread-self/"); err == nil {
|
|
+ haveProcThreadSelf = true
|
|
+ } else {
|
|
+ logrus.Debugf("cannot stat /proc/thread-self (%v), falling back to /proc/self/task/<tid>", err)
|
|
+ }
|
|
+ })
|
|
+
|
|
+ // We need to lock our thread until the caller is done with the path string
|
|
+ // because any non-atomic operation on the path (such as opening a file,
|
|
+ // then reading it) could be interrupted by the Go runtime where the
|
|
+ // underlying thread is swapped out and the original thread is killed,
|
|
+ // resulting in pull-your-hair-out-hard-to-debug issues in the caller. In
|
|
+ // addition, the pre-3.17 fallback makes everything non-atomic because the
|
|
+ // same thing could happen between unix.Gettid() and the path operations.
|
|
+ //
|
|
+ // In theory, we don't need to lock in the atomic user case when using
|
|
+ // /proc/thread-self/, but it's better to be safe than sorry (and there are
|
|
+ // only one or two truly atomic users of /proc/thread-self/).
|
|
+ runtime.LockOSThread()
|
|
+
|
|
+ threadSelf := "/proc/thread-self/"
|
|
+ if !haveProcThreadSelf {
|
|
+ // Pre-3.17 kernels did not have /proc/thread-self, so do it manually.
|
|
+ threadSelf = "/proc/self/task/" + strconv.Itoa(unix.Gettid()) + "/"
|
|
+ if _, err := os.Stat(threadSelf); err != nil {
|
|
+ // Unfortunately, this code is called from rootfs_linux.go where we
|
|
+ // are running inside the pid namespace of the container but /proc
|
|
+ // is the host's procfs. Unfortunately there is no real way to get
|
|
+ // the correct tid to use here (the kernel age means we cannot do
|
|
+ // things like set up a private fsopen("proc") -- even scanning
|
|
+ // NSpid in all of the tasks in /proc/self/task/*/status requires
|
|
+ // Linux 4.1).
|
|
+ //
|
|
+ // So, we just have to assume that /proc/self is acceptable in this
|
|
+ // one specific case.
|
|
+ if os.Getpid() == 1 {
|
|
+ logrus.Debugf("/proc/thread-self (tid=%d) cannot be emulated inside the initial container setup -- using /proc/self instead: %v", unix.Gettid(), err)
|
|
+ } else {
|
|
+ // This should never happen, but the fallback should work in most cases...
|
|
+ logrus.Warnf("/proc/thread-self could not be emulated for pid=%d (tid=%d) -- using more buggy /proc/self fallback instead: %v", os.Getpid(), unix.Gettid(), err)
|
|
+ }
|
|
+ threadSelf = "/proc/self/"
|
|
+ }
|
|
+ }
|
|
+ return threadSelf + subpath, runtime.UnlockOSThread
|
|
+}
|
|
+
|
|
+// ProcThreadSelfFd is a small wrapper around ProcThreadSelf to make it easier to
|
|
+// create a /proc/thread-self handle for a given file descriptor.
|
|
+//
|
|
+// It is basically equivalent to ProcThreadSelf(fmt.Sprintf("fd/%d", fd)), but
|
|
+// without using fmt.Sprintf to avoid unneeded overhead.
|
|
+func ProcThreadSelfFd(fd uintptr) (string, ProcThreadSelfCloser) {
|
|
+ return ProcThreadSelf("fd/" + strconv.FormatUint(uint64(fd), 10))
|
|
+}
|
|
diff --git a/vendor/golang.org/x/sys/unix/flock.go b/vendor/golang.org/x/sys/unix/flock.go
|
|
index ce67a59..e8d1081 100644
|
|
--- a/vendor/golang.org/x/sys/unix/flock.go
|
|
+++ b/vendor/golang.org/x/sys/unix/flock.go
|
|
@@ -14,6 +14,11 @@ import "unsafe"
|
|
// systems by flock_linux_32bit.go to be SYS_FCNTL64.
|
|
var fcntl64Syscall uintptr = SYS_FCNTL
|
|
|
|
+// FcntlInt performs a fcntl syscall on fd with the provided command and argument.
|
|
+func FcntlInt(fd uintptr, cmd, arg int) (int, error) {
|
|
+ return fcntl(int(fd), cmd, arg)
|
|
+}
|
|
+
|
|
// FcntlFlock performs a fcntl syscall for the F_GETLK, F_SETLK or F_SETLKW command.
|
|
func FcntlFlock(fd uintptr, cmd int, lk *Flock_t) error {
|
|
_, _, errno := Syscall(fcntl64Syscall, fd, uintptr(cmd), uintptr(unsafe.Pointer(lk)))
|
|
diff --git a/vendor/golang.org/x/sys/unix/zerrors_linux_amd64.go b/vendor/golang.org/x/sys/unix/zerrors_linux_amd64.go
|
|
index f21dcd9..e1bde81 100644
|
|
--- a/vendor/golang.org/x/sys/unix/zerrors_linux_amd64.go
|
|
+++ b/vendor/golang.org/x/sys/unix/zerrors_linux_amd64.go
|
|
@@ -934,6 +934,7 @@ const (
|
|
PRIO_PGRP = 0x1
|
|
PRIO_PROCESS = 0x0
|
|
PRIO_USER = 0x2
|
|
+ PROC_SUPER_MAGIC = 0x9fa0
|
|
PROT_EXEC = 0x4
|
|
PROT_GROWSDOWN = 0x1000000
|
|
PROT_GROWSUP = 0x2000000
|
|
diff --git a/vendor/golang.org/x/sys/unix/zerrors_linux_arm64.go b/vendor/golang.org/x/sys/unix/zerrors_linux_arm64.go
|
|
index 16a18f5..388d1fc 100644
|
|
--- a/vendor/golang.org/x/sys/unix/zerrors_linux_arm64.go
|
|
+++ b/vendor/golang.org/x/sys/unix/zerrors_linux_arm64.go
|
|
@@ -966,6 +966,7 @@ const (
|
|
PRIO_PGRP = 0x1
|
|
PRIO_PROCESS = 0x0
|
|
PRIO_USER = 0x2
|
|
+ PROC_SUPER_MAGIC = 0x9fa0
|
|
PROT_EXEC = 0x4
|
|
PROT_GROWSDOWN = 0x1000000
|
|
PROT_GROWSUP = 0x2000000
|
|
diff --git a/vendor/golang.org/x/sys/unix/zsyscall_linux_amd64.go b/vendor/golang.org/x/sys/unix/zsyscall_linux_amd64.go
|
|
index 8b2e87d..fe21f83 100644
|
|
--- a/vendor/golang.org/x/sys/unix/zsyscall_linux_amd64.go
|
|
+++ b/vendor/golang.org/x/sys/unix/zsyscall_linux_amd64.go
|
|
@@ -312,6 +312,16 @@ func Close(fd int) (err error) {
|
|
|
|
// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
|
|
|
|
+func CloseRange(first uint, last uint, flags uint) (err error) {
|
|
+ _, _, e1 := Syscall(SYS_CLOSE_RANGE, uintptr(first), uintptr(last), uintptr(flags))
|
|
+ if e1 != 0 {
|
|
+ err = errnoErr(e1)
|
|
+ }
|
|
+ return
|
|
+}
|
|
+
|
|
+// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
|
|
+
|
|
func Dup(oldfd int) (fd int, err error) {
|
|
r0, _, e1 := Syscall(SYS_DUP, uintptr(oldfd), 0, 0)
|
|
fd = int(r0)
|
|
diff --git a/vendor/golang.org/x/sys/unix/zsyscall_linux_arm64.go b/vendor/golang.org/x/sys/unix/zsyscall_linux_arm64.go
|
|
index f6cc320..395e2de 100644
|
|
--- a/vendor/golang.org/x/sys/unix/zsyscall_linux_arm64.go
|
|
+++ b/vendor/golang.org/x/sys/unix/zsyscall_linux_arm64.go
|
|
@@ -312,6 +312,16 @@ func Close(fd int) (err error) {
|
|
|
|
// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
|
|
|
|
+func CloseRange(first uint, last uint, flags uint) (err error) {
|
|
+ _, _, e1 := Syscall(SYS_CLOSE_RANGE, uintptr(first), uintptr(last), uintptr(flags))
|
|
+ if e1 != 0 {
|
|
+ err = errnoErr(e1)
|
|
+ }
|
|
+ return
|
|
+}
|
|
+
|
|
+// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
|
|
+
|
|
func Dup(oldfd int) (fd int, err error) {
|
|
r0, _, e1 := Syscall(SYS_DUP, uintptr(oldfd), 0, 0)
|
|
fd = int(r0)
|
|
diff --git a/vendor/golang.org/x/sys/unix/zsysnum_linux_amd64.go b/vendor/golang.org/x/sys/unix/zsysnum_linux_amd64.go
|
|
index 9042317..f7c427c 100644
|
|
--- a/vendor/golang.org/x/sys/unix/zsysnum_linux_amd64.go
|
|
+++ b/vendor/golang.org/x/sys/unix/zsysnum_linux_amd64.go
|
|
@@ -338,4 +338,5 @@ const (
|
|
SYS_PKEY_MPROTECT = 329
|
|
SYS_PKEY_ALLOC = 330
|
|
SYS_PKEY_FREE = 331
|
|
+ SYS_CLOSE_RANGE = 436
|
|
)
|
|
diff --git a/vendor/golang.org/x/sys/unix/zsysnum_linux_arm64.go b/vendor/golang.org/x/sys/unix/zsysnum_linux_arm64.go
|
|
index 90e43d0..530563a 100644
|
|
--- a/vendor/golang.org/x/sys/unix/zsysnum_linux_arm64.go
|
|
+++ b/vendor/golang.org/x/sys/unix/zsysnum_linux_arm64.go
|
|
@@ -282,4 +282,5 @@ const (
|
|
SYS_PKEY_MPROTECT = 288
|
|
SYS_PKEY_ALLOC = 289
|
|
SYS_PKEY_FREE = 290
|
|
+ SYS_CLOSE_RANGE = 436
|
|
)
|
|
diff --git a/vendor/golang.org/x/sys/unix/ztypes_linux_amd64.go b/vendor/golang.org/x/sys/unix/ztypes_linux_amd64.go
|
|
index c9e1e64..2f12811 100644
|
|
--- a/vendor/golang.org/x/sys/unix/ztypes_linux_amd64.go
|
|
+++ b/vendor/golang.org/x/sys/unix/ztypes_linux_amd64.go
|
|
@@ -345,6 +345,11 @@ type TCPInfo struct {
|
|
Total_retrans uint32
|
|
}
|
|
|
|
+const (
|
|
+ CLOSE_RANGE_UNSHARE = 0x2
|
|
+ CLOSE_RANGE_CLOEXEC = 0x4
|
|
+)
|
|
+
|
|
const (
|
|
SizeofSockaddrInet4 = 0x10
|
|
SizeofSockaddrInet6 = 0x1c
|
|
diff --git a/vendor/golang.org/x/sys/unix/ztypes_linux_arm64.go b/vendor/golang.org/x/sys/unix/ztypes_linux_arm64.go
|
|
index e58c500..b77eceb 100644
|
|
--- a/vendor/golang.org/x/sys/unix/ztypes_linux_arm64.go
|
|
+++ b/vendor/golang.org/x/sys/unix/ztypes_linux_arm64.go
|
|
@@ -30,6 +30,11 @@ type Timeval struct {
|
|
Usec int64
|
|
}
|
|
|
|
+const (
|
|
+ CLOSE_RANGE_UNSHARE = 0x2
|
|
+ CLOSE_RANGE_CLOEXEC = 0x4
|
|
+)
|
|
+
|
|
type Timex struct {
|
|
Modes uint32
|
|
Pad_cgo_0 [4]byte
|
|
--
|
|
2.33.0
|
|
|