root@ubuntu:/home/ubuntu# kata-runtime exec e12a7db6fb05df044a59a19bb03c39fe7752e4d684a8e2e58822b88606d3ac3e rpc error: code = Internal desc = Could not run process: container_linux.go:349: starting container process caused "panic from initialization: runtime error: index out of range, goroutine 1 [running, locked to thread]: runtime/debug.Stack(0x400018fbd8, 0xaaaab1b68260, 0xaaaab21de220) /usr/go/src/runtime/debug/stack.go:24 +0x88
github.com/kata-containers/agent/vendor/github.com/opencontainers/runc/libcontainer.(*LinuxFactory).StartInitialization.func2(0x400018fea0) /root/go/src/github.com/kata-containers/agent/vendor/github.com/opencontainers/runc/libcontainer/factory_linux.go:370 +0x40 panic(0xaaaab1b68260, 0xaaaab21de220) /usr/go/src/runtime/panic.go:513 +0x18c github.com/kata-containers/agent/vendor/github.com/opencontainers/runc/libcontainer.(*linuxSetnsInit).Init(0x400012d9c0, 0x0, 0x0) /root/go/src/github.com/kata-containers/agent/vendor/github.com/opencontainers/runc/libcontainer/setns_init_linux.go:91 +0x434 github.com/kata-containers/agent/vendor/github.com/opencontainers/runc/libcontainer.(*LinuxFactory).StartInitialization(0x4000164090, 0x0, 0x0) /root/go/src/github.com/kata-containers/agent/vendor/github.com/opencontainers/runc/libcontainer/factory_linux.go: 380 +0x2ec main.init.0() /root/go/src/github.com/kata-containers/agent/agent.go:1506 +0x88 " root@ubuntu:/home/ubuntu#
【kubernetes/k8s源码分析】kata container agent create container 源码分析
https://blog.csdn.net/zhonglinzhang/article/details/101212033
linuxStandardInit.Init()(github.com/opencontainers/runc/libcontainer/standard_init_linux.go#47):
// Init (abridged excerpt) shows only three steps of linuxStandardInit.Init:
// preparing the rootfs, finalizing the rootfs, and exec-ing the user process.
// The unabridged listing appears further down in this document.
func (l *linuxStandardInit) Init() error {
// Author's note (translated): this call is the important one. The namespaces
// have all been set up by now, but from this process's point of view the root
// directory is still the same as outside the container. Per the author,
// prepareRootfs therefore mounts the devices, performs the bind mounts, and
// then switches the current root to the container's root directory.
if err := prepareRootfs(l.pipe, l.config); err != nil {
return err
}
// Author's note (translated): set root (/) read-only. Only executed when the
// container config includes a new mount namespace (configs.NEWNS).
if l.config.Config.Namespaces.Contains(configs.NEWNS) {
if err := finalizeRootfs(l.config.Config); err != nil {
return err
}
}
// Once the in-container environment has been prepared, run the container's
// entrypoint via execve (syscall.Exec).
// NOTE(review): `name` is not declared in this abridged excerpt; the
// unabridged listing obtains it from exec.LookPath(l.config.Args[0]).
if err := syscall.Exec(name, l.config.Args[0:], os.Environ()); err != nil {
return newSystemErrorWithCause(err, "exec user process")
}
return nil
}
总结:
- runc init 一共会有三个进程
- 第一个进程读取 bootstrapData,并完成第二个进程的 user map 的设置
- 第二个进程完成 namespace 的设置
- 第三个进程完成 cgroup namespace 的设置,并读取了 0x80 的同步信息,最后进入 Go 代码。Go 代码读取 container config,进行容器内环境准备,最后执行容器的 entrypoint。
-
47 func (l *linuxStandardInit) Init() error { 48 runtime.LockOSThread() 49 defer runtime.UnlockOSThread() 50 if !l.config.Config.NoNewKeyring { 51 if err := label.SetKeyLabel(l.config.ProcessLabel); err != nil { 52 return err 53 } 54 defer label.SetKeyLabel("") 55 ringname, keepperms, newperms := l.getSessionRingParams() 56 57 // Do not inherit the parent's session keyring. 58 if sessKeyId, err := keys.JoinSessionKeyring(ringname); err != nil { 59 // If keyrings aren't supported then it is likely we are on an 60 // older kernel (or inside an LXC container). While we could bail, 61 // the security feature we are using here is best-effort (it only 62 // really provides marginal protection since VFS credentials are 63 // the only significant protection of keyrings). 64 // 65 // TODO(cyphar): Log this so people know what's going on, once we 66 // have proper logging in 'runc init'. 67 if errors.Cause(err) != unix.ENOSYS { 68 return errors.Wrap(err, "join session keyring") 69 } 70 } else { 71 // Make session keyring searcheable. If we've gotten this far we 72 // bail on any error -- we don't want to have a keyring with bad 73 // permissions. 74 if err := keys.ModKeyringPerm(sessKeyId, keepperms, newperms); err != nil { 75 return errors.Wrap(err, "mod keyring permissions") 76 } 77 } 78 } 79 80 if err := setupNetwork(l.config); err != nil { 81 return err 82 } 83 if err := setupRoute(l.config.Config); err != nil { 84 return err 85 } 86 87 label.Init() 88 if err := prepareRootfs(l.pipe, l.config); err != nil { 89 return err 90 } 91 // Set up the console. This has to be done *before* we finalize the rootfs, 92 // but *after* we've given the user the chance to set up all of the mounts 93 // they wanted. 94 if l.config.CreateConsole { 95 if err := setupConsole(l.consoleSocket, l.config, true); err != nil { 96 return err 97 } 98 if err := system.Setctty(); err != nil { 99 return errors.Wrap(err, "setctty") 100 } 101 } 102 103 // Finish the rootfs setup. 
104 if l.config.Config.Namespaces.Contains(configs.NEWNS) { 105 if err := finalizeRootfs(l.config.Config); err != nil { 106 return err 107 } 108 } 109 110 if hostname := l.config.Config.Hostname; hostname != "" { 111 if err := unix.Sethostname([]byte(hostname)); err != nil { 112 return errors.Wrap(err, "sethostname") 113 } 114 } 115 if err := apparmor.ApplyProfile(l.config.AppArmorProfile); err != nil { 116 return errors.Wrap(err, "apply apparmor profile") 117 } 118 119 for key, value := range l.config.Config.Sysctl { 120 if err := writeSystemProperty(key, value); err != nil { 121 return errors.Wrapf(err, "write sysctl key %s", key) 122 } 123 } 124 for _, path := range l.config.Config.ReadonlyPaths { 125 if err := readonlyPath(path); err != nil { 126 return errors.Wrapf(err, "readonly path %s", path) 127 } 128 } 129 for _, path := range l.config.Config.MaskPaths { 130 if err := maskPath(path, l.config.Config.MountLabel); err != nil { 131 return errors.Wrapf(err, "mask path %s", path) 132 } 133 } 134 pdeath, err := system.GetParentDeathSignal() 135 if err != nil { 136 return errors.Wrap(err, "get pdeath signal") 137 } 138 if l.config.NoNewPrivileges { 139 if err := unix.Prctl(unix.PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); err != nil { 140 return errors.Wrap(err, "set nonewprivileges") 141 } 142 } 143 // Tell our parent that we're ready to Execv. This must be done before the 144 // Seccomp rules have been applied, because we need to be able to read and 145 // write to a socket. 146 if err := syncParentReady(l.pipe); err != nil { 147 return errors.Wrap(err, "sync ready") 148 } 149 if err := label.SetProcessLabel(l.config.ProcessLabel); err != nil { 150 return errors.Wrap(err, "set process label") 151 } 152 defer label.SetProcessLabel("") 153 // Without NoNewPrivileges seccomp is a privileged operation, so we need to 154 // do this before dropping capabilities; otherwise do it as late as possible 155 // just before execve so as few syscalls take place after it as possible. 
156 if l.config.Config.Seccomp != nil && !l.config.NoNewPrivileges { 157 if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil { 158 return err 159 } 160 } 161 if err := finalizeNamespace(l.config); err != nil { 162 return err 163 } 164 // finalizeNamespace can change user/group which clears the parent death 165 // signal, so we restore it here. 166 if err := pdeath.Restore(); err != nil { 167 return errors.Wrap(err, "restore pdeath signal") 168 } 169 // Compare the parent from the initial start of the init process and make 170 // sure that it did not change. if the parent changes that means it died 171 // and we were reparented to something else so we should just kill ourself 172 // and not cause problems for someone else. 173 if unix.Getppid() != l.parentPid { 174 return unix.Kill(unix.Getpid(), unix.SIGKILL) 175 } 176 // Check for the arg before waiting to make sure it exists and it is 177 // returned as a create time error. 178 name, err := exec.LookPath(l.config.Args[0]) 179 if err != nil { 180 return err 181 } 182 // Close the pipe to signal that we have completed our init. 183 l.pipe.Close() 184 // Wait for the FIFO to be opened on the other side before exec-ing the 185 // user process. We open it through /proc/self/fd/$fd, because the fd that 186 // was given to us was an O_PATH fd to the fifo itself. Linux allows us to 187 // re-open an O_PATH fd through /proc. 188 fd, err := unix.Open(fmt.Sprintf("/proc/self/fd/%d", l.fifoFd), unix.O_WRONLY|unix.O_CLOEXEC, 0) 189 if err != nil { 190 return newSystemErrorWithCause(err, "open exec fifo") 191 } 192 if _, err := unix.Write(fd, []byte("0")); err != nil { 193 return newSystemErrorWithCause(err, "write 0 exec fifo") 194 } 195 // Close the O_PATH fifofd fd before exec because the kernel resets 196 // dumpable in the wrong order. This has been fixed in newer kernels, but 197 // we keep this to ensure CVE-2016-9962 doesn't re-emerge on older kernels. 198 // N.B. 
the core issue itself (passing dirfds to the host filesystem) has 199 // since been resolved. 200 // https://github.com/torvalds/linux/blob/v4.9/fs/exec.c#L1290-L1318 201 unix.Close(l.fifoFd) 202 // Set seccomp as close to execve as possible, so as few syscalls take 203 // place afterward (reducing the amount of syscalls that users need to 204 // enable in their seccomp profiles). 205 if l.config.Config.Seccomp != nil && l.config.NoNewPrivileges { 206 if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil { 207 return newSystemErrorWithCause(err, "init seccomp") 208 } 209 } 210 if err := syscall.Exec(name, l.config.Args[0:], os.Environ()); err != nil { 211 return newSystemErrorWithCause(err, "exec user process") 212 } 213 return nil 214 }
调用Init
-
-
i, err := newContainerInit(it, pipe, consoleSocket, fifofd) if err != nil { return err } // If Init succeeds, syscall.Exec will not return, hence none of the defers will be called. return i.Init() }
-
-
-
// Shared function between CreateContainer and ExecProcess, because those expect // a process to be run. func (a *agentGRPC) execProcess(ctr *container, proc *process, createContainer bool) (err error) { if ctr == nil { return grpcStatus.Error(codes.InvalidArgument, "Container cannot be nil") } if proc == nil { return grpcStatus.Error(codes.InvalidArgument, "Process cannot be nil") } // This lock is very important to avoid any race with reaper.reap(). // Indeed, if we don't lock this here, we could potentially get the // SIGCHLD signal before the channel has been created, meaning we will // miss the opportunity to get the exit code, leading WaitProcess() to // wait forever on the new channel. // This lock has to be taken before we run the new process. a.sandbox.subreaper.lock() defer a.sandbox.subreaper.unlock() if createContainer { err = ctr.container.Start(&proc.process) } else { err = ctr.container.Run(&(proc.process)) } if err != nil { return grpcStatus.Errorf(codes.Internal, "Could not run process: %v", err) }
vendor/github.com/opencontainers/runc/libcontainer/container_linux.go +233
-
func (c *linuxContainer) Start(process *Process) error { c.m.Lock() defer c.m.Unlock() if process.Init { if err := c.createExecFifo(); err != nil { return err } } if err := c.start(process); err != nil { if process.Init { c.deleteExecFifo() } return err } return nil } func (c *linuxContainer) Run(process *Process) error { if err := c.Start(process); err != nil { return err } if process.Init { return c.exec() } return nil }
-
newParentProcess 函数:创建一对 pipe(parentPipe 和 childPipe),作为 start 进程与容器内部 init 进程的通信管道;创建一个命令模版,作为 Parent 进程启动的模板;newInitProcess 封装 initProcess,主要工作为添加初始化类型环境变量,将 namespace、uid/gid 映射等信息使用 bootstrapData 封装为一个 io.Reader。initProcess 实现了 parentProcess 接口。
-
func (c *linuxContainer) newParentProcess(p *Process) (parentProcess, error) { parentInitPipe, childInitPipe, err := utils.NewSockPair("init") if err != nil { return nil, newSystemErrorWithCause(err, "creating new init pipe") } messageSockPair := filePair{parentInitPipe, childInitPipe} parentLogPipe, childLogPipe, err := os.Pipe() if err != nil { return nil, fmt.Errorf("Unable to create the log pipe: %s", err) } logFilePair := filePair{parentLogPipe, childLogPipe} cmd, err := c.commandTemplate(p, childInitPipe, childLogPipe) if err != nil { return nil, newSystemErrorWithCause(err, "creating new command template") } if !p.Init { return c.newSetnsProcess(p, cmd, messageSockPair, logFilePair) } // We only set up fifoFd if we're not doing a `runc exec`. The historic // reason for this is that previously we would pass a dirfd that allowed // for container rootfs escape (and not doing it in `runc exec` avoided // that problem), but we no longer do that. However, there's no need to do // this for `runc exec` so we just keep it this way to be safe. if err := c.includeExecFifo(cmd); err != nil { return nil, newSystemErrorWithCause(err, "including execfifo in cmd.Exec setup") } return c.newInitProcess(p, cmd, messageSockPair, logFilePair)
-
agent.go
-
func main() { defer handlePanic() err := realMain() if err != nil { agentLog.WithError(err).Error("agent failed") os.Exit(1) } agentLog.Debug("agent exiting") os.Exit(0) }
-
initProcess start 函数 创建新的进程。而此时新的进程使用 /proc/self/exe 为执行入口,参数为 init,会在 main 函数调用之前执行,所以在新的进程中 func init() 会直接调用,而不会去执行 main 函数 func (p *initProcess) start() error { defer p.messageSockPair.parent.Close() err := p.cmd.Start() p.process.ops = p // close the write-side of the pipes (controlled by child) p.messageSockPair.child.Close() p.logFilePair.child.Close() if err != nil { p.process.ops = nil return newSystemErrorWithCause(err, "starting init process command") } cmd 如最后命令所示,Path填充为 /proc/self/exe(本身 agent)。参数字段 Args 为 init,
表示对容器进行初始化,调用的为 agent init agent 最后直接复用 runc 代码 -
func init() { if len(os.Args) > 1 && os.Args[1] == "init" { runtime.GOMAXPROCS(1) runtime.LockOSThread() factory, _ := libcontainer.New("") if err := factory.StartInitialization(); err != nil { agentLog.WithError(err).Error("init failed") } panic("--this line should have never been executed, congratulations--") } }
-
runc 启动容器过程分析(附 CVE-2019-5736 实现过程)
-
环境
OCI runtime spec 地址:https://github.com/opencontainers/runtime-spec
runc 地址:https://github.com/opencontainers/runc/
Commit:f414f497b50a61750ea3af9fccf998a3db687cea
系统版本:Fedora Release 28
内核版本:4.17.9-200.fc28.x86_64
runc 介绍
runc 实现了 OCI 的容器标准,能够管理容器的生命周期。runc 的详细功能请参考 帮助文档。
runc 不是基于 server 形式的,所以所有的配置和状态都会存储在本地文件系统中(以下均为使用 docker 时的默认路径):
- 容器配置:/run/docker/libcontainerd/{container-id}/config.json
- 容器 init 进程的标准输入输出流:/run/docker/libcontainerd/{container-id}/{init-stdin,init-stdout,init-stderr}
- 容器状态信息:/run/runc/*/state.json
runc 创建容器时会将状态记录到 state.json 中,所有查询都是从 state.json 中取得容器基本信息,然后再从系统中获取容器实时状态。
docker 的调用链如下:
docker-client -> dockerd -> docker-containerd -> docker-containerd-shim -> runc(容器外) -> runc(容器内) -> container-entrypoint
runc 启动容器过程
runc 在被 docker-containerd-shim 调用时,参数中会指定容器的配置路径(即 config.json 的位置),同时容器的根路径也已经准备完毕,因此 runc 不会有跟镜像相关的概念。容器的启动过程分析直接从 runc run 开始,即 docker 调用链中的 runc(容器外)这个时间点。
runc(容器外)环境准备
读取 config.json(github.com/opencontainers/runc/run.go#65):
// 读取 config.json spec, err := setupSpec(context) if err != nil { return err } // 启动容器 status, err := startContainer(context, spec, CT_ACT_RUN, nil) if err == nil { os.Exit(status) } return err
startContainer 创建容器信息,并启动(github.com/opencontainers/runc/utils_linux.go#396):
// startContainer creates a container from spec and runs spec.Process in it
// through a runner, returning whatever r.run returns.
// Abridged excerpt: `id` is used below but is not declared in the portion
// shown here, and criuOpts is not referenced.
func startContainer(context *cli.Context, spec *specs.Spec, action CtAct, criuOpts *libcontainer.CriuOpts) (int, error) {
	// Create the container structure from spec. Per the author, createContainer
	// converts the spec into runc's container config.
	container, err := createContainer(context, id, spec)
	if err != nil {
		return -1, err
	}
	// Build the runner that starts the container.
	r := &runner{
		// The container.
		container: container,
		// i.e. CT_ACT_RUN on the `runc run` path described in this document.
		action: action,
		// Used to set the process.Init field.
		init: true,
	}
	return r.run(spec.Process)
}
r.run() 启动容器(github.com/opencontainers/runc/utils_linux.go#268):
// run builds the container process described by config and dispatches on
// r.action to the matching container method.
// Abridged excerpt: the listing ends after the switch without a return
// statement, so the remainder of the function appears to have been elided.
func (r *runner) run(config *specs.Process) (int, error) {
	// Build the container process from config; r.init is true on this path.
	process, err := newProcess(*config, r.init)
	if err != nil {
		r.destroy()
		return -1, err
	}
	// Call the container method that corresponds to the action.
	switch r.action {
	case CT_ACT_CREATE:
		err = r.container.Start(process)
	case CT_ACT_RESTORE:
		err = r.container.Restore(process, r.criuOpts)
	case CT_ACT_RUN:
		// This is the method invoked on the `runc run` path.
		err = r.container.Run(process)
	default:
		panic("Unknown action")
	}
}
container 是由 createContainer() 方法创建,根据创建链路 createContainer() -> loadFactory() -> libcontainer.New() 确认容器由 LinuxFactory.Create() 创建:
// github.com/opencontainers/runc/libcontainer/factory_linux.go#132
//
// New returns a LinuxFactory whose init command is the current executable
// invoked with the "init" argument.
// Abridged excerpt: root and options are not used in the portion shown.
func New(root string, options ...func(*LinuxFactory) error) (Factory, error) {
	l := &LinuxFactory{
		// Points at the currently running executable, i.e. runc itself.
		InitPath: "/proc/self/exe",
		// os.Args[0] is the path of the current runc; per the author this is
		// essentially the same as InitPath, i.e. "runc init".
		InitArgs: []string{os.Args[0], "init"},
	}
	return l, nil
}

// github.com/opencontainers/runc/libcontainer/factory_linux.go#189
//
// Create builds the linuxContainer for id, copying the factory's init path and
// init arguments into it.
// Abridged excerpt: containerRoot is not declared in the portion shown.
func (l *LinuxFactory) Create(id string, config *configs.Config) (Container, error) {
	// Create the Linux container structure.
	c := &linuxContainer{
		// Container ID.
		id: id,
		// Directory holding the container state file; per the author the
		// default is /run/runc/{container id}/.
		root: containerRoot,
		// Container configuration.
		config: config,
		// i.e. /proc/self/exe, which is runc itself.
		initPath: l.InitPath,
		// i.e. "runc init".
		initArgs: l.InitArgs,
	}
	return c, nil
}
所以整个容器的启动逻辑在 linuxContainer.Run() 里,调用链是 linuxContainer.Run() -> linuxContainer.Start() -> linuxContainer.start():
// github.com/opencontainers/runc/libcontainer/container_linux.go#334 func (c *linuxContainer) start(process *Process) error { // process 是容器的 entrypoint,此处创建的是 entrypoint 的父进程 parent, err := c.newParentProcess(process) if err != nil { return newSystemErrorWithCause(err, "creating new parent process") } // 启动父进程 if err := parent.start(); err != nil { // terminate the process to ensure that it properly is reaped. if err := ignoreTerminateErrors(parent.terminate()); err != nil { logrus.Warn(err) } return newSystemErrorWithCause(err, "starting container process") } } func (c *linuxContainer) newParentProcess(p *Process) (parentProcess, error) { // 创建用于父子进程通信的 pipe parentPipe, childPipe, err := utils.NewSockPair("init") if err != nil { return nil, newSystemErrorWithCause(err, "creating new init pipe") } // 创建父进程的 cmd cmd, err := c.commandTemplate(p, childPipe) if err != nil { return nil, newSystemErrorWithCause(err, "creating new command template") } if !p.Init { // 由于 p.Init 为 true,所以不会执行到这里 return c.newSetnsProcess(p, cmd, parentPipe, childPipe) } // 返回标准 init 进程 return c.newInitProcess(p, cmd, parentPipe, childPipe) } func (c *linuxContainer) commandTemplate(p *Process, childPipe *os.File) (*exec.Cmd, error) { // 这里可以看到 cmd 就是 runc init cmd := exec.Command(c.initPath, c.initArgs[1:]...
-