From a8761df6c69b8bfca33530f91e296e561e41d2f8 Mon Sep 17 00:00:00 2001 From: lifubang Date: Thu, 4 Apr 2024 00:23:47 +0800 Subject: [PATCH] use PR_SET_CHILD_SUBREAPER and fork to replace clone(CLONE_PARENT) As the description in #4233, there is a bug in glibc, pthread_self() will return wrong info after we do `clone(CLONE_PARENT)` in libct/nsenter, it will cause runc can't work in `go 1.22.*`. So we use fork(2) to replace clone(2) in libct/nsenter, but there is a double-fork in nsenter, so we need to use `PR_SET_CHILD_SUBREAPER` to let runc can reap grand child process in libct/nsenter. Signed-off-by: lifubang --- libcontainer/container_linux.go | 7 ++++ libcontainer/nsenter/nsexec.c | 66 +++++++-------------------------- libcontainer/process_linux.go | 36 +++++++++++++----- 3 files changed, 47 insertions(+), 62 deletions(-) diff --git a/libcontainer/container_linux.go b/libcontainer/container_linux.go index 13be71ccb89..59f72c62e79 100644 --- a/libcontainer/container_linux.go +++ b/libcontainer/container_linux.go @@ -341,6 +341,13 @@ func (c *Container) start(process *Process) (retErr error) { if err := utils.CloseExecFrom(3); err != nil { return fmt.Errorf("unable to mark non-stdio fds as cloexec: %w", err) } + + // Tell the kernel that runc wants to reap orphaned children of the + // `runc init` process. + if err := unix.Prctl(unix.PR_SET_CHILD_SUBREAPER, 1, 0, 0, 0); err != nil { + return fmt.Errorf("unable to set child subreaper: %w", err) + } + if err := parent.start(); err != nil { return fmt.Errorf("unable to start container process: %w", err) } diff --git a/libcontainer/nsenter/nsexec.c b/libcontainer/nsenter/nsexec.c index c771ac7e116..1cef72832a8 100644 --- a/libcontainer/nsenter/nsexec.c +++ b/libcontainer/nsenter/nsexec.c @@ -52,20 +52,6 @@ enum sync_t { /* Stores the current stage of nsexec. */ int current_stage = STAGE_SETUP; -/* Assume the stack grows down, so arguments should be above it. */ -struct clone_t { - /* - * Reserve some space for clone() to locate arguments - * and retcode in this place - */ - char stack[4096] __attribute__((aligned(16))); - char stack_ptr[0]; - - /* There's two children. This is used to execute the different code. */ - jmp_buf *env; - int jmpval; -}; - struct nlconfig_t { char *data; @@ -303,23 +289,15 @@ static void update_oom_score_adj(char *data, size_t len) bail("failed to update /proc/self/oom_score_adj"); } -/* A dummy function that just jumps to the given jumpval. */ -static int child_func(void *arg) __attribute__((noinline)); -static int child_func(void *arg) +static int fork_and_run(jmp_buf *env, int jmpval) { - struct clone_t *ca = (struct clone_t *)arg; - longjmp(*ca->env, ca->jmpval); -} - -static int clone_parent(jmp_buf *env, int jmpval) __attribute__((noinline)); -static int clone_parent(jmp_buf *env, int jmpval) -{ - struct clone_t ca = { - .env = env, - .jmpval = jmpval, - }; - - return clone(child_func, ca.stack_ptr, CLONE_PARENT | SIGCHLD, &ca); + pid_t pid = fork(); + if (pid < 0) + bail("failed to fork"); + if (pid > 0) + return pid; + /* Jump back to the big switch in nsexec... */ + longjmp(*env, jmpval); } /* Returns the clone(2) flag for a namespace, given the name of a namespace. */ @@ -644,7 +622,7 @@ void nsexec(void) * * Unfortunately, it's not as simple as that. We have to fork to enter the * PID namespace (the PID namespace only applies to children). Since we'll - * have to double-fork, this clone_parent() call won't be able to get the + * have to double-fork, this fork() call won't be able to get the * PID of the _actual_ init process (without doing more synchronisation than * I can deal with at the moment). So we'll just get the parent to send it * for us, the only job of this process is to update @@ -655,15 +633,6 @@ void nsexec(void) * will be in that namespace (and it will not be able to give us a PID value * that makes sense without resorting to sending things with cmsg). * - * This also deals with an older issue caused by dumping cloneflags into - * clone(2): On old kernels, CLONE_PARENT didn't work with CLONE_NEWPID, so - * we have to unshare(2) before clone(2) in order to do this. This was fixed - * in upstream commit 1f7f4dde5c945f41a7abc2285be43d918029ecc5, and was - * introduced by 40a0d32d1eaffe6aac7324ca92604b6b3977eb0e. As far as we're - * aware, the last mainline kernel which had this bug was Linux 3.12. - * However, we cannot comment on which kernels the broken patch was - * backported to. - * * -- Aleksa "what has my life come to?" Sarai */ @@ -687,7 +656,7 @@ void nsexec(void) /* Start the process of getting a container. */ write_log(DEBUG, "spawn stage-1"); - stage1_pid = clone_parent(&env, STAGE_CHILD); + stage1_pid = fork_and_run(&env, STAGE_CHILD); if (stage1_pid < 0) bail("unable to spawn stage-1"); @@ -755,9 +724,7 @@ void nsexec(void) /* * Send both the stage-1 and stage-2 pids back to runc. * runc needs the stage-2 to continue process management, - * but because stage-1 was spawned with CLONE_PARENT we - * cannot reap it within stage-0 and thus we need to ask - * runc to reap the zombie for us. + * and ask runc to reap the zombie for us. */ write_log(DEBUG, "forward stage-1 (%d) and stage-2 (%d) pids to runc", stage1_pid, stage2_pid); @@ -892,9 +859,8 @@ void nsexec(void) } /* - * We don't have the privileges to do any mapping here (see the - * clone_parent rant). So signal stage-0 to do the mapping for - * us. + * We don't have the privileges to do any mapping here. + * So signal stage-0 to do the mapping for us. */ write_log(DEBUG, "request stage-0 to map user namespace"); s = SYNC_USERMAP_PLS; @@ -925,10 +891,6 @@ void nsexec(void) * ordering might break in the future (especially with rootless * containers). But for now, it's not possible to split this into * CLONE_NEWUSER + [the rest] because of some RHEL SELinux issues. - * - * Note that we don't merge this with clone() because there were - * some old kernel versions where clone(CLONE_PARENT | CLONE_NEWPID) - * was broken, so we'll just do it the long way anyway. */ try_unshare(config.cloneflags, "remaining namespaces"); @@ -955,7 +917,7 @@ void nsexec(void) * to actually enter the new PID namespace. */ write_log(DEBUG, "spawn stage-2"); - stage2_pid = clone_parent(&env, STAGE_INIT); + stage2_pid = fork_and_run(&env, STAGE_INIT); if (stage2_pid < 0) bail("unable to spawn stage-2"); diff --git a/libcontainer/process_linux.go b/libcontainer/process_linux.go index 6d51eada2b3..2b06d096253 100644 --- a/libcontainer/process_linux.go +++ b/libcontainer/process_linux.go @@ -321,13 +321,21 @@ func (p *setnsProcess) execSetns() error { // terminate sends a SIGKILL to the forked process for the setns routine then waits to // avoid the process becoming a zombie. func (p *setnsProcess) terminate() error { - if p.cmd.Process == nil { - return nil + var err error + if p.cmd.Process != nil { + err = p.cmd.Process.Kill() + if err != nil { + logrus.Warnf("error killing setns process: %v", err) + } } - err := p.cmd.Process.Kill() - if _, werr := p.wait(); err == nil { - err = werr + + for { + if _, werr := unix.Wait4(-1, nil, 0, nil); werr != nil { + err = werr + break + } } + return err } @@ -794,13 +802,21 @@ func (p *initProcess) wait() (*os.ProcessState, error) { } func (p *initProcess) terminate() error { - if p.cmd.Process == nil { - return nil + var err error + if p.cmd.Process != nil { + err = p.cmd.Process.Kill() + if err != nil { + logrus.Warnf("error killing init process: %v", err) + } } - err := p.cmd.Process.Kill() - if _, werr := p.wait(); err == nil { - err = werr + + for { + if _, werr := unix.Wait4(-1, nil, 0, nil); werr != nil { + err = werr + break + } } + return err }