From 1c8a5b379f9417c5e01d00971d29d222b62315e4 Mon Sep 17 00:00:00 2001 From: Shayon Mukherjee Date: Sat, 30 May 2026 08:43:07 -0400 Subject: [PATCH] setns: support user namespaces User namespace entries under /proc/[pid]/ns currently render as fake namespace symlinks. They look like the other namespace files, but opening them does not produce an nsfs file that setns(2) can use. Rootless container tools such as buildah and podman rely on that file when they re-enter the pause process user namespace, so the second lifecycle command fails with EINVAL. Make UserNamespace implement vfs.Namespace and give each user namespace an nsfs inode when it is created. /proc/[pid]/ns/user now uses the regular namespace symlink path, so opening it returns a joinable namespace file instead of a fake link target. Setns now accepts CLONE_NEWUSER from both nsfds and pidfds. It follows the Linux restrictions for user namespace joins by rejecting the caller's current user namespace, requiring CAP_SYS_ADMIN in the target user namespace, rejecting multithreaded callers, and rejecting callers with fs state shared outside the thread group. The capability checks for any other namespaces in the same setns call use the credentials the caller would have after joining the user namespace. Add a syscall regression test that creates a child user namespace, opens /proc/[pid]/ns/user, and verifies that setns(CLONE_NEWUSER) succeeds. 
--- pkg/sentry/fsimpl/proc/task.go | 2 +- pkg/sentry/fsimpl/proc/task_files.go | 7 +++ pkg/sentry/kernel/auth/BUILD | 1 + pkg/sentry/kernel/auth/user_namespace.go | 31 ++++++++++++++ pkg/sentry/kernel/kernel.go | 1 + pkg/sentry/kernel/task_clone.go | 54 +++++++++++++++++++++--- test/syscalls/linux/BUILD | 2 + test/syscalls/linux/setns.cc | 42 ++++++++++++++++++ 8 files changed, 133 insertions(+), 7 deletions(-) diff --git a/pkg/sentry/fsimpl/proc/task.go b/pkg/sentry/fsimpl/proc/task.go index 6e8ef648ef..e820a74508 100644 --- a/pkg/sentry/fsimpl/proc/task.go +++ b/pkg/sentry/fsimpl/proc/task.go @@ -79,7 +79,7 @@ func (fs *filesystem) newTaskInode(ctx context.Context, task *kernel.Task, pidns "net": fs.newNamespaceSymlink(ctx, task, fs.NextIno(), linux.CLONE_NEWNET), "mnt": fs.newNamespaceSymlink(ctx, task, fs.NextIno(), linux.CLONE_NEWNS), "pid": fs.newNamespaceSymlink(ctx, task, fs.NextIno(), linux.CLONE_NEWPID), - "user": fs.newFakeNamespaceSymlink(ctx, task, fs.NextIno(), "user"), + "user": fs.newNamespaceSymlink(ctx, task, fs.NextIno(), linux.CLONE_NEWUSER), "ipc": fs.newNamespaceSymlink(ctx, task, fs.NextIno(), linux.CLONE_NEWIPC), "uts": fs.newNamespaceSymlink(ctx, task, fs.NextIno(), linux.CLONE_NEWUTS), }), diff --git a/pkg/sentry/fsimpl/proc/task_files.go b/pkg/sentry/fsimpl/proc/task_files.go index 8c4cff84aa..e0a336cb82 100644 --- a/pkg/sentry/fsimpl/proc/task_files.go +++ b/pkg/sentry/fsimpl/proc/task_files.go @@ -1461,6 +1461,13 @@ func (s *namespaceSymlink) getInode(t *kernel.Task) *nsfs.Inode { return pidns.GetInode() } return nil + case linux.CLONE_NEWUSER: + inode, _ := t.UserNamespace().GetInode().(*nsfs.Inode) + if inode == nil { + return nil + } + inode.IncRef() + return inode default: panic("unknown namespace") } diff --git a/pkg/sentry/kernel/auth/BUILD b/pkg/sentry/kernel/auth/BUILD index 4a6760b651..fd238f4737 100644 --- a/pkg/sentry/kernel/auth/BUILD +++ b/pkg/sentry/kernel/auth/BUILD @@ -99,6 +99,7 @@ go_library( "//pkg/errors/linuxerr", 
"//pkg/log", "//pkg/rand", + "//pkg/refs", "//pkg/sentry/seccheck", "//pkg/sentry/seccheck/points:points_go_proto", "//pkg/sync", diff --git a/pkg/sentry/kernel/auth/user_namespace.go b/pkg/sentry/kernel/auth/user_namespace.go index 69fa7d7d46..d90faed946 100644 --- a/pkg/sentry/kernel/auth/user_namespace.go +++ b/pkg/sentry/kernel/auth/user_namespace.go @@ -20,6 +20,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" + "gvisor.dev/gvisor/pkg/refs" ) // A UserNamespace represents a user namespace. See user_namespaces(7) for @@ -60,6 +61,9 @@ type UserNamespace struct { // setgroupsAllowed mirrors USERNS_SETGROUPS_ALLOWED in Linux. Protected by mu. setgroupsAllowed bool + + // inode is the nsfs inode associated with this namespace. + inode refs.TryRefCounter } // NewRootUserNamespace returns a UserNamespace that is appropriate for a @@ -99,6 +103,33 @@ func (ns *UserNamespace) Root() *UserNamespace { return ns } +// Type implements vfs.Namespace.Type. +func (ns *UserNamespace) Type() string { + return "user" +} + +// Destroy implements vfs.Namespace.Destroy. +func (ns *UserNamespace) Destroy(ctx context.Context) {} + +// UserNamespace implements vfs.Namespace.UserNamespace. +func (ns *UserNamespace) UserNamespace() *UserNamespace { + return ns +} + +// SetInode sets the nsfs inode associated with ns. +func (ns *UserNamespace) SetInode(inode refs.TryRefCounter) { + ns.mu.Lock() + defer ns.mu.Unlock() + ns.inode = inode +} + +// GetInode returns the nsfs inode associated with ns. +func (ns *UserNamespace) GetInode() refs.TryRefCounter { + ns.mu.Lock() + defer ns.mu.Unlock() + return ns.inode +} + // "The kernel imposes (since version 3.11) a limit of 32 nested levels of user // namespaces." 
- user_namespaces(7) const maxUserNamespaceDepth = 32 diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index 1a07719e60..a5c1bb2f4f 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -558,6 +558,7 @@ func (k *Kernel) Init(args InitKernelArgs) error { } defer nsfsFilesystem.DecRef(ctx) k.nsfsMount = k.vfs.NewDisconnectedMount(nsfsFilesystem, nil, &vfs.MountOptions{}) + k.rootUserNamespace.SetInode(nsfs.NewInode(ctx, k.nsfsMount, k.rootUserNamespace)) k.rootNetworkNamespace.SetInode(nsfs.NewInode(ctx, k.nsfsMount, k.rootNetworkNamespace)) k.rootIPCNamespace.SetInode(nsfs.NewInode(ctx, k.nsfsMount, k.rootIPCNamespace)) k.rootUTSNamespace.SetInode(nsfs.NewInode(ctx, k.nsfsMount, k.rootUTSNamespace)) diff --git a/pkg/sentry/kernel/task_clone.go b/pkg/sentry/kernel/task_clone.go index 6d63786736..bce6683d60 100644 --- a/pkg/sentry/kernel/task_clone.go +++ b/pkg/sentry/kernel/task_clone.go @@ -23,6 +23,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/fsimpl/nsfs" "gvisor.dev/gvisor/pkg/sentry/inet" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/seccheck" pb "gvisor.dev/gvisor/pkg/sentry/seccheck/points/points_go_proto" "gvisor.dev/gvisor/pkg/sentry/vfs" @@ -133,6 +134,7 @@ func (t *Task) Clone(args *linux.CloneArgs) (ThreadID, *SyscallControl, error) { if err != nil { return 0, nil, err } + userns.SetInode(nsfs.NewInode(t, t.k.nsfsMount, userns)) } if args.Flags&(linux.CLONE_NEWPID|linux.CLONE_NEWNET|linux.CLONE_NEWUTS|linux.CLONE_NEWIPC) != 0 && !creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, userns) { return 0, nil, linuxerr.EPERM @@ -498,6 +500,7 @@ type namespaceSet struct { utsNS *UTSNamespace ipcNS *IPCNamespace mountNS *vfs.MountNamespace + userNS *auth.UserNamespace fsContext *FSContext } @@ -525,7 +528,7 @@ func (nss *namespaceSet) release(t *Task) { } func (nss *namespaceSet) initFromTask(t *Task, target *Task, flags int32) error { - supported := 
uint32(linux.CLONE_NEWPID | linux.CLONE_NEWNET | linux.CLONE_NEWUTS | linux.CLONE_NEWIPC | linux.CLONE_NEWNS) + supported := uint32(linux.CLONE_NEWPID | linux.CLONE_NEWNET | linux.CLONE_NEWUTS | linux.CLONE_NEWIPC | linux.CLONE_NEWNS | linux.CLONE_NEWUSER) if (uint32(flags) & ^supported) != 0 || flags == 0 { return linuxerr.EINVAL } @@ -533,6 +536,13 @@ func (nss *namespaceSet) initFromTask(t *Task, target *Task, flags int32) error return linuxerr.EPERM } + if flags&linux.CLONE_NEWUSER != 0 { + if target.ExitState() >= TaskExitInitiated { + return linuxerr.ESRCH + } + nss.userNS = target.Credentials().UserNamespace + } + if flags&linux.CLONE_NEWPID != 0 { nss.childPIDNS = target.tg.pidns if nss.childPIDNS == nil { @@ -620,6 +630,11 @@ func (nss *namespaceSet) initFromNS(ns vfs.Namespace, flags int32) error { } nss.mountNS = ns ns.IncRef() + case *auth.UserNamespace: + if flags != 0 && flags != linux.CLONE_NEWUSER { + return linuxerr.EINVAL + } + nss.userNS = ns default: return linuxerr.EINVAL } @@ -656,8 +671,31 @@ func (t *Task) Setns(fd *vfs.FileDescription, flags int32) error { return err } + creds := t.Credentials() + checkCreds := creds + if nss.userNS != nil { + if nss.userNS == creds.UserNamespace { + return linuxerr.EINVAL + } + if !creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, nss.userNS) { + return linuxerr.EPERM + } + t.tg.signalHandlers.mu.Lock() + if t.tg.tasksCount != 1 { + t.tg.signalHandlers.mu.Unlock() + return linuxerr.EINVAL + } + t.tg.signalHandlers.mu.Unlock() + fsContext := t.FSContext() + if fsContext.checkAndPreventSharingOutsideTG(t.tg) { + return linuxerr.EINVAL + } + defer fsContext.allowSharing() + checkCreds = creds.ForkIntoUserNamespace(nss.userNS) + } + if nss.childPIDNS != nil { - if !t.HasCapabilityIn(linux.CAP_SYS_ADMIN, nss.childPIDNS.UserNamespace()) || !t.HasSelfCapability(linux.CAP_SYS_ADMIN) { + if !checkCreds.HasCapabilityIn(linux.CAP_SYS_ADMIN, nss.childPIDNS.UserNamespace()) || 
!checkCreds.HasSelfCapability(linux.CAP_SYS_ADMIN) { return linuxerr.EPERM } // Allow setting the current or a child pid namespace. @@ -674,25 +712,25 @@ func (t *Task) Setns(fd *vfs.FileDescription, flags int32) error { } if nss.netNS != nil { - if !t.HasCapabilityIn(linux.CAP_SYS_ADMIN, nss.netNS.UserNamespace()) || !t.HasSelfCapability(linux.CAP_SYS_ADMIN) { + if !checkCreds.HasCapabilityIn(linux.CAP_SYS_ADMIN, nss.netNS.UserNamespace()) || !checkCreds.HasSelfCapability(linux.CAP_SYS_ADMIN) { return linuxerr.EPERM } } if nss.utsNS != nil { - if !t.HasCapabilityIn(linux.CAP_SYS_ADMIN, nss.utsNS.UserNamespace()) || !t.HasSelfCapability(linux.CAP_SYS_ADMIN) { + if !checkCreds.HasCapabilityIn(linux.CAP_SYS_ADMIN, nss.utsNS.UserNamespace()) || !checkCreds.HasSelfCapability(linux.CAP_SYS_ADMIN) { return linuxerr.EPERM } } if nss.ipcNS != nil { - if !t.HasCapabilityIn(linux.CAP_SYS_ADMIN, nss.ipcNS.UserNamespace()) || !t.HasSelfCapability(linux.CAP_SYS_ADMIN) { + if !checkCreds.HasCapabilityIn(linux.CAP_SYS_ADMIN, nss.ipcNS.UserNamespace()) || !checkCreds.HasSelfCapability(linux.CAP_SYS_ADMIN) { return linuxerr.EPERM } } if nss.mountNS != nil { - if !t.HasCapabilityIn(linux.CAP_SYS_ADMIN, nss.mountNS.UserNamespace()) || !t.HasSelfCapability(linux.CAP_SYS_CHROOT) || !t.HasSelfCapability(linux.CAP_SYS_ADMIN) { + if !checkCreds.HasCapabilityIn(linux.CAP_SYS_ADMIN, nss.mountNS.UserNamespace()) || !checkCreds.HasSelfCapability(linux.CAP_SYS_CHROOT) || !checkCreds.HasSelfCapability(linux.CAP_SYS_ADMIN) { return linuxerr.EPERM } oldFSContext := t.FSContext() @@ -711,6 +749,9 @@ func (t *Task) Setns(fd *vfs.FileDescription, flags int32) error { // Swap to new namespaces. // Store replaced resources in nss so that they're cleaned up by the deferred function. 
+ if nss.userNS != nil { + t.creds.Store(checkCreds) + } t.mu.Lock() if nss.childPIDNS != nil { t.childPIDNamespace, nss.childPIDNS = nss.childPIDNS, t.childPIDNamespace @@ -820,6 +861,7 @@ func (t *Task) Unshare(flags int32) error { if err != nil { return err } + newUserNS.SetInode(nsfs.NewInode(t, t.k.nsfsMount, newUserNS)) creds = t.Credentials().ForkIntoUserNamespace(newUserNS) newCreds = true } diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD index 58c9e6f3d2..e96f64171e 100644 --- a/test/syscalls/linux/BUILD +++ b/test/syscalls/linux/BUILD @@ -2510,8 +2510,10 @@ cc_binary( malloc = "//test/util:errno_safe_allocator", deps = select_gtest() + [ "//test/util:capability_util", + "//test/util:cleanup", "//test/util:file_descriptor", "//test/util:logging", + "//test/util:multiprocess_util", "//test/util:posix_error", "//test/util:test_main", "//test/util:test_util", diff --git a/test/syscalls/linux/setns.cc b/test/syscalls/linux/setns.cc index 5c625f7449..296a633ab1 100644 --- a/test/syscalls/linux/setns.cc +++ b/test/syscalls/linux/setns.cc @@ -13,11 +13,13 @@ // limitations under the License. 
#include +#include #include #include #include #include #include +#include #include @@ -25,9 +27,11 @@ #include "gtest/gtest.h" #include "absl/time/clock.h" #include "absl/time/time.h" +#include "test/util/cleanup.h" #include "test/util/file_descriptor.h" #include "test/util/linux_capability_util.h" #include "test/util/logging.h" +#include "test/util/multiprocess_util.h" #include "test/util/posix_error.h" #include "test/util/test_util.h" @@ -156,6 +160,44 @@ TEST(SetnsTest, ChangeMountNamespaceZeroFlags) { ASSERT_THAT(setns(nsfd.get(), 0), SyscallSucceedsWithValue(0)); } +TEST(SetnsTest, ChangeUserNamespace) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_ADMIN))); + + int pfd[2]; + ASSERT_THAT(pipe(pfd), SyscallSucceeds()); + FileDescriptor pipe_read(pfd[0]); + FileDescriptor pipe_write(pfd[1]); + + pid_t child = fork(); + ASSERT_THAT(child, SyscallSucceeds()); + if (child == 0) { + pipe_read.reset(); + TEST_CHECK_SUCCESS(unshare(CLONE_NEWUSER)); + TEST_CHECK_SUCCESS(write(pipe_write.get(), "R", 1)); + pause(); + _exit(0); + } + Cleanup cleanup([child] { + kill(child, SIGKILL); + kill(child, SIGCONT); + int status; + RetryEINTR(waitpid)(child, &status, 0); + }); + pipe_write.reset(); + + char buf; + ASSERT_THAT(read(pipe_read.get(), &buf, 1), SyscallSucceedsWithValue(1)); + + char nspath[PATH_MAX]; + snprintf(nspath, sizeof(nspath), "/proc/%d/ns/user", child); + const FileDescriptor nsfd = ASSERT_NO_ERRNO_AND_VALUE(Open(nspath, O_RDONLY)); + + EXPECT_THAT(InForkedProcess([&nsfd] { + TEST_CHECK_SUCCESS(setns(nsfd.get(), CLONE_NEWUSER)); + }), + IsPosixErrorOkAndHolds(0)); +} + } // namespace } // namespace testing } // namespace gvisor