diff --git a/pkg/sentry/fsimpl/proc/task.go b/pkg/sentry/fsimpl/proc/task.go
index 6e8ef648ef..e820a74508 100644
--- a/pkg/sentry/fsimpl/proc/task.go
+++ b/pkg/sentry/fsimpl/proc/task.go
@@ -79,7 +79,7 @@ func (fs *filesystem) newTaskInode(ctx context.Context, task *kernel.Task, pidns
 			"net":  fs.newNamespaceSymlink(ctx, task, fs.NextIno(), linux.CLONE_NEWNET),
 			"mnt":  fs.newNamespaceSymlink(ctx, task, fs.NextIno(), linux.CLONE_NEWNS),
 			"pid":  fs.newNamespaceSymlink(ctx, task, fs.NextIno(), linux.CLONE_NEWPID),
-			"user": fs.newFakeNamespaceSymlink(ctx, task, fs.NextIno(), "user"),
+			"user": fs.newNamespaceSymlink(ctx, task, fs.NextIno(), linux.CLONE_NEWUSER),
 			"ipc":  fs.newNamespaceSymlink(ctx, task, fs.NextIno(), linux.CLONE_NEWIPC),
 			"uts":  fs.newNamespaceSymlink(ctx, task, fs.NextIno(), linux.CLONE_NEWUTS),
 		}),
diff --git a/pkg/sentry/fsimpl/proc/task_files.go b/pkg/sentry/fsimpl/proc/task_files.go
index 8c4cff84aa..e0a336cb82 100644
--- a/pkg/sentry/fsimpl/proc/task_files.go
+++ b/pkg/sentry/fsimpl/proc/task_files.go
@@ -1461,6 +1461,13 @@ func (s *namespaceSymlink) getInode(t *kernel.Task) *nsfs.Inode {
 			return pidns.GetInode()
 		}
 		return nil
+	case linux.CLONE_NEWUSER:
+		inode, _ := t.UserNamespace().GetInode().(*nsfs.Inode) // inode stays nil when no *nsfs.Inode is set.
+		if inode == nil {
+			return nil
+		}
+		inode.IncRef() // The inode is returned with a reference taken here.
+		return inode
 	default:
 		panic("unknown namespace")
 	}
diff --git a/pkg/sentry/kernel/auth/BUILD b/pkg/sentry/kernel/auth/BUILD
index 4a6760b651..fd238f4737 100644
--- a/pkg/sentry/kernel/auth/BUILD
+++ b/pkg/sentry/kernel/auth/BUILD
@@ -99,6 +99,7 @@ go_library(
         "//pkg/errors/linuxerr",
         "//pkg/log",
         "//pkg/rand",
+        "//pkg/refs",
         "//pkg/sentry/seccheck",
         "//pkg/sentry/seccheck/points:points_go_proto",
         "//pkg/sync",
diff --git a/pkg/sentry/kernel/auth/user_namespace.go b/pkg/sentry/kernel/auth/user_namespace.go
index 69fa7d7d46..d90faed946 100644
--- a/pkg/sentry/kernel/auth/user_namespace.go
+++ b/pkg/sentry/kernel/auth/user_namespace.go
@@ -20,6 +20,7 @@ import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/errors/linuxerr"
+	"gvisor.dev/gvisor/pkg/refs"
 )
 
 // A UserNamespace represents a user namespace. See user_namespaces(7) for
@@ -60,6 +61,9 @@ type UserNamespace struct {
 
 	// setgroupsAllowed mirrors USERNS_SETGROUPS_ALLOWED in Linux. Protected by mu.
 	setgroupsAllowed bool
+
+	// inode is the nsfs inode associated with this namespace. Protected by mu.
+	inode refs.TryRefCounter
 }
 
 // NewRootUserNamespace returns a UserNamespace that is appropriate for a
@@ -99,6 +103,33 @@ func (ns *UserNamespace) Root() *UserNamespace {
 	return ns
 }
 
+// Type implements vfs.Namespace.Type. It always returns "user".
+func (ns *UserNamespace) Type() string {
+	return "user"
+}
+
+// Destroy implements vfs.Namespace.Destroy. It is a no-op.
+func (ns *UserNamespace) Destroy(ctx context.Context) {}
+
+// UserNamespace implements vfs.Namespace.UserNamespace. It returns ns itself.
+func (ns *UserNamespace) UserNamespace() *UserNamespace {
+	return ns
+}
+
+// SetInode sets the nsfs inode associated with ns, replacing any previous value.
+func (ns *UserNamespace) SetInode(inode refs.TryRefCounter) {
+	ns.mu.Lock()
+	defer ns.mu.Unlock()
+	ns.inode = inode
+}
+
+// GetInode returns the nsfs inode associated with ns without taking a reference; it is nil if unset.
+func (ns *UserNamespace) GetInode() refs.TryRefCounter {
+	ns.mu.Lock()
+	defer ns.mu.Unlock()
+	return ns.inode
+}
+
 // "The kernel imposes (since version 3.11) a limit of 32 nested levels of user
 // namespaces." - user_namespaces(7)
 const maxUserNamespaceDepth = 32
diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go
index 1a07719e60..a5c1bb2f4f 100644
--- a/pkg/sentry/kernel/kernel.go
+++ b/pkg/sentry/kernel/kernel.go
@@ -558,6 +558,7 @@ func (k *Kernel) Init(args InitKernelArgs) error {
 	}
 	defer nsfsFilesystem.DecRef(ctx)
 	k.nsfsMount = k.vfs.NewDisconnectedMount(nsfsFilesystem, nil, &vfs.MountOptions{})
+	k.rootUserNamespace.SetInode(nsfs.NewInode(ctx, k.nsfsMount, k.rootUserNamespace))
 	k.rootNetworkNamespace.SetInode(nsfs.NewInode(ctx, k.nsfsMount, k.rootNetworkNamespace))
 	k.rootIPCNamespace.SetInode(nsfs.NewInode(ctx, k.nsfsMount, k.rootIPCNamespace))
 	k.rootUTSNamespace.SetInode(nsfs.NewInode(ctx, k.nsfsMount, k.rootUTSNamespace))
diff --git a/pkg/sentry/kernel/task_clone.go b/pkg/sentry/kernel/task_clone.go
index 6d63786736..bce6683d60 100644
--- a/pkg/sentry/kernel/task_clone.go
+++ b/pkg/sentry/kernel/task_clone.go
@@ -23,6 +23,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
 	"gvisor.dev/gvisor/pkg/sentry/fsimpl/nsfs"
 	"gvisor.dev/gvisor/pkg/sentry/inet"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/seccheck"
 	pb "gvisor.dev/gvisor/pkg/sentry/seccheck/points/points_go_proto"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
@@ -133,6 +134,7 @@ func (t *Task) Clone(args *linux.CloneArgs) (ThreadID, *SyscallControl, error) {
 		if err != nil {
 			return 0, nil, err
 		}
+		userns.SetInode(nsfs.NewInode(t, t.k.nsfsMount, userns))
 	}
 	if args.Flags&(linux.CLONE_NEWPID|linux.CLONE_NEWNET|linux.CLONE_NEWUTS|linux.CLONE_NEWIPC) != 0 && !creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, userns) {
 		return 0, nil, linuxerr.EPERM
@@ -498,6 +500,7 @@ type namespaceSet struct {
 	utsNS      *UTSNamespace
 	ipcNS      *IPCNamespace
 	mountNS    *vfs.MountNamespace
+	userNS     *auth.UserNamespace
 	fsContext  *FSContext
 }
 
@@ -525,7 +528,7 @@ func (nss *namespaceSet) release(t *Task) {
 }
 
 func (nss *namespaceSet) initFromTask(t *Task, target *Task, flags int32) error {
-	supported := uint32(linux.CLONE_NEWPID | linux.CLONE_NEWNET | linux.CLONE_NEWUTS | linux.CLONE_NEWIPC | linux.CLONE_NEWNS)
+	supported := uint32(linux.CLONE_NEWPID | linux.CLONE_NEWNET | linux.CLONE_NEWUTS | linux.CLONE_NEWIPC | linux.CLONE_NEWNS | linux.CLONE_NEWUSER)
 	if (uint32(flags) & ^supported) != 0 || flags == 0 {
 		return linuxerr.EINVAL
 	}
@@ -533,6 +536,13 @@ func (nss *namespaceSet) initFromTask(t *Task, target *Task, flags int32) error
 		return linuxerr.EPERM
 	}
 
+	if flags&linux.CLONE_NEWUSER != 0 {
+		if target.ExitState() >= TaskExitInitiated {
+			return linuxerr.ESRCH // target has at least initiated its exit.
+		}
+		nss.userNS = target.Credentials().UserNamespace
+	}
+
 	if flags&linux.CLONE_NEWPID != 0 {
 		nss.childPIDNS = target.tg.pidns
 		if nss.childPIDNS == nil {
@@ -620,6 +630,11 @@ func (nss *namespaceSet) initFromNS(ns vfs.Namespace, flags int32) error {
 		}
 		nss.mountNS = ns
 		ns.IncRef()
+	case *auth.UserNamespace:
+		if flags != 0 && flags != linux.CLONE_NEWUSER { // Non-zero flags must name exactly this namespace type.
+			return linuxerr.EINVAL
+		}
+		nss.userNS = ns
 	default:
 		return linuxerr.EINVAL
 	}
@@ -656,8 +671,31 @@ func (t *Task) Setns(fd *vfs.FileDescription, flags int32) error {
 		return err
 	}
 
+	creds := t.Credentials()
+	checkCreds := creds // Credentials used by the per-namespace capability checks below.
+	if nss.userNS != nil {
+		if nss.userNS == creds.UserNamespace { // The caller is already in the target user namespace.
+			return linuxerr.EINVAL
+		}
+		if !creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, nss.userNS) {
+			return linuxerr.EPERM
+		}
+		t.tg.signalHandlers.mu.Lock()
+		if t.tg.tasksCount != 1 { // The caller must be the only task in its thread group.
+			t.tg.signalHandlers.mu.Unlock()
+			return linuxerr.EINVAL
+		}
+		t.tg.signalHandlers.mu.Unlock()
+		fsContext := t.FSContext()
+		if fsContext.checkAndPreventSharingOutsideTG(t.tg) { // NOTE(review): presumably true when the FSContext is shared outside t.tg - confirm.
+			return linuxerr.EINVAL
+		}
+		defer fsContext.allowSharing()
+		checkCreds = creds.ForkIntoUserNamespace(nss.userNS) // The remaining checks use the post-switch credentials.
+	}
+
 	if nss.childPIDNS != nil {
-		if !t.HasCapabilityIn(linux.CAP_SYS_ADMIN, nss.childPIDNS.UserNamespace()) || !t.HasSelfCapability(linux.CAP_SYS_ADMIN) {
+		if !checkCreds.HasCapabilityIn(linux.CAP_SYS_ADMIN, nss.childPIDNS.UserNamespace()) || !checkCreds.HasSelfCapability(linux.CAP_SYS_ADMIN) {
 			return linuxerr.EPERM
 		}
 		// Allow setting the current or a child pid namespace.
@@ -674,25 +712,25 @@ func (t *Task) Setns(fd *vfs.FileDescription, flags int32) error {
 	}
 
 	if nss.netNS != nil {
-		if !t.HasCapabilityIn(linux.CAP_SYS_ADMIN, nss.netNS.UserNamespace()) || !t.HasSelfCapability(linux.CAP_SYS_ADMIN) {
+		if !checkCreds.HasCapabilityIn(linux.CAP_SYS_ADMIN, nss.netNS.UserNamespace()) || !checkCreds.HasSelfCapability(linux.CAP_SYS_ADMIN) {
 			return linuxerr.EPERM
 		}
 	}
 
 	if nss.utsNS != nil {
-		if !t.HasCapabilityIn(linux.CAP_SYS_ADMIN, nss.utsNS.UserNamespace()) || !t.HasSelfCapability(linux.CAP_SYS_ADMIN) {
+		if !checkCreds.HasCapabilityIn(linux.CAP_SYS_ADMIN, nss.utsNS.UserNamespace()) || !checkCreds.HasSelfCapability(linux.CAP_SYS_ADMIN) {
 			return linuxerr.EPERM
 		}
 	}
 
 	if nss.ipcNS != nil {
-		if !t.HasCapabilityIn(linux.CAP_SYS_ADMIN, nss.ipcNS.UserNamespace()) || !t.HasSelfCapability(linux.CAP_SYS_ADMIN) {
+		if !checkCreds.HasCapabilityIn(linux.CAP_SYS_ADMIN, nss.ipcNS.UserNamespace()) || !checkCreds.HasSelfCapability(linux.CAP_SYS_ADMIN) {
 			return linuxerr.EPERM
 		}
 	}
 
 	if nss.mountNS != nil {
-		if !t.HasCapabilityIn(linux.CAP_SYS_ADMIN, nss.mountNS.UserNamespace()) || !t.HasSelfCapability(linux.CAP_SYS_CHROOT) || !t.HasSelfCapability(linux.CAP_SYS_ADMIN) {
+		if !checkCreds.HasCapabilityIn(linux.CAP_SYS_ADMIN, nss.mountNS.UserNamespace()) || !checkCreds.HasSelfCapability(linux.CAP_SYS_CHROOT) || !checkCreds.HasSelfCapability(linux.CAP_SYS_ADMIN) {
 			return linuxerr.EPERM
 		}
 		oldFSContext := t.FSContext()
@@ -711,6 +749,9 @@ func (t *Task) Setns(fd *vfs.FileDescription, flags int32) error {
 
 	// Swap to new namespaces.
 	// Store replaced resources in nss so that they're cleaned up by the deferred function.
+ if nss.userNS != nil { + t.creds.Store(checkCreds) + } t.mu.Lock() if nss.childPIDNS != nil { t.childPIDNamespace, nss.childPIDNS = nss.childPIDNS, t.childPIDNamespace @@ -820,6 +861,7 @@ func (t *Task) Unshare(flags int32) error { if err != nil { return err } + newUserNS.SetInode(nsfs.NewInode(t, t.k.nsfsMount, newUserNS)) creds = t.Credentials().ForkIntoUserNamespace(newUserNS) newCreds = true } diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD index 58c9e6f3d2..e96f64171e 100644 --- a/test/syscalls/linux/BUILD +++ b/test/syscalls/linux/BUILD @@ -2510,8 +2510,10 @@ cc_binary( malloc = "//test/util:errno_safe_allocator", deps = select_gtest() + [ "//test/util:capability_util", + "//test/util:cleanup", "//test/util:file_descriptor", "//test/util:logging", + "//test/util:multiprocess_util", "//test/util:posix_error", "//test/util:test_main", "//test/util:test_util", diff --git a/test/syscalls/linux/setns.cc b/test/syscalls/linux/setns.cc index 5c625f7449..296a633ab1 100644 --- a/test/syscalls/linux/setns.cc +++ b/test/syscalls/linux/setns.cc @@ -13,11 +13,13 @@ // limitations under the License. 
 
 #include
+#include
 #include
 #include
 #include
 #include
 #include
+#include
 #include
 
 
@@ -25,9 +27,11 @@
 #include "gtest/gtest.h"
 #include "absl/time/clock.h"
 #include "absl/time/time.h"
+#include "test/util/cleanup.h"
 #include "test/util/file_descriptor.h"
 #include "test/util/linux_capability_util.h"
 #include "test/util/logging.h"
+#include "test/util/multiprocess_util.h"
 #include "test/util/posix_error.h"
 #include "test/util/test_util.h"
 
@@ -156,6 +160,44 @@ TEST(SetnsTest, ChangeMountNamespaceZeroFlags) {
   ASSERT_THAT(setns(nsfd.get(), 0), SyscallSucceedsWithValue(0));
 }
 
+TEST(SetnsTest, ChangeUserNamespace) {
+  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_ADMIN)));
+
+  int pfd[2];
+  ASSERT_THAT(pipe(pfd), SyscallSucceeds());
+  FileDescriptor pipe_read(pfd[0]);
+  FileDescriptor pipe_write(pfd[1]);
+
+  pid_t child = fork();
+  ASSERT_THAT(child, SyscallSucceeds());
+  if (child == 0) {
+    pipe_read.reset();  // The child only writes to the pipe.
+    TEST_CHECK_SUCCESS(unshare(CLONE_NEWUSER));  // Child enters a new userns.
+    TEST_CHECK_SUCCESS(write(pipe_write.get(), "R", 1));  // Signal readiness.
+    pause();  // Wait for a signal; the Cleanup below sends SIGKILL.
+    _exit(0);
+  }
+  Cleanup cleanup([child] {
+    kill(child, SIGKILL);
+    kill(child, SIGCONT);
+    int status;
+    RetryEINTR(waitpid)(child, &status, 0);
+  });
+  pipe_write.reset();  // The parent only reads from the pipe.
+
+  char buf;  // Receives the child's one-byte readiness message.
+  ASSERT_THAT(read(pipe_read.get(), &buf, 1), SyscallSucceedsWithValue(1));
+
+  char nspath[PATH_MAX];
+  snprintf(nspath, sizeof(nspath), "/proc/%d/ns/user", child);
+  const FileDescriptor nsfd = ASSERT_NO_ERRNO_AND_VALUE(Open(nspath, O_RDONLY));
+
+  EXPECT_THAT(InForkedProcess([&nsfd] {
+                TEST_CHECK_SUCCESS(setns(nsfd.get(), CLONE_NEWUSER));
+              }),
+              IsPosixErrorOkAndHolds(0));
+}
+
 }  // namespace
 }  // namespace testing
 }  // namespace gvisor