Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion pkg/sentry/fsimpl/proc/task.go
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ func (fs *filesystem) newTaskInode(ctx context.Context, task *kernel.Task, pidns
"net": fs.newNamespaceSymlink(ctx, task, fs.NextIno(), linux.CLONE_NEWNET),
"mnt": fs.newNamespaceSymlink(ctx, task, fs.NextIno(), linux.CLONE_NEWNS),
"pid": fs.newNamespaceSymlink(ctx, task, fs.NextIno(), linux.CLONE_NEWPID),
"user": fs.newFakeNamespaceSymlink(ctx, task, fs.NextIno(), "user"),
"user": fs.newNamespaceSymlink(ctx, task, fs.NextIno(), linux.CLONE_NEWUSER),
"ipc": fs.newNamespaceSymlink(ctx, task, fs.NextIno(), linux.CLONE_NEWIPC),
"uts": fs.newNamespaceSymlink(ctx, task, fs.NextIno(), linux.CLONE_NEWUTS),
}),
Expand Down
7 changes: 7 additions & 0 deletions pkg/sentry/fsimpl/proc/task_files.go
Original file line number Diff line number Diff line change
Expand Up @@ -1461,6 +1461,13 @@ func (s *namespaceSymlink) getInode(t *kernel.Task) *nsfs.Inode {
return pidns.GetInode()
}
return nil
case linux.CLONE_NEWUSER:
inode, _ := t.UserNamespace().GetInode().(*nsfs.Inode)
if inode == nil {
return nil
}
inode.IncRef()
return inode
default:
panic("unknown namespace")
}
Expand Down
1 change: 1 addition & 0 deletions pkg/sentry/kernel/auth/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,7 @@ go_library(
"//pkg/errors/linuxerr",
"//pkg/log",
"//pkg/rand",
"//pkg/refs",
"//pkg/sentry/seccheck",
"//pkg/sentry/seccheck/points:points_go_proto",
"//pkg/sync",
Expand Down
31 changes: 31 additions & 0 deletions pkg/sentry/kernel/auth/user_namespace.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/errors/linuxerr"
"gvisor.dev/gvisor/pkg/refs"
)

// A UserNamespace represents a user namespace. See user_namespaces(7) for
Expand Down Expand Up @@ -60,6 +61,9 @@ type UserNamespace struct {

// setgroupsAllowed mirrors USERNS_SETGROUPS_ALLOWED in Linux. Protected by mu.
setgroupsAllowed bool

// inode is the nsfs inode associated with this namespace.
inode refs.TryRefCounter
}

// NewRootUserNamespace returns a UserNamespace that is appropriate for a
Expand Down Expand Up @@ -99,6 +103,33 @@ func (ns *UserNamespace) Root() *UserNamespace {
return ns
}

// Type implements vfs.Namespace.Type.
//
// Every user namespace reports the same fixed type name, "user".
func (ns *UserNamespace) Type() string {
	const userNamespaceType = "user"
	return userNamespaceType
}

// Destroy implements vfs.Namespace.Destroy.
//
// It is a no-op: the body is empty and ctx is unused.
func (ns *UserNamespace) Destroy(ctx context.Context) {}

// UserNamespace implements vfs.Namespace.UserNamespace.
//
// It returns the receiver itself.
func (ns *UserNamespace) UserNamespace() *UserNamespace {
return ns
}

// SetInode records inode as the nsfs inode associated with ns.
//
// The field is written while holding ns.mu. Any previously stored inode is
// simply overwritten; SetInode does not take or drop references itself.
func (ns *UserNamespace) SetInode(inode refs.TryRefCounter) {
	ns.mu.Lock()
	ns.inode = inode
	ns.mu.Unlock()
}

// GetInode returns the nsfs inode associated with ns, i.e. the value most
// recently stored by SetInode.
//
// The field is read while holding ns.mu. GetInode does not take a reference
// on the returned inode; a caller that needs one must acquire it itself.
func (ns *UserNamespace) GetInode() refs.TryRefCounter {
	ns.mu.Lock()
	current := ns.inode
	ns.mu.Unlock()
	return current
}

// "The kernel imposes (since version 3.11) a limit of 32 nested levels of user
// namespaces." - user_namespaces(7)
const maxUserNamespaceDepth = 32
Expand Down
1 change: 1 addition & 0 deletions pkg/sentry/kernel/kernel.go
Original file line number Diff line number Diff line change
Expand Up @@ -558,6 +558,7 @@ func (k *Kernel) Init(args InitKernelArgs) error {
}
defer nsfsFilesystem.DecRef(ctx)
k.nsfsMount = k.vfs.NewDisconnectedMount(nsfsFilesystem, nil, &vfs.MountOptions{})
k.rootUserNamespace.SetInode(nsfs.NewInode(ctx, k.nsfsMount, k.rootUserNamespace))
k.rootNetworkNamespace.SetInode(nsfs.NewInode(ctx, k.nsfsMount, k.rootNetworkNamespace))
k.rootIPCNamespace.SetInode(nsfs.NewInode(ctx, k.nsfsMount, k.rootIPCNamespace))
k.rootUTSNamespace.SetInode(nsfs.NewInode(ctx, k.nsfsMount, k.rootUTSNamespace))
Expand Down
54 changes: 48 additions & 6 deletions pkg/sentry/kernel/task_clone.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ import (
"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
"gvisor.dev/gvisor/pkg/sentry/fsimpl/nsfs"
"gvisor.dev/gvisor/pkg/sentry/inet"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
"gvisor.dev/gvisor/pkg/sentry/seccheck"
pb "gvisor.dev/gvisor/pkg/sentry/seccheck/points/points_go_proto"
"gvisor.dev/gvisor/pkg/sentry/vfs"
Expand Down Expand Up @@ -133,6 +134,7 @@ func (t *Task) Clone(args *linux.CloneArgs) (ThreadID, *SyscallControl, error) {
if err != nil {
return 0, nil, err
}
userns.SetInode(nsfs.NewInode(t, t.k.nsfsMount, userns))
}
if args.Flags&(linux.CLONE_NEWPID|linux.CLONE_NEWNET|linux.CLONE_NEWUTS|linux.CLONE_NEWIPC) != 0 && !creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, userns) {
return 0, nil, linuxerr.EPERM
Expand Down Expand Up @@ -498,6 +500,7 @@ type namespaceSet struct {
utsNS *UTSNamespace
ipcNS *IPCNamespace
mountNS *vfs.MountNamespace
userNS *auth.UserNamespace

fsContext *FSContext
}
Expand Down Expand Up @@ -525,14 +528,21 @@ func (nss *namespaceSet) release(t *Task) {
}

func (nss *namespaceSet) initFromTask(t *Task, target *Task, flags int32) error {
supported := uint32(linux.CLONE_NEWPID | linux.CLONE_NEWNET | linux.CLONE_NEWUTS | linux.CLONE_NEWIPC | linux.CLONE_NEWNS)
supported := uint32(linux.CLONE_NEWPID | linux.CLONE_NEWNET | linux.CLONE_NEWUTS | linux.CLONE_NEWIPC | linux.CLONE_NEWNS | linux.CLONE_NEWUSER)
if (uint32(flags) & ^supported) != 0 || flags == 0 {
return linuxerr.EINVAL
}
if !t.CanTrace(target, false) {
return linuxerr.EPERM
}

if flags&linux.CLONE_NEWUSER != 0 {
if target.ExitState() >= TaskExitInitiated {
return linuxerr.ESRCH
}
nss.userNS = target.Credentials().UserNamespace
}

if flags&linux.CLONE_NEWPID != 0 {
nss.childPIDNS = target.tg.pidns
if nss.childPIDNS == nil {
Expand Down Expand Up @@ -620,6 +630,11 @@ func (nss *namespaceSet) initFromNS(ns vfs.Namespace, flags int32) error {
}
nss.mountNS = ns
ns.IncRef()
case *auth.UserNamespace:
if flags != 0 && flags != linux.CLONE_NEWUSER {
return linuxerr.EINVAL
}
nss.userNS = ns
default:
return linuxerr.EINVAL
}
Expand Down Expand Up @@ -656,8 +671,31 @@ func (t *Task) Setns(fd *vfs.FileDescription, flags int32) error {
return err
}

creds := t.Credentials()
checkCreds := creds
if nss.userNS != nil {
if nss.userNS == creds.UserNamespace {
return linuxerr.EINVAL
}
if !creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, nss.userNS) {
return linuxerr.EPERM
}
t.tg.signalHandlers.mu.Lock()
if t.tg.tasksCount != 1 {
t.tg.signalHandlers.mu.Unlock()
return linuxerr.EINVAL
}
t.tg.signalHandlers.mu.Unlock()
fsContext := t.FSContext()
if fsContext.checkAndPreventSharingOutsideTG(t.tg) {
return linuxerr.EINVAL
}
defer fsContext.allowSharing()
checkCreds = creds.ForkIntoUserNamespace(nss.userNS)
}

if nss.childPIDNS != nil {
if !t.HasCapabilityIn(linux.CAP_SYS_ADMIN, nss.childPIDNS.UserNamespace()) || !t.HasSelfCapability(linux.CAP_SYS_ADMIN) {
if !checkCreds.HasCapabilityIn(linux.CAP_SYS_ADMIN, nss.childPIDNS.UserNamespace()) || !checkCreds.HasSelfCapability(linux.CAP_SYS_ADMIN) {
return linuxerr.EPERM
}
// Allow setting the current or a child pid namespace.
Expand All @@ -674,25 +712,25 @@ func (t *Task) Setns(fd *vfs.FileDescription, flags int32) error {
}

if nss.netNS != nil {
if !t.HasCapabilityIn(linux.CAP_SYS_ADMIN, nss.netNS.UserNamespace()) || !t.HasSelfCapability(linux.CAP_SYS_ADMIN) {
if !checkCreds.HasCapabilityIn(linux.CAP_SYS_ADMIN, nss.netNS.UserNamespace()) || !checkCreds.HasSelfCapability(linux.CAP_SYS_ADMIN) {
return linuxerr.EPERM
}
}

if nss.utsNS != nil {
if !t.HasCapabilityIn(linux.CAP_SYS_ADMIN, nss.utsNS.UserNamespace()) || !t.HasSelfCapability(linux.CAP_SYS_ADMIN) {
if !checkCreds.HasCapabilityIn(linux.CAP_SYS_ADMIN, nss.utsNS.UserNamespace()) || !checkCreds.HasSelfCapability(linux.CAP_SYS_ADMIN) {
return linuxerr.EPERM
}
}

if nss.ipcNS != nil {
if !t.HasCapabilityIn(linux.CAP_SYS_ADMIN, nss.ipcNS.UserNamespace()) || !t.HasSelfCapability(linux.CAP_SYS_ADMIN) {
if !checkCreds.HasCapabilityIn(linux.CAP_SYS_ADMIN, nss.ipcNS.UserNamespace()) || !checkCreds.HasSelfCapability(linux.CAP_SYS_ADMIN) {
return linuxerr.EPERM
}
}

if nss.mountNS != nil {
if !t.HasCapabilityIn(linux.CAP_SYS_ADMIN, nss.mountNS.UserNamespace()) || !t.HasSelfCapability(linux.CAP_SYS_CHROOT) || !t.HasSelfCapability(linux.CAP_SYS_ADMIN) {
if !checkCreds.HasCapabilityIn(linux.CAP_SYS_ADMIN, nss.mountNS.UserNamespace()) || !checkCreds.HasSelfCapability(linux.CAP_SYS_CHROOT) || !checkCreds.HasSelfCapability(linux.CAP_SYS_ADMIN) {
return linuxerr.EPERM
}
oldFSContext := t.FSContext()
Expand All @@ -711,6 +749,9 @@ func (t *Task) Setns(fd *vfs.FileDescription, flags int32) error {

// Swap to new namespaces.
// Store replaced resources in nss so that they're cleaned up by the deferred function.
if nss.userNS != nil {
t.creds.Store(checkCreds)
}
t.mu.Lock()
if nss.childPIDNS != nil {
t.childPIDNamespace, nss.childPIDNS = nss.childPIDNS, t.childPIDNamespace
Expand Down Expand Up @@ -820,6 +861,7 @@ func (t *Task) Unshare(flags int32) error {
if err != nil {
return err
}
newUserNS.SetInode(nsfs.NewInode(t, t.k.nsfsMount, newUserNS))
creds = t.Credentials().ForkIntoUserNamespace(newUserNS)
newCreds = true
}
Expand Down
2 changes: 2 additions & 0 deletions test/syscalls/linux/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -2510,8 +2510,10 @@ cc_binary(
malloc = "//test/util:errno_safe_allocator",
deps = select_gtest() + [
"//test/util:capability_util",
"//test/util:cleanup",
"//test/util:file_descriptor",
"//test/util:logging",
"//test/util:multiprocess_util",
"//test/util:posix_error",
"//test/util:test_main",
"//test/util:test_util",
Expand Down
42 changes: 42 additions & 0 deletions test/syscalls/linux/setns.cc
Original file line number Diff line number Diff line change
Expand Up @@ -13,21 +13,25 @@
// limitations under the License.

#include <linux/prctl.h>
#include <limits.h>
#include <sched.h>
#include <signal.h>
#include <stdio.h>
#include <sys/prctl.h>
#include <sys/wait.h>
#include <unistd.h>

#include <cstdint>

#include "gmock/gmock.h"
#include "gtest/gtest.h"
#include "absl/time/clock.h"
#include "absl/time/time.h"
#include "test/util/cleanup.h"
#include "test/util/file_descriptor.h"
#include "test/util/linux_capability_util.h"
#include "test/util/logging.h"
#include "test/util/multiprocess_util.h"
#include "test/util/posix_error.h"
#include "test/util/test_util.h"

Expand Down Expand Up @@ -156,6 +160,44 @@ TEST(SetnsTest, ChangeMountNamespaceZeroFlags) {
ASSERT_THAT(setns(nsfd.get(), 0), SyscallSucceedsWithValue(0));
}

// Checks that a process can join another process's user namespace by calling
// setns(2) on a file descriptor opened from /proc/<pid>/ns/user.
TEST(SetnsTest, ChangeUserNamespace) {
  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_ADMIN)));

  // The pipe is used by the child to announce that it has finished
  // unsharing its user namespace.
  int fds[2];
  ASSERT_THAT(pipe(fds), SyscallSucceeds());
  FileDescriptor ready_rx(fds[0]);
  FileDescriptor ready_tx(fds[1]);

  pid_t child_pid = fork();
  ASSERT_THAT(child_pid, SyscallSucceeds());
  if (child_pid == 0) {
    // Child: enter a fresh user namespace, notify the parent, then stay
    // alive so that the parent can open this process's ns/user entry.
    ready_rx.reset();
    TEST_CHECK_SUCCESS(unshare(CLONE_NEWUSER));
    TEST_CHECK_SUCCESS(write(ready_tx.get(), "R", 1));
    pause();
    _exit(0);
  }
  // Always terminate and reap the child when the test scope exits.
  // NOTE(review): the SIGCONT after SIGKILL presumably guards against the
  // child being stopped - confirm.
  Cleanup reap_child([child_pid] {
    kill(child_pid, SIGKILL);
    kill(child_pid, SIGCONT);
    int wstatus;
    RetryEINTR(waitpid)(child_pid, &wstatus, 0);
  });
  ready_tx.reset();

  // Block until the child has written its one-byte readiness marker.
  char ready;
  ASSERT_THAT(read(ready_rx.get(), &ready, 1), SyscallSucceedsWithValue(1));

  char ns_path[PATH_MAX];
  snprintf(ns_path, sizeof(ns_path), "/proc/%d/ns/user", child_pid);
  const FileDescriptor user_ns_fd =
      ASSERT_NO_ERRNO_AND_VALUE(Open(ns_path, O_RDONLY));

  // The setns call runs in a separate forked process rather than in the
  // test process itself.
  const auto join_user_ns = [&user_ns_fd] {
    TEST_CHECK_SUCCESS(setns(user_ns_fd.get(), CLONE_NEWUSER));
  };
  EXPECT_THAT(InForkedProcess(join_user_ns), IsPosixErrorOkAndHolds(0));
}

} // namespace
} // namespace testing
} // namespace gvisor