# Security model with sandboxing:
#
# Buffer processes are the most security-sensitive, since they parse
# various resources retrieved from the network (CSS, HTML) and sometimes
# even execute untrusted code (JS, with an engine written in C). So the
# main goal is to give buffers as few permissions as possible.
#
# On FreeBSD, we create a file descriptor to the directory sockets
# reside in, and then use that for manipulating our sockets.
#
# Capsicum does not enable more fine-grained capability control, but
# in practice the things it does enable should not be enough to harm the
# user's system.
#
# On OpenBSD, we pledge the minimum amount of promises we need, and
# do not unveil anything. It seems to be roughly equivalent to the
# security we get with FreeBSD Capsicum.
#
# On Linux, we use libseccomp so that I don't have to manually write
# BPF filters.
# Sandboxing on Linux is at the moment slightly less safe than on the
# two BSDs, because a rogue buffer could in theory connect to whatever
# open UNIX domain socket on the system that the user has access to.
#TODO look into integrating Landlock to fix this.
#
# We do not have OS-level sandboxing on other systems (yet).
#
# Aside from sandboxing in buffer processes, we also have a more
# restrictive "network" sandbox that is intended for CGI processes that
# just read/write from/to the network and stdin/stdout. At the moment this
# is only used in the HTTP process.
#TODO add it to more CGI scripts

# Compile with -d:disableSandbox to force SandboxMode to stNone.
const disableSandbox {.booldefine.} = false

type SandboxType* = enum
  ## Sandboxing mechanism selected at compile time; the string value of
  ## each member is a human-readable name for it.
  stNone = "no sandbox"
  stCapsicum = "capsicum"
  stPledge = "pledge"
  stLibSeccomp = "libseccomp"

# The mechanism this build uses, picked from the target OS unless
# sandboxing was explicitly disabled.
const SandboxMode* = when disableSandbox:
  stNone
elif defined(freebsd):
  stCapsicum
elif defined(openbsd):
  stPledge
elif defined(linux):
  stLibSeccomp
else:
  stNone

when SandboxMode == stCapsicum:
  import bindings/capsicum

  proc enterBufferSandbox*(sockPath: string) =
    ## Enters Capsicum capability mode for a buffer process.
    ## `sockPath` is not used by this implementation.
    # per man:cap_enter(2), it may return ENOSYS if the kernel was compiled
    # without CAPABILITY_MODE. So it seems better not to panic in this case.
    # (But TODO: when we get enough sandboxing coverage it should print a
    # warning or something.)
    discard cap_enter()

  proc enterNetworkSandbox*() =
    ## Enters Capsicum capability mode for a network (CGI) process.
    # no difference between buffer; Capsicum is quite straightforward
    # to use in this regard.
    discard cap_enter()

elif SandboxMode == stPledge:
  import bindings/pledge

  proc enterBufferSandbox*(sockPath: string) =
    ## Restricts a buffer process with pledge(2); asserts that the call
    ## succeeds. `sockPath` is not used by this implementation.
    # take whatever we need to
    # * fork
    # * connect to UNIX domain sockets
    # * take FDs from the main process
    doAssert pledge("unix stdio sendfd recvfd proc", nil) == 0

  proc enterNetworkSandbox*() =
    ## Restricts a network (CGI) process with pledge(2); asserts that the
    ## call succeeds.
    # we don't need much to write out data from sockets to stdout.
    doAssert pledge("stdio", nil) == 0

elif SandboxMode == stLibSeccomp:
  import std/posix
  import bindings/libseccomp

  when defined(android):
    # Both constants are provided by <sys/prctl.h>.
    let PR_SET_VMA {.importc, header: "<sys/prctl.h>", nodecl.}: cint
    let PR_SET_VMA_ANON_NAME {.importc, header: "<sys/prctl.h>", nodecl.}: cint

    proc allowBionic(ctx: scmp_filter_ctx) =
      ## Adds the extra allow rules needed when running on bionic libc.
      # Things needed for bionic libc. Tested with Termux.
      const androidAllowList = [
        cstring"rt_sigprocmask",
        "epoll_pwait",
        "madvise"
      ]
      for it in androidAllowList:
        let syscall = seccomp_syscall_resolve_name(it)
        doAssert seccomp_rule_add(ctx, SCMP_ACT_ALLOW, syscall, 0) == 0
      # bionic likes to set this very much. In fact, it was added to
      # the kernel by Android devs.
      block allowAnonVMAName:
        let syscall = seccomp_syscall_resolve_name("prctl")
        let arg0 = scmp_arg_cmp(
          arg: 0, # op
          op: SCMP_CMP_EQ, # equals
          datum_a: uint64(PR_SET_VMA)
        )
        let arg1 = scmp_arg_cmp(
          arg: 1, # attr
          op: SCMP_CMP_EQ, # equals
          datum_a: uint64(PR_SET_VMA_ANON_NAME)
        )
        doAssert seccomp_rule_add(ctx, SCMP_ACT_ALLOW, syscall, 2, arg0,
          arg1) == 0
      # We have to be careful with this one; PROT_EXEC will happily set
      # memory as executable, which is certainly not what we want.
      # Now, bionic seems to be calling this from mutate(), ergo we
      # should be fine just allowing PROT_READ and PROT_READ | PROT_WRITE.
      block allowMprotect:
        let syscall = seccomp_syscall_resolve_name("mprotect")
        let arg2 = scmp_arg_cmp(
          arg: 2, # attr
          op: SCMP_CMP_LE, # less than or equals
          datum_a: 3 # PROT_READ | PROT_WRITE
        )
        # Note that libseccomp can't really express multiple comparisons.
        # However, we are lucky, and we only have to "excessively" allow
        # PROT_WRITE (w/o PROT_READ) and PROT_NONE, which does no harm.
        doAssert seccomp_rule_add(ctx, SCMP_ACT_ALLOW, syscall, 1, arg2) == 0

  proc blockStat(ctx: scmp_filter_ctx) =
    ## Makes the fstat family of syscalls fail with errno 1 (EPERM)
    ## instead of triggering the filter's default action.
    # glibc calls fstat and its variants on fread, and it's quite hard
    # to ensure we never use it. Plus, in older glibc versions (< 2.39),
    # fstat is implemented as fstatat, and allowing that would imply
    # access to arbitrary paths. So for consistency, we make all of them
    # return an error.
    #
    # The offending function is _IO_file_doallocate; it doesn't actually
    # look at errno, so EPERM should work fine.
    const err = SCMP_ACT_ERRNO(1u16)
    const fstatList = [
      cstring"fstat",
      "fstat64",
      "fstatat64",
      "newfstatat",
      "statx"
    ]
    for it in fstatList:
      let syscall = seccomp_syscall_resolve_name(it)
      doAssert seccomp_rule_add(ctx, err, syscall, 0) == 0

  proc allowFcntl(ctx: scmp_filter_ctx) =
    ## Allows fcntl/fcntl64, but only for commands numbered 0 through 4.
    # only allow F_DUPFD, F_GETFD, F_SETFD, F_GETFL, F_SETFL
    # (F_SETFL is 4, the other ones are 0-3)
    let syscall = seccomp_syscall_resolve_name("fcntl")
    let syscall2 = seccomp_syscall_resolve_name("fcntl64")
    let arg1 = scmp_arg_cmp(
      arg: 1, # cmd
      op: SCMP_CMP_LE, # less than or equals
      datum_a: 4 # F_SETFL (includes the above mentioned ones)
    )
    doAssert seccomp_rule_add(ctx, SCMP_ACT_ALLOW, syscall, 1, arg1) == 0
    doAssert seccomp_rule_add(ctx, SCMP_ACT_ALLOW, syscall2, 1, arg1) == 0

  proc enterBufferSandbox*(sockPath: string) =
    ## Installs the seccomp filter for a buffer process. Syscalls outside
    ## the allow list trap with SIGSYS, which the handler installed here
    ## turns into a Defect. Every libseccomp call is asserted to succeed.
    ## `sockPath` is not used by this implementation.
    onSignal SIGSYS:
      discard sig
      raise newException(Defect, "Sandbox violation in buffer")
    let ctx = seccomp_init(SCMP_ACT_TRAP)
    doAssert pointer(ctx) != nil
    const allowList = [
      cstring"accept", # for incoming requests to our controlling socket
      "accept4", # for when accept is implemented as accept4
      "bind", # for outgoing requests to loader
      "brk", # memory allocation
      "clock_gettime", # used by QuickJS in atomics
      "clone", # for when fork is implemented as clone
      "close", # duh
      "connect", # for outgoing requests to loader
      "epoll_create", "epoll_create1", "epoll_ctl", "epoll_wait", # epoll stuff
      "eventfd", # used by Nim selectors
      "exit_group", # for quit
      "fork", # for when fork is really fork
      "futex", # bionic libc & WSL both need it
      "getpid", # for determining current PID after we fork
      "getrlimit", # glibc uses it after fork it seems
      "getsockname", # Nim needs it for connecting
      "gettimeofday", # used by QuickJS in Date.now()
      "lseek", # glibc calls lseek on open files at exit
      "mmap", # memory allocation
      "mmap2", # memory allocation
      "mremap", # memory allocation
      "munmap", # memory allocation
      "pipe", # for pipes to child process
      "pipe2", # for when pipe is implemented as pipe2
      "prlimit64", # for when getrlimit is implemented as prlimit64
      "read", "recv", "recvfrom", "recvmsg", # for reading from sockets
      "rt_sigreturn", # for when sigreturn is implemented as rt_sigreturn
      "send", "sendmsg", "sendto", # for writing to sockets
      "set_robust_list", # glibc seems to need it for whatever reason
      "setrlimit", # glibc seems to use it for whatever reason
      "sigreturn", # called by signal trampoline
      "timerfd_create", # used by Nim selectors
      "timerfd_gettime", # not actually used by Nim but may be in the future
      "timerfd_settime", # used by Nim selectors
      "ugetrlimit", # glibc uses it after fork it seems
      "write" # for writing to sockets
    ]
    for it in allowList:
      let syscall = seccomp_syscall_resolve_name(it)
      doAssert seccomp_rule_add(ctx, SCMP_ACT_ALLOW, syscall, 0) == 0
    block allowUnixSockets:
      # only allow creation of UNIX domain sockets.
      let syscall = seccomp_syscall_resolve_name("socket")
      let arg0 = scmp_arg_cmp(
        arg: 0, # domain
        op: SCMP_CMP_EQ, # equals
        datum_a: 1 # PF_LOCAL == PF_UNIX == AF_UNIX
      )
      doAssert seccomp_rule_add(ctx, SCMP_ACT_ALLOW, syscall, 1, arg0) == 0
    ctx.allowFcntl()
    ctx.blockStat()
    when defined(android):
      ctx.allowBionic()
    doAssert seccomp_load(ctx) == 0
    seccomp_release(ctx)

  proc enterNetworkSandbox*() =
    ## Installs the (more restrictive) seccomp filter for a network
    ## process. Syscalls outside the allow list trap with SIGSYS, which
    ## the handler installed here turns into a Defect. Every libseccomp
    ## call is asserted to succeed.
    onSignal SIGSYS:
      discard sig
      raise newException(Defect, "Sandbox violation in network process")
    # The default action must be SCMP_ACT_TRAP, same as in the buffer
    # sandbox: a kill action would terminate the process without ever
    # delivering SIGSYS, so the handler above could not report anything.
    let ctx = seccomp_init(SCMP_ACT_TRAP)
    doAssert pointer(ctx) != nil
    const allowList = [
      cstring"close", "exit_group", # duh
      "read", "write", "recv", "send", "recvfrom", "sendto", # socket i/o
      "lseek", # glibc calls lseek on open files at exit
      "mmap", "mmap2", "mremap", "munmap", "brk", # memory allocation
      "poll", # curl needs poll
      "getpid", # used indirectly by OpenSSL EVP_RAND_CTX_new (through drbg)
      "futex", # bionic libc & WSL both need it
      # we either have to use CURLOPT_NOSIGNAL or allow signals.
      # do the latter, otherwise the default name resolver will never time out.
      "signal", "sigaction", "rt_sigaction",
    ]
    for it in allowList:
      doAssert seccomp_rule_add(ctx, SCMP_ACT_ALLOW,
        seccomp_syscall_resolve_name(it), 0) == 0
    ctx.allowFcntl()
    ctx.blockStat()
    when defined(android):
      ctx.allowBionic()
    doAssert seccomp_load(ctx) == 0
    seccomp_release(ctx)

else:
  {.warning: "Building without OS-level sandboxing!".}

  proc enterBufferSandbox*(sockPath: string) =
    ## No-op: no OS-level sandbox is available in this build.
    discard

  proc enterNetworkSandbox*() =
    ## No-op: no OS-level sandbox is available in this build.
    discard