about summary refs log blame commit diff stats
path: root/src/utils/sandbox.nim
blob: 6f63139fa5caef93251a01c7b1153dd5cc971c08 (plain) (tree)
1
2
3
4
5
6
7
8
9








                                                                        





                                                                        

                                                                  

















                                                                          

















                                         

                          
                                              




                                                                            




                                                                     
                             




                                              
                                      
                                      
                                                              



                                                                  
                                 


                            








                                                                               




























                                                                          
                                                






                                                                             








                                                                        
                                    










                                                          












                                                                          















                                                                               
                                        
                              
                                            
                                               



                                                           
                                                        

                                  
                                   




                                                                       
                                                                         


                                                                     
                                                


                                                                            
                                                       













                                                                           
                    
                   

                          






                                                                        
                                                 



                                                                         
                                                        
                                                                     
                               
                                                                            
                                               






                                                                               
                    
                   

                          

                                   
     
                                                      



                                              
           
# Security model with sandboxing:
#
# Buffer processes are the most security-sensitive, since they parse
# various resources retrieved from the network (CSS, HTML) and sometimes
# even execute untrusted code (JS, with an engine written in C). So the
# main goal is to give buffers as few permissions as possible.
#
# On FreeBSD, we create a file descriptor to the directory sockets
# reside in, and then use that for manipulating our sockets.
#
# Capsicum does not enable more fine-grained capability control, but
# in practice the things it does enable should not be enough to harm the
# user's system.
#
# On OpenBSD, we pledge the minimum amount of promises we need, and
# do not unveil anything. It seems to be roughly equivalent to the
# security we get with FreeBSD Capsicum.
#
# On Linux, we use libseccomp so that I don't have to manually write
# BPF filters.
# Sandboxing on Linux is at the moment slightly less safe than on the
# two BSDs, because a rogue buffer could in theory connect to whatever
# open UNIX domain socket on the system that the user has access to.
#TODO look into integrating Landlock to fix this.
#
# We do not have OS-level sandboxing on other systems (yet).
#
# Aside from sandboxing in buffer processes, we also have a more
# restrictive "network" sandbox that is intended for CGI processes that
# just read/write from/to the network and stdin/stdout. At the moment this
# is only used in the HTTP process.
#TODO add it to more CGI scripts

const disableSandbox {.booldefine.} = false

type SandboxType* = enum
  stNone = "no sandbox"
  stCapsicum = "capsicum"
  stPledge = "pledge"
  stLibSeccomp = "libseccomp"

const SandboxMode* = when disableSandbox:
  stNone
elif defined(freebsd):
  stCapsicum
elif defined(openbsd):
  stPledge
elif defined(linux):
  stLibSeccomp
else:
  stNone

when SandboxMode == stCapsicum:
  import bindings/capsicum

  proc enterBufferSandbox*(sockPath: string) =
    # per man:cap_enter(2), it may return ENOSYS if the kernel was compiled
    # without CAPABILITY_MODE. So it seems better not to panic in this case.
    # (But TODO: when we get enough sandboxing coverage it should print a
    # warning or something.)
    discard cap_enter()

  proc enterNetworkSandbox*() =
    # no difference between buffer; Capsicum is quite straightforward
    # to use in this regard.
    discard cap_enter()
elif SandboxMode == stPledge:
  import bindings/pledge

  proc enterBufferSandbox*(sockPath: string) =
    # take whatever we need to
    # * fork
    # * connect to UNIX domain sockets
    # * take FDs from the main process
    doAssert pledge("unix stdio sendfd recvfd proc", nil) == 0

  proc enterNetworkSandbox*() =
    # we don't need much to write out data from sockets to stdout.
    doAssert pledge("stdio", nil) == 0
elif SandboxMode == stLibSeccomp:
  import std/posix
  import bindings/libseccomp

  when defined(android):
    let PR_SET_VMA {.importc, header: "<sys/prctl.h>", nodecl.}: cint
    let PR_SET_VMA_ANON_NAME {.importc, header: "<sys/prctl.h>", nodecl.}: cint

    proc allowBionic(ctx: scmp_filter_ctx) =
      # Things needed for bionic libc. Tested with Termux.
      const androidAllowList = [
        cstring"rt_sigprocmask",
        "epoll_pwait",
        "madvise"
      ]
      for it in androidAllowList:
        let syscall = seccomp_syscall_resolve_name(it)
        doAssert seccomp_rule_add(ctx, SCMP_ACT_ALLOW, syscall, 0) == 0
      # bionic likes to set this very much. In fact, it was added to
      # the kernel by Android devs.
      block allowAnonVMAName:
        let syscall = seccomp_syscall_resolve_name("prctl")
        let arg0 = scmp_arg_cmp(
          arg: 0, # op
          op: SCMP_CMP_EQ, # equals
          datum_a: uint64(PR_SET_VMA)
        )
        let arg1 = scmp_arg_cmp(
          arg: 1, # attr
          op: SCMP_CMP_EQ, # equals
          datum_a: uint64(PR_SET_VMA_ANON_NAME)
        )
        doAssert seccomp_rule_add(ctx, SCMP_ACT_ALLOW, syscall, 2, arg0,
          arg1) == 0
      # We have to be careful with this one; PROT_EXEC will happily set
      # memory as executable, which is certainly not what we want.
      # Now, bionic seems to be calling this from mutate(), ergo we
      # should be fine just allowing PROT_READ and PROT_READ | PROT_WRITE.
      block allowMprotect:
        let syscall = seccomp_syscall_resolve_name("mprotect")
        let arg2 = scmp_arg_cmp(
          arg: 2, # attr
          op: SCMP_CMP_LE, # less than or equals
          datum_a: 3 # PROT_READ | PROT_WRITE
        )
        # Note that libseccomp can't really express multiple comparisons.
        # However, we are lucky, and we only have to "excessively" allow
        # PROT_WRITE (w/o PROT_READ) and PROT_NONE, which does no harm.
        doAssert seccomp_rule_add(ctx, SCMP_ACT_ALLOW, syscall, 1, arg2) == 0

  proc blockStat(ctx: scmp_filter_ctx) =
    # glibc calls fstat and its variants on fread, and it's quite hard
    # to ensure we never use it. Plus, in older glibc versions (< 2.39),
    # fstat is implemented as fstatat, and allowing that would imply
    # access to arbitrary paths. So for consistency, we make all of them
    # return an error.
    #
    # The offending function is _IO_file_doallocate; it doesn't actually
    # look at errno, so EPERM should work fine.
    const err = SCMP_ACT_ERRNO(1u16)
    const fstatList = [
      cstring"fstat",
      "fstat64",
      "fstatat64",
      "newfstatat",
      "statx"
    ]
    for it in fstatList:
      let syscall = seccomp_syscall_resolve_name(it)
      doAssert seccomp_rule_add(ctx, err, syscall, 0) == 0

  proc allowFcntl(ctx: scmp_filter_ctx) =
    # only allow F_DUPFD, F_GETFD, F_SETFD, F_GETFL, F_SETFL
    # (F_SETFL is 4, the other ones are 0-3)
    let syscall = seccomp_syscall_resolve_name("fcntl")
    let syscall2 = seccomp_syscall_resolve_name("fcntl64")
    let arg1 = scmp_arg_cmp(
      arg: 1, # cmd
      op: SCMP_CMP_LE, # less than or equals
      datum_a: 4 # F_SETFL (includes the above mentioned ones)
    )
    doAssert seccomp_rule_add(ctx, SCMP_ACT_ALLOW, syscall, 1, arg1) == 0
    doAssert seccomp_rule_add(ctx, SCMP_ACT_ALLOW, syscall2, 1, arg1) == 0

  proc enterBufferSandbox*(sockPath: string) =
    onSignal SIGSYS:
      discard sig
      raise newException(Defect, "Sandbox violation in buffer")
    let ctx = seccomp_init(SCMP_ACT_TRAP)
    doAssert pointer(ctx) != nil
    const allowList = [
      cstring"accept", # for incoming requests to our controlling socket
      "accept4", # for when accept is implemented as accept4
      "bind", # for outgoing requests to loader
      "brk", # memory allocation
      "clock_gettime", # used by QuickJS in atomics
      "clone", # for when fork is implemented as clone
      "close", # duh
      "connect", # for outgoing requests to loader
      "epoll_create", "epoll_create1", "epoll_ctl", "epoll_wait", # epoll stuff
      "eventfd", # used by Nim selectors
      "exit_group", # for quit
      "fork", # for when fork is really fork
      "futex", # bionic libc & WSL both need it
      "getpid", # for determining current PID after we fork
      "getrlimit", # glibc uses it after fork it seems
      "getsockname", # Nim needs it for connecting
      "gettimeofday", # used by QuickJS in Date.now()
      "lseek", # glibc calls lseek on open files at exit
      "mmap", # memory allocation
      "mmap2", # memory allocation
      "mremap", # memory allocation
      "munmap", # memory allocation
      "pipe", # for pipes to child process
      "pipe2", # for when pipe is implemented as pipe2
      "prlimit64", # for when getrlimit is implemented as prlimit64
      "read", "recv", "recvfrom", "recvmsg", # for reading from sockets
      "rt_sigreturn", # for when sigreturn is implemented as rt_sigreturn
      "send", "sendmsg", "sendto", # for writing to sockets
      "set_robust_list", # glibc seems to need it for whatever reason
      "setrlimit", # glibc seems to use it for whatever reason
      "sigreturn", # called by signal trampoline
      "timerfd_create", # used by Nim selectors
      "timerfd_gettime", # not actually used by Nim but may be in the future
      "timerfd_settime", # used by Nim selectors
      "ugetrlimit", # glibc uses it after fork it seems
      "write" # for writing to sockets
    ]
    for it in allowList:
      let syscall = seccomp_syscall_resolve_name(it)
      doAssert seccomp_rule_add(ctx, SCMP_ACT_ALLOW, syscall, 0) == 0
    block allowUnixSockets:
      # only allow creation of UNIX domain sockets.
      let syscall = seccomp_syscall_resolve_name("socket")
      let arg0 = scmp_arg_cmp(
        arg: 0, # domain
        op: SCMP_CMP_EQ, # equals
        datum_a: 1 # PF_LOCAL == PF_UNIX == AF_UNIX
      )
      doAssert seccomp_rule_add(ctx, SCMP_ACT_ALLOW, syscall, 1, arg0) == 0
    ctx.allowFcntl()
    ctx.blockStat()
    when defined(android):
      ctx.allowBionic()
    doAssert seccomp_load(ctx) == 0
    seccomp_release(ctx)

  proc enterNetworkSandbox*() =
    onSignal SIGSYS:
      discard sig
      raise newException(Defect, "Sandbox violation in network process")
    let ctx = seccomp_init(SCMP_ACT_KILL_PROCESS)
    doAssert pointer(ctx) != nil
    const allowList = [
      cstring"close", "exit_group", # duh
      "read", "write", "recv", "send", "recvfrom", "sendto", # socket i/o
      "lseek", # glibc calls lseek on open files at exit
      "mmap", "mmap2", "mremap", "munmap", "brk", # memory allocation
      "poll", # curl needs poll
      "getpid", # used indirectly by OpenSSL EVP_RAND_CTX_new (through drbg)
      "futex", # bionic libc & WSL both need it
      # we either have to use CURLOPT_NOSIGNAL or allow signals.
      # do the latter, otherwise the default name resolver will never time out.
      "signal", "sigaction", "rt_sigaction",
    ]
    for it in allowList:
      doAssert seccomp_rule_add(ctx, SCMP_ACT_ALLOW,
        seccomp_syscall_resolve_name(it), 0) == 0
    ctx.allowFcntl()
    ctx.blockStat()
    when defined(android):
      ctx.allowBionic()
    doAssert seccomp_load(ctx) == 0
    seccomp_release(ctx)
else:
  {.warning: "Building without OS-level sandboxing!".}
  proc enterBufferSandbox*(sockPath: string) =
    discard

  proc enterNetworkSandbox*() =
    discard