blob: 4f31f883c33afaa3e240838610bbae382e2fbae8 [file] [log] [blame]
#include <errno.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/time.h>
#include <sys/resource.h>
#include <sys/ptrace.h>
#include <sys/syscall.h>
#include <sys/user.h>
#include <sys/types.h>
#include <sys/wait.h>
#include "seccomp_bpf.h"
#include <iostream>
using namespace std;
// Installs the seccomp filter for the calling process.
//
// The filter is built from the macros provided by seccomp_bpf.h. Judging by
// their names and by how do_trace() reacts to seccomp events, syscalls listed
// with ALLOW_SYSCALL run unhindered while the TRACE_* entries are handed to
// the ptrace tracer for inspection (NOTE(review): confirm against
// seccomp_bpf.h, which is not part of this file).
//
// Returns true when both prctl() calls succeed. On failure a diagnostic is
// written to stderr and false is returned.
static bool install_syscall_filter() {
    struct sock_filter filter[] = {
        VALIDATE_ARCHITECTURE,
        /* Grab the system call number. */
        EXAMINE_SYSCALL,
        /* List allowed syscalls. Look in /usr/include/x86_64-linux-gnu/asm/unistd_64.h */
        ALLOW_SYSCALL(exit_group),
        ALLOW_SYSCALL(exit),
        ALLOW_SYSCALL(stat),
        ALLOW_SYSCALL(fstat),
        ALLOW_SYSCALL(read),
        ALLOW_SYSCALL(write),
        ALLOW_SYSCALL(getdents),
        ALLOW_SYSCALL(close),
        ALLOW_SYSCALL(mmap),
        ALLOW_SYSCALL(mprotect),
        ALLOW_SYSCALL(munmap),
        ALLOW_SYSCALL(brk),
        ALLOW_SYSCALL(futex),
        ALLOW_SYSCALL(lseek),
        ALLOW_SYSCALL(set_tid_address),
        ALLOW_SYSCALL(set_robust_list),
        ALLOW_SYSCALL(rt_sigaction),
        ALLOW_SYSCALL(rt_sigprocmask),
        ALLOW_SYSCALL(getrlimit),
        ALLOW_SYSCALL(arch_prctl),
        ALLOW_SYSCALL(access),
        ALLOW_SYSCALL(fstatfs),
        ALLOW_SYSCALL(readlink),
        ALLOW_SYSCALL(fadvise64),
        ALLOW_SYSCALL(clock_gettime),
        /*
        The set of syscalls needed if running against an NVIDIA GPU, YMMV.
        ALLOW_SYSCALL(mremap),
        ALLOW_SYSCALL(statfs),
        ALLOW_SYSCALL(readlink),
        ALLOW_SYSCALL(getpid),
        */
        TRACE_SYSCALL(execve),
        TRACE_OPENS_FOR_READS_ONLY(open, 1),
        TRACE_OPENS_FOR_READS_ONLY(openat, 2),
        TRACE_ALL,
        KILL_PROCESS,
    };
    struct sock_fprog prog = {
        sizeof(filter)/sizeof(filter[0]),
        filter,
    };

    // Name of the step that failed, or NULL while everything succeeded.
    const char *failed_step = NULL;

    // Lock down the app so that it can't get new privs, such as setuid.
    // Calling this is a requirement for an unprivileged process to use mode
    // 2 seccomp filters, ala SECCOMP_MODE_FILTER, otherwise we'd have to be
    // root.
    if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
        failed_step = "prctl(NO_NEW_PRIVS)";
    // Now call seccomp and restrict the system calls that can be made to only
    // the ones in the provided filter list.
    } else if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog)) {
        failed_step = "prctl(SECCOMP)";
    }

    if (failed_step == NULL) {
        return true;
    }

    // Capture errno before perror(): the stdio call is allowed to modify it,
    // which would make the EINVAL test below unreliable.
    const int saved_errno = errno;
    perror(failed_step);
    if (saved_errno == EINVAL) {
        fprintf(stderr, "SECCOMP_FILTER is not available. :(\n");
    }
    return false;
}
// Applies resource limits to the calling process: 10 seconds of CPU time and
// 1,000,000,000 bytes of address space (soft and hard limit alike).
// A failing setrlimit() is reported on stderr but is not fatal.
static void setLimits() {
    struct rlimit n;

    // Limit to 10 seconds of CPU.
    n.rlim_cur = 10;
    n.rlim_max = 10;
    if (setrlimit(RLIMIT_CPU, &n)) {
        perror("setrlimit(RLIMIT_CPU)");
    }

    // Limit to 1G of Address space.
    n.rlim_cur = 1000000000;
    n.rlim_max = 1000000000;
    if (setrlimit(RLIMIT_AS, &n)) {
        // Report the limit that actually failed (was mislabelled RLIMIT_CPU).
        perror("setrlimit(RLIMIT_AS)");
    }
}
// Child half of the sandbox. Requests tracing by the parent, stops itself so
// the parent can take control, applies the resource limits and the syscall
// filter, and finally replaces itself with the program named by argv[0].
//
// argc/argv describe the command to run (program name first). The function
// only returns (always -1) when the filter could not be installed; ptrace
// failure exits the process and a failed exec ends in SIGKILL.
int do_child(int argc, char **argv) {
    // NULL-terminated copy of the argument vector for execvp().
    char *child_argv[argc + 1];
    for (int i = 0; i < argc; ++i) {
        child_argv[i] = argv[i];
    }
    child_argv[argc] = NULL;

    // Become a tracee of the parent process.
    if (ptrace(PTRACE_TRACEME, 0, 0, 0) != 0) {
        perror("ptrace");
        exit(-1);
    }

    // Park here until the tracer resumes us.
    kill(getpid(), SIGSTOP);

    setLimits();

    if (install_syscall_filter()) {
        (void)execvp(child_argv[0], child_argv);
        // execvp only comes back on failure, i.e. the program could not be
        // run (probably because the compile failed). Die by signal so the
        // parent notices and exits appropriately.
        kill(getpid(), SIGKILL);
    }
    return -1;
}
// read_string copies a null-terminated string out of the child's address
// space, one machine word (sizeof(unsigned long) bytes) per ptrace call.
//
//   child - pid of the traced process to read from.
//   addr  - address of the string inside the child.
//
// Returns a freshly allocated, null-terminated buffer that the caller must
// free(). If a word cannot be read, the bytes gathered so far are terminated
// and returned (an empty string when the very first read fails). Returns NULL
// only when memory cannot be allocated.
char *read_string(pid_t child, unsigned long addr) {
#define INITIAL_ALLOCATION 4096
    size_t allocated = INITIAL_ALLOCATION;
    size_t copied = 0;
    char *val = (char *) malloc(allocated);
    if (val == NULL) {
        return NULL;
    }

    for (;;) {
        unsigned long tmp;

        // Make sure a whole word fits before fetching it.
        if (copied + sizeof tmp > allocated) {
            // Grow through a temporary so the old buffer is not leaked when
            // realloc fails.
            char *grown = (char *) realloc(val, allocated * 2);
            if (grown == NULL) {
                free(val);
                return NULL;
            }
            val = grown;
            allocated *= 2;
        }

        // PTRACE_PEEKDATA returns the word itself, so -1 is a legal result;
        // failure can only be detected through errno, which therefore has to
        // be cleared before the call.
        errno = 0;
        tmp = ptrace(PTRACE_PEEKDATA, child, (void *) (addr + copied), (void *) 0);
        if (errno != 0) {
            val[copied] = 0;
            break;
        }

        memcpy(val + copied, &tmp, sizeof tmp);
        if (memchr(&tmp, 0, sizeof tmp) != NULL) {
            // The terminator is inside this word; the string is complete.
            break;
        }
        copied += sizeof tmp;
    }
    return val;
}
// Prints `message` via perror(), kills the traced child and terminates the
// tracer with exit status -1. Never returns.
static void trace_fail(pid_t child, const char *message) {
    perror(message);
    kill(child, SIGKILL);
    exit(-1);
}

// Kept for compatibility with the original statement-style macro; expands to
// a single expression so it is safe in unbraced if/else bodies.
#define CHILD_FAIL(message) trace_fail(child, (message))

// Waits for the next state change of `child`, retrying when the wait is
// interrupted by a signal. Returns 0 on success, -1 on failure (errno is left
// as set by waitpid).
static int trace_wait(pid_t child, int *status) {
    pid_t got;
    do {
        got = waitpid(child, status, 0);
    } while (got == -1 && errno == EINTR);
    return got == -1 ? -1 : 0;
}

// Validates one open()/openat() request made by the child.
//
//   path_addr      - address of the path string in the child.
//   flags          - the flags argument of the call.
//   prefixes/count - path prefixes the child may open.
//   reject_message - text reported when the path matches no prefix.
//
// Returns normally when the request is acceptable; otherwise the child is
// killed and the tracer exits (see trace_fail).
static void trace_check_open(pid_t child, unsigned long path_addr, int flags,
                             const char *const *prefixes, size_t count,
                             const char *reject_message) {
    char *name = read_string(child, path_addr);
    if (name == NULL) {
        trace_fail(child, "read_string");
    }
    if (strstr(name, "..") != NULL) {
        trace_fail(child, "No relative paths...");
    }
    // NOTE(review): only the access mode is inspected here; flags such as
    // O_CREAT or O_TRUNC combined with O_RDONLY are not rejected. Confirm
    // that the BPF side (TRACE_OPENS_FOR_READS_ONLY) covers them.
    if ((flags & O_ACCMODE) != O_RDONLY) {
        trace_fail(child, "No writing to files...");
    }

    bool okay = false;
    for (size_t i = 0; i < count && !okay; i++) {
        okay = strncmp(prefixes[i], name, strlen(prefixes[i])) == 0;
    }
    if (!okay) {
        perror(name);
        trace_fail(child, reject_message);
    }
    free(name);
}

// Parent half of the sandbox: supervises `child` (which must have called
// PTRACE_TRACEME and stopped itself) and vets every syscall the seccomp
// filter hands over.
//
//   child        - pid of the traced process.
//   allowed_exec - the only path the child may execve().
//
// Returns 0 when the child exits normally and 1 when it is killed by a
// signal. A policy violation or tracer error kills the child where possible
// and terminates this process with exit(-1).
int do_trace(pid_t child, char *allowed_exec) {
    static const char *const open_prefixes[] = {
        "/usr/local/share/fonts", "/var/cache/fontconfig", "/etc/fonts",
        "/usr/share/fonts", "/etc/ld.so.cache", "/lib/", "/usr/lib/",
        "skia.conf", "/mnt/pd0/", "/proc/meminfo"
    };
    static const char *const openat_prefixes[] = {
        "/usr/share/fonts", "/usr/local/share/fonts",
        "/var/cache/fontconfig", "/etc/fonts"
    };
    int status = 0;

    // Wait for the child's initial self-inflicted stop.
    if (trace_wait(child, &status) != 0) {
        perror("waitpid");
        exit(-1);
    }
    // The child may already be gone (e.g. PTRACE_TRACEME failed).
    if (WIFEXITED(status)) {
        return 0;
    }
    if (WIFSIGNALED(status)) {
        return 1;
    }

    // Without these options no seccomp events are reported, so the policy
    // below could not be enforced; treat failure as fatal.
    if (ptrace(PTRACE_SETOPTIONS, child, 0,
               (void *) (long) (PTRACE_O_TRACEEXEC | PTRACE_O_TRACESECCOMP))) {
        CHILD_FAIL("ptrace(PTRACE_SETOPTIONS)");
    }
    ptrace(PTRACE_CONT, child, 0, 0);

    for (;;) {
        if (trace_wait(child, &status) != 0) {
            perror("waitpid");
            exit(-1);
        }
        if (WIFEXITED(status)) {
            return 0;
        }
        if (WIFSIGNALED(status)) {
            return 1;
        }

        // Signal to hand back to the child when it is resumed (0 = none).
        int deliver = 0;
        // For a stopped child: low byte is the stop signal, the next byte
        // the ptrace event number (0 for an ordinary signal stop).
        const int stop = status >> 8;

        if (stop == (SIGTRAP | (PTRACE_EVENT_SECCOMP << 8))) {
            struct user_regs_struct regs;
            if (ptrace(PTRACE_GETREGS, child, NULL, &regs)) {
                perror("The child failed...");
                exit(-1);
            }
            const int nr = regs.orig_rax;
            if (nr == SYS_execve) {
                char *name = read_string(child, regs.rdi);
                if (name == NULL) {
                    CHILD_FAIL("read_string");
                }
                if (allowed_exec == NULL || strcmp(name, allowed_exec)) {
                    CHILD_FAIL("Invalid exec.");
                }
                free(name);
            } else if (nr == SYS_open) {
                // open(path, flags, ...): path in rdi, flags in rsi.
                trace_check_open(child, regs.rdi, (int) regs.rsi,
                                 open_prefixes,
                                 sizeof(open_prefixes) / sizeof(open_prefixes[0]),
                                 "Invalid open.");
            } else if (nr == SYS_openat) {
                // openat(dirfd, path, flags, ...): path in rsi, flags in rdx.
                trace_check_open(child, regs.rsi, (int) regs.rdx,
                                 openat_prefixes,
                                 sizeof(openat_prefixes) / sizeof(openat_prefixes[0]),
                                 "Invalid openat.");
            } else {
                // this should never happen, but if we're in TRACE_ALL
                // mode for debugging, this lets me print out what system
                // calls are happening unexpectedly.
                cout << "WEIRD SYSTEM CALL: " << nr << endl;
            }
        } else if (WIFSTOPPED(status) && (stop >> 8) == 0 && stop != SIGTRAP) {
            // Ordinary signal-delivery stop: pass the signal on instead of
            // swallowing it, otherwise e.g. a SIGSEGV would be retried
            // forever. Plain SIGTRAP is left suppressed because it may be a
            // tracing artefact rather than a real signal.
            deliver = stop;
        }
        ptrace(PTRACE_CONT, child, 0, (void *) (long) deliver);
    }
}
// Entry point: forks, runs the command given on the command line inside the
// sandboxed child (do_child) and supervises it from the parent (do_trace).
// argv[1] is the program to run and the only path the child may exec;
// argv[2..] are passed through as its arguments.
// Returns the result of do_trace, or -1 on a usage error or fork failure.
int main(int argc, char** argv) {
    // Without a program name argv[1] is NULL, which would be handed to
    // execvp() in the child and strcmp() in the tracer.
    if (argc < 2) {
        fprintf(stderr, "usage: %s program [args...]\n",
                (argc > 0 && argv[0] != NULL) ? argv[0] : "sandbox");
        return -1;
    }

    pid_t child = fork();
    if (child == -1) {
        // Never pass -1 on as a pid: waitpid/kill give it special meaning.
        perror("fork");
        return -1;
    }
    if (child == 0) {
        return do_child(argc-1, argv+1);
    }
    return do_trace(child, argv[1]);
}