clockley1 clockley1 - 4 months ago 16
C Question

Raw clone system call not working correctly

I'm trying to use the raw clone system call to avoid having to refactor the pid 0 code into a function. Linux requires stacks to by 16 bytes, additionally, libc reserves 16 bits presumably to store ptid and ctid. The code below creates an aligned stack then exits from the child. After waiting for the child cloned by libc's wrapper I used the raw system call with the same buffer yet each time the program segfaults when using the raw system call. Attached is the output from strace which, unless I'm overlooking anything shows the system call arguments are the same both times.
There is at least one other question Raw Clone system call on SO where the OP seemed to have similar difficulties, unfortunately, the accepted answer uses the libc clone wrapper instead of the syscall.

#define _GNU_SOURCE

#include <sched.h>
#include <stdalign.h>
#include <unistd.h>
#include <stdlib.h>
#include <stdio.h>
#include <syscall.h>
#include <signal.h>
#include <stdint.h>
#include <errno.h>
#include <string.h>
#include <sys/wait.h>

int test(void*c)
{
quick_exit(0);
}

int main(void)
{
alignas (16) unsigned char stack[4096] = {0};
printf("Top of stack %p\n", stack+sizeof(stack));
printf("Top of stack minus 16 %p\n", stack+sizeof(stack)-16);
pid_t pid = clone(test, stack+sizeof(stack), CLONE_VM|SIGCHLD, 0, 0, 0, 0);

wait(NULL);

memset(stack, 0, sizeof stack);

pid = syscall(SYS_clone, CLONE_VM|SIGCHLD, stack+sizeof(stack)-16);
if (pid == 0)
quick_exit(0);
wait(NULL);
quick_exit(0);
}


Strace output:

clockley@ubuntu:~$ strace ./a.out
execve("./a.out", ["./a.out"], [/* 57 vars */]) = 0
brk(NULL) = 0x55b1e58ee000
access("/etc/ld.so.nohwcap", F_OK) = -1 ENOENT (No such file or directory)
mmap(NULL, 12288, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f70303a0000
access("/etc/ld.so.preload", R_OK) = -1 ENOENT (No such file or directory)
open("/opt/google/chrome/tls/x86_64/libc.so.6", O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
stat("/opt/google/chrome/tls/x86_64", 0x7ffcc0e2e400) = -1 ENOENT (No such file or directory)
open("/opt/google/chrome/tls/libc.so.6", O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
stat("/opt/google/chrome/tls", 0x7ffcc0e2e400) = -1 ENOENT (No such file or directory)
open("/opt/google/chrome/x86_64/libc.so.6", O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
stat("/opt/google/chrome/x86_64", 0x7ffcc0e2e400) = -1 ENOENT (No such file or directory)
open("/opt/google/chrome/libc.so.6", O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
stat("/opt/google/chrome", {st_mode=S_IFDIR|0755, st_size=4096, ...}) = 0
open("/opt/google/chrome/lib/tls/x86_64/libc.so.6", O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
stat("/opt/google/chrome/lib/tls/x86_64", 0x7ffcc0e2e400) = -1 ENOENT (No such file or directory)
open("/opt/google/chrome/lib/tls/libc.so.6", O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
stat("/opt/google/chrome/lib/tls", 0x7ffcc0e2e400) = -1 ENOENT (No such file or directory)
open("/opt/google/chrome/lib/x86_64/libc.so.6", O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
stat("/opt/google/chrome/lib/x86_64", 0x7ffcc0e2e400) = -1 ENOENT (No such file or directory)
open("/opt/google/chrome/lib/libc.so.6", O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
stat("/opt/google/chrome/lib", 0x7ffcc0e2e400) = -1 ENOENT (No such file or directory)
open("/etc/ld.so.cache", O_RDONLY|O_CLOEXEC) = 3
fstat(3, {st_mode=S_IFREG|0644, st_size=171231, ...}) = 0
mmap(NULL, 171231, PROT_READ, MAP_PRIVATE, 3, 0) = 0x7f7030376000
close(3) = 0
access("/etc/ld.so.nohwcap", F_OK) = -1 ENOENT (No such file or directory)
open("/lib/x86_64-linux-gnu/libc.so.6", O_RDONLY|O_CLOEXEC) = 3
read(3, "\177ELF\2\1\1\3\0\0\0\0\0\0\0\0\3\0>\0\1\0\0\0\20\5\2\0\0\0\0\0"..., 832) = 832
fstat(3, {st_mode=S_IFREG|0755, st_size=1856752, ...}) = 0
mmap(NULL, 3959200, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 0x7f702fdb7000
mprotect(0x7f702ff74000, 2097152, PROT_NONE) = 0
mmap(0x7f7030174000, 24576, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x1bd000) = 0x7f7030174000
mmap(0x7f703017a000, 14752, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0) = 0x7f703017a000
close(3) = 0
mmap(NULL, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f7030374000
arch_prctl(ARCH_SET_FS, 0x7f7030374700) = 0
mprotect(0x7f7030174000, 16384, PROT_READ) = 0
mprotect(0x55b1e46da000, 4096, PROT_READ) = 0
mprotect(0x7f70303a3000, 4096, PROT_READ) = 0
munmap(0x7f7030376000, 171231) = 0
fstat(1, {st_mode=S_IFCHR|0620, st_rdev=makedev(136, 8), ...}) = 0
brk(NULL) = 0x55b1e58ee000
brk(0x55b1e590f000) = 0x55b1e590f000
write(1, "Top of stack 0x7ffcc0e2ecd0\n", 28Top of stack 0x7ffcc0e2ecd0
) = 28
write(1, "Top of stack minus 16 0x7ffcc0e2"..., 37Top of stack minus 16 0x7ffcc0e2ecc0
) = 37
clone(child_stack=0x7ffcc0e2ecc0, flags=CLONE_VM|SIGCHLD) = 122458
wait4(-1, NULL, 0, NULL) = 122458
--- SIGCHLD {si_signo=SIGCHLD, si_code=CLD_EXITED, si_pid=122458, si_uid=1000, si_status=0, si_utime=0, si_stime=0} ---
clone(child_stack=0x7ffcc0e2ecc0, flags=CLONE_VM|SIGCHLD) = 122459
exit_group(0 <unfinished ...>
+++ killed by SIGSEGV +++
Segmentation fault (core dumped)

Answer Source

syscall has no special knowledge of clone. This means that when the function tries to return in the newly-created thread, it reads the return address from the switched stack, which is zero. This is more obvious if you write a non-zero bit pattern to the stack and also drop the CLONE_VM, so that the child does not clobber the parent.