Fix 16-byte stack alignment violation in x86_64 clone child path

Change-Id: I43304803ac54c8688c61688bd96c7160614172d4
Signed-off-by: Pavel Chupin <pavel.v.chupin@intel.com>
diff --git a/libc/arch-x86_64/bionic/clone.S b/libc/arch-x86_64/bionic/clone.S
index 2ae0e85..7511e86 100644
--- a/libc/arch-x86_64/bionic/clone.S
+++ b/libc/arch-x86_64/bionic/clone.S
@@ -59,10 +59,11 @@
 
         # We're in the child now, so call __thread_entry
         # with the arguments from the child stack moved into
-        # the appropriate registers.
-        popq    %rdi  # fn
-        popq    %rsi  # arg
-        popq    %rdx  # tls
+        # the appropriate registers. We avoid pop here to keep
+        # the required 16-byte stack alignment.
+        movq    (%rsp), %rdi    # fn
+        movq    8(%rsp), %rsi   # arg
+        movq    16(%rsp), %rdx  # tls
         call    __thread_entry
         hlt
 2: