/*
 * Copyright (c) 2017 Intel Corporation
 *
 * SPDX-License-Identifier: Apache-2.0
 */

#include
#include
#include
#include
#include

#ifdef CONFIG_X86_KPTI
/* Copy the interrupt return stack context to the trampoline stack, switch
 * back to the user page table, and only then 'iret'. We jump to this instead
 * of calling 'iret' if KPTI is turned on. This must be invoked with
 * interrupts locked.
 *
 * Stack layout is expected to be what 'iretq' expects, which is as follows:
 *
 *  32 SS
 *  24 RSP
 *  16 RFLAGS
 *   8 CS
 *   0 RIP
 *
 * (The code below reads these at +8 because RDI is stashed on top first.)
 */
.global z_x86_trampoline_to_user
z_x86_trampoline_to_user:
        /* Stash RDI, need a free register */
        pushq   %rdi

        /* Store old stack pointer and switch to trampoline stack */
        movq    %rsp, %rdi
        movq    %gs:__x86_tss64_t_ist2_OFFSET, %rsp

        /* Copy context */
        pushq   40(%rdi)        /* SS */
        pushq   32(%rdi)        /* RSP */
        pushq   24(%rdi)        /* RFLAGS */
        pushq   16(%rdi)        /* CS */
        pushq   8(%rdi)         /* RIP */

        /* Exchange to restore the original RDI; its old storage slot on the
         * outgoing stack is left holding that stack's own address
         */
        xchgq   %rdi, (%rdi)

        /* Switch to thread's page table */
        pushq   %rax
        movq    %gs:__x86_tss64_t_cpu_OFFSET, %rax
        movq    ___cpu_t_current_OFFSET(%rax), %rax
        movq    _thread_offset_to_ptables(%rax), %rax
        movq    %rax, %cr3
        popq    %rax
        movq    $0, -8(%rsp)    /* Delete stashed RAX data */

        /* Trampoline stack should have nothing sensitive in it at this point */
        swapgs
        iretq
#endif /* CONFIG_X86_KPTI */

/* Landing site for 'syscall' instruction
 *
 * Call ID is in RAX
 * Arguments are in RDI, RSI, RDX, R10, R8, R9
 * Return address stored by CPU in RCX
 * User RFLAGS stored by CPU in R11
 * Current RFLAGS has been masked with ~X86_FMASK_MSR
 */
.global z_x86_syscall_entry_stub
z_x86_syscall_entry_stub:
        swapgs

        /* Save the original stack pointer from user mode in memory; at the
         * moment we have no free registers or stack to save it to. This
         * eventually gets put on the stack before we re-enable interrupts,
         * as this is a per-cpu and not per-thread area.
         */
        movq    %rsp, %gs:__x86_tss64_t_usp_OFFSET

#ifdef CONFIG_X86_KPTI
        /* We need to switch to the trampoline stack so that we can
         * switch to the kernel's page table
         */
        movq    %gs:__x86_tss64_t_ist2_OFFSET, %rsp

        /* Load kernel's page table */
        pushq   %rax
        movq    $z_x86_kernel_ptables, %rax
        movq    %rax, %cr3
        popq    %rax
        movq    $0, -8(%rsp)    /* Delete stashed RAX data */
#endif /* CONFIG_X86_KPTI */

        /* Switch to the privilege mode stack pointer stored in
         * x86_tss64.psp
         */
        movq    %gs:__x86_tss64_t_psp_OFFSET, %rsp

        /* We're now on the privilege mode stack; push the old user stack
         * pointer onto it
         */
        pushq   %gs:__x86_tss64_t_usp_OFFSET
#ifdef CONFIG_X86_KPTI
        movq    $0, %gs:__x86_tss64_t_usp_OFFSET
#endif

        sti                     /* Re-enable interrupts */

        /* Call ID is in RAX; bounds-check it, it must be less than
         * K_SYSCALL_LIMIT
         */
        cmp     $K_SYSCALL_LIMIT, %rax
        jae     _bad_syscall

_id_ok:
#ifdef CONFIG_X86_BOUNDS_CHECK_BYPASS_MITIGATION
        /* Prevent speculation with bogus system call IDs */
        lfence
#endif

        /* Remaining registers not involved in the syscall operation are
         * RBX, RBP, R12-R15, plus floating point / SIMD registers.
         *
         * We save caller-saved registers so we can restore their original
         * values when we 'sysretq' at the end.
         */
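        /* A sketch of the privilege stack frame built by the pushes below
         * (illustration only, not an authoritative layout; higher addresses
         * first):
         *
         *   saved user RSP   (pushed earlier from the usp slot)
         *   saved RDI
         *   X86_FXSAVE_SIZE bytes of FPU/SIMD state (fxsave)
         *   saved RSI
         *   saved RDX
         *   saved R8
         *   saved R9
         *   saved R10
         *   saved R11        (user RFLAGS)
         *   saved RCX        (user return address)
         *   SSF              (address of the saved RCX slot; passed to the
         *                     marshal function as its stack argument)
         */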
        pushq   %rdi
        subq    $X86_FXSAVE_SIZE, %rsp
        fxsave  (%rsp)
        pushq   %rsi
        pushq   %rdx
        pushq   %r8
        pushq   %r9
        pushq   %r10
        pushq   %r11            /* RFLAGS */
        pushq   %rcx            /* Return address stored by 'syscall' */
        pushq   %rsp            /* SSF parameter */

        /* All other args are already in the right registers, except arg4,
         * which we had to put in R10 instead of RCX
         */
        movq    %r10, %rcx

        /* From the call ID in RAX, load R10 with the actual function pointer
         * to call by looking it up in the system call dispatch table
         */
        xorq    %r11, %r11
        movq    _k_syscall_table(%r11, %rax, 8), %r10

        /* Run the marshal function, which is some entry in _k_syscall_table */
        call    *%r10

        /* RAX now contains the return value.
         *
         * Callee-saved registers are untouched from their original values
         * per the C calling convention, but sensitive data may lurk in the
         * caller-saved regs RDI, RSI, RDX, R8, R9, R10, XMM* after we have
         * serviced the system call. We saved them earlier; restore their
         * original values from when the syscall was made. This also
         * preserves these registers if they were not used as arguments.
         *
         * We also can't have RCX and R11 clobbered, as we need their
         * original values to successfully 'sysretq'.
         */
        addq    $8, %rsp        /* Discard SSF */
        popq    %rcx            /* Restore return address for 'sysretq' */
        popq    %r11            /* Restore RFLAGS for 'sysretq' */
        popq    %r10
        popq    %r9
        popq    %r8
        popq    %rdx
        popq    %rsi
        fxrstor (%rsp)
        addq    $X86_FXSAVE_SIZE, %rsp
        popq    %rdi

#ifdef CONFIG_X86_KPTI
        /* Lock IRQs as we are using per-cpu memory areas and the
         * trampoline stack
         */
        cli

        /* Stash user stack pointer and switch to trampoline stack */
        popq    %gs:__x86_tss64_t_usp_OFFSET
        movq    %gs:__x86_tss64_t_ist2_OFFSET, %rsp

        /* Switch to thread's page table */
        pushq   %rax
        movq    %gs:__x86_tss64_t_cpu_OFFSET, %rax
        movq    ___cpu_t_current_OFFSET(%rax), %rax
        movq    _thread_offset_to_ptables(%rax), %rax
        movq    %rax, %cr3
        popq    %rax
        movq    $0, -8(%rsp)    /* Delete stashed RAX data */

        /* Restore saved user stack pointer */
        movq    %gs:__x86_tss64_t_usp_OFFSET, %rsp
        movq    $0, %gs:__x86_tss64_t_usp_OFFSET
#else
        /* Restore user stack pointer */
        popq    %rsp

        /* Return to user mode, locking interrupts as the normal interrupt
         * handling path will get very confused if it occurs between
         * 'swapgs' and 'sysretq'
         */
        cli
#endif /* CONFIG_X86_KPTI */

        swapgs
        sysretq

_bad_syscall:
        /* RAX had a bogus syscall value in it; replace it with the bad
         * syscall handler's ID and pass the bad ID as its first argument.
         *
         * TODO: On this and all other arches, simply immediately return
         * with -ENOSYS, once all syscalls have a return value
         */
        movq    %rax, %rdi
        movq    $K_SYSCALL_BAD, %rax
        jmp     _id_ok

/*
 * size_t arch_user_string_nlen(const char *s, size_t maxsize, int *err_arg)
 *                                     ^ RDI          ^ RSI        ^ RDX
 */
.global arch_user_string_nlen
arch_user_string_nlen:
        /* Initial error value; strlen_done adjusts this if we succeed */
        movl    $-1, %r8d

        /* Use RAX as our length count (this function's return value) */
        xor     %rax, %rax

        /* This code might page fault */
strlen_loop:
.global z_x86_user_string_nlen_fault_start
z_x86_user_string_nlen_fault_start:
        cmpb    $0x0, (%rdi, %rax, 1)   /* *(RDI + RAX) == 0? Could fault. */
.global z_x86_user_string_nlen_fault_end
z_x86_user_string_nlen_fault_end:
        je      strlen_done
        cmp     %rsi, %rax              /* Max length reached? */
        je      strlen_done
        inc     %rax                    /* RAX++ and loop again */
        jmp     strlen_loop

strlen_done:
        /* Set error value to 0 since we succeeded */
        xorl    %r8d, %r8d

.global z_x86_user_string_nlen_fixup
z_x86_user_string_nlen_fixup:
        /* Write the error value to the 32-bit integer err pointer parameter */
        movl    %r8d, (%rdx)
        retq
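/* A note on the fault window above (describing the intended protocol as
 * assumed here, rather than anything enforced in this file): the probe of
 * (%rdi, %rax, 1) is the only instruction that may fault, and it is
 * bracketed by the z_x86_user_string_nlen_fault_start/_end labels. A page
 * fault with RIP inside that window is expected to be resolved by resuming
 * execution at z_x86_user_string_nlen_fixup, so the still-initialized -1 in
 * R8D gets written through the err pointer and the caller sees the failure.
 */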
/*
 * Trampoline function to put the p3 parameter in the register expected by
 * the calling convention; we couldn't use RCX when we invoked 'sysretq'
 */
z_x86_userspace_landing_site:
        /* Place argument 4 in the correct position */
        movq    %r10, %rcx
        call    z_thread_entry

/* FUNC_NORETURN void z_x86_userspace_enter(
 *              k_thread_entry_t user_entry,    <- RDI
 *              void *p1, void *p2, void *p3,   <- RSI, RDX, RCX
 *              uintptr_t stack_end,            <- R8
 *              uintptr_t stack_start)          <- R9
 *
 * A one-way trip to userspace.
 */
.global z_x86_userspace_enter
z_x86_userspace_enter:
        /* RCX is the sysret return address; pass p3 along in R10 instead,
         * z_x86_userspace_landing_site will fix this up
         */
        movq    %rcx, %r10

        /* Switch to the privilege mode stack so we can erase the thread
         * stack buffer; the buffer is the page immediately before the
         * thread stack
         */
        movq    %r9, %rsp

        /* Stash the caller-saved registers we still need and go back into
         * C code to erase the stack buffer and set the US bit in its page
         * table entries
         */
        pushq   %rdx
        pushq   %rsi
        pushq   %rdi
        pushq   %r8
        pushq   %r10
        callq   z_x86_current_stack_perms
        popq    %r10
        popq    %r8
        popq    %rdi
        popq    %rsi
        popq    %rdx

        /* Reset to the beginning of the user stack */
        movq    %r8, %rsp

        /* Set the sysret entry point */
        movq    $z_x86_userspace_landing_site, %rcx

        /* Copy RFLAGS into R11, required by sysret */
        pushfq
        movq    (%rsp), %r11
        movq    $0, (%rsp)      /* Now a debugger-friendly return address */

        /* Cleanse other registers */
        xorq    %rbx, %rbx
        xorq    %rbp, %rbp
        xorq    %r12, %r12
        xorq    %r13, %r13
        xorq    %r14, %r14
        xorq    %r15, %r15

        cli

#ifdef CONFIG_X86_KPTI
        /* Switch to the thread's page table. We have free registers here,
         * so there is no need to involve the trampoline stack.
         */
        movq    %gs:__x86_tss64_t_cpu_OFFSET, %rax
        movq    ___cpu_t_current_OFFSET(%rax), %rax
        movq    _thread_offset_to_ptables(%rax), %rax
        movq    %rax, %cr3
#endif

        swapgs
        sysretq
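/* At the final 'sysretq' above, RCX supplies the user-mode RIP
 * (z_x86_userspace_landing_site), R11 supplies RFLAGS, and RSP already holds
 * the initial user stack pointer (stack_end). The landing site then moves p3
 * from R10 back into RCX and calls z_thread_entry() with the thread's entry
 * point and parameters still in RDI, RSI and RDX.
 */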