
This article is a bit long; take your time.
1. Background
For safety, Linux distinguishes two execution states: user mode and kernel mode. An ordinary process normally runs in user mode, with only basic privileges. When it performs a sensitive operation, say opening a file (open) and then writing to it (write), or allocating memory (the brk/mmap calls behind malloc), the CPU switches to kernel mode. The kernel performs the appropriate checks and, if they pass, carries out the requested operation and allocates the corresponding resources. This mechanism is the system call: a user-mode process initiates the call, the CPU switches to kernel mode, the kernel does the work, and control returns to user mode to continue execution. It is the only legitimate way for user mode to actively switch into kernel mode (exceptions and interrupts switch into it passively).
The precise definition of system calls can be found via man syscalls, which lists the system call ABI currently provided by the Linux kernel. Familiar calls such as open, read, and close are all system calls, but what we normally invoke are their C library (glibc) wrappers. In fact, as long as we follow the ABI, we can issue a system call ourselves in assembly.
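We do not even need assembly to bypass the per-call wrappers: glibc also exposes a generic syscall(2) entry point that takes a call number. A minimal sketch, assuming a Linux system with <sys/syscall.h>:

#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>

int main(void)
{
	/* Invoke getpid by its syscall number via the generic wrapper;
	   equivalent to calling getpid() directly. */
	long pid = syscall(SYS_getpid);
	printf("pid = %ld\n", pid);
	return 0;
}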
Historically, the x86 system call implementation evolved from int / iret to sysenter / sysexit and then to syscall / sysret.
The following analysis is based on Linux kernel 4.9.76 and glibc 2.25.90.
2. int 0x80
Long, long ago, we made system calls (here, open) via int 0x80:
movl $0x05, %eax	/* set the system call number (__NR_open) */
int  $0x80
In trap_init in arch/x86/kernel/traps.c, various gates are installed via set_intr_gate / set_intr_gate_ist / set_system_intr_gate. Among them, set_system_intr_gate sets up the system call gate in the Interrupt Descriptor Table (IDT):
#ifdef CONFIG_X86_32
	set_system_intr_gate(IA32_SYSCALL_VECTOR, entry_INT80_32);
	set_bit(IA32_SYSCALL_VECTOR, used_vectors);
#endif
According to arch/x86/include/asm/irq_vectors.h, IA32_SYSCALL_VECTOR is 0x80.
So when int 0x80 executes, the hardware uses the vector number to locate the corresponding IDT entry (the interrupt descriptor) and performs the privilege check; since DPL = CPL = 3, the call is allowed. The hardware then switches to the kernel stack (tss.ss0 : tss.esp0). Next, using the segment selector in the interrupt descriptor, it finds the corresponding segment descriptor in the GDT / LDT, loads the selector into cs (the descriptor supplies the code segment base), and loads the handler offset into eip. Finally, the hardware pushes ss / sp / eflags / cs / ip / error code onto the kernel stack, in that order.
Execution thus starts at entry_INT80_32, defined in arch/x86/entry/entry_32.S:
ENTRY(entry_INT80_32)
	ASM_CLAC
	pushl	%eax			/* pt_regs->orig_ax */
	SAVE_ALL pt_regs_ax=$-ENOSYS	/* save rest */

	/*
	 * User mode is traced as though IRQs are on, and the interrupt gate
	 * turned them off.
	 */
	TRACE_IRQS_OFF

	movl	%esp, %eax
	call	do_int80_syscall_32
...
It pushes the system call number held in eax onto the stack, then uses SAVE_ALL to push and save the remaining registers:
.macro SAVE_ALL pt_regs_ax=%eax
	cld
	PUSH_GS
	pushl	%fs
	pushl	%es
	pushl	%ds
	pushl	\pt_regs_ax
	pushl	%ebp
	pushl	%edi
	pushl	%esi
	pushl	%edx
	pushl	%ecx
	pushl	%ebx
	movl	$(__USER_DS), %edx
	movl	%edx, %ds
	movl	%edx, %es
	movl	$(__KERNEL_PERCPU), %edx
	movl	%edx, %fs
	SET_KERNEL_GS %edx
.endm
After everything is saved, with interrupts off (the interrupt gate already disabled them; TRACE_IRQS_OFF just informs the tracer), the current stack pointer is copied into eax and do_int80_syscall_32 => do_syscall_32_irqs_on is called. The latter is defined in arch/x86/entry/common.c:
static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs)
{
	struct thread_info *ti = current_thread_info();
	unsigned int nr = (unsigned int)regs->orig_ax;

#ifdef CONFIG_IA32_EMULATION
	current->thread.status |= TS_COMPAT;
#endif

	if (READ_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY) {
		/*
		 * Subtlety here: if ptrace pokes something larger than
		 * 2^32-1 into orig_ax, this truncates it.  This may or
		 * may not be necessary, but it matches the old asm
		 * behavior.
		 */
		nr = syscall_trace_enter(regs);
	}

	if (likely(nr < IA32_NR_syscalls)) {
		/*
		 * It's possible that a 32-bit syscall implementation
		 * takes a 64-bit parameter but nonetheless assumes that
		 * the high bits are zero.  Make sure we zero-extend all
		 * of the args.
		 */
		regs->ax = ia32_sys_call_table[nr](
			(unsigned int)regs->bx, (unsigned int)regs->cx,
			(unsigned int)regs->dx, (unsigned int)regs->si,
			(unsigned int)regs->di, (unsigned int)regs->bp);
	}

	syscall_return_slowpath(regs);
}
The regs parameter (struct pt_regs, defined in arch/x86/include/asm/ptrace.h) is exactly the set of register values pushed onto the stack in entry_INT80_32. The function first fetches the system call number, looks up the corresponding handler in the system call table (ia32_sys_call_table), and invokes it with the arguments previously stored in the registers.
The table ia32_sys_call_table is defined in arch/x86/entry/syscall_32.c, but its contents look a little odd: the table entries appear to be #include'd:
/* System call table for i386. */
#include <linux/linkage.h>
#include <linux/sys.h>
#include <linux/cache.h>
#include <asm/asm-offsets.h>
#include <asm/syscall.h>

#define __SYSCALL_I386(nr, sym, qual) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ;
#include <asm/syscalls_32.h>
#undef __SYSCALL_I386

#define __SYSCALL_I386(nr, sym, qual) [nr] = sym,

extern asmlinkage long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long);

__visible const sys_call_ptr_t ia32_sys_call_table[__NR_syscall_compat_max+1] = {
	/*
	 * Smells like a compiler bug -- it doesn't work
	 * when the & below is removed.
	 */
	[0 ... __NR_syscall_compat_max] = &sys_ni_syscall,
#include <asm/syscalls_32.h>
};
Yet syscalls_32.h is nowhere to be found under arch/x86/include/asm in the source tree; after building the kernel, however, it shows up in arch/x86/include/generated/asm:
__SYSCALL_I386(0, sys_restart_syscall, )
__SYSCALL_I386(1, sys_exit, )
#ifdef CONFIG_X86_32
__SYSCALL_I386(2, sys_fork, )
#else
__SYSCALL_I386(2, sys_fork, )
#endif
__SYSCALL_I386(3, sys_read, )
__SYSCALL_I386(4, sys_write, )
#ifdef CONFIG_X86_32
__SYSCALL_I386(5, sys_open, )
#else
__SYSCALL_I386(5, compat_sys_open, )
...
So syscalls_32.h is generated at build time. See the script arch/x86/entry/syscalls/syscalltbl.sh: it reads syscall_32.tbl in the same directory and emits a __SYSCALL_${abi}($nr, $real_entry, $qualifier) line for every valid row. The __SYSCALL_I386 macro then turns these into the following definition:
__visible const sys_call_ptr_t ia32_sys_call_table[__NR_syscall_compat_max+1] = {
	[0 ... __NR_syscall_compat_max] = &sys_ni_syscall,
	[0] = sys_restart_syscall,
	[1] = sys_exit,
	[2] = sys_fork,
	[3] = sys_read,
	[4] = sys_write,
	[5] = sys_open,
	...
};
According to the GCC documentation, this initialization style (designated initializers) is defined in ISO C99, while the [first ... last] range form is a GCC extension; I like to call it out-of-order array initialization.
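As a quick illustration, a toy example of my own (not kernel code) using the same pattern in ordinary C:

#include <stdio.h>

/* Fill every slot with a default first, then override individual
   entries: exactly the trick used by ia32_sys_call_table. The
   [first ... last] range form is the GCC extension. */
static const char *names[8] = {
	[0 ... 7] = "ni_syscall",
	[3] = "read",
	[4] = "write",
	[5] = "open",
};

int main(void)
{
	printf("%s %s\n", names[5], names[6]);	/* prints: open ni_syscall */
	return 0;
}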
Since our call number is 0x05, sys_open is invoked; it is defined in fs/open.c:
SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, umode_t, mode)
{
	if (force_o_largefile())
		flags |= O_LARGEFILE;

	return do_sys_open(AT_FDCWD, filename, flags, mode);
}
The macro SYSCALL_DEFINE3 and its relatives are defined as follows:
#define SYSCALL_DEFINE3(name, ...) SYSCALL_DEFINEx(3, _##name, __VA_ARGS__)

#define SYSCALL_DEFINEx(x, sname, ...)				\
	SYSCALL_METADATA(sname, x, __VA_ARGS__)			\
	__SYSCALL_DEFINEx(x, sname, __VA_ARGS__)

#define __SYSCALL_DEFINEx(x, name, ...)					\
	asmlinkage long sys##name(__MAP(x,__SC_DECL,__VA_ARGS__))	\
		__attribute__((alias(__stringify(SyS##name))));		\
	static inline long SYSC##name(__MAP(x,__SC_DECL,__VA_ARGS__));	\
	asmlinkage long SyS##name(__MAP(x,__SC_LONG,__VA_ARGS__));	\
	asmlinkage long SyS##name(__MAP(x,__SC_LONG,__VA_ARGS__))	\
	{								\
		long ret = SYSC##name(__MAP(x,__SC_CAST,__VA_ARGS__));	\
		__MAP(x,__SC_TEST,__VA_ARGS__);				\
		__PROTECT(x, ret,__MAP(x,__SC_ARGS,__VA_ARGS__));	\
		return ret;						\
	}								\
	static inline long SYSC##name(__MAP(x,__SC_DECL,__VA_ARGS__))
SYSCALL_METADATA records basic information about the call for tracing tools (the kernel must be built with CONFIG_FTRACE_SYSCALLS).
__SYSCALL_DEFINEx pastes the pieces together: the function name becomes sys pasted with _open, i.e. sys_open, and the parameters are expanded by __SC_DECL, giving the final definition:
asmlinkage long sys_open(const char __user *filename, int flags, umode_t mode)
{
	if (force_o_largefile())
		flags |= O_LARGEFILE;

	return do_sys_open(AT_FDCWD, filename, flags, mode);
}
sys_open is a wrapper around do_sys_open:
long do_sys_open(int dfd, const char __user *filename, int flags, umode_t mode)
{
	struct open_flags op;
	int fd = build_open_flags(flags, mode, &op);
	struct filename *tmp;

	if (fd)
		return fd;

	tmp = getname(filename);
	if (IS_ERR(tmp))
		return PTR_ERR(tmp);

	fd = get_unused_fd_flags(flags);
	if (fd >= 0) {
		struct file *f = do_filp_open(dfd, tmp, &op);
		if (IS_ERR(f)) {
			put_unused_fd(fd);
			fd = PTR_ERR(f);
		} else {
			fsnotify_open(f);
			fd_install(fd, f);
		}
	}
	putname(tmp);
	return fd;
}
getname copies the filename from user space into kernel space; get_unused_fd_flags then picks an unused file descriptor; do_filp_open creates the struct file; and fd_install binds the fd to the struct file (conceptually, task_struct->files->fdt[fd] = file). Finally the fd is returned.
The fd propagates back up to do_syscall_32_irqs_on, where it is stored into regs->ax (eax). Execution then returns to entry_INT80_32 and finally runs INTERRUPT_RETURN, which arch/x86/include/asm/irqflags.h defines as iret; it restores the registers pushed earlier and returns to user mode. The system call is complete.
In today's mainstream system call library (glibc), int 0x80 is only used when the hardware does not support fast system calls (sysenter / syscall), and current hardware always does. So to observe int 0x80 in action, we have to hand-write the assembly:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main()
{
	char *filename = "/tmp/test";
	char *buffer = malloc(80);
	memset(buffer, 0, 80);
	int count;

	__asm__ __volatile__(
		"movl $0x5, %%eax\n\t"	/* eax: __NR_open */
		"movl %1, %%ebx\n\t"	/* ebx: filename */
		"movl $0, %%ecx\n\t"	/* ecx: flags = O_RDONLY */
		"movl $0664, %%edx\n\t"	/* edx: mode (octal) */
		"int $0x80\n\t"
		"movl %%eax, %%ebx\n\t"	/* fd returned in eax, becomes arg1 */
		"movl $0x3, %%eax\n\t"	/* eax: __NR_read */
		"movl %2, %%ecx\n\t"	/* ecx: buffer */
		"movl $80, %%edx\n\t"	/* edx: count */
		"int $0x80\n\t"
		"movl %%eax, %0\n\t"
		: "=m"(count)
		: "g"(filename), "g"(buffer)
		: "%eax", "%ebx", "%ecx", "%edx");

	printf("%d\n", count);
	printf("%s\n", buffer);
	free(buffer);
}
This code first issues the open system call via int 0x80, obtains the fd (returned in eax), and then passes it to read, reading out the file's contents. Oddly, if buffer lives on the stack (char buffer[80]), the read call fails; it only succeeds with buffer as a global or on the heap. If anyone knows why, please enlighten me; one possible workaround is sketched below.
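For what it's worth, here is an alternative sketch that lets GCC place each argument in the right register through constraints instead of mov'ing them around inside the asm body. My guess, and it is only a guess, is that the failure is an operand/clobber interaction: with "g", GCC is free to pick operand forms that the hand-written mov sequence then invalidates, and forcing each argument into its register sidesteps that whole class of problem. The helper name is mine and this is a sketch, not a verified fix:

/* Hypothetical helper: read(fd, buf, len) via int 0x80, 32-bit only. */
static int read_int80(int fd, char *buf, int len)
{
	int ret;
	__asm__ volatile ("int $0x80"
			  : "=a" (ret)	/* eax: return value */
			  : "0" (3),	/* eax: __NR_read = 3 */
			    "b" (fd),	/* ebx: arg1 */
			    "c" (buf),	/* ecx: arg2 */
			    "d" (len)	/* edx: arg3 */
			  : "memory");
	return ret;
}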
3. sysenter / sysexit
Next up is sysenter/sysexit, the fast system call Intel introduced for 32-bit; it is similar in spirit to AMD's contemporaneous syscall/sysret mechanism.
The new instructions were introduced because implementing system calls via software interrupts was simply too slow. Starting with the Pentium II (Family 6, Model 3, Stepping 3), Intel x86 CPUs support the new system call instructions sysenter/sysexit: the former switches from a lower privilege level to ring 0, the latter from ring 0 back to a lower privilege level. There are no privilege checks (CPL, DPL) and no stack pushes; speed is what matters most!
The Intel SDM describes the sysenter instruction. The CPU has a set of special registers called Model-Specific Registers (MSRs), which play an important role while the operating system runs; they are read and written with the dedicated RDMSR and WRMSR instructions.
sysenter uses the following MSRs (defined in arch/x86/include/asm/msr-index.h); a sketch for inspecting them on a live machine follows the list:
IA32_SYSENTER_CS (174H): holds the segment selector of the kernel-mode code segment
IA32_SYSENTER_ESP (175H): holds the kernel-mode stack pointer
IA32_SYSENTER_EIP (176H): holds the offset of the kernel-mode entry code
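To peek at these MSRs on a live machine, one option is the msr driver; a minimal sketch, assuming modprobe msr has been run and we have root privileges (the MSR address is from the table above):

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	/* The msr driver exposes one file per CPU and interprets the
	   file offset as the MSR address. */
	int fd = open("/dev/cpu/0/msr", O_RDONLY);
	if (fd < 0) { perror("open /dev/cpu/0/msr"); return 1; }

	uint64_t eip;
	if (pread(fd, &eip, sizeof(eip), 0x176) != sizeof(eip)) {	/* IA32_SYSENTER_EIP */
		perror("pread");
		return 1;
	}
	printf("IA32_SYSENTER_EIP = 0x%llx\n", (unsigned long long)eip);
	close(fd);
	return 0;
}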
When sysenter executes, it performs the following steps:
Clear the VM flag in EFLAGS, ensuring execution in protected mode
Clear the IF flag in EFLAGS, masking interrupts
Load the value of IA32_SYSENTER_ESP into esp
Load the value of IA32_SYSENTER_EIP into eip
Load the value of IA32_SYSENTER_CS into cs
Load IA32_SYSENTER_CS + 8 into ss (in the GDT, the ss descriptor sits right after cs)
Start executing the code at cs:eip
These MSRs are initialized in enable_sep_cpu in arch/x86/kernel/cpu/common.c:
void enable_sep_cpu(void)
{
	struct tss_struct *tss;
	int cpu;

	if (!boot_cpu_has(X86_FEATURE_SEP))
		return;

	cpu = get_cpu();
	tss = &per_cpu(cpu_tss, cpu);

	/*
	 * We cache MSR_IA32_SYSENTER_CS's value in the TSS's ss1 field --
	 * see the big comment in struct x86_hw_tss's definition.
	 */
	tss->x86_tss.ss1 = __KERNEL_CS;
	wrmsr(MSR_IA32_SYSENTER_CS, tss->x86_tss.ss1, 0);
	wrmsr(MSR_IA32_SYSENTER_ESP,
	      (unsigned long)tss + offsetofend(struct tss_struct, SYSENTER_stack),
	      0);
	wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long)entry_SYSENTER_32, 0);

	put_cpu();
}
Here __KERNEL_CS is written into MSR_IA32_SYSENTER_CS, the address of tss.SYSENTER_stack into MSR_IA32_SYSENTER_ESP, and finally the address of the kernel entry point entry_SYSENTER_32 into MSR_IA32_SYSENTER_EIP.
When a user program makes a system call, user space ultimately ends up calling __kernel_vsyscall, mapped in by the vDSO and defined in arch/x86/entry/vdso/vdso32/system_call.S:
__kernel_vsyscall:
	CFI_STARTPROC
	pushl	%ecx
	CFI_ADJUST_CFA_OFFSET	4
	CFI_REL_OFFSET		ecx, 0
	pushl	%edx
	CFI_ADJUST_CFA_OFFSET	4
	CFI_REL_OFFSET		edx, 0
	pushl	%ebp
	CFI_ADJUST_CFA_OFFSET	4
	CFI_REL_OFFSET		ebp, 0

	#define SYSENTER_SEQUENCE	"movl %esp, %ebp; sysenter"
	#define SYSCALL_SEQUENCE	"movl %ecx, %ebp; syscall"

#ifdef CONFIG_X86_64
	/* If SYSENTER (Intel) or SYSCALL32 (AMD) is available, use it. */
	ALTERNATIVE_2 "", SYSENTER_SEQUENCE, X86_FEATURE_SYSENTER32, \
		SYSCALL_SEQUENCE, X86_FEATURE_SYSCALL32
#else
	ALTERNATIVE "", SYSENTER_SEQUENCE, X86_FEATURE_SEP
#endif

	/* Enter using int $0x80 */
	int	$0x80
GLOBAL(int80_landing_pad)

	/*
	 * Restore EDX and ECX in case they were clobbered. EBP is not
	 * clobbered (the kernel restores it), but it's cleaner and
	 * probably faster to pop it than to adjust ESP using addl.
	 */
	popl	%ebp
	CFI_RESTORE		ebp
	CFI_ADJUST_CFA_OFFSET	-4
	popl	%edx
	CFI_RESTORE		edx
	CFI_ADJUST_CFA_OFFSET	-4
	popl	%ecx
	CFI_RESTORE		ecx
	CFI_ADJUST_CFA_OFFSET	-4
	ret
	CFI_ENDPROC

	.size __kernel_vsyscall,.-__kernel_vsyscall
	.previous
__kernel_vsyscall first pushes the current values of several registers, because they will be used shortly to pass system call arguments. It then fills in the arguments and executes sysenter.
The ALTERNATIVE_2 macro is making a choice: if X86_FEATURE_SYSENTER32 is supported (Intel CPUs), execute SYSENTER_SEQUENCE; if X86_FEATURE_SYSCALL32 is supported (AMD CPUs), execute SYSCALL_SEQUENCE. If neither is supported, do nothing at all and simply fall through to int $0x80, degrading to the legacy way of making system calls.
Note that the sysenter instruction clobbers esp, so SYSENTER_SEQUENCE first saves the current esp into ebp. sysenter clobbers eip too, but since the return address is fixed (the tail of __kernel_vsyscall), it does not need saving.
As mentioned above, after the sysenter instruction we are directly in kernel mode with the registers already set up: eip holds IA32_SYSENTER_EIP, i.e. the address of entry_SYSENTER_32, defined in arch/x86/entry/entry_32.S:
ENTRY(entry_SYSENTER_32)
	movl	TSS_sysenter_sp0(%esp), %esp
sysenter_past_esp:
	pushl	$__USER_DS		/* pt_regs->ss */
	pushl	%ebp			/* pt_regs->sp (stashed in bp) */
	pushfl				/* pt_regs->flags (except IF = 0) */
	orl	$X86_EFLAGS_IF, (%esp)	/* Fix IF */
	pushl	$__USER_CS		/* pt_regs->cs */
	pushl	$0			/* pt_regs->ip = 0 (placeholder) */
	pushl	%eax			/* pt_regs->orig_ax */
	SAVE_ALL pt_regs_ax=$-ENOSYS	/* save rest */

	testl	$X86_EFLAGS_NT|X86_EFLAGS_AC|X86_EFLAGS_TF, PT_EFLAGS(%esp)
	jnz	.Lsysenter_fix_flags
.Lsysenter_flags_fixed:

	/*
	 * User mode is traced as though IRQs are on, and SYSENTER
	 * turned them off.
	 */
	TRACE_IRQS_OFF

	movl	%esp, %eax
	call	do_fast_syscall_32
...

/* arch/x86/kernel/asm-offsets_32.c */
/* Offset from the sysenter stack to tss.sp0 */
DEFINE(TSS_sysenter_sp0, offsetof(struct cpu_entry_area, tss.x86_tss.sp0) -
       offsetofend(struct cpu_entry_area, entry_stack_page.stack));
As noted earlier, sysenter loads IA32_SYSENTER_ESP into esp, but that MSR holds the address of SYSENTER_stack; it must be adjusted via TSS_sysenter_sp0 so that esp points at the process's kernel stack.
The relevant registers are then pushed onto the stack in struct pt_regs layout, including the user-mode stack pointer stashed in ebp before sysenter. Since eip was not saved, a 0 is pushed as a placeholder.
Finally do_fast_syscall_32 is called, defined in arch/x86/entry/common.c:
/* Returns 0 to return using IRET or 1 to return using SYSEXIT/SYSRETL. */
__visible long do_fast_syscall_32(struct pt_regs *regs)
{
	/*
	 * Called using the internal vDSO SYSENTER/SYSCALL32 calling
	 * convention.  Adjust regs so it looks like we entered using int80.
	 */
	unsigned long landing_pad = (unsigned long)current->mm->context.vdso +
		vdso_image_32.sym_int80_landing_pad;

	/*
	 * SYSENTER loses EIP, and even SYSCALL32 needs us to skip forward
	 * so that 'regs->ip -= 2' lands back on an int $0x80 instruction.
	 * Fix it up.
	 */
	regs->ip = landing_pad;

	enter_from_user_mode();

	local_irq_enable();

	/* Fetch EBP from where the vDSO stashed it. */
	if (
#ifdef CONFIG_X86_64
		/*
		 * Micro-optimization: the pointer we're following is explicitly
		 * 32 bits, so it can't be out of range.
		 */
		__get_user(*(u32 *)&regs->bp,
			   (u32 __user __force *)(unsigned long)(u32)regs->sp)
#else
		get_user(*(u32 *)&regs->bp,
			 (u32 __user __force *)(unsigned long)(u32)regs->sp)
#endif
	   ) {
		/* User code screwed up. */
		local_irq_disable();
		regs->ax = -EFAULT;
		prepare_exit_to_usermode(regs);
		return 0;	/* Keep it simple: use IRET. */
	}

	/* Now this is just like a normal syscall. */
	do_syscall_32_irqs_on(regs);

#ifdef CONFIG_X86_64
	/*
	 * Opportunistic SYSRETL: if possible, try to return using SYSRETL.
	 * SYSRETL is available on all 64-bit CPUs, so we don't need to
	 * bother with SYSEXIT.
	 *
	 * Unlike 64-bit opportunistic SYSRET, we can't check that CX == IP,
	 * because the ECX fixup above will ensure that this is essentially
	 * never the case.
	 */
	return regs->cs == __USER32_CS && regs->ss == __USER_DS &&
		regs->ip == landing_pad &&
		(regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF)) == 0;
#else
	/*
	 * Opportunistic SYSEXIT: if possible, try to return using SYSEXIT.
	 *
	 * Unlike 64-bit opportunistic SYSRET, we can't check that CX == IP,
	 * because the ECX fixup above will ensure that this is essentially
	 * never the case.
	 *
	 * We don't allow syscalls at all from VM86 mode, but we still
	 * need to check VM, because we might be returning from sys_vm86.
	 */
	return static_cpu_has(X86_FEATURE_SEP) &&
		regs->cs == __USER_CS && regs->ss == __USER_DS &&
		regs->ip == landing_pad &&
		(regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF | X86_EFLAGS_VM)) == 0;
#endif
}
Since eip was not saved, we must compute the user-space address to return to after the system call: current->mm->context.vdso + vdso_image_32.sym_int80_landing_pad (i.e. the int80_landing_pad label near the tail of __kernel_vsyscall), and use it to overwrite the placeholder 0 pushed earlier.
From here the flow is the same as int 0x80: do_syscall_32_irqs_on looks up the corresponding handler in the system call table and calls it. When done, the function returns 1 if all the conditions for sysexit are met, 0 otherwise.
...
	call	do_fast_syscall_32
	/* XEN PV guests always use IRET path */
	ALTERNATIVE "testl %eax, %eax; jz .Lsyscall_32_done", \
		    "jmp .Lsyscall_32_done", X86_FEATURE_XENPV

/* Opportunistic SYSEXIT */
	TRACE_IRQS_ON			/* User mode traces as IRQs on. */
	movl	PT_EIP(%esp), %edx	/* pt_regs->ip */
	movl	PT_OLDESP(%esp), %ecx	/* pt_regs->sp */
1:	mov	PT_FS(%esp), %fs
	PTGS_TO_GS
	popl	%ebx			/* pt_regs->bx */
	addl	$2*4, %esp		/* skip pt_regs->cx and pt_regs->dx */
	popl	%esi			/* pt_regs->si */
	popl	%edi			/* pt_regs->di */
	popl	%ebp			/* pt_regs->bp */
	popl	%eax			/* pt_regs->ax */

	/*
	 * Restore all flags except IF. (We restore IF separately because
	 * STI gives a one-instruction window in which we won't be interrupted,
	 * whereas POPF does not.)
	 */
	addl	$PT_EFLAGS-PT_DS, %esp	/* point esp at pt_regs->flags */
	btr	$X86_EFLAGS_IF_BIT, (%esp)
	popfl

	/*
	 * Return back to the vDSO, which will pop ecx and edx.
	 * Don't bother with DS and ES (they already contain __USER_DS).
	 */
	sti
	sysexit
Per testl %eax, %eax; jz .Lsyscall_32_done: if do_fast_syscall_32 returned 0 (in eax), fast return is not possible, so we jump to .Lsyscall_32_done and return via iret. Otherwise the code falls through, loads the values saved on the kernel stack back into the relevant registers, and returns via sysexit.
Note that the saved eip goes into edx and the saved esp into ecx: according to the Intel SDM, sysexit sets eip from edx and esp from ecx, pointing back at the user-space code offset and stack pointer. It also loads IA32_SYSENTER_CS + 16 into cs and IA32_SYSENTER_CS + 24 into ss. We thus land back at the tail of __kernel_vsyscall in user mode.
Let's verify this by running a C program under gdb:
#include <unistd.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>

int main(int argc, char *argv[])
{
	char buffer[80] = "/tmp/test";
	int fd = open(buffer, O_RDONLY);
	int size = read(fd, buffer, sizeof(buffer));
	close(fd);
}

$ gcc -m32 -g -static -o read read.c
$ file read
read: ELF 32-bit LSB executable, Intel 80386, version 1 (GNU/Linux), statically linked, for GNU/Linux 2.6.32, BuildID[sha1]=8a7f3d69d3e4c9582551934b0617ad78e492e48c, not stripped

(gdb) disas
0x0804888a <+14>: push %ecx
0x0804888b <+15>: sub $0x70,%esp
0x0804888e <+18>: mov %ecx,%eax
0x08048890 <+20>: mov 0x4(%eax),%eax
0x08048893 <+23>: mov %eax,-0x6c(%ebp)
0x08048896 <+26>: mov %gs:0x14,%eax
0x0804889c <+32>: mov %eax,-0xc(%ebp)
0x0804889f <+35>: xor %eax,%eax
0x080488a1 <+37>: movl $0x706d742f,-0x5c(%ebp)
0x080488a8 <+44>: movl $0x7365742f,-0x58(%ebp)
0x080488af <+51>: movl $0x74,-0x54(%ebp)
0x080488b6 <+58>: lea -0x50(%ebp),%edx
0x080488b9 <+61>: mov $0x0,%eax
0x080488be <+66>: mov $0x11,%ecx
0x080488c3 <+71>: mov %edx,%edi
0x080488c5 <+73>: rep stos %eax,%es:(%edi)
0x080488c7 <+75>: sub $0x8,%esp
0x080488ca <+78>: push $0x0
0x080488cc <+80>: lea -0x5c(%ebp),%eax
0x080488cf <+83>: push %eax
0x080488d0 <+84>: call 0x806cf30 <open>
0x080488d5 <+89>: add $0x10,%esp
0x080488d8 <+92>: mov %eax,-0x64(%ebp)
0x080488db <+95>: sub $0x4,%esp
0x080488de <+98>: push $0x50
0x080488e0 <+100>: lea -0x5c(%ebp),%eax
0x080488e3 <+103>: push %eax
0x080488e4 <+104>: pushl -0x64(%ebp)
0x080488e7 <+107>: call 0x806cfa0 <read>
0x080488ec <+112>: add $0x10,%esp
0x080488ef <+115>: mov %eax,-0x60(%ebp)
=> 0x080488f2 <+118>: sub $0xc,%esp
0x080488f5 <+121>: pushl -0x64(%ebp)
0x080488f8 <+124>: call 0x806d150 <close>
0x080488fd <+129>: add $0x10,%esp
0x08048900 <+132>: mov $0x0,%eax
0x08048905 <+137>: mov -0xc(%ebp),%edx
0x08048908 <+140>: xor %gs:0x14,%edx
0x0804890f <+147>: je 0x8048916 <main+154>
0x08048911 <+149>: call 0x806ef90 <__stack_chk_fail>
0x08048916 <+154>: lea -0x8(%ebp),%esp
0x08048919 <+157>: pop %ecx
0x0804891a <+158>: pop %edi
0x0804891b <+159>: pop %ebp
0x0804891c <+160>: lea -0x4(%ecx),%esp
0x0804891f <+163>: ret
End of assembler dump.
First comes open: the flags argument O_RDONLY (per #define O_RDONLY 0, the value is 0x0) and the address of buffer (in eax) are pushed, then glibc's open function is called. Disassembling it:
(gdb) disas 0x806cf30
Dump of assembler code for function open:
0x0806cf30 <+0>: cmpl $0x0,%gs:0xc
0x0806cf38 <+8>: jne 0x806cf5f
0x0806cf3a <+10>: push %ebx
0x0806cf3b <+11>: mov 0x10(%esp),%edx
0x0806cf3f <+15>: mov 0xc(%esp),%ecx
0x0806cf43 <+19>: mov 0x8(%esp),%ebx
0x0806cf47 <+23>: mov $0x5,%eax
0x0806cf4c <+28>: call *0x80ea9f0
0x0806cf52 <+34>: pop %ebx
0x0806cf53 <+35>: cmp $0xfffff001,%eax
0x0806cf58 <+40>: jae 0x8070590 <__syscall_error>
0x0806cf5e <+46>: ret
0x0806cf5f <+47>: call 0x806ea80 <__libc_enable_asynccancel>
0x0806cf64 <+52>: push %eax
0x0806cf65 <+53>: push %ebx
0x0806cf66 <+54>: mov 0x14(%esp),%edx
0x0806cf6a <+58>: mov 0x10(%esp),%ecx
0x0806cf6e <+62>: mov 0xc(%esp),%ebx
0x0806cf72 <+66>: mov $0x5,%eax
0x0806cf77 <+71>: call *0x80ea9f0
0x0806cf7d <+77>: pop %ebx
0x0806cf7e <+78>: xchg %eax,(%esp)
0x0806cf81 <+81>: call 0x806eaf0 <__libc_disable_asynccancel>
0x0806cf86 <+86>: pop %eax
0x0806cf87 <+87>: cmp $0xfffff001,%eax
0x0806cf8c <+92>: jae 0x8070590 <__syscall_error>
0x0806cf92 <+98>: ret
End of assembler dump.
The on-stack arguments are moved into registers, then the code calls through 0x80ea9f0. Examining that address with x:
(gdb) x 0x80ea9f0
0x80ea9f0 <_dl_sysinfo>:	0xf7ffcc80
Disassembling it, we arrive at __kernel_vsyscall, which executes the sysenter instruction:
(gdb) disas 0xf7ffcc80
Dump of assembler code for function __kernel_vsyscall:
0xf7ffcc80 <+0>: push %ecx
0xf7ffcc81 <+1>: push %edx
0xf7ffcc82 <+2>: push %ebp
0xf7ffcc83 <+3>: mov %esp,%ebp
0xf7ffcc85 <+5>: sysenter
0xf7ffcc87 <+7>: int $0x80
0xf7ffcc89 <+9>: pop %ebp
0xf7ffcc8a <+10>: pop %edx
0xf7ffcc8b <+11>: pop %ecx
0xf7ffcc8c <+12>: ret
End of assembler dump.
read works the same way, except that it takes three arguments and therefore needs three pushes.
4. syscall / sysret
As mentioned earlier, Intel and AMD diverged on the 32-bit fast system call instruction: one uses sysenter, the other syscall. Why, then, did 64-bit unify on syscall?
I have not found an authoritative answer online, only hearsay: see 为什么IA-64指令集架构失败了? (Why did the IA-64 ISA fail?)
When developing 64-bit architectures, Intel and AMD took different paths. Intel built an entirely new architecture named Itanium (IA-64), claimed to outperform x86 handily; users would upgrade their hardware for the better performance, a win-win, right? Commercially, it failed: IA-64 raised performance but was not backward compatible, so programs that ran on x86 would not run on the new architecture, and users were very angry. AMD took the pragmatic route and built x86_64, compatible with x86 and able to run 32-bit programs. AMD got its day in the sun, and Intel was eventually forced to adopt the x86_64 architecture in turn, which meant supporting the syscall instruction defined in AMD's standard.
This time we start directly in gdb, with the same code as before but compiled as 64-bit:
(gdb) disas
Dump of assembler code for function main:
0x00000000004009ae <+0>: push %rbp
0x00000000004009af <+1>: mov %rsp,%rbp
0x00000000004009b2 <+4>: add $0xffffffffffffff80,%rsp
0x00000000004009b6 <+8>: mov %edi,-0x74(%rbp)
0x00000000004009b9 <+11>: mov %rsi,-0x80(%rbp)
0x00000000004009bd <+15>: mov %fs:0x28,%rax
0x00000000004009c6 <+24>: mov %rax,-0x8(%rbp)
0x00000000004009ca <+28>: xor %eax,%eax
0x00000000004009cc <+30>: movabs $0x7365742f706d742f,%rax
0x00000000004009d6 <+40>: mov %rax,-0x60(%rbp)
0x00000000004009da <+44>: movq $0x74,-0x58(%rbp)
0x00000000004009e2 <+52>: lea -0x50(%rbp),%rdx
0x00000000004009e6 <+56>: mov $0x0,%eax
0x00000000004009eb <+61>: mov $0x8,%ecx
0x00000000004009f0 <+66>: mov %rdx,%rdi
0x00000000004009f3 <+69>: rep stos %rax,%es:(%rdi)
0x00000000004009f6 <+72>: lea -0x60(%rbp),%rax
0x00000000004009fa <+76>: mov $0x0,%esi
0x00000000004009ff <+81>: mov %rax,%rdi
0x0000000000400a02 <+84>: mov $0x0,%eax
0x0000000000400a07 <+89>: callq 0x43e650 <open64>
0x0000000000400a0c <+94>: mov %eax,-0x68(%rbp)
0x0000000000400a0f <+97>: lea -0x60(%rbp),%rcx
0x0000000000400a13 <+101>: mov -0x68(%rbp),%eax
0x0000000000400a16 <+104>: mov $0x50,%edx
0x0000000000400a1b <+109>: mov %rcx,%rsi
0x0000000000400a1e <+112>: mov %eax,%edi
0x0000000000400a20 <+114>: callq 0x43e6b0 <read>
0x0000000000400a25 <+119>: mov %eax,-0x64(%rbp)
=> 0x0000000000400a28 <+122>: mov -0x68(%rbp),%eax
0x0000000000400a2b <+125>: mov %eax,%edi
0x0000000000400a2d <+127>: callq 0x43e900 <close>
0x0000000000400a32 <+132>: mov $0x0,%eax
0x0000000000400a37 <+137>: mov -0x8(%rbp),%rdx
0x0000000000400a3b <+141>: xor %fs:0x28,%rdx
0x0000000000400a44 <+150>: je 0x400a4b <main+157>
0x0000000000400a46 <+152>: callq 0x442010 <__stack_chk_fail>
0x0000000000400a4b <+157>: leaveq
0x0000000000400a4c <+158>: retq
End of assembler dump.

(gdb) disas 0x43e650
Dump of assembler code for function open64:
0x000000000043e650 <+0>: cmpl $0x0,0x28db65(%rip)        # 0x6cc1bc <__libc_multiple_threads>
0x000000000043e657 <+7>: jne 0x43e66d <open64+29>
0x000000000043e659 <+9>: mov $0x2,%eax
0x000000000043e65e <+14>: syscall
0x000000000043e660 <+16>: cmp $0xfffffffffffff001,%rax
0x000000000043e666 <+22>: jae 0x4436b0 <__syscall_error>
0x000000000043e66c <+28>: retq
0x000000000043e66d <+29>: sub $0x8,%rsp
0x000000000043e671 <+33>: callq 0x441b70 <__libc_enable_asynccancel>
0x000000000043e676 <+38>: mov %rax,(%rsp)
0x000000000043e67a <+42>: mov $0x2,%eax
0x000000000043e67f <+47>: syscall
0x000000000043e681 <+49>: mov (%rsp),%rdi
0x000000000043e685 <+53>: mov %rax,%rdx
0x000000000043e688 <+56>: callq 0x441bd0 <__libc_disable_asynccancel>
0x000000000043e68d <+61>: mov %rdx,%rax
0x000000000043e690 <+64>: add $0x8,%rsp
0x000000000043e694 <+68>: cmp $0xfffffffffffff001,%rax
0x000000000043e69a <+74>: jae 0x4436b0 <__syscall_error>
0x000000000043e6a0 <+80>: retq
End of assembler dump.
open64 is defined in glibc's sysdeps/posix/open64.c:
#include <fcntl.h>
#include <stdarg.h>
#include <sysdep-cancel.h>

/* Open FILE with access OFLAG.  If O_CREAT or O_TMPFILE is in OFLAG,
   a third argument is the file protection.  */
int
__libc_open64 (const char *file, int oflag, ...)
{
  int mode = 0;

  if (__OPEN_NEEDS_MODE (oflag))
    {
      va_list arg;
      va_start (arg, oflag);
      mode = va_arg (arg, int);
      va_end (arg);
    }

  if (SINGLE_THREAD_P)
    return __libc_open (file, oflag | O_LARGEFILE, mode);

  int oldtype = LIBC_CANCEL_ASYNC ();

  int result = __libc_open (file, oflag | O_LARGEFILE, mode);

  LIBC_CANCEL_RESET (oldtype);

  return result;
}

weak_alias (__libc_open64, __open64)
libc_hidden_weak (__open64)
weak_alias (__libc_open64, open64)
Next, __libc_open, defined in unix/sysv/linux/generic/open.c:
#include <errno.h>
#include <fcntl.h>
#include <stdarg.h>
#include <stdio.h>
#include <sysdep-cancel.h>

/* Open FILE with access OFLAG.  If O_CREAT or O_TMPFILE is in OFLAG,
   a third argument is the file protection.  */
int
__libc_open (const char *file, int oflag, ...)
{
  int mode = 0;

  if (__OPEN_NEEDS_MODE (oflag))
    {
      va_list arg;
      va_start (arg, oflag);
      mode = va_arg (arg, int);
      va_end (arg);
    }

  return SYSCALL_CANCEL (openat, AT_FDCWD, file, oflag, mode);
}
Expanding the macro step by step:
SYSCALL_CANCEL(openat, AT_FDCWD, file, oflag, mode)
=> __SYSCALL_CALL(openat, AT_FDCWD, file, oflag, mode)
=> __SYSCALL_DISP(__SYSCALL, openat, AT_FDCWD, file, oflag, mode)
=> __SYSCALL_CONCAT(__SYSCALL, 4)(openat, AT_FDCWD, file, oflag, mode)
=> __SYSCALL_CONCAT_X(__SYSCALL, 4)(openat, AT_FDCWD, file, oflag, mode)
=> __SYSCALL4(openat, AT_FDCWD, file, oflag, mode)
=> INLINE_SYSCALL (openat, 4, AT_FDCWD, file, oflag, mode)
=> INTERNAL_SYSCALL (openat, _, 4, AT_FDCWD, file, oflag, mode)
=> INTERNAL_SYSCALL_NCS (__NR_openat, _, 4, AT_FDCWD, file, oflag, mode)
We finally arrive at INTERNAL_SYSCALL_NCS:
# define INTERNAL_SYSCALL_NCS(name, err, nr, args...)			\
({									\
	unsigned long int resultvar;					\
	LOAD_ARGS_##nr (args)						\
	LOAD_REGS_##nr							\
	asm volatile (							\
	"syscall\n\t"							\
	: "=a" (resultvar)						\
	: "0" (name) ASM_ARGS_##nr : "memory", REGISTERS_CLOBBERED_BY_SYSCALL); \
	(long int) resultvar; })
LOAD_ARGS_##nr expands the arguments, and LOAD_REGS_##nr moves them into the appropriate registers, since syscall passes arguments in registers. Finally the syscall instruction is executed.
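Stripped of the macro machinery, what INTERNAL_SYSCALL_NCS emits boils down to something like this hand-rolled sketch (x86_64 only; note that syscall itself clobbers rcx and r11, which is why they appear in the clobber list):

#include <unistd.h>

int main(void)
{
	static const char msg[] = "hello from a raw syscall\n";
	long ret;

	__asm__ volatile ("syscall"
			  : "=a" (ret)
			  : "0" (1L),			/* rax: __NR_write = 1 */
			    "D" (1L),			/* rdi: fd = stdout */
			    "S" (msg),			/* rsi: buffer */
			    "d" (sizeof(msg) - 1)	/* rdx: count */
			  : "rcx", "r11", "memory");
	return ret < 0;
}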
According to the Intel SDM, syscall saves the current rip into rcx, then loads IA32_LSTAR into rip. It also loads IA32_STAR[47:32] into cs and IA32_STAR[47:32] + 8 into ss (in the GDT, the ss descriptor sits right after cs).
The MSRs IA32_LSTAR (MSR_LSTAR) and IA32_STAR (MSR_STAR) are initialized in syscall_init in arch/x86/kernel/cpu/common.c:
void syscall_init(void)
{
	wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS);
	wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64);

#ifdef CONFIG_IA32_EMULATION
	wrmsrl(MSR_CSTAR, (unsigned long)entry_SYSCALL_compat);
	/*
	 * This only works on Intel CPUs.
	 * On AMD CPUs these MSRs are 32-bit, CPU truncates MSR_IA32_SYSENTER_EIP.
	 * This does not cause SYSENTER to jump to the wrong location, because
	 * AMD doesn't allow SYSENTER in long mode (either 32- or 64-bit).
	 */
	wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS);
	wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL);
	wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat);
#else
	wrmsrl(MSR_CSTAR, (unsigned long)ignore_sysret);
	wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)GDT_ENTRY_INVALID_SEG);
	wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL);
	wrmsrl_safe(MSR_IA32_SYSENTER_EIP, 0ULL);
#endif

	/* Flags to clear on syscall */
	wrmsrl(MSR_SYSCALL_MASK,
	       X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF|
	       X86_EFLAGS_IOPL|X86_EFLAGS_AC|X86_EFLAGS_NT);
}
We can see that bits 32-47 of MSR_STAR are set to the kernel-mode cs and bits 48-63 to the user-mode cs, while IA32_LSTAR is set to the start address of entry_SYSCALL_64.
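To make the layout concrete, a small sketch that reproduces the arithmetic; the selector values are the usual x86_64 ones from arch/x86/include/asm/segment.h, and wrmsr takes low/high 32-bit halves, so the second argument above lands in bits 63:32 of the MSR:

#include <stdio.h>
#include <stdint.h>

#define __KERNEL_CS  0x10
#define __USER32_CS  0x23

int main(void)
{
	/* The high dword written by syscall_init, shifted into place. */
	uint64_t star = ((uint64_t)((__USER32_CS << 16) | __KERNEL_CS)) << 32;

	printf("IA32_STAR[47:32] (cs on syscall)        = 0x%llx\n",
	       (unsigned long long)((star >> 32) & 0xffff));
	printf("IA32_STAR[63:48] (cs/ss base on sysret) = 0x%llx\n",
	       (unsigned long long)((star >> 48) & 0xffff));
	return 0;
}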
So on syscall, execution jumps to entry_SYSCALL_64, defined in arch/x86/entry/entry_64.S:
ENTRY(entry_SYSCALL_64)
	/*
	 * Interrupts are off on entry.
	 * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON,
	 * it is too small to ever cause noticeable irq latency.
	 */
	SWAPGS_UNSAFE_STACK
	/* KAISER: entering the kernel requires switching to the kernel page tables */
	SWITCH_KERNEL_CR3_NO_STACK
	/*
	 * A hypervisor implementation might want to use a label
	 * after the swapgs, so that it can do the swapgs
	 * for the guest and jump here on syscall.
	 */
GLOBAL(entry_SYSCALL_64_after_swapgs)
	/* save the user stack pointer in the per-cpu variable rsp_scratch */
	movq	%rsp, PER_CPU_VAR(rsp_scratch)
	/* load the kernel stack pointer */
	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rsp

	TRACE_IRQS_OFF

	/* Construct struct pt_regs on stack */
	pushq	$__USER_DS			/* pt_regs->ss */
	pushq	PER_CPU_VAR(rsp_scratch)	/* pt_regs->sp */
	pushq	%r11				/* pt_regs->flags */
	pushq	$__USER_CS			/* pt_regs->cs */
	pushq	%rcx				/* pt_regs->ip */
	pushq	%rax				/* pt_regs->orig_ax */
	pushq	%rdi				/* pt_regs->di */
	pushq	%rsi				/* pt_regs->si */
	pushq	%rdx				/* pt_regs->dx */
	pushq	%rcx				/* pt_regs->cx */
	pushq	$-ENOSYS			/* pt_regs->ax */
	pushq	%r8				/* pt_regs->r8 */
	pushq	%r9				/* pt_regs->r9 */
	pushq	%r10				/* pt_regs->r10 */
	pushq	%r11				/* pt_regs->r11 */
	/* reserve room for r12-r15, rbp, rbx */
	sub	$(6*8), %rsp			/* pt_regs->bp, bx, r12-15 not saved */

	/*
	 * If we need to do entry work or if we guess we'll need to do
	 * exit work, go straight to the slow path.
	 */
	movq	PER_CPU_VAR(current_task), %r11
	testl	$_TIF_WORK_SYSCALL_ENTRY|_TIF_ALLWORK_MASK, TASK_TI_flags(%r11)
	jnz	entry_SYSCALL64_slow_path

entry_SYSCALL_64_fastpath:
	/*
	 * Easy case: enable interrupts and issue the syscall. If the syscall
	 * needs pt_regs, we'll call a stub that disables interrupts again
	 * and jumps to the slow path.
	 */
	TRACE_IRQS_ON
	ENABLE_INTERRUPTS(CLBR_NONE)
#if __SYSCALL_MASK == ~0
	/* make sure the syscall number does not exceed the maximum; if it does, jump to label 1 below and return */
	cmpq	$__NR_syscall_max, %rax
#else
	andl	$__SYSCALL_MASK, %eax
	cmpl	$__NR_syscall_max, %eax
#endif
	ja	1f				/* return -ENOSYS (already in pt_regs->ax) */
	/* ordinary C functions take their fourth argument in rcx, so move r10 into rcx */
	movq	%r10, %rcx

	/*
	 * This call instruction is handled specially in stub_ptregs_64.
	 * It might end up jumping to the slow path. If it jumps, RAX
	 * and all argument registers are clobbered.
	 */
	/* call the corresponding handler in the system call table */
	call	*sys_call_table(, %rax, 8)
.Lentry_SYSCALL_64_after_fastpath_call:

	/* store the return value into the saved pt_regs; it is restored on return */
	movq	%rax, RAX(%rsp)
1:

	/*
	 * If we get here, then we know that pt_regs is clean for SYSRET64.
	 * If we see that no exit work is required (which we are required
	 * to check with IRQs off), then we can go straight to SYSRET64.
	 */
	DISABLE_INTERRUPTS(CLBR_NONE)
	TRACE_IRQS_OFF
	movq	PER_CPU_VAR(current_task), %r11
	testl	$_TIF_ALLWORK_MASK, TASK_TI_flags(%r11)
	jnz	1f

	LOCKDEP_SYS_EXIT
	TRACE_IRQS_ON		/* user mode is traced as IRQs on */
	movq	RIP(%rsp), %rcx
	movq	EFLAGS(%rsp), %r11
	RESTORE_C_REGS_EXCEPT_RCX_R11
	/*
	 * This opens a window where we have a user CR3, but are
	 * running in the kernel. This makes using the CS
	 * register useless for telling whether or not we need to
	 * switch CR3 in NMIs. Normal interrupts are OK because
	 * they are off here.
	 */
	SWITCH_USER_CR3
	movq	RSP(%rsp), %rsp
	USERGS_SYSRET64

1:
	/*
	 * The fast path looked good when we started, but something changed
	 * along the way and we need to switch to the slow path. Calling
	 * raise(3) will trigger this, for example. IRQs are off.
	 */
	TRACE_IRQS_ON
	ENABLE_INTERRUPTS(CLBR_NONE)
	SAVE_EXTRA_REGS
	movq	%rsp, %rdi
	call	syscall_return_slowpath	/* returns with IRQs disabled */
	jmp	return_from_SYSCALL_64

entry_SYSCALL64_slow_path:
	/* IRQs are off. */
	SAVE_EXTRA_REGS
	movq	%rsp, %rdi
	call	do_syscall_64		/* returns with IRQs disabled */

return_from_SYSCALL_64:
	RESTORE_EXTRA_REGS
	TRACE_IRQS_IRETQ		/* we're about to change IF */

	/*
	 * Try to use SYSRET instead of IRET if we're returning to
	 * a completely clean 64-bit userspace context.
	 */
	movq	RCX(%rsp), %rcx
	movq	RIP(%rsp), %r11
	cmpq	%rcx, %r11			/* RCX == RIP */
	jne	opportunistic_sysret_failed

	/*
	 * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP
	 * in kernel space. This essentially lets the user take over
	 * the kernel, since userspace controls RSP.
	 *
	 * If width of "canonical tail" ever becomes variable, this will need
	 * to be updated to remain correct on both old and new CPUs.
	 */
	.ifne __VIRTUAL_MASK_SHIFT - 47
	.error "virtual address width changed -- SYSRET checks need update"
	.endif

	/* Change top 16 bits to be the sign-extension of 47th bit */
	shl	$(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx
	sar	$(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx

	/* If this changed %rcx, it was not canonical */
	cmpq	%rcx, %r11
	jne	opportunistic_sysret_failed

	cmpq	$__USER_CS, CS(%rsp)		/* CS must match SYSRET */
	jne	opportunistic_sysret_failed

	movq	R11(%rsp), %r11
	cmpq	%r11, EFLAGS(%rsp)		/* R11 == RFLAGS */
	jne	opportunistic_sysret_failed

	/*
	 * SYSCALL clears RF when it saves RFLAGS in R11 and SYSRET cannot
	 * restore RF properly. If the slowpath sets it for whatever reason, we
	 * need to restore it correctly.
	 *
	 * SYSRET can restore TF, but unlike IRET, restoring TF results in a
	 * trap from userspace immediately after SYSRET. This would cause an
	 * infinite loop whenever #DB happens with register state that satisfies
	 * the opportunistic SYSRET conditions. For example, single-stepping
	 * this user code:
	 *
	 *           movq	$stuck_here, %rcx
	 *           pushfq
	 *           popq %r11
	 *   stuck_here:
	 *
	 * would never get past 'stuck_here'.
	 */
	testq	$(X86_EFLAGS_RF|X86_EFLAGS_TF), %r11
	jnz	opportunistic_sysret_failed

	/* nothing to check for RSP */

	cmpq	$__USER_DS, SS(%rsp)		/* SS must match SYSRET */
	jne	opportunistic_sysret_failed

	/*
	 * We win! This label is here just for ease of understanding
	 * perf profiles. Nothing jumps here.
	 */
syscall_return_via_sysret:
	/* rcx and r11 are already restored (see code above) */
	RESTORE_C_REGS_EXCEPT_RCX_R11
	/*
	 * This opens a window where we have a user CR3, but are
	 * running in the kernel. This makes using the CS
	 * register useless for telling whether or not we need to
	 * switch CR3 in NMIs. Normal interrupts are OK because
	 * they are off here.
	 */
	/* KAISER: switch back to the user page tables before returning to user mode */
	SWITCH_USER_CR3
	/* restore rsp to the user stack top from the value pushed earlier */
	movq	RSP(%rsp), %rsp
	USERGS_SYSRET64

/* fast return impossible; fall back to iret */
opportunistic_sysret_failed:
	/*
	 * This opens a window where we have a user CR3, but are
	 * running in the kernel. This makes using the CS
	 * register useless for telling whether or not we need to
	 * switch CR3 in NMIs. Normal interrupts are OK because
	 * they are off here.
	 */
	SWITCH_USER_CR3
	SWAPGS
	jmp	restore_c_regs_and_iret
END(entry_SYSCALL_64)
Note that syscall does not save the stack pointer, so the handler first saves the user-mode rsp into the per-cpu variable rsp_scratch, then loads the per-cpu variable cpu_current_top_of_stack, the kernel stack top, into rsp.
It then pushes the register values onto the kernel stack, namely:
rax system call number
rcx return address
r11 saved rflags (note: r11 is callee-clobbered register in C ABI)
rdi arg0
rsi arg1
rdx arg2
r10 arg3 (needs to be moved to rcx to conform to C ABI)
r8 arg4
r9 arg5
Next, the system call number indexes the system call table (sys_call_table) to find the corresponding handler, e.g. sys_open, which is then called. The 64-bit system call table is defined in arch/x86/entry/syscalls/syscall_64.tbl, and its ABI differs from the 32-bit one.
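A quick way to see the ABI difference is to print the numbers exposed by glibc's <sys/syscall.h> and build the same file twice, once with -m64 and once with -m32; the values below match the two .tbl files:

#include <stdio.h>
#include <sys/syscall.h>

int main(void)
{
	/* x86_64: read = 0, open = 2; i386: read = 3, open = 5 */
	printf("SYS_read = %d, SYS_open = %d\n", SYS_read, SYS_open);
	return 0;
}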
If everything goes smoothly, we eventually return via USERGS_SYSRET64, i.e. sysretq.
5. Summary
This article analyzed the three system call mechanisms on Linux: int 0x80, sysenter, and syscall.
The traditional system call (int 0x80) is implemented via the interrupt/exception machinery: executing int triggers a trap. The hardware locates the entry in the interrupt descriptor table, automatically switches to the kernel stack (tss.ss0 : tss.esp0), finds the segment descriptor in the GDT / LDT via the descriptor's segment selector, loads the selector (whose descriptor supplies the code segment base) into cs and the offset into eip, and finally pushes ss / sp / eflags / cs / ip / error code onto the kernel stack in order. On return, iret pops the previously pushed ss / sp / eflags / cs / ip, restoring the user-mode register context at the call site.
sysenter and syscall are new instructions introduced to speed up system calls. They rely on new MSRs that hold the segment selectors and offsets of the kernel-mode code and stack, enabling a fast switch:
On sysenter, IA32_SYSENTER_CS is loaded into cs and IA32_SYSENTER_CS + 8 into ss, IA32_SYSENTER_EIP into eip, and IA32_SYSENTER_ESP into esp, switching wholesale to kernel mode. On return, sysexit loads IA32_SYSENTER_CS + 16 into cs and IA32_SYSENTER_CS + 24 into ss, while eip and esp are loaded from edx and ecx; so before returning, the saved user-mode eip (computed as the vDSO landing pad) and esp (stashed in ebp by user mode before the call) must be placed into those two registers.
On syscall, rip is automatically saved into rcx, then IA32_LSTAR is loaded into rip, IA32_STAR[47:32] into cs, and IA32_STAR[47:32] + 8 into ss. Switching the stack pointer is deferred to the kernel entry point entry_SYSCALL_64, which saves the user-mode rsp into the per-cpu variable rsp_scratch and loads the per-cpu variable cpu_current_top_of_stack, the kernel stack top, into rsp. On return, sysret loads IA32_STAR[63:48] + 16 into cs and IA32_STAR[63:48] + 8 into ss, while rip is loaded from rcx; so before returning, the saved user-mode rip must be placed into rcx, and rsp is first restored to the user-mode stack top from the value pushed earlier.
There are surely omissions and misunderstandings in this article; corrections in the comments are most welcome and much appreciated.
References:
https://0xax.gitbooks.io/linux-insides/content/SysCall/
https://blog.packagecloud.io/eng/2016/04/05/the-definitive-guide-to-linux-system-calls/
http://www.ibm.com/developerworks/cn/linux/kernel/l-k26ncpu/index.html
https://lwn.net/Articles/604287/
https://lwn.net/Articles/604515/