一、系统调用在Linux Kernel中的map表(系统调用的数组)
在sys.c中定义了__SYSCALL宏
/* kernel-4.19/arch/arm64/kernel/sys.c */
/*
 * Expands one syscall entry into a prototype for the function named
 * __arm64_<sym>, which returns long and takes a const struct pt_regs *.
 * The nr argument is not used by this expansion. The trailing ';' is part
 * of the macro, so entries are written without one.
 */
#define __SYSCALL(nr, sym)	asmlinkage long __arm64_##sym(const struct pt_regs *);
例如:
__SYSCALL(__NR_flock, sys_flock),展开后其实就是声明(而不是定义)__arm64_sys_flock函数
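__SC_COMP(__NR_ioctl,sys_ioctl,compat_sys_ioctl),在arm64原生(64位)系统调用表中,__SC_COMP(nr, sys, comp)等价于__SYSCALL(nr, sys)(只有定义了__SYSCALL_COMPAT时才会选用compat版本),因此展开后其实就是声明__arm64_sys_ioctl函数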
在sys.c中定义并初始化了系统调用的tab表
(kernel-4.19/arch/arm64/kernel/sys.c)#undef __SYSCALL#define __SYSCALL(nr, sym) [nr] = __arm64_##sym,const syscall_fn_t sys_call_table[__NR_syscalls] = { [0 ... __NR_syscalls - 1] = __arm64_sys_ni_syscall,#include<asm/unistd.h> };
剖析这段代码,将asm/unistd.h引进来了,其实等价于下面这句
(kernel-4.19/arch/arm64/kernel/sys.c)const syscall_fn_t sys_call_table[__NR_syscalls] = { [0 ... __NR_syscalls - 1] = __arm64_sys_ni_syscall,__arm64_compat_sys_io_setup,__arm64_sys_io_destroy,__arm64_compat_sys_io_submit...... };
二、系统调用的函数在Kernel中的实现
SYSCALL_DEFINE1(arm64_personality, unsignedint, personality){if (personality(personality) == PER_LINUX32 && !system_supports_32bit_el0())return -EINVAL;return ksys_personality(personality);}#define SYSCALL_DEFINE1(name, ...) SYSCALL_DEFINEx(1, _##name, __VA_ARGS__)#define SYSCALL_DEFINE2(name, ...) SYSCALL_DEFINEx(2, _##name, __VA_ARGS__)#define SYSCALL_DEFINE3(name, ...) SYSCALL_DEFINEx(3, _##name, __VA_ARGS__)#define SYSCALL_DEFINE4(name, ...) SYSCALL_DEFINEx(4, _##name, __VA_ARGS__)#define SYSCALL_DEFINE5(name, ...) SYSCALL_DEFINEx(5, _##name, __VA_ARGS__)#define SYSCALL_DEFINE6(name, ...) SYSCALL_DEFINEx(6, _##name, __VA_ARGS__)#define SYSCALL_DEFINEx(x, sname, ...) \ SYSCALL_METADATA(sname, x, __VA_ARGS__) \ __SYSCALL_DEFINEx(x, sname, __VA_ARGS__)
在kernel中使用SYSCALL_DEFINEx定义的地方,都是在定义系统调用函数,例如:这里定义的SYSCALL_DEFINE1(setgid, gid_t, gid),其实就是定义__arm64_sys_setgid。
/* setgid syscall: passes gid straight to __sys_setgid() and returns its result. */
SYSCALL_DEFINE1(setgid, gid_t, gid)
{
	return __sys_setgid(gid);
}
Userspace中C程序使用的libc库代码不在kernel源码中,这里就不做具体分析了。但可以确定的是,libc中的系统调用封装最终都会执行svc指令,使CPU产生同步异常,经异常向量表(el0_sync)跳转到Linux Kernel中的el0_svc处理入口。
如下展示了系统调用进入Linux Kernel后的具体流程:
el0_svc → el0_svc_handler() → el0_svc_common() → invoke_syscall() → __invoke_syscall() → syscall_fn()。其中syscall_fn指向系统调用tab表(sys_call_table)中的具体函数
/* kernel-4.19/arch/arm64/kernel/entry.S */
/*
 * SVC entry: puts sp in x0, calls el0_svc_handler, then branches to
 * ret_to_user.
 */
el0_svc:
	mov	x0, sp			// x0 = sp, the argument handed to el0_svc_handler
	bl	el0_svc_handler		// call the C-level handler
	b	ret_to_user		// continue at ret_to_user
ENDPROC(el0_svc)
(kernel-4.19/arch/arm64/kernel/syscall.c)asmlinkage voidel0_svc_handler(struct pt_regs *regs){ sve_user_discard(); el0_svc_common(regs, regs->regs[8], __NR_syscalls, sys_call_table);}staticvoidel0_svc_common(struct pt_regs *regs, int scno, int sc_nr,constsyscall_fn_t syscall_table[]){unsignedlong flags = current_thread_info()->flags; regs->orig_x0 = regs->regs[0]; regs->syscallno = scno; cortex_a76_erratum_1463225_svc_handler(); local_daif_restore(DAIF_PROCCTX); user_exit();if (has_syscall_work(flags)) {/* set default errno for user-issued syscall(-1) */if (scno == NO_SYSCALL) regs->regs[0] = -ENOSYS; scno = syscall_trace_enter(regs);if (scno == NO_SYSCALL)goto trace_exit; } invoke_syscall(regs, scno, sc_nr, syscall_table);/* * The tracing status may have changed under our feet, so we have to * check again. However, if we were tracing entry, then we always trace * exit regardless, as the old entry assembly did. */if (!has_syscall_work(flags) && !IS_ENABLED(CONFIG_DEBUG_RSEQ)) { local_daif_mask(); flags = current_thread_info()->flags;if (!has_syscall_work(flags)) {/* * We're off to userspace, where interrupts are * always enabled after we restore the flags from * the SPSR. */ trace_hardirqs_on();return; } local_daif_restore(DAIF_PROCCTX); }trace_exit: syscall_trace_exit(regs);}staticvoidinvoke_syscall(struct pt_regs *regs, unsignedint scno,unsignedint sc_nr,constsyscall_fn_t syscall_table[]){long ret;if (scno < sc_nr) {syscall_fn_t syscall_fn; syscall_fn = syscall_table[array_index_nospec(scno, sc_nr)]; ret = __invoke_syscall(regs, syscall_fn); //syscall_fn 就是tab表中的函数 } else { ret = do_ni_syscall(regs, scno); } regs->regs[0] = ret;}staticlong __invoke_syscall(struct pt_regs *regs, syscall_fn_t syscall_fn){return syscall_fn(regs); //调用tab表中的函数}
系统调用号及其对应的表项,都列在kernel-4.19/include/uapi/asm-generic/unistd.h 中;sys.c通过#include该头文件来填充系统调用表,表的名字是:sys_call_table,表项的示例如下:
#define __NR_io_setup 0__SC_COMP(__NR_io_setup, sys_io_setup, compat_sys_io_setup)#define __NR_io_destroy 1__SYSCALL(__NR_io_destroy, sys_io_destroy)#define __NR_io_submit 2__SC_COMP(__NR_io_submit, sys_io_submit, compat_sys_io_submit)#define __NR_io_cancel 3__SYSCALL(__NR_io_cancel, sys_io_cancel)#define __NR_io_getevents 4__SC_COMP(__NR_io_getevents, sys_io_getevents, compat_sys_io_getevents)/* fs/xattr.c */#define __NR_setxattr 5__SYSCALL(__NR_setxattr, sys_setxattr)#define __NR_lsetxattr 6__SYSCALL(__NR_lsetxattr, sys_lsetxattr)#define __NR_fsetxattr 7__SYSCALL(__NR_fsetxattr, sys_fsetxattr)#define __NR_getxattr 8__SYSCALL(__NR_getxattr, sys_getxattr)#define __NR_lgetxattr 9__SYSCALL(__NR_lgetxattr, sys_lgetxattr)
系统调用函数的定义,都是以SYSCALL_DEFINEx的宏定义的,例如:
/* setgid syscall: forwards gid to __sys_setgid() and returns its result. */
SYSCALL_DEFINE1(setgid, gid_t, gid)
{
	return __sys_setgid(gid);
}

/*
 * sysctl syscall: copies the argument structure in from user space, calls
 * do_sysctl(), and writes the resulting length back through oldlenp when
 * that pointer is set.
 *
 * Returns 0 on success, -EFAULT when a user-space copy fails or when oldval
 * is set without oldlenp, or the negative value returned by do_sysctl().
 */
SYSCALL_DEFINE1(sysctl, struct __sysctl_args __user *, args)
{
	struct __sysctl_args tmp;
	size_t oldlen = 0;
	ssize_t result;

	if (copy_from_user(&tmp, args, sizeof(tmp)))
		return -EFAULT;

	/* an old-value buffer without a length pointer is rejected */
	if (tmp.oldval && !tmp.oldlenp)
		return -EFAULT;

	if (tmp.oldlenp && get_user(oldlen, tmp.oldlenp))
		return -EFAULT;

	result = do_sysctl(tmp.name, tmp.nlen, tmp.oldval, oldlen,
			   tmp.newval, tmp.newlen);

	/* a non-negative result becomes the length reported back; return 0 */
	if (result >= 0) {
		oldlen = result;
		result = 0;
	}

	if (tmp.oldlenp && put_user(oldlen, tmp.oldlenp))
		return -EFAULT;

	return result;
}
