/*
 * File operations for the /dev/kvm character device.  Only ioctl (plus a
 * no-op llseek) is supported; KVM_COMPAT wires up the 32-bit compat ioctl
 * path where applicable.
 */
static struct file_operations kvm_chardev_ops = {
	.unlocked_ioctl = kvm_dev_ioctl,
	.llseek = noop_llseek,
	KVM_COMPAT(kvm_dev_ioctl),
};

/* Misc device /dev/kvm (minor KVM_MINOR), backed by kvm_chardev_ops. */
static struct miscdevice kvm_dev = {
	KVM_MINOR,
	"kvm",
	&kvm_chardev_ops,
};

/*
 * Module init (abridged excerpt; "......" marks elided code).  The part
 * shown registers /dev/kvm so userspace can start issuing ioctls.
 */
int kvm_init(unsigned vcpu_size, unsigned vcpu_align, struct module *module)
{
	......
	r = misc_register(&kvm_dev);
	......
	return 0;
}

/*
 * Top-level ioctl handler for /dev/kvm: system-wide requests that are not
 * yet tied to a specific VM.  Unknown ioctls are forwarded to the
 * architecture-specific handler.
 */
static long kvm_dev_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
{
	int r = -EINVAL;

	switch (ioctl) {
	case KVM_GET_API_VERSION:
		/* Takes no argument; reject a non-zero arg. */
		if (arg)
			goto out;
		r = KVM_API_VERSION;
		break;
	case KVM_CREATE_VM:
		/* On success returns a new fd referring to the created VM. */
		r = kvm_dev_ioctl_create_vm(arg);
		break;
	case KVM_CHECK_EXTENSION:
		/* NULL kvm: query extensions not specific to any one VM. */
		r = kvm_vm_ioctl_check_extension_generic(NULL, arg);
		break;
	case KVM_GET_VCPU_MMAP_SIZE:
		if (arg)
			goto out;
		/* Size of the per-vCPU region userspace will mmap. */
		r = PAGE_SIZE;     /* struct kvm_run */
#ifdef CONFIG_X86
		r += PAGE_SIZE;    /* pio data page */
#endif
#ifdef CONFIG_KVM_MMIO
		r += PAGE_SIZE;    /* coalesced mmio ring page */
#endif
		break;
	default:
		return kvm_arch_dev_ioctl(filp, ioctl, arg);
	}
out:
	return r;
}
kvm_dev_ioctl_create_vm会返回一个新的对应VM的fd,用该fd对VM进行专门的操作,所以开始是在内核准备fd对应的基础设置。
staticintkvm_dev_ioctl_create_vm(unsigned long type){char fdname[ITOA_MAX_LEN + 1];int r, fd;struct kvm *kvm;struct file *file;/* ========== 步骤1: 分配文件描述符 ========== */fd = get_unused_fd_flags(O_CLOEXEC);/* ========== 步骤2: 生成文件描述符名称 ========== */snprintf(fdname, sizeof(fdname), "%d", fd);/* ========== 步骤3: 创建VM核心对象 ========== */kvm = kvm_create_vm(type, fdname);/* ========== 步骤4: 创建匿名inode文件 ========== */file = anon_inode_getfile("kvm-vm", &kvm_vm_fops, kvm, O_RDWR);/* ========== 步骤5: 发送uevent通知 ========== *//** Don't call kvm_put_kvm anymore at this point; file->f_op is* already set, with ->release() being kvm_vm_release(). In error* cases it will be called by the final fput(file) and will take* care of doing kvm_put_kvm(kvm).*/kvm_uevent_notify_change(KVM_EVENT_CREATE_VM, kvm);/* ========== 步骤6: 安装文件描述符 ========== */fd_install(fd, file);return fd;/* ========== 错误处理 ========== */}
重点关注kvm_create_vm:
/*
 * Allocate and initialise a struct kvm.
 *
 * NOTE(review): this excerpt has had its error-handling tail stripped;
 * the goto labels referenced below are not shown, and several return
 * values are left unchecked here that upstream checks and unwinds.
 */
static struct kvm *kvm_create_vm(unsigned long type, const char *fdname)
{
	struct kvm *kvm = kvm_arch_alloc_vm();	/* arch-sized allocation; unchecked in this excerpt */
	struct kvm_memslots *slots;
	int r, i, j;

	/* ===== Part 1: basic initialisation ===== */
	KVM_MMU_LOCK_INIT(kvm);
	mmgrab(current->mm);		/* pin the creating process's mm */
	kvm->mm = current->mm;
	kvm_eventfd_init(kvm);
	mutex_init(&kvm->lock);
	mutex_init(&kvm->irq_lock);
	mutex_init(&kvm->slots_lock);
	mutex_init(&kvm->slots_arch_lock);
	spin_lock_init(&kvm->mn_invalidate_lock);
	rcuwait_init(&kvm->mn_memslots_update_rcuwait);
	xa_init(&kvm->vcpu_array);
#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
	xa_init(&kvm->mem_attr_array);
#endif
	INIT_LIST_HEAD(&kvm->gpc_list);
	spin_lock_init(&kvm->gpc_lock);
	INIT_LIST_HEAD(&kvm->devices);
	kvm->max_vcpus = KVM_MAX_VCPUS;
	BUILD_BUG_ON(KVM_MEM_SLOTS_NUM > SHRT_MAX);
	/*
	 * Force subsequent debugfs file creations to fail if the VM directory
	 * is not created (by kvm_create_vm_debugfs()).
	 */
	kvm->debugfs_dentry = ERR_PTR(-ENOENT);
	snprintf(kvm->stats_id, sizeof(kvm->stats_id), "kvm-%d",
		 task_pid_nr(current));

	/* ===== Part 2: SRCU initialisation ===== */
	r = -ENOMEM;
	if (init_srcu_struct(&kvm->srcu))
		goto out_err_no_srcu;		/* label elided in this excerpt */
	if (init_srcu_struct(&kvm->irq_srcu))
		goto out_err_no_irq_srcu;	/* label elided in this excerpt */

	/* ===== Part 3: default (empty) IRQ routing table ===== */
	r = kvm_init_irq_routing(kvm);	/* NOTE(review): r unchecked in this excerpt */

	/* ===== Part 4: refcount; the VM fd will own this initial reference ===== */
	refcount_set(&kvm->users_count, 1);

	/* ===== Part 5: memslot initialisation ===== */
	for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) {
		/* Two slot sets per address space, swapped on update. */
		for (j = 0; j < 2; j++) {
			slots = &kvm->__memslots[i][j];
			atomic_long_set(&slots->last_used_slot, (unsigned long)NULL);
			slots->hva_tree = RB_ROOT_CACHED;
			slots->gfn_tree = RB_ROOT;
			hash_init(slots->id_hash);
			slots->node_idx = j;
			/* Generations must be different for each address space. */
			slots->generation = i;
		}
		rcu_assign_pointer(kvm->memslots[i], &kvm->__memslots[i][0]);
	}

	/* ===== Part 6: I/O bus initialisation ===== */
	for (i = 0; i < KVM_NR_BUSES; i++) {
		rcu_assign_pointer(kvm->buses[i],
			kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL_ACCOUNT));	/* NOTE(review): allocation unchecked here */
	}

	/* ===== Part 7: architecture-specific initialisation ===== */
	r = kvm_arch_init_vm(kvm, type);

	/* ===== Part 8: enable virtualisation support on the host ===== */
	r = kvm_enable_virtualization();

	/* ===== Part 9: IRQ ack notifiers ===== */
#ifdef CONFIG_HAVE_KVM_IRQCHIP
	INIT_HLIST_HEAD(&kvm->irq_ack_notifier_list);
#endif

	/* ===== Part 10: MMU notifier registration ===== */
	r = kvm_init_mmu_notifier(kvm);

	/* ===== Part 11: coalesced MMIO ring ===== */
	r = kvm_coalesced_mmio_init(kvm);

	/* ===== Part 12: per-VM debugfs directory (named after the fd) ===== */
	r = kvm_create_vm_debugfs(kvm, fdname);

	/* ===== Part 13: add to the global VM list ===== */
	mutex_lock(&kvm_lock);
	list_add(&kvm->vm_list, &vm_list);
	mutex_unlock(&kvm_lock);

	/* ===== Part 14: notifier registration ===== */
	preempt_notifier_inc();
	kvm_init_pm_notifier(kvm);

	return kvm;

	/* ===== error-handling path (elided in this excerpt) ===== */
}

/*
 * Allocate empty IRQ routing by default so that additional setup isn't needed
 * when userspace-driven IRQ routing is activated, and so that kvm->irq_routing
 * is guaranteed to be non-NULL.
 */
int kvm_init_irq_routing(struct kvm *kvm)
{
	struct kvm_irq_routing_table *new;
	int chip_size;

	new = kzalloc(struct_size(new, map, 1), GFP_KERNEL_ACCOUNT);	/* NOTE(review): unchecked in this excerpt */
	new->nr_rt_entries = 1;

	/* -1 == "no GSI routed to this chip/pin". */
	chip_size = sizeof(int) * KVM_NR_IRQCHIPS * KVM_IRQCHIP_NUM_PINS;
	memset(new->chip, -1, chip_size);

	RCU_INIT_POINTER(kvm->irq_routing, new);
	return 0;
}

/* Allocate the one-page coalesced-MMIO ring shared with userspace. */
int kvm_coalesced_mmio_init(struct kvm *kvm)
{
	struct page *page;

	page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);	/* NOTE(review): unchecked in this excerpt */
	kvm->coalesced_mmio_ring = page_address(page);

	/*
	 * We're using this spinlock to sync access to the coalesced ring.
	 * The list doesn't need its own lock since device registration and
	 * unregistration should only happen when kvm->slots_lock is held.
	 */
	spin_lock_init(&kvm->ring_lock);
	INIT_LIST_HEAD(&kvm->coalesced_zones);

	return 0;
}
其中kvm_arch_init_vm比较关键:
/*
 * arm64 per-VM architecture initialisation, called from kvm_create_vm().
 * NOTE(review): the error-handling tail of this excerpt is elided; the
 * goto label below is not shown.
 */
int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
{
	int ret;

	/* ===== Step 1: init the config lock ===== */
	mutex_init(&kvm->arch.config_lock);

#ifdef CONFIG_LOCKDEP
	/* Clue in lockdep that the config_lock must be taken inside kvm->lock */
	mutex_lock(&kvm->lock);
	mutex_lock(&kvm->arch.config_lock);
	mutex_unlock(&kvm->arch.config_lock);
	mutex_unlock(&kvm->lock);
#endif

	/* ===== Step 2: nested-virtualisation state ===== */
	kvm_init_nested(kvm);

	/* ===== Step 3: share this kvm structure with the EL2 hypervisor ===== */
	ret = kvm_share_hyp(kvm, kvm + 1);	/* NOTE(review): ret unchecked in this excerpt */

	/* ===== Step 4: allocate the supported-CPUs mask ===== */
	if (!zalloc_cpumask_var(&kvm->arch.supported_cpus, GFP_KERNEL_ACCOUNT)) {
		ret = -ENOMEM;
		goto err_unshare_kvm;	/* label elided in this excerpt */
	}
	cpumask_copy(kvm->arch.supported_cpus, cpu_possible_mask);

	/* ===== Step 5: stage-2 MMU ===== */
	ret = kvm_init_stage2_mmu(kvm, &kvm->arch.mmu, type);

	/* ===== Step 6: protected-KVM (pKVM) setup ===== */
	if (is_protected_kvm_enabled()) {
		/*
		 * If any failures occur after this is successful, make sure to
		 * call __pkvm_unreserve_vm to unreserve the VM in hyp.
		 */
		ret = pkvm_init_host_vm(kvm);
	}

	/* ===== Step 7: VGIC early init ===== */
	kvm_vgic_early_init(kvm);

	/* ===== Step 8: timer init ===== */
	kvm_timer_init_vm(kvm);

	/* ===== Step 9: cap the vCPU count ===== */
	/* The maximum number of VCPUs is limited by the host's GIC model */
	kvm->max_vcpus = kvm_arm_default_max_vcpus();

	/* ===== Step 10: SMCCC/hypercall feature bitmaps ===== */
	kvm_arm_init_hypercalls(kvm);

	/* ===== Step 11: clear the vCPU feature bitmap ===== */
	bitmap_zero(kvm->arch.vcpu_features, KVM_VCPU_MAX_FEATURES);

	return 0;

	/* ===== error handling (elided in this excerpt) ===== */
}

/**
 * kvm_vgic_early_init() - Initialize static VGIC VCPU data structures
 * @kvm: The VM whose VGIC distributor should be initialized
 *
 * Only do initialization of static structures that don't require any
 * allocation or sizing information from userspace.  vgic_init() called
 * kvm_vgic_dist_init() which takes care of the rest.
 */
void kvm_vgic_early_init(struct kvm *kvm)
{
	struct vgic_dist *dist = &kvm->arch.vgic;

	xa_init_flags(&dist->lpi_xa, XA_FLAGS_LOCK_IRQ);
}

/* Give every per-VM timer its default PPI interrupt number. */
void kvm_timer_init_vm(struct kvm *kvm)
{
	for (int i = 0; i < NR_KVM_TIMERS; i++)
		kvm->arch.timer_data.ppi[i] = default_ppi[i];
}

/* Advertise the default SMCCC feature bitmaps and init the hypercall filter. */
void kvm_arm_init_hypercalls(struct kvm *kvm)
{
	struct kvm_smccc_features *smccc_feat = &kvm->arch.smccc_feat;

	smccc_feat->std_bmap = KVM_ARM_SMCCC_STD_FEATURES;
	smccc_feat->std_hyp_bmap = KVM_ARM_SMCCC_STD_HYP_FEATURES;
	smccc_feat->vendor_hyp_bmap = KVM_ARM_SMCCC_VENDOR_HYP_FEATURES;

	mt_init(&kvm->arch.smccc_filter);
}
int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long type){int cpu, err;struct kvm_pgtable *pgt;/* ========== 步骤1: 检查是否已初始化 ========== *//** If we already have our page tables in place, and that the* MMU context is the canonical one, we have a bug somewhere,* as this is only supposed to ever happen once per VM.** Otherwise, we're building nested page tables, and that's* probably because userspace called KVM_ARM_VCPU_INIT more* than once on the same vcpu. Since that's actually legal,* don't kick a fuss and leave gracefully.*/if (mmu->pgt != NULL) {if (kvm_is_nested_s2_mmu(kvm, mmu))return 0;kvm_err("kvm_arch already initialized?\n");return -EINVAL;}/* ========== 步骤2: 初始化IPA范围 ========== */err = kvm_init_ipa_range(mmu, type);/* ========== 步骤3: 分配页表结构 ========== */pgt = kzalloc(sizeof(*pgt), GFP_KERNEL_ACCOUNT);/* ========== 步骤4: 初始化页表 ========== */mmu->arch = &kvm->arch;err = KVM_PGT_FN(kvm_pgtable_stage2_init)(pgt, mmu, &kvm_s2_mm_ops);/* ========== 步骤5: 关联页表到MMU ========== */mmu->pgt = pgt;if (is_protected_kvm_enabled())return 0;/* ========== 步骤6: 分配Per-CPU追踪 ========== */mmu->last_vcpu_ran = alloc_percpu(typeof(*mmu->last_vcpu_ran));/* ========== 步骤7: 初始化Per-CPU数据 ========== */for_each_possible_cpu(cpu)*per_cpu_ptr(mmu->last_vcpu_ran, cpu) = -1;/* ========== 步骤8: 配置页分割 ========== *//* The eager page splitting is disabled by default */mmu->split_page_chunk_size = KVM_ARM_EAGER_SPLIT_CHUNK_SIZE_DEFAULT;mmu->split_page_cache.gfp_zero = __GFP_ZERO;/* ========== 步骤9: 记录页表物理地址 ========== */mmu->pgd_phys = __pa(pgt->pgd);/* ========== 步骤10: 嵌套虚拟化初始化 ========== */if (kvm_is_nested_s2_mmu(kvm, mmu))kvm_init_nested_s2_mmu(mmu);return 0;/* ========== 错误处理 ========== */}
/*
 * Validate the IPA size requested via the VM "type" argument and derive
 * the VTCR_EL2 value for this stage-2 MMU.
 */
static int kvm_init_ipa_range(struct kvm_s2_mmu *mmu, unsigned long type)
{
	u32 kvm_ipa_limit = get_kvm_ipa_limit();
	u64 mmfr0, mmfr1;
	u32 phys_shift;

	/* ===== Step 1: only the IPA-size bits are valid in "type" ===== */
	if (type & ~KVM_VM_TYPE_ARM_IPA_SIZE_MASK)
		return -EINVAL;

	/* ===== Step 2: determine the IPA width ===== */
	phys_shift = KVM_VM_TYPE_ARM_IPA_SIZE(type);
	if (is_protected_kvm_enabled()) {
		/* Protected KVM: always use the system-wide limit. */
		phys_shift = kvm_ipa_limit;
	} else if (phys_shift) {
		/* Userspace-specified IPA size: range-check it. */
		if (phys_shift > kvm_ipa_limit ||
		    phys_shift < ARM64_MIN_PARANGE_BITS)
			return -EINVAL;
	} else {
		/* Unspecified: fall back to the legacy default. */
		phys_shift = KVM_PHYS_SHIFT;
		if (phys_shift > kvm_ipa_limit) {
			pr_warn_once("%s using unsupported default IPA limit, upgrade your VMM\n",
				     current->comm);
			return -EINVAL;
		}
	}

	/* ===== Step 3: read the sanitised CPU feature registers ===== */
	mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1);
	mmfr1 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1);

	/* ===== Step 4: compute VTCR_EL2 ===== */
	mmu->vtcr = kvm_get_vtcr(mmfr0, mmfr1, phys_shift);

	return 0;
}

/*
 * Build the VTCR_EL2 value controlling the stage-2 translation regime:
 * physical address size (PS), input size (T0SZ), starting level (SL0),
 * granule (TG0) and shareability/cacheability attributes.
 * NOTE(review): mmfr1 is unused in this simplified excerpt; upstream also
 * derives hardware-update and VMID-width bits from it.
 */
u64 kvm_get_vtcr(u64 mmfr0, u64 mmfr1, u32 phys_shift)
{
	u64 vtcr = VTCR_EL2_FLAGS;
	u8 lvls;

	/* 1. Physical address size, from ID_AA64MMFR0_EL1.PARange. */
	vtcr |= kvm_get_parange(mmfr0) << VTCR_EL2_PS_SHIFT;
	/* 2. T0SZ = 64 - IPA bits. */
	vtcr |= VTCR_EL2_T0SZ(phys_shift);
	/* 3. Number of levels -> SL0 starting-level encoding. */
	lvls = stage2_pgtable_levels(phys_shift);
	vtcr |= VTCR_EL2_LVLS_TO_SL0(lvls);
	/* 4. Translation granule. */
	if (PAGE_SIZE == SZ_64K)
		vtcr |= VTCR_EL2_TG0_64K;
	else if (PAGE_SIZE == SZ_16K)
		vtcr |= VTCR_EL2_TG0_16K;
	else
		vtcr |= VTCR_EL2_TG0_4K;
	/* 5. Inner-shareable, write-back write-allocate inner and outer. */
	vtcr |= VTCR_EL2_SH0_INNER | VTCR_EL2_ORGN0_WBWA |
		VTCR_EL2_IRGN0_WBWA;

	return vtcr;
}
VTCR_EL2主要字段(实际位域,见ARM ARM):[63:34] 保留及扩展控制位[33] SL2 - 与SL0组合编码起始级别(FEAT_LPA2, 4KB页)[19] VS - VMID大小[18:16] PS - 物理地址大小[15:14] TG0 - 页粒度 (00=4KB, 01=64KB, 10=16KB)[13:12] SH0 - Shareability (11=Inner shareable)[11:10] ORGN0 - Outer cacheability[9:8] IRGN0 - Inner cacheability[7:6] SL0 - Starting level[5:0] T0SZ - 地址大小 (64 - IPA位数)示例: 40位IPA,4KB页:
PS = 0x2 (40位物理地址)T0SZ = 24 (64 - 40)SL0 = 0x1 (从Level 1开始,3级页表)TG0 = 0x0 (4KB页)SH0 = 0x3 (Inner shareable)ORGN0 = 0x1 (Write-back, write-allocate)IRGN0 = 0x1 (Write-back, write-allocate)定义页表层级结构(起始级别、页面颗粒度)
绑定内存管理回调(如何申请/释放物理页用于存放页表项)
计算地址转换参数(IPA 范围、页表步进规则)
/*
 * Initialise a stage-2 kvm_pgtable: decode the translation geometry from
 * VTCR_EL2, allocate the (possibly concatenated) PGD, and record the
 * memory-management callbacks for the page-table walker.
 */
int __kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm_s2_mmu *mmu,
			      struct kvm_pgtable_mm_ops *mm_ops,
			      enum kvm_pgtable_stage2_flags flags,
			      kvm_pgtable_force_pte_cb_t force_pte_cb)
{
	size_t pgd_sz;
	u64 vtcr = mmu->vtcr;
	u32 ia_bits = VTCR_EL2_IPA(vtcr);		/* input (IPA) size, decoded from T0SZ */
	u32 sl0 = FIELD_GET(VTCR_EL2_SL0_MASK, vtcr);
	s8 start_level = VTCR_EL2_TGRAN_SL0_BASE - sl0;	/* SL0 encoding back to a level number */

	/* ===== Step 1: size the PGD ===== */
	pgd_sz = kvm_pgd_pages(ia_bits, start_level) * PAGE_SIZE;

	/* ===== Step 2: allocate the PGD ===== */
	pgt->pgd = (kvm_pteref_t)mm_ops->zalloc_pages_exact(pgd_sz);	/* NOTE(review): unchecked in this excerpt */

	/* ===== Step 3: record geometry and callbacks ===== */
	pgt->ia_bits = ia_bits;
	pgt->start_level = start_level;
	pgt->mm_ops = mm_ops;
	pgt->mmu = mmu;
	pgt->flags = flags;
	pgt->force_pte_cb = force_pte_cb;

	/* ===== Step 4: ensure the PGD is visible to the walker ===== */
	/* Ensure zeroed PGD pages are visible to the hardware walker */
	dsb(ishst);

	return 0;
}

/*
 * NOTE(review): simplified stand-in.  Upstream computes the number of
 * concatenated PGD pages from the index range at the start level
 * (kvm_pgd_page_idx() + 1); "levels - start_level" is the walk *depth*,
 * not a page count -- verify against arch/arm64/kvm/hyp/pgtable.c before
 * relying on this.
 */
static inline u32 kvm_pgd_pages(u32 ia_bits, s8 start_level)
{
	return stage2_pgtable_levels(ia_bits) - start_level;
}

/*
 * Number of translation levels needed to resolve ipa_bits: the final
 * level resolves PAGE_SHIFT bits (the page offset), and each table level
 * resolves PAGE_SHIFT - 3 more (9 bits per level for 4KB pages with
 * 8-byte descriptors).
 */
static inline u32 stage2_pgtable_levels(u32 ipa_bits)
{
	u32 bits_resolved = PAGE_SHIFT;

	if (ipa_bits > bits_resolved)
		return DIV_ROUND_UP(ipa_bits - bits_resolved,
				    PAGE_SHIFT - 3);
	return 1;
}
页表级数示例(4KB页):
Level 0: [47:39] - 512GB/entry ├── Level 1: [38:30] - 1GB/entry │ ├── Level 2: [29:21] - 2MB/entry │ │ └── Level 3: [20:12] - 4KB/entry │ │ └── [11:0] - 页内偏移40位IPA示例:
IPA: 0x12_3456_7000 (40位)[39:30] = 0x048 → Level 1表项 #72[29:21] = 0x1A3 → Level 2表项 #419[20:12] = 0x067 → Level 3表项 #103[11:0] = 0x000 → 页内偏移物理地址转换:Level 1[72] → Level 2表基址Level 2[419] → Level 3表基址Level 3[103] → 物理页基址 + 偏移 = 最终PA
/*
 * NOTE(review): the worked example above mis-computes two indices.  For
 * IPA 0x12_3456_7000, bits [29:21] = 0x1A2 (entry #418) and bits
 * [20:12] = 0x167 (entry #359) -- not 0x1A3/#419 and 0x067/#103.
 *
 * Memory-management callbacks handed to the stage-2 page-table code:
 * how to allocate/free table pages, translate between physical and
 * virtual addresses, manage per-page refcounts, and maintain the D/I
 * caches for newly-written tables.
 *
 * NOTE(review): upstream mmu.c uses dedicated helpers for some slots
 * (e.g. kvm_s2_zalloc_pages_exact for .zalloc_pages_exact); reusing
 * stage2_memcache_zalloc_page for both .zalloc_pages_exact and
 * .zalloc_page here looks like a transcription shortcut -- verify
 * against arch/arm64/kvm/mmu.c.
 */
static struct kvm_pgtable_mm_ops kvm_s2_mm_ops = {
	.zalloc_pages_exact	= stage2_memcache_zalloc_page,
	.free_pages_exact	= stage2_free_pages_exact,
	.zalloc_page		= stage2_memcache_zalloc_page,
	.phys_to_virt		= kvm_host_va,
	.virt_to_phys		= kvm_host_pa,
	.page_count		= stage2_page_count,
	.get_page		= stage2_get_page,
	.put_page		= stage2_put_page,
	.dcache_clean_inval_poc	= stage2_dcache_clean_inval_poc,
	.icache_inval_pou	= stage2_icache_inval_pou,
};
zalloc_pages_exact: 分配并清零页面
free_pages_exact: 释放页面
phys_to_virt: 物理地址转虚拟地址
virt_to_phys: 虚拟地址转物理地址
get_page/put_page: 引用计数管理
dcache_clean_inval_poc: 清理并无效化数据缓存至PoC(Point of Coherency)
icache_inval_pou: 无效化指令缓存