通过/dev/kvm的ioctl:KVM_CREATE_VM创建虚拟机,将返回一个虚拟机的fd,使用该fd对虚拟机进行ioctl/mmap等操作:staticintkvm_dev_ioctl_create_vm(unsigned long type){ /* ========== 分配文件描述符 ========== */ fd = get_unused_fd_flags(O_CLOEXEC); ...... /* ========== 创建匿名inode文件 ========== */ file = anon_inode_getfile("kvm-vm", &kvm_vm_fops, kvm, O_RDWR); ...... fd_install(fd, file); return fd;}static struct file_operations kvm_vm_fops = { .release = kvm_vm_release, .unlocked_ioctl = kvm_vm_ioctl, .llseek = noop_llseek, KVM_COMPAT(kvm_vm_compat_ioctl),};staticlongkvm_vm_ioctl(structfile *filp, unsigned int ioctl, unsigned long arg){ struct kvm *kvm = filp->private_data; void __user *argp = (void __user *)arg; int r; switch (ioctl) { case KVM_CREATE_VCPU: r = kvm_vm_ioctl_create_vcpu(kvm, arg); break; case KVM_SET_USER_MEMORY_REGION2: case KVM_SET_USER_MEMORY_REGION: r = kvm_vm_ioctl_set_memory_region(kvm, &mem); break; ...... }}
对虚拟机最主要的两个操作是创建VCPU和设置Memory,有了这两样东西,就具备了运行程序的环境:staticintkvm_vm_ioctl_create_vcpu(struct kvm *kvm, unsignedlong id){ struct kvm_vcpu *vcpu; struct page *page; // ... 前面的分配和初始化 ... vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL_ACCOUNT); page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); vcpu->run = page_address(page); kvm_vcpu_init(vcpu, kvm, id); r = kvm_arch_vcpu_create(vcpu); // 创建vcpu对应的fd r = create_vcpu_fd(vcpu); // ... 后续操作 ... // 返回vcpu对应的fd,或者负的错误代码 return r;}staticvoidkvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id){ mutex_init(&vcpu->mutex); vcpu->cpu = -1; vcpu->kvm = kvm; vcpu->vcpu_id = id; vcpu->pid = NULL; rwlock_init(&vcpu->pid_lock);#ifndef __KVM_HAVE_ARCH_WQP rcuwait_init(&vcpu->wait);#endif kvm_async_pf_vcpu_init(vcpu); kvm_vcpu_set_in_spin_loop(vcpu, false); kvm_vcpu_set_dy_eligible(vcpu, false); vcpu->preempted = false; vcpu->ready = false; preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops); vcpu->last_used_slot = NULL; /* Fill the stats id string for the vcpu */ snprintf(vcpu->stats_id, sizeof(vcpu->stats_id), "kvm-%d/vcpu-%d", task_pid_nr(current), id);}static struct file_operations kvm_vcpu_fops = { .release = kvm_vcpu_release, .unlocked_ioctl = kvm_vcpu_ioctl, .mmap = kvm_vcpu_mmap, .llseek = noop_llseek, KVM_COMPAT(kvm_vcpu_compat_ioctl),};/* * Allocates an inode for the vcpu. */staticintcreate_vcpu_fd(struct kvm_vcpu *vcpu){ char name[8 + 1 + ITOA_MAX_LEN + 1]; snprintf(name, sizeof(name), "kvm-vcpu:%d", vcpu->vcpu_id); return anon_inode_getfd(name, &kvm_vcpu_fops, vcpu, O_RDWR | O_CLOEXEC);}
主要的函数是kvm_arch_vcpu_create:

int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
{
	int err;

	/* ========== Step 1: init the MP-state lock ========== */
	spin_lock_init(&vcpu->arch.mp_state_lock);

	/* ========== Step 2: lockdep setup ========== */
#ifdef CONFIG_LOCKDEP
	/* Inform lockdep that the config_lock is acquired after vcpu->mutex */
	mutex_lock(&vcpu->mutex);
	mutex_lock(&vcpu->kvm->arch.config_lock);
	mutex_unlock(&vcpu->kvm->arch.config_lock);
	mutex_unlock(&vcpu->mutex);
#endif

	/* ========== Step 3: clear the "initialized" flag ========== */
	/* Force users to call KVM_ARM_VCPU_INIT */
	vcpu_clear_flag(vcpu, VCPU_INITIALIZED);

	/* ========== Step 4: configure the MMU page cache ========== */
	vcpu->arch.mmu_page_cache.gfp_zero = __GFP_ZERO;

	/* ========== Step 5: set up the timer ========== */
	kvm_timer_vcpu_init(vcpu);

	/* ========== Step 6: init the PMU ========== */
	kvm_pmu_vcpu_init(vcpu);

	/* ========== Step 7: init PV time (paravirtualized timer) ========== */
	kvm_arm_pvtime_vcpu_init(&vcpu->arch);

	/* ========== Step 8: attach the hardware MMU ========== */
	vcpu->arch.hw_mmu = &vcpu->kvm->arch.mmu;

	/* ========== Step 9: drop cached MPIDR (Multiprocessor Affinity Register) data ========== */
	/*
	 * This vCPU may have been created after mpidr_data was initialized.
	 * Throw out the pre-computed mappings if that is the case which forces
	 * KVM to fall back to iteratively searching the vCPUs.
	 */
	kvm_destroy_mpidr_data(vcpu->kvm);

	/* ========== Step 10: init the VGIC ========== */
	err = kvm_vgic_vcpu_init(vcpu);

	/* ========== Step 11: share with the hypervisor (bypassed when VHE is enabled) ========== */
	err = kvm_share_hyp(vcpu, vcpu + 1);

	return err;
}
主要是timer和VGIC的初始化,相关数据结构定义如下:
struct kvm_vcpu {
	struct kvm *kvm;
	int cpu;
	int vcpu_id;	/* id given by userspace at creation */
	int vcpu_idx;	/* index into kvm->vcpu_array */
	struct kvm_run *run;	/* shared run structure, mmap'ed by userspace */
	struct kvm_vcpu_arch arch;
	struct kvm_vcpu_stat stat;
};

/* arm64 per-vCPU architecture state (excerpt: VGIC, timer and PMU members) */
struct kvm_vcpu_arch {
	/* VGIC state */
	struct vgic_cpu vgic_cpu;
	struct arch_timer_cpu timer_cpu;
	struct kvm_pmu pmu;
};	/* fixed: the excerpt was missing this terminating semicolon */
Timer相关定义:
struct arch_timer_cpu { struct arch_timer_context timers[NR_KVM_TIMERS]; /* Background timer used when the guest is not running */ struct hrtimer bg_timer; /* Is the timer enabled */ bool enabled;};struct arch_timer_context { /* Emulated Timer (may be unused) */ struct hrtimer hrtimer; u64 ns_frac; /* Offset for this counter/timer */ struct arch_timer_offset offset; /* * We have multiple paths which can save/restore the timer state onto * the hardware, so we need some way of keeping track of where the * latest state is. */ bool loaded; /* Output level of the timer IRQ */ struct { bool level; } irq; /* Who am I? */ enum kvm_arch_timers timer_id; /* Duplicated state from arch_timer.c for convenience */ u32 host_timer_irq;};#define vcpu_timer(v) (&(v)->arch.timer_cpu)#define vcpu_get_timer(v,t) (&vcpu_timer(v)->timers[(t)])#define vcpu_vtimer(v) (&(v)->arch.timer_cpu.timers[TIMER_VTIMER])#define vcpu_ptimer(v) (&(v)->arch.timer_cpu.timers[TIMER_PTIMER])#define vcpu_hvtimer(v) (&(v)->arch.timer_cpu.timers[TIMER_HVTIMER])#define vcpu_hptimer(v) (&(v)->arch.timer_cpu.timers[TIMER_HPTIMER])
VGIC相关定义:
struct vgic_cpu { /* CPU vif control registers for world switch */ union { struct vgic_v2_cpu_if vgic_v2; struct vgic_v3_cpu_if vgic_v3; }; struct vgic_irq *private_irqs; /* * Members below are used with GICv3 emulation only and represent * parts of the redistributor. */ struct vgic_io_device rd_iodev; struct vgic_redist_region *rdreg;};struct vgic_v3_cpu_if { u32 vgic_hcr; u32 vgic_vmcr; u32 vgic_sre; /* Restored only, change ignored */ u32 vgic_ap0r[4]; u32 vgic_ap1r[4]; u64 vgic_lr[VGIC_V3_MAX_LRS]; /* * GICv4 ITS per-VPE data, containing the doorbell IRQ, the * pending table pointer, the its_vm pointer and a few other * HW specific things. As for the its_vm structure, this is * linking the Linux IRQ subsystem and the ITS together. */ struct its_vpe its_vpe; unsigned int used_lrs;};
定时器初始化kvm_timer_vcpu_init:
voidkvm_timer_vcpu_init(struct kvm_vcpu *vcpu){ struct arch_timer_cpu *timer = vcpu_timer(vcpu); /* ===== 初始化所有定时器上下文 ===== */ /* * 遍历所有定时器(TIMER_PTIMER, TIMER_VTIMER, TIMER_HVTIMER, TIMER_HPTIMER) * NR_KVM_TIMERS = 4 (定义在 include/kvm/arm_arch_timer.h) * * timer_context_init() 会为每个定时器: * 1. 设置 timer_id * 2. 配置 offset.vm_offset(VTIMER用voffset,其他用poffset) * 3. 初始化 hrtimer(每个定时器都有自己的hrtimer) * 4. 设置 host_timer_irq(物理timer用host_ptimer_irq,虚拟timer用host_vtimer_irq) */ for (int i = 0; i < NR_KVM_TIMERS; i++) timer_context_init(vcpu, i); /* ===== 同步定时器偏移 ===== */ /* * 如果VM级别的计数器偏移还未设置(首次初始化vCPU): * - VTIMER: 设置偏移为当前物理计数器值(让虚拟计数器从0开始) * - PTIMER: 偏移为0(直接使用物理计数器) * * KVM_ARCH_FLAG_VM_COUNTER_OFFSET标志表示用户空间通过 * KVM_ARM_SET_COUNTER_OFFSET ioctl显式设置了偏移值 */ if (!test_bit(KVM_ARCH_FLAG_VM_COUNTER_OFFSET, &vcpu->kvm->arch.flags)) { timer_set_offset(vcpu_vtimer(vcpu), kvm_phys_timer_read()); timer_set_offset(vcpu_ptimer(vcpu), 0); } /* ===== 初始化后台定时器 ===== */ /* * bg_timer (background timer) 是一个全局的hrtimer,用于: * - 在vCPU未运行时模拟定时器到期 * - 当定时器到期时注入中断 * - 与每个定时器上下文中的hrtimer配合工作 * * 使用 hrtimer_setup() 而不是旧的 hrtimer_init() * HRTIMER_MODE_ABS_HARD: 绝对时间模式,硬中断上下文安全 */ hrtimer_setup(&timer->bg_timer, kvm_bg_timer_expire, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD);}staticinlinevoidtimer_set_offset(struct arch_timer_context *ctxt, u64 offset){ WRITE_ONCE(*ctxt->offset.vm_offset, offset);}staticvoidtimer_context_init(struct kvm_vcpu *vcpu, int timerid){ struct arch_timer_context *ctxt = vcpu_get_timer(vcpu, timerid); struct kvm *kvm = vcpu->kvm; ctxt->timer_id = timerid; if (timerid == TIMER_VTIMER) ctxt->offset.vm_offset = &kvm->arch.timer_data.voffset; else ctxt->offset.vm_offset = &kvm->arch.timer_data.poffset; hrtimer_setup(&ctxt->hrtimer, kvm_hrtimer_expire, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD); switch (timerid) { case TIMER_PTIMER: case TIMER_HPTIMER: ctxt->host_timer_irq = host_ptimer_irq; break; case TIMER_VTIMER: case TIMER_HVTIMER: ctxt->host_timer_irq = host_vtimer_irq; break; }}
VGIC的初始化:
intkvm_vgic_vcpu_init(struct kvm_vcpu *vcpu){ struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; struct vgic_dist *dist = &vcpu->kvm->arch.vgic; int ret = 0; /* ===== 步骤1: 初始化Redistributor地址和AP列表 ===== */ /* * Redistributor的基址初始为VGIC_ADDR_UNDEF, * 表示尚未被用户空间通过KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION配置。 */ vgic_cpu->rd_iodev.base_addr = VGIC_ADDR_UNDEF; /* * ap_list_head是Active-and-Pending中断队列的链表头。 * 待注入中断在进入guest前会从此队列写入硬件List Registers。 * ap_list_lock保护该链表的并发访问。 */ INIT_LIST_HEAD(&vgic_cpu->ap_list_head); raw_spin_lock_init(&vgic_cpu->ap_list_lock); /* * 初始化vLPI (virtual LPI) 计数器为0, * 用于GICv4 ITS直通中断的引用计数。 */ atomic_set(&vgic_cpu->vgic_v3.its_vpe.vlpi_count, 0); /* ===== 步骤2: 如果没有内核irqchip则直接返回 ===== */ /* * 如果使用用户空间irqchip(不推荐),VGIC未在内核中初始化, * 私有中断的分配和redistibutor注册均不需要。 */ if (!irqchip_in_kernel(vcpu->kvm)) return 0; /* ===== 步骤3: 分配私有中断(SGI + PPI)===== */ /* * vgic_allocate_private_irqs()在config_lock保护下调用 * vgic_allocate_private_irqs_locked(),完成以下工作: * * 1. 分配VGIC_NR_PRIVATE_IRQS (32)个vgic_irq结构体 * 2. 为每个中断初始化: * - INIT_LIST_HEAD(&irq->ap_list) * - raw_spin_lock_init(&irq->irq_lock) * - irq->intid = i * - irq->vcpu = NULL(动态分配,非固定) * - irq->target_vcpu = vcpu * - refcount_set(&irq->refcount, 0) * - SGI (0-15): enabled=1, config=VGIC_CONFIG_EDGE * - PPI (16-31): config=VGIC_CONFIG_LEVEL * - GICv3: group=1, mpidr=kvm_vcpu_get_mpidr_aff(vcpu) * - GICv2: group=0, targets=BIT(vcpu->vcpu_id) */ ret = vgic_allocate_private_irqs(vcpu, dist->vgic_model); if (ret) return ret; /* ===== 步骤4: GICv3 Redistributor iodev注册 ===== */ /* * GICv3每个vCPU对应一个Redistributor。 * 用户空间通过MMIO访问Redistributor来管理PPI/SGI中断。 * 注册iodev后,guest对Redistributor地址范围的MMIO访问 * 会被KVM拦截并交由vgic_register_redist_iodev处理。 * 注册过程需要持有slots_lock以保护MMIO槽位的并发修改。 */ if (dist->vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) { mutex_lock(&vcpu->kvm->slots_lock); ret = vgic_register_redist_iodev(vcpu); mutex_unlock(&vcpu->kvm->slots_lock); } return ret;}
Memory region的设置:分配一段用户态内存(userspace_addr),将该段内存映射到VM的某个slot,其对应的Guest物理地址为guest_phys_addr:
struct kvm_userspace_memory_region region = { .slot = memory_slot_count, .flags = flags, .guest_phys_addr = guest_addr, .memory_size = memory_len, .userspace_addr = (uint64_t) mem, }; memory_slot_count++; ioctl_exit_on_error(vmfd, KVM_SET_USER_MEMORY_REGION, "KVM_SET_USER_MEMORY_REGION", ®ion);
在KVM里面:
case KVM_SET_USER_MEMORY_REGION: { struct kvm_userspace_memory_region2 mem; if (copy_from_user(&mem, argp, size)) goto out; r = kvm_vm_ioctl_set_memory_region(kvm, &mem);
kvm_vm_ioctl_set_memory_region主要做一些检查,然后调用kvm_set_memslot:

static int kvm_set_memslot(struct kvm *kvm, struct kvm_memory_slot *old,
			   struct kvm_memory_slot *new,
			   enum kvm_mr_change change)
{
	r = kvm_prepare_memory_region(kvm, old, new, change);
	......
	if (change == KVM_MR_CREATE)
		kvm_create_memslot(kvm, new);
	else
		.......
	......
	/*
	 * No need to refresh new->arch, changes after dropping slots_arch_lock
	 * will directly hit the final, active memslot. Architectures are
	 * responsible for knowing that new->arch may be stale.
	 */
	kvm_commit_memory_region(kvm, old, new, change);

	return 0;
}
kvm_prepare_memory_region将调用kvm_arch_prepare_memory_region:
// arch/arm64/kvm/mmu.cintkvm_arch_prepare_memory_region(struct kvm *kvm, const struct kvm_memory_slot *old, struct kvm_memory_slot *new, enum kvm_mr_change change){ hva_t hva, reg_end; int ret = 0; if (change != KVM_MR_CREATE && change != KVM_MR_MOVE) return 0; // 预先建立Stage-2映射 hva = new->userspace_addr; reg_end = hva + (new->npages << PAGE_SHIFT); ret = stage2_set_pte_wrprotect(kvm, new, hva, reg_end); return ret;}
kvm_create_memslot:
static void kvm_create_memslot(struct kvm *kvm, struct kvm_memory_slot *new)
{
	/* Add the new memslot to the inactive set and activate. */
	kvm_replace_memslot(kvm, NULL, new);
	kvm_activate_memslot(kvm, NULL, new);
}
kvm_commit_memory_region会调用kvm_arch_commit_memory_region,对ARM并没有太多操作:
staticvoidkvm_commit_memory_region(struct kvm *kvm, struct kvm_memory_slot *old, const struct kvm_memory_slot *new, enum kvm_mr_change change){ int old_flags = old ? old->flags : 0; int new_flags = new ? new->flags : 0; /* ========== 步骤1: 更新总页数 ========== */ if (change == KVM_MR_DELETE) kvm->nr_memslot_pages -= old->npages; else if (change == KVM_MR_CREATE) kvm->nr_memslot_pages += new->npages; /* ========== 步骤2: 更新脏页追踪计数 ========== */ if ((old_flags ^ new_flags) & KVM_MEM_LOG_DIRTY_PAGES) { int change = (new_flags & KVM_MEM_LOG_DIRTY_PAGES) ? 1 : -1; atomic_set(&kvm->nr_memslots_dirty_logging, atomic_read(&kvm->nr_memslots_dirty_logging) + change); } /* ========== 步骤3: 架构特定提交 ========== */ kvm_arch_commit_memory_region(kvm, old, new, change); /* ========== 步骤4: 清理旧槽资源 ========== */ switch (change) { case KVM_MR_CREATE: /* Nothing more to do. */ break; case KVM_MR_DELETE: /* Free the old memslot and all its metadata. */ kvm_free_memslot(kvm, old); break; case KVM_MR_MOVE: case KVM_MR_FLAGS_ONLY: /* Free the dirty bitmap as needed */ if (old->dirty_bitmap && !new->dirty_bitmap) kvm_destroy_dirty_bitmap(old); /* * The final quirk. Free the detached, old slot, but only its * memory, not any metadata. Metadata, including arch specific * data, may be reused by @new. */ kfree(old); break; default: BUG(); }}