通过/dev/kvm的ioctl:KVM_CREATE_VM创建虚拟机,将返回一个虚拟机的fd,使用该fd对虚拟机进行ioctl/mmap等操作:staticintkvm_dev_ioctl_create_vm(unsigned long type){ /* ========== 分配文件描述符 ========== */ fd = get_unused_fd_flags(O_CLOEXEC); ...... /* ========== 创建匿名inode文件 ========== */ file = anon_inode_getfile("kvm-vm", &kvm_vm_fops, kvm, O_RDWR); ...... fd_install(fd, file); return fd;}static struct file_operations kvm_vm_fops = { .release = kvm_vm_release, .unlocked_ioctl = kvm_vm_ioctl, .llseek = noop_llseek, KVM_COMPAT(kvm_vm_compat_ioctl),};staticlongkvm_vm_ioctl(structfile *filp, unsigned int ioctl, unsigned long arg){ struct kvm *kvm = filp->private_data; void __user *argp = (void __user *)arg; int r; switch (ioctl) { case KVM_CREATE_VCPU: r = kvm_vm_ioctl_create_vcpu(kvm, arg); break; case KVM_SET_USER_MEMORY_REGION2: case KVM_SET_USER_MEMORY_REGION: r = kvm_vm_ioctl_set_memory_region(kvm, &mem); break; ...... }}
对虚拟机最主要的两个操作是创建VCPU和设置Memory,有了这两样东西,就具备了运行程序的环境:staticintkvm_vm_ioctl_create_vcpu(struct kvm *kvm, unsignedlong id){ struct kvm_vcpu *vcpu; struct page *page; // ... 前面的分配和初始化 ... vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL_ACCOUNT); page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); vcpu->run = page_address(page); kvm_vcpu_init(vcpu, kvm, id); r = kvm_arch_vcpu_create(vcpu); // 创建vcpu对应的fd r = create_vcpu_fd(vcpu); // ... 后续操作 ... // 返回vcpu对应的fd,或者负的错误代码 return r;}staticvoidkvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id){ mutex_init(&vcpu->mutex); vcpu->cpu = -1; vcpu->kvm = kvm; vcpu->vcpu_id = id; vcpu->pid = NULL; rwlock_init(&vcpu->pid_lock);#ifndef __KVM_HAVE_ARCH_WQP rcuwait_init(&vcpu->wait);#endif kvm_async_pf_vcpu_init(vcpu); kvm_vcpu_set_in_spin_loop(vcpu, false); kvm_vcpu_set_dy_eligible(vcpu, false); vcpu->preempted = false; vcpu->ready = false; preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops); vcpu->last_used_slot = NULL; /* Fill the stats id string for the vcpu */ snprintf(vcpu->stats_id, sizeof(vcpu->stats_id), "kvm-%d/vcpu-%d", task_pid_nr(current), id);}static struct file_operations kvm_vcpu_fops = { .release = kvm_vcpu_release, .unlocked_ioctl = kvm_vcpu_ioctl, .mmap = kvm_vcpu_mmap, .llseek = noop_llseek, KVM_COMPAT(kvm_vcpu_compat_ioctl),};/* * Allocates an inode for the vcpu. */staticintcreate_vcpu_fd(struct kvm_vcpu *vcpu){ char name[8 + 1 + ITOA_MAX_LEN + 1]; snprintf(name, sizeof(name), "kvm-vcpu:%d", vcpu->vcpu_id); return anon_inode_getfd(name, &kvm_vcpu_fops, vcpu, O_RDWR | O_CLOEXEC);}
主要的函数是kvm_arch_vcpu_create:

int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
{
	int err;

	/* ========== Step 1: init the MP-state lock ========== */
	spin_lock_init(&vcpu->arch.mp_state_lock);

	/* ========== Step 2: lockdep setup ========== */
#ifdef CONFIG_LOCKDEP
	/* Inform lockdep that the config_lock is acquired after vcpu->mutex */
	mutex_lock(&vcpu->mutex);
	mutex_lock(&vcpu->kvm->arch.config_lock);
	mutex_unlock(&vcpu->kvm->arch.config_lock);
	mutex_unlock(&vcpu->mutex);
#endif

	/* ========== Step 3: clear the "initialized" flag ========== */
	/* Force users to call KVM_ARM_VCPU_INIT */
	vcpu_clear_flag(vcpu, VCPU_INITIALIZED);

	/* ========== Step 4: configure the MMU page cache ========== */
	vcpu->arch.mmu_page_cache.gfp_zero = __GFP_ZERO;

	/* ========== Step 5: set up the timer ========== */
	kvm_timer_vcpu_init(vcpu);

	/* ========== Step 6: init the PMU ========== */
	kvm_pmu_vcpu_init(vcpu);

	/* ========== Step 7: init PV time (paravirtualized timer) ========== */
	kvm_arm_pvtime_vcpu_init(&vcpu->arch);

	/* ========== Step 8: attach the hardware MMU ========== */
	vcpu->arch.hw_mmu = &vcpu->kvm->arch.mmu;

	/* ========== Step 9: drop cached MPIDR (Multiprocessor Affinity Register) data ========== */
	/*
	 * This vCPU may have been created after mpidr_data was initialized.
	 * Throw out the pre-computed mappings if that is the case which forces
	 * KVM to fall back to iteratively searching the vCPUs.
	 */
	kvm_destroy_mpidr_data(vcpu->kvm);

	/* ========== Step 10: init the VGIC ========== */
	err = kvm_vgic_vcpu_init(vcpu);

	/* ========== Step 11: share with the hypervisor (bypassed when VHE is enabled) ========== */
	err = kvm_share_hyp(vcpu, vcpu + 1);

	return err;
}
主要是timer和VGIC的初始化,相关数据结构定义如下:
struct kvm_vcpu {
	struct kvm *kvm;
	int cpu;
	int vcpu_id;	/* id given by userspace at creation */
	int vcpu_idx;	/* index into kvm->vcpu_array */
	struct kvm_run *run;	/* shared run structure, mmap'ed by userspace */
	struct kvm_vcpu_arch arch;
	struct kvm_vcpu_stat stat;
};

/* arm64 per-vCPU architecture state (excerpt: VGIC, timer and PMU members) */
struct kvm_vcpu_arch {
	/* VGIC state */
	struct vgic_cpu vgic_cpu;
	struct arch_timer_cpu timer_cpu;
	struct kvm_pmu pmu;
};	/* fixed: the excerpt was missing this terminating semicolon */
Timer相关定义:
struct arch_timer_cpu { struct arch_timer_context timers[NR_KVM_TIMERS]; /* Background timer used when the guest is not running */ struct hrtimer bg_timer; /* Is the timer enabled */ bool enabled;};struct arch_timer_context { /* Emulated Timer (may be unused) */ struct hrtimer hrtimer; u64 ns_frac; /* Offset for this counter/timer */ struct arch_timer_offset offset; /* * We have multiple paths which can save/restore the timer state onto * the hardware, so we need some way of keeping track of where the * latest state is. */ bool loaded; /* Output level of the timer IRQ */ struct { bool level; } irq; /* Who am I? */ enum kvm_arch_timers timer_id; /* Duplicated state from arch_timer.c for convenience */ u32 host_timer_irq;};#define vcpu_timer(v) (&(v)->arch.timer_cpu)#define vcpu_get_timer(v,t) (&vcpu_timer(v)->timers[(t)])#define vcpu_vtimer(v) (&(v)->arch.timer_cpu.timers[TIMER_VTIMER])#define vcpu_ptimer(v) (&(v)->arch.timer_cpu.timers[TIMER_PTIMER])#define vcpu_hvtimer(v) (&(v)->arch.timer_cpu.timers[TIMER_HVTIMER])#define vcpu_hptimer(v) (&(v)->arch.timer_cpu.timers[TIMER_HPTIMER])
VGIC相关定义:
struct vgic_cpu { /* CPU vif control registers for world switch */ union { struct vgic_v2_cpu_if vgic_v2; struct vgic_v3_cpu_if vgic_v3; }; struct vgic_irq *private_irqs; /* * Members below are used with GICv3 emulation only and represent * parts of the redistributor. */ struct vgic_io_device rd_iodev; struct vgic_redist_region *rdreg;};struct vgic_v3_cpu_if { u32 vgic_hcr; u32 vgic_vmcr; u32 vgic_sre; /* Restored only, change ignored */ u32 vgic_ap0r[4]; u32 vgic_ap1r[4]; u64 vgic_lr[VGIC_V3_MAX_LRS]; /* * GICv4 ITS per-VPE data, containing the doorbell IRQ, the * pending table pointer, the its_vm pointer and a few other * HW specific things. As for the its_vm structure, this is * linking the Linux IRQ subsystem and the ITS together. */ struct its_vpe its_vpe; unsigned int used_lrs;};
定时器初始化kvm_timer_vcpu_init:
voidkvm_timer_vcpu_init(struct kvm_vcpu *vcpu){ struct arch_timer_cpu *timer = vcpu_timer(vcpu); /* ===== 初始化所有定时器上下文 ===== */ /* * 遍历所有定时器(TIMER_PTIMER, TIMER_VTIMER, TIMER_HVTIMER, TIMER_HPTIMER) * NR_KVM_TIMERS = 4 (定义在 include/kvm/arm_arch_timer.h) * * timer_context_init() 会为每个定时器: * 1. 设置 timer_id * 2. 配置 offset.vm_offset(VTIMER用voffset,其他用poffset) * 3. 初始化 hrtimer(每个定时器都有自己的hrtimer) * 4. 设置 host_timer_irq(物理timer用host_ptimer_irq,虚拟timer用host_vtimer_irq) */ for (int i = 0; i < NR_KVM_TIMERS; i++) timer_context_init(vcpu, i); /* ===== 同步定时器偏移 ===== */ /* * 如果VM级别的计数器偏移还未设置(首次初始化vCPU): * - VTIMER: 设置偏移为当前物理计数器值(让虚拟计数器从0开始) * - PTIMER: 偏移为0(直接使用物理计数器) * * KVM_ARCH_FLAG_VM_COUNTER_OFFSET标志表示用户空间通过 * KVM_ARM_SET_COUNTER_OFFSET ioctl显式设置了偏移值 */ if (!test_bit(KVM_ARCH_FLAG_VM_COUNTER_OFFSET, &vcpu->kvm->arch.flags)) { timer_set_offset(vcpu_vtimer(vcpu), kvm_phys_timer_read()); timer_set_offset(vcpu_ptimer(vcpu), 0); } /* ===== 初始化后台定时器 ===== */ /* * bg_timer (background timer) 是一个全局的hrtimer,用于: * - 在vCPU未运行时模拟定时器到期 * - 当定时器到期时注入中断 * - 与每个定时器上下文中的hrtimer配合工作 * * 使用 hrtimer_setup() 而不是旧的 hrtimer_init() * HRTIMER_MODE_ABS_HARD: 绝对时间模式,硬中断上下文安全 */ hrtimer_setup(&timer->bg_timer, kvm_bg_timer_expire, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD);}staticinlinevoidtimer_set_offset(struct arch_timer_context *ctxt, u64 offset){ WRITE_ONCE(*ctxt->offset.vm_offset, offset);}staticvoidtimer_context_init(struct kvm_vcpu *vcpu, int timerid){ struct arch_timer_context *ctxt = vcpu_get_timer(vcpu, timerid); struct kvm *kvm = vcpu->kvm; ctxt->timer_id = timerid; if (timerid == TIMER_VTIMER) ctxt->offset.vm_offset = &kvm->arch.timer_data.voffset; else ctxt->offset.vm_offset = &kvm->arch.timer_data.poffset; hrtimer_setup(&ctxt->hrtimer, kvm_hrtimer_expire, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD); switch (timerid) { case TIMER_PTIMER: case TIMER_HPTIMER: ctxt->host_timer_irq = host_ptimer_irq; break; case TIMER_VTIMER: case TIMER_HVTIMER: ctxt->host_timer_irq = host_vtimer_irq; break; }}
VGIC的初始化:
intkvm_vgic_vcpu_init(struct kvm_vcpu *vcpu){ struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; struct vgic_dist *dist = &vcpu->kvm->arch.vgic; int ret = 0; /* ===== 步骤1: 初始化Redistributor地址和AP列表 ===== */ /* * Redistributor的基址初始为VGIC_ADDR_UNDEF, * 表示尚未被用户空间通过KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION配置。 */ vgic_cpu->rd_iodev.base_addr = VGIC_ADDR_UNDEF; /* * ap_list_head是Active-and-Pending中断队列的链表头。 * 待注入中断在进入guest前会从此队列写入硬件List Registers。 * ap_list_lock保护该链表的并发访问。 */ INIT_LIST_HEAD(&vgic_cpu->ap_list_head); raw_spin_lock_init(&vgic_cpu->ap_list_lock); /* * 初始化vLPI (virtual LPI) 计数器为0, * 用于GICv4 ITS直通中断的引用计数。 */ atomic_set(&vgic_cpu->vgic_v3.its_vpe.vlpi_count, 0); /* ===== 步骤2: 如果没有内核irqchip则直接返回 ===== */ /* * 如果使用用户空间irqchip(不推荐),VGIC未在内核中初始化, * 私有中断的分配和redistibutor注册均不需要。 */ if (!irqchip_in_kernel(vcpu->kvm)) return 0; /* ===== 步骤3: 分配私有中断(SGI + PPI)===== */ /* * vgic_allocate_private_irqs()在config_lock保护下调用 * vgic_allocate_private_irqs_locked(),完成以下工作: * * 1. 分配VGIC_NR_PRIVATE_IRQS (32)个vgic_irq结构体 * 2. 为每个中断初始化: * - INIT_LIST_HEAD(&irq->ap_list) * - raw_spin_lock_init(&irq->irq_lock) * - irq->intid = i * - irq->vcpu = NULL(动态分配,非固定) * - irq->target_vcpu = vcpu * - refcount_set(&irq->refcount, 0) * - SGI (0-15): enabled=1, config=VGIC_CONFIG_EDGE * - PPI (16-31): config=VGIC_CONFIG_LEVEL * - GICv3: group=1, mpidr=kvm_vcpu_get_mpidr_aff(vcpu) * - GICv2: group=0, targets=BIT(vcpu->vcpu_id) */ ret = vgic_allocate_private_irqs(vcpu, dist->vgic_model); if (ret) return ret; /* ===== 步骤4: GICv3 Redistributor iodev注册 ===== */ /* * GICv3每个vCPU对应一个Redistributor。 * 用户空间通过MMIO访问Redistributor来管理PPI/SGI中断。 * 注册iodev后,guest对Redistributor地址范围的MMIO访问 * 会被KVM拦截并交由vgic_register_redist_iodev处理。 * 注册过程需要持有slots_lock以保护MMIO槽位的并发修改。 */ if (dist->vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) { mutex_lock(&vcpu->kvm->slots_lock); ret = vgic_register_redist_iodev(vcpu); mutex_unlock(&vcpu->kvm->slots_lock); } return ret;}
Memory region的设置:分配一段用户态内存(userspace_addr),将该段内存映射到VM的某个slot,其对应的Guest物理地址为guest_phys_addr:
struct kvm_userspace_memory_region region = { .slot = memory_slot_count, .flags = flags, .guest_phys_addr = guest_addr, .memory_size = memory_len, .userspace_addr = (uint64_t) mem, }; memory_slot_count++; ioctl_exit_on_error(vmfd, KVM_SET_USER_MEMORY_REGION, "KVM_SET_USER_MEMORY_REGION", ®ion);
在KVM里面:
case KVM_SET_USER_MEMORY_REGION: { struct kvm_userspace_memory_region2 mem; if (copy_from_user(&mem, argp, size)) goto out; r = kvm_vm_ioctl_set_memory_region(kvm, &mem);
kvm_vm_ioctl_set_memory_region主要做一些检查,然后调用kvm_set_memslot:

static int kvm_set_memslot(struct kvm *kvm, struct kvm_memory_slot *old,
			   struct kvm_memory_slot *new,
			   enum kvm_mr_change change)
{
	r = kvm_prepare_memory_region(kvm, old, new, change);
	......
	if (change == KVM_MR_CREATE)
		kvm_create_memslot(kvm, new);
	else
		.......
	......
	/*
	 * No need to refresh new->arch, changes after dropping slots_arch_lock
	 * will directly hit the final, active memslot. Architectures are
	 * responsible for knowing that new->arch may be stale.
	 */
	kvm_commit_memory_region(kvm, old, new, change);

	return 0;
}
kvm_prepare_memory_region将调用kvm_arch_prepare_memory_region:
// arch/arm64/kvm/mmu.cintkvm_arch_prepare_memory_region(struct kvm *kvm, const struct kvm_memory_slot *old, struct kvm_memory_slot *new, enum kvm_mr_change change){ hva_t hva, reg_end; int ret = 0; if (change != KVM_MR_CREATE && change != KVM_MR_MOVE) return 0; // 预先建立Stage-2映射 hva = new->userspace_addr; reg_end = hva + (new->npages << PAGE_SHIFT); ret = stage2_set_pte_wrprotect(kvm, new, hva, reg_end); return ret;}
kvm_create_memslot:
static void kvm_create_memslot(struct kvm *kvm, struct kvm_memory_slot *new)
{
	/* Add the new memslot to the inactive set and activate. */
	kvm_replace_memslot(kvm, NULL, new);
	kvm_activate_memslot(kvm, NULL, new);
}
kvm_commit_memory_region会调用kvm_arch_commit_memory_region,对ARM并没有太多操作:
staticvoidkvm_commit_memory_region(struct kvm *kvm, struct kvm_memory_slot *old, const struct kvm_memory_slot *new, enum kvm_mr_change change){ int old_flags = old ? old->flags : 0; int new_flags = new ? new->flags : 0; /* ========== 步骤1: 更新总页数 ========== */ if (change == KVM_MR_DELETE) kvm->nr_memslot_pages -= old->npages; else if (change == KVM_MR_CREATE) kvm->nr_memslot_pages += new->npages; /* ========== 步骤2: 更新脏页追踪计数 ========== */ if ((old_flags ^ new_flags) & KVM_MEM_LOG_DIRTY_PAGES) { int change = (new_flags & KVM_MEM_LOG_DIRTY_PAGES) ? 1 : -1; atomic_set(&kvm->nr_memslots_dirty_logging, atomic_read(&kvm->nr_memslots_dirty_logging) + change); } /* ========== 步骤3: 架构特定提交 ========== */ kvm_arch_commit_memory_region(kvm, old, new, change); /* ========== 步骤4: 清理旧槽资源 ========== */ switch (change) { case KVM_MR_CREATE: /* Nothing more to do. */ break; case KVM_MR_DELETE: /* Free the old memslot and all its metadata. */ kvm_free_memslot(kvm, old); break; case KVM_MR_MOVE: case KVM_MR_FLAGS_ONLY: /* Free the dirty bitmap as needed */ if (old->dirty_bitmap && !new->dirty_bitmap) kvm_destroy_dirty_bitmap(old); /* * The final quirk. Free the detached, old slot, but only its * memory, not any metadata. Metadata, including arch specific * data, may be reused by @new. */ kfree(old); break; default: BUG(); }}