Linux KVM也实现了ARM体系架构的硬件虚拟化功能,怎么通过Hello World类型的示例来演示?Github上面有一个很好的例子:https://github.com/Lenz-K/arm64-kvm-hello-world
下面是运行在虚拟机里面的hello world可执行文件,它的代码包括汇编和C两部分。汇编部分先把堆栈指针设置为stack_top(stack_top是在链接脚本里面定义的label),随后跳转到main。当main执行完成后,通过hvc调用关闭虚拟机:

    arm64-kvm-hello-world/bare-metal-aarch64$ cat startup.s
    .global _start
    _start:
        ldr x30, =stack_top     /* Retrieve initial stack address */
        mov sp, x30             /* Set stack address */
        bl main                 /* Branch to main() */

    .global system_off
    system_off:
        ldr x0, =0x84000008     /* SYSTEM_OFF function ID */
        hvc #0                  /* Hypervisor call */

    sleep:                      /* This point should not be reached */
        wfi                     /* Wait for interrupt */
        b sleep                 /* Endless loop */
C语言部分就是把Hello World逐字符写到地址0x10000000。这里假设地址0x10000000是串口的MMIO地址,期望对它的写操作触发虚拟机Exit到Host,由Host截获后再做相应的处理:

    arm64-kvm-hello-world/bare-metal-aarch64$ cat hello_world.c
    volatile unsigned int * const UART0DR = (unsigned int *) 0x10000000;

    void print_uart0(const char *s)
    {
        while(*s != '\0') {                     /* Loop until end of string */
            *UART0DR = (unsigned int)(*s);      /* Transmit char */
            s++;                                /* Next char */
        }
    }

    void main()
    {
        print_uart0("Hello World!\n");
    }
链接脚本如下,定义运行代码加载到0x04000000,堆栈在地址0x04020000:ENTRY(_start)MEMORY{ RAM ( rxw ) : ORIGIN = 0x04000000, LENGTH = 0x02000000}SECTIONS{ . = 0x0; .startup . : { startup.o(.text) } .text : { *(.text) } .data : { *(.data) } .bss : { *(.bss COMMON) } . = ALIGN(16); . = 0x04020000; stack_top = .;}
在Host端,主要是创建虚拟机,以及设置虚拟机运行的必要环境:内存,VCPU。 /* Get the KVM file descriptor */ kvm = open("/dev/kvm", O_RDWR | O_CLOEXEC); /* Make sure we have the stable version of the API */ ret = ioctl(kvm, KVM_GET_API_VERSION, NULL); /* Create a VM and receive the VM file descriptor */ printf("Creating VM\n"); vmfd = ioctl_exit_on_error(kvm, KVM_CREATE_VM, "KVM_CREATE_VM", (unsigned long) 0);
/**
 * Issues an ioctl and terminates the process if the call fails.
 *
 * @param file_descriptor The file descriptor the request is sent to.
 * @param request The ioctl request code.
 * @param name Name of the request; only used in the error message.
 * @param ... The first variadic argument is read as a pointer-sized value
 *            and forwarded to ioctl() as its argument.
 * @return The return value of ioctl(); only reached when it is not negative.
 */
int ioctl_exit_on_error(int file_descriptor, unsigned long request, string name, ...)
{
    // Exactly one variadic argument is consumed, regardless of the request.
    va_list ap;
    va_start(ap, name);
    void *arg = va_arg(ap, void *);
    va_end(ap);

    int ret = ioctl(file_descriptor, request, arg);
    if (ret < 0) {
        // Report the failing request together with errno, then exit with ret as status.
        printf("System call '%s' failed: %s - %d\n", name.c_str(), strerror(errno), ret);
        exit(ret);
    }

    return ret;
}
虚拟机运行的内存通过对虚拟机文件描述符(vmfd)的ioctl调用KVM_SET_USER_MEMORY_REGION设置,主要参数信息是虚拟机物理地址与Host端虚拟地址:

    /**
     * Allocates memory and assigns it to the VM as guest memory.
     *
     * @param memory_len The length of the memory that shall be allocated.
     * @param guest_addr The address of the memory in the guest.
     * @return A pointer to the allocated memory.
     */
    uint64_t *allocate_memory_to_vm(size_t memory_len, uint64_t guest_addr, uint32_t flags = 0)
    {
        void *void_mem = mmap(NULL, memory_len, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, -1, 0);
        uint64_t *mem = static_cast<uint64_t *>(void_mem);
        if (!mem) {
            printf("Error while allocating guest memory: %s\n", strerror(errno));
            exit(-1);
        }

        struct kvm_userspace_memory_region region = {
            .slot = memory_slot_count,
            .flags = flags,
            .guest_phys_addr = guest_addr,
            .memory_size = memory_len,
            .userspace_addr = (uint64_t) mem,
        };
        memory_slot_count++;
        ioctl_exit_on_error(vmfd, KVM_SET_USER_MEMORY_REGION, "KVM_SET_USER_MEMORY_REGION", &region);
        return mem;
    }
    /*
     * MEMORY MAP
     * One memory block of 0x1000 B will be assigned to every part of the memory:
     *
     * Start      | Name  | Description
     * -----------+-------+------------
     * 0x00000000 | ROM   |
     * 0x04000000 | RAM   |
     * 0x04010000 | Heap  | increases
     * 0x0401F000 | Stack | decreases, so the stack pointer is initially 0x04020000
     * 0x10000000 | MMIO  |
     */
所以要多次调用allocate_memory_to_vm: printf("Setting up memory\n"); check_vm_extension(KVM_CAP_USER_MEMORY, "KVM_CAP_USER_MEMORY"); /* ROM Memory */ memory_mappings[0].guest_phys_addr = 0x0; memory_mappings[0].memory_size = MEMORY_BLOCK_SIZE; mem = allocate_memory_to_vm(memory_mappings[0].memory_size, memory_mappings[0].guest_phys_addr); memory_mappings[0].userspace_addr = mem; /* RAM Memory */ memory_mappings[1].guest_phys_addr = 0x04000000; memory_mappings[1].memory_size = MEMORY_BLOCK_SIZE; mem = allocate_memory_to_vm(memory_mappings[1].memory_size, memory_mappings[1].guest_phys_addr); memory_mappings[1].userspace_addr = mem; /* Heap Memory */ mem = allocate_memory_to_vm(MEMORY_BLOCK_SIZE * 2, 0x04010000); /* Stack Memory */ // mem = allocate_memory_to_vm(MEMORY_BLOCK_SIZE, 0x04020000); /* MMIO Memory */ check_vm_extension(KVM_CAP_READONLY_MEM, "KVM_CAP_READONLY_MEM"); // This will cause a write to 0x10000000, to result in a KVM_EXIT_MMIO. mem = allocate_memory_to_vm(MEMORY_BLOCK_SIZE, 0x10000000, KVM_MEM_READONLY);
这个例子里面,虚拟机的ELF编出来后,并没有通过objcopy之类的工具提取出纯二进制代码,而是自己实现了一个elf loader,直接从ELF里面提取。在提取过程中,会分析获得代码应该拷贝到内存的目标地址,然后再memcpy到Host端为虚拟机分配的User Memory里面:

    /**
     * Copies the code into the memory of the specified memory mapping.
     *
     * @param code The code block that will be copied into the VM memory.
     * @param memsz The size of the code block.
     * @param target_addr The VM memory address that the code will be copied to.
     * @param mmi The index of the memory mapping that will be used for copying.
     * @return Returns the index of the memory mapping or -1 if no mapping was found.
     */
    int copy_section_into_memory(uint32_t *code, size_t memsz, uint64_t target_addr, int mmi)
    {
        // There can be an offset between memory mapping and the target address.
        uint64_t offset = target_addr - memory_mappings[mmi].guest_phys_addr;
        // If the offset plus the code size is bigger than the memory mapping size, do nothing.
        if (offset + memsz > memory_mappings[mmi].memory_size) {
            printf("Memory mapping too small. Mapping offset: 0x%08lX - Mapping size: 0x%08lX\n", offset, memory_mappings[mmi].memory_size);
            return -1;
        }

        // Copy the code into the VM memory
        memcpy(memory_mappings[mmi].userspace_addr + offset, code, memsz);
        printf("Section loaded. Host address: %p - Guest address: 0x%08lX\n", memory_mappings[mmi].userspace_addr + offset, target_addr);

        return 0;
    }

    /**
     * Copies the required sections of the ELF file into the memory of the VM.
     *
     * @return 0 on success, -1 if an error occurred.
     */
    int copy_elf_into_memory()
    {
        string elf_name = "bare-metal-aarch64/hello_world.elf";
        // Open the ELF file that will be loaded into memory
        if (open_elf(elf_name.c_str()) != 0)
            return -1;

        uint32_t *code;
        size_t memsz;
        uint64_t target_addr;
        // Iterate over the segments in the ELF file and load them into the memory of the VM
        while (has_next_section_to_load()) {
            if (get_next_section_to_load(&code, &memsz, &target_addr) < 0)
                return -1;

            int mmi = find_mapping_for_section(target_addr);
            if (mmi < 0)
                return -1;

            if (copy_section_into_memory(code, memsz, target_addr, mmi) < 0)
                return -1;
        }

        close_elf();
        return 0;
    }
KVM需要创建虚拟CPU(VCPU),VCPU是虚拟机程序运行的CPU,也是通过一系列的ioctl调用: /* Create a virtual CPU and receive its file descriptor */ printf("Creating VCPU\n"); vcpufd = ioctl_exit_on_error(vmfd, KVM_CREATE_VCPU, "KVM_CREATE_VCPU", (unsigned long) 0); /* Get CPU information for VCPU init */ printf("Retrieving physical CPU information\n"); struct kvm_vcpu_init preferred_target; ioctl_exit_on_error(vmfd, KVM_ARM_PREFERRED_TARGET, "KVM_ARM_PREFERRED_TARGET", &preferred_target); /* Enable the PSCI v0.2 CPU feature, to be able to shut down the VM */ check_vm_extension(KVM_CAP_ARM_PSCI_0_2, "KVM_CAP_ARM_PSCI_0_2"); preferred_target.features[0] |= 1 << KVM_ARM_VCPU_PSCI_0_2; /* Initialize VCPU */ printf("Initializing VCPU\n"); ioctl_exit_on_error(vcpufd, KVM_ARM_VCPU_INIT, "KVM_ARM_VCPU_INIT", &preferred_target);
PSCI (Power State Coordination Interface)提供了一套通用的电源管理API接口,最常用的包括:CPU_ON,CPU_OFF,CPU_SUSPEND,SYSTEM_OFF,SYSTEM_RESET等。电源管理涉及敏感的底层寄存器(如 CPU 复位地址)。PSCI 将这些权限交给更高特权级的软件(Secure Monitor),操作系统通过调用(SMC/HVC)来请求操作。通过设置KVM_ARM_VCPU_PSCI_0_2,KVM 就会拦截(Trap)虚拟机发出的 PSCI 相关指令(startup.s中的HVC SYSTEM_OFF指令),并按 v0.2 标准进行处理。如果有多个VCPU,主核(Primary CPU)可以通过标准的PSCI调用来唤醒其他从核(Secondary CPUs)。

kvm_run数据结构是虚拟机与Host交互的接口,这里通过mmap映射:

    /* Map the shared kvm_run structure and following data. */
    ret = ioctl_exit_on_error(kvm, KVM_GET_VCPU_MMAP_SIZE, "KVM_GET_VCPU_MMAP_SIZE", NULL);
    mmap_size = ret;
    if (mmap_size < sizeof(*run))
        printf("KVM_GET_VCPU_MMAP_SIZE unexpectedly small");
    void *void_mem = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED, vcpufd, 0);
    run = static_cast<kvm_run *>(void_mem);
    if (!run)
        printf("Error while mmap vcpu");
kvm_run的主要信息如下。其中的union是一个非常大的数据结构,需要根据exit_reason来选取union里面对应的详细信息:

    /* for KVM_RUN, returned by mmap(vcpu_fd, offset=0) */
    struct kvm_run {
        /* in */
        __u8 request_interrupt_window;
        __u8 HINT_UNSAFE_IN_KVM(immediate_exit);
        __u8 padding1[6];

        /* out */
        __u32 exit_reason;
        __u8 ready_for_interrupt_injection;
        __u8 if_flag;
        __u16 flags;

        /* in (pre_kvm_run), out (post_kvm_run) */
        __u64 cr8;
        __u64 apic_base;

        union {
            /* KVM_EXIT_UNKNOWN */
            struct {
                __u64 hardware_exit_reason;
            } hw;
            /* KVM_EXIT_FAIL_ENTRY */
            struct {
                __u64 hardware_entry_failure_reason;
                __u32 cpu;
            } fail_entry;
            /* KVM_EXIT_EXCEPTION */
            struct {
                __u32 exception;
                __u32 error_code;
            } ex;
            /* KVM_EXIT_IO */
            struct {
    #define KVM_EXIT_IO_IN  0
    #define KVM_EXIT_IO_OUT 1
                __u8 direction;
                __u8 size; /* bytes */
                __u16 port;
                __u32 count;
                __u64 data_offset; /* relative to kvm_run start */
            } io;
            /* KVM_EXIT_DEBUG */
            struct {
                struct kvm_debug_exit_arch arch;
            } debug;
            /* KVM_EXIT_MMIO */
            struct {
                __u64 phys_addr;
                __u8 data[8];
                __u32 len;
                __u8 is_write;
            } mmio;
            ............
        };
        .............
    };
通过KVM_SET_ONE_REG来设置PC指针,指向ELF的开始执行的代码,也就是汇编里面的_start,这样虚拟机启动后从PC指定的位置开始执行: /* Set program counter to entry address */ check_vm_extension(KVM_CAP_ONE_REG, "KVM_CAP_ONE_REG"); uint64_t pc_index = offsetof(struct kvm_regs, regs.pc) / sizeof(__u32); uint64_t pc_id = KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | pc_index; uint64_t entry_addr = get_entry_address(); printf("Setting program counter to entry address 0x%08lX\n", entry_addr); struct kvm_one_reg pc = {.id = pc_id, .addr = (uint64_t)&entry_addr}; ret = ioctl_exit_on_error(vcpufd, KVM_SET_ONE_REG, "KVM_SET_ONE_REG", &pc); if (ret < 0) return ret;
上述代码先在kvm_regs里面获取PC寄存器的index,然后通过kvm_one_reg数据结构,将index以及要设置的值传入:struct kvm_one_reg { __u64 id; __u64 addr;};/* * User structures for general purpose, floating point and debug registers. */struct user_pt_regs { __u64 regs[31]; __u64 sp; __u64 pc; __u64 pstate;};struct kvm_regs {struct user_pt_regs regs; /* sp = sp_el0 */ __u64 sp_el1; __u64 elr_el1; __u64 spsr[KVM_NR_SPSR];struct user_fpsimd_state fp_regs;};
发起运行请求: 用户态线程调用 ioctl(vcpufd, KVM_RUN, 0)。此时,这个线程从用户态陷入内核态。
进入内核态处理: KVM 内核模块接收到请求,进行一系列检查(例如确认上下文已初始化)。
上下文切换: 内核执行__kvm_vcpu_run。它会将Host的寄存器状态保存到堆栈,并加载 Primary VCPU 的寄存器状态(也就是前面设置的PC)。
执行 ERET(异常返回): 这是关键的一步。内核执行 eret 指令。由于此时处理器处于异常模式,eret 会根据 ELR(异常链接寄存器,此时存的是虚拟机PC)和 SPSR,直接将CPU的特权级和执行流跳转到虚拟机的起始地址。
当KVM_RUN返回的时候,便是虚拟机有一个操作需要Host接管。其中的KVM_EXIT_MMIO接管从虚拟机发出的对模拟UART的写访问: /* Repeatedly run code and handle VM exits. */ printf("Running code\n"); bool shut_down = false; for (int i = 0; i < MAX_VM_RUNS && !shut_down; i++) { printf("\nKVM_RUN Loop %d:\n", i+1); ret = ioctl(vcpufd, KVM_RUN, NULL); if (ret < 0) { printf("System call 'KVM_RUN' failed: %d - %s\n", errno, strerror(errno)); printf("Error Numbers: EINTR=%d; ENOEXEC=%d; ENOSYS=%d; EPERM=%d\n", EINTR, ENOEXEC, ENOSYS, EPERM); return ret; } switch (run->exit_reason) { case KVM_EXIT_MMIO: printf("Exit Reason: KVM_EXIT_MMIO\n"); mmio_exit_handler(); break; case KVM_EXIT_SYSTEM_EVENT: // This happens when the VCPU has done a HVC based PSCI call. printf("Exit Reason: KVM_EXIT_SYSTEM_EVENT\n"); print_system_event_exit_reason(); shut_down = true; break; case KVM_EXIT_INTR: printf("Exit Reason: KVM_EXIT_INTR\n"); break; case KVM_EXIT_FAIL_ENTRY: printf("Exit Reason: KVM_EXIT_FAIL_ENTRY\n"); break; case KVM_EXIT_INTERNAL_ERROR: printf("Exit Reason: KVM_EXIT_INTERNAL_ERROR\n"); break; default: printf("Exit Reason: other\n"); } }
虚拟机里面对UART写入的字符通过kvm_run返回,mmio_exit_handler将其保存在mmio_buffer里面:/** * Handles a MMIO exit from KVM_RUN. */void mmio_exit_handler() { printf("Is Write: %d\n", run->mmio.is_write); if (run->mmio.is_write) { printf("Length: %d\n", run->mmio.len); uint64_t data = 0; for (int j = 0; j < run->mmio.len; j++) { data |= run->mmio.data[j]<<8*j; } mmio_buffer[mmio_buffer_index] = data; mmio_buffer_index++; printf("Guest wrote 0x%08lX to 0x%08llX\n", data, run->mmio.phys_addr); }}
在Host代码最后,打印出mmio_buffer的内容,实际为虚拟机写到模拟UART的字符串: printf("\nVM MMIO Output:\n"); for(int i = 0; i < mmio_buffer_index; i++) { printf("%c", mmio_buffer[i]); }
    arm64-kvm-hello-world$ ./kvm_test
    Creating VM
    Setting up memory
    Opening ELF file
    It contains 3 sections
    Section 0 needs to be loaded
    Section loaded. Host address: 0xffff8aa84000 - Guest address: 0x00000000
    Section 1 needs to be loaded
    Section loaded. Host address: 0xffff8aa7c000 - Guest address: 0x04000000
    Section 2 does not need to be loaded
    Closing ELF file
    Creating VCPU
    Retrieving physical CPU information
    Initializing VCPU
    Setting program counter to entry address 0x00000000
    Running code

    KVM_RUN Loop 1:
    Exit Reason: KVM_EXIT_MMIO
    Is Write: 1
    Length: 4
    Guest wrote 0x00000048 to 0x10000000

    KVM_RUN Loop 2:
    Exit Reason: KVM_EXIT_MMIO
    Is Write: 1
    Length: 4
    Guest wrote 0x00000065 to 0x10000000

    KVM_RUN Loop 3:
    Exit Reason: KVM_EXIT_MMIO
    Is Write: 1
    Length: 4
    Guest wrote 0x0000006C to 0x10000000

    KVM_RUN Loop 4:
    Exit Reason: KVM_EXIT_MMIO
    Is Write: 1
    Length: 4
    Guest wrote 0x0000006C to 0x10000000

    KVM_RUN Loop 5:
    Exit Reason: KVM_EXIT_MMIO
    Is Write: 1
    Length: 4
    Guest wrote 0x0000006F to 0x10000000

    KVM_RUN Loop 6:
    Exit Reason: KVM_EXIT_MMIO
    Is Write: 1
    Length: 4
    Guest wrote 0x00000020 to 0x10000000

    KVM_RUN Loop 7:
    Exit Reason: KVM_EXIT_MMIO
    Is Write: 1
    Length: 4
    Guest wrote 0x00000057 to 0x10000000

    KVM_RUN Loop 8:
    Exit Reason: KVM_EXIT_MMIO
    Is Write: 1
    Length: 4
    Guest wrote 0x0000006F to 0x10000000

    KVM_RUN Loop 9:
    Exit Reason: KVM_EXIT_MMIO
    Is Write: 1
    Length: 4
    Guest wrote 0x00000072 to 0x10000000

    KVM_RUN Loop 10:
    Exit Reason: KVM_EXIT_MMIO
    Is Write: 1
    Length: 4
    Guest wrote 0x0000006C to 0x10000000

    KVM_RUN Loop 11:
    Exit Reason: KVM_EXIT_MMIO
    Is Write: 1
    Length: 4
    Guest wrote 0x00000064 to 0x10000000

    KVM_RUN Loop 12:
    Exit Reason: KVM_EXIT_MMIO
    Is Write: 1
    Length: 4
    Guest wrote 0x00000021 to 0x10000000

    KVM_RUN Loop 13:
    Exit Reason: KVM_EXIT_MMIO
    Is Write: 1
    Length: 4
    Guest wrote 0x0000000A to 0x10000000

    KVM_RUN Loop 14:
    Exit Reason: KVM_EXIT_SYSTEM_EVENT
    Cause: Shutdown

    VM MMIO Output:
    Hello World!
可以看到,KVM_RUN返回的主要原因是虚拟机里面的程序通过MMIO打印Hello World的操作。最后一次返回(KVM_EXIT_SYSTEM_EVENT)则是虚拟机通过hvc调用关闭虚拟机引起的。