8.2.1 虛擬機狀態的維護
本節通過CPU、內存和硬盤來分析虛擬機的狀態維護
(1) CPU
cpu_exec_init ==>
vmstate_register(NULL, cpu_index,&vmstate_cpu_common, env);
register_savevm(NULL, "cpu", cpu_index,CPU_SAVE_VERSION,
cpu_save, cpu_load, env); // 用於保存和恢復CPU的狀態
static constVMStateDescription vmstate_cpu_common = {
.name = "cpu_common",
.version_id = 1,
.minimum_version_id = 1,
.minimum_version_id_old = 1,
.post_load = cpu_common_post_load,
.fields = (VMStateField []) {
VMSTATE_UINT32(halted, CPUArchState),
VMSTATE_UINT32(interrupt_request,CPUArchState),
VMSTATE_END_OF_LIST()
}
};
/* Save-state callback registered by register_savevm(NULL, "cpu", ...);
 * opaque is the per-CPU env pointer, serialized through vmstate_cpu. */
void cpu_save(QEMUFile *f,void *opaque) {
vmstate_save_state(f, &vmstate_cpu,opaque);
}
/* Load-state callback paired with cpu_save(); restores the CPU state
 * described by vmstate_cpu into the env pointed to by opaque. */
int cpu_load(QEMUFile *f,void *opaque, int version_id) {
return vmstate_load_state(f,&vmstate_cpu, opaque, version_id);
}
vmstate_cpu 也是VMStateDescription ; 它裏面會保存cpu的寄存器。
(2)內存
pc_memory_init==> vmstate_register_ram_global(ram) ==>
vmstate_register_ram(mr, NULL);
/*
 * Register a RAM MemoryRegion for savevm/migration: record its id
 * string (and owning device, if any) in the matching ram_list entry
 * so the block can be identified again on the load side.
 */
void vmstate_register_ram(MemoryRegion *mr, DeviceState *dev)
{
    qemu_ram_set_idstr(memory_region_get_ram_addr(mr) & TARGET_PAGE_MASK,
                       memory_region_name(mr), dev);
}
找到要保存內存在ram_list中的位置,ram_list在qemu_ram_alloc_from_ptr中分配。
vl.c ==> register_savevm_live(NULL, "ram", 0,4, &savevm_ram_handlers, NULL);
SaveVMHandlerssavevm_ram_handlers = {
.save_live_setup = ram_save_setup,
.save_live_iterate = ram_save_iterate,
.save_live_complete = ram_save_complete,
.load_state = ram_load,
.cancel = ram_migration_cancel,
};
ram_save_iterate ==> ram_save_block ==>save_block_hdr
將虛擬機內存持久化到snapshot文件中.
(3) 硬盤
block 層
blk_mig_init==>register_savevm_live(NULL, "block", 0, 1, &savevm_block_handlers,
&block_mig_state);
SaveVMHandlerssavevm_block_handlers = {
.set_params = block_set_params,
.save_live_setup = block_save_setup,
.save_live_iterate = block_save_iterate,
.save_live_complete = block_save_complete,
.load_state = block_load,
.cancel = block_migration_cancel,
.is_active = block_is_active,
};
ide controller:
vmstate_register(&d->dev.qdev,0, &vmstate_ide_pci, d);
const VMStateDescriptionvmstate_ide_pci = {
.name = "ide",
.version_id = 3,
.minimum_version_id = 0,
.minimum_version_id_old = 0,
.post_load = ide_pci_post_load,
.fields = (VMStateField []) {
VMSTATE_PCI_DEVICE(dev, PCIIDEState),
VMSTATE_STRUCT_ARRAY(bmdma,PCIIDEState, 2, 0,
vmstate_bmdma,BMDMAState),
VMSTATE_IDE_BUS_ARRAY(bus, PCIIDEState,2),
VMSTATE_IDE_DRIVES(bus[0].ifs, PCIIDEState),
VMSTATE_IDE_DRIVES(bus[1].ifs,PCIIDEState),
VMSTATE_END_OF_LIST()
}
};
pci_piix_init_ports ==> qemu_add_vm_change_state_handler(d->bus[i].dma->ops->restart_cb,
&d->bmdma[i].dma);
restart_cb = bmdma_restart_cb
/*
 * VM change-state callback for BMDMA (registered through
 * qemu_add_vm_change_state_handler()): when the VM resumes running,
 * schedule a bottom half to restart any pending DMA transfer.
 * NOTE(review): excerpt — the elided lines in the original derive
 * bm from opaque; confirm against the full QEMU source.
 */
static void bmdma_restart_cb(void *opaque, int running, RunState state)
{
    /* ... (elided in the original excerpt) ... */
    if (!running) /* VM not running yet: nothing to restart */
        return;
    if (!bm->bh) {
        bm->bh = qemu_bh_new(bmdma_restart_bh, &bm->dma);
        qemu_bh_schedule(bm->bh);
    }
}
註冊了vm_change的notify
8.2.2 虛擬機vm state數據結構
structVMStateDescription {
const char *name;
int unmigratable;
int version_id;
int minimum_version_id;
int minimum_version_id_old;
LoadStateHandler *load_state_old;
int (*pre_load)(void *opaque);
int (*post_load)(void *opaque, intversion_id);
void (*pre_save)(void *opaque);
VMStateField *fields;
const VMStateSubsection *subsections;
};
其中VMStateField 用於保存 opaque變量中的某些字段。
例子1:VMStateField結構體指定了字段在opaque中的偏移和長度。對於CPU而言,opaque爲env;
VMSTATE_UINT32(halted, CPUArchState) 的含義爲:env 的類型爲CPUArchState,保存其成員halted,即env->halted。
#define VMSTATE_UINT32_V(_f,_s, _v) \
VMSTATE_SINGLE(_f, _s, _v, vmstate_info_uint32,uint32_t)
const VMStateInfovmstate_info_uint32 = {
.name = "uint32",
.get = get_uint32,
.put = put_uint32,
};
例子2:VMSTATE_STRUCT_ARRAY(bmdma, PCIIDEState, 2, 0,
vmstate_bmdma,BMDMAState),
PCIIDEState結構體成員數組BMDMAState bmdma[2];
/*
 * Legacy registration path: wrap plain save/load function pointers in
 * a freshly allocated SaveVMHandlers and hand off to
 * register_savevm_live().  Used e.g. for the "cpu" section
 * (cpu_save/cpu_load).
 */
int register_savevm(DeviceState *dev,
                    const char *idstr, int instance_id,
                    int version_id,
                    SaveStateHandler *save_state,
                    LoadStateHandler *load_state,
                    void *opaque)
{
    SaveVMHandlers *ops = g_malloc0(sizeof(SaveVMHandlers));

    ops->save_state = save_state;
    ops->load_state = load_state;
    return register_savevm_live(dev, idstr, instance_id, version_id,
                                ops, opaque);
}
register_savevm_live ==> {
SaveStateEntry *se;
se = g_malloc0(sizeof(SaveStateEntry));
se->version_id = version_id;
se->section_id = global_section_id++;
se->ops = ops;
se->opaque = opaque;
se->vmsd = NULL;
se->no_migrate = 0;
。。。。。。
pstrcat(se->idstr, sizeof(se->idstr),idstr);
if (instance_id == -1)
se->instance_id =calculate_new_instance_id(se->idstr);
else
se->instance_id = instance_id;
QTAILQ_INSERT_TAIL(&savevm_handlers, se, entry);
}
savevm_handlers記錄了所有的save 單元
vmstate_register ==> vmstate_register_with_alias_id{
se = g_malloc0(sizeof(SaveStateEntry));
se->version_id = vmsd->version_id;
se->section_id = global_section_id++;
se->opaque = opaque;
se->vmsd = vmsd;
se->alias_id = alias_id;
se->no_migrate = vmsd->unmigratable;
..............
}
與上一個的區別在於沒有給ops賦值
8.2.3虛擬機Save流程
(1) save 主流程
vm save/load的主流程在savevm.c;本節重點分析虛擬機save的流程,其入口函數爲:
savevm.c: void do_savevm(Monitor*mon, const QDict *qdict)
a. 得到能做snapshot的block device bs
b. 停止虛擬機
saved_vm_running = runstate_is_running();
vm_stop(RUN_STATE_SAVE_VM) ==> do_vm_stop
/* Stop the VM: pause the vCPUs, switch the run state, fire the
 * vm-change-state notifiers, then drain and flush all block devices. */
static void do_vm_stop(RunState state) {
if (runstate_is_running()) {
cpu_disable_ticks();
pause_all_vcpus(); /* stop the vCPUs and the threads that run them */
runstate_set(state);
vm_state_notify(0, state); /* invokes vm-change-state callbacks, e.g. IDE's */
bdrv_drain_all();
bdrv_flush_all();
monitor_protocol_event(QEVENT_STOP, NULL);
}
}
c. 得到虛擬機停止時的時間
sn->vm_clock_nsec =qemu_get_clock_ns(vm_clock);
d. 如果old snapshot文件存在,刪除之
e. 保存虛擬機狀態
QEMUFile * f = qemu_fopen_bdrv(bs, 1);
QEMUFile 結構提供了2MB的緩存,當緩存滿時,會調用
QEMUFilePutBufferFunc *put_buffer; =block_put_buffer
QEMUFileGetBufferFunc *get_buffer; =block_get_buffer
將數據寫/讀到後端block device。
ret = qemu_savevm_state(f);
vm_state_size = qemu_ftell(f);
qemu_fclose(f);
f. 創建snapshot
(2) vm state save
qemu_savevm_state {
//默認se->no_migrate爲0,如果有1的情況, savevm將不能繼續
qemu_savevm_state_blocked(NULL));
ret = qemu_savevm_state_begin(f, &params);
do {
ret = qemu_savevm_state_iterate(f);
} while (ret == 0);
ret = qemu_savevm_state_complete(f);
}
qemu_savevm_state_begin:
a. 生成vm state magic info
qemu_put_be32(f, QEMU_VM_FILE_MAGIC);
qemu_put_be32(f, QEMU_VM_FILE_VERSION);
b. 對每個SaveStateEntry se
如果
if (!se->ops ||!se->ops->save_live_setup) {
continue;
if(!se->ops->is_active(se->opaque))
continue;
否則就保存 (內存 & block都會執行該分支):
qemu_put_byte(f,QEMU_VM_SECTION_START);
qemu_put_be32(f, se->section_id);
len = strlen(se->idstr);
qemu_put_byte(f, len);
qemu_put_buffer(f, (uint8_t*)se->idstr, len);
qemu_put_be32(f, se->instance_id);
qemu_put_be32(f, se->version_id);
ret = se->ops->save_live_setup(f,se->opaque);
qemu_savevm_state_iterate:
對每個SaveStateEntry se
if (!se->ops ||!se->ops->save_live_iterate)
continue;
if (se->ops &&se->ops->is_active)
if(!se->ops->is_active(se->opaque)) {
continue;
}
qemu_put_byte(f, QEMU_VM_SECTION_PART);
qemu_put_be32(f, se->section_id);
ret =se->ops->save_live_iterate(f, se->opaque);
qemu_savevm_state_complete:
a. 對每個SaveStateEntry se
if (!se->ops ||!se->ops->save_live_iterate)
continue;
if (se->ops && se->ops->is_active)
if(!se->ops->is_active(se->opaque)) {
continue;
}
qemu_put_byte(f, QEMU_VM_SECTION_END);
qemu_put_be32(f, se->section_id);
ret =se->ops->save_live_complete(f, se->opaque);
b. 對每個SaveStateEntry se
if ((!se->ops ||!se->ops->save_state) && !se->vmsd) {
continue;
qemu_put_byte(f, QEMU_VM_SECTION_FULL);
qemu_put_be32(f, se->section_id);
len = strlen(se->idstr);
qemu_put_byte(f, len);
qemu_put_buffer(f, (uint8_t*)se->idstr, len);
qemu_put_be32(f, se->instance_id);
qemu_put_be32(f, se->version_id);
vmstate_save(f, se); //call vmstate_save_state(f,se->vmsd,se->opaque);
vmstate_save_state根據vmsd的VMStateField數組信息,保存其描述的在se->opaque對應內存位置的值,如果對應位置是結構體,則遞歸調用:
if (field->flags & VMS_STRUCT){
vmstate_save_state(f,field->vmsd, addr);
} else {
field->info->put(f,addr, size);
}
(3) vm load流程簡介
qemu_loadvm_state(savevm.c) 的流程與savevm相反,具體如下:
(1) 獲取要恢復的設備信息:
根據保存的 instance_id =qemu_get_be32(f);
version_id = qemu_get_be32(f);
和savevm_handler, 得到要恢復設備的對象信息
se = find_se(idstr, instance_id);
並將其加入到
le = g_malloc0(sizeof(*le));
le->se = se;
le->section_id = section_id;
le->version_id = version_id;
QLIST_INSERT_HEAD(&loadvm_handlers, le, entry);
(2) 對每個個單元做數據恢復
vmstate_load(f, le->se,le->version_id);
(3) 恢復vcpu的寄存器
cpu_synchronize_all_post_init ==> kvm_arch_put_registers
8.2.4 動態遷移
動態遷移的主要目標是在客戶機沒有感覺的情況下,將客戶機遷移到另一個物理機器上,從而保證服務正常使用。由於在遷移過程中,從源主機上遷移操作開始、到目的主機上客戶機恢復服務之前,客戶機服務處於不可用狀態(此時源主機上客戶機已經暫停服務,目的主機上的客戶機還未恢復服務),因此設計目標是儘可能縮短該時間。動態遷移要滿足如下條件才能進行:
1) 源宿主機和目的宿主機之間儘量用網絡共享的存儲系統來保存客戶機磁盤鏡像。儘管kvm動態遷移也支持連同磁盤鏡像一起複製,但共享存儲(如NFS)在源宿主機和目的宿主機上的掛載位置必須完全一致。
2) 爲了提高動態遷移的成功率,儘量在同類型CPU的主機之間進行動態遷移。
3) 64位的客戶機只能在64位宿主機之間遷移,而32位客戶機可以在32位宿主機和64位宿主機之間遷移。
4) 動態遷移的源宿主機和目的宿主機對NX(一種安全特性)位的設置必須相同,要麼同爲關閉狀態,要麼同爲打開狀態。在Intel平臺上的linux系統中,用“cat /proc/cpuinfo | grep nx”命令可以查看是否有NX的支持
5) 在進行動態遷移時,被遷移客戶機的名稱是唯一的,在目的宿主機上不能有與源宿主機被遷移客戶機同名的客戶機存在
使用步驟如下:
a. 源和目的宿主機上掛載nfs文件系統
b. 增加選項-incoming tcp:0(允許來自任何主機的連接):xxxx(端口號) 在目的宿主機上啓動虛擬機
c.源虛擬機的monitor輸入migrate tcp:目的機ip:xxxx
下面簡單分析動態遷移的代碼流程:
(1) 源端:
qmp_migrate(migration.c) ==>
migrate_init(&params);
if (strstart(uri, "tcp:",&p))
ret = tcp_start_outgoing_migration(s,p, errp);
}else if (strstart(uri, "exec:",&p))
ret = exec_start_outgoing_migration(s,p);
}else if (strstart(uri, "unix:",&p))
ret = unix_start_outgoing_migration(s,p);
}else if (strstart(uri, "fd:",&p))
ret = fd_start_outgoing_migration(s,p);
tcp_start_outgoing_migration==>inet_nonblocking_connect(host_port, tcp_wait_for_connect,s,
errp);
tcp_wait_for_connect ==>migrate_fd_connect
/*
 * Outgoing migration: the transport is connected.  Wrap it in a
 * buffered QEMUFile, write the setup sections for devices that
 * support live migration, then kick off the iterative phase.
 */
void migrate_fd_connect(MigrationState *s)
{
    int ret;

    s->state = MIG_STATE_ACTIVE;
    s->file = qemu_fopen_ops_buffered(s, s->bandwidth_limit,
                                      migrate_fd_put_buffer,
                                      migrate_fd_put_ready,
                                      migrate_fd_wait_for_unfreeze,
                                      migrate_fd_close);
    /* save the setup/section headers of devices that support live migration */
    ret = qemu_savevm_state_begin(s->file, &s->params);
    migrate_fd_put_ready(s);
}
/*
 * Buffered-file "ready" callback: push another iteration of live data;
 * when the iterate stage reports completion (ret == 1), stop the VM,
 * send the final device state, and complete (or fail) the migration.
 * NOTE(review): excerpt — the full source derives s from opaque and
 * declares ret; confirm against QEMU's migration.c.
 */
static void migrate_fd_put_ready(void *opaque)
{
    ret = qemu_savevm_state_iterate(s->file);
    if (ret < 0) {
        migrate_fd_error(s);
    } else if (ret == 1) {
        int old_vm_running = runstate_is_running();

        qemu_system_wakeup_request(QEMU_WAKEUP_REASON_OTHER);
        vm_stop_force_state(RUN_STATE_FINISH_MIGRATE);
        if (qemu_savevm_state_complete(s->file) < 0) {
            migrate_fd_error(s);
        } else {
            migrate_fd_completed(s);
        }
        s->total_time = qemu_get_clock_ms(rt_clock) - s->total_time;
        if (s->state != MIG_STATE_COMPLETED) {
            /* migration failed or was cancelled: resume the VM if it was running */
            if (old_vm_running) {
                vm_start();
            }
        }
    }
}
(2) 目的端:
main(vl.c):
if (incoming) {
Error *errp = NULL;
int ret =qemu_start_incoming_migration(incoming, &errp);
}
qemu_start_incoming_migration==> tcp_start_incoming_migration ==>
qemu_set_fd_handler2(s, NULL,tcp_accept_incoming_migration, NULL,
(void *)(intptr_t)s);
tcp_accept_incoming_migration==> process_incoming_migration
/*
 * Destination side: read the entire VM state from f, announce the
 * guest's presence on the network, revalidate block-device caches,
 * then either start the VM or leave it in the prelaunch state.
 */
void process_incoming_migration(QEMUFile *f)
{
    if (qemu_loadvm_state(f) < 0) {
        fprintf(stderr, "load of migration failed\n");
        exit(0);
    }
    qemu_announce_self(); /* so switches relearn the guest's MAC addresses */
    bdrv_clear_incoming_migration_all();
    bdrv_invalidate_cache_all();
    if (autostart) {
        vm_start();
    } else {
        runstate_set(RUN_STATE_PRELAUNCH);
    }
}