kvm中斷虛擬化

1、x86平臺主要使用的中斷類型有pic、apic及msi中斷,在多核系統下的apic結構圖如下所示,每個cpu有一個lapic,外部中斷通過ioapic轉發到lapic,如果是msi中斷,則繞過了io apic直接發給lapic。

2、kvm初始化過程爲每個虛擬機維護一個pic主控制器、一個pic從控制器以及一個ioapic控制器,每個vcpu維護一個lapic控制器。同時每個虛擬機有一張中斷路由表(kvm_irq_routing_table)。中斷路由表裏的chip二維數組保存非msi中斷的gsi號,每個中斷都有自己的routing_entry,routing_entry保存了中斷的類型(pic、ioapic、msi)、中斷號、以及set觸發函數,所有的routing_entry以gsi爲索引信息掛接到route_table的map鏈表裏(可能同一個中斷號會同時關聯pic、ioapic兩種中斷type)。

ioapic裏還維護了一張中斷重定向表(redirtbl),負責爲每個ioapic引腳(總共24個引腳)收到的中斷選擇路由到哪個lapic,每個vcpu的lapic控制器則模擬了主要的apic寄存器(IRR、ISR、EOI)。

3、中斷路由表初始過程

kvm創建好pic、ioapic控制器後,會先使用default_routing(arch/x86/kvm/irq_comm.c)安裝默認的中斷路由表。

kvm_arch_vm_ioctl
    kvm_create_pic
    kvm_ioapic_init
    kvm_setup_default_irq_routing
        kvm_set_irq_routing
            setup_routing_entry

static int setup_routing_entry(struct kvm *kvm,
			       struct kvm_irq_routing_table *rt,
			       struct kvm_kernel_irq_routing_entry *e,
			       const struct kvm_irq_routing_entry *ue)
{
	int r = -EINVAL;
	struct kvm_kernel_irq_routing_entry *ei;

	/*
	 * Do not allow GSI to be mapped to the same irqchip more than once.
	 * Allow only one to one mapping between GSI and non-irqchip routing.
	 */
	hlist_for_each_entry(ei, &rt->map[ue->gsi], link)
		if (ei->type != KVM_IRQ_ROUTING_IRQCHIP ||
		    ue->type != KVM_IRQ_ROUTING_IRQCHIP ||
		    ue->u.irqchip.irqchip == ei->irqchip.irqchip)
			return r;

	e->gsi = ue->gsi;
	e->type = ue->type;
        //設置每個routing_entry信息
	r = kvm_set_routing_entry(kvm, e, ue);
	if (r)
		goto out;
	if (e->type == KVM_IRQ_ROUTING_IRQCHIP)
		rt->chip[e->irqchip.irqchip][e->irqchip.pin] = e->gsi;
     
        //將routing_entry連接到route_table的map鏈表
	hlist_add_head(&e->link, &rt->map[e->gsi]);
	r = 0;
out:
	return r;
}

/*
 * kvm_set_routing_entry - fill the type-specific part of a kernel routing
 * entry from its userspace description.
 *
 * @kvm: VM the route belongs to (only used for MSI route validation).
 * @e:   kernel-side entry to fill in.
 * @ue:  userspace-format entry.
 *
 * For irqchip routes the set callback, chip id and pin are stored; a PIC
 * slave pin is offset by 8. For MSI routes the address and data words are
 * copied and then checked with kvm_msi_route_invalid().
 *
 * Returns 0 on success and -EINVAL for an unknown route type, an unknown
 * irqchip, a pin at or beyond the chip's pin count, or an invalid MSI
 * route. Note that @e may already have been partially written when
 * -EINVAL is returned.
 */
int kvm_set_routing_entry(struct kvm *kvm,
			  struct kvm_kernel_irq_routing_entry *e,
			  const struct kvm_irq_routing_entry *ue)
{
	unsigned pin_limit;
	int pin_shift = 0;

	if (ue->type == KVM_IRQ_ROUTING_MSI) {
		e->set = kvm_set_msi;
		e->msi.address_lo = ue->u.msi.address_lo;
		e->msi.address_hi = ue->u.msi.address_hi;
		e->msi.data = ue->u.msi.data;

		if (kvm_msi_route_invalid(kvm, e))
			return -EINVAL;
		return 0;
	}

	if (ue->type != KVM_IRQ_ROUTING_IRQCHIP)
		return -EINVAL;

	/* Pick the injection callback and the pin range of the target chip. */
	switch (ue->u.irqchip.irqchip) {
	case KVM_IRQCHIP_IOAPIC:
		e->set = kvm_set_ioapic_irq;
		pin_limit = KVM_IOAPIC_NUM_PINS;
		break;
	case KVM_IRQCHIP_PIC_MASTER:
		e->set = kvm_set_pic_irq;
		pin_limit = PIC_NUM_PINS;
		break;
	case KVM_IRQCHIP_PIC_SLAVE:
		e->set = kvm_set_pic_irq;
		pin_limit = PIC_NUM_PINS;
		/* Slave pins are stored after the eight master pins. */
		pin_shift = 8;
		break;
	default:
		return -EINVAL;
	}

	e->irqchip.irqchip = ue->u.irqchip.irqchip;
	e->irqchip.pin = ue->u.irqchip.pin + pin_shift;

	if (e->irqchip.pin >= pin_limit)
		return -EINVAL;

	return 0;
}

setup_routing_entry的ue參數即爲default_routing,以上的流程主要就是將default_routing定義的路由信息保存到routing_table裏,default_routing初始化定義了0-23號(共24個)中斷的基本信息,如中斷type(都是非msi的IRQCHIP類型,包括pic、ioapic),中斷gsi號等。中斷路由表除了初始化安裝外,還可以通過KVM_SET_GSI_ROUTING重新安裝。

/*
 * Initializer for a routing entry that maps GSI @irq onto IOAPIC pin @irq.
 * The argument is parenthesized at every use so that an expression
 * argument cannot be re-associated by the surrounding initializer.
 */
#define IOAPIC_ROUTING_ENTRY(irq) \
	{ .gsi = (irq), .type = KVM_IRQ_ROUTING_IRQCHIP,	\
	  .u.irqchip = { .irqchip = KVM_IRQCHIP_IOAPIC, .pin = (irq) } }
/* One entry per GSI: IOAPIC only. */
#define ROUTING_ENTRY1(irq) IOAPIC_ROUTING_ENTRY(irq)

/*
 * Initializer for a routing entry that maps GSI @irq onto a PIC pin.
 * SELECT_PIC() (not defined in this excerpt) chooses the PIC chip; the pin
 * is the GSI number modulo 8.
 */
#define PIC_ROUTING_ENTRY(irq) \
	{ .gsi = (irq), .type = KVM_IRQ_ROUTING_IRQCHIP,	\
	  .u.irqchip = { .irqchip = SELECT_PIC(irq), .pin = (irq) % 8 } }
/* Two entries per GSI: the IOAPIC route followed by the PIC route. */
#define ROUTING_ENTRY2(irq) \
	IOAPIC_ROUTING_ENTRY(irq), PIC_ROUTING_ENTRY(irq)

/*
 * Default routing entries, in GSI order. ROUTING_ENTRY2 expands to an
 * IOAPIC entry followed by a PIC entry for the same GSI; ROUTING_ENTRY1
 * expands to an IOAPIC entry only.
 */
static const struct kvm_irq_routing_entry default_routing[] = {
	/* GSIs 0-15: routed to both the IOAPIC and a PIC. */
	ROUTING_ENTRY2(0),
	ROUTING_ENTRY2(1),
	ROUTING_ENTRY2(2),
	ROUTING_ENTRY2(3),
	ROUTING_ENTRY2(4),
	ROUTING_ENTRY2(5),
	ROUTING_ENTRY2(6),
	ROUTING_ENTRY2(7),
	ROUTING_ENTRY2(8),
	ROUTING_ENTRY2(9),
	ROUTING_ENTRY2(10),
	ROUTING_ENTRY2(11),
	ROUTING_ENTRY2(12),
	ROUTING_ENTRY2(13),
	ROUTING_ENTRY2(14),
	ROUTING_ENTRY2(15),
	/* GSIs 16-23: routed to the IOAPIC only. */
	ROUTING_ENTRY1(16),
	ROUTING_ENTRY1(17),
	ROUTING_ENTRY1(18),
	ROUTING_ENTRY1(19),
	ROUTING_ENTRY1(20),
	ROUTING_ENTRY1(21),
	ROUTING_ENTRY1(22),
	ROUTING_ENTRY1(23),
};

4、中斷觸發流程

當vfio或vhost等後端通過eventfd喚醒kvm中斷處理函數後,會進入irqfd_inject,然後調用kvm_set_irq,kvm_set_irq主要是查找中斷路由表,找到中斷對應的routing_entry,然後調用其set觸發函數,如果是ioapic類型的中斷,則會調用kvm_set_ioapic_irq,最後進入ioapic_service處理函數。ioapic_service主要是找到該中斷引腳在重定向表(redirtbl)裏的表項,然後根據表項裏的目的地信息把中斷轉發到對應vcpu的lapic去處理。

/*
 * ioapic_service - deliver the interrupt pending on IOAPIC pin @irq.
 *
 * @ioapic:      the virtual IOAPIC.
 * @irq:         pin number, used as index into the redirection table.
 * @line_status: passed by the caller; only consulted for the RTC pin.
 *
 * Builds a kvm_lapic_irq message from the pin's redirection entry and
 * hands it to kvm_irq_delivery_to_apic().
 *
 * Returns -1 if the pin is masked, otherwise the value returned by
 * kvm_irq_delivery_to_apic().
 */
static int ioapic_service(struct kvm_ioapic *ioapic, int irq, bool line_status)
{
	union kvm_ioapic_redirect_entry *rte = &ioapic->redirtbl[irq];
	struct kvm_lapic_irq msg;
	int delivered;

	/* A masked pin delivers nothing. */
	if (rte->fields.mask)
		return -1;

	ioapic_debug("dest=%x dest_mode=%x delivery_mode=%x "
		     "vector=%x trig_mode=%x\n",
		     rte->fields.dest_id, rte->fields.dest_mode,
		     rte->fields.delivery_mode, rte->fields.vector,
		     rte->fields.trig_mode);

	/* Translate the redirection entry into a local-APIC message. */
	msg.vector = rte->fields.vector;
	msg.delivery_mode = rte->fields.delivery_mode << 8;
	msg.dest_mode = rte->fields.dest_mode;
	msg.dest_id = rte->fields.dest_id;
	msg.trig_mode = rte->fields.trig_mode;
	msg.level = 1;
	msg.shorthand = 0;
	msg.msi_redir_hint = false;

	/* Remember that this edge-triggered pin has been delivered. */
	if (msg.trig_mode == IOAPIC_EDGE_TRIG)
		ioapic->irr_delivered |= 1 << irq;

	if (irq != RTC_GSI || !line_status) {
		delivered = kvm_irq_delivery_to_apic(ioapic->kvm, NULL, &msg,
						     NULL);
	} else {
		/*
		 * RTC pin: pending_eoi cannot ever become negative (see
		 * rtc_status_pending_eoi_check_valid) and the caller
		 * ensures this is only reached when it is zero, namely
		 * if rtc_irq_check_coalesced returns false.
		 */
		BUG_ON(ioapic->rtc_status.pending_eoi != 0);
		delivered = kvm_irq_delivery_to_apic(ioapic->kvm, NULL, &msg,
						     &ioapic->rtc_status.dest_map);
		/* A negative result is stored as zero pending EOIs. */
		if (delivered < 0)
			ioapic->rtc_status.pending_eoi = 0;
		else
			ioapic->rtc_status.pending_eoi = delivered;
	}

	/* A delivered level-triggered interrupt sets remote_irr. */
	if (delivered && msg.trig_mode == IOAPIC_LEVEL_TRIG)
		rte->fields.remote_irr = 1;

	return delivered;
}

lapic收到中斷後,會根據不同的delivery_mode調用不同的處理函數,以常見的APIC_DM_FIXED爲例,處理函數還會判斷是否啓用apicv功能,使用apicv和不使用apicv走不同的觸發流程。

/*
 * __apic_accept_irq - accept an interrupt at the local APIC.
 *
 * NOTE(review): this is an abridged excerpt; only the APIC_DM_FIXED case is
 * shown. The enclosing switch statement, the declaration of `vcpu` and the
 * return statement are not part of this excerpt.
 */
static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
			     int vector, int level, int trig_mode,
			     struct dest_map *dest_map)
{
	case APIC_DM_FIXED:

		// Bring the vector's bit in APIC_TMR in line with trig_mode.
		if (apic_test_vector(vector, apic->regs + APIC_TMR) != !!trig_mode) {
			if (trig_mode)
				kvm_lapic_set_vector(vector, apic->regs + APIC_TMR);
			else
				apic_clear_vector(vector, apic->regs + APIC_TMR);
		}
		// Check whether APICv is active for this vCPU.
		if (vcpu->arch.apicv_active)
			kvm_x86_ops->deliver_posted_interrupt(vcpu, vector);
		else {
			// Set the vector's bit in the IRR.
			kvm_lapic_set_irr(vector, apic);
			// Flag a pending interrupt event on the vCPU.
			kvm_make_request(KVM_REQ_EVENT, vcpu);
			// Kick the vCPU (presumably forcing it out of guest mode
			// so the request is processed).
			kvm_vcpu_kick(vcpu);
		}
		break;
}

1)、如果使能了apicv,最終調用vmx_deliver_posted_interrupt,使用中斷posting的方式來通知vcpu處理中斷。

/*
 * vmx_deliver_posted_interrupt - post @vector to @vcpu through its
 * posted-interrupt descriptor.
 *
 * Returns early when the nested-virtualization path reports 0, or when
 * the vector's PIR bit was already set. Otherwise sets the descriptor's
 * ON bit, raises KVM_REQ_EVENT, and kicks the vCPU if ON was already set
 * or if kvm_vcpu_trigger_posted_interrupt() returns zero.
 */
static void vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	int on_was_set;

	/* Nested virtualization case: a zero result means nothing is left to do. */
	if (!vmx_deliver_nested_posted_interrupt(vcpu, vector))
		return;

	/* Set the vector's bit in the PIR; stop if it was already set. */
	if (pi_test_and_set_pir(vector, &vmx->pi_desc))
		return;

	/* Set pi_desc.ON to signal that there is an interrupt to process. */
	on_was_set = pi_test_and_set_on(&vmx->pi_desc);
	kvm_make_request(KVM_REQ_EVENT, vcpu);

	/* ON was already set: no notification attempt, just kick the vCPU. */
	if (on_was_set) {
		kvm_vcpu_kick(vcpu);
		return;
	}

	/*
	 * Try to notify the vCPU directly (presumably a posted-interrupt IPI
	 * that a vCPU running in guest mode handles without a VM exit). If
	 * that is not possible, kick the vCPU so it notices the interrupt on
	 * its next VM entry.
	 */
	if (!kvm_vcpu_trigger_posted_interrupt(vcpu))
		kvm_vcpu_kick(vcpu);
}

2)、如果沒有使能apicv功能,則標記lapic的IRR寄存器,通過kvm_make_request標記vcpu有中斷請求事件,然後觸發vcpu vm-exit。當vcpu重新回到Guest模式時,會檢查是否有中斷請求事件,如果有,則設置ISR、PPR等寄存器信息。

vcpu_enter_guest
    inject_pending_event
        kvm_cpu_get_interrupt
            kvm_get_apic_interrupt
                kvm_queue_interrupt

/*
 * kvm_get_apic_interrupt - take the pending interrupt reported by
 * kvm_apic_has_interrupt() and mark it as in service.
 *
 * Returns the vector, or -1 when kvm_apic_has_interrupt() reports none.
 */
int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu)
{
	struct kvm_lapic *lapic;
	int vec;

	/* Ask the local APIC for its pending vector (-1 if there is none). */
	vec = kvm_apic_has_interrupt(vcpu);
	if (vec == -1)
		return -1;

	lapic = vcpu->arch.apic;

	/*
	 * We get here even with APIC virtualization enabled, if doing
	 * nested virtualization and L1 runs with the "acknowledge interrupt
	 * on exit" mode.  Then we cannot inject the interrupt via RVI,
	 * because the process would deliver it through the IDT.
	 */

	/* Mark the vector in the ISR: the vCPU is now servicing it. */
	apic_set_isr(vec, lapic);
	/* Update the PPR after the ISR change. */
	apic_update_ppr(lapic);
	/* The vector is no longer pending: clear it from the IRR. */
	apic_clear_irr(vec, lapic);

	return vec;
}

最後再調用vmx_inject_irq將之前保存在kvm_queued_interrupt的中斷信息寫到vmcs的VM_ENTRY_INTR_INFO_FIELD,等vcpu執行vm_entry時,就能感知到該中斷的存在。

/*
 * vmx_inject_irq - program the queued interrupt (vcpu->arch.interrupt) into
 * the VMCS so that it is injected on VM entry.
 *
 * NOTE(review): this excerpt is truncated; the end of the function,
 * including its closing brace, is missing from the text.
 */
static void vmx_inject_irq(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	uint32_t intr;
	int irq = vcpu->arch.interrupt.nr;

	trace_kvm_inj_virq(irq);

	++vcpu->stat.irq_injections;
	/* vm86-emulated real mode: inject via the real-mode helper instead. */
	if (vmx->rmode.vm86_active) {
		int inc_eip = 0;
		/* For a soft interrupt, pass the length of the exiting instruction. */
		if (vcpu->arch.interrupt.soft)
			inc_eip = vcpu->arch.event_exit_inst_len;
		/* If the helper does not complete, request a triple fault. */
		if (kvm_inject_realmode_interrupt(vcpu, irq, inc_eip) != EMULATE_DONE)
			kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
		return;
	}
	/* Build the interruption-information word: vector | valid | type. */
	intr = irq | INTR_INFO_VALID_MASK;
	if (vcpu->arch.interrupt.soft) {
		intr |= INTR_TYPE_SOFT_INTR;
		/* Soft interrupts also need the instruction length in the VMCS. */
		vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
			     vmx->vcpu.arch.event_exit_inst_len);
	} else
		intr |= INTR_TYPE_EXT_INTR;
	/* Write the VM-entry interruption-information field. */
	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr);

 

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章