1、x86平臺主要使用的中斷類型有pic、apic及msi中斷,在多核系統下的apic結構圖如下所示,每個cpu有一個lapic,外部中斷通過ioapic轉發到lapic,如果是msi中斷,則繞過了io apic直接發給lapic。
2、kvm初始化過程爲每個虛擬機維護一個pic主控制器、一個pic備控制器以及一個ioapic控制器,每個vcpu維護一個lapic控制器。同時每個虛擬機有一張中斷路由表(kvm_irq_routing_table)。中斷路由表裏的chip二維數組保存非msi中斷的gsi號,每個中斷都有自己的routing_entry,routing_entry保存了中斷的類型(pic、ioapic、msi)、中斷號、以及set觸發函數,所有的routing_entry以gsi爲索引信息掛接到route_table的map鏈表裏(可能同一個中斷號會同時關聯pic、ioapic兩種中斷type)。
ioapic裏還維護了一張中斷重定向表(redirtbl),負責爲每個ioapic引腳(總共24個引腳)收到的中斷選擇路由到哪個lapic,每個vcpu的lapic控制器則模擬了主要的apic寄存器(IRR、ISR、EOI)。
3、中斷路由表初始過程
kvm創建好pic、ioapic控制器後,會先使用default_routing(arch/x86/kvm/irq_comm.c)安裝默認的中斷路由表。
kvm_arch_vm_ioctl
kvm_create_pic
kvm_ioapic_init
kvm_setup_default_irq_routing
kvm_set_irq_routing
setup_routing_entry
static int setup_routing_entry(struct kvm *kvm,
struct kvm_irq_routing_table *rt,
struct kvm_kernel_irq_routing_entry *e,
const struct kvm_irq_routing_entry *ue)
{
int r = -EINVAL;
struct kvm_kernel_irq_routing_entry *ei;
/*
* Do not allow GSI to be mapped to the same irqchip more than once.
* Allow only one to one mapping between GSI and non-irqchip routing.
*/
hlist_for_each_entry(ei, &rt->map[ue->gsi], link)
if (ei->type != KVM_IRQ_ROUTING_IRQCHIP ||
ue->type != KVM_IRQ_ROUTING_IRQCHIP ||
ue->u.irqchip.irqchip == ei->irqchip.irqchip)
return r;
e->gsi = ue->gsi;
e->type = ue->type;
//設置每個routing_entry信息
r = kvm_set_routing_entry(kvm, e, ue);
if (r)
goto out;
if (e->type == KVM_IRQ_ROUTING_IRQCHIP)
rt->chip[e->irqchip.irqchip][e->irqchip.pin] = e->gsi;
//將routing_entry連接到route_table的map鏈表
hlist_add_head(&e->link, &rt->map[e->gsi]);
r = 0;
out:
return r;
}
/*
 * Translate the user-format routing entry @ue into the kernel entry @e:
 * choose the ->set delivery callback and copy either the irqchip/pin pair
 * or the MSI address and data.
 *
 * @kvm: VM the route belongs to (used for MSI route validation).
 * @e:   kernel-side entry to fill in.
 * @ue:  user-format entry to translate.
 *
 * Returns 0 on success, or -EINVAL for an unknown routing type, an unknown
 * irqchip, an out-of-range pin, or an MSI route rejected by
 * kvm_msi_route_invalid().
 */
int kvm_set_routing_entry(struct kvm *kvm,
			  struct kvm_kernel_irq_routing_entry *e,
			  const struct kvm_irq_routing_entry *ue)
{
	unsigned int pin_offset = 0;	/* added to the supplied pin */
	unsigned int pin_limit;		/* exclusive bound on the supplied pin */

	switch (ue->type) {
	case KVM_IRQ_ROUTING_IRQCHIP:
		switch (ue->u.irqchip.irqchip) {
		case KVM_IRQCHIP_PIC_MASTER:
			e->set = kvm_set_pic_irq;
			pin_limit = PIC_NUM_PINS;
			break;
		case KVM_IRQCHIP_PIC_SLAVE:
			e->set = kvm_set_pic_irq;
			/* Slave pins are stored shifted up by 8. */
			pin_offset = 8;
			pin_limit = PIC_NUM_PINS - pin_offset;
			break;
		case KVM_IRQCHIP_IOAPIC:
			e->set = kvm_set_ioapic_irq;
			pin_limit = KVM_IOAPIC_NUM_PINS;
			break;
		default:
			return -EINVAL;
		}

		/*
		 * Validate the pin as supplied, before the offset is added.
		 * Checking the sum instead would let a pin near the top of
		 * its range wrap around and slip past the bound.
		 */
		if (ue->u.irqchip.pin >= pin_limit)
			return -EINVAL;

		e->irqchip.irqchip = ue->u.irqchip.irqchip;
		e->irqchip.pin = ue->u.irqchip.pin + pin_offset;
		break;
	case KVM_IRQ_ROUTING_MSI:
		e->set = kvm_set_msi;
		e->msi.address_lo = ue->u.msi.address_lo;
		e->msi.address_hi = ue->u.msi.address_hi;
		e->msi.data = ue->u.msi.data;

		if (kvm_msi_route_invalid(kvm, e))
			return -EINVAL;
		break;
	default:
		return -EINVAL;
	}

	return 0;
}
setup_routing_entry的ue參數即爲default_routing,以上的流程主要就是將default_routing定義的路由信息保存到routing_table裏,default_routing初始化定義了0-23號(共24個)中斷的基本信息,如中斷type(都是非msi的IRQCHIP類型,包括pic、ioapic),中斷gsi號等。中斷路由表除了初始化安裝外,還可以通過KVM_SET_GSI_ROUTING重新安裝。
/* Route GSI @irq to the IOAPIC pin with the same number. */
#define IOAPIC_ROUTING_ENTRY(irq) \
	{ .gsi = (irq), .type = KVM_IRQ_ROUTING_IRQCHIP, \
	  .u.irqchip = { .irqchip = KVM_IRQCHIP_IOAPIC, .pin = (irq) } }
/* One entry per GSI: IOAPIC only. */
#define ROUTING_ENTRY1(irq) IOAPIC_ROUTING_ENTRY(irq)

/*
 * Route GSI @irq to pin (irq % 8) of the PIC chosen by SELECT_PIC().
 * NOTE(review): SELECT_PIC is defined outside this excerpt; presumably it
 * picks the master or slave PIC from the irq number - confirm.
 */
#define PIC_ROUTING_ENTRY(irq) \
	{ .gsi = (irq), .type = KVM_IRQ_ROUTING_IRQCHIP, \
	  .u.irqchip = { .irqchip = SELECT_PIC(irq), .pin = (irq) % 8 } }
/* Two entries per GSI: one IOAPIC route followed by one PIC route. */
#define ROUTING_ENTRY2(irq) \
	IOAPIC_ROUTING_ENTRY(irq), PIC_ROUTING_ENTRY(irq)

/*
 * Default routing table: GSIs 0-15 are routed to both the IOAPIC and a PIC,
 * GSIs 16-23 to the IOAPIC only.
 */
static const struct kvm_irq_routing_entry default_routing[] = {
	ROUTING_ENTRY2(0), ROUTING_ENTRY2(1),
	ROUTING_ENTRY2(2), ROUTING_ENTRY2(3),
	ROUTING_ENTRY2(4), ROUTING_ENTRY2(5),
	ROUTING_ENTRY2(6), ROUTING_ENTRY2(7),
	ROUTING_ENTRY2(8), ROUTING_ENTRY2(9),
	ROUTING_ENTRY2(10), ROUTING_ENTRY2(11),
	ROUTING_ENTRY2(12), ROUTING_ENTRY2(13),
	ROUTING_ENTRY2(14), ROUTING_ENTRY2(15),
	ROUTING_ENTRY1(16), ROUTING_ENTRY1(17),
	ROUTING_ENTRY1(18), ROUTING_ENTRY1(19),
	ROUTING_ENTRY1(20), ROUTING_ENTRY1(21),
	ROUTING_ENTRY1(22), ROUTING_ENTRY1(23),
};
4、中斷觸發流程
當vfio或vhost等後端通過eventfd喚醒kvm中斷處理函數後,會進入irqfd_inject,然後調用kvm_set_irq,kvm_set_irq主要是查找中斷路由表,找到中斷對應的routing_entry,然後調用其set觸發函數,如果是ioapic類型的中斷,則會調用kvm_set_ioapic_irq,最後進入ioapic_service處理函數。ioapic_service主要是找到中斷對應的重定向表項(redirtbl),然後查找中斷的目的地信息並轉發到對應vcpu的lapic去處理。
/*
 * Deliver the interrupt on IOAPIC pin @irq as described by its redirection
 * table entry.
 *
 * @ioapic:      the emulated IOAPIC.
 * @irq:         pin number, used as index into the redirection table.
 * @line_status: together with irq == RTC_GSI, selects the RTC path that
 *               tracks pending EOIs.
 *
 * Returns -1 without delivering anything when the entry is masked,
 * otherwise the value returned by kvm_irq_delivery_to_apic().
 */
static int ioapic_service(struct kvm_ioapic *ioapic, int irq, bool line_status)
{
	union kvm_ioapic_redirect_entry *rte = &ioapic->redirtbl[irq];
	struct kvm_lapic_irq irqe;
	int delivered;

	if (rte->fields.mask)
		return -1;

	ioapic_debug("dest=%x dest_mode=%x delivery_mode=%x "
		     "vector=%x trig_mode=%x\n",
		     rte->fields.dest_id, rte->fields.dest_mode,
		     rte->fields.delivery_mode, rte->fields.vector,
		     rte->fields.trig_mode);

	/* Build the LAPIC message from the redirection entry. */
	irqe.vector = rte->fields.vector;
	irqe.delivery_mode = rte->fields.delivery_mode << 8;
	irqe.dest_mode = rte->fields.dest_mode;
	irqe.dest_id = rte->fields.dest_id;
	irqe.trig_mode = rte->fields.trig_mode;
	irqe.level = 1;
	irqe.shorthand = 0;
	irqe.msi_redir_hint = false;

	/* Edge-triggered: remember that this pin's request was delivered. */
	if (irqe.trig_mode == IOAPIC_EDGE_TRIG)
		ioapic->irr_delivered |= 1 << irq;

	if (irq != RTC_GSI || !line_status) {
		delivered = kvm_irq_delivery_to_apic(ioapic->kvm, NULL, &irqe,
						     NULL);
	} else {
		/*
		 * pending_eoi cannot ever become negative (see
		 * rtc_status_pending_eoi_check_valid), and the caller only
		 * gets here when rtc_irq_check_coalesced returned false, so
		 * no EOI may still be outstanding at this point.
		 */
		BUG_ON(ioapic->rtc_status.pending_eoi != 0);
		delivered = kvm_irq_delivery_to_apic(ioapic->kvm, NULL, &irqe,
						     &ioapic->rtc_status.dest_map);
		/* A negative delivery result counts as zero pending EOIs. */
		ioapic->rtc_status.pending_eoi = (delivered < 0 ? 0 : delivered);
	}

	/* Level-triggered and delivered: set remote IRR on the entry. */
	if (delivered && irqe.trig_mode == IOAPIC_LEVEL_TRIG)
		rte->fields.remote_irr = 1;

	return delivered;
}
lapic收到中斷後,會根據不同的delivery_mode調用不同的處理函數,以常見的APIC_DM_FIXED爲例,處理函數還會判斷是否啓用apicv功能,使用apicv和不使用apicv走不同的觸發流程。
/*
 * Accept an interrupt on the local APIC @apic; only the APIC_DM_FIXED
 * delivery mode is shown here.
 *
 * NOTE(review): this listing is an abridged excerpt - the switch that the
 * case label belongs to, the declaration of vcpu and the return statement
 * are not shown.
 */
static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
int vector, int level, int trig_mode,
struct dest_map *dest_map)
{
case APIC_DM_FIXED:
// Bring the vector's bit in the TMR in line with the requested trigger mode
if (apic_test_vector(vector, apic->regs + APIC_TMR) != !!trig_mode) {
if (trig_mode)
kvm_lapic_set_vector(vector, apic->regs + APIC_TMR);
else
apic_clear_vector(vector, apic->regs + APIC_TMR);
}
// Check whether APICv is in use for this vcpu
if (vcpu->arch.apicv_active)
kvm_x86_ops->deliver_posted_interrupt(vcpu, vector);
else {
// Set the vector's bit in the IRR
kvm_lapic_set_irr(vector, apic);
// Flag a pending interrupt-request event on the vcpu
kvm_make_request(KVM_REQ_EVENT, vcpu);
// Kick the vcpu (pulls it back to the host, per the article's note)
kvm_vcpu_kick(vcpu);
}
break;
}
1)、如果使能了apicv,最終調用vmx_deliver_posted_interrupt,使用中斷posting的方式來通知vcpu處理中斷。
/*
 * Deliver @vector to @vcpu through its posted-interrupt descriptor.
 *
 * The nested-virtualization path is tried first; when it returns 0 nothing
 * else is done. Otherwise the vector is recorded in the descriptor and the
 * vcpu is notified, either by a posted-interrupt IPI or by a kick.
 */
static void vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	int on_result;

	/* Nested virtualization case. */
	if (!vmx_deliver_nested_posted_interrupt(vcpu, vector))
		return;

	/* Set the vector's bit in pi_desc; stop if the call reports non-zero. */
	if (pi_test_and_set_pir(vector, &vmx->pi_desc))
		return;

	/* Set pi_desc.on to signal that an interrupt is waiting. */
	on_result = pi_test_and_set_on(&vmx->pi_desc);
	kvm_make_request(KVM_REQ_EVENT, vcpu);

	/*
	 * If the vcpu is running in guest mode it is sent the
	 * POSTED_INTR_VECTOR IPI, which it can handle in non-root mode
	 * without a vm-exit. If it is not running, it is woken up so that
	 * the pending interrupt is noticed at the next vm-entry.
	 */
	if (on_result) {
		kvm_vcpu_kick(vcpu);
		return;
	}
	if (!kvm_vcpu_trigger_posted_interrupt(vcpu))
		kvm_vcpu_kick(vcpu);
}
2)、如果沒有使能apicv功能,則標記lapic的IRR寄存器,通過kvm_make_request標記vcpu有中斷請求事件,然後觸發vcpu vm-exit。當vcpu重新回到Guest模式時,會檢查是否有中斷請求事件,如果有,則設置ISR、PPR等寄存器信息。
vcpu_enter_guest
inject_pending_event
kvm_cpu_get_interrupt
kvm_get_apic_interrupt
kvm_queue_interrupt
/*
 * Take the interrupt reported by kvm_apic_has_interrupt() from the vcpu's
 * local APIC and mark it as being serviced.
 *
 * Returns the vector, or -1 when kvm_apic_has_interrupt() reports -1.
 */
int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu)
{
	struct kvm_lapic *apic;
	int pending;

	/* Highest-priority pending vector from the IRR, per the article. */
	pending = kvm_apic_has_interrupt(vcpu);
	if (pending == -1)
		return -1;

	apic = vcpu->arch.apic;

	/*
	 * We get here even with APIC virtualization enabled, if doing
	 * nested virtualization and L1 runs with the "acknowledge interrupt
	 * on exit" mode. Then we cannot inject the interrupt via RVI,
	 * because the process would deliver it through the IDT.
	 */

	/* Mark the vector in the ISR: the vcpu is now servicing it. */
	apic_set_isr(pending, apic);
	/* Refresh the PPR. */
	apic_update_ppr(apic);
	/* The request is no longer pending, so clear it from the IRR. */
	apic_clear_irr(pending, apic);

	return pending;
}
最後再調用vmx_inject_irq將之前保存在kvm_queued_interrupt的中斷信息寫到vmcs的VM_ENTRY_INTR_INFO_FIELD,等vcpu執行vm_entry時,就能感知到該中斷的存在。
static void vmx_inject_irq(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
uint32_t intr;
int irq = vcpu->arch.interrupt.nr;
trace_kvm_inj_virq(irq);
++vcpu->stat.irq_injections;
if (vmx->rmode.vm86_active) {
int inc_eip = 0;
if (vcpu->arch.interrupt.soft)
inc_eip = vcpu->arch.event_exit_inst_len;
if (kvm_inject_realmode_interrupt(vcpu, irq, inc_eip) != EMULATE_DONE)
kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
return;
}
intr = irq | INTR_INFO_VALID_MASK;
if (vcpu->arch.interrupt.soft) {
intr |= INTR_TYPE_SOFT_INTR;
vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
vmx->vcpu.arch.event_exit_inst_len);
} else
intr |= INTR_TYPE_EXT_INTR;
vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr);