7.3.1 IOMMU 初始化
kvm用到的iommu功能源碼位於:drivers/iommu,主要代碼爲iommu.c, iova.c, dmar.c,intel-iommu.c.
7.3.1.1 IOMMU模塊初始化
dmar.c: IOMMU_INIT_POST(detect_intel_iommu); (dmar是dma remapping縮寫)
int __init detect_intel_iommu(void) {
a. ret =dmar_table_detect(); //檢查acpi表中是否存在dmar table
b. 如果有則x86_init.iommu.iommu_init =intel_iommu_init;
}
arch/x86/kernel/pci-dma.c: rootfs_initcall(pci_iommu_init);
pci_iommu_init ==> x86_init.iommu.iommu_init;
int __init intel_iommu_init(void) {
a) iommu_init_mempool();該模塊要用到的內存池初始化iova, domain,dev_info三種cache
b) dmar_table_init()遍歷dmar table, 對每個dmar設備建立一個intel_iommu *iommu;
c) 對每個intel_iommu,默認禁用轉換iommu_disable_translation(iommu);
d) 遍歷系統中的pci_dev,建立哪些dmar unit 管理哪些pci_dev的映射dmar_dev_scope_init
e) init_no_remapping_devices();(一開始設置所有pci_dev都不用iommu,將
dev->archdata.iommu= DUMMY_DEVICE_DOMAIN_INFO);
f) init_dmars,初始化dmar硬件併爲每個dmar建立一個dmar_domain數組; 爲pci_dev準備rmrr. rmrr區域的內存是vm host要保留出來使用的, 因此在爲device建立iommu時要特別考慮(不建立iommu)
g) dma_ops= &intel_dma_ops;
h) bus_set_iommu(&pci_bus_type,&intel_iommu_ops);
bus_register_notifier(&pci_bus_type,&device_nb);
}
(1) ACPI 表中的相關結構
dmar_table_init ==> parse_dmar_table
dmar的描述位於ACPI表中,每個dmar硬件設備能管理多個pci_dev.
dmar_parse_one_drhd 用於解析一個硬件,每個硬件對應一個intel_iommu結構.
struct dmar_drhd_unit {
struct list_head list; /* list of drhd units */
struct acpi_dmar_header *hdr; /* ACPI header */
u64 reg_base_addr; /*硬件的寄存器基地址*/
struct dmar_dev_scope *devices;/* 用於關聯其上的pci設備*/
int devices_cnt; //pci設備數量,從acpi獲得
u16 segment; /*PCI domain 從acpi獲得 */
u8 ignored:1; /*ignore drhd */
u8 include_all:1;
struct intel_iommu*iommu; // alloc_iommu函數創建,並初始化其寄存器地址與屬性
};
關於寄存器的定義與acpi表中定義請參考intel vt-d spec.
dmar_table_init ==> dmar_parse_one_rmrr
struct dmar_rmrr_unit {
struct list_head list; /* list of rmrr units */
struct acpi_dmar_header*hdr; /* ACPI header */
u64 base_address; /*reserved memory base_addr */
u64 end_address; /*reserved memory end address */
struct dmar_dev_scope*devices; /* target devices */
int devices_cnt; /*target device count */
};
dmar_table_init ==> dmar_parse_one_atsr
struct dmar_atsr_unit {
struct list_head list; /* list of ATSR units */
struct acpi_dmar_header*hdr; /* ACPI header */
struct dmar_dev_scope*devices; /* target devices */
int devices_cnt; /* target device count */
u8 include_all:1; /* include all ports */
};
(2) pci device與dmar的關係建立
int __init dmar_dev_scope_init(void){
。。。。。。
for_each_pci_dev(dev){ //遍歷pci device
if(dev->is_virtfn)
continue;
info =dmar_alloc_pci_notify_info(dev,
BUS_NOTIFY_ADD_DEVICE);
if (!info) {
returndmar_dev_scope_status;
} else {
dmar_pci_bus_add_dev(info);
dmar_free_pci_notify_info(info);
}
}
}
dmar_alloc_pci_notify_info 建立如下結構:
struct dmar_pci_notify_info {
struct pci_dev *dev;
unsigned long event;//爲BUS_NOTIFY_ADD_DEVICE
int bus; //pci bus number
u16 seg; //爲:pci_domain_nr(dev->bus);
u16 level; //在bus 結構上的層次
structacpi_dmar_pci_path path[];
} __attribute__((packed));
dmar_pci_bus_add_dev 根據dmar_pci_notify_info,初始化dmar_drhd_unit->devices中的項
dmar_pci_bus_add_dev ==》 dmar_insert_dev_scope
a.遍歷dmar_unit,根據acpi查看該dev是否和dmar_unit對應
b. for_each_dev_scope(devices,devices_cnt, i, tmp)
if(tmp == NULL) {
devices[i].bus= info->dev->bus->number;
devices[i].devfn= info->dev->devfn;
rcu_assign_pointer(devices[i].dev,
get_device(dev));
return1;
}完成dmar_unit與device號的關聯
(3) intel_iommu與dmar_domain的綁定
init_dmars
a) 對每個dmar unit 作
ret = iommu_init_domains(iommu);
iommu_alloc_root_entry(iommu);
b) 對每個dmar unit (該步驟爲寄存器操作)
清除錯誤狀態dmar_fault(-1, iommu);
禁止queued invalidation dmar_disable_qi(iommu);
c) 開啓queued invalidation
dmar_enable_qi; 需要一個page的物理內存 ==》
1. dmar_writeq(iommu->reg + DMAR_IQA_REG,virt_to_phys(qi->desc));
iommu->flush.flush_context = qi_flush_context;
iommu->flush.flush_iotlb = qi_flush_iotlb;
d) queued invalidation用於iotlb, 可以一次提交多個invalidation請求
e) 爲設備建立iommu等值映射(物理地址=虛擬地址)
iommu_prepare_static_identity_mapping
f) 對每個dmar,
開啓dmar 硬件的中斷dmar_set_interrupt
設置root_entry的物理地址iommu_set_root_entry
使能iommu iommu_enable_translation
iommu_init_domains ==> 一個iommu有多個domain,domain數量由寄存器DMAR_CAP_REG決定。
iommu_alloc_root_entry ==>
每個iommu對應一個iommu->root_entry= alloc_pgtable_page(iommu->node);;
iommu_prepare_static_identity_mapping:
a) si_domain_init爲iommu添加一個domain,該domain爲等值映射domain
b) 將pci device加入到等值映射domain中去 dev_prepare_static_identity_mapping
其中dev_prepare_static_identity_mapping ==> iommu_should_identity_map==>device_has_rmrr會對有rmrr的device作特殊處理
c) 對acpi中的設備加入到等值映射domain中去 dev_prepare_static_identity_mapping
si_domain_init ==>
a. si_domain =alloc_domain(false);
b. 對所有iommu iommu_attach_domain(si_domain, iommu);
c. 爲所有物理內存做等值映射iommu_domain_identity_map
iommu_domain_identity_map:
a) unsignedlong first_vpfn = start >> VTD_PAGE_SHIFT;
unsignedlong last_vpfn = end >> VTD_PAGE_SHIFT;
reserve_iova(&domain->iovad,dma_to_mm_pfn(first_vpfn),
dma_to_mm_pfn(last_vpfn)); //爲物理內存建立一個iova虛擬內存區域,iova用於管理gpa區域
b) __domain_mapping完成iommu設置(下一節會分析該函數)
dev_prepare_static_identity_mapping==》domain_add_dev_info
a. 每個dev 對應一個device_domain_info
info->bus = bus;
info->devfn = devfn;
info->dev = dev;
info->domain =domain;
info->iommu = iommu;
b. 將info加入到list_add(&info->link, &domain->devices);中
c. dev->archdata.iommu = info;
dev_prepare_static_identity_mapping==》domain_context_mapping ==>
domain_context_mapping_one: 將pci_device根據bus 號加入到
iommu->root_entry[bus]中,
root =&iommu->root_entry[bus];
context =get_context_addr_from_root(root);
entry = context[devfn]爲設備對應的實際entry位置
關聯entry 與(domain->id, domain->pgd)
通過上面分析發現當iommu開啓時:
(1)驅動爲所有pcidevice做了物理地址與虛擬地址等值映射的domain.
(2) intel_iommu對iommu層提供了接口bus_set_iommu(&pci_bus_type, &intel_iommu_ops)
static struct iommu_ops intel_iommu_ops = {
.domain_init = intel_iommu_domain_init,
.domain_destroy =intel_iommu_domain_destroy,
.attach_dev = intel_iommu_attach_device,
.detach_dev = intel_iommu_detach_device,
.map = intel_iommu_map,
.unmap = intel_iommu_unmap,
.iova_to_phys = intel_iommu_iova_to_phys,
.domain_has_cap =intel_iommu_domain_has_cap,
.add_device = intel_iommu_add_device,
.remove_device = intel_iommu_remove_device,
.pgsize_bitmap = INTEL_IOMMU_PGSIZES,
};
下一節將分析kvm中對iommu的調用和iommu映射的建立過程
7.3.2 IOMMU Map
(1) attach
iommu_attach_device(struct iommu_domain *domain, struct device*dev)
==> domain->ops->attach_dev(domain, dev);
由於在bus_set_iommu(&pci_bus_type, &intel_iommu_ops)
kvm_vm_ioctl_assign_device ==》 kvm_iommu_map_guest ==》
kvm->arch.iommu_domain =iommu_domain_alloc(&pci_bus_type);
而struct iommu_domain *iommu_domain_alloc(struct bus_type *bus)
{
。。。。。。
domain = kzalloc(sizeof(*domain),GFP_KERNEL);
domain->ops = bus->iommu_ops;
ret =domain->ops->domain_init(domain);
if (ret)
goto out_free;
return domain;
}
而在bus_set_iommu(&pci_bus_type, &intel_iommu_ops) 會使
bus->iommu_ops = intel_iommu_ops; 所以domain->ops->attach_dev = intel_iommu_attach_device.同時domain->ops->domain_init =intel_iommu_domain_init
intel_iommu_domain_init (struct iommu_domain *domain) {
dmar_domain =alloc_domain(true);
domain->priv =dmar_domain;
}
intel_iommu_attach_device(struct iommu_domain *domain,
struct device *dev)
a. 若設備已有domain,則移除原來的domain, domain_remove_dev_info
這樣原來等值映射的domain被移除了
b. domain_add_dev_info 將設備和新的domain關聯
(2) iommu map
kvm_iommu_map_pages ==> iommu_map(domain, gfn_to_gpa(gfn),pfn_to_hpa(pfn),
page_size, flags);
iommu_map(struct iommu_domain *domain, unsigned long iova,
phys_addr_t paddr, size_t size, int prot)
{
.........
while (size) {
size_t pgsize =iommu_pgsize(domain, iova | paddr, size);
//iova 爲gpa, paddr 爲hpa
ret =domain->ops->map(domain, iova, paddr, pgsize, prot);
if (ret)
break;
iova += pgsize;
paddr += pgsize;
size -= pgsize;
}
}
intel_iommu_map(struct iommu_domain *domain,
unsigned long iova, phys_addr_t hpa,
size_t size, int iommu_prot) ==>
domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
hpa >> VTD_PAGE_SHIFT, size, prot);==> __domain_mapping
映射的目的是能根據iov_pfn得到hpa地址, 該過程類似爲虛擬地址和物理地址映射的過程,簡化後的代碼如下:
prot &=DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
{
sg_res = nr_pages +1;
pteval =((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
}
while (nr_pages > 0){
uint64_t tmp;
if (!pte) {
largepage_lvl =hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
first_pte = pte= pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
if(largepage_lvl > 1) {
pteval |=DMA_PTE_LARGE_PAGE;
dma_pte_clear_range(domain,iov_pfn,
iov_pfn + lvl_to_nr_pages(largepage_lvl) -1);
dma_pte_free_pagetable(domain,iov_pfn,
iov_pfn + lvl_to_nr_pages(largepage_lvl) -1);
} else {
pteval &=~(uint64_t)DMA_PTE_LARGE_PAGE;
}
}
tmp =cmpxchg64_local(&pte->val, 0ULL, pteval);
lvl_pages =lvl_to_nr_pages(largepage_lvl);
nr_pages -=lvl_pages;
iov_pfn +=lvl_pages;
phys_pfn +=lvl_pages;
pteval += lvl_pages* VTD_PAGE_SIZE;
sg_res -= lvl_pages;
pte++;
if (!nr_pages ||first_pte_in_page(pte) ||
(largepage_lvl > 1 && sg_res< lvl_pages)) {
domain_flush_cache(domain,first_pte,
(void *)pte - (void *)first_pte);
pte = NULL;
}
}
這裏要補充的一個概念是agaw,是客戶機地址寬度修正值,如host的物理內存是4GB,而客戶機內存爲2G, 所以GAW爲31. AGAW爲:
static inline int guestwidth_to_adjustwidth(int gaw) {
int agaw;
int r = (gaw - 12) % 9;
if (r == 0)
agaw = gaw;
else
agaw = gaw + 9 - r;
if (agaw > 64)
agaw = 64;
return agaw;
}
agaw決定了IO多級頁表的級數:
agaw | IO頁表級數
 30  |  2
 39  |  3
 48  |  4
 57  |  5
 64  |  6