remap_pfn_range: 將bar空間對映到user space pci_map_device
網上的Linux PCI驅動教程基本就沒有有用的。扯半天PCI配置空間就完了。但是PCI配置空間是最容易訪問的,只是核心啟動時掃描PCI裝置時比較重要。對於PCI驅動,更常用的是PCI裝置的IO空間和記憶體空間。
以前只知道在PCI裝置的配置空間中,BAR0-BAR5能夠讀取到PCI裝置的IO空間或地址空間的基址,但是如何區分這個BAR代表的到底是IO空間還是記憶體地址空間呢?
在PCI網絡卡的示例程式(pci-skeleton.c)中:
- pio_start=pci_resource_start(pdev,0);
- pio_end=pci_resource_end(pdev,0);
- pio_flags=pci_resource_flags(pdev,0);
- pio_len=pci_resource_len(pdev,0);
- mmio_start=pci_resource_start(pdev,1);
- mmio_end=pci_resource_end(pdev,1);
- mmio_flags=pci_resource_flags(pdev,1);
- mmio_len=pci_resource_len(pdev,1);
- /* make sure PCI base addr 0 is PIO */
- if (!(pio_flags & IORESOURCE_IO)) {
- dev_err(&pdev->dev, "region #0 not a PIO resource, aborting\n");
- rc = -ENODEV;
- goto err_out;
- }
- /* make sure PCI base addr 1 is MMIO */
- if (!(mmio_flags & IORESOURCE_MEM)) {
- dev_err(&pdev->dev, "region #1 not an MMIO resource, aborting\n");
- rc = -ENODEV;
- goto err_out;
- }
可以看到如果只寫驅動程式的話,核心在掃描pci裝置的時候早就把裝置的BAR的屬性識別好了。當然,到底有幾個BAR,每個BAR到底是IO空間還是PCI地址空間可以直接問製作PCI裝置的硬體工程師。
此外,非常重要的一個概念是,BAR讀取到的是PCI地址空間中的地址,不等同於CPU認識的記憶體地址。雖然在x86上如果沒有開啟IOMMU時,它們的值一般是相同的,但是對於其他構架的CPU如PowerPC就可以是不一樣的。
所以正確的使用BAR空間的方法:
pciaddr = pci_resource_start(pdev, 1);
if (pciaddr != 0)   /* pci_resource_start 返回的是整數型的資源地址,不是指標 */
{
    ioremap(pciaddr, xx_SIZE);
}
錯誤的方法:
pci_read_config_dword(pdev,1,&pciaddr);
ioremap(pciaddr,xx_SIZE);
dma_mem_map
static int vfio_type1_dma_mem_map(int vfio_container_fd, uint64_t vaddr, uint64_t iova, uint64_t len, int do_map) { struct vfio_iommu_type1_dma_map dma_map; struct vfio_iommu_type1_dma_unmap dma_unmap; int ret; if (do_map != 0) { memset(&dma_map, 0, sizeof(dma_map)); dma_map.argsz = sizeof(struct vfio_iommu_type1_dma_map); dma_map.vaddr = vaddr; dma_map.size = len; dma_map.iova = iova; dma_map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE; //VFIO_IOMMU_MAP_DMA這個命令就是將iova通過IOMMU對映到vaddr對應的實體地址上去。 ret = ioctl(vfio_container_fd, VFIO_IOMMU_MAP_DMA, &dma_map); if (ret) { /** * In case the mapping was already done EEXIST will be * returned from kernel. */ if (errno == EEXIST) { RTE_LOG(DEBUG, EAL, " Memory segment is already mapped," " skipping"); } else { RTE_LOG(ERR, EAL, " cannot set up DMA remapping," " error %i (%s)\n", errno, strerror(errno)); return -1; } } } else { memset(&dma_unmap, 0, sizeof(dma_unmap)); dma_unmap.argsz = sizeof(struct vfio_iommu_type1_dma_unmap); dma_unmap.size = len; dma_unmap.iova = iova; ret = ioctl(vfio_container_fd, VFIO_IOMMU_UNMAP_DMA, &dma_unmap); if (ret) { RTE_LOG(ERR, EAL, " cannot clear DMA remapping, error %i (%s)\n", errno, strerror(errno)); return -1; } } return 0; }
/*
 * mmap handler for a VFIO PCI device: maps one BAR (or a page-aligned part
 * of it) into the calling process's address space with remap_pfn_range().
 * The BAR index is encoded in the high bits of vma->vm_pgoff; the low
 * (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT) bits are the page offset within
 * that BAR. Returns 0 on success or a negative errno.
 */
static int vfio_pci_mmap(void *device_data, struct vm_area_struct *vma)
{
	struct vfio_pci_device *vdev = device_data;
	struct pci_dev *pdev = vdev->pdev;
	unsigned int index;
	u64 phys_len, req_len, pgoff, req_start;
	int ret;

	/* recover the BAR/region index from the upper bits of the offset */
	index = vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT);

	if (vma->vm_end < vma->vm_start)
		return -EINVAL;
	/* device memory must be mapped shared — private COW makes no sense */
	if ((vma->vm_flags & VM_SHARED) == 0)
		return -EINVAL;
	/* only BAR0..BAR5 regions are mmap-able, not ROM/config regions */
	if (index >= VFIO_PCI_ROM_REGION_INDEX)
		return -EINVAL;
	/* only memory BARs can be mmapped; I/O port BARs cannot */
	if (!(pci_resource_flags(pdev, index) & IORESOURCE_MEM))
		return -EINVAL;

	phys_len = pci_resource_len(pdev, index);
	req_len = vma->vm_end - vma->vm_start;
	/* low bits of vm_pgoff = page offset inside the selected BAR */
	pgoff = vma->vm_pgoff &
		((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1);
	req_start = pgoff << PAGE_SHIFT;

	/* requested window must fit entirely inside the BAR */
	if (phys_len < PAGE_SIZE || req_start + req_len > phys_len)
		return -EINVAL;

	if (index == vdev->msix_bar) {
		/*
		 * Disallow mmaps overlapping the MSI-X table; users don't
		 * get to touch this directly.  We could find somewhere
		 * else to map the overlap, but page granularity is only
		 * a recommendation, not a requirement, so the user needs
		 * to know which bits are real.  Requiring them to mmap
		 * around the table makes that clear.
		 */

		/* If neither entirely above nor below, then it overlaps */
		if (!(req_start >= vdev->msix_offset + vdev->msix_size ||
		      req_start + req_len <= vdev->msix_offset))
			return -EINVAL;
	}

	/*
	 * Even though we don't make use of the barmap for the mmap,
	 * we need to request the region and the barmap tracks that.
	 */
	if (!vdev->barmap[index]) {
		ret = pci_request_selected_regions(pdev,
						   1 << index, "vfio-pci");
		if (ret)
			return ret;

		vdev->barmap[index] = pci_iomap(pdev, index, 0);
	}

	vma->vm_private_data = vdev;
	vma->vm_flags |= VM_IO | VM_DONTEXPAND | VM_DONTDUMP;
	/* BAR space is device memory: map uncached */
	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
	/* rewrite vm_pgoff to the real physical pfn before remapping */
	vma->vm_pgoff = (pci_resource_start(pdev, index) >> PAGE_SHIFT) + pgoff;

	return remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff,
			       req_len, vma->vm_page_prot);
}
vfio-pci與igb_uio對映硬體資源
DPDK(version 20.02)函式rte_pci_map_device用來對映pci device resource到使用者態:
/* Map pci device, only reserve skeleton codes */
int
rte_pci_map_device(struct rte_pci_device *dev)
{
	/* Dispatch on which kernel driver the device is bound to. */
	switch (dev->kdrv) {
	case RTE_KDRV_VFIO:
		/* bound to vfio-pci: map BARs through the VFIO device fd */
		pci_vfio_map_resource(dev);
		break;
	case RTE_KDRV_IGB_UIO:
		/* bound to igb_uio — presumably mapped via UIO/sysfs
		 * resource files; confirm in the UIO mapping code */
		pci_uio_map_resource(dev);
		break;
	}
	/* NOTE(review): excerpt is truncated here — the closing brace and
	 * return value are omitted in this article's quotation. */
一 vfio-pci
當裝置繫結到vfio-pci時,呼叫函式pci_vfio_map_resource
1.1 函式pci_vfio_map_resource
我們在此對函式pci_vfio_map_resource_primary的主要部分進行分析。
static int pci_vfio_map_resource_primary(struct rte_pci_device *dev) { struct vfio_device_info device_info = { .argsz = sizeof(device_info) }; char pci_addr[PATH_MAX] = {0}; int vfio_dev_fd; struct rte_pci_addr *loc = &dev->addr; int i, ret; struct mapped_pci_resource *vfio_res = NULL; struct mapped_pci_res_list *vfio_res_list = RTE_TAILQ_CAST(rte_vfio_tailq.head, mapped_pci_res_list); struct pci_map *maps; dev->intr_handle.fd = -1; #ifdef HAVE_VFIO_DEV_REQ_INTERFACE dev->vfio_req_intr_handle.fd = -1; #endif /* store PCI address string */ snprintf(pci_addr, sizeof(pci_addr), PCI_PRI_FMT, loc->domain, loc->bus, loc->devid, loc->function); ret = rte_vfio_setup_device(rte_pci_get_sysfs_path(), pci_addr, &vfio_dev_fd, &device_info); if (ret) return ret; /* allocate vfio_res and get region info */ vfio_res = rte_zmalloc("VFIO_RES", sizeof(*vfio_res), 0); if (vfio_res == NULL) { RTE_LOG(ERR, EAL, "%s(): cannot store vfio mmap details\n", __func__); goto err_vfio_dev_fd; } memcpy(&vfio_res->pci_addr, &dev->addr, sizeof(vfio_res->pci_addr)); /* get number of registers (up to BAR5) */ vfio_res->nb_maps = RTE_MIN((int) device_info.num_regions, VFIO_PCI_BAR5_REGION_INDEX + 1); /* map BARs */ maps = vfio_res->maps; vfio_res->msix_table.bar_index = -1; /* get MSI-X BAR, if any (we have to know where it is because we can't * easily mmap it when using VFIO) */ ret = pci_vfio_get_msix_bar(vfio_dev_fd, &vfio_res->msix_table); if (ret < 0) { RTE_LOG(ERR, EAL, " %s cannot get MSI-X BAR number!\n", pci_addr); goto err_vfio_res; } /* if we found our MSI-X BAR region, check if we can mmap it */ if (vfio_res->msix_table.bar_index != -1) { int ret = pci_vfio_msix_is_mappable(vfio_dev_fd, vfio_res->msix_table.bar_index); if (ret < 0) { RTE_LOG(ERR, EAL, "Couldn't check if MSI-X BAR is mappable\n"); goto err_vfio_res; } else if (ret != 0) { /* we can map it, so we don't care where it is */ RTE_LOG(DEBUG, EAL, "VFIO reports MSI-X BAR as mappable\n"); vfio_res->msix_table.bar_index = -1; } } 
for (i = 0; i < (int) vfio_res->nb_maps; i++) { struct vfio_region_info *reg = NULL; void *bar_addr; ret = pci_vfio_get_region_info(vfio_dev_fd, ®, i); if (ret < 0) { RTE_LOG(ERR, EAL, " %s cannot get device region info " "error %i (%s)\n", pci_addr, errno, strerror(errno)); goto err_vfio_res; } /* chk for io port region */ ret = pci_vfio_is_ioport_bar(vfio_dev_fd, i); if (ret < 0) { free(reg); goto err_vfio_res; } else if (ret) { RTE_LOG(INFO, EAL, "Ignore mapping IO port bar(%d)\n", i); free(reg); continue; } /* skip non-mmapable BARs */ if ((reg->flags & VFIO_REGION_INFO_FLAG_MMAP) == 0) { free(reg); continue; } /* try mapping somewhere close to the end of hugepages */ if (pci_map_addr == NULL) pci_map_addr = pci_find_max_end_va(); bar_addr = pci_map_addr; pci_map_addr = RTE_PTR_ADD(bar_addr, (size_t) reg->size); maps[i].addr = bar_addr; maps[i].offset = reg->offset; maps[i].size = reg->size; maps[i].path = NULL; /* vfio doesn't have per-resource paths */ ret = pci_vfio_mmap_bar(vfio_dev_fd, vfio_res, i, 0); if (ret < 0) { RTE_LOG(ERR, EAL, " %s mapping BAR%i failed: %s\n", pci_addr, i, strerror(errno)); free(reg); goto err_vfio_res; } dev->mem_resource[i].addr = maps[i].addr; free(reg); } if (pci_rte_vfio_setup_device(dev, vfio_dev_fd) < 0) { RTE_LOG(ERR, EAL, " %s setup device failed\n", pci_addr); goto err_vfio_res; } }
1.1.1 rte_vfio_setup_device
此函式的主要工作內容如下:
首先要獲取device對應的iommu_group,找到iommu_group id, 並開啟對應的字元裝置
/* 此函式通過sys檔案系統獲取iommu_group的id號 */
int
rte_vfio_get_group_num(const char *sysfs_base,
const char *dev_addr, int *iommu_group_num)
/* 此函式開啟字元裝置/dev/vfio/{iommu_group},並返回字元裝置控制代碼 */
int
rte_vfio_get_group_fd(int iommu_group_num)
{
struct vfio_config *vfio_cfg;
/* get the vfio_config it belongs to */
vfio_cfg = get_vfio_cfg_by_group_num(iommu_group_num);
vfio_cfg = vfio_cfg ? vfio_cfg : default_vfio_cfg;
return vfio_get_group_fd(vfio_cfg, iommu_group_num);
}
獲取當前裝置所屬iommu_group的配置
/* Per-container VFIO state (one container fd plus its attached groups). */
struct vfio_config {
	/* presumably non-zero once VFIO is usable — confirm in eal_vfio.c */
	int vfio_enabled;
	/* fd of the VFIO container; used for VFIO_IOMMU_MAP_DMA ioctls */
	int vfio_container_fd;
	/* presumably the count of groups attached to this container */
	int vfio_active_groups;
	/* IOMMU backend (e.g. type1) selected for this container */
	const struct vfio_iommu_type *vfio_iommu_type;
	/* per-group state, indexed up to VFIO_MAX_GROUPS */
	struct vfio_group vfio_groups[VFIO_MAX_GROUPS];
	/* user-registered memory maps for DMA remapping */
	struct user_mem_maps mem_maps;
};
/* get the vfio_config it belongs to */
struct vfio_config *vfio_cfg;
vfio_cfg = get_vfio_cfg_by_group_num(iommu_group_num);
vfio_cfg = vfio_cfg ? vfio_cfg : default_vfio_cfg;
vfio_container_fd = vfio_cfg->vfio_container_fd;
user_mem_maps = &vfio_cfg->mem_maps;
將剛剛開啟的字元裝置新增到container中,並完成iommu的記憶體對映。在Intel架構中,呼叫函式vfio_type1_dma_map做對映;從程式碼看,所有DPDK管理的記憶體都做了對映。
獲取device fd及device info並返回。
/* get a file descriptor for the device */
*vfio_dev_fd = ioctl(vfio_group_fd, VFIO_GROUP_GET_DEVICE_FD, dev_addr);
/* test and setup the device */
ret = ioctl(*vfio_dev_fd, VFIO_DEVICE_GET_INFO, device_info);
1.1.2 pci_vfio_get_msix_bar
通過讀取裝置的PCI配置空間,讀取的方法是通過上一步取得的裝置控制代碼,獲取msix的配置資訊。並儲存到vfio_res結構體中。
/* get MSI-X BAR, if any (we have to know where it is because we can't
* easily mmap it when using VFIO)
*/
ret = pci_vfio_get_msix_bar(vfio_dev_fd, &vfio_res->msix_table);
1.1.3 pci_vfio_get_region_info & pci_vfio_mmap_bar
獲取裝置的BAR REGION(暫存器,中斷等資訊),並完成暫存器的mmap對映,讓使用者態程式能夠直接訪問PCI裝置的暫存器。
1.1.4 pci_rte_vfio_setup_device
這個函式首先設定中斷,將第一個中斷新增到系統的中斷輪訓連結串列去。
然後設定開啟裝置,並對裝置復位。
/*
 * Final per-device setup after BAR mapping: install interrupts, enable
 * bus mastering, then reset the device. Returns 0 on success, -1 on error.
 */
static int
pci_rte_vfio_setup_device(struct rte_pci_device *dev, int vfio_dev_fd)
{
	int rc;

	rc = pci_vfio_setup_interrupts(dev, vfio_dev_fd);
	if (rc != 0) {
		RTE_LOG(ERR, EAL, "Error setting up interrupts!\n");
		return -1;
	}

	/* set bus mastering for the device */
	rc = pci_vfio_set_bus_master(vfio_dev_fd, true);
	if (rc) {
		RTE_LOG(ERR, EAL, "Cannot set up bus mastering!\n");
		return -1;
	}

	/*
	 * Reset the device. If the device is not capable of resetting,
	 * then it updates errno as EINVAL.
	 */
	rc = ioctl(vfio_dev_fd, VFIO_DEVICE_RESET);
	if (rc != 0 && errno != EINVAL) {
		RTE_LOG(ERR, EAL, "Unable to reset device! Error: %d (%s)\n",
				errno, strerror(errno));
		return -1;
	}

	return 0;
}