1. 程式人生 > 實用技巧 > remap_pfn_range: 將bar空間對映到user space pci_map_device

remap_pfn_range: 將bar空間對映到user space pci_map_device

網上的Linux PCI驅動教程基本就沒有有用的。扯半天PCI配置空間就完了。但是PCI配置空間是最容易訪問的,只是核心啟動時掃描PCI裝置時比較重要。對於PCI驅動,更常用的是PCI裝置的IO空間和記憶體空間。
以前只知道在PCI裝置的配置空間中,BAR0-BAR5能夠讀取到PCI裝置的IO空間或地址空間的基址,但是如何區分這個BAR代表的到底是IO空間還是記憶體地址空間呢
在PCI網絡卡的示例程式(pci-skeleton.c)中:

  1. pio_start=pci_resource_start(pdev,0);
  2. pio_end=pci_resource_end(pdev,0);
  3. pio_flags=pci_resource_flags(pdev,0);
  4. pio_len=pci_resource_len(pdev,0);
  5. mmio_start=pci_resource_start(pdev,1);
  6. mmio_end=pci_resource_end(pdev,1);
  7. mmio_flags=pci_resource_flags(pdev,1);
  8. mmio_len=pci_resource_len(pdev,1);
  9. /* make sure PCI base addr 0 is PIO */
  10. if (!(pio_flags & IORESOURCE_IO)) {
  11.     dev_err(&pdev->dev, "region #0 not a PIO resource, aborting\n");
  12.     rc = -ENODEV;
  13.     goto err_out;
  14. }
  15. /* make sure PCI base addr 1 is MMIO */
  16. if (!(mmio_flags & IORESOURCE_MEM)) {
  17.     dev_err(&pdev->dev, "region #1 not an MMIO resource, aborting\n");
  18.     rc = -ENODEV;
  19.     goto err_out;
  20. }

可以看到如果只寫驅動程式的話,核心在掃描pci裝置的時候早就把裝置的BAR的屬性識別好了。當然,到底有幾個BAR,每個BAR到底是IO空間還是PCI地址空間可以直接問製作PCI裝置的硬體工程師。

那麼核心是如何獲得這個flags呢?我跟了半天原始碼也沒找到。只是知道,PCI匯流排規範規定直接讀BAR,返回的是BAR空間基址。先寫全1到BAR再 讀,就能讀取到BAR空間大小和屬性。選最低的一位非0的,比如讀到0xFFFFFF00,那個空間的大小就為0x100個Byte ,最後一位為0說明是地址區域,為1則這個BAR是IO空間。

此外,非常重要的一個概念是,BAR讀取到的是PCI地址空間中的地址,不等同於CPU認識的記憶體地址。雖然在x86上如果沒有開啟IOMMU時,它們的值一般是相同的,但是對於其他構架的CPU如PowerPC就可以是不一樣的。
所以正確的使用BAR空間的方法:

pciaddr = pci_resource_start(pdev, 1);
if (pciaddr != 0)   /* pci_resource_start 返回的是整數型別的資源基址,不是指標 */
{
    ioremap(pciaddr, xx_SIZE);
}

錯誤的方法:

pci_read_config_dword(pdev,1,&pciaddr);
ioremap(pciaddr,xx_SIZE);

dma_mem_map
static int
vfio_type1_dma_mem_map(int vfio_container_fd, uint64_t vaddr, uint64_t iova,
        uint64_t len, int do_map)
{
    struct vfio_iommu_type1_dma_map dma_map;
    struct vfio_iommu_type1_dma_unmap dma_unmap;
    int ret;

    if (do_map != 0) {
        memset(&dma_map, 0, sizeof(dma_map));
        dma_map.argsz = sizeof(struct vfio_iommu_type1_dma_map);
        dma_map.vaddr = vaddr;
        dma_map.size = len;
        dma_map.iova = iova;
        dma_map.flags = VFIO_DMA_MAP_FLAG_READ |
                VFIO_DMA_MAP_FLAG_WRITE;

//VFIO_IOMMU_MAP_DMA這個命令就是將iova通過IOMMU對映到vaddr對應的實體地址上去。
        ret = ioctl(vfio_container_fd, VFIO_IOMMU_MAP_DMA, &dma_map);
        if (ret) {
            /**
             * In case the mapping was already done EEXIST will be
             * returned from kernel.
             */
            if (errno == EEXIST) {
                RTE_LOG(DEBUG, EAL,
                    " Memory segment is already mapped,"
                    " skipping");
            } else {
                RTE_LOG(ERR, EAL,
                    "  cannot set up DMA remapping,"
                    " error %i (%s)\n",
                    errno, strerror(errno));
                return -1;
            }
        }
    } else {
        memset(&dma_unmap, 0, sizeof(dma_unmap));
        dma_unmap.argsz = sizeof(struct vfio_iommu_type1_dma_unmap);
        dma_unmap.size = len;
        dma_unmap.iova = iova;

        ret = ioctl(vfio_container_fd, VFIO_IOMMU_UNMAP_DMA,
                &dma_unmap);
        if (ret) {
            RTE_LOG(ERR, EAL, "  cannot clear DMA remapping, error %i (%s)\n",
                    errno, strerror(errno));
            return -1;
        }
    }

    return 0;
}

/*
 * vfio-pci mmap handler (kernel side): map one PCI BAR of the device
 * into the calling process's address space via remap_pfn_range().
 *
 * The BAR index is encoded in the high bits of vma->vm_pgoff
 * (VFIO_PCI_OFFSET_SHIFT); the low bits are the page offset within
 * that BAR.  Only shared, MEM-type BAR mappings are allowed, and a
 * mapping may not overlap the MSI-X table.
 *
 * Returns 0 on success or a negative errno.
 */
static int vfio_pci_mmap(void *device_data, struct vm_area_struct *vma)
{
    struct vfio_pci_device *vdev = device_data;
    struct pci_dev *pdev = vdev->pdev;
    unsigned int index;
    u64 phys_len, req_len, pgoff, req_start;
    int ret;
    /* high bits of vm_pgoff select which BAR/region to map */
    index = vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT);
    if (vma->vm_end < vma->vm_start)
        return -EINVAL;
    /* private mappings of device memory make no sense */
    if ((vma->vm_flags & VM_SHARED) == 0)
        return -EINVAL;
    /* only the BAR regions (0-5) may be mmap'ed, not ROM/config */
    if (index >= VFIO_PCI_ROM_REGION_INDEX)
        return -EINVAL;
    /* I/O-port BARs cannot be mapped into memory */
    if (!(pci_resource_flags(pdev, index) & IORESOURCE_MEM))
        return -EINVAL;
    phys_len = pci_resource_len(pdev, index);
    req_len = vma->vm_end - vma->vm_start;
    /* low bits of vm_pgoff: page offset inside the selected BAR */
    pgoff = vma->vm_pgoff &
        ((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1);
    req_start = pgoff << PAGE_SHIFT;
    /* requested window must lie entirely within the BAR */
    if (phys_len < PAGE_SIZE || req_start + req_len > phys_len)
        return -EINVAL;
    if (index == vdev->msix_bar) {
        /*
         * Disallow mmaps overlapping the MSI-X table; users don't
         * get to touch this directly.  We could find somewhere
         * else to map the overlap, but page granularity is only
         * a recommendation, not a requirement, so the user needs
         * to know which bits are real.  Requiring them to mmap
         * around the table makes that clear.
         */
        /* If neither entirely above nor below, then it overlaps */
        if (!(req_start >= vdev->msix_offset + vdev->msix_size ||
              req_start + req_len <= vdev->msix_offset))
            return -EINVAL;
    }
    /*
     * Even though we don't make use of the barmap for the mmap,
     * we need to request the region and the barmap tracks that.
     */
    if (!vdev->barmap[index]) {
        ret = pci_request_selected_regions(pdev,
                           1 << index, "vfio-pci");
        if (ret)
            return ret;
        vdev->barmap[index] = pci_iomap(pdev, index, 0);
    }
    vma->vm_private_data = vdev;
    vma->vm_flags |= VM_IO | VM_DONTEXPAND | VM_DONTDUMP;
    /* device memory: must not be cached by the CPU */
    vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
    /* rewrite vm_pgoff to the physical PFN of the BAR before remap */
    vma->vm_pgoff = (pci_resource_start(pdev, index) >> PAGE_SHIFT) + pgoff;
    return remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff,
                   req_len, vma->vm_page_prot);
}

vfio-pci與igb_uio對映硬體資源

DPDK(version 20.02)函式rte_pci_map_device用來對映pci device resource到使用者態:

/*
 * Map pci device, only reserve skeleton codes.
 *
 * Dispatch on the kernel driver the device is bound to: vfio-pci and
 * igb_uio expose BARs to user space through different mechanisms.
 *
 * NOTE(review): the original quote was truncated — the switch and the
 * function were never closed and the int function had no return.
 */
int
rte_pci_map_device(struct rte_pci_device *dev)
{
    switch (dev->kdrv) {
    case RTE_KDRV_VFIO:
        pci_vfio_map_resource(dev);
        break;
    case RTE_KDRV_IGB_UIO:
        pci_uio_map_resource(dev);
        break;
    default:
        /* unknown/unsupported driver: nothing to map */
        break;
    }

    return 0;
}

一 vfio-pci
當裝置繫結到vfio-pci時,呼叫函式pci_vfio_map_resource

1.1 函式pci_vfio_map_resource

我們在此對函式pci_vfio_map_resource_primary的主要部分進行分析。

/*
 * Primary-process path of pci_vfio_map_resource(): set up the VFIO
 * device, discover its regions, and mmap every mappable MEM BAR into
 * this process so the PMD can access device registers directly.
 *
 * Returns 0 on success, non-zero on failure.  On failure the vfio_res
 * allocation and the device fd are released via the goto cleanup path.
 */
static int
pci_vfio_map_resource_primary(struct rte_pci_device *dev)
{
    struct vfio_device_info device_info = { .argsz = sizeof(device_info) };
    char pci_addr[PATH_MAX] = {0};
    int vfio_dev_fd;
    struct rte_pci_addr *loc = &dev->addr;
    int i, ret;
    struct mapped_pci_resource *vfio_res = NULL;
    struct mapped_pci_res_list *vfio_res_list =
        RTE_TAILQ_CAST(rte_vfio_tailq.head, mapped_pci_res_list);

    struct pci_map *maps;

    dev->intr_handle.fd = -1;
#ifdef HAVE_VFIO_DEV_REQ_INTERFACE
    dev->vfio_req_intr_handle.fd = -1;
#endif

    /* store PCI address string */
    snprintf(pci_addr, sizeof(pci_addr), PCI_PRI_FMT,
            loc->domain, loc->bus, loc->devid, loc->function);

    ret = rte_vfio_setup_device(rte_pci_get_sysfs_path(), pci_addr,
                    &vfio_dev_fd, &device_info);
    if (ret)
        return ret;

    /* allocate vfio_res and get region info */
    vfio_res = rte_zmalloc("VFIO_RES", sizeof(*vfio_res), 0);
    if (vfio_res == NULL) {
        RTE_LOG(ERR, EAL,
            "%s(): cannot store vfio mmap details\n", __func__);
        goto err_vfio_dev_fd;
    }
    memcpy(&vfio_res->pci_addr, &dev->addr, sizeof(vfio_res->pci_addr));

    /* get number of registers (up to BAR5) */
    vfio_res->nb_maps = RTE_MIN((int) device_info.num_regions,
            VFIO_PCI_BAR5_REGION_INDEX + 1);

    /* map BARs */
    maps = vfio_res->maps;

    vfio_res->msix_table.bar_index = -1;
    /* get MSI-X BAR, if any (we have to know where it is because we can't
     * easily mmap it when using VFIO)
     */
    ret = pci_vfio_get_msix_bar(vfio_dev_fd, &vfio_res->msix_table);
    if (ret < 0) {
        RTE_LOG(ERR, EAL, "  %s cannot get MSI-X BAR number!\n",
                pci_addr);
        goto err_vfio_res;
    }
    /* if we found our MSI-X BAR region, check if we can mmap it */
    if (vfio_res->msix_table.bar_index != -1) {
        int ret = pci_vfio_msix_is_mappable(vfio_dev_fd,
                vfio_res->msix_table.bar_index);
        if (ret < 0) {
            RTE_LOG(ERR, EAL, "Couldn't check if MSI-X BAR is mappable\n");
            goto err_vfio_res;
        } else if (ret != 0) {
            /* we can map it, so we don't care where it is */
            RTE_LOG(DEBUG, EAL, "VFIO reports MSI-X BAR as mappable\n");
            vfio_res->msix_table.bar_index = -1;
        }
    }

    for (i = 0; i < (int) vfio_res->nb_maps; i++) {
        struct vfio_region_info *reg = NULL;
        void *bar_addr;

        ret = pci_vfio_get_region_info(vfio_dev_fd, &reg, i);
        if (ret < 0) {
            RTE_LOG(ERR, EAL, "  %s cannot get device region info "
                "error %i (%s)\n", pci_addr, errno,
                strerror(errno));
            goto err_vfio_res;
        }

        /* chk for io port region */
        ret = pci_vfio_is_ioport_bar(vfio_dev_fd, i);
        if (ret < 0) {
            free(reg);
            goto err_vfio_res;
        } else if (ret) {
            RTE_LOG(INFO, EAL, "Ignore mapping IO port bar(%d)\n",
                    i);
            free(reg);
            continue;
        }

        /* skip non-mmapable BARs */
        if ((reg->flags & VFIO_REGION_INFO_FLAG_MMAP) == 0) {
            free(reg);
            continue;
        }

        /* try mapping somewhere close to the end of hugepages */
        if (pci_map_addr == NULL)
            pci_map_addr = pci_find_max_end_va();

        bar_addr = pci_map_addr;
        pci_map_addr = RTE_PTR_ADD(bar_addr, (size_t) reg->size);

        maps[i].addr = bar_addr;
        maps[i].offset = reg->offset;
        maps[i].size = reg->size;
        maps[i].path = NULL; /* vfio doesn't have per-resource paths */

        ret = pci_vfio_mmap_bar(vfio_dev_fd, vfio_res, i, 0);
        if (ret < 0) {
            RTE_LOG(ERR, EAL, "  %s mapping BAR%i failed: %s\n",
                    pci_addr, i, strerror(errno));
            free(reg);
            goto err_vfio_res;
        }

        dev->mem_resource[i].addr = maps[i].addr;

        free(reg);
    }

    if (pci_rte_vfio_setup_device(dev, vfio_dev_fd) < 0) {
        RTE_LOG(ERR, EAL, "  %s setup device failed\n", pci_addr);
        goto err_vfio_res;
    }

    /* success: remember the mapping so secondary processes can reuse it.
     * NOTE(review): the original quote ended here with no return and no
     * definition of the err_* labels referenced above; the tail below
     * restores the upstream DPDK 20.02 cleanup path.
     */
    TAILQ_INSERT_TAIL(vfio_res_list, vfio_res, next);

    return 0;
err_vfio_res:
    rte_free(vfio_res);
err_vfio_dev_fd:
    close(vfio_dev_fd);
    return -1;
}

1.1.1 rte_vfio_setup_device
此函式的主要工作內容如下:

首先要獲取device對應的iommu_group,找到iommu_group id, 並開啟對應的字元裝置
/* 此函式通過sys檔案系統獲取iommu_group的id號 */
int
rte_vfio_get_group_num(const char *sysfs_base,
const char *dev_addr, int *iommu_group_num)

/* 此函式開啟字元裝置/dev/vfio/{iommu_group},並返回字元裝置控制代碼 */
int
rte_vfio_get_group_fd(int iommu_group_num)
{
struct vfio_config *vfio_cfg;

/* get the vfio_config it belongs to */
vfio_cfg = get_vfio_cfg_by_group_num(iommu_group_num);
vfio_cfg = vfio_cfg ? vfio_cfg : default_vfio_cfg;

return vfio_get_group_fd(vfio_cfg, iommu_group_num);
}
獲取當前裝置所屬iommu_group的配置
/*
 * Per-container VFIO state used by the DPDK EAL.
 * One instance groups a container fd with the IOMMU groups attached
 * to it and the user memory mappings registered against it.
 */
struct vfio_config {
/* non-zero once VFIO is initialized for this config — TODO confirm exact semantics */
int vfio_enabled;
/* fd of the VFIO container (/dev/vfio/vfio); see its use for DMA map ioctls */
int vfio_container_fd;
/* presumably the number of groups currently bound to this container — verify */
int vfio_active_groups;
/* IOMMU backend in use (e.g. type1); NOTE(review): semantics not shown here */
const struct vfio_iommu_type *vfio_iommu_type;
/* per-group state, indexed up to VFIO_MAX_GROUPS */
struct vfio_group vfio_groups[VFIO_MAX_GROUPS];
/* user-registered memory maps for this container */
struct user_mem_maps mem_maps;
};

/* get the vfio_config it belongs to */
struct vfio_config *vfio_cfg;
vfio_cfg = get_vfio_cfg_by_group_num(iommu_group_num);
vfio_cfg = vfio_cfg ? vfio_cfg : default_vfio_cfg;
vfio_container_fd = vfio_cfg->vfio_container_fd;
user_mem_maps = &vfio_cfg->mem_maps;
將剛剛開啟的字元裝置新增到container中,並完成iommu的記憶體對映。在Intel架構中,呼叫函式vfio_type1_dma_map做對映;看上去所有DPDK管理的記憶體都做了對映。

獲取device fd及device info並返回。

/* get a file descriptor for the device */
*vfio_dev_fd = ioctl(vfio_group_fd, VFIO_GROUP_GET_DEVICE_FD, dev_addr);

/* test and setup the device */
ret = ioctl(*vfio_dev_fd, VFIO_DEVICE_GET_INFO, device_info);
1.1.2 pci_vfio_get_msix_bar
通過讀取裝置的PCI配置空間,讀取的方法是通過上一步取得的裝置控制代碼,獲取msix的配置資訊。並儲存到vfio_res結構體中。

/* get MSI-X BAR, if any (we have to know where it is because we can't
* easily mmap it when using VFIO)
*/
ret = pci_vfio_get_msix_bar(vfio_dev_fd, &vfio_res->msix_table);

1.1.3 pci_vfio_get_region_info & pci_vfio_mmap_bar
獲取裝置的BAR REGION(暫存器,中斷等資訊),並完成暫存器的mmap對映,讓使用者態程式能夠直接訪問PCI裝置的暫存器。

1.1.4 pci_rte_vfio_setup_device
這個函式首先設定中斷,將第一個中斷新增到系統的中斷輪訓連結串列去。
然後設定開啟裝置,並對裝置復位。


/*
 * Final per-device setup after the BARs have been mapped: register the
 * device's interrupt with the EAL, enable bus mastering, then reset
 * the device.
 *
 * Returns 0 on success, -1 on any failure.
 */
static int
pci_rte_vfio_setup_device(struct rte_pci_device *dev, int vfio_dev_fd)
{
    /* hook the device interrupt into the EAL interrupt handling */
    if (pci_vfio_setup_interrupts(dev, vfio_dev_fd) != 0) {
        RTE_LOG(ERR, EAL, "Error setting up interrupts!\n");
        return -1;
    }

    /* set bus mastering for the device */
    if (pci_vfio_set_bus_master(vfio_dev_fd, true)) {
        RTE_LOG(ERR, EAL, "Cannot set up bus mastering!\n");
        return -1;
    }

    /*
     * Reset the device. If the device is not capable of resetting,
     * then it updates errno as EINVAL.
     */
    if (ioctl(vfio_dev_fd, VFIO_DEVICE_RESET) && errno != EINVAL) {
        RTE_LOG(ERR, EAL, "Unable to reset device! Error: %d (%s)\n",
            errno, strerror(errno));
        return -1;
    }

    return 0;
}