
A Detailed Look at the NVMe Driver, Part 1

Following the usual routine, when analyzing a driver we first look at its Kconfig and Makefile to identify the relevant source files.

When starting to read a driver, we usually begin from its module_init (or equivalent initcall) function.
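For reference, here is a minimal, self-contained sketch of that pattern (hypothetical demo names, not from the nvme driver): the function registered with module_init() runs when the module loads, and the module_exit() counterpart runs on unload.

#include <linux/module.h>
#include <linux/init.h>

/* Hypothetical minimal module: the function registered with module_init()
 * runs at insmod time (or during boot when built in). */
static int __init demo_init(void)
{
	pr_info("demo: loaded\n");
	return 0;	/* returning non-zero aborts the module load */
}

static void __exit demo_exit(void)
{
	pr_info("demo: unloaded\n");
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");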

Now let's begin the NVMe journey.

First, open the Kconfig file under drivers/block, which defines the BLK_DEV_NVME config option, shown below.

config BLK_DEV_NVME
	tristate "NVM Express block device"
	depends on PCI
	---help---
	  The NVM Express driver is for solid state drives directly
	  connected to the PCI or PCI Express bus.  If you know you
	  don't have one of these, it is safe to answer N.

	  To compile this driver as a module, choose M here: the
	  module will be called nvme.

From a console, run make menuconfig and search for BLK_DEV_NVME to get the following dependency information.

Symbol: BLK_DEV_NVME [=m]                                                                      

  | Type : tristate                                                                                

  | Prompt: NVM Express block device                                                                

  |  Location:                                                                                     

  |    -> Device Drivers                                                                            

  | (1)  -> Block devices (BLK_DEV [=y])                                                           

  |  Defined at drivers/block/Kconfig:313                                                          

  |   Depends on: BLK_DEV [=y] && PCI [=y]

As you can see, nvme depends on BLK_DEV and PCI.

Open drivers/block/Makefile and search for NVME; you will see:

obj-$(CONFIG_BLK_DEV_NVME)    += nvme.o

nvme-y              := nvme-core.o nvme-scsi.o

As for the BLK-related files, open block/Makefile:

obj-$(CONFIG_BLOCK) := bio.o elevator.o blk-core.o blk-tag.o blk-sysfs.o \
			blk-flush.o blk-settings.o blk-ioc.o blk-map.o \
			blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \
			blk-iopoll.o blk-lib.o blk-mq.o blk-mq-tag.o \
			blk-mq-sysfs.o blk-mq-cpu.o blk-mq-cpumap.o ioctl.o \
			genhd.o scsi_ioctl.o partition-generic.o ioprio.o \
			partitions/

Wow, that's a lot, isn't it? Don't worry: NVMe only uses a handful of block-layer functions, so there is no need to read every BLOCK-related file, unless you have the energy to study them all.

Good. At this point we know which files to read: nvme-core.c and nvme-scsi.c are required, and for the rest we will study whichever block-layer functions our driver actually calls.

Open nvme-core.c and look at the entry point: module_init(nvme_init);

static int __init nvme_init(void)
{
	int result;

	init_waitqueue_head(&nvme_kthread_wait);	/* create the wait queue for the nvme kthread */

	nvme_workq = create_singlethread_workqueue("nvme");	/* create the workqueue */
	if (!nvme_workq)
		return -ENOMEM;

	result = register_blkdev(nvme_major, "nvme");	/* register the block device */
	if (result < 0)
		goto kill_workq;
	else if (result > 0)
		nvme_major = result;

	result = pci_register_driver(&nvme_driver);	/* register the PCI driver */
	if (result)
		goto unregister_blkdev;
	return 0;

 unregister_blkdev:
	unregister_blkdev(nvme_major, "nvme");
 kill_workq:
	destroy_workqueue(nvme_workq);
	return result;
}
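One detail worth noting: if nvme_major is 0, register_blkdev() asks the kernel to pick a free major number and returns it as a positive value, which is why the result > 0 branch stores it back into nvme_major. A minimal sketch of that pattern (demo names are hypothetical):

#include <linux/fs.h>	/* register_blkdev(); header location varies by kernel version */

static int demo_major;	/* 0 means: let the kernel choose */

static int demo_register(void)
{
	int ret = register_blkdev(demo_major, "demo");

	if (ret < 0)
		return ret;		/* registration failed */
	if (ret > 0)
		demo_major = ret;	/* a dynamic major was assigned */
	return 0;
}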

After the PCI driver is registered, the probe function in nvme_driver is called. The beginning always looks rosy, and the function is admirably concise; don't celebrate too soon, though, because the painful part is closing in.

static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
{
	int node, result = -ENOMEM;
	struct nvme_dev *dev;

	node = dev_to_node(&pdev->dev);	/* get the device's NUMA node */
	if (node == NUMA_NO_NODE)
		set_dev_node(&pdev->dev, 0);

	dev = kzalloc_node(sizeof(*dev), GFP_KERNEL, node);
	if (!dev)
		return -ENOMEM;
	dev->entry = kzalloc_node(num_possible_cpus() * sizeof(*dev->entry),	/* allocate the MSI-X entry array */
							GFP_KERNEL, node);
	if (!dev->entry)
		goto free;
	dev->queues = kzalloc_node((num_possible_cpus() + 1) * sizeof(void *),	/* allocate the queue pointers; */
							GFP_KERNEL, node);	/* the +1 is for the admin queue */
	if (!dev->queues)
		goto free;

	INIT_LIST_HEAD(&dev->namespaces);	/* initialize the namespaces list */
	dev->reset_workfn = nvme_reset_failed_dev;
	INIT_WORK(&dev->reset_work, nvme_reset_workfn);
	dev->pci_dev = pci_dev_get(pdev);
	pci_set_drvdata(pdev, dev);
	result = nvme_set_instance(dev);	/* assign the instance id that identifies this device */
	if (result)
		goto put_pci;

	result = nvme_setup_prp_pools(dev);	/* set up the PRP memory pools needed for DMA */
	if (result)
		goto release;

	kref_init(&dev->kref);
	result = nvme_dev_start(dev);	/* create the admin queue and I/O queues, request the IRQs */
	if (result)
		goto release_pools;

	if (dev->online_queues > 1)
		result = nvme_dev_add(dev);	/* set up blk-mq and add a usable nvme device; the admin queue can now send commands */
	if (result)
		goto shutdown;

	scnprintf(dev->name, sizeof(dev->name), "nvme%d", dev->instance);
	dev->miscdev.minor = MISC_DYNAMIC_MINOR;
	dev->miscdev.parent = &pdev->dev;
	dev->miscdev.name = dev->name;
	dev->miscdev.fops = &nvme_dev_fops;
	result = misc_register(&dev->miscdev);	/* register a misc device */
	if (result)
		goto remove;

	nvme_set_irq_hints(dev);

	dev->initialized = 1;
	return 0;

 remove:
	nvme_dev_remove(dev);
	nvme_dev_remove_admin(dev);
	nvme_free_namespaces(dev);
 shutdown:
	nvme_dev_shutdown(dev);
 release_pools:
	nvme_free_queues(dev, 0);
	nvme_release_prp_pools(dev);
 release:
	nvme_release_instance(dev);
 put_pci:
	pci_dev_put(dev->pci_dev);
 free:
	kfree(dev->queues);
	kfree(dev->entry);
	kfree(dev);
	return result;
}

Each of the major functions above carries a short comment describing the work it does; now let's look at how those functions are implemented.

static int nvme_set_instance(struct nvme_dev *dev)
{
	int instance, error;

	do {
		if (!ida_pre_get(&nvme_instance_ida, GFP_KERNEL))
			return -ENODEV;

		spin_lock(&dev_list_lock);
		error = ida_get_new(&nvme_instance_ida, &instance);
		spin_unlock(&dev_list_lock);
	} while (error == -EAGAIN);

	if (error)
		return -ENODEV;

	dev->instance = instance;	/* this instance id acts as the device's handle, e.g. the 0 in nvme0 */
	return 0;
}
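Here ida_pre_get()/ida_get_new() is the pre-4.10 IDA API: ida_pre_get() preallocates memory outside the spinlock, and the loop retries while ida_get_new() reports -EAGAIN. For symmetry, here is a sketch of how the matching release path looks in this generation of the driver (a hedged reconstruction, not quoted from the article):

/* Sketch: return the instance id to the ida on teardown, mirroring
 * nvme_release_instance() in this era of the driver. */
static void demo_release_instance(struct nvme_dev *dev)
{
	spin_lock(&dev_list_lock);
	ida_remove(&nvme_instance_ida, dev->instance);
	spin_unlock(&dev_list_lock);
}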

nvme_setup_prp_pools() creates the memory pools used for DMA; prp_page_pool is addressed through kernel virtual addresses, with the matching DMA addresses handed back at allocation time.

static int nvme_setup_prp_pools(struct nvme_dev *dev)
{
	struct device *dmadev = &dev->pci_dev->dev;
	dev->prp_page_pool = dma_pool_create("prp list page", dmadev,
						PAGE_SIZE, PAGE_SIZE, 0);
	if (!dev->prp_page_pool)
		return -ENOMEM;

	/* Optimisation for I/Os between 4k and 128k */
	dev->prp_small_pool = dma_pool_create("prp list 256", dmadev,
						256, 256, 0);
	if (!dev->prp_small_pool) {
		dma_pool_destroy(dev->prp_page_pool);
		return -ENOMEM;
	}
	return 0;
}
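Once the pools exist, PRP list pages are carved out of them with dma_pool_alloc(), which returns a kernel virtual address and fills in the matching DMA address for the device. A minimal usage sketch (demo names are hypothetical):

#include <linux/dmapool.h>

/* Sketch: grab one PRP list page from the pool created above. */
static void *demo_get_prp_page(struct nvme_dev *dev, dma_addr_t *dma)
{
	/* returns the CPU address; *dma receives the bus address
	 * that the controller will dereference */
	return dma_pool_alloc(dev->prp_page_pool, GFP_ATOMIC, dma);
}

/* ...and give it back when the I/O completes: */
static void demo_put_prp_page(struct nvme_dev *dev, void *prp_list,
			      dma_addr_t dma)
{
	dma_pool_free(dev->prp_page_pool, prp_list, dma);
}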

Next comes one of the heavyweight functions: nvme_dev_start.

static int nvme_dev_start(struct nvme_dev *dev)
{
	int result;
	bool start_thread = false;

	result = nvme_dev_map(dev);
	if (result)
		return result;

	result = nvme_configure_admin_queue(dev);	/* set up the admin submission and completion queues, 64 deep */
	if (result)
		goto unmap;

	spin_lock(&dev_list_lock);
	if (list_empty(&dev_list) && IS_ERR_OR_NULL(nvme_thread)) {
		start_thread = true;
		nvme_thread = NULL;
	}
	list_add(&dev->node, &dev_list);
	spin_unlock(&dev_list_lock);

	if (start_thread) {
		nvme_thread = kthread_run(nvme_kthread, NULL, "nvme");
		wake_up_all(&nvme_kthread_wait);
	} else
		wait_event_killable(nvme_kthread_wait, nvme_thread);

	if (IS_ERR_OR_NULL(nvme_thread)) {
		result = nvme_thread ? PTR_ERR(nvme_thread) : -EINTR;
		goto disable;
	}

	nvme_init_queue(dev->queues[0], 0);	/* initialize the admin queue and bump online_queues */

	result = nvme_alloc_admin_tags(dev);
	if (result)
		goto disable;

	result = nvme_setup_io_queues(dev);
	if (result)
		goto free_tags;

	nvme_set_irq_hints(dev);

	return result;

 free_tags:
	nvme_dev_remove_admin(dev);
 disable:
	nvme_disable_queue(dev, 0);
	nvme_dev_list_remove(dev);
 unmap:
	nvme_dev_unmap(dev);
	return result;
}

Let's look first at nvme_configure_admin_queue(dev).

static int nvme_configure_admin_queue(struct nvme_dev *dev)
{
	int result;
	u32 aqa;
	u64 cap = readq(&dev->bar->cap);	/* read the CAP register */
	struct nvme_queue *nvmeq;
	unsigned page_shift = PAGE_SHIFT;
	unsigned dev_page_min = NVME_CAP_MPSMIN(cap) + 12;
	unsigned dev_page_max = NVME_CAP_MPSMAX(cap) + 12;

	if (page_shift < dev_page_min) {
		dev_err(&dev->pci_dev->dev,
				"Minimum device page size (%u) too large for "
				"host (%u)\n", 1 << dev_page_min,
				1 << page_shift);
		return -ENODEV;
	}
	if (page_shift > dev_page_max) {
		dev_info(&dev->pci_dev->dev,
				"Device maximum page size (%u) smaller than "
				"host (%u); enabling work-around\n",
				1 << dev_page_max, 1 << page_shift);
		page_shift = dev_page_max;
	}

	result = nvme_disable_ctrl(dev, cap);	/* disable the controller */
	if (result < 0)
		return result;

	nvmeq = dev->queues[0];
	if (!nvmeq) {
		nvmeq = nvme_alloc_queue(dev, 0, NVME_AQ_DEPTH);	/* no admin queue exists yet, so allocate one */
		if (!nvmeq)
			return -ENOMEM;
	}

	aqa = nvmeq->q_depth - 1;
	aqa |= aqa << 16;

	dev->page_size = 1 << page_shift;

	dev->ctrl_config = NVME_CC_CSS_NVM;
	dev->ctrl_config |= (page_shift - 12) << NVME_CC_MPS_SHIFT;
	dev->ctrl_config |= NVME_CC_ARB_RR | NVME_CC_SHN_NONE;
	dev->ctrl_config |= NVME_CC_IOSQES | NVME_CC_IOCQES;

	writel(aqa, &dev->bar->aqa);
	writeq(nvmeq->sq_dma_addr, &dev->bar->asq);
	writeq(nvmeq->cq_dma_addr, &dev->bar->acq);	/* program the admin SQ and CQ base addresses into the controller */

	result = nvme_enable_ctrl(dev, cap);
	if (result)
		goto free_nvmeq;

	nvmeq->cq_vector = 0;
	result = queue_request_irq(dev, nvmeq, nvmeq->irqname);	/* request the admin queue interrupt */
	if (result)
		goto free_nvmeq;

	return result;

 free_nvmeq:
	nvme_free_queues(dev, 0);
	return result;
}
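It helps to plug in concrete numbers for the register writes above. With NVME_AQ_DEPTH = 64 and a 4 KiB host page (page_shift = 12), a worked sketch of the values, using the constant definitions from this kernel's nvme.h:

u32 aqa = 64 - 1;	/* ASQS = 63: 0-based admin SQ size, bits 11:0 */
aqa |= aqa << 16;	/* ACQS = 63 in bits 27:16 -> aqa = 0x003f003f */

u32 cc = NVME_CC_CSS_NVM;		/* select the NVM command set */
cc |= (12 - 12) << NVME_CC_MPS_SHIFT;	/* MPS = 0 -> 2^(12+0) = 4 KiB pages */
cc |= NVME_CC_ARB_RR | NVME_CC_SHN_NONE; /* round-robin arbitration, no shutdown */
cc |= NVME_CC_IOSQES | NVME_CC_IOCQES;	/* 2^6 = 64-byte SQ entries, 2^4 = 16-byte CQ entries */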

Next, let's see what happens inside nvme_alloc_queue().

static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid,
							int depth)
{
	struct device *dmadev = &dev->pci_dev->dev;
	struct nvme_queue *nvmeq = kzalloc(sizeof(*nvmeq), GFP_KERNEL);
	if (!nvmeq)
		return NULL;

	nvmeq->cqes = dma_zalloc_coherent(dmadev, CQ_SIZE(depth),
					&nvmeq->cq_dma_addr, GFP_KERNEL);	/* allocate room for depth completion queue entries */
	if (!nvmeq->cqes)
		goto free_nvmeq;

	nvmeq->sq_cmds = dma_alloc_coherent(dmadev, SQ_SIZE(depth),
					&nvmeq->sq_dma_addr, GFP_KERNEL);	/* allocate room for depth submission queue commands */
	if (!nvmeq->sq_cmds)
		goto free_cqdma;

	nvmeq->q_dmadev = dmadev;
	nvmeq->dev = dev;
	snprintf(nvmeq->irqname, sizeof(nvmeq->irqname), "nvme%dq%d",
			dev->instance, qid);	/* set the queue's irqname */
	spin_lock_init(&nvmeq->q_lock);
	nvmeq->cq_head = 0;
	nvmeq->cq_phase = 1;
	nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride];
	nvmeq->q_depth = depth;
	nvmeq->qid = qid;
	dev->queue_count++;
	dev->queues[qid] = nvmeq;	/* store the new queue at slot qid */

	return nvmeq;	/* return the freshly allocated queue */

 free_cqdma:
	dma_free_coherent(dmadev, CQ_SIZE(depth), (void *)nvmeq->cqes,
							nvmeq->cq_dma_addr);
 free_nvmeq:
	kfree(nvmeq);
	return NULL;
}
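The q_db assignment deserves a closer look. Per the NVMe spec, the doorbell registers start at BAR offset 0x1000: queue y's submission tail doorbell sits at 0x1000 + (2y) * (4 << CAP.DSTRD), with its completion head doorbell right after. dev->dbs points at offset 0x1000 and db_stride derives from CAP.DSTRD, so dbs[qid * 2 * db_stride] lands exactly on queue qid's SQ tail doorbell. A small sketch of the arithmetic:

/* Doorbell offsets per the NVMe spec. With DSTRD = 0 (4-byte stride):
 *   SQ0 tail = 0x1000, CQ0 head = 0x1004, SQ1 tail = 0x1008, ... */
static u32 sq_tail_doorbell_off(u32 qid, u32 dstrd)
{
	return 0x1000 + (2 * qid) * (4 << dstrd);
}

static u32 cq_head_doorbell_off(u32 qid, u32 dstrd)
{
	return 0x1000 + (2 * qid + 1) * (4 << dstrd);
}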

At this point we have created the admin queue's completion and submission queues and registered its interrupt. The next statement in nvme_dev_start() spawns the nvme_kthread daemon (only the first device starts it; later devices wait on nvme_kthread_wait), which we will come back to later. First, let's look at the following function.

static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid)
{
	struct nvme_dev *dev = nvmeq->dev;

	spin_lock_irq(&nvmeq->q_lock);
	nvmeq->sq_tail = 0;	/* finish initializing the queue's state */
	nvmeq->cq_head = 0;
	nvmeq->cq_phase = 1;
	nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride];
	memset((void *)nvmeq->cqes, 0, CQ_SIZE(nvmeq->q_depth));
	dev->online_queues++;	/* one more queue is now online */
	spin_unlock_irq(&nvmeq->q_lock);
}

 

The following function is the core of NVMe's use of blk-mq, the multi-queue block layer.

static int nvme_alloc_admin_tags(struct nvme_dev *dev)
{
	if (!dev->admin_q) {	/* admin_q starts out NULL, so we take this branch */
		dev->admin_tagset.ops = &nvme_mq_admin_ops;	/* fill in the blk_mq_tag_set; nvme_mq_admin_ops supplies queue_rq etc. for running requests */
		dev->admin_tagset.nr_hw_queues = 1;	/* a single hardware queue */
		dev->admin_tagset.queue_depth = NVME_AQ_DEPTH - 1;
		dev->admin_tagset.timeout = ADMIN_TIMEOUT;
		dev->admin_tagset.numa_node = dev_to_node(&dev->pci_dev->dev);
		dev->admin_tagset.cmd_size = sizeof(struct nvme_cmd_info);
		dev->admin_tagset.driver_data = dev;

		if (blk_mq_alloc_tag_set(&dev->admin_tagset))	/* allocate a tag set to be associated with one or more request queues */
			return -ENOMEM;

		dev->admin_q = blk_mq_init_queue(&dev->admin_tagset);	/* initialize the request_queue */
		if (IS_ERR(dev->admin_q)) {
			blk_mq_free_tag_set(&dev->admin_tagset);
			return -ENOMEM;
		}
		if (!blk_get_queue(dev->admin_q)) {
			nvme_dev_remove_admin(dev);
			return -ENODEV;
		}
	} else
		blk_mq_unfreeze_queue(dev->admin_q);

	return 0;
}
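The sequence above (fill in a blk_mq_tag_set, call blk_mq_alloc_tag_set(), then blk_mq_init_queue()) is the standard bring-up for any blk-mq driver of this era, not something NVMe-specific. A condensed sketch of the same pattern with hypothetical demo_* names:

#include <linux/blk-mq.h>
#include <linux/err.h>

struct demo_cmd { int dummy; };	/* per-request driver payload */

struct demo_dev {
	struct blk_mq_tag_set tagset;
	struct request_queue *queue;
};

extern struct blk_mq_ops demo_mq_ops;	/* must supply .queue_rq and .map_queue */

static int demo_init_mq(struct demo_dev *dev)
{
	dev->tagset.ops = &demo_mq_ops;
	dev->tagset.nr_hw_queues = 1;
	dev->tagset.queue_depth = 64;
	dev->tagset.numa_node = NUMA_NO_NODE;
	dev->tagset.cmd_size = sizeof(struct demo_cmd);	/* appended to each struct request */
	dev->tagset.driver_data = dev;

	if (blk_mq_alloc_tag_set(&dev->tagset))	/* allocates tags and preallocates requests */
		return -ENOMEM;

	dev->queue = blk_mq_init_queue(&dev->tagset);	/* request_queue on top of the tag set */
	if (IS_ERR(dev->queue)) {
		blk_mq_free_tag_set(&dev->tagset);
		return PTR_ERR(dev->queue);
	}
	return 0;
}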

Below, the blk_mq functions involved are introduced one by one.

First, picture the blk-mq scheduling model. (The original post shows a diagram here, which is not reproduced.)

The work done by blk_mq_alloc_tag_set(&dev->admin_tagset) can likewise be summarized by a diagram in the original post (also not reproduced); the code follows.

/*
 * Alloc a tag set to be associated with one or more request queues.
 * May fail with EINVAL for various error conditions. May adjust the
 * requested depth down, if it is too large. In that case, the set
 * value will be stored in set->queue_depth.
 */

int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
{
	BUILD_BUG_ON(BLK_MQ_MAX_DEPTH > 1 << BLK_MQ_UNIQUE_TAG_BITS);

	if (!set->nr_hw_queues)
		return -EINVAL;
	if (!set->queue_depth)
		return -EINVAL;
	if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN)
		return -EINVAL;

	if (!set->nr_hw_queues || !set->ops->queue_rq || !set->ops->map_queue)
		return -EINVAL;

	if (set->queue_depth > BLK_MQ_MAX_DEPTH) {
		pr_info("blk-mq: reduced tag depth to %u\n",
			BLK_MQ_MAX_DEPTH);
		set->queue_depth = BLK_MQ_MAX_DEPTH;
	}

	/*
	 * If a crashdump is active, then we are potentially in a very
	 * memory constrained environment. Limit us to 1 queue and
	 * 64 tags to prevent using too much memory.
	 */
	if (is_kdump_kernel()) {
		set->nr_hw_queues = 1;
		set->queue_depth = min(64U, set->queue_depth);
	}

	set->tags = kmalloc_node(set->nr_hw_queues *	/* allocate one tags pointer per hardware queue */
				 sizeof(struct blk_mq_tags *),
				 GFP_KERNEL, set->numa_node);
	if (!set->tags)
		return -ENOMEM;

	if (blk_mq_alloc_rq_maps(set))
		goto enomem;

	mutex_init(&set->tag_list_lock);
	INIT_LIST_HEAD(&set->tag_list);

	return 0;
enomem:
	kfree(set->tags);
	set->tags = NULL;
	return -ENOMEM;
}

/*
 * Allocate the request maps associated with this tag_set. Note that this
 * may reduce the depth asked for, if memory is tight. set->queue_depth
 * will be updated to reflect the allocated depth.
 */

static int blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
{
	unsigned int depth;
	int err;

	depth = set->queue_depth;
	do {
		err = __blk_mq_alloc_rq_maps(set);	/* on success, break out: set->tags[i] and friends are initialized; otherwise halve queue_depth and retry */
		if (!err)
			break;

		set->queue_depth >>= 1;
		if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN) {
			err = -ENOMEM;
			break;
		}
	} while (set->queue_depth);

	if (!set->queue_depth || err) {
		pr_err("blk-mq: failed to allocate request map\n");
		return -ENOMEM;
	}

	if (depth != set->queue_depth)
		pr_info("blk-mq: reduced tag depth (%u -> %u)\n",
						depth, set->queue_depth);

	return 0;
}

static int __blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
{
	int i;

	for (i = 0; i < set->nr_hw_queues; i++) {	/* loop once per hardware queue */
		set->tags[i] = blk_mq_init_rq_map(set, i);	/* initialize tags[i] */
		if (!set->tags[i])
			goto out_unwind;
	}

	return 0;

out_unwind:
	while (--i >= 0)
		blk_mq_free_rq_map(set, set->tags[i], i);

	return -ENOMEM;
}

static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set,
		unsigned int hctx_idx)
{
	struct blk_mq_tags *tags;
	unsigned int i, j, entries_per_page, max_order = 4;
	size_t rq_size, left;

	tags = blk_mq_init_tags(set->queue_depth, set->reserved_tags,
				set->numa_node);	/* initialize the tags */
	if (!tags)
		return NULL;

	INIT_LIST_HEAD(&tags->page_list);

	tags->rqs = kzalloc_node(set->queue_depth * sizeof(struct request *),
				 GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY,
				 set->numa_node);	/* allocate the request pointer array: queue_depth entries per queue */
	if (!tags->rqs) {
		blk_mq_free_tags(tags);
		return NULL;
	}

	/*
	 * rq_size is the size of the request plus driver payload, rounded
	 * to the cacheline size
	 */
	rq_size = round_up(sizeof(struct request) + set->cmd_size,
				cache_line_size());	/* each request is a struct request plus the driver's cmd_size payload */
	left = rq_size * set->queue_depth;

	for (i = 0; i < set->queue_depth; ) {
		int this_order = max_order;
		struct page *page;
		int to_do;
		void *p;

		while (left < order_to_size(this_order - 1) && this_order)
			this_order--;

		do {
			page = alloc_pages_node(set->numa_node,
				GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY,
				this_order);
			if (page)
				break;
			if (!this_order--)
				break;
			if (order_to_size(this_order) < rq_size)
				break;
		} while (1);

		if (!page)
			goto fail;

		page->private = this_order;
		list_add_tail(&page->lru, &tags->page_list);

		p = page_address(page);
		entries_per_page = order_to_size(this_order) / rq_size;
		to_do = min(entries_per_page, set->queue_depth - i);
		left -= to_do * rq_size;
		for (j = 0; j < to_do; j++) {
			tags->rqs[i] = p;
			tags->rqs[i]->atomic_flags = 0;
			tags->rqs[i]->cmd_flags = 0;
			if (set->ops->init_request) {
				if (set->ops->init_request(set->driver_data,
						tags->rqs[i], hctx_idx, i,
						set->numa_node)) {	/* let the driver's init_request hook initialize each request */
					tags->rqs[i] = NULL;
					goto fail;
				}
			}
			p += rq_size;
			i++;
		}
	}

	return tags;

fail:
	blk_mq_free_rq_map(set, tags, hctx_idx);
	return NULL;
}
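To make the chunked allocation concrete, here is a worked example with illustrative numbers (the real rq_size depends on the kernel config and the driver's cmd_size):

/* Worked example (PAGE_SIZE = 4096): say rq_size rounds up to 512 bytes
 * and queue_depth = 64.
 *   left = 512 * 64 = 32768 bytes still needed.
 *   this_order starts at 4; order_to_size(3) = 32768 is not greater than
 *   left, so this_order stays 4 and one order-4 chunk (64 KiB) is allocated.
 *   entries_per_page = 65536 / 512 = 128, to_do = min(128, 64) = 64,
 *   so all 64 requests are carved out of that single chunk. */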

struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags,
				     unsigned int reserved_tags, int node)
{
	struct blk_mq_tags *tags;

	if (total_tags > BLK_MQ_TAG_MAX) {
		pr_err("blk-mq: tag depth too large\n");
		return NULL;
	}

	tags = kzalloc_node(sizeof(*tags), GFP_KERNEL, node);	/* allocate the tags structure */
	if (!tags)
		return NULL;

	tags->nr_tags = total_tags;
	tags->nr_reserved_tags = reserved_tags;

	return blk_mq_init_bitmap_tags(tags, node);	/* initialize the bitmap tags */
}

static struct blk_mq_tags *blk_mq_init_bitmap_tags(struct blk_mq_tags *tags,
						   int node)
{
	unsigned int depth = tags->nr_tags - tags->nr_reserved_tags;	/* depth = total tags minus the reserved ones */

	if (bt_alloc(&tags->bitmap_tags, depth, node, false))	/* set up bitmap_tags */
		goto enomem;
	if (bt_alloc(&tags->breserved_tags, tags->nr_reserved_tags, node, true))	/* set up breserved_tags */
		goto enomem;

	return tags;
enomem:
	bt_free(&tags->bitmap_tags);
	kfree(tags);
	return NULL;
}

static int bt_alloc(struct blk_mq_bitmap_tags *bt, unsigned int depth,
		    int node, bool reserved)
{
	int i;

	bt->bits_per_word = ilog2(BITS_PER_LONG);	/* BITS_PER_LONG is 64, so bits_per_word starts at 6 */

	/*
	 * Depth can be zero for reserved tags, that's not a failure
	 * condition.
	 */
	if (depth) {	/* depth is 64 here */
		unsigned int nr, tags_per_word;

		tags_per_word = (1 << bt->bits_per_word);

		/*
		 * If the tag space is small, shrink the number of tags
		 * per word so we spread over a few cachelines, at least.
		 * If less than 4 tags, just forget about it, it's not
		 * going to work optimally anyway.
		 */
		if (depth >= 4) {
			while (tags_per_word * 4 > depth) {
				bt->bits_per_word--;
				tags_per_word = (1 << bt->bits_per_word);
			}
		}

		nr = ALIGN(depth, tags_per_word) / tags_per_word;	/* round depth up to a multiple of tags_per_word; each word tracks tags_per_word tags, so nr words cover all depth tags */
		bt->map = kzalloc_node(nr * sizeof(struct blk_align_bitmap),
						GFP_KERNEL, node);	/* allocate the nr map words that record the tags */
		if (!bt->map)
			return -ENOMEM;

		bt->map_nr = nr;
	}

	bt->bs = kzalloc(BT_WAIT_QUEUES * sizeof(*bt->bs), GFP_KERNEL);
	if (!bt->bs) {
		kfree(bt->map);
		return -ENOMEM;
	}

	bt_update_count(bt, depth);	/* distribute depth across the map words */

	for (i = 0; i < BT_WAIT_QUEUES; i++) {
		init_waitqueue_head(&bt->bs[i].wait);
		atomic_set(&bt->bs[i].wait_cnt, bt->wake_cnt);
	}

	return 0;
}
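Tracing bt_alloc() with the depth = 64 example above, on a 64-bit kernel:

/* Worked example for depth = 64, BITS_PER_LONG = 64:
 *   bits_per_word = ilog2(64) = 6, i.e. tags_per_word = 64.
 *   The shrink loop runs while tags_per_word * 4 > depth:
 *     64 * 4 = 256 > 64 -> bits_per_word = 5 (tags_per_word = 32)
 *     32 * 4 = 128 > 64 -> bits_per_word = 4 (tags_per_word = 16)
 *     16 * 4 =  64 > 64 is false -> stop
 *   nr = ALIGN(64, 16) / 16 = 4, so the tag space is spread over four
 *   cacheline-aligned words tracking 16 tags each. */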

Next, blk_mq_init_queue(), which initializes the request queue:

struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
{
	struct blk_mq_hw_ctx **hctxs;
	struct blk_mq_ctx __percpu *ctx;
	struct request_queue *q;
	unsigned int *map;
	int i;

	ctx = alloc_percpu(struct blk_mq_ctx);	/* allocate the per-cpu software contexts */
	if (!ctx)
		return ERR_PTR(-ENOMEM);

	hctxs = kmalloc_node(set->nr_hw_queues * sizeof(*hctxs), GFP_KERNEL,	/* allocate nr_hw_queues hardware-context pointers */
			set->numa_node);
	if (!hctxs)
		goto err_percpu;

	map = blk_mq_make_queue_map(set);	/* build the cpu -> hardware queue map */
	if (!map)
		goto err_map;

	for (i = 0; i < set->nr_hw_queues; i++) {
		int node = blk_mq_hw_queue_to_node(map, i);

		hctxs[i] = kzalloc_node(sizeof(struct blk_mq_hw_ctx),	/* allocate and fill in each hctxs[i] */
					GFP_KERNEL, node);
		if (!hctxs[i])
			goto err_hctxs;

		if (!zalloc_cpumask_var_node(&hctxs[i]->cpumask, GFP_KERNEL,
						node))
			goto err_hctxs;

		atomic_set(&hctxs[i]->nr_active, 0);
		hctxs[i]->numa_node = node;
		hctxs[i]->queue_num = i;
	}

	q = blk_alloc_queue_node(GFP_KERNEL, set->numa_node);	/* allocate and initialize the request_queue itself */
	if (!q)
		goto err_hctxs;

	/*
	 * Init percpu_ref in atomic mode so that it's faster to shutdown.
	 * See blk_register_queue() for details.
	 */
	if (percpu_ref_init(&q->mq_usage_counter, blk_mq_usage_counter_release,
			    PERCPU_REF_INIT_ATOMIC, GFP_KERNEL))
		goto err_map;

	setup_timer(&q->timeout, blk_mq_rq_timer, (unsigned long) q);
	blk_queue_rq_timeout(q, 30000);

	q->nr_queues = nr_cpu_ids;
	q->nr_hw_queues = set->nr_hw_queues;
	q->mq_map = map;

	q->queue_ctx = ctx;
	q->queue_hw_ctx = hctxs;

	q->mq_ops = set->ops;
	q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT;

	if (!(set->flags & BLK_MQ_F_SG_MERGE))
		q->queue_flags |= 1 << QUEUE_FLAG_NO_SG_MERGE;

	q->sg_reserved_size = INT_MAX;

	INIT_WORK(&q->requeue_work, blk_mq_requeue_work);
	INIT_LIST_HEAD(&q->requeue_list);
	spin_lock_init(&q->requeue_lock);

	if (q->nr_hw_queues > 1)
		blk_queue_make_request(q, blk_mq_make_request);	/* install the make_request hook */
	else
		blk_queue_make_request(q, blk_sq_make_request);

	if (set->timeout)
		blk_queue_rq_timeout(q, set->timeout);

	/*
	 * Do this after blk_queue_make_request() overrides it...
	 */
	q->nr_requests = set->queue_depth;

	if (set->ops->complete)
		blk_queue_softirq_done(q, set->ops->complete);

	blk_mq_init_cpu_queues(q, set->nr_hw_queues);	/* initialize the software (per-cpu) queues */

	if (blk_mq_init_hw_queues(q, set))	/* initialize the hardware queues */
		goto err_hw;

	mutex_lock(&all_q_mutex);
	list_add_tail(&q->all_q_node, &all_q_list);
	mutex_unlock(&all_q_mutex);

	blk_mq_add_queue_tag_set(set, q);

	blk_mq_map_swqueue(q);	/* map the software queues onto the hardware queues */

	return q;

err_hw:

         blk_cleanup_queu