1. 程式人生 > >Linux I/O Block--塊裝置的表示

Linux I/O Block--塊裝置的表示

       塊裝置的特點是其平均訪問時間較長,因此為了提高塊裝置的訪問效率,Linux核心用了很多的筆墨來設計和塊裝置相關的部分,這樣一來,從程式碼的角度來看,訪問一個檔案的過程變得尤其的漫長……整個路徑包含的過程基本可以概括為虛擬檔案系統-->塊裝置實際檔案系統-->通用塊層-->I/O scheduler-->塊裝置驅動程式。為了提高塊裝置的訪問效率,核心主要是在兩個方面下功夫:

1.引入快取,當用戶空間要訪問檔案時,核心不可能每次都去訪問塊裝置,核心會將塊裝置的內容讀取到記憶體中,以便下次訪問時可以直接在記憶體中找到相應的內容,這其中又涉及到了預讀等相關的問題,當然這不是現在關注的重點……

2.對於I/O請求的重排列,I/O請求並不會立即被響應,而是會放在一個佇列裡進行一段延遲,以期能夠和後來的I/O請求進行合併或者進行排序。因為像磁碟這樣的塊裝置,其耗時主要是因為磁頭的定位,因此核心會盡量保證磁頭只往一個方向移動,而不是來回移動(可以和電梯的運作進行對比),簡而言之,就是將儲存介質上相鄰的資料請求安排在一起,對於I/O請求的處理主要包括合併和排序,具體如何處理,由I/O scheduler決定。

首先,我們先來了解一個塊裝置是如何表示的。描述塊裝置的資料結構有兩個,一個是struct block_device,用來描述一個塊裝置或者塊裝置的一個分割槽;另一個是struct gendisk,用來描述整個塊裝置的特性。對於一個包含多個分割槽的塊裝置,struct block_device結構有多個,而struct gendisk結構永遠只有一個。

/*
 * Describes one block device as seen by the kernel: either a whole disk or
 * a single partition of a disk.  A disk with several partitions has one
 * block_device per partition plus one for the whole disk, all of which
 * refer to the same struct gendisk through bd_disk.
 */
struct block_device {
	dev_t			bd_dev;  /* not a kdev_t - it's a search key */
	struct inode *		bd_inode;	/* will die */
	struct super_block *	bd_super;
	/* Open count; __blkdev_get() increments it on every successful open. */
	int			bd_openers;
	struct mutex		bd_mutex;	/* open/close mutex */
	/* Inodes that refer to this device; bd_acquire() links them here. */
	struct list_head	bd_inodes;
	/* NOTE(review): presumably the owner recorded by bd_claim() - confirm. */
	void *			bd_holder;
	int			bd_holders;
#ifdef CONFIG_SYSFS
	struct list_head	bd_holder_list;
#endif
	/*
	 * For a partition: the block_device of the whole disk.
	 * For a whole disk: points back at this block_device itself.
	 */
	struct block_device *	bd_contains;
	/* Initialised by bdget() to 1 << bd_inode->i_blkbits. */
	unsigned		bd_block_size;
	/* Partition descriptor obtained from the gendisk via disk_get_part(). */
	struct hd_struct *	bd_part;
	/* number of times partitions within this device have been opened. */
	unsigned		bd_part_count;
	/* Non-zero makes __blkdev_get() call rescan_partitions() on open. */
	int			bd_invalidated;
	/* The gendisk describing the whole disk this device belongs to. */
	struct gendisk *	bd_disk;
	/* Link in the global all_bdevs list (added by bdget()). */
	struct list_head	bd_list;
	/*
	 * Private data.  You must have bd_claim'ed the block_device
	 * to use this.  NOTE:  bd_claim allows an owner to claim
	 * the same device multiple times, the owner must take special
	 * care to not mess up bd_private for that case.
	 */
	unsigned long		bd_private;

	/* The counter of freeze processes */
	int			bd_fsfreeze_count;
	/* Mutex for freeze */
	struct mutex		bd_fsfreeze_mutex;
};

bd_dev:該裝置(分割槽)的裝置號

bd_inode:指向該裝置檔案的inode

bd_openers:一個引用計數,記錄了該塊裝置開啟的次數,或者說有多少個程序打開了該裝置

bd_contains:如果該block_device描述的是一個分割槽,則該變數指向描述主塊裝置的block_device,反之,其指向本身

bd_part:如果該block_device描述的是一個分割槽,則該變數指向分割槽的資訊

bd_part_count:記錄了該裝置上的分割槽被開啟的次數。這個計數由主裝置的block_device維護:每當因為開啟某個分割槽而附帶開啟主裝置時(即__blkdev_get()的for_part不為0),該計數值加1。在進行分割槽的重新掃描前,要保證該計數值為0

bd_disk:指向描述整個裝置的gendisk結構

/*
 * Describes a whole disk.  There is exactly one gendisk per disk no matter
 * how many partitions it has; every block_device of that disk points to it
 * through bd_disk.
 */
struct gendisk {
	/* major, first_minor and minors are input parameters only,
	 * don't use directly.  Use disk_devt() and disk_max_parts().
	 */
	int major;			/* major number of driver */
	int first_minor;		/* first minor number of this disk */
	int minors;                     /* maximum number of minors, =1 for
                                         * disks that can't be partitioned. */

	char disk_name[DISK_NAME_LEN];	/* name of major driver */
	char *(*devnode)(struct gendisk *gd, mode_t *mode);
	/* Array of pointers to partitions indexed by partno.
	 * Protected with matching bdev lock but stat and other
	 * non-critical accesses use RCU.  Always access through
	 * helpers.
	 */
	struct disk_part_tbl *part_tbl;
	/* NOTE(review): presumably the descriptor for partno 0, i.e. the
	 * whole disk - confirm against disk_get_part(). */
	struct hd_struct part0;

	/* Driver operations; __blkdev_get() calls ->open and pins ->owner. */
	const struct block_device_operations *fops;
	/* Request queue on which I/O requests for this disk are placed. */
	struct request_queue *queue;
	void *private_data;

	/* GENHD_FL_* bits; __blkdev_get() tests GENHD_FL_UP. */
	int flags;
	struct device *driverfs_dev;  // FIXME: remove
	struct kobject *slave_dir;

	struct timer_rand_state *random;

	atomic_t sync_io;		/* RAID */
	struct work_struct async_notify;
#ifdef  CONFIG_BLK_DEV_INTEGRITY
	struct blk_integrity *integrity;
#endif
	int node_id;
};

major:塊裝置的主裝置號

first_minor:起始次裝置號

minors:該塊裝置最多可以使用的次裝置號個數,也就決定了該裝置最多能有多少個分割槽(而不是當前實際的分割槽數);如果minors為1,則表示該塊裝置不能分割槽

part_tbl:整個塊裝置的分割槽資訊都包含在裡面,其核心結構是一個struct hd_struct的指標陣列,每一項都指向一個描述分割槽的hd_struct結構

fops:指向特定於裝置的底層操作函式集

queue:塊裝置的請求佇列,所有針對該裝置的請求都會放入該請求佇列中,經過I/O scheduler的處理再進行提交

塊裝置的分割槽資訊由struct hd_struct結構描述,其中最重要的資訊就是分割槽的起始扇區號和分割槽的大小。所有分割槽資訊都一起儲存在gendisk的part_tbl結構中,同時每個分割槽的block_device也可以通過bd_part來查詢對應的分割槽資訊。

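下圖描述了block_device,gendisk以及分割槽描述符之間的關係(塊裝置有兩個分割槽)。若圖片無法顯示,其關係可概括為:主裝置和兩個分割槽各有一個block_device,三者的bd_disk都指向同一個gendisk;兩個分割槽的block_device的bd_contains指向主裝置的block_device,而主裝置的bd_contains指向自身;每個block_device的bd_part指向gendisk的part_tbl中與之對應的hd_struct。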

下面通過開啟一個塊裝置的過程,來理解這些結構之間的聯絡。

對於塊裝置檔案的操作,通過bdev偽檔案系統來完成,open操作定義的函式為blkdev_open()

blkdev_open的主要任務有兩個

1.獲取裝置的block_device資訊

2.從gendisk中讀取相關資訊儲存到block_device,同時建立資料結構之間的聯絡

/*
 * open() handler for block device special files.
 *
 * Translates the relevant open flags into f_mode bits, obtains the
 * block_device behind @inode, opens it through blkdev_get() and, for an
 * exclusive open, claims it on behalf of @filp.
 *
 * Returns 0 on success, -ENOMEM when no block_device could be obtained,
 * or the error reported by blkdev_get() / bd_claim().
 */
static int blkdev_open(struct inode * inode, struct file * filp)
{
	struct block_device *bdev;
	int err;

	/*
	 * Large file access is always granted, even when userspace did not
	 * request it: some mkfs binaries rely on this.  The workaround is a
	 * candidate for removal during an unstable branch.
	 */
	filp->f_flags |= O_LARGEFILE;

	/* Mirror the open flags that the block layer cares about into f_mode. */
	if (filp->f_flags & O_NDELAY)
		filp->f_mode |= FMODE_NDELAY;
	if (filp->f_flags & O_EXCL)
		filp->f_mode |= FMODE_EXCL;
	if ((filp->f_flags & O_ACCMODE) == 3)
		filp->f_mode |= FMODE_WRITE_IOCTL;

	/* Look up (or create) the block_device instance for this inode. */
	bdev = bd_acquire(inode);
	if (bdev == NULL)
		return -ENOMEM;

	filp->f_mapping = bdev->bd_inode->i_mapping;

	/* Pull information from the gendisk and wire the structures together. */
	err = blkdev_get(bdev, filp->f_mode);
	if (err)
		return err;

	/* A non-exclusive open is finished at this point. */
	if (!(filp->f_mode & FMODE_EXCL))
		return 0;

	/* Exclusive open: claim the device, undoing the open if that fails. */
	err = bd_claim(bdev, filp);
	if (err)
		blkdev_put(bdev, filp->f_mode);
	return err;
}

bd_acquire()負責獲取block_device的例項

static struct block_device *bd_acquire(struct inode *inode)
{
	struct block_device *bdev;

	spin_lock(&bdev_lock);
	bdev = inode->i_bdev;//如果這個裝置之前被開啟過則可以直接通過i_bdev獲取
	if (bdev) {
		atomic_inc(&bdev->bd_inode->i_count);
		spin_unlock(&bdev_lock);
		return bdev;
	}
	spin_unlock(&bdev_lock);

	bdev = bdget(inode->i_rdev);//通過裝置號的資訊來獲取block device例項
	if (bdev) {
		spin_lock(&bdev_lock);
		if (!inode->i_bdev) {
			/*
			 * We take an additional bd_inode->i_count for inode,
			 * and it's released in clear_inode() of inode.
			 * So, we can access it via ->i_mapping always
			 * without igrab().
			 */
			atomic_inc(&bdev->bd_inode->i_count);
			inode->i_bdev = bdev;
			inode->i_mapping = bdev->bd_inode->i_mapping;
			list_add(&inode->i_devices, &bdev->bd_inodes);
		}
		spin_unlock(&bdev_lock);
	}
	return bdev;
}
/*
 * Return the block_device for device number @dev, or NULL if no inode
 * could be obtained for it.
 *
 * A block_device lives inside a bdev_inode (inode + block_device) owned by
 * the bdev pseudo filesystem.  When the inode comes back flagged I_NEW, the
 * block_device and inode fields are initialised here and the device is
 * added to the global all_bdevs list.
 */
struct block_device *bdget(dev_t dev)
{
	struct inode *inode;
	struct block_device *bdev;

	/*
	 * Search the inode hash for the inode matching @dev; when there is
	 * none, a bdev_inode is created through the bdev pseudo filesystem.
	 */
	inode = iget5_locked(blockdev_superblock, hash(dev),
			     bdev_test, bdev_set, &dev);
	if (!inode)
		return NULL;

	/* inode -> enclosing bdev_inode -> embedded block_device */
	bdev = &BDEV_I(inode)->bdev;

	/* Already initialised by an earlier lookup: nothing left to do. */
	if (!(inode->i_state & I_NEW))
		return bdev;

	/* Set up the block_device side ... */
	bdev->bd_contains = NULL;
	bdev->bd_inode = inode;
	bdev->bd_block_size = (1 << inode->i_blkbits);
	bdev->bd_part_count = 0;
	bdev->bd_invalidated = 0;

	/* ... and the inode side. */
	inode->i_mode = S_IFBLK;
	inode->i_rdev = dev;
	inode->i_bdev = bdev;
	inode->i_data.a_ops = &def_blk_aops;
	mapping_set_gfp_mask(&inode->i_data, GFP_USER);
	inode->i_data.backing_dev_info = &default_backing_dev_info;

	/* Publish the device on the global list, then release the new inode. */
	spin_lock(&bdev_lock);
	list_add(&bdev->bd_list, &all_bdevs);
	spin_unlock(&bdev_lock);
	unlock_new_inode(inode);

	return bdev;
}




blkdev_get()函式負責從gendisk中獲取資訊,並建立相關資料結構之間的聯絡

/*
 * Open @bdev with access mode @mode.
 *
 * Forwards to __blkdev_get() with for_part == 0, i.e. this open is not
 * being done on behalf of a partition of @bdev.  Returns whatever
 * __blkdev_get() returns: 0 on success or a negative errno.
 */
int blkdev_get(struct block_device *bdev, fmode_t mode)
{
	return __blkdev_get(bdev, mode, 0);
}


注意blkdev_get()傳遞給__blkdev_get()的最後一個引數for_part為0,表示這次呼叫不是因為開啟某個分割槽而附帶開啟主裝置;至於實際開啟的是主裝置還是分割槽,則由get_gendisk()根據bd_dev得到的partno來決定

 獲取到gendisk之後會分四種情況進行處理,也就是針對裝置是不是第一次開啟以及開啟的裝置是主裝置還是分割槽來進行不同的處理,具體見程式碼註釋

/*
 * Open @bdev and connect it to its gendisk.
 *
 * @bdev:     block device to open (whole disk or partition)
 * @mode:     FMODE_* access mode; FMODE_READ/FMODE_WRITE are translated
 *            into MAY_READ/MAY_WRITE for the device cgroup check
 * @for_part: non-zero when this call opens the whole disk on behalf of one
 *            of its partitions (the recursive call below passes 1)
 *
 * Four cases are handled: first open vs. repeated open, each for a whole
 * disk or for a partition.  On success bd_openers is incremented (and
 * bd_part_count too when @for_part is set) and 0 is returned.  On failure
 * a negative errno is returned and bdput() has been called on @bdev.
 */
static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
{
	struct gendisk *disk;
	int ret;
	int partno;
	int perm = 0;

	if (mode & FMODE_READ)
		perm |= MAY_READ;
	if (mode & FMODE_WRITE)
		perm |= MAY_WRITE;
	/*
	 * hooks: /n/, see "layering violations".
	 */
	if (!for_part) {
		ret = devcgroup_inode_permission(bdev->bd_inode, perm);
		if (ret != 0) {
			bdput(bdev);
			return ret;
		}
	}

	lock_kernel();
 restart:

	ret = -ENXIO;
	/*
	 * Look up the gendisk for this device number.  If bd_dev refers to
	 * a partition, partno is updated to the partition number.
	 */
	disk = get_gendisk(bdev->bd_dev, &partno);
	if (!disk)
		goto out_unlock_kernel;

	mutex_lock_nested(&bdev->bd_mutex, for_part);
	if (!bdev->bd_openers) {
		/* First open of this block_device: link it to its gendisk. */
		bdev->bd_disk = disk;
		bdev->bd_contains = bdev;
		if (!partno) {
			/* partno == 0: the whole disk is opened, not a partition. */
			struct backing_dev_info *bdi;

			ret = -ENXIO;
			/* Fetch the hd_struct for partno 0 from the gendisk. */
			bdev->bd_part = disk_get_part(disk, partno);
			if (!bdev->bd_part)
				goto out_clear;

			if (disk->fops->open) {
				/* The driver supplies its own open(); call it. */
				ret = disk->fops->open(bdev, mode);
				if (ret == -ERESTARTSYS) {
					/* Lost a race with 'disk' being
					 * deleted, try again.
					 * See md.c
					 */
					disk_put_part(bdev->bd_part);
					bdev->bd_part = NULL;
					module_put(disk->fops->owner);
					put_disk(disk);
					bdev->bd_disk = NULL;
					mutex_unlock(&bdev->bd_mutex);
					goto restart;
				}
				if (ret)
					goto out_clear;
			}
			if (!bdev->bd_openers) {
				/*
				 * Copy the capacity from the gendisk into the
				 * block_device.  NOTE(review): "<< 9" presumably
				 * converts 512-byte sectors to bytes - confirm.
				 */
				bd_set_size(bdev,(loff_t)get_capacity(disk)<<9);
				bdi = blk_get_backing_dev_info(bdev);
				if (bdi == NULL)
					bdi = &default_backing_dev_info;
				bdev->bd_inode->i_data.backing_dev_info = bdi;
			}
			/*
			 * A partition change on the device invalidated the
			 * in-kernel partition information: rescan it.
			 */
			if (bdev->bd_invalidated)
				rescan_partitions(disk, bdev);
		} else {
			/* A partition is being opened. */
			struct block_device *whole;
			/* Get the block_device of the whole disk ... */
			whole = bdget_disk(disk, 0);
			ret = -ENOMEM;
			if (!whole)
				goto out_clear;
			BUG_ON(for_part);
			/* ... and open it on behalf of this partition. */
			ret = __blkdev_get(whole, mode, 1);
			if (ret)
				goto out_clear;
			/* Point the partition's bd_contains at the whole disk. */
			bdev->bd_contains = whole;
			bdev->bd_inode->i_data.backing_dev_info =
			   whole->bd_inode->i_data.backing_dev_info;
			bdev->bd_part = disk_get_part(disk, partno);
			if (!(disk->flags & GENHD_FL_UP) ||
			    !bdev->bd_part || !bdev->bd_part->nr_sects) {
				ret = -ENXIO;
				goto out_clear;
			}
			bd_set_size(bdev, (loff_t)bdev->bd_part->nr_sects << 9);
		}
	}   else {
		/*
		 * Not the first open: bd_disk is already set, so drop the
		 * references taken by get_gendisk() above.
		 */
		module_put(disk->fops->owner);
		put_disk(disk);
		disk = NULL;
		if (bdev->bd_contains == bdev) {
			/* Whole disk: still give the driver's open() a call. */
			if (bdev->bd_disk->fops->open) {
				ret = bdev->bd_disk->fops->open(bdev, mode);
				if (ret)
					goto out_unlock_bdev;
			}
			if (bdev->bd_invalidated)
				rescan_partitions(bdev->bd_disk, bdev);
		}
	}
	/* Account for this open. */
	bdev->bd_openers++;
	/* Opened on behalf of a partition: count that on the whole disk too. */
	if (for_part)
		bdev->bd_part_count++;
	mutex_unlock(&bdev->bd_mutex);
	unlock_kernel();
	return 0;

	/* Error unwinding: undo the first-open setup in reverse order. */
 out_clear:
	disk_put_part(bdev->bd_part);
	bdev->bd_disk = NULL;
	bdev->bd_part = NULL;
	bdev->bd_inode->i_data.backing_dev_info = &default_backing_dev_info;
	/* bd_contains differs from bdev only once the whole disk was opened. */
	if (bdev != bdev->bd_contains)
		__blkdev_put(bdev->bd_contains, mode, 1);
	bdev->bd_contains = NULL;
 out_unlock_bdev:
	mutex_unlock(&bdev->bd_mutex);
 out_unlock_kernel:
	unlock_kernel();

	/* disk is NULL here if the lookup failed or the refs were dropped. */
	if (disk)
		module_put(disk->fops->owner);
	put_disk(disk);
	bdput(bdev);

	return ret;
}