
KVM Virtual Machine I/O Processing (Part 1): The Guest VM I/O Path

Virtualization technology has three main parts: CPU virtualization, memory virtualization, and device virtualization. This series of articles describes how disk devices are virtualized, following a read I/O request all the way from the Guest VM to the point where it is finally handled. The Linux kernel code referenced in this series is version 3.7.10, the virtualization platform is KVM, and the QEMU version is 1.6.1.

A user program accesses an I/O device through the interface the operating system provides, namely system calls. When the program issues a read, the arguments of the read are set up first, and then the int 0x80 instruction (or sysenter/syscall, depending on the platform) traps into kernel space, where the read logic is implemented by the sys_read function.
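
For example, a minimal user-space program that exercises this path could look like the following (the file name is only an illustration):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char buf[4096];
	int fd = open("/tmp/test.txt", O_RDONLY);	/* hypothetical test file */
	if (fd < 0)
		return 1;

	/* read() traps into the kernel and eventually reaches sys_read() */
	ssize_t n = read(fd, buf, sizeof(buf));
	if (n >= 0)
		printf("read %zd bytes\n", n);

	close(fd);
	return 0;
}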

Before walking through the implementation of sys_read, let's first look at the layers a read operation passes through in the kernel. A read first enters the virtual file system (VFS) layer, then the concrete file system layer, the page cache layer, the generic block layer, the I/O scheduler layer, the block device driver layer, and finally the physical block device layer.
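
Since the figure from the original article is not reproduced here, the same path in text form:

	read() system call
	  -> virtual file system (VFS) layer
	  -> concrete file system layer (e.g. ext4)
	  -> page cache layer
	  -> generic block layer
	  -> I/O scheduler layer
	  -> block device driver layer
	  -> physical block device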

 

 

  • Virtual file system layer: hides the details of the layers below and presents a uniform interface to the layers above, such as vfs_read and vfs_write. These functions do their work by calling into the corresponding interface of the concrete file system underneath.
  • Concrete file system layer: provides the operations and processing logic specific to each type of file system.
  • Page cache layer: caches data fetched from block devices. Its purpose is to avoid frequent block device accesses: if the data for an I/O request is already in the page cache, it can be returned directly without touching the block device.
  • Generic block layer: receives I/O requests from the layers above and eventually issues them downward, hiding the characteristics of the underlying devices from the upper layers.
  • I/O scheduler layer: receives the requests issued by the generic block layer, queues them and tries to merge adjacent requests (when their data is adjacent on disk), then, according to the configured scheduling algorithm, calls back into the request-handling functions provided by the driver layer.
  • Block device driver layer: takes requests from the queue and operates the actual device according to their parameters.
  • Block device layer: the real physical device.

 

With this kernel layering in mind, let's look at how a read operation is implemented in code.

The sys_read function is declared in include/linux/syscalls.h:

asmlinkage long sys_read(unsigned int fd, char __user *buf, size_t count);

and implemented in fs/read_write.c:

SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
{
	struct fd f = fdget(fd);
	ssize_t ret = -EBADF;

	if (f.file) {
		loff_t pos = file_pos_read(f.file);
		ret = vfs_read(f.file, buf, count, &pos); // call into the VFS layer's read operation
		file_pos_write(f.file, pos); // update the current file position
		fdput(f);
	}
	return ret;
}
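
SYSCALL_DEFINE3(read, ...) is simply the macro way of defining sys_read with the signature declared above. The file_pos_read/file_pos_write helpers it uses are thin wrappers around file->f_pos; roughly (a sketch of the helpers in the same file, shown for reference):

static inline loff_t file_pos_read(struct file *file)
{
	return file->f_pos;
}

static inline void file_pos_write(struct file *file, loff_t pos)
{
	file->f_pos = pos;
}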

 

vfs_read belongs to the VFS layer and is defined in fs/read_write.c. Its main job is to call the read operation of the concrete file system; if that file system does not provide one, the kernel's default do_sync_read function is used instead.

ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
{
	ssize_t ret;

	if (!(file->f_mode & FMODE_READ))
		return -EBADF;
	if (!file->f_op || (!file->f_op->read && !file->f_op->aio_read))
		return -EINVAL;
	if (unlikely(!access_ok(VERIFY_WRITE, buf, count)))
		return -EFAULT;

	ret = rw_verify_area(READ, file, pos, count);
	if (ret >= 0) {
		count = ret;
		if (file->f_op->read) {
			ret = file->f_op->read(file, buf, count, pos); // supplied by the concrete file system
		} else
			ret = do_sync_read(file, buf, count, pos);  // the kernel's default read path
		if (ret > 0) {
			fsnotify_access(file);
			add_rchar(current, ret);
		}
		inc_syscr(current);
	}

	return ret;
}

 

file->f_op has type struct file_operations, which defines a set of function pointers for file operations; different file systems point them at different implementations. Taking ext4 as an example, the structure is initialized in fs/ext4/file.c, and from that initialization we can see that ext4's read operation is the kernel's built-in do_sync_read() function:

const struct file_operations ext4_file_operations = {
	.llseek		= ext4_llseek,
	.read		= do_sync_read,
	.write		= do_sync_write,
	.aio_read	= generic_file_aio_read,
	.aio_write	= ext4_file_write,
	.unlocked_ioctl = ext4_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= ext4_compat_ioctl,
#endif
	.mmap		= ext4_file_mmap,
	.open		= ext4_file_open,
	.release	= ext4_release_file,
	.fsync		= ext4_sync_file,
	.splice_read	= generic_file_splice_read,
	.splice_write	= generic_file_splice_write,
	.fallocate	= ext4_fallocate,
};

 

The do_sync_read() function is defined in fs/read_write.c:

ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
{
	struct iovec iov = { .iov_base = buf, .iov_len = len };
	struct kiocb kiocb;
	ssize_t ret;

	init_sync_kiocb(&kiocb, filp); // initialize the kiocb, which records the completion state of the I/O operation
	kiocb.ki_pos = *ppos;
	kiocb.ki_left = len;
	kiocb.ki_nbytes = len;

	for (;;) {
		ret = filp->f_op->aio_read(&kiocb, &iov, 1, kiocb.ki_pos); // the function that actually performs the read; for ext4 it is set in fs/ext4/file.c
		if (ret != -EIOCBRETRY)
			break;
		wait_on_retry_sync_kiocb(&kiocb);
	}

	if (-EIOCBQUEUED == ret)
		ret = wait_on_sync_kiocb(&kiocb);
	*ppos = kiocb.ki_pos;
	return ret;
}
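
Note how do_sync_read wraps the single user buffer in a struct iovec before handing it to ->aio_read, which always works on a vector of buffers. The same structure is what user space passes to readv(2); a small, self-contained illustration (the file name is hypothetical):

#include <fcntl.h>
#include <stdio.h>
#include <sys/uio.h>
#include <unistd.h>

int main(void)
{
	char header[16], body[4080];
	struct iovec iov[2] = {
		{ .iov_base = header, .iov_len = sizeof(header) },
		{ .iov_base = body,   .iov_len = sizeof(body)   },
	};

	int fd = open("/tmp/test.txt", O_RDONLY);	/* hypothetical test file */
	if (fd < 0)
		return 1;

	/* one system call fills both buffers in order */
	ssize_t n = readv(fd, iov, 2);
	printf("readv returned %zd bytes\n", n);

	close(fd);
	return 0;
}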

 

In ext4, the filp->f_op->aio_read pointer points to generic_file_aio_read, defined in mm/filemap.c. This function has two execution paths: if the file was opened with O_DIRECT, the read bypasses the page cache and goes straight to disk; otherwise it calls do_generic_file_read to try to satisfy the request from the page cache.

ssize_t
generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
		unsigned long nr_segs, loff_t pos)
{
	struct file *filp = iocb->ki_filp;
	ssize_t retval;
	unsigned long seg = 0;
	size_t count;
	loff_t *ppos = &iocb->ki_pos;

	count = 0;
	retval = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE);
	if (retval)
		return retval;

	/* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
	if (filp->f_flags & O_DIRECT) {
		loff_t size;
		struct address_space *mapping;
		struct inode *inode;

		mapping = filp->f_mapping;
		inode = mapping->host;
		if (!count)
			goto out; /* skip atime */
		size = i_size_read(inode);
		if (pos < size) {
			retval = filemap_write_and_wait_range(mapping, pos,
					pos + iov_length(iov, nr_segs) - 1);
			if (!retval) {
				retval = mapping->a_ops->direct_IO(READ, iocb,
							iov, pos, nr_segs);
			}
			if (retval > 0) {
				*ppos = pos + retval;
				count -= retval;
			}

			/*
			 * Btrfs can have a short DIO read if we encounter
			 * compressed extents, so if there was an error, or if
			 * we've already read everything we wanted to, or if
			 * there was a short read because we hit EOF, go ahead
			 * and return.  Otherwise fallthrough to buffered io for
			 * the rest of the read.
			 */
			if (retval < 0 || !count || *ppos >= size) {
				file_accessed(filp);
				goto out;
			}
		}
	}

	count = retval;
	for (seg = 0; seg < nr_segs; seg++) {
		read_descriptor_t desc;
		loff_t offset = 0;

		/*
		 * If we did a short DIO read we need to skip the section of the
		 * iov that we've already read data into.
		 */
		if (count) {
			if (count > iov[seg].iov_len) {
				count -= iov[seg].iov_len;
				continue;
			}
			offset = count;
			count = 0;
		}

		desc.written = 0;
		desc.arg.buf = iov[seg].iov_base + offset;
		desc.count = iov[seg].iov_len - offset;
		if (desc.count == 0)
			continue;
		desc.error = 0;
		do_generic_file_read(filp, ppos, &desc, file_read_actor);
		retval += desc.written;
		if (desc.error) {
			retval = retval ?: desc.error;
			break;
		}
		if (desc.count > 0)
			break;
	}
out:
	return retval;
}
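
To see the two paths from user space: opening with O_DIRECT makes read() bypass the page cache (and requires the buffer, offset and length to be suitably aligned), while a plain open() goes through the buffered path described next. A minimal sketch, assuming a 4096-byte alignment and a hypothetical file name (the exact alignment requirement depends on the file system and device):

#define _GNU_SOURCE		/* for O_DIRECT */
#include <fcntl.h>
#include <stdlib.h>
#include <unistd.h>

int main(void)
{
	void *buf;

	/* O_DIRECT generally needs an aligned buffer; 4096 is a common choice */
	if (posix_memalign(&buf, 4096, 4096) != 0)
		return 1;

	int fd = open("/tmp/test.bin", O_RDONLY | O_DIRECT);	/* hypothetical file */
	if (fd < 0)
		return 1;

	/* this read skips the page cache and goes straight to the block layer */
	ssize_t n = read(fd, buf, 4096);
	(void)n;

	close(fd);
	free(buf);
	return 0;
}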

 

do_generic_file_read is defined in mm/filemap.c and works with the page cache layer. If the requested data is already in the page cache and the cached page is up to date, the data is copied directly from the cache and returned; if the page is missing (or not yet up to date), the page cache triggers a read from disk. That read does not simply fetch the blocks holding the requested data: a readahead mechanism reads additional blocks to raise the cache hit rate and reduce the number of disk accesses.
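
The core of that logic, heavily abridged (a sketch of the idea only, not the verbatim kernel code; locking, error handling and most of the readahead logic are omitted, and the variable names only loosely follow the real function):

	/* abridged sketch of the buffered read path in do_generic_file_read() */
	page = find_get_page(mapping, index);		/* look in the page cache */
	if (!page) {
		/* trigger readahead, which usually pulls this page in as well */
		page_cache_sync_readahead(mapping, ra, filp, index, req_size);
		page = find_get_page(mapping, index);
	}
	if (page && PageUptodate(page)) {
		/* hit: copy the cached data to the user buffer (file_read_actor) */
	} else {
		/* miss: allocate a page and ask the file system to fill it from disk */
		error = mapping->a_ops->readpage(filp, page);
	}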

 

Within the page cache layer, the functions that actually read from disk are the readpage family. Their concrete implementations are defined in fs/ext4/inode.c, which contains several struct address_space_operations objects, one per journalling mode. We follow Linux's default ordered journalling mode to describe the I/O flow; its readpage functions are shown below.

static const struct address_space_operations ext4_ordered_aops = {
	.readpage		= ext4_readpage,
	.readpages		= ext4_readpages,
	.writepage		= ext4_writepage,
	.write_begin		= ext4_write_begin,
	.write_end		= ext4_ordered_write_end,
	.bmap			= ext4_bmap,
	.invalidatepage		= ext4_invalidatepage,
	.releasepage		= ext4_releasepage,
	.direct_IO		= ext4_direct_IO,
	.migratepage		= buffer_migrate_page,
	.is_partially_uptodate  = block_is_partially_uptodate,
	.error_remove_page	= generic_error_remove_page,
};
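
These operations are attached to the inode's mapping when the inode is set up: roughly speaking (a sketch, not the verbatim code), ext4_set_aops() in the same file picks the object that matches the journalling mode and installs it, e.g.

	inode->i_mapping->a_ops = &ext4_ordered_aops;

after which the page cache reaches ext4 through mapping->a_ops->readpage().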

 

To keep the flow simple, we take the simplest of these, ext4_readpage, implemented in fs/ext4/inode.c. The function is trivial: it just calls mpage_readpage. mpage_readpage, in fs/mpage.c, builds an I/O request and submits it to the generic block layer.

int mpage_readpage(struct page *page, get_block_t get_block)
{
	struct bio *bio = NULL;
	sector_t last_block_in_bio = 0;
	struct buffer_head map_bh;
	unsigned long first_logical_block = 0;

	map_bh.b_state = 0;
	map_bh.b_size = 0;
	bio = do_mpage_readpage(bio, page, 1, &last_block_in_bio,
			&map_bh, &first_logical_block, get_block);
	if (bio)
		mpage_bio_submit(READ, bio);
	return 0;
}
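
The request built here is a struct bio. A heavily simplified sketch of what assembling and submitting one looks like on a 3.x kernel (bdev, first_sector, page and read_done are placeholders; this is an illustration, not the actual mpage code):

	struct bio *bio = bio_alloc(GFP_KERNEL, 1);	/* room for one segment */

	bio->bi_bdev   = bdev;				/* target block device */
	bio->bi_sector = first_sector;			/* starting sector on that device */
	bio_add_page(bio, page, PAGE_SIZE, 0);		/* the page the data should land in */
	bio->bi_end_io = read_done;			/* completion callback */

	submit_bio(READ, bio);				/* hand the request to the generic block layer */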

 

 

The generic block layer dispatches the request to the I/O queue of the target device, and the I/O scheduler eventually calls the driver's interface to fetch the requested data.
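
Which scheduler serves a given queue can be inspected (or changed) through sysfs. For example, a small user-space snippet that prints the scheduler of a hypothetical device sda (the active one is shown in brackets, e.g. "noop deadline [cfq]"):

#include <stdio.h>

int main(void)
{
	char line[256];
	FILE *f = fopen("/sys/block/sda/queue/scheduler", "r");	/* "sda" is an example device */

	if (!f)
		return 1;
	if (fgets(line, sizeof(line), f))
		printf("%s", line);
	fclose(f);
	return 0;
}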

 

This completes the I/O path inside the Guest VM. The following articles will describe how an I/O operation traps from the Guest VM into KVM and how QEMU emulates the I/O device.

 

 

