Linux VFS: the mount System Call
2. The mount(2) call first enters the kernel through sys_mount, whose code lives in <fs/namespace.c>:
/**
 * dev_name: name of a device file holding a filesystem, e.g. /dev/sda
 * dir_name: the mount point directory
 * type: a registered filesystem type
 * flags: mount flags
 * data: filesystem-specific data, may be NULL
 **/
asmlinkage long sys_mount(char __user * dev_name, char __user * dir_name,
			  char __user * type, unsigned long flags,
			  void __user * data)
{
	int retval;
	unsigned long data_page;
	unsigned long type_page;
	unsigned long dev_page;
	char *dir_page;

	/* copy the type string into a kernel page, zero-padded if shorter than a page */
	retval = copy_mount_options(type, &type_page);
	if (retval < 0)
		return retval;

	/* copy the mount-point pathname from user space into a kernel page */
	dir_page = getname(dir_name);
	retval = PTR_ERR(dir_page);
	if (IS_ERR(dir_page))
		goto out1;

	/* copy the device name from user space */
	retval = copy_mount_options(dev_name, &dev_page);
	if (retval < 0)
		goto out2;

	retval = copy_mount_options(data, &data_page);
	if (retval < 0)
		goto out3;

	/* take the big kernel lock */
	lock_kernel();
	/* mount the filesystem */
	retval = do_mount((char *)dev_page, dir_page, (char *)type_page,
			  flags, (void *)data_page);
	unlock_kernel();
	free_page(data_page);

out3:
	free_page(dev_page);
out2:
	putname(dir_page);
out1:
	free_page(type_page);
	return retval;
}
sys_mount first copies the user-supplied device pathname dev_name, filesystem type type, and data into the kernel pages dev_page, type_page and data_page (and the mount-point path via getname), then takes the big kernel lock and hands the real work to do_mount.
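For orientation before diving into do_mount, here is how those five arguments are supplied from user space; a minimal sketch, with an illustrative device and mount point:

#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
	/* dev_name = "/dev/sda1", dir_name = "/mnt", type = "ext3",
	 * flags = 0, data = NULL (no filesystem-specific options) */
	if (mount("/dev/sda1", "/mnt", "ext3", 0, NULL) != 0)
		perror("mount");	/* needs CAP_SYS_ADMIN */
	return 0;
}

Control then reaches do_mount on the kernel side: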
/*
 * Flags is a 32-bit value that allows up to 31 non-fs dependent flags to
 * be given to the mount() call (ie: read-only, no-dev, no-suid etc).
 *
 * data is a (void *) that can point to any structure up to
 * PAGE_SIZE-1 bytes, which can contain arbitrary fs-dependent
 * information (or be NULL).
 *
 * Pre-0.97 versions of mount() didn't have a flags word.
 * When the flags word was introduced its top half was required
 * to have the magic value 0xC0ED, and this remained so until 2.4.0-test9.
 * Therefore, if this magic number is present, it carries no information
 * and must be discarded.
 */
long do_mount(char *dev_name, char *dir_name, char *type_page,
	      unsigned long flags, void *data_page)
{
	struct nameidata nd;
	int retval = 0;
	int mnt_flags = 0;

	/* Discard magic */
	if ((flags & MS_MGC_MSK) == MS_MGC_VAL)
		flags &= ~MS_MGC_MSK;

	/* Basic sanity checks: non-empty, NUL-terminated directory and
	 * device names */
	if (!dir_name || !*dir_name || !memchr(dir_name, 0, PAGE_SIZE))
		return -EINVAL;
	if (dev_name && !memchr(dev_name, 0, PAGE_SIZE))
		return -EINVAL;
	if (data_page)
		((char *)data_page)[PAGE_SIZE - 1] = 0;

	/* Separate the per-mountpoint flags */
	if (flags & MS_NOSUID)
		mnt_flags |= MNT_NOSUID;	/* ignore setuid/setgid bits */
	if (flags & MS_NODEV)
		mnt_flags |= MNT_NODEV;		/* forbid access to device files */
	if (flags & MS_NOEXEC)
		mnt_flags |= MNT_NOEXEC;	/* forbid program execution */
	if (flags & MS_NOATIME)
		mnt_flags |= MNT_NOATIME;	/* do not update file access times */
	if (flags & MS_NODIRATIME)
		mnt_flags |= MNT_NODIRATIME;	/* do not update directory access times */
	if (flags & MS_RELATIME)
		mnt_flags |= MNT_RELATIME;

	/* clear these flags from the flags word */
	flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE |
		   MS_NOATIME | MS_NODIRATIME | MS_RELATIME);

	/* ... and get the mountpoint: resolve the path and store the
	 * mountpoint dentry and vfsmount in the nameidata object */
	retval = path_lookup(dir_name, LOOKUP_FOLLOW, &nd);
	if (retval)
		return retval;

	retval = security_sb_mount(dev_name, &nd, type_page, flags, data_page);
	if (retval)
		goto dput_out;

	/* remount: typically changes the mount flags, e.g. turning a
	 * read-only filesystem writable, without changing the mountpoint */
	if (flags & MS_REMOUNT)
		retval = do_remount(&nd, flags & ~MS_REMOUNT, mnt_flags,
				    data_page);
	/* bind mount: makes part of a mounted tree visible at another
	 * place, so the directory is reachable from both locations */
	else if (flags & MS_BIND)
		retval = do_loopback(&nd, dev_name, flags & MS_REC);
	/* change the type of the mountpoint */
	else if (flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE))
		retval = do_change_type(&nd, flags);
	/* move an already mounted filesystem to a new mountpoint,
	 * i.e. move the mounted tree */
	else if (flags & MS_MOVE)
		retval = do_move_mount(&nd, dev_name);
	else	/* the common case: create a new mount */
		retval = do_new_mount(&nd, type_page, flags, mnt_flags,
				      dev_name, data_page);
dput_out:
	path_release(&nd);
	return retval;
}
do_mount first performs basic sanity checks, including that the directory and device names are non-empty, and then derives the per-mountpoint flags mnt_flags from flags. path_lookup resolves the directory name to its dentry and stores the dentry and the mount it belongs to in the nameidata structure. Based on the flags, do_mount then dispatches to one of several handlers, such as remounting an existing filesystem (do_remount). The case we care about is do_new_mount, which mounts a new filesystem; a userspace view of the dispatch comes first, and then we trace into it.
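Each branch of that dispatch corresponds to a familiar mount(2) usage. The calls below are an illustrative sketch (all paths are made up):

#include <sys/mount.h>

void dispatch_examples(void)
{
	/* MS_REMOUNT -> do_remount: flip an existing mount to read-only */
	mount("/dev/sda1", "/mnt", "ext3", MS_REMOUNT | MS_RDONLY, NULL);

	/* MS_BIND -> do_loopback: make /srv/data visible at /mnt/data too */
	mount("/srv/data", "/mnt/data", NULL, MS_BIND, NULL);

	/* MS_MOVE -> do_move_mount: relocate a mounted tree */
	mount("/mnt/data", "/home/data", NULL, MS_MOVE, NULL);
}

With that picture in mind, here is do_new_mount: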
/*
 * create a new mount for userspace and request it to be added into the
 * namespace's tree
 */
/**
 * nd: nameidata holding the mount point
 * type: filesystem type name
 * flags: original mount flags
 * mnt_flags: per-mountpoint flags
 * name: device name
 * data: fs-private data
 **/
static int do_new_mount(struct nameidata *nd, char *type, int flags,
			int mnt_flags, char *name, void *data)
{
	struct vfsmount *mnt;	/* the vfsmount structure */

	if (!type || !memchr(type, 0, PAGE_SIZE))
		return -EINVAL;

	/* we need capabilities... check mount permission */
	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	/* returns a new mount object, including a newly built superblock */
	mnt = do_kern_mount(type, flags, name, data);
	if (IS_ERR(mnt))
		return PTR_ERR(mnt);

	/* add the mount to the mount tree, the hash table and the
	 * parent mount's child list */
	return do_add_mount(mnt, nd, mnt_flags, NULL);
}
This function does two main things: first, it builds a new mount object and superblock object and associates the two; second, it adds the mount object to the mount tree. We examine each in turn, starting with do_kern_mount:
/**
 * fstype: the filesystem type to mount, e.g. ext3
 * flags: mount flags
 * name: block device pathname, e.g. /dev/sda
 * data: pointer to additional data, handed down to the filesystem's
 *       superblock-reading code (historically read_super)
 * returns: a vfsmount pointer
 **/
struct vfsmount *
do_kern_mount(const char *fstype, int flags, const char *name, void *data)
{
/* look up an already registered filesystem type on the file_systems list */
struct file_system_type *type = get_fs_type(fstype);
struct vfsmount *mnt;
if (!type)
return ERR_PTR(-ENODEV);
/* returns the mount object */
mnt = vfs_kern_mount(type, flags, name, data);
put_filesystem(type);
return mnt;
}
do_kern_mount first searches the file_systems list for an already registered filesystem type, as discussed in the section on filesystem types; a sketch of that lookup follows. It then declares an mnt pointer and calls vfs_kern_mount, shown right after the sketch.
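The lookup itself is a plain linked-list walk. The helper below is reconstructed from memory of fs/filesystems.c of this kernel generation, so treat it as a sketch rather than verbatim source:

/* file_systems heads a singly linked list of registered
 * file_system_type objects; walk it comparing names */
static struct file_system_type **find_filesystem(const char *name)
{
	struct file_system_type **p;
	for (p = &file_systems; *p; p = &(*p)->next)
		if (strcmp((*p)->name, name) == 0)
			break;
	return p;
}

get_fs_type also takes a reference on the owning module and may invoke request_module() to load the filesystem driver on demand. Now for vfs_kern_mount: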
/**
 * type: the filesystem type
 * flags: mount flags, e.g. MS_BIND
 * name: device pathname
 * data: private additional data, handed down to the filesystem's
 *       superblock-reading code
 * returns: a vfsmount already associated with its superblock
 **/
struct vfsmount *
vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void *data)
{
struct vfsmount *mnt;
char *secdata = NULL;
int error;
if (!type)
return ERR_PTR(-ENODEV);
error = -ENOMEM;
/* allocate and initialise the mount object (vfsmount) */
mnt = alloc_vfsmnt(name);
if (!mnt)
goto out;
if (data) {
secdata = alloc_secdata();
if (!secdata)
goto out_mnt;
error = security_sb_copy_data(type, data, secdata);
if (error)
goto out_free_secdata;
}
/* have the concrete filesystem allocate and fill in the superblock,
 * and tie the superblock to the vfsmount */
error = type->get_sb(type, flags, name, data, mnt);
if (error < 0)
goto out_free_secdata;
error = security_sb_kern_mount(mnt->mnt_sb, secdata);
if (error)
goto out_sb;
/* set the mountpoint dentry and the parent mount; both are updated to
 * their real values later, when graft_tree splices the mount into the tree */
mnt->mnt_mountpoint = mnt->mnt_root;
mnt->mnt_parent = mnt;
up_write(&mnt->mnt_sb->s_umount);
free_secdata(secdata);
return mnt;
out_sb:
dput(mnt->mnt_root);
up_write(&mnt->mnt_sb->s_umount);
deactivate_super(mnt->mnt_sb);
out_free_secdata:
free_secdata(secdata);
out_mnt:
free_vfsmnt(mnt);
out:
return ERR_PTR(error);
}
This function has several key parts. First, alloc_vfsmnt allocates and initialises the mount object. Next, type->get_sb allocates and fills in the superblock and associates it with mnt; get_sb depends on the concrete filesystem and is analysed shortly. Finally, the mountpoint dentry and parent mount are given placeholder values. Let's look at alloc_vfsmnt first:
/* allocate and initialise a vfsmount object */
struct vfsmount *alloc_vfsmnt(const char *name)
{	/* allocate a struct vfsmount from the slab cache */
	struct vfsmount *mnt = kmem_cache_alloc(mnt_cache, GFP_KERNEL);
	if (mnt) {
		memset(mnt, 0, sizeof(struct vfsmount));
		atomic_set(&mnt->mnt_count, 1);
		/* link in the global mount hash table */
		INIT_LIST_HEAD(&mnt->mnt_hash);
		/* link in the parent mount's list of children */
		INIT_LIST_HEAD(&mnt->mnt_child);
		/* head of this mount's own list of children */
		INIT_LIST_HEAD(&mnt->mnt_mounts);
		/* link in the namespace's list of mounts */
		INIT_LIST_HEAD(&mnt->mnt_list);
		/* filesystem expiry list */
		INIT_LIST_HEAD(&mnt->mnt_expire);
		INIT_LIST_HEAD(&mnt->mnt_share);
		INIT_LIST_HEAD(&mnt->mnt_slave_list);
		INIT_LIST_HEAD(&mnt->mnt_slave);
		if (name) {
			int size = strlen(name) + 1;
			/* allocate memory for the device name */
			char *newname = kmalloc(size, GFP_KERNEL);
			if (newname) {
				memcpy(newname, name, size);
				/* attach the device name to the mount */
				mnt->mnt_devname = newname;
			}
		}
	}
	return mnt;
}
This function is refreshingly simple: it allocates a vfsmount, initialises its list heads, and finally attaches the device name to the mount object.
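As an aside, the kmem_cache_alloc above draws from the mnt_cache slab cache, created once at boot. From memory of mnt_init() in fs/namespace.c of this era (a sketch; the exact kmem_cache_create signature varies between kernel versions):

/* the slab cache backing alloc_vfsmnt's kmem_cache_alloc() */
mnt_cache = kmem_cache_create("mnt_cache", sizeof(struct vfsmount),
			      0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL, NULL);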
Now for type->get_sb. Since get_sb is registered along with the filesystem type, it is specific to the concrete filesystem. Taking ext3 as the example, the corresponding function is ext3_get_sb:
static int ext3_get_sb(struct file_system_type *fs_type,
int flags, const char *dev_name, void *data, struct vfsmount *mnt)
{
return get_sb_bdev(fs_type, flags, dev_name, data, ext3_fill_super, mnt);
}
It simply calls get_sb_bdev, passing the callback ext3_fill_super, which fills in the superblock and is invoked later.
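For context, get_sb is wired up when the filesystem type registers itself. ext3's file_system_type looks like this in fs/ext3/super.c of this era (quoted from memory, so minor details may differ):

static struct file_system_type ext3_fs_type = {
	.owner		= THIS_MODULE,
	.name		= "ext3",
	.get_sb		= ext3_get_sb,		/* called from vfs_kern_mount */
	.kill_sb	= kill_block_super,
	.fs_flags	= FS_REQUIRES_DEV,	/* needs a backing block device */
};

Here is get_sb_bdev: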
/**
 * Each filesystem type can have many superblocks: one per mounted
 * filesystem instance. The ext3 type, for example, may own several
 * superblocks, while /dev/sda and /dev/sdb each have exactly one.
 * At the end, the mount is associated with the superblock, which
 * establishes the vfsmount <-> super_block relationship.
 **/
int get_sb_bdev(struct file_system_type *fs_type,
int flags, const char *dev_name, void *data,
int (*fill_super)(struct super_block *, void *, int),
struct vfsmount *mnt)
{
struct block_device *bdev;
struct super_block *s;
int error = 0;
/* open the block device exclusively; flags say read-only or read-write */
bdev = open_bdev_excl(dev_name, flags, fs_type);
if (IS_ERR(bdev))
return PTR_ERR(bdev);
/*
* once the super is inserted into the list by sget, s_umount
* will protect the lockfs code from trying to start a snapshot
* while we are mounting
*/
down(&bdev->bd_mount_sem);
/* find or create a superblock, keyed by bdev */
s = sget(fs_type, test_bdev_super, set_bdev_super, bdev);
up(&bdev->bd_mount_sem);
if (IS_ERR(s))
goto error_s;
/* the superblock already existed */
if (s->s_root) {
if ((flags ^ s->s_flags) & MS_RDONLY) {
up_write(&s->s_umount);
deactivate_super(s);
error = -EBUSY;
goto error_bdev;
}
/* close the block device */
close_bdev_excl(bdev);
} else {
char b[BDEVNAME_SIZE];
/* set the mount flags */
s->s_flags = flags;
strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id));
/* set the block size (between 512 bytes and 4KB) */
sb_set_blocksize(s, block_size(bdev));
/* fill in the superblock: build its root dentry, install the
 * super_operations, etc. */
error = fill_super(s, data, flags & MS_SILENT ? 1 : 0);
if (error) {
up_write(&s->s_umount);
deactivate_super(s);
goto error;
}
s->s_flags |= MS_ACTIVE;
bdev_uevent(bdev, KOBJ_MOUNT);
}
/* associate the mount with the superblock; returns 0 on success */
return simple_set_mnt(mnt, s);
error_s:
error = PTR_ERR(s);
error_bdev:
close_bdev_excl(bdev);
error:
return error;
}
get_sb_bdev first opens the device exclusively with open_bdev_excl. Then sget obtains a superblock object, fill_super fills it in, and finally simple_set_mnt ties the superblock to the mount object. So how does sget obtain a superblock?
/** Search type->fs_supers for a superblock belonging to this filesystem
 * type; if one is found, return its address, otherwise create a new
 * superblock and add it to type->fs_supers.
* sget - find or create a superblock
* @type: filesystem type superblock should belong to
* @test: comparison callback
* @set: setup callback
* @data: argument to each of them
*/
struct super_block *sget(struct file_system_type *type,
int (*test)(struct super_block *,void *),
int (*set)(struct super_block *,void *),
void *data)
{
struct super_block *s = NULL;
struct list_head *p;
int err;
retry:
spin_lock(&sb_lock);
/* set_bdev_super stored s->s_bdev = data when the superblock was created,
 * so if test() returns true this superblock was very likely set up
 * before. Walk the superblocks belonging to this filesystem type:
 * type->fs_supers is the list head, s_instances links the members. */
if (test) list_for_each(p, &type->fs_supers) {
struct super_block *old;
old = list_entry(p, struct super_block, s_instances);
if (!test(old, data))	/* not this superblock */
continue;
if (!grab_super(old))
goto retry;
if (s)
destroy_super(s);
/* found it: return the existing superblock */
return old;
}
/* nothing found */
if (!s) {
spin_unlock(&sb_lock);
/* allocate a new superblock */
s = alloc_super(type);
if (!s)
return ERR_PTR(-ENOMEM);
goto retry;
}
/* the set callback associates s with data (e.g. s->s_bdev = data) */
err = set(s, data);
if (err) {
spin_unlock(&sb_lock);
destroy_super(s);
return ERR_PTR(err);
}
/* record the owning filesystem type */
s->s_type = type;
/* copy the filesystem type name into the s_id array (get_sb_bdev later
 * overwrites it with the device name) */
strlcpy(s->s_id, type->name, sizeof(s->s_id));
/* add to the global list of all superblocks, headed by super_blocks */
list_add_tail(&s->s_list, &super_blocks);
/* add to the list of superblocks of this filesystem type */
list_add(&s->s_instances, &type->fs_supers);
spin_unlock(&sb_lock);
/* bump the filesystem type's reference count */
get_filesystem(type);
return s;
}
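Note the test and set callbacks that get_sb_bdev passed in, test_bdev_super and set_bdev_super. They are tiny; as found in fs/super.c of this era (quoted from memory, treat as a sketch):

static int set_bdev_super(struct super_block *s, void *data)
{
	s->s_bdev = data;	/* remember the backing block device */
	return 0;
}

static int test_bdev_super(struct super_block *s, void *data)
{
	return (void *)s->s_bdev == data;	/* same backing device? */
}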
Summing up sget: it first searches the per-type fs_supers list for an existing superblock and, if one is found, returns it. Otherwise alloc_super allocates a new superblock in memory; sget then records the owning filesystem type, adds the superblock to the global list (super_blocks) and to the per-type list (type->fs_supers), and returns it. Back in get_sb_bdev, the s->s_root check tells us whether the superblock already existed: if it did, the filesystem is already mounted and the superblock already filled in, so close_bdev_excl closes the block device. If not, the superblock must be filled in; for ext3 this is done by ext3_fill_super.
/* fill in the superblock */
static int ext3_fill_super (struct super_block *sb, void *data, int silent)
{
struct buffer_head * bh;
/* on-disk superblock layout */
struct ext3_super_block *es = NULL;
/* in-memory ext3-specific superblock info */
struct ext3_sb_info *sbi;
ext3_fsblk_t block;
/* logical block number of the superblock */
ext3_fsblk_t sb_block = get_sb_block(&data);
ext3_fsblk_t logic_sb_block;
unsigned long offset = 0;
unsigned int journal_inum = 0;
unsigned long journal_devnum = 0;
unsigned long def_mount_opts;
/* root inode */
struct inode *root;
int blocksize;
int hblock;
int db_count;
int i;
int needs_recovery;
__le32 features;
/* allocate the in-memory structure for the ext3-specific info */
sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
if (!sbi)
return -ENOMEM;
/* point s_fs_info at sbi */
sb->s_fs_info = sbi;
sbi->s_mount_opt = 0;
sbi->s_resuid = EXT3_DEF_RESUID;
sbi->s_resgid = EXT3_DEF_RESGID;
unlock_kernel();
/* establish the block size */
blocksize = sb_min_blocksize(sb, EXT3_MIN_BLOCK_SIZE);
if (!blocksize) {
printk(KERN_ERR "EXT3-fs: unable to set blocksize\n");
goto out_fail;
}
/*
* The ext3 superblock will not be buffer aligned for other than 1kB
* block sizes. We need to calculate the offset from buffer start.
*/
if (blocksize != EXT3_MIN_BLOCK_SIZE) {
logic_sb_block = (sb_block * EXT3_MIN_BLOCK_SIZE) / blocksize;
offset = (sb_block * EXT3_MIN_BLOCK_SIZE) % blocksize;
} else {
logic_sb_block = sb_block;
}
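	/* Worked example (annotation, not kernel code): the ext3 superblock
	 * lives 1024 bytes into the partition and sb_block defaults to 1.
	 * With blocksize = 4096 this gives
	 * logic_sb_block = (1 * 1024) / 4096 = 0 and offset = 1024, i.e.
	 * the superblock sits 1024 bytes into logical block 0. With the
	 * minimum 1024-byte blocksize it is simply block 1, offset 0. */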
/* read the superblock from disk */
if (!(bh = sb_bread(sb, logic_sb_block))) {
printk (KERN_ERR "EXT3-fs: unable to read superblock\n");
goto out_fail;
}
/*
* Note: s_es must be initialized as soon as possible because
* some ext3 macro-instructions depend on its value
*/
/* locate the on-disk struct ext3_super_block inside the buffer */
es = (struct ext3_super_block *) (((char *)bh->b_data) + offset);
/* point s_es at es inside the buffer */
sbi->s_es = es;
sb->s_magic = le16_to_cpu(es->s_magic);
if (sb->s_magic != EXT3_SUPER_MAGIC)
goto cantfind_ext3;
/* Set defaults before we parse the mount options */
def_mount_opts = le32_to_cpu(es->s_default_mount_opts);
if (def_mount_opts & EXT3_DEFM_DEBUG)
set_opt(sbi->s_mount_opt, DEBUG);
if (def_mount_opts & EXT3_DEFM_BSDGROUPS)
set_opt(sbi->s_mount_opt, GRPID);
if (def_mount_opts & EXT3_DEFM_UID16)
set_opt(sbi->s_mount_opt, NO_UID32);
if (def_mount_opts & EXT3_DEFM_XATTR_USER)
set_opt(sbi->s_mount_opt, XATTR_USER);
if (def_mount_opts & EXT3_DEFM_ACL)
set_opt(sbi->s_mount_opt, POSIX_ACL);
if ((def_mount_opts & EXT3_DEFM_JMODE) == EXT3_DEFM_JMODE_DATA)
sbi->s_mount_opt |= EXT3_MOUNT_JOURNAL_DATA;
else if ((def_mount_opts & EXT3_DEFM_JMODE) == EXT3_DEFM_JMODE_ORDERED)
sbi->s_mount_opt |= EXT3_MOUNT_ORDERED_DATA;
else if ((def_mount_opts & EXT3_DEFM_JMODE) == EXT3_DEFM_JMODE_WBACK)
sbi->s_mount_opt |= EXT3_MOUNT_WRITEBACK_DATA;
if (le16_to_cpu(sbi->s_es->s_errors) == EXT3_ERRORS_PANIC)
set_opt(sbi->s_mount_opt, ERRORS_PANIC);
else if (le16_to_cpu(sbi->s_es->s_errors) == EXT3_ERRORS_RO)
set_opt(sbi->s_mount_opt, ERRORS_RO);
else
set_opt(sbi->s_mount_opt, ERRORS_CONT);
sbi->s_resuid = le16_to_cpu(es->s_def_resuid);
sbi->s_resgid = le16_to_cpu(es->s_def_resgid);
set_opt(sbi->s_mount_opt, RESERVATION);
if (!parse_options ((char *) data, sb, &journal_inum, &journal_devnum,
NULL, 0))
goto failed_mount;
sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
((sbi->s_mount_opt & EXT3_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0);
if (le32_to_cpu(es->s_rev_level) == EXT3_GOOD_OLD_REV &&
(EXT3_HAS_COMPAT_FEATURE(sb, ~0U) ||
EXT3_HAS_RO_COMPAT_FEATURE(sb, ~0U) ||
EXT3_HAS_INCOMPAT_FEATURE(sb, ~0U)))
printk(KERN_WARNING
"EXT3-fs warning: feature flags set on rev 0 fs, "
"running e2fsck is recommended\n");
/*
* Check feature flags regardless of the revision level, since we
* previously didn't change the revision level when setting the flags,
* so there is a chance incompat flags are set on a rev 0 filesystem.
*/
features = EXT3_HAS_INCOMPAT_FEATURE(sb, ~EXT3_FEATURE_INCOMPAT_SUPP);
if (features) {
printk(KERN_ERR "EXT3-fs: %s: couldn't mount because of "
"unsupported optional features (%x).\n",
sb->s_id, le32_to_cpu(features));
goto failed_mount;
}
features = EXT3_HAS_RO_COMPAT_FEATURE(sb, ~EXT3_FEATURE_RO_COMPAT_SUPP);
if (!(sb->s_flags & MS_RDONLY) && features) {
printk(KERN_ERR "EXT3-fs: %s: couldn't mount RDWR because of "
"unsupported optional features (%x).\n",
sb->s_id, le32_to_cpu(features));
goto failed_mount;
}
blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size);
if (blocksize < EXT3_MIN_BLOCK_SIZE ||
blocksize > EXT3_MAX_BLOCK_SIZE) {
printk(KERN_ERR
"EXT3-fs: Unsupported filesystem blocksize %d on %s.\n",
blocksize, sb->s_id);
goto failed_mount;
}
hblock = bdev_hardsect_size(sb->s_bdev);
if (sb->s_blocksize != blocksize) {
/*
* Make sure the blocksize for the filesystem is larger
* than the hardware sectorsize for the machine.
*/
if (blocksize < hblock) {
printk(KERN_ERR "EXT3-fs: blocksize %d too small for "
"device blocksize %d.\n", blocksize, hblock);
goto failed_mount;
}
brelse (bh);
sb_set_blocksize(sb, blocksize);
logic_sb_block = (sb_block * EXT3_MIN_BLOCK_SIZE) / blocksize;
offset = (sb_block * EXT3_MIN_BLOCK_SIZE) % blocksize;
bh = sb_bread(sb, logic_sb_block);
if (!bh) {
printk(KERN_ERR
"EXT3-fs: Can't read superblock on 2nd try.\n");
goto failed_mount;
}
es = (struct ext3_super_block *)(((char *)bh->b_data) + offset);
sbi->s_es = es;
if (es->s_magic != cpu_to_le16(EXT3_SUPER_MAGIC)) {
printk (KERN_ERR
"EXT3-fs: Magic mismatch, very weird !\n");
goto failed_mount;
}
}
sb->s_maxbytes = ext3_max_size(sb->s_blocksize_bits);
if (le32_to_cpu(es->s_rev_level) == EXT3_GOOD_OLD_REV) {
sbi->s_inode_size = EXT3_GOOD_OLD_INODE_SIZE;
sbi->s_first_ino = EXT3_GOOD_OLD_FIRST_INO;
} else {
sbi->s_inode_size = le16_to_cpu(es->s_inode_size);
sbi->s_first_ino = le32_to_cpu(es->s_first_ino);
if ((sbi->s_inode_size < EXT3_GOOD_OLD_INODE_SIZE) ||
(sbi->s_inode_size & (sbi->s_inode_size - 1)) ||
(sbi->s_inode_size > blocksize)) {
printk (KERN_ERR
"EXT3-fs: unsupported inode size: %d\n",
sbi->s_inode_size);
goto failed_mount;
}
}
sbi->s_frag_size = EXT3_MIN_FRAG_SIZE <<
le32_to_cpu(es->s_log_frag_size);
if (blocksize != sbi->s_frag_size) {
printk(KERN_ERR
"EXT3-fs: fragsize %lu != blocksize %u (unsupported)\n",
sbi->s_frag_size, blocksize);
goto failed_mount;
}
sbi->s_frags_per_block = 1;
sbi->s_blocks_per_group = le32_to_cpu(es->s_blocks_per_group);
sbi->s_frags_per_group = le32_to_cpu(es->s_frags_per_group);
sbi->s_inodes_per_group = le32_to_cpu(es->s_inodes_per_group);
if (EXT3_INODE_SIZE(sb) == 0)
goto cantfind_ext3;
sbi->s_inodes_per_block = blocksize / EXT3_INODE_SIZE(sb);
if (sbi->s_inodes_per_block == 0)
goto cantfind_ext3;
sbi->s_itb_per_group = sbi->s_inodes_per_group /
sbi->s_inodes_per_block;
sbi->s_desc_per_block = blocksize / sizeof(struct ext3_group_desc);
sbi->s_sbh = bh;
sbi->s_mount_state = le16_to_cpu(es->s_state);
sbi->s_addr_per_block_bits = ilog2(EXT3_ADDR_PER_BLOCK(sb));
sbi->s_desc_per_block_bits = ilog2(EXT3_DESC_PER_BLOCK(sb));
for (i=0; i < 4; i++)
sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]);
sbi->s_def_hash_version = es->s_def_hash_version;
if (sbi->s_blocks_per_group > blocksize * 8) {
printk (KERN_ERR
"EXT3-fs: #blocks per group too big: %lu\n",
sbi->s_blocks_per_group);
goto failed_mount;
}
if (sbi->s_frags_per_group > blocksize * 8) {
printk (KERN_ERR
"EXT3-fs: #fragments per group too big: %lu\n",
sbi->s_frags_per_group);
goto failed_mount;
}
if (sbi->s_inodes_per_group > blocksize * 8) {
printk (KERN_ERR
"EXT3-fs: #inodes per group too big: %lu\n",
sbi->s_inodes_per_group);
goto failed_mount;
}
if (le32_to_cpu(es->s_blocks_count) >
(sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) {
printk(KERN_ERR "EXT3-fs: filesystem on %s:"
" too large to mount safely\n", sb->s_id);
if (sizeof(sector_t) < 8)
printk(KERN_WARNING "EXT3-fs: CONFIG_LBD not "
"enabled\n");
goto failed_mount;
}
if (EXT3_BLOCKS_PER_GROUP(sb) == 0)
goto cantfind_ext3;
sbi->s_groups_count = ((le32_to_cpu(es->s_blocks_count) -
le32_to_cpu(es->s_first_data_block) - 1)
/ EXT3_BLOCKS_PER_GROUP(sb)) + 1;
db_count = (sbi->s_groups_count + EXT3_DESC_PER_BLOCK(sb) - 1) /
EXT3_DESC_PER_BLOCK(sb);
sbi->s_group_desc = kmalloc(db_count * sizeof (struct buffer_head *),
GFP_KERNEL);
if (sbi->s_group_desc == NULL) {
printk (KERN_ERR "EXT3-fs: not enough memory\n");
goto failed_mount;
}
bgl_lock_init(&sbi->s_blockgroup_lock);
for (i = 0; i < db_count; i++) {
block = descriptor_loc(sb, logic_sb_block, i);
sbi->s_group_desc[i] = sb_bread(sb, block);
if (!sbi->s_group_desc[i]) {
printk (KERN_ERR "EXT3-fs: "
"can't read group descriptor %d\n", i);
db_count = i;
goto failed_mount2;
}
}
if (!ext3_check_descriptors (sb)) {
printk(KERN_ERR "EXT3-fs: group descriptors corrupted!\n");
goto failed_mount2;
}
sbi->s_gdb_count = db_count;
get_random_bytes(&sbi->s_next_generation, sizeof(u32));
spin_lock_init(&sbi->s_next_gen_lock);
percpu_counter_init(&sbi->s_freeblocks_counter,
ext3_count_free_blocks(sb));
percpu_counter_init(&sbi->s_freeinodes_counter,
ext3_count_free_inodes(sb));
percpu_counter_init(&sbi->s_dirs_counter,
ext3_count_dirs(sb));
/* per fileystem reservation list head & lock */
spin_lock_init(&sbi->s_rsv_window_lock);
sbi->s_rsv_window_root = RB_ROOT;
/* Add a single, static dummy reservation to the start of the
* reservation window list --- it gives us a placeholder for
* append-at-start-of-list which makes the allocation logic
* _much_ simpler. */
sbi->s_rsv_window_head.rsv_start = EXT3_RESERVE_WINDOW_NOT_ALLOCATED;
sbi->s_rsv_window_head.rsv_end = EXT3_RESERVE_WINDOW_NOT_ALLOCATED;
sbi->s_rsv_window_head.rsv_alloc_hit = 0;
sbi->s_rsv_window_head.rsv_goal_size = 0;
ext3_rsv_window_add(sb, &sbi->s_rsv_window_head);
/*
* set up enough so that it can read an inode
*/
sb->s_op = &ext3_sops;	/* superblock operations; read_inode reads inodes */
sb->s_export_op = &ext3_export_ops;
sb->s_xattr = ext3_xattr_handlers;
#ifdef CONFIG_QUOTA
sb->s_qcop = &ext3_qctl_operations;
sb->dq_op = &ext3_quota_operations;
#endif
INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */
/* clear s_root for now */
sb->s_root = NULL;
needs_recovery = (es->s_last_orphan != 0 ||
EXT3_HAS_INCOMPAT_FEATURE(sb,
EXT3_FEATURE_INCOMPAT_RECOVER));
/*
* The first inode we look at is the journal inode. Don't try
* root first: it may be modified in the journal!
*/
if (!test_opt(sb, NOLOAD) &&
EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_HAS_JOURNAL)) {
if (ext3_load_journal(sb, es, journal_devnum))
goto failed_mount3;
} else if (journal_inum) {
if (ext3_create_journal(sb, es, journal_inum))
goto failed_mount3;
} else {
if (!silent)
printk (KERN_ERR
"ext3: No journal on filesystem on %s\n",
sb->s_id);
goto failed_mount3;
}
/* We have now updated the journal if required, so we can
* validate the data journaling mode. */
switch (test_opt(sb, DATA_FLAGS)) {
case 0:
/* No mode set, assume a default based on the journal
capabilities: ORDERED_DATA if the journal can
cope, else JOURNAL_DATA */
if (journal_check_available_features
(sbi->s_journal, 0, 0, JFS_FEATURE_INCOMPAT_REVOKE))
set_opt(sbi->s_mount_opt, ORDERED_DATA);
else
set_opt(sbi->s_mount_opt, JOURNAL_DATA);
break;
case EXT3_MOUNT_ORDERED_DATA:
case EXT3_MOUNT_WRITEBACK_DATA:
if (!journal_check_available_features
(sbi->s_journal, 0, 0, JFS_FEATURE_INCOMPAT_REVOKE)) {
printk(KERN_ERR "EXT3-fs: Journal does not support "
"requested data journaling mode\n");
goto failed_mount4;
}
default:
break;
}
if (test_opt(sb, NOBH)) {
if (!(test_opt(sb, DATA_FLAGS) == EXT3_MOUNT_WRITEBACK_DATA)) {
printk(KERN_WARNING "EXT3-fs: Ignoring nobh option - "
"its supported only with writeback mode\n");
clear_opt(sbi->s_mount_opt, NOBH);
}
}
/*
* The journal_load will have done any necessary log recovery,
* so we can safely mount the rest of the filesystem now.
*/
/* get the root inode by its inode number, checking the inode cache first */
root = iget(sb, EXT3_ROOT_INO);
/* allocate the root dentry and bind it to the root inode */
sb->s_root = d_alloc_root(root);
if (!sb->s_root) {
printk(KERN_ERR "EXT3-fs: get root inode failed\n");
iput(root);
goto failed_mount4;
}
/* sanity check: the root inode must be a directory */
if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) {
dput(sb->s_root);
sb->s_root = NULL;
printk(KERN_ERR "EXT3-fs: corrupt root inode, run e2fsck\n");
goto failed_mount4;
}
/* set up the superblock and write it back to disk */
ext3_setup_super (sb, es, sb->s_flags & MS_RDONLY);
/*
* akpm: core read_super() calls in here with the superblock locked.
* That deadlocks, because orphan cleanup needs to lock the superblock
* in numerous places. Here we just pop the lock - it's relatively
* harmless, because we are now ready to accept write_super() requests,
* and aviro says that's the only reason for hanging onto the
* superblock lock.
*/
EXT3_SB(sb)->s_mount_state |= EXT3_ORPHAN_FS;
ext3_orphan_cleanup(sb, es);
EXT3_SB(sb)->s_mount_state &= ~EXT3_ORPHAN_FS;
if (needs_recovery)
printk (KERN_INFO "EXT3-fs: recovery complete.\n");
ext3_mark_recovery_complete(sb, es);
printk (KERN_INFO "EXT3-fs: mounted filesystem with %s data mode.\n",
test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA ? "journal":
test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_ORDERED_DATA ? "ordered":
"writeback");
lock_kernel();
return 0;
cantfind_ext3:
if (!silent)
printk(KERN_ERR "VFS: Can't find ext3 filesystem on dev %s.\n",
sb->s_id);
goto failed_mount;
failed_mount4:
journal_destroy(sbi->s_journal);
failed_mount3:
percpu_counter_destroy(&sbi->s_freeblocks_counter);
percpu_counter_destroy(&sbi->s_freeinodes_counter);
percpu_counter_destroy(&sbi->s_dirs_counter);
failed_mount2:
for (i = 0; i < db_count; i++)
brelse(sbi->s_group_desc[i]);
kfree(sbi->s_group_desc);
failed_mount:
#ifdef CONFIG_QUOTA
for (i = 0; i < MAXQUOTAS; i++)
kfree(sbi->s_qf_names[i]);
#endif
ext3_blkdev_remove(sbi);
brelse(bh);
out_fail:
sb->s_fs_info = NULL;
kfree(sbi);
lock_kernel();
return -EINVAL;
}
This function is very long; its main job is to fill in the in-memory superblock. Near the end it obtains the root inode with iget(sb, EXT3_ROOT_INO), points sb->s_root at the root dentry via d_alloc_root, and then calls ext3_setup_super to write the superblock back to disk.
After the superblock has been filled in, get_sb_bdev finishes by calling simple_set_mnt to associate the mount object with the superblock:
/* associate the mount with the superblock */
int simple_set_mnt(struct vfsmount *mnt, struct super_block *sb)
{
	/* the superblock this vfsmount refers to */
	mnt->mnt_sb = sb;
	/* the root dentry of the mounted filesystem */
	mnt->mnt_root = dget(sb->s_root);
	return 0;
}
simple_set_mnt is pure assignment: the mount's mnt_sb is pointed at the superblock, and its root dentry mnt_root at sb->s_root. At this point the superblock object and the mount object have both been created and tied together. Back in vfs_kern_mount, the two assignments near the end (mnt->mnt_mountpoint = mnt->mnt_root; mnt->mnt_parent = mnt;) give the mountpoint dentry and parent mount placeholder values, to be replaced later by do_add_mount. When vfs_kern_mount returns, do_kern_mount returns too. Next, do_new_mount calls do_add_mount to add the mount object to the mount tree:
/*
* add a mount into a namespace's mount tree
* - provide the option of adding the new mount to an expiration list
*/
/**
 * newmnt: the new mount object
 * nd: holds the mountpoint's dentry and the mount it belongs to
 * mnt_flags: mount flags
 * fslist: expiration list, if any
 **/
int do_add_mount(struct vfsmount *newmnt, struct nameidata *nd,
int mnt_flags, struct list_head *fslist)
{
int err;
/* take the namespace semaphore for writing */
down_write(&namespace_sem);
/* Something was mounted here while we slept */
while (d_mountpoint(nd->dentry) && follow_down(&nd->mnt, &nd->dentry))
;
err = -EINVAL;
if (!check_mnt(nd->mnt))
goto unlock;
/* Refuse the same filesystem on the same mount point: it is already
 * mounted there */
err = -EBUSY;
/* same superblock and same dentry: the same filesystem would be mounted
 * on the same directory again, which is pointless */
if (nd->mnt->mnt_sb == newmnt->mnt_sb &&
nd->mnt->mnt_root == nd->dentry)
goto unlock;
err = -EINVAL;
/* refuse if the root of the new mount is a symlink */
if (S_ISLNK(newmnt->mnt_root->d_inode->i_mode))
goto unlock;
/* mount flags */
newmnt->mnt_flags = mnt_flags;
/* splice the new mount into the namespace's list, the hash table and
 * the parent mount's child list */
if ((err = graft_tree(newmnt, nd)))
goto unlock;
/* add to the expiration list if requested */
if (fslist) {
/* add to the specified expiration list */
spin_lock(&vfsmount_lock);
list_add_tail(&newmnt->mnt_expire, fslist);
spin_unlock(&vfsmount_lock);
}
up_write(&namespace_sem);
return 0;
unlock:
up_write(&namespace_sem);
mntput(newmnt);
return err;
}
In the EBUSY check, nd->mnt->mnt_sb == newmnt->mnt_sb means the two mounts share a superblock, i.e. they are the same filesystem, and nd->mnt->mnt_root == nd->dentry means the target directory is that filesystem's own root. Mounting the same filesystem on the same directory a second time would be pointless, so it is refused. do_add_mount then sets the mount flags and calls graft_tree.
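The EBUSY branch is observable from user space on kernels of this era; a hedged sketch (device and directory are illustrative):

#include <stdio.h>
#include <errno.h>
#include <string.h>
#include <sys/mount.h>

int main(void)
{
	if (mount("/dev/sdb1", "/mnt", "ext3", 0, NULL) != 0)
		perror("first mount");
	/* mounting the same filesystem on the same directory again is
	 * refused by do_add_mount with -EBUSY */
	if (mount("/dev/sdb1", "/mnt", "ext3", 0, NULL) != 0)
		printf("second mount: %s\n", strerror(errno));
	return 0;
}

graft_tree itself: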
static int graft_tree(struct vfsmount *mnt, struct nameidata *nd)
{
int err;
if (mnt->mnt_sb->s_flags & MS_NOUSER)
return -EINVAL;
if (S_ISDIR(nd->dentry->d_inode->i_mode) !=
S_ISDIR(mnt->mnt_root->d_inode->i_mode))
return -ENOTDIR;
err = -ENOENT;
mutex_lock(&nd->dentry->d_inode->i_mutex);
if (IS_DEADDIR(nd->dentry->d_inode))
goto out_unlock;
err = security_sb_check_sb(mnt, nd);
if (err)
goto out_unlock;
err = -ENOENT;
/* call attach_recursive_mnt to splice the mount into the global mount tree */
if (IS_ROOT(nd->dentry) || !d_unhashed(nd->dentry))
err = attach_recursive_mnt(mnt, nd, NULL);
out_unlock:
mutex_unlock(&nd->dentry->d_inode->i_mutex);
if (!err)
security_sb_post_addmount(mnt, nd);
return err;
}
graft_tree calls attach_recursive_mnt to splice the mount into the global mount tree, passing the mount being attached, the nameidata, and (for moves) the old parent. Its prototype and semantics are as follows:
/*
* @source_mnt : mount tree to be attached
* @nd : place the mount tree @source_mnt is attached
* @parent_nd : if non-null, detach the source_mnt from its parent and
* store the parent mount and mountpoint dentry.
* (done when source_mnt is moved)
*
* NOTE: in the table below explains the semantics when a source mount
* of a given type is attached to a destination mount of a given type.
* ---------------------------------------------------------------------------
* | BIND MOUNT OPERATION |
* |**************************************************************************
* | source-->| shared | private | slave | unbindable |
* | dest | | | | |
* | | | | | | |
* | v | | | | |
* |**************************************************************************
* | shared | shared (++) | shared (+) | shared(+++)| invalid |
* | | | | | |
* |non-shared| shared (+) | private | slave (*) | invalid |
* ***************************************************************************
* A bind operation clones the source mount and mounts the clone on the
* destination mount.
*
* (++) the cloned mount is propagated to all the mounts in the propagation
* tree of the destination mount and the cloned mount is added to
* the peer group of the source mount.
* (+) the cloned mount is created under the destination mount and is marked
* as shared. The cloned mount is added to the peer group of the source
* mount.
* (+++) the mount is propagated to all the mounts in the propagation tree
* of the destination mount and the cloned mount is made slave
* of the same master as that of the source mount. The cloned mount
* is marked as 'shared and slave'.
* (*) the cloned mount is made a slave of the same master as that of the
* source mount.
*
* ---------------------------------------------------------------------------
* | MOVE MOUNT OPERATION |
* |**************************************************************************
* | source-->| shared | private | slave | unbindable |
* | dest | | | | |
* | | | | | | |
* | v | | | | |
* |**************************************************************************
* | shared | shared (+) | shared (+) | shared(+++) | invalid |
* | | | | | |
* |non-shared| shared (+*) | private | slave (*) | unbindable |
* ***************************************************************************
*
* (+) the mount is moved to the destination. And is then propagated to
* all the mounts in the propagation tree of the destination mount.
* (+*) the mount is moved to the destination.
* (+++) the mount is moved to the destination and is then propagated to
* all the mounts belonging to the destination mount's propagation tree.
* the mount is marked as 'shared and slave'.
* (*) the mount continues to be a slave at the new location.
*
* if the source mount is a tree, the operations explained above is
* applied to each mount in the tree.
* Must be called without spinlocks held, since this function can sleep
* in allocations.
*/
/**
 * Step 1: set the parent mount (nd->mnt) and the mountpoint dentry
 *         (nd->dentry).
 * Step 2: splice the mount into the global tree, i.e. add it to three
 *         lists:
 *         (1) the global hash table
 *         (2) the namespace's mnt_list
 *         (3) the parent mount's child list
 **/
static int attach_recursive_mnt(struct vfsmount *source_mnt,
struct nameidata *nd, struct nameidata *parent_nd)
{
LIST_HEAD(tree_list);
	/* nd->mnt is the parent mount */
	struct vfsmount *dest_mnt = nd->mnt;
	/* the mountpoint dentry */
	struct dentry *dest_dentry = nd->dentry;
struct vfsmount *child, *p;
if (propagate_mnt(dest_mnt, dest_dentry, source_mnt, &tree_list))
return -EINVAL;
if (IS_MNT_SHARED(dest_mnt)) {
for (p = source_mnt; p; p = next_mnt(p, source_mnt))
set_mnt_shared(p);
}
spin_lock(&vfsmount_lock);
	if (parent_nd) {	/* a move: detach from the old parent first, then attach to the new one */
		detach_mnt(source_mnt, parent_nd);
		attach_mnt(source_mnt, nd);	/* link to the new parent mount */
		touch_mnt_namespace(current->nsproxy->mnt_ns);
	} else {
		/* set the parent mount, the mountpoint dentry and d_mounted */
		mnt_set_mountpoint(dest_mnt, dest_dentry, source_mnt);
		/* add the mount to the hash table, the namespace list and
		 * the parent's child list */
		commit_tree(source_mnt);
}
list_for_each_entry_safe(child, p, &tree_list, mnt_hash) {
list_del_init(&child->mnt_hash);
commit_tree(child);
}
spin_unlock(&vfsmount_lock);
return 0;
}
attach_recursive_mnt first checks whether an old parent (parent_nd) was supplied. If so, the mount is detached from its old parent and then attached to the new one. Otherwise, mnt_set_mountpoint sets the parent mount, the mountpoint dentry and d_mounted, and commit_tree adds the mount to the global hash table, the namespace list and the parent's child list. Note that nd->mnt here is the new parent mount. mnt_set_mountpoint and commit_tree are shown below, with the key parts annotated:
/* set the mountpoint dentry and the parent mount */
void mnt_set_mountpoint(struct vfsmount *mnt, struct dentry *dentry,
		struct vfsmount *child_mnt)
{
	/* point the child's mnt_parent at the parent mount */
	child_mnt->mnt_parent = mntget(mnt);
	/* the dentry the child is mounted on */
	child_mnt->mnt_mountpoint = dget(dentry);
	/* bump d_mounted: several filesystems can be mounted on the
	 * same dentry */
	dentry->d_mounted++;
}
/*
* the caller must hold vfsmount_lock
*/
static void commit_tree(struct vfsmount *mnt)
{
struct vfsmount *parent = mnt->mnt_parent;	/* the parent mount */
struct vfsmount *m;
LIST_HEAD(head);
struct mnt_namespace *n = parent->mnt_ns;
BUG_ON(parent == mnt);
/* add to the namespace's mount list */
list_add_tail(&head, &mnt->mnt_list);
list_for_each_entry(m, &head, mnt_list)
m->mnt_ns = n;
list_splice(&head, n->list.prev);
/* add to the mount_hashtable hash table */
list_add_tail(&mnt->mnt_hash, mount_hashtable +
hash(parent, mnt->mnt_mountpoint));
/* add to the parent mount's child list */
list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
touch_mnt_namespace(n);
}
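A word on the hash used by commit_tree: mount_hashtable is keyed on the (parent mount, mountpoint dentry) pair, which is exactly what path lookup later uses (via lookup_mnt) to step from a mountpoint into the filesystem mounted on it. The hash function is roughly the following, reconstructed from fs/namespace.c of this era (not verbatim; HASH_SHIFT and HASH_SIZE are the table's size parameters):

static inline unsigned long hash(struct vfsmount *mnt, struct dentry *dentry)
{
	/* mix the parent vfsmount pointer with the mountpoint dentry pointer */
	unsigned long tmp = ((unsigned long)mnt / L1_CACHE_BYTES);
	tmp += ((unsigned long)dentry / L1_CACHE_BYTES);
	tmp = tmp + (tmp >> HASH_SHIFT);
	return tmp & (HASH_SIZE - 1);
}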
With that, the mount system call is complete; the overall flow is reasonably clear.
3. Summary
The mount system call can be summed up as:
(1) obtain a mount object (vfsmount) -> do_kern_mount
(2) add the mount object to the mount tree -> do_add_mount
Step (1) breaks down into:
building the vfsmount object, building the super_block object, and associating the superblock with the mount.
Step (2) breaks down into:
setting the vfsmount's parent mount and mountpoint dentry, then adding it to the global mount_hashtable, the namespace's list, and the parent's child list mnt_mounts.
That wraps up the mount system call. In upcoming posts we will work through the Linux kernel step by step, covering the filesystem layer, the block layer, the I/O scheduler, and SCSI device drivers. Given the chance, we will also look at the kernel's SSD support, including the trim command.