1. 程式人生 > 實用技巧 >HugePage介紹、實現分析、配置和使用

HugePage介紹、實現分析、配置和使用

關鍵詞:TLB、hugetlb、hugetlbfs、mmap、shmget/shmat、nr_hugepages等等。

1. HugePage介紹

Linux記憶體管理採用“分頁機制”,記憶體頁面預設大小為4KB。但是當執行記憶體需求量較大時,預設4KB大小的頁面會導致較多的TLB miss和缺頁中斷,從而大大影響應用程式效能。

但是直接修改Linux核心頁面大小,涉及面較廣,不一定合適。為了以最小代價實現大頁面支援,Linux採用了hugetlbfs特殊檔案系統。這種檔案系統形式支援大頁面,使得應用程式可以根據需要靈活地選擇虛擬記憶體頁面大小,而不會被強制使用2MB大小頁面。

使用HugePage需要核心中開啟CONFIG_HUGETLB_PAGE

以及CONFIG_HUGETLBFS

Page Table:頁表,也就是一種用於記憶體管理的實現方式,用於實體地址到邏輯地址之間的對映。因此對於記憶體的訪問,先是訪問Page Table,然後根據Page Table中的對映關係隱式的轉移到實體地址來存取資料。

TLB: Translation Lookaside Buffer (TLB) ,是虛擬地址到實體地址轉換cache,包含了部分page table的對映關係,用於快速實現虛擬地址到實體地址的轉換。

hugetlb: hugetlb 是TLB中指向HugePage的一個入口

使用者可以通過mmap或者SYSV共享記憶體(shmget/shmat)來使用HugePage。

核心中預留給HugePage的記憶體不能被其他功能複用。

2. HugePage實現

HugePage的應用涉及到兩方面:核心使用HugeTLB對HugePage進行映射管理;使用hugetlbfs以檔案系統的形式提供給使用者空間訪問。

2.1 HugeTLB初始化

2.1.1 資料結構

全域性陣列hstates中每一個struct hstate相當於一個Huge Page池,不同的成員,其頁面大小是不一樣的。

hugetlb_max_hstate表示當前成員數量,HUGE_MAX_HSTATE表示系統支援最多struct hstate數量。

struct hstate hstates[HUGE_MAX_HSTATE];

/* Defines one hugetlb page size */ struct hstate { int next_nid_to_alloc; int next_nid_to_free; unsigned int order; unsigned long mask; unsigned long max_huge_pages; unsigned long nr_huge_pages; unsigned long free_huge_pages; unsigned long resv_huge_pages; unsigned long surplus_huge_pages; unsigned long nr_overcommit_huge_pages; struct list_head hugepage_activelist; struct list_head hugepage_freelists[MAX_NUMNODES]; unsigned int nr_huge_pages_node[MAX_NUMNODES]; unsigned int free_huge_pages_node[MAX_NUMNODES]; unsigned int surplus_huge_pages_node[MAX_NUMNODES]; #ifdef CONFIG_CGROUP_HUGETLB /* cgroup control files */ struct cftype cgroup_files[5]; #endif char name[HSTATE_NAME_LEN]; };

2.1.2 Huge Page TLB初始化

hugetlb_init()是Huge Page初始化入口,屬於subsys_initcall(),在arch_initcall()之後,fs_initcall()之前。

/*
 * Entry point for huge page initialization; registered as a
 * subsys_initcall(), i.e. it runs after arch_initcall() and before
 * fs_initcall().
 */
static int __init hugetlb_init(void)
{
    int i;

    if (!hugepages_supported())
        return 0;

    /*
     * If default_hugepagesz= was given on the command line it has already
     * been parsed into default_hstate_size; otherwise fall back to the
     * architecture default HPAGE_SIZE.
     */
    if (!size_to_hstate(default_hstate_size)) {
        if (default_hstate_size != 0) {
            pr_err("HugeTLB: unsupported default_hugepagesz %lu. Reverting to %lu\n",
                   default_hstate_size, HPAGE_SIZE);
        }

        default_hstate_size = HPAGE_SIZE;
        if (!size_to_hstate(default_hstate_size))
            hugetlb_add_hstate(HUGETLB_PAGE_ORDER);
    }
    /* Index of the default huge page size within hstates[]. */
    default_hstate_idx = hstate_index(size_to_hstate(default_hstate_size));
    if (default_hstate_max_huge_pages) {
        if (!default_hstate.max_huge_pages)
            default_hstate.max_huge_pages = default_hstate_max_huge_pages;
    }

    /*
     * Populate pools whose order is below MAX_ORDER; larger (gigantic)
     * sizes were already allocated from bootmem earlier in boot.
     */
    hugetlb_init_hstates();
    gather_bootmem_prealloc();
    /* Log each supported huge page size and the pages allocated for it. */
    report_hugepages();

    /* Create /sys/kernel/mm/hugepages/hugepages-*kB/ directories. */
    hugetlb_sysfs_init();
    /* Register per-node attributes for NUMA systems. */
    hugetlb_register_all_nodes();
    /*
     * Create the /sys/fs/cgroup/hugetlb nodes, e.g. hugetlb.2MB.failcnt,
     * hugetlb.2MB.limit_in_bytes, hugetlb.2MB.max_usage_in_bytes and
     * hugetlb.2MB.usage_in_bytes.
     */
    hugetlb_cgroup_file_init();

#ifdef CONFIG_SMP
    num_fault_mutexes = roundup_pow_of_two(8 * num_possible_cpus());
#else
    num_fault_mutexes = 1;
#endif
    hugetlb_fault_mutex_table =
        kmalloc(sizeof(struct mutex) * num_fault_mutexes, GFP_KERNEL);
    BUG_ON(!hugetlb_fault_mutex_table);

    /* Initialize the mutexes that serialize huge page faults. */
    for (i = 0; i < num_fault_mutexes; i++)
        mutex_init(&hugetlb_fault_mutex_table[i]);
    return 0;
}

/*
 * Register a new huge page pool of the given order in hstates[] and
 * initialize its free lists.  Called from command-line parsing and from
 * hugetlb_init() for the default size.
 */
void __init hugetlb_add_hstate(unsigned int order)
{
    struct hstate *h;
    unsigned long i;

    /* Refuse to add the same page size twice. */
    if (size_to_hstate(PAGE_SIZE << order)) {
        pr_warn("hugepagesz= specified twice, ignoring\n");
        return;
    }
    BUG_ON(hugetlb_max_hstate >= HUGE_MAX_HSTATE);
    BUG_ON(order == 0);
    /* Claim the next hstates[] slot and fill in the pool attributes. */
    h = &hstates[hugetlb_max_hstate++];
    h->order = order;
    h->mask = ~((1ULL << (order + PAGE_SHIFT)) - 1);
    h->nr_huge_pages = 0;
    h->free_huge_pages = 0;
    for (i = 0; i < MAX_NUMNODES; ++i)
        INIT_LIST_HEAD(&h->hugepage_freelists[i]);
    INIT_LIST_HEAD(&h->hugepage_activelist);
    h->next_nid_to_alloc = first_memory_node;
    h->next_nid_to_free = first_memory_node;
    snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB",
                    huge_page_size(h)/1024);

    /* Remember the pool so a following hugepages= parameter applies to it. */
    parsed_hstate = h;
}

hugetlb_sysfs_init()對不同大小的Huge Page建立/sys/kernel/mm/hugepages對應的目錄,對每個目錄中屬性節點進行配置達到配置Huge Page池的目的。

/*
 * Create /sys/kernel/mm/hugepages and one subdirectory per supported
 * huge page size; the attribute files in each subdirectory configure the
 * corresponding pool.
 */
static void __init hugetlb_sysfs_init(void)
{
    struct hstate *h;
    int err;

    hugepages_kobj = kobject_create_and_add("hugepages", mm_kobj);
    if (!hugepages_kobj)
        return;

    for_each_hstate(h) {
        err = hugetlb_sysfs_add_hstate(h, hugepages_kobj,
                     hstate_kobjs, &hstate_attr_group);
        if (err)
            pr_err("Hugetlb: Unable to add hstate %s", h->name);
    }
}

/*
 * Create the per-size sysfs directory (named after h->name) and attach
 * the attribute group to it.  Returns 0 or a negative errno.
 */
static int hugetlb_sysfs_add_hstate(struct hstate *h, struct kobject *parent,
                    struct kobject **hstate_kobjs,
                    const struct attribute_group *hstate_attr_group)
{
    int retval;
    int hi = hstate_index(h);

    hstate_kobjs[hi] = kobject_create_and_add(h->name, parent);
    if (!hstate_kobjs[hi])
        return -ENOMEM;

    retval = sysfs_create_group(hstate_kobjs[hi], hstate_attr_group);
    if (retval)
        kobject_put(hstate_kobjs[hi]); /* drop the kobject on failure */

    return retval;
}

/* Attribute group attached to each hugepages-*kB sysfs directory. */
static const struct attribute_group hstate_attr_group = {
    .attrs = hstate_attrs,
};

/* Files exposed under /sys/kernel/mm/hugepages/hugepages-*kB/. */
static struct attribute *hstate_attrs[] = {
    &nr_hugepages_attr.attr,
    &nr_overcommit_hugepages_attr.attr,
    &free_hugepages_attr.attr,
    &resv_hugepages_attr.attr,
    &surplus_hugepages_attr.attr,
#ifdef CONFIG_NUMA
    &nr_hugepages_mempolicy_attr.attr,
#endif
    NULL,
};

修改nr_hugepages對Huge Page頁面數進行配置,核心是set_max_huge_pages()。

/*
 * Show handler shared by nr_hugepages and nr_hugepages_mempolicy:
 * reports the global page count, or the per-node count when read from a
 * node-specific kobject.
 */
static ssize_t nr_hugepages_show_common(struct kobject *kobj,
                    struct kobj_attribute *attr, char *buf)
{
    struct hstate *h;
    unsigned long nr_huge_pages;
    int nid;

    h = kobj_to_hstate(kobj, &nid);
    if (nid == NUMA_NO_NODE)
        nr_huge_pages = h->nr_huge_pages;
    else
        nr_huge_pages = h->nr_huge_pages_node[nid];

    return sprintf(buf, "%lu\n", nr_huge_pages);
}

/*
 * Common store path for nr_hugepages.  Builds the nodemask the resize is
 * allowed to touch (all memory nodes, the task's mempolicy nodes, or a
 * single node) and calls set_max_huge_pages().  Returns len on success
 * or a negative errno.
 */
static ssize_t __nr_hugepages_store_common(bool obey_mempolicy,
                       struct hstate *h, int nid,
                       unsigned long count, size_t len)
{
    int err;
    NODEMASK_ALLOC(nodemask_t, nodes_allowed, GFP_KERNEL | __GFP_NORETRY);

    if (hstate_is_gigantic(h) && !gigantic_page_supported()) {
        err = -EINVAL;
        goto out;
    }

    if (nid == NUMA_NO_NODE) {
        /*
         * global hstate attribute
         */
        if (!(obey_mempolicy &&
                init_nodemask_of_mempolicy(nodes_allowed))) {
            NODEMASK_FREE(nodes_allowed);
            nodes_allowed = &node_states[N_MEMORY];
        }
    } else if (nodes_allowed) {
        /*
         * per node hstate attribute: adjust count to global,
         * but restrict alloc/free to the specified node.
         */
        count += h->nr_huge_pages - h->nr_huge_pages_node[nid];
        init_nodemask_of_node(nodes_allowed, nid);
    } else
        nodes_allowed = &node_states[N_MEMORY];

    h->max_huge_pages = set_max_huge_pages(h, count, nodes_allowed);

    /* Only free the mask if it is the one we allocated above. */
    if (nodes_allowed != &node_states[N_MEMORY])
        NODEMASK_FREE(nodes_allowed);

    return len;
out:
    NODEMASK_FREE(nodes_allowed);
    return err;
}

/* Parse the written count and hand off to __nr_hugepages_store_common(). */
static ssize_t nr_hugepages_store_common(bool obey_mempolicy,
                     struct kobject *kobj, const char *buf,
                     size_t len)
{
    struct hstate *h;
    unsigned long count;
    int nid;
    int err;

    err = kstrtoul(buf, 10, &count);
    if (err)
        return err;

    h = kobj_to_hstate(kobj, &nid);
    return __nr_hugepages_store_common(obey_mempolicy, h, nid, count, len);
}

/* sysfs show callback for the nr_hugepages attribute. */
static ssize_t nr_hugepages_show(struct kobject *kobj,
                       struct kobj_attribute *attr, char *buf)
{
    return nr_hugepages_show_common(kobj, attr, buf);
}

/* sysfs store callback for nr_hugepages (mempolicy not obeyed). */
static ssize_t nr_hugepages_store(struct kobject *kobj,
           struct kobj_attribute *attr, const char *buf, size_t len)
{
    return nr_hugepages_store_common(false, kobj, buf, len);
}
HSTATE_ATTR(nr_hugepages);

/* Pages permanently in the pool, i.e. excluding surplus/overcommit pages. */
#define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages)
/*
 * Grow or shrink the huge page pool of hstate h to count pages,
 * restricted to nodes_allowed.  Returns the resulting persistent count.
 */
static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count,
                        nodemask_t *nodes_allowed)
{
    unsigned long min_count, ret;

    if (hstate_is_gigantic(h) && !gigantic_page_supported())
        return h->max_huge_pages;

    /*
     * Increase the pool size
     * First take pages out of surplus state.  Then make up the
     * remaining difference by allocating fresh huge pages.
     *
     * We might race with __alloc_buddy_huge_page() here and be unable
     * to convert a surplus huge page to a normal huge page. That is
     * not critical, though, it just means the overall size of the
     * pool might be one hugepage larger than it needs to be, but
     * within all the constraints specified by the sysctls.
     */
    spin_lock(&hugetlb_lock);
    while (h->surplus_huge_pages && count > persistent_huge_pages(h)) {
        if (!adjust_pool_surplus(h, nodes_allowed, -1))
            break;
    }

    while (count > persistent_huge_pages(h)) {
        /*
         * If this allocation races such that we no longer need the
         * page, free_huge_page will handle it by freeing the page
         * and reducing the surplus.
         */
        spin_unlock(&hugetlb_lock);

        /* yield cpu to avoid soft lockup */
        cond_resched();

        if (hstate_is_gigantic(h))
            ret = alloc_fresh_gigantic_page(h, nodes_allowed);
        else
            ret = alloc_fresh_huge_page(h, nodes_allowed);
        spin_lock(&hugetlb_lock);
        if (!ret)
            goto out;

        /* Bail for signals. Probably ctrl-c from user */
        if (signal_pending(current))
            goto out;
    }

    /*
     * Decrease the pool size
     * First return free pages to the buddy allocator (being careful
     * to keep enough around to satisfy reservations).  Then place
     * pages into surplus state as needed so the pool will shrink
     * to the desired size as pages become free.
     *
     * By placing pages into the surplus state independent of the
     * overcommit value, we are allowing the surplus pool size to
     * exceed overcommit. There are few sane options here. Since
     * __alloc_buddy_huge_page() is checking the global counter,
     * though, we'll note that we're not allowed to exceed surplus
     * and won't grow the pool anywhere else. Not until one of the
     * sysctls are changed, or the surplus pages go out of use.
     */
    min_count = h->resv_huge_pages + h->nr_huge_pages - h->free_huge_pages;
    min_count = max(count, min_count);
    try_to_free_low(h, min_count, nodes_allowed);
    while (min_count < persistent_huge_pages(h)) {
        if (!free_pool_huge_page(h, nodes_allowed, 0))
            break;
        cond_resched_lock(&hugetlb_lock);
    }
    while (count < persistent_huge_pages(h)) {
        if (!adjust_pool_surplus(h, nodes_allowed, 1))
            break;
    }
out:
    ret = persistent_huge_pages(h);
    spin_unlock(&hugetlb_lock);
    return ret;
}

其他屬性包括:nr_overcommit_hugepages配置Huge Page池超額使用頁面數;free_hugepages、resv_hugepages、surplus_hugepages等屬性都是隻讀。

對於非NUMA架構,nr_overcommit_hugepages、free_hugepages、resv_hugepages、surplus_hugepages分別對應struct hstate中的nr_overcommit_huge_pages、free_huge_pages、resv_huge_pages、surplus_huge_pages。

2.2 hugetlbfs

hugetlbfs中建立的檔案可以被讀系統呼叫操作,但是不允許被寫系統呼叫操作。如果需要寫內容,可以修改mmap之後的記憶體。

chown/chgrp/chmod等修改檔案屬性的命令仍然可以被使用。

fs_initcall(init_hugetlbfs_fs)

/*
 * fs_initcall(): set up the hugetlbfs inode slab cache, register the
 * file system type and create one internal kernel mount per supported
 * huge page size.
 */
static int __init init_hugetlbfs_fs(void)
{
    struct hstate *h;
    int error;
    int i;

    if (!hugepages_supported()) {
        pr_info("disabling because there are no supported hugepage sizes\n");
        return -ENOTSUPP;
    }

    error = -ENOMEM;
    /* Slab cache for hugetlbfs inodes. */
    hugetlbfs_inode_cachep = kmem_cache_create("hugetlbfs_inode_cache",
                    sizeof(struct hugetlbfs_inode_info),
                    0, SLAB_ACCOUNT, init_once);
    if (hugetlbfs_inode_cachep == NULL)
        goto out2;

    /* Make "hugetlbfs" mountable. */
    error = register_filesystem(&hugetlbfs_fs_type);
    if (error)
        goto out;

    i = 0;
    for_each_hstate(h) {
        char buf[50];
        unsigned ps_kb = 1U << (h->order + PAGE_SHIFT - 10);

        snprintf(buf, sizeof(buf), "pagesize=%uK", ps_kb);
        hugetlbfs_vfsmount[i] = kern_mount_data(&hugetlbfs_fs_type,
                            buf);

        if (IS_ERR(hugetlbfs_vfsmount[i])) {
            pr_err("Cannot mount internal hugetlbfs for "
                "page size %uK", ps_kb);
            error = PTR_ERR(hugetlbfs_vfsmount[i]);
            hugetlbfs_vfsmount[i] = NULL;
        }
        i++;
    }
    /* Non default hstates are optional */
    if (!IS_ERR_OR_NULL(hugetlbfs_vfsmount[default_hstate_idx]))
        return 0;

 out:
    kmem_cache_destroy(hugetlbfs_inode_cachep);
 out2:
    return error;
}

hugetlbfs檔案系統目錄、檔案、塊等操作函式集如下:

/* hugetlbfs file system type; mounted via hugetlbfs_mount(). */
static struct file_system_type hugetlbfs_fs_type = {
    .name        = "hugetlbfs",
    .mount        = hugetlbfs_mount,
    .kill_sb    = kill_litter_super,
};

/* No backing device: fill the superblock via hugetlbfs_fill_super(). */
static struct dentry *hugetlbfs_mount(struct file_system_type *fs_type,
    int flags, const char *dev_name, void *data)
{
    return mount_nodev(fs_type, flags, data, hugetlbfs_fill_super);
}

/*
 * Fill a hugetlbfs superblock: parse mount options into a
 * hugetlbfs_config (defaults: no size/inode limits, mode 0755, default
 * hstate), allocate the sb_info, optionally create a subpool, and set
 * the block size to the huge page size.
 */
static int
hugetlbfs_fill_super(struct super_block *sb, void *data, int silent)
{
    int ret;
    struct hugetlbfs_config config;
    struct hugetlbfs_sb_info *sbinfo;

    config.max_hpages = -1; /* No limit on size by default */
    config.nr_inodes = -1; /* No limit on number of inodes by default */
    config.uid = current_fsuid();
    config.gid = current_fsgid();
    config.mode = 0755;
    config.hstate = &default_hstate;
    config.min_hpages = -1; /* No default minimum size */
    ret = hugetlbfs_parse_options(data, &config);
    if (ret)
        return ret;

    sbinfo = kmalloc(sizeof(struct hugetlbfs_sb_info), GFP_KERNEL);
    if (!sbinfo)
        return -ENOMEM;
    sb->s_fs_info = sbinfo;
    sbinfo->hstate = config.hstate;
    spin_lock_init(&sbinfo->stat_lock);
    sbinfo->max_inodes = config.nr_inodes;
    sbinfo->free_inodes = config.nr_inodes;
    sbinfo->spool = NULL;
    sbinfo->uid = config.uid;
    sbinfo->gid = config.gid;
    sbinfo->mode = config.mode;

    /*
     * Allocate and initialize subpool if maximum or minimum size is
     * specified.  Any needed reservations (for minimum size) are taken
     * when the subpool is created.
     */
    if (config.max_hpages != -1 || config.min_hpages != -1) {
        sbinfo->spool = hugepage_new_subpool(config.hstate,
                            config.max_hpages,
                            config.min_hpages);
        if (!sbinfo->spool)
            goto out_free;
    }
    sb->s_maxbytes = MAX_LFS_FILESIZE;
    /* One "block" is one huge page. */
    sb->s_blocksize = huge_page_size(config.hstate);
    sb->s_blocksize_bits = huge_page_shift(config.hstate);
    sb->s_magic = HUGETLBFS_MAGIC;
    sb->s_op = &hugetlbfs_ops;
    sb->s_time_gran = 1;
    sb->s_root = d_make_root(hugetlbfs_get_root(sb, &config));
    if (!sb->s_root)
        goto out_free;
    return 0;
out_free:
    kfree(sbinfo->spool);
    kfree(sbinfo);
    return -ENOMEM;
}

/* Superblock operations for hugetlbfs. */
static const struct super_operations hugetlbfs_ops = {
    .alloc_inode    = hugetlbfs_alloc_inode,
    .destroy_inode  = hugetlbfs_destroy_inode,
    .evict_inode    = hugetlbfs_evict_inode,
    .statfs        = hugetlbfs_statfs,
    .put_super    = hugetlbfs_put_super,
    .show_options    = hugetlbfs_show_options,
};

/* Directory inode operations: mostly libfs helpers plus hugetlbfs hooks. */
static const struct inode_operations hugetlbfs_dir_inode_operations = {
    .create        = hugetlbfs_create,
    .lookup        = simple_lookup,
    .link        = simple_link,
    .unlink        = simple_unlink,
    .symlink    = hugetlbfs_symlink,
    .mkdir        = hugetlbfs_mkdir,
    .rmdir        = simple_rmdir,
    .mknod        = hugetlbfs_mknod,
    .rename        = simple_rename,
    .setattr    = hugetlbfs_setattr,
};

/* Regular-file inode operations: only attribute changes are supported. */
static const struct inode_operations hugetlbfs_inode_operations = {
    .setattr    = hugetlbfs_setattr,
};

const struct file_operations hugetlbfs_file_operations = {--------------可以看出hugetlbfs檔案系統中檔案只支援read/mmap/ummap等操作,不支援write。
    .read_iter        = hugetlbfs_read_iter,
    .mmap            = hugetlbfs_file_mmap,
    .fsync            = noop_fsync,
    .get_unmapped_area    = hugetlb_get_unmapped_area,
    .llseek            = default_llseek,
    .fallocate        = hugetlbfs_fallocate,
};

2.3 mmap和shmget/shmat

使用者空間在hugetlbfs檔案系統建立檔案,或者使用MAP_HUGETLB屬性時,在mmap系統呼叫中執行如下操作:

/*
 * mmap entry point.  Huge page mappings arrive here either as a
 * hugetlbfs-backed file mapping or as an anonymous MAP_HUGETLB mapping.
 */
SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
        unsigned long, prot, unsigned long, flags,
        unsigned long, fd, unsigned long, pgoff)
{
    struct file *file = NULL;
    unsigned long retval;

    if (!(flags & MAP_ANONYMOUS)) {
        /* File-backed mapping. */
        audit_mmap_fd(fd, flags);
        file = fget(fd);
        if (!file)
            return -EBADF;
        /* For hugetlbfs files, round the length up to the huge page size. */
        if (is_file_hugepages(file))
            len = ALIGN(len, huge_page_size(hstate_file(file)));
        retval = -EINVAL;
        if (unlikely(flags & MAP_HUGETLB && !is_file_hugepages(file)))
            goto out_fput;
    } else if (flags & MAP_HUGETLB) {
        /* Anonymous huge page mapping. */
        struct user_struct *user = NULL;
        struct hstate *hs;

        hs = hstate_sizelog((flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK);
        if (!hs)
            return -EINVAL;

        /* Round the length up to the selected huge page size. */
        len = ALIGN(len, huge_page_size(hs));
        /*
         * VM_NORESERVE is used because the reservations will be
         * taken when vm_ops->mmap() is called
         * A dummy user value is used because we are not locking
         * memory so no accounting is necessary
         */
        /*
         * Back the mapping with an unlinked hugetlbfs file using
         * hugetlbfs_file_operations.
         */
        file = hugetlb_file_setup(HUGETLB_ANON_FILE, len,
                VM_NORESERVE,
                &user, HUGETLB_ANONHUGE_INODE,
                (flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK);
        if (IS_ERR(file))
            return PTR_ERR(file);
    }

    flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);

    /* Ends up in hugetlbfs_file_mmap() for huge page files. */
    retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff);
out_fput:
    if (file)
        fput(file);
    return retval;
}

當使用SysV共享記憶體進行Huge Page對映時:

static int newseg(struct ipc_namespace *ns, struct ipc_params *params)
{
    key_t key = params->key;
    int shmflg = params->flg;
    size_t size = params->u.size;
    int error;
    struct shmid_kernel *shp;
    size_t numpages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
    struct file *file;
    char name[13];
    vm_flags_t acctflag = 0;
...
    sprintf(name, "SYSV%08x", key);
    if (shmflg & SHM_HUGETLB) {
        struct hstate *hs;
        size_t hugesize;

        hs = hstate_sizelog((shmflg >> SHM_HUGE_SHIFT) & SHM_HUGE_MASK);--------Huge Page頁面大小對齊。
        if (!hs) {
            error = -EINVAL;
            goto no_file;
        }
        hugesize = ALIGN(size, huge_page_size(hs));

        /* hugetlb_file_setup applies strict accounting */
        if (shmflg & SHM_NORESERVE)
            acctflag = VM_NORESERVE;
        file = hugetlb_file_setup(name, hugesize, acctflag,
                  &shp->mlock_user, HUGETLB_SHMFS_INODE,
                (shmflg >> SHM_HUGE_SHIFT) & SHM_HUGE_MASK);---------------------建立hugetlbfs檔案。
    } else {
...
    }
    error = PTR_ERR(file);
    if (IS_ERR(file))
        goto no_file;...
}

記憶體對映在do_shmat()中進行:

/* File operations used for SysV shm segments backed by huge pages. */
static const struct file_operations shm_file_operations_huge = {
    .mmap        = shm_mmap,
    .fsync        = shm_fsync,
    .release    = shm_release,
    .get_unmapped_area    = shm_get_unmapped_area,
    .llseek        = noop_llseek,
    .fallocate    = shm_fallocate,
};

/*
 * Excerpt: attach a SysV shared memory segment.  Huge page segments get
 * shm_file_operations_huge so that the mapping goes through shm_mmap()
 * into hugetlbfs.
 */
long do_shmat(int shmid, char __user *shmaddr, int shmflg,
          ulong *raddr, unsigned long shmlba)
{
    struct shmid_kernel *shp;
    unsigned long addr = (unsigned long)shmaddr;
...
    file = alloc_file(&path, f_mode,
              is_file_hugepages(shp->shm_file) ?
                &shm_file_operations_huge :
                &shm_file_operations);
...
    /* Maps the segment; for huge pages this calls shm_mmap(). */
    addr = do_mmap_pgoff(file, addr, size, prot, flags, 0, &populate, NULL);
...
}

2.4 HugePage缺頁異常

do_page_fault()是系統處理缺頁異常的入口,如果是Huge Page型別的缺頁異常則呼叫hugetlb_fault()進行處理。

do_page_fault
  ->__do_page_fault
    ->handle_mm_fault

/*
 * Excerpt: page fault dispatch.  Huge page VMAs are routed to
 * hugetlb_fault() instead of the normal __handle_mm_fault() path.
 */
int handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
        unsigned int flags)
{
...
    if (unlikely(is_vm_hugetlb_page(vma)))
        ret = hugetlb_fault(vma->vm_mm, vma, address, flags);
    else
        ret = __handle_mm_fault(vma, address, flags);
...
}

/*
 * Handle a fault on a huge page VMA: on a none PTE, instantiate the page
 * via hugetlb_no_page(); on a write fault against a read-only entry,
 * perform COW via hugetlb_cow().  Faults hashing to the same slot are
 * serialized through hugetlb_fault_mutex_table.
 */
int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
            unsigned long address, unsigned int flags)
{
    pte_t *ptep, entry;
    spinlock_t *ptl;
    int ret;
    u32 hash;
    pgoff_t idx;
    struct page *page = NULL;
    struct page *pagecache_page = NULL;
    struct hstate *h = hstate_vma(vma);
    struct address_space *mapping;
    int need_wait_lock = 0;

    /* Align the fault address down to the huge page boundary. */
    address &= huge_page_mask(h);

    ptep = huge_pte_offset(mm, address, huge_page_size(h));
    if (ptep) {
        entry = huge_ptep_get(ptep);
        if (unlikely(is_hugetlb_entry_migration(entry))) {
            migration_entry_wait_huge(vma, mm, ptep);
            return 0;
        } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
            return VM_FAULT_HWPOISON_LARGE |
                VM_FAULT_SET_HINDEX(hstate_index(h));
    } else {
        ptep = huge_pte_alloc(mm, address, huge_page_size(h));
        if (!ptep)
            return VM_FAULT_OOM;
    }

    mapping = vma->vm_file->f_mapping;
    idx = vma_hugecache_offset(h, vma, address);

    /*
     * Serialize hugepage allocation and instantiation, so that we don't
     * get spurious allocation failures if two CPUs race to instantiate
     * the same page in the page cache.
     */
    hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping, idx, address);
    mutex_lock(&hugetlb_fault_mutex_table[hash]);

    entry = huge_ptep_get(ptep);
    if (huge_pte_none(entry)) {
        ret = hugetlb_no_page(mm, vma, mapping, idx, address, ptep, flags);
        goto out_mutex;
    }

    ret = 0;

    /*
     * entry could be a migration/hwpoison entry at this point, so this
     * check prevents the kernel from going below assuming that we have
     * a active hugepage in pagecache. This goto expects the 2nd page fault,
     * and is_hugetlb_entry_(migration|hwpoisoned) check will properly
     * handle it.
     */
    if (!pte_present(entry))
        goto out_mutex;

    /*
     * If we are going to COW the mapping later, we examine the pending
     * reservations for this page now. This will ensure that any
     * allocations necessary to record that reservation occur outside the
     * spinlock. For private mappings, we also lookup the pagecache
     * page now as it is used to determine if a reservation has been
     * consumed.
     */
    if ((flags & FAULT_FLAG_WRITE) && !huge_pte_write(entry)) {
        if (vma_needs_reservation(h, vma, address) < 0) {
            ret = VM_FAULT_OOM;
            goto out_mutex;
        }
        /* Just decrements count, does not deallocate */
        vma_end_reservation(h, vma, address);

        if (!(vma->vm_flags & VM_MAYSHARE))
            pagecache_page = hugetlbfs_pagecache_page(h,
                                vma, address);
    }

    ptl = huge_pte_lock(h, mm, ptep);

    /* Check for a racing update before calling hugetlb_cow */
    if (unlikely(!pte_same(entry, huge_ptep_get(ptep))))
        goto out_ptl;

    /*
     * hugetlb_cow() requires page locks of pte_page(entry) and
     * pagecache_page, so here we need take the former one
     * when page != pagecache_page or !pagecache_page.
     */
    page = pte_page(entry);
    if (page != pagecache_page)
        if (!trylock_page(page)) {
            need_wait_lock = 1;
            goto out_ptl;
        }

    get_page(page);

    if (flags & FAULT_FLAG_WRITE) {
        if (!huge_pte_write(entry)) {
            ret = hugetlb_cow(mm, vma, address, ptep,
                      pagecache_page, ptl);
            goto out_put_page;
        }
        entry = huge_pte_mkdirty(entry);
    }
    entry = pte_mkyoung(entry);
    if (huge_ptep_set_access_flags(vma, address, ptep, entry,
                        flags & FAULT_FLAG_WRITE))
        update_mmu_cache(vma, address, ptep);
out_put_page:
    if (page != pagecache_page)
        unlock_page(page);
    put_page(page);
out_ptl:
    spin_unlock(ptl);

    if (pagecache_page) {
        unlock_page(pagecache_page);
        put_page(pagecache_page);
    }
out_mutex:
    mutex_unlock(&hugetlb_fault_mutex_table[hash]);
    /*
     * Generally it's safe to hold refcount during waiting page lock. But
     * here we just wait to defer the next page fault to avoid busy loop and
     * the page is not used after unlocked before returning from the current
     * page fault. So we are safe from accessing freed page, even if we wait
     * here without taking refcount.
     */
    if (need_wait_lock)
        wait_on_page_locked(page);
    return ret;
}

3. HugePage配置

獲取當前系統使用頁面大小命令:

getconf PAGESIZE

通過/proc/meminfo檢視HugePage資訊:

MemTotal:       16310112 kB
MemFree:          586168 kB
MemAvailable:   10733508 kB
...
HugePages_Total:       0---------------------HugePage池中大小。
HugePages_Free:        0---------------------HugePage池中未被分配HugePage數量。
HugePages_Rsvd:        0---------------------HugePage池中承諾被分配但還未執行分配操作的HugePage數量。
HugePages_Surp:        0---------------------HugePage池中超出/proc/sys/vm/nr_hugepages。最大不超過/proc/sys/vm/nr_overcommit_hugepages。
Hugepagesize:       2048 kB
...

核心可以在命令列設定hugepages和hugepagesz,分別表示HugePage頁面數量和頁面大小。當系統支援多種大小HugePage時,預設HugePage大小通過default_hugepagesz指定。

3.1 命令列引數

通過命令列分別設定nr_hugepages和default_hugepagesz,可以設定HugePage頁面數量和大小。以及x86_64特有的配置選項。

由於__setup()在initcall()之前執行,所以下面的命令都在hugetlb_init()之前執行。

/*
 * Parse the "hugepages=" command-line parameter.  __setup() handlers run
 * before initcalls, so this executes before hugetlb_init().
 */
static int __init hugetlb_nrpages_setup(char *s)
{
    unsigned long *mhp;
    static unsigned long *last_mhp;

    if (!parsed_valid_hugepagesz) {
        pr_warn("hugepages = %s preceded by "
            "an unsupported hugepagesz, ignoring\n", s);
        parsed_valid_hugepagesz = true;
        return 1;
    }
    /*
     * !hugetlb_max_hstate means we haven't parsed a hugepagesz= parameter yet,
     * so this hugepages= parameter goes to the "default hstate"
     * (default_hstate_max_huge_pages); otherwise the count applies to the
     * most recently parsed hstate (parsed_hstate->max_huge_pages).
     */
    else if (!hugetlb_max_hstate)
        mhp = &default_hstate_max_huge_pages;
    else
        mhp = &parsed_hstate->max_huge_pages;

    if (mhp == last_mhp) {
        pr_warn("hugepages= specified twice without interleaving hugepagesz=, ignoring\n");
        return 1;
    }

    /* Store the parsed page count. */
    if (sscanf(s, "%lu", mhp) <= 0)
        *mhp = 0;

    /*
     * Global state is always initialized later in hugetlb_init.
     * But we need to allocate >= MAX_ORDER hstates here early to still
     * use the bootmem allocator.
     */
    if (hugetlb_max_hstate && parsed_hstate->order >= MAX_ORDER)
        hugetlb_hstate_alloc_pages(parsed_hstate);

    last_mhp = mhp;

    return 1;
}
__setup("hugepages=", hugetlb_nrpages_setup);

/*
 * Parse "default_hugepagesz=" into default_hstate_size; this selects the
 * default huge page size when the system supports several sizes.
 */
static int __init hugetlb_default_setup(char *s)
{
    default_hstate_size = memparse(s, &s);
    return 1;
}
__setup("default_hugepagesz=", hugetlb_default_setup);

static __init int setup_hugepagesz(char *opt)
{
    unsigned long ps = memparse(opt, &opt);--------------x86_64特有屬性值。
    if (ps == PMD_SIZE) {
        hugetlb_add_hstate(PMD_SHIFT - PAGE_SHIFT);
    } else if (ps == PUD_SIZE && boot_cpu_has(X86_FEATURE_GBPAGES)) {
        hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT);
    } else {
        hugetlb_bad_size();
        printk(KERN_ERR "hugepagesz: Unsupported page size %lu M\n",
            ps >> 20);
        return 0;
    }
    return 1;
}
__setup("hugepagesz=", setup_hugepagesz);

3.2 配置節點

核心每一個不同尺寸的Huge Page都有一對應的目錄:/sys/kernel/mm/hugepages/hugepages-*kB/。包含如下檔案節點:

nr_hugepages
nr_hugepages_mempolicy
nr_overcommit_hugepages
free_hugepages
resv_hugepages
surplus_hugepages

nr_hugepages(讀寫)

系統執行起來後,可以通過/proc/sys/vm/nr_hugepages設定,系統根據實際情況分配或釋放HugePages。

當增加nr_hugepages之後,系統優先使用surplus中頁面。然後才會分配新的Huge Page來滿足需求。

當減小nr_hugepages,導致使用中頁面大於nr_hugepages時,將使用中頁面轉換成surplus頁面。

當減小nr_hugepages,導致使用中頁面大於nr_hugepages+nr_overcommit_hugepages時,同樣會將超出nr_hugepages的頁面轉成surplus頁面。直到nr_hugepages+nr_overcommit_hugepages足夠大,或者釋放足夠多的surplus頁面,否則不會繼續申請surplus頁面。

free_hugepages(只讀)

表示HugePage池中尚未被分配出去、可供使用的Huge Page頁面數量。

resv_hugepages(只讀)

表示已經被分配但是未被實際使用的Huge Page數量。

surplus_hugepages(只讀)

使用中的overcommit Huge Page頁面數量。

nr_overcommit_hugepages(讀寫)

系統允許的最大overcommit頁面數量。

表示當前系統可以從HugePage池中分配超過nr_hugepages數量的HugePage數。這些頁面被稱為surplus Huge Pages,當這些頁面不被使用後,可以被釋放返回給系統。

HugePage為2MB情況下,分別設定nr_hugepages=128、nr_overcommit_hugepages=128。分別通過mmap申請100MB、300MB、512MB後關係如下:

Item nr_hugepages free_hugepages resv_hugepages nr_overcommit_hugepages surplus_hugepages
初始值 128 128 0 128 0
100MB 128 78 0 128 0
300MB 150 0 0 128 22
512M 256 0 0 128 128

hugepages_treat_as_movable

分配的Huge Page頁面將具備__GFP_MOVABLE標誌。

4. HugePage優缺點

使用HugePage的優點:

  • 系統使用HugePage則記憶體頁數量會減少,從而需要更少的頁表,節約了頁表所佔用的記憶體數量
  • 所需的地址轉化也減少了,TLB快取失效的次數也減少了,從而提高記憶體訪問的效能
  • 地址轉換所需資訊一般儲存在CPU快取中,HugePage的使用讓地址轉換資訊減少,從而減少了CPU快取的使用。
  • HugePage頁面是不支援swap的,所以沒有page table lookups。所以大記憶體情況下,kswapd也不會頻繁被呼叫

當然HugePage在某些場景下也存在缺點:

  • 當申請一塊大記憶體,但是使用記憶體並不多,比如:每個2MB,寫4KB內容。使用HugePage就會導致實際佔用的實體記憶體相對於4KB頁面大很多。

5. HugePage使用

有兩種方式使用HugePage:mmap和SYSV共享記憶體,其中mmap可以有名對映或者匿名對映。

共享記憶體和mmap通過MAP_HUGETLB使用HugePage是不需要掛載HugePage檔案系統的。

5.1 mmap

5.1.1 mmap有名對映HugePage

掛載HugePage檔案系統

mount none /mnt/huge -t hugetlbfs 

使用如下程式碼建立huge/hugepagefile檔案,然後使用256M對映:

#include <stdlib.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/mman.h>
#include <fcntl.h>

#define FILE_NAME "huge/hugepagefile"
#define LENGTH (256UL*1024*1024)
#define PROTECTION (PROT_READ | PROT_WRITE)

/* Only ia64 requires this */
#ifdef __ia64__
#define ADDR (void *)(0x8000000000000000UL)
#define FLAGS (MAP_SHARED | MAP_FIXED)
#else
#define ADDR (void *)(0x0UL)
#define FLAGS (MAP_SHARED)
#endif

/* Print the first word of the mapping so before/after writes can be compared. */
static void check_bytes(char *addr)
{
    unsigned int *head = (unsigned int *)addr;

    printf("First hex is %x\n", *head);
}

/* Fill the whole mapping with a repeating 0..255 byte pattern. */
static void write_bytes(char *addr)
{
    unsigned long offset;

    for (offset = 0; offset < LENGTH; offset++)
        addr[offset] = (char)offset;
}

/*
 * Verify the pattern written by write_bytes().
 * Returns 0 on success, 1 on the first mismatch (after printing it).
 */
static int read_bytes(char *addr)
{
    unsigned long offset;

    check_bytes(addr);
    for (offset = 0; offset < LENGTH; offset++) {
        if (addr[offset] != (char)offset) {
            printf("Mismatch at %lu\n", offset);
            return 1;
        }
    }
    return 0;
}

/*
 * File-backed huge-page demo: create FILE_NAME on a hugetlbfs mount,
 * map LENGTH bytes of it, write and verify a byte pattern through the
 * mapping, then unmap, close and unlink the file.
 * Returns read_bytes()'s result (0 on success).
 */
int main(void)
{
    int fd;
    int ret;
    void *map;

    fd = open(FILE_NAME, O_CREAT | O_RDWR, 0755);
    if (fd < 0) {
        perror("Open failed");
        exit(1);
    }

    map = mmap(ADDR, LENGTH, PROTECTION, FLAGS, fd, 0);
    if (map == MAP_FAILED) {
        perror("mmap");
        unlink(FILE_NAME);
        exit(1);
    }

    printf("Returned address is %p\n", map);
    check_bytes(map);
    write_bytes(map);
    ret = read_bytes(map);

    /* Keep the mapping alive briefly so it can be inspected,
     * e.g. via /proc/<pid>/maps. */
    sleep(10);

    munmap(map, LENGTH);
    close(fd);
    unlink(FILE_NAME);

    return ret;
}

輸出如下:

Returned address is 0x7f2d8ba00000
First hex is 0
First hex is 3020100

檢視程序maps如下:

...
7f2d8ba00000-7f2d9ba00000 rw-s 00000000 00:9b 10940003                   /home/al/hugepage/huge/hugepagefile
...

檢視檔案系統:

-rwxr-xr-x 1 root root 258M 11月 29 23:06 hugepagefile

5.1.2 mmap匿名對映HugePage

程式碼如下:

#include <stdlib.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/mman.h>
#include <fcntl.h>

#define LENGTH (256UL*1024*1024)
#define PROTECTION (PROT_READ | PROT_WRITE)

#ifndef MAP_HUGETLB
#define MAP_HUGETLB 0x40000 /* arch specific */
#endif

/* Only ia64 requires this */
#ifdef __ia64__
#define ADDR (void *)(0x8000000000000000UL)
#define FLAGS (MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | MAP_FIXED)
#else
#define ADDR (void *)(0x0UL)
#define FLAGS (MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB)
#endif

/* Dump the first machine word at addr in hex — a quick sanity probe
 * showing whether the mapping has been written yet. */
static void check_bytes(char *addr)
{
    unsigned int first_word = *((unsigned int *)addr);

    printf("First hex is %x\n", first_word);
}

/* Fill LENGTH bytes starting at addr with a repeating 0..255 pattern
 * (each byte gets the low 8 bits of its offset). */
static void write_bytes(char *addr)
{
    unsigned long off;

    for (off = 0; off < LENGTH; off++)
        addr[off] = (char)off;
}

/* Verify the pattern laid down by write_bytes().
 * Returns 0 when all LENGTH bytes match, 1 at the first mismatch. */
static int read_bytes(char *addr)
{
    unsigned long off;

    check_bytes(addr);
    for (off = 0; off < LENGTH; off++) {
        if (addr[off] != (char)off) {
            printf("Mismatch at %lu\n", off);
            return 1;
        }
    }
    return 0;
}

/*
 * Anonymous huge-page demo: reserve LENGTH bytes of huge pages via
 * mmap(MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB), write and verify a
 * byte pattern through the mapping, then unmap.
 * Returns read_bytes()'s result (0 on success).
 */
int main(void)
{
    void *addr;
    int ret;

    /*
     * For MAP_ANONYMOUS the fd argument is ignored on Linux, but
     * mmap(2) says portable applications must pass -1 (some
     * implementations require it).  Passing 0 — as the original did —
     * hands mmap a valid descriptor (stdin) and is non-portable.
     */
    addr = mmap(ADDR, LENGTH, PROTECTION, FLAGS, -1, 0);
    if (addr == MAP_FAILED) {
        perror("mmap");
        exit(1);
    }

    printf("Returned address is %p\n", addr);
    check_bytes(addr);
    write_bytes(addr);
    ret = read_bytes(addr);
    /* Keep the mapping alive briefly so it can be inspected,
     * e.g. via /proc/<pid>/maps. */
    sleep(10);
    /* munmap() length of MAP_HUGETLB memory must be hugepage aligned */
    if (munmap(addr, LENGTH)) {
        perror("munmap");
        exit(1);
    }

    return ret;
}

輸出如下:

Returned address is 0x7ff600200000
First hex is 0
First hex is 3020100

檢視程序的maps如下:

...
7ff600200000-7ff610200000 rw-p 00000000 00:0f 10940472                   /anon_hugepage (deleted)
...

5.2 SYSV共享記憶體使用HugePage

程式碼如下:

#include <stdlib.h>
#include <stdio.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/ipc.h>
#include <sys/shm.h>
#include <sys/mman.h>

#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif

#define LENGTH (256UL*1024*1024)

#define dprintf(x)  printf(x)

/* Only ia64 requires this */
#ifdef __ia64__
#define ADDR (void *)(0x8000000000000000UL)
#define SHMAT_FLAGS (SHM_RND)
#else
#define ADDR (void *)(0x0UL)
#define SHMAT_FLAGS (0)
#endif

/*
 * SYSV shared-memory huge-page demo: create a LENGTH-byte segment with
 * SHM_HUGETLB, attach it, write and verify a byte pattern, then detach
 * and remove the segment.  Exits non-zero on any failure.
 */
int main(void)
{
    int shmid;
    char *shmaddr;
    unsigned long off;

    /* Create (or look up) a huge-page-backed segment under a fixed key. */
    shmid = shmget(0x12345678, LENGTH, SHM_HUGETLB | IPC_CREAT | SHM_R | SHM_W);
    if (shmid < 0) {
        perror("shmget");
        exit(1);
    }
    printf("shmid: %d\n", shmid);

    shmaddr = shmat(shmid, ADDR, SHMAT_FLAGS);
    if (shmaddr == (char *)-1) {
        perror("Shared memory attach failure");
        shmctl(shmid, IPC_RMID, NULL);
        exit(2);
    }
    printf("shmaddr: %p\n", shmaddr);

    dprintf("Starting the writes:\n");
    for (off = 0; off < LENGTH; off++) {
        shmaddr[off] = (char)(off);
        /* Emit one progress dot per MB written. */
        if (!(off % (1024 * 1024)))
            dprintf(".");
    }
    dprintf("\n");

    dprintf("Starting the Check...");
    for (off = 0; off < LENGTH; off++) {
        if (shmaddr[off] != (char)off) {
            printf("\nIndex %lu mismatched\n", off);
            exit(3);
        }
    }
    dprintf("Done.\n");
    /* Keep the segment attached briefly so it can be inspected. */
    sleep(10);

    if (shmdt((const void *)shmaddr) != 0) {
        perror("Detach failure");
        shmctl(shmid, IPC_RMID, NULL);
        exit(4);
    }

    /* Mark the segment for removal once all attachments are gone. */
    shmctl(shmid, IPC_RMID, NULL);

    return 0;
}

執行結果如下:

shmid: 32407590
shmaddr: 0x7f1fc2c00000
Starting the writes:
................................................................................................................................................................................................................................................................
Starting the Check...Done.

檢視程序maps如下:

...
7f1fc2c00000-7f1fd2c00000 rw-s 00000000 00:0f 32407590                   /SYSV12345678 (deleted)
...

共享記憶體使用情況如下:

------ Shared Memory Segments --------
key        shmid      owner      perms      bytes      nattch     status      
...        
0x12345678 32407590   root       600        268435456  1                      

5.3 libhugetlbfs做迴歸測試

原始碼:

https://github.com/libhugetlbfs/libhugetlbfs

《HOWTO》對libhugetlbfs做了詳細介紹:

  1. 通過libhugetlbfs對使用hugetlbfs提供了一套方便的應用程式介面;使用libhugetlbfs替代目前庫中malloc()函式,使記憶體分配在HugePage上進行;libhugetlbfs能使程序text/data/bss段在HugePage上分配。
  2. 支援libhugetlbfs的硬體、核心、工具鏈、配置。
  3. 如何編譯安裝libhugetlbfs。
  4. 如何使用libhugetlbfs:替代malloc()、共享記憶體、程序text/data/bss段。

安裝libhugetlbfs:

sudo apt-get install libhugetlbfs libhugetlbfs-tests

建立掛載點:

sudo mount none /home/al/hugepage/huge -t hugetlbfs

使用hugeadm檢視掛載情況:

hugeadm --list-all-mounts:
libhugetlbfs: ERROR: Line too long when parsing mounts
Mount Point            Options
/dev/hugepages         rw,relatime,pagesize=2M
/home/al/hugepage/huge rw,relatime,pagesize=2M
hugeadm --pool-list: libhugetlbfs: ERROR: Line too long when parsing mounts Size Minimum Current Maximum Default 2097152 512 512 512 * 1073741824 0 0 0

使用如下指令碼進行測試:

sudo /usr/lib/libhugetlbfs/tests/run_tests.py

執行結果如下:

run_tests.py: The 32 bit word size is not compatible with 2M pages
zero_filesize_segment (2M: 64):    PASS
test_root (2M: 64):    PASS
meminfo_nohuge (2M: 64):    PASS
gethugepagesize (2M: 64):    PASS
gethugepagesizes (2M: 64):    PASS
HUGETLB_VERBOSE=1 empty_mounts (2M: 64):    PASS
HUGETLB_VERBOSE=1 large_mounts (2M: 64):    PASS
find_path (2M: 64):    PASS
unlinked_fd (2M: 64):    PASS
readback (2M: 64):    PASS
truncate (2M: 64):    PASS
shared (2M: 64):    PASS
mprotect (2M: 64):    PASS
mlock (2M: 64):    PASS
misalign (2M: 64):    PASS
ptrace-write-hugepage (2M: 64):    PASS
icache-hygiene (2M: 64):    PASS
slbpacaflush (2M: 64):    PASS (inconclusive)
straddle_4GB_static (2M: 64):    PASS
huge_at_4GB_normal_below_static (2M: 64):    PASS
huge_below_4GB_normal_above_static (2M: 64):    PASS
map_high_truncate_2 (2M: 64):    PASS
misaligned_offset (2M: 64):    PASS (inconclusive)
truncate_above_4GB (2M: 64):    PASS
brk_near_huge (2M: 64):    brk_near_huge: malloc.c:2401: sysmalloc: Assertion `(old_top == initial_top (av) && old_size == 0) || ((unsigned long) (old_size) >= MINSIZE && prev_inuse (old_top) && ((unsigned long) old_end & (pagesize - 1)) == 0)' failed.
...

6. 參考文件

Linux hugepage使用與實現》《Hugepage介紹以及實踐》《hugepage總結》《hugepage的優勢與使用》 《HugeTLB Pages》 《Linux Hugetlbfs核心原始碼簡析-----(一)Hugetlbfs初始化》《Linux Hugetlbfs核心原始碼簡析-----(二)Hugetlbfs掛載

HugePage對映的頁面是否物理連續?