1. 程式人生 > >深入淺出記憶體管理--記憶體節點(Node)

深入淺出記憶體管理--記憶體節點(Node)

本文以Linux核心4.9來做介紹。

Node 結構體

核心中的節點是使用一個結構體struct pglist_data來進行管理的,它的組成如下所示,本文只會列出幾個關鍵成員,其餘成員待遇到時再做解釋:

 /*
  * Per-node memory descriptor, also available through the pg_data_t typedef.
  * Which members exist depends on the CONFIG_* options tested below.
  */
 typedef struct pglist_data {
     /* Descriptors of all zones (ZONE) that belong to this node. */
     struct zone node_zones[MAX_NR_ZONES];
     /*
      * Zone scan orders used by the page allocator; one entry per
      * allocation policy, MAX_ZONELISTS entries in total.
      */
     struct zonelist node_zonelists[MAX_ZONELISTS];
     /* Number of zones present in this node, at most MAX_NR_ZONES. */
     int nr_zones;
 #ifdef CONFIG_FLAT_NODE_MEM_MAP /* means !SPARSEMEM */
     /* Array of page descriptors for this node. */
     struct page *node_mem_map;
 #ifdef CONFIG_PAGE_EXTENSION
     struct page_ext *node_page_ext;
 #endif
 #endif
 #ifndef CONFIG_NO_BOOTMEM
     struct bootmem_data *bdata;
 #endif
 #ifdef CONFIG_MEMORY_HOTPLUG
     /*
      * Must be held any time you expect node_start_pfn, node_present_pages
      * or node_spanned_pages stay constant.  Holding this will also
      * guarantee that any pfn_valid() stays that way.
      *
      * pgdat_resize_lock() and pgdat_resize_unlock() are provided to
      * manipulate node_size_lock without checking for CONFIG_MEMORY_HOTPLUG.
      *
      * Nests above zone->lock and zone->span_seqlock
      */
     spinlock_t node_size_lock;
 #endif
     /* First page frame number of this node. */
     unsigned long node_start_pfn;
     unsigned long node_present_pages; /* total number of physical pages */
     unsigned long node_spanned_pages; /* total size of physical page
                          range, including holes */
     /* Identifier of this node. */
     int node_id;
     wait_queue_head_t kswapd_wait;
     wait_queue_head_t pfmemalloc_wait;
     struct task_struct *kswapd; /* Protected by
                        mem_hotplug_begin/end() */
     int kswapd_order;
     enum zone_type kswapd_classzone_idx;
 
     int kswapd_failures;        /* Number of 'reclaimed == 0' runs */
 
 #ifdef CONFIG_COMPACTION
     int kcompactd_max_order;
     enum zone_type kcompactd_classzone_idx;
     wait_queue_head_t kcompactd_wait;
     struct task_struct *kcompactd;
 #endif
 #ifdef CONFIG_NUMA_BALANCING
     /* Lock serializing the migrate rate limiting window */
     spinlock_t numabalancing_migrate_lock;

     /* Rate limiting time interval */
     unsigned long numabalancing_migrate_next_window;

     /* Number of pages migrated during the rate limiting time interval */
     unsigned long numabalancing_migrate_nr_pages;
 #endif
     /*
      * This is a per-node reserve of pages that are not available
      * to userspace allocations.
      */
     unsigned long       totalreserve_pages;

 #ifdef CONFIG_NUMA
     /*
      * zone reclaim becomes active if more unmapped pages exist.
      */
     unsigned long       min_unmapped_pages;
     unsigned long       min_slab_pages;
 #endif /* CONFIG_NUMA */

     /* Write-intensive fields used by page reclaim */
     ZONE_PADDING(_pad1_)
     spinlock_t      lru_lock;

 #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
     /*
      * If memory initialisation on large machines is deferred then this
      * is the first PFN that needs to be initialised.
      */
     unsigned long first_deferred_pfn;
     /* Number of non-deferred pages */
     unsigned long static_init_pgcnt;
 #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */

 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
     spinlock_t split_queue_lock;
     struct list_head split_queue;
     unsigned long split_queue_len;
 #endif

     /* Fields commonly accessed by the page reclaim scanner */
     struct lruvec       lruvec;
     /*
      * The target ratio of ACTIVE_ANON to INACTIVE_ANON pages on
      * this node's LRU.  Maintained by the pageout code.
      */
     unsigned int inactive_ratio;

     unsigned long       flags;

     ZONE_PADDING(_pad2_)

     /* Per-node vmstats */
     struct per_cpu_nodestat __percpu *per_cpu_nodestats;
     atomic_long_t       vm_stat[NR_VM_NODE_STAT_ITEMS];
 } pg_data_t;

  • node_zones[MAX_NR_ZONES];
    該節點中所有管理區(ZONE)的描述符陣列。
  • node_zonelists[MAX_ZONELISTS];
    頁分配器使用的結構體陣列,頁分配器會根據不同的GFP申請標誌來按照不同的順序掃描對應節點中的ZONE,而該結構體就是用於定製不同的順序。
 /*
  * Indices for the node_zonelists[] array of struct pglist_data.
  * MAX_ZONELISTS is the number of zonelists per node: 1 when CONFIG_NUMA
  * is not defined, 2 when it is.
  */
 enum {
    ZONELIST_FALLBACK,  /* zonelist with fallback */
#ifdef CONFIG_NUMA
    /*
     * The NUMA zonelists are doubled because we need zonelists that
     * restrict the allocations to a single node for __GFP_THISNODE.
     */
    ZONELIST_NOFALLBACK,    /* zonelist without fallback (__GFP_THISNODE) */
#endif
    MAX_ZONELISTS
};

如上所示,支援的分配方式最多有兩種:ZONELIST_FALLBACK和ZONELIST_NOFALLBACK(後者僅在開啟CONFIG_NUMA時才存在)。在不同的分配方式下,各個ZONE的優先順序可能不同,而這個陣列會為每一種分配方式記錄下對應的ZONE優先順序,這就是該成員存在的意義。

  • nr_zones
    節點中存在的管理區數目,最大為MAX_NR_ZONES。
  • node_mem_map
    節點中頁描述符陣列。
  • node_id
    節點的id。
  • node_start_pfn
    節點中物理頁的起始頁框號(PFN)。

系統中Node的定義

我們以ARM64平臺為例,它對系統中所有Node的定義:

 /*
  * Table of pointers to the per-node descriptors, indexed by node id.
  * Entries are filled in by setup_node_data().
  */
 struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
 EXPORT_SYMBOL(node_data);
 
 
 /*
  * One entry per CPU, each initialised to NUMA_NO_NODE.
  * NOTE(review): presumably maps a CPU number to its node id - confirm
  * against the code that fills it in.
  */
 static int cpu_to_node_map[NR_CPUS] = { [0 ... NR_CPUS-1] = NUMA_NO_NODE };
 

這裡定義了一個結構體指標陣列,並作為全域性變數來使用,這個陣列的內容是需要系統啟動時進行初始化和填充的:

/**
 * setup_node_data - allocate and initialise the pg_data_t of one node
 * @nid: id of the node to set up
 * @start_pfn: first page frame number of the node
 * @end_pfn: page frame number just past the last page frame of the node
 *
 * Obtains room for a pg_data_t (size rounded up to SMP_CACHE_BYTES) through
 * memblock_alloc_try_nid() for @nid, stores its virtual address in
 * node_data[@nid], zeroes the descriptor and records the node id, the start
 * PFN and the number of spanned pages. The node's memory range and the
 * location of the descriptor are logged along the way.
 */
static void __init setup_node_data(int nid, u64 start_pfn, u64 end_pfn)
{
    const size_t pgdat_bytes = roundup(sizeof(pg_data_t), SMP_CACHE_BYTES);
    pg_data_t *pgdat;
    void *pgdat_virt;
    u64 pgdat_phys;
    int backing_nid;

    /* An empty PFN range is reported as a memory-less node. */
    if (start_pfn >= end_pfn)
        pr_info("Initmem setup node %d [<memory-less node>]\n", nid);
    else
        pr_info("Initmem setup node %d [mem %#010Lx-%#010Lx]\n", nid,
            start_pfn << PAGE_SHIFT, (end_pfn << PAGE_SHIFT) - 1);

    pgdat_phys = memblock_alloc_try_nid(pgdat_bytes, SMP_CACHE_BYTES, nid);
    pgdat_virt = __va(pgdat_phys);

    /* Log the physical range that now holds the descriptor. */
    pr_info("NODE_DATA [mem %#010Lx-%#010Lx]\n",
        pgdat_phys, pgdat_phys + pgdat_bytes - 1);

    /* Mention it when the descriptor lives on a node other than @nid. */
    backing_nid = early_pfn_to_nid(pgdat_phys >> PAGE_SHIFT);
    if (backing_nid != nid)
        pr_info("NODE_DATA(%d) on node %d\n", nid, backing_nid);

    /* Publish the descriptor first, then fill it in from a zeroed state. */
    node_data[nid] = pgdat_virt;
    pgdat = NODE_DATA(nid);
    memset(pgdat, 0, sizeof(*pgdat));
    pgdat->node_id = nid;
    pgdat->node_start_pfn = start_pfn;
    pgdat->node_spanned_pages = end_pfn - start_pfn;
}

setup_node_data是在如下函式中被呼叫的:

/*
 * Check the node id stored on every memblock memory region, then set up and
 * mark online each node found in numa_nodes_parsed.
 *
 * Returns 0 on success, or -EINVAL as soon as a memory region is found whose
 * nid is NUMA_NO_NODE or is not below MAX_NUMNODES.
 */
static int __init numa_register_nodes(void)
{
    struct memblock_region *region;
    int node;

    /* Every memory region must carry a usable node id. */
    for_each_memblock(memory, region) {
        if (region->nid == NUMA_NO_NODE || region->nid >= MAX_NUMNODES) {
            pr_warn("Warning: invalid memblk node %d [mem %#010Lx-%#010Lx]\n",
                region->nid, region->base,
                region->base + region->size - 1);
            return -EINVAL;
        }
    }

    /* Build the descriptor of each parsed node and bring the node online. */
    for_each_node_mask(node, numa_nodes_parsed) {
        unsigned long first_pfn, limit_pfn;

        get_pfn_range_for_nid(node, &first_pfn, &limit_pfn);
        setup_node_data(node, first_pfn, limit_pfn);
        node_set_online(node);
    }

    /* The possible-node map becomes exactly the set of parsed nodes. */
    node_possible_map = numa_nodes_parsed;

    return 0;
}

進一步跟下去會發現它是在void __init arm64_numa_init(void)裡面進行層層呼叫下來的。具體我們不做分析了。

單一節點

對於單一節點的系統來說,系統中只有一個node描述符,定義如下:

#ifndef CONFIG_NEED_MULTIPLE_NODES
/*
 * Node descriptor defined only when CONFIG_NEED_MULTIPLE_NODES is not set.
 * Its bdata member points at the first element of bootmem_node_data[]; all
 * other members are left to static zero-initialisation.
 */
struct pglist_data __refdata contig_page_data = {
    .bdata = &bootmem_node_data[0]
};
EXPORT_SYMBOL(contig_page_data);
#endif