深入淺出記憶體管理--記憶體節點(Node)
阿新 • • 發佈:2018-12-16
本文以Linux核心4.9來做介紹。
Node 結構體
核心中的節點是使用一個結構體struct pglist_data來進行管理的,它的組成如下所示,本文只會列出幾個關鍵成員,其餘成員待遇到時在做解釋:
typedef struct pglist_data { struct zone node_zones[MAX_NR_ZONES]; struct zonelist node_zonelists[MAX_ZONELISTS]; int nr_zones; #ifdef CONFIG_FLAT_NODE_MEM_MAP /* means !SPARSEMEM */ struct page *node_mem_map; #ifdef CONFIG_PAGE_EXTENSION struct page_ext *node_page_ext; #endif #endif #ifndef CONFIG_NO_BOOTMEM struct bootmem_data *bdata; #endif #ifdef CONFIG_MEMORY_HOTPLUG /* * Must be held any time you expect node_start_pfn, node_present_pages * or node_spanned_pages stay constant. Holding this will also * guarantee that any pfn_valid() stays that way. * * pgdat_resize_lock() and pgdat_resize_unlock() are provided to * manipulate node_size_lock without checking for CONFIG_MEMORY_HOTPLUG. * * Nests above zone->lock and zone->span_seqlock */ spinlock_t node_size_lock; #endif unsigned long node_start_pfn; unsigned long node_present_pages; /* total number of physical pages */ unsigned long node_spanned_pages; /* total size of physical page range, including holes */ int node_id; wait_queue_head_t kswapd_wait; wait_queue_head_t pfmemalloc_wait; struct task_struct *kswapd; /* Protected by mem_hotplug_begin/end() */ int kswapd_order; enum zone_type kswapd_classzone_idx; int kswapd_failures; /* Number of 'reclaimed == 0' runs */ #ifdef CONFIG_COMPACTION int kcompactd_max_order; enum zone_type kcompactd_classzone_idx; wait_queue_head_t kcompactd_wait; struct task_struct *kcompactd; #endif #ifdef CONFIG_NUMA_BALANCING /* Lock serializing the migrate rate limiting window */ spinlock_t numabalancing_migrate_lock; /* Rate limiting time interval */ unsigned long numabalancing_migrate_next_window; /* Number of pages migrated during the rate limiting time interval */ unsigned long numabalancing_migrate_nr_pages; #endif /* * This is a per-node reserve of pages that are not available * to userspace allocations. */ unsigned long totalreserve_pages; #ifdef CONFIG_NUMA /* * zone reclaim becomes active if more unmapped pages exist. */ unsigned long min_unmapped_pages; unsigned long min_slab_pages; #endif /* CONFIG_NUMA */ /* Write-intensive fields used by page reclaim */ ZONE_PADDING(_pad1_) spinlock_t lru_lock; #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT /* * If memory initialisation on large machines is deferred then this * is the first PFN that needs to be initialised. */ unsigned long first_deferred_pfn; /* Number of non-deferred pages */ unsigned long static_init_pgcnt; #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ #ifdef CONFIG_TRANSPARENT_HUGEPAGE spinlock_t split_queue_lock; struct list_head split_queue; unsigned long split_queue_len; #endif /* Fields commonly accessed by the page reclaim scanner */ struct lruvec lruvec; /* * The target ratio of ACTIVE_ANON to INACTIVE_ANON pages on * this node's LRU. Maintained by the pageout code. */ unsigned int inactive_ratio; unsigned long flags; ZONE_PADDING(_pad2_) /* Per-node vmstats */ struct per_cpu_nodestat __percpu *per_cpu_nodestats; atomic_long_t vm_stat[NR_VM_NODE_STAT_ITEMS]; } pg_data_t;
- node_zones[MAX_NR_ZONES];
該節點中所有管理區(ZONE)的描述符陣列, - node_zonelists[MAX_ZONELISTS];
頁分配器使用的結構體陣列,頁分配器會根據不同的GFP申請標誌來按照不同的順序掃描對應節點中的ZONE,而該結構體就是用於定製不同的順序。
enum { ZONELIST_FALLBACK, /* zonelist with fallback */ #ifdef CONFIG_NUMA /* * The NUMA zonelists are doubled because we need zonelists that * restrict the allocations to a single node for __GFP_THISNODE. */ ZONELIST_NOFALLBACK, /* zonelist without fallback (__GFP_THISNODE) */ #endif MAX_ZONELISTS };
如上所示支援的分配方式有兩種,ZONELIST_FALLBACK和ZONELIST_NOFALLBACK,那麼根據不同的分配方式,對於ZONE的優先順序可能是不同的,這個陣列可以記錄下來不同策略對應的優先順序,所以這就是該成員存在的意義。
- nr_zones
節點中存在的管理區數目,最大為MAX_NR_ZONES。 - node_mem_map
節點中頁描述符陣列。 - node_id
節點的id。 - node_start_pfn
節點中的物理頁其實頁框。
系統中Node的定義
我們以ARM64平臺為例,它對系統中所有Node的定義:
struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; EXPORT_SYMBOL(node_data); static int cpu_to_node_map[NR_CPUS] = { [0 ... NR_CPUS-1] = NUMA_NO_NODE };
這裡定義了一個結構體指標陣列,並作為全域性變數來使用,這個陣列的內容是需要系統啟動時進行初始化和填充的:
/**
* Initialize NODE_DATA for a node on the local memory
*/
static void __init setup_node_data(int nid, u64 start_pfn, u64 end_pfn)
{
const size_t nd_size = roundup(sizeof(pg_data_t), SMP_CACHE_BYTES);
u64 nd_pa;
void *nd;
int tnid;
if (start_pfn < end_pfn)
pr_info("Initmem setup node %d [mem %#010Lx-%#010Lx]\n", nid,
start_pfn << PAGE_SHIFT, (end_pfn << PAGE_SHIFT) - 1);
else
pr_info("Initmem setup node %d [<memory-less node>]\n", nid);
nd_pa = memblock_alloc_try_nid(nd_size, SMP_CACHE_BYTES, nid);
nd = __va(nd_pa);
/* report and initialize */
pr_info("NODE_DATA [mem %#010Lx-%#010Lx]\n",
nd_pa, nd_pa + nd_size - 1);
tnid = early_pfn_to_nid(nd_pa >> PAGE_SHIFT);
if (tnid != nid)
pr_info("NODE_DATA(%d) on node %d\n", nid, tnid);
node_data[nid] = nd;
memset(NODE_DATA(nid), 0, sizeof(pg_data_t));
NODE_DATA(nid)->node_id = nid;
NODE_DATA(nid)->node_start_pfn = start_pfn;
NODE_DATA(nid)->node_spanned_pages = end_pfn - start_pfn;
}
setup_node_data是在如下函式中被呼叫的:
static int __init numa_register_nodes(void)
{
int nid;
struct memblock_region *mblk;
/* Check that valid nid is set to memblks */
for_each_memblock(memory, mblk)
if (mblk->nid == NUMA_NO_NODE || mblk->nid >= MAX_NUMNODES) {
pr_warn("Warning: invalid memblk node %d [mem %#010Lx-%#010Lx]\n",
mblk->nid, mblk->base,
mblk->base + mblk->size - 1);
return -EINVAL;
}
/* Finally register nodes. */
for_each_node_mask(nid, numa_nodes_parsed) {
unsigned long start_pfn, end_pfn;
get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
setup_node_data(nid, start_pfn, end_pfn);
node_set_online(nid);
}
/* Setup online nodes to actual nodes*/
node_possible_map = numa_nodes_parsed;
return 0;
}
進一步跟下去會發現它是在void __init arm64_numa_init(void)裡面進行層層呼叫下來的。具體我們不做分析了。
單一節點
對於單一節點的系統來說,系統中只有一個node描述符,定義如下:
#ifndef CONFIG_NEED_MULTIPLE_NODES
struct pglist_data __refdata contig_page_data = {
.bdata = &bootmem_node_data[0]
};
EXPORT_SYMBOL(contig_page_data);
#endif