Linux abstracts physical memory in layers, each layer managed by its own data structure: memory is first organized as a set of nodes, each node is divided into a set of zones, and each zone manages a set of pages.
A node is described by the pglist_data structure:
typedef struct pglist_data {
	struct zone node_zones[MAX_NR_ZONES];
	struct zonelist node_zonelists[MAX_ZONELISTS];
	int nr_zones;
#ifdef CONFIG_FLAT_NODE_MEM_MAP	/* means !SPARSEMEM */
	struct page *node_mem_map;
#ifdef CONFIG_CGROUP_MEM_RES_CTLR
	struct page_cgroup *node_page_cgroup;
#endif
#endif
	struct bootmem_data *bdata;
#ifdef CONFIG_MEMORY_HOTPLUG
	/*
	 * Must be held any time you expect node_start_pfn, node_present_pages
	 * or node_spanned_pages stay constant.  Holding this will also
	 * guarantee that any pfn_valid() stays that way.
	 *
	 * Nests above zone->lock and zone->size_seqlock.
	 */
	spinlock_t node_size_lock;
#endif
	unsigned long node_start_pfn;
	unsigned long node_present_pages; /* total number of physical pages */
	unsigned long node_spanned_pages; /* total size of physical page
					     range, including holes */
	int node_id;
	wait_queue_head_t kswapd_wait;
	struct task_struct *kswapd;
	int kswapd_max_order;
} pg_data_t;
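To make the node → zone hierarchy concrete, here is a minimal sketch (my own illustration, not kernel source; it assumes a kernel-module context built against headers of the same ~2.6.3x era as the structures quoted here) that walks every online node and prints its populated zones:

#include <linux/mmzone.h>
#include <linux/nodemask.h>
#include <linux/kernel.h>

/* Walk every online node and dump the populated zones it contains,
 * following the node -> zone hierarchy described above. */
static void dump_node_zones(void)
{
	int nid;

	for_each_online_node(nid) {
		pg_data_t *pgdat = NODE_DATA(nid);
		int i;

		printk(KERN_INFO "node %d: start pfn %lu, %lu present pages\n",
		       pgdat->node_id, pgdat->node_start_pfn,
		       pgdat->node_present_pages);

		for (i = 0; i < MAX_NR_ZONES; i++) {
			struct zone *zone = &pgdat->node_zones[i];

			if (!populated_zone(zone))
				continue;
			printk(KERN_INFO "  zone %-8s: %lu pages\n",
			       zone->name, zone->present_pages);
		}
	}
}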
A zone is described by struct zone:
struct zone {
	/* Fields commonly accessed by the page allocator */
	unsigned long pages_min, pages_low, pages_high;
	/*
	 * We don't know if the memory that we're going to allocate will be freeable
	 * or/and it will be released eventually, so to avoid totally wasting several
	 * GB of ram we must reserve some of the lower zone memory (otherwise we risk
	 * to run OOM on the lower zones despite there's tons of freeable ram
	 * on the higher zones). This array is recalculated at runtime if the
	 * sysctl_lowmem_reserve_ratio sysctl changes.
	 */
	unsigned long lowmem_reserve[MAX_NR_ZONES];

#ifdef CONFIG_NUMA
	int node;
	/*
	 * zone reclaim becomes active if more unmapped pages exist.
	 */
	unsigned long min_unmapped_pages;
	unsigned long min_slab_pages;
	struct per_cpu_pageset *pageset[NR_CPUS];
#else
	struct per_cpu_pageset pageset[NR_CPUS];
#endif
	/*
	 * free areas of different sizes
	 */
	spinlock_t lock;
#ifdef CONFIG_MEMORY_HOTPLUG
	/* see spanned/present_pages for more description */
	seqlock_t span_seqlock;
#endif
	struct free_area free_area[MAX_ORDER];

#ifndef CONFIG_SPARSEMEM
	/*
	 * Flags for a pageblock_nr_pages block. See pageblock-flags.h.
	 * In SPARSEMEM, this map is stored in struct mem_section
	 */
	unsigned long *pageblock_flags;
#endif /* CONFIG_SPARSEMEM */

	ZONE_PADDING(_pad1_)

	/* Fields commonly accessed by the page reclaim scanner */
	spinlock_t lru_lock;
	struct {
		struct list_head list;
		unsigned long nr_scan;
	} lru[NR_LRU_LISTS];

	struct zone_reclaim_stat reclaim_stat;

	unsigned long pages_scanned;	/* since last reclaim */
	unsigned long flags;		/* zone flags, see below */

	/* Zone statistics */
	atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS];

	/*
	 * prev_priority holds the scanning priority for this zone. It is
	 * defined as the scanning priority at which we achieved our reclaim
	 * target at the previous try_to_free_pages() or balance_pgdat()
	 * invokation.
	 *
	 * We use prev_priority as a measure of how much stress page reclaim is
	 * under - it drives the swappiness decision: whether to unmap mapped
	 * pages.
	 *
	 * Access to both this field is quite racy even on uniprocessor. But
	 * it is expected to average out OK.
	 */
	int prev_priority;

	/*
	 * The target ratio of ACTIVE_ANON to INACTIVE_ANON pages on
	 * this zone's LRU. Maintained by the pageout code.
	 */
	unsigned int inactive_ratio;

	ZONE_PADDING(_pad2_)
	/* Rarely used or read-mostly fields */

	/*
	 * wait_table			-- the array holding the hash table
	 * wait_table_hash_nr_entries	-- the size of the hash table array
	 * wait_table_bits		-- wait_table_size == (1 << wait_table_bits)
	 *
	 * The purpose of all these is to keep track of the people
	 * waiting for a page to become available and make them
	 * runnable again when possible. The trouble is that this
	 * consumes a lot of space, especially when so few things
	 * wait on pages at a given time. So instead of using
	 * per-page waitqueues, we use a waitqueue hash table.
	 *
	 * The bucket discipline is to sleep on the same queue when
	 * colliding and wake all in that wait queue when removing.
	 * When something wakes, it must check to be sure its page is
	 * truly available, a la thundering herd. The cost of a
	 * collision is great, but given the expected load of the
	 * table, they should be so rare as to be outweighed by the
	 * benefits from the saved space.
	 *
	 * __wait_on_page_locked() and unlock_page() in mm/filemap.c, are the
	 * primary users of these fields, and in mm/page_alloc.c
	 * free_area_init_core() performs the initialization of them.
	 */
	wait_queue_head_t *wait_table;
	unsigned long wait_table_hash_nr_entries;
	unsigned long wait_table_bits;

	/*
	 * Discontig memory support fields.
	 */
	struct pglist_data *zone_pgdat;
	/* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */
	unsigned long zone_start_pfn;

	/*
	 * zone_start_pfn, spanned_pages and present_pages are all
	 * protected by span_seqlock. It is a seqlock because it has
	 * to be read outside of zone->lock, and it is done in the main
	 * allocator path. But, it is written quite infrequently.
	 *
	 * The lock is declared along with zone->lock because it is
	 * frequently read in proximity to zone->lock. It's good to
	 * give them a chance of being in the same cacheline.
	 */
	unsigned long spanned_pages;	/* total size, including holes */
	unsigned long present_pages;	/* amount of memory (excluding holes) */

	/*
	 * rarely used fields:
	 */
	const char *name;
};
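As an illustration of how a zone's fields hang together, the following sketch (again my own, not kernel code) sums the buddy free lists in free_area[] under zone->lock and uses zone_pgdat/zone_idx() to report which node and zone index the zone belongs to:

#include <linux/mmzone.h>
#include <linux/spinlock.h>
#include <linux/kernel.h>

/* Illustrative sketch: count roughly how many free pages a zone holds by
 * summing its per-order buddy lists, then report the zone's index within
 * its node via zone_idx() and the owning node via zone_pgdat. */
static unsigned long zone_free_pages_sketch(struct zone *zone)
{
	unsigned long nr_free = 0;
	unsigned int order;
	unsigned long flags;

	spin_lock_irqsave(&zone->lock, flags);
	for (order = 0; order < MAX_ORDER; order++)
		nr_free += zone->free_area[order].nr_free << order;
	spin_unlock_irqrestore(&zone->lock, flags);

	printk(KERN_INFO "zone %s (idx %d) on node %d: ~%lu free pages\n",
	       zone->name, (int)zone_idx(zone),
	       zone->zone_pgdat->node_id, nr_free);
	return nr_free;
}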
1. Structures used to build the fallback (memory-borrowing) lists
The central structure here is struct zonelist, which is what the node_zonelists[] array in pg_data_t contains:
/*
 * One allocation request operates on a zonelist. A zonelist
 * is a list of zones, the first one is the 'goal' of the
 * allocation, the other zones are fallback zones, in decreasing
 * priority.
 *
 * If zlcache_ptr is not NULL, then it is just the address of zlcache,
 * as explained above.  If zlcache_ptr is NULL, there is no zlcache.
 *
 * To speed the reading of the zonelist, the zonerefs contain the zone index
 * of the entry being read. Helper functions to access information given
 * a struct zoneref are
 *
 * zonelist_zone()	- Return the struct zone * for an entry in _zonerefs
 * zonelist_zone_idx()	- Return the index of the zone for an entry
 * zonelist_node_idx()	- Return the index of the node for an entry
 */
struct zonelist {
	struct zonelist_cache *zlcache_ptr;		     // NULL or &zlcache
	struct zoneref _zonerefs[MAX_ZONES_PER_ZONELIST + 1];
#ifdef CONFIG_NUMA
	struct zonelist_cache zlcache;			     // optional ...
#endif
};
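A quick sketch of how the _zonerefs array is consumed with the helper functions named in the comment above (illustrative only; the array is terminated by an entry whose zone pointer is NULL):

#include <linux/mmzone.h>
#include <linux/kernel.h>

/* Walk a zonelist's _zonerefs array directly and print each candidate
 * zone together with its zone index and owning node. */
static void dump_zonelist_sketch(struct zonelist *zonelist)
{
	struct zoneref *z;

	for (z = zonelist->_zonerefs; zonelist_zone(z); z++)
		printk(KERN_INFO "zone %s (zone idx %d) on node %d\n",
		       zonelist_zone(z)->name,
		       zonelist_zone_idx(z),
		       zonelist_node_idx(z));
}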
A related structure is struct zoneref:
struct zoneref {
	struct zone *zone;	/* Pointer to actual zone */
	int zone_idx;		/* zone_idx(zoneref->zone) */
};
and struct zonelist_cache:
#ifdef CONFIG_NUMA
/*
 * The NUMA zonelists are doubled because we need zonelists that restrict the
 * allocations to a single node for GFP_THISNODE.
 *
 * [0]	: Zonelist with fallback
 * [1]	: No fallback (GFP_THISNODE)
 */
#define MAX_ZONELISTS 2
struct zonelist_cache {
	unsigned short z_to_n[MAX_ZONES_PER_ZONELIST];		/* zone->nid */
	DECLARE_BITMAP(fullzones, MAX_ZONES_PER_ZONELIST);	/* zone full? */
	unsigned long last_full_zap;		/* when last zap'd (jiffies) */
};
#else
#define MAX_ZONELISTS 1
struct zonelist_cache;
#endif
In pg_data_t->node_zonelists[MAX_ZONELISTS], the value of MAX_ZONELISTS depends on whether CONFIG_NUMA is set.
With NUMA enabled, memory spans multiple nodes: node_zonelists[0] holds the fallback list, while node_zonelists[1] holds only the zones of the node itself (no fallback, used for GFP_THISNODE allocations); see the selection sketch below.
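The choice between the two lists is made per allocation from the GFP flags; a minimal sketch, assuming the node_zonelist()/gfp_zonelist() helpers from include/linux/gfp.h of the same era:

#include <linux/gfp.h>
#include <linux/mmzone.h>

/* Pick the zonelist an allocation on node nid will walk. Without
 * __GFP_THISNODE node_zonelist() returns node_zonelists[0] (with
 * fallback); with __GFP_THISNODE it returns node_zonelists[1]
 * (no fallback). */
static struct zonelist *pick_zonelist(int nid, gfp_t gfp_mask)
{
	/* equivalent to: NODE_DATA(nid)->node_zonelists + gfp_zonelist(gfp_mask) */
	return node_zonelist(nid, gfp_mask);
}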
1. set_zonelist_order
There are two strategies for ordering borrowed memory. Node order: allocate from the local node first and fall back to other nodes only when the local node has no usable memory; this favors speed. Zone order: allocate from the cheapest zone first, trying the same zone type on other nodes when the local node has none, and only then move on to the more precious (lower) zones; this favors reliability. The comment block below illustrates the difference.
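Illustration of the resulting fallback list of node 0 for a hypothetical two-node machine whose nodes each have DMA, NORMAL and HIGHMEM zones (a made-up configuration, for illustration only):

/*
 * node order (ZONELIST_ORDER_NODE) - exhaust the local node first:
 *   N0_HIGHMEM, N0_NORMAL, N0_DMA, N1_HIGHMEM, N1_NORMAL, N1_DMA
 *
 * zone order (ZONELIST_ORDER_ZONE) - exhaust a zone type on all nodes
 * before touching a more precious zone type:
 *   N0_HIGHMEM, N1_HIGHMEM, N0_NORMAL, N1_NORMAL, N0_DMA, N1_DMA
 */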
/*
 *  zonelist_order:
 *  0 = automatic detection of better ordering.
 *  1 = order by ([node] distance, -zonetype)
 *  2 = order by (-zonetype, [node] distance)
 *
 *  If not NUMA, ZONELIST_ORDER_ZONE and ZONELIST_ORDER_NODE will create
 *  the same zonelist. So only NUMA can configure this param.
 */
#define ZONELIST_ORDER_DEFAULT  0
#define ZONELIST_ORDER_NODE     1
#define ZONELIST_ORDER_ZONE     2
For UMA memory there is only one node, so ZONELIST_ORDER_ZONE is the only choice.
static int current_zonelist_order = ZONELIST_ORDER_DEFAULT; is a global variable defined in mm/page_alloc.c.

static void set_zonelist_order(void)
{
	current_zonelist_order = ZONELIST_ORDER_ZONE;
}
For NUMA memory with multiple nodes, the borrowing order has to be chosen according to how memory is distributed among the different zones.
static int user_zonelist_order = ZONELIST_ORDER_DEFAULT;
static void set_zonelist_order(void)
{
	if (user_zonelist_order == ZONELIST_ORDER_DEFAULT)
		current_zonelist_order = default_zonelist_order();
	else
		current_zonelist_order = user_zonelist_order;
}
default_zonelist_order() chooses node order (ZONELIST_ORDER_NODE) when there is no DMA zone or when the DMA zone accounts for a relatively large share of memory; otherwise it chooses zone order (ZONELIST_ORDER_ZONE). A simplified sketch of that decision follows.
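This is a simplified sketch of the decision, not the actual default_zonelist_order() code; the function and parameter names are invented for illustration:

/* Sketch only: when low (DMA/DMA32) memory is absent, or makes up a
 * large share of total memory, prefer node order; otherwise protect the
 * scarce low zones by using zone order. */
static int default_zonelist_order_sketch(unsigned long low_kmem_size,
					 unsigned long total_size)
{
	if (low_kmem_size == 0)			/* no DMA/DMA32 zone at all */
		return ZONELIST_ORDER_NODE;
	if (low_kmem_size > total_size / 2)	/* DMA memory is a big share */
		return ZONELIST_ORDER_NODE;
	return ZONELIST_ORDER_ZONE;
}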
2. __build_all_zonelists
The main work of building the fallback lists is done in the __build_all_zonelists function:
static int __build_all_zonelists(void *dummy)
{
	int nid;

	for_each_online_node(nid) {
		pg_data_t *pgdat = NODE_DATA(nid);

		build_zonelists(pgdat);
		build_zonelist_cache(pgdat);
	}
	return 0;
}
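For context, __build_all_zonelists is driven from build_all_zonelists(); the sketch below reflects the boot-time path as I recall it from mm/page_alloc.c of the same era and may differ from the real code in details:

/* Rough sketch of the caller (from memory, not verbatim kernel code). */
void build_all_zonelists_sketch(void)
{
	set_zonelist_order();

	if (system_state == SYSTEM_BOOTING) {
		/* early boot: only one CPU is running, call directly */
		__build_all_zonelists(NULL);
		mminit_verify_zonelist();
	} else {
		/* later: stop all CPUs so nobody walks a half-built list */
		stop_machine(__build_all_zonelists, NULL, NULL);
	}
}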
2.1 build_zonelists initializes pg_data_t->node_zonelists[0], the fallback list, and also initializes pg_data_t->node_zonelists[1], the zone list restricted to the node itself.
Its main job is to fill in the zonelist->_zonerefs array; the definition of struct zoneref is repeated here for reference:
struct zoneref {
	struct zone *zone;	/* Pointer to actual zone */
	int zone_idx;		/* zone_idx(zoneref->zone) */
};
The function that ultimately fills in a zoneref entry is zoneref_set_zone:
static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref)
{
	zoneref->zone = zone;
	zoneref->zone_idx = zone_idx(zone);
}
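To see zoneref_set_zone in action, here is an illustrative sketch (not the kernel's build_zonelists_node(), though it follows the same idea, and it assumes it lives where zoneref_set_zone is visible) that appends one node's populated zones to a zonelist in descending zone order:

/* Append pgdat's populated zones, highest zone type first, starting at
 * slot nr_zones of the zonelist; returns the new number of entries.
 * Illustrative sketch only. */
static int add_node_zones_sketch(pg_data_t *pgdat, struct zonelist *zonelist,
				 int nr_zones)
{
	enum zone_type zt = MAX_NR_ZONES;

	while (zt-- > 0) {
		struct zone *zone = &pgdat->node_zones[zt];

		if (populated_zone(zone))
			zoneref_set_zone(zone,
					 &zonelist->_zonerefs[nr_zones++]);
	}
	return nr_zones;
}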
2.2 build_zonelist_cache initializes node_zonelists[0].zlcache, i.e. the zonelist_cache structure.
The zonelist_cache definition and the build_zonelist_cache function are shown below; the structure exists mainly to speed up zonelist lookups.
struct zonelist_cache {
	unsigned short z_to_n[MAX_ZONES_PER_ZONELIST];		/* zone->nid */
	DECLARE_BITMAP(fullzones, MAX_ZONES_PER_ZONELIST);	/* zone full? */
	unsigned long last_full_zap;		/* when last zap'd (jiffies) */
};

/* Construct the zonelist performance cache - see further mmzone.h */
static void build_zonelist_cache(pg_data_t *pgdat)
{
	struct zonelist *zonelist;
	struct zonelist_cache *zlc;
	struct zoneref *z;

	zonelist = &pgdat->node_zonelists[0];
	zonelist->zlcache_ptr = zlc = &zonelist->zlcache;
	bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
	for (z = zonelist->_zonerefs; z->zone; z++)
		zlc->z_to_n[z - zonelist->_zonerefs] = zonelist_node_idx(z);
}
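How the cache is consumed by the allocator can be sketched as follows; this is a simplified illustration of the idea behind the zlc_* helpers in mm/page_alloc.c, not their actual code:

#include <linux/bitops.h>
#include <linux/mmzone.h>

/* Sketch: a zone whose bit is set in fullzones was recently found full
 * and is skipped without touching its cachelines; the bitmap is zapped
 * periodically (last_full_zap) so zones get retried later. */
static int zone_worth_trying_sketch(struct zonelist *zonelist,
				    struct zoneref *z)
{
	struct zonelist_cache *zlc = zonelist->zlcache_ptr;

	if (!zlc)			/* no cache (e.g. !CONFIG_NUMA) */
		return 1;
	return !test_bit(z - zonelist->_zonerefs, zlc->fullzones);
}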
3. mminit_verify_zonelist()
This function prints the fallback list of every node in the system. The key piece is the iterator for_each_zone_zonelist(zone, z, zonelist, zoneid); the rest of this section walks through how it traverses a zonelist.
3.1 The body of the for loop
/*
 * Iterating a zonelist:
 *   struct zone *zone;         @zone - the current zone in the iterator;
 *                              zone = &pgdat->node_zones[zoneid]
 *   struct zoneref *z;         @z - the current pointer within zonelist->_zonerefs
 *   struct zonelist *zonelist; fixed pointer to the pg_data_t fallback list
 *   int zoneid;                fixed value, used for comparison
 */
for_each_zone_zonelist(zone, z, zonelist, zoneid)
  --> for_each_zone_zonelist_nodemask(zone, z, zonelist, zoneid, NULL)
    --> for (z = first_zones_zonelist(zonelist, zoneid, NULL, &zone);
             zone;
             z = next_zones_zonelist(++z, zoneid, NULL, &zone))
3.2 Loop initialization
/*
 *   struct zonelist *zonelist; fixed pointer to the pg_data_t fallback list
 *   struct zoneref *z;         @z - the current pointer within zonelist->_zonerefs
 *   struct zone *zone;         @zone - the current zone in the iterator
 *   int zoneid;                fixed value, used for comparison
 */
z = first_zones_zonelist(zonelist, zoneid, NULL, &zone)
  --> next_zones_zonelist(zonelist->_zonerefs, zoneid, NULL, &zone); // returns a pointer to a zoneref entry
3.3 Advancing the loop
z = next_zones_zonelist(++z, zoneid, NULL, &zone)
  --> while (zonelist_zone_idx(z) > zoneid)
          z++;
  --> *zone = zonelist_zone(z);
  --> return z;
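Putting the pieces together, a typical use of the iterator looks like the sketch below (my own illustration, assuming the usual helpers from linux/gfp.h and linux/mmzone.h): every zone in node 0's fallback list whose zone index does not exceed the one allowed by GFP_KERNEL is visited in order.

#include <linux/mmzone.h>
#include <linux/gfp.h>
#include <linux/kernel.h>

/* Sketch: walk node 0's fallback list, visiting only the zones usable
 * for a GFP_KERNEL allocation (zone index <= gfp_zone(GFP_KERNEL)). */
static void walk_fallback_list_sketch(void)
{
	struct zonelist *zonelist = node_zonelist(0, GFP_KERNEL);
	struct zoneref *z;
	struct zone *zone;

	for_each_zone_zonelist(zone, z, zonelist, gfp_zone(GFP_KERNEL))
		printk(KERN_INFO "candidate zone %s on node %d\n",
		       zone->name, zone->zone_pgdat->node_id);
}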