• alloc_pages, as analyzed in Linux Kernel Scenario Analysis (《Linux内核情景分析》)


    alloc_pages under NUMA
    ==================== mm/numa.c 43 43 ====================
    43 #ifdef CONFIG_DISCONTIGMEM
    ==================== mm/numa.c 91 128 ====================
    91 /*
    92 * This can be refined. Currently, tries to do round robin, instead
    93 * should do concentratic circle search, starting from current node.
    94 */
    // gfp_mask selects the allocation policy; the requested block is 2^order pages
    95 struct page * alloc_pages(int gfp_mask, unsigned long order)
    96 {
    97 struct page *ret = 0;
    98 pg_data_t *start, *temp;
    99 #ifndef CONFIG_NUMA
    100 unsigned long flags;
    101 static pg_data_t *next = 0;
    102 #endif
    103
    104 if (order >= MAX_ORDER)
    105 return NULL;
    106 #ifdef CONFIG_NUMA // a true NUMA configuration
    107 temp = NODE_DATA(numa_node_id()); // the macro locates the node data structure for the CPU we are running on
    108 #else
    109 spin_lock_irqsave(&node_lock, flags);
    110 if (!next) next = pgdat_list;
    111 temp = next;
    112 next = next->node_next;
    113 spin_unlock_irqrestore(&node_lock, flags);
    114 #endif
    /*
    The body consists of two loops: one from temp to the tail of the node list,
    one from the list head back to temp. Together they scan every node until
    some node satisfies the allocation.
    */
    115 start = temp;
    116 while (temp) {
    117 if ((ret = alloc_pages_pgdat(temp, gfp_mask, order))) // analyzed next
    118 return(ret);
    119 temp = temp->node_next;
    120 }
    121 temp = pgdat_list;
    122 while (temp != start) {
    123 if ((ret = alloc_pages_pgdat(temp, gfp_mask, order)))
    124 return(ret);
    125 temp = temp->node_next;
    126 }
    127 return(0);
    128 }
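    The two-loop scan is worth tracing once. Below is a minimal userspace sketch of the same control flow; node_t and try_alloc() are illustrative stand-ins for pg_data_t and alloc_pages_pgdat(), not kernel API:

    #include <stdio.h>
    #include <stddef.h>

    /* Simplified stand-ins: node_t plays pg_data_t, try_alloc() plays
     * alloc_pages_pgdat(). Both are made up for this sketch. */
    typedef struct node { int id; struct node *node_next; } node_t;

    static int try_alloc(node_t *n) { return n->id == 3; /* pretend only node 3 has memory */ }

    static int scan_from(node_t *list, node_t *current_node)
    {
        node_t *start = current_node, *temp = current_node;
        while (temp) {                       /* first loop: current node to tail */
            if (try_alloc(temp)) return temp->id;
            temp = temp->node_next;
        }
        temp = list;                         /* second loop: head back to start */
        while (temp != start) {
            if (try_alloc(temp)) return temp->id;
            temp = temp->node_next;
        }
        return -1;                           /* every node failed */
    }

    int main(void)
    {
        node_t n3 = {3, NULL}, n2 = {2, &n3}, n1 = {1, &n2}, n0 = {0, &n1};
        /* Starting at node 2, the first loop finds node 3; had the
         * target been node 0 or 1, the second loop would wrap around. */
        printf("allocated on node %d\n", scan_from(&n0, &n2));
        return 0;
    }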
    alloc_pages_pgdat tries to allocate the requested pages; it is a thin wrapper around __alloc_pages:
    85 static struct page * alloc_pages_pgdat(pg_data_t *pgdat, int gfp_mask,
    86 unsigned long order)
    87 { // node_zonelists is the node's array of allocation policies; gfp_mask indexes it
    88     return __alloc_pages(pgdat->node_zonelists + gfp_mask, order);
    89 }
    Compare this with the UMA version of alloc_pages(): a UMA system has only one node, contig_page_data. Both UMA and NUMA converge on __alloc_pages().
    ==================== include/linux/mm.h 343 352 ====================
    343 #ifndef CONFIG_DISCONTIGMEM // only when this is undefined is the UMA alloc_pages() used
    344 static inline struct page * alloc_pages(int gfp_mask, unsigned long order)
    345 {
    346 /*
    347 * Gets optimized away by the compiler.
    348 */
    349 if (order >= MAX_ORDER)
    350 return NULL;
    351 return __alloc_pages(contig_page_data.node_zonelists+(gfp_mask), order);
    352 }
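    To picture what node_zonelists + gfp_mask selects, here is a hedged sketch of the 2.4-era layout; the shape of zonelist_t follows the sources discussed here, but the fallback orders in the comment are assumptions for illustration:

    /* A hedged sketch of the 2.4-era data layout. */
    typedef struct zone_struct zone_t;

    typedef struct zonelist_struct {
        zone_t *zones[3 + 1];   /* NULL-terminated, most preferred zone first */
        int gfp_mask;
    } zonelist_t;

    /*
     * Conceptually, each node keeps one zonelist per gfp index, e.g.
     * (illustrative, not quoted from build_zonelists()):
     *
     *   node_zonelists[__GFP_DMA]     = { ZONE_DMA, NULL }
     *   node_zonelists[__GFP_HIGHMEM] = { ZONE_HIGHMEM, ZONE_NORMAL, ZONE_DMA, NULL }
     *   node_zonelists[GFP_KERNEL]    = { ZONE_NORMAL, ZONE_DMA, NULL }
     *
     * so pgdat->node_zonelists + gfp_mask picks the fallback chain that
     * __alloc_pages() then walks with "zone = zonelist->zones; *(zone++)".
     */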
    Now look at __alloc_pages(). Two preliminaries stand out:
    1. If only a single page is requested, the caller is prepared to wait, and the request is not for memory-management purposes itself, direct_reclaim is set to 1, meaning pages may be reclaimed directly from the zone's inactive-clean list.
    2. If a shortage of free pages is detected, two daemons are woken to try to free some pages.
    [alloc_pages()>__alloc_pages()]
    270 /*
    271 * This is the 'heart' of the zoned buddy allocator:
    272 */
    273 struct page * __alloc_pages(zonelist_t *zonelist, unsigned long order)
    274 {
    275 zone_t **zone;
    276 int direct_reclaim = 0;
    277 unsigned int gfp_mask = zonelist->gfp_mask; // the concrete allocation policy
    278 struct page * page;
    279
    280 /*
    281 * Allocations put pressure on the VM subsystem.
    282 */
    283 memory_pressure++; // pressure on the memory manager: ++ on allocation, -- on release
    284
    285 /*
    286 * (If anyone calls gfp from interrupts nonatomically then it
    287 * will sooner or later tripped up by a schedule().)
    288 *
    289 * We are falling back to lower-level zones if allocation
    290 * in a higher zone fails.
    291 */
    292
    293 /*
    294 If only one page is requested, the caller can wait for the allocation,
    and the request is not for management purposes itself, set direct_reclaim
    to 1: pages may be reclaimed directly from the zone's inactive-clean list.
    296 */
    297 if (order == 0 && (gfp_mask & __GFP_WAIT) &&
    298 !(current->flags & PF_MEMALLOC))
    299 direct_reclaim = 1;
    300
    301 /*
    302 * If we are about to get low on free pages and we also have
    303 * an inactive page shortage, wake up kswapd.
    // on a free-page shortage, the two daemons below are woken to free some pages
    304 */
    305 if (inactive_shortage() > inactive_target / 2 && free_shortage())
    306 wakeup_kswapd(0);
    307 /*
    308 * If we are about to get low on free pages and cleaning
    309 * the inactive_dirty pages would fix the situation,
    310 * wake up bdflush.
    311 */
    312 else if (free_shortage() && nr_inactive_dirty_pages > free_shortage()
    313 && nr_inactive_dirty_pages >= freepages.high)
    314 wakeup_bdflush(0);
    315
    Continuing with __alloc_pages(): if a zone's free pages exceed its low watermark, allocation proceeds and returns directly; otherwise, if the kernel thread kreclaimd is sleeping on its wait queue, it is woken up to reclaim some pages as reserve.
    ==================== mm/page_alloc.c 316 340 ====================
    [alloc_pages()>__alloc_pages()]
    316 try_again:
    317 /*
    318 * First, see if we have any zones with lots of free memory.
    319 *
    320 * We allocate free memory first because it doesn't contain
    321 * any data ... DUH!
    322 */
    323 zone = zonelist->zones; // pointer into the zone list
    324 for (;;) {
    325 zone_t *z = *(zone++); // current zone
    326 if (!z)
    327 break;
    328 if (!z->size)
    329 BUG();
    330 // if the zone's free pages are above its low watermark
    331 if (z->free_pages >= z->pages_low) {
    332 page = rmqueue(z, order); // do the allocation; analyzed next
    333 if (page)
    334 return page;
    335 } else if (z->free_pages < z->pages_min &&
    // otherwise, if the kernel thread kreclaimd is asleep on its wait queue,
    // wake it up to reclaim some pages
    336 waitqueue_active(&kreclaimd_wait)) {
    337 wake_up_interruptible(&kreclaimd_wait);
    338 }
    339 }
    340

    [alloc_pages()>__alloc_pages()>rmqueue()]
    172 static struct page * rmqueue(zone_t *zone, unsigned long order)
    173 {
    174 free_area_t * area = zone->free_area + order; // the free-area array element for this order
    175 unsigned long curr_order = order;
    176 struct list_head *head, *curr;
    177 unsigned long flags;
    178 struct page *page;
    179
    180 spin_lock_irqsave(&zone->lock, flags); // lock the zone
    181 do {
    182 head = &area->free_list; // list head
    183 curr = memlist_next(head); // first node after the head
    184
    185 if (curr != head) { // the list is not empty, so a block of this order is available
    186 unsigned int index;
    187 // take the first struct page element off the non-empty list
    188 page = memlist_entry(curr, struct page, list);
    189 if (BAD_RANGE(zone,page))
    190 BUG();
    191 memlist_del(curr); // remove it from the free list
    192 index = (page - mem_map) - zone->offset; // offset within the zone
    193 MARK_USED(index, curr_order, area); // flip the corresponding bit in the buddy bitmap
    194 zone->free_pages -= 1 << order;
    195 // allocation succeeded; split the unused remainder of the block
    // into smaller pieces and link them into the lower-order free lists
    196 page = expand(zone, page, index, order, curr_order, area);
    197 spin_unlock_irqrestore(&zone->lock, flags);
    198
    199 set_page_count(page, 1);
    200 if (BAD_RANGE(zone,page))
    201 BUG();
    202 DEBUG_ADD_PAGE
    203 return page;
    204 }
    205 curr_order++;
    206 area++;
    207 } while (curr_order < MAX_ORDER);
    208 spin_unlock_irqrestore(&zone->lock, flags);
    209
    210 return NULL;
    211 }
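    A note on MARK_USED: it does not set a bit to 1 but toggles one. As best I recall from the 2.4 sources (treat the exact definition as an assumption and verify against your tree), it is:

    /* From 2.4 mm/page_alloc.c, quoted from memory: */
    #define MARK_USED(index, order, area) \
        __change_bit((index) >> (1+(order)), (area)->map)

    Each bitmap bit covers one pair of buddies of the given order and is flipped on every allocation or free of either buddy, so a value of 1 means exactly one buddy of the pair is free; the freeing path uses this to decide whether the two buddies can be coalesced.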


    [alloc_pages()>__alloc_pages()>rmqueue()>expand()]
    /*
    low is the order requested by the caller; high is the order of the
    block actually taken from the free list
    */
    150 static inline struct page * expand (zone_t *zone, struct page *page,
    151 unsigned long index, int low, int high, free_area_t * area)
    152 {
    153 unsigned long size = 1 << high;
    154
    155 while (high > low) {
    156 if (BAD_RANGE(zone,page))
    157 BUG();
    158 area--;
    159 high--;
    160 size >>= 1; // halve the block on each pass
    161 memlist_add_head(&(page)->list, &(area)->free_list);
    162 MARK_USED(index, high, area); // mark the bitmap
    // move on to the next lower-order free list
    163 index += size;
    164 page += size;
    165 }
    166 if (BAD_RANGE(zone,page))
    167 BUG();
    168 return page;
    169 }
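    A worked example helps here. Suppose rmqueue() found nothing below order 3 for an order-1 request, so expand() is entered with low = 1 and high = 3. The standalone sketch below (plain C, printing instead of touching real free lists) reproduces the size/index arithmetic of the loop above:

    #include <stdio.h>

    /* Simulate expand(zone, page, index=0, low=1, high=3, area):
     * an order-3 block (8 pages) is split down to the order-1
     * piece that is actually returned. */
    int main(void)
    {
        int low = 1, high = 3;
        unsigned long index = 0, size = 1UL << high;

        while (high > low) {
            high--;
            size >>= 1;
            /* the lower half (at 'index') goes back on the order-'high'
             * free list; the upper half is split further */
            printf("put pages [%lu..%lu] on order-%d free list\n",
                   index, index + size - 1, high);
            index += size;
        }
        printf("return pages [%lu..%lu] (order %d) to the caller\n",
               index, index + (1UL << low) - 1, low);
        return 0;
    }

    The lower half of each split goes back onto a free list and the upper half is split again, so the caller ends up with the highest order-1 piece (pages 6..7) of the original order-3 block.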
    rmqueue() thus keeps scanning upward through the free-area array until it either succeeds or runs out of orders. If it fails, __alloc_pages() moves on to the next zone in the list (following the allocation policy) until one succeeds. If every zone in the given policy has failed, the allocator must try harder: either lower the watermark requirement, or also count the inactive-clean pages cached in each zone.
    [alloc_pages()>__alloc_pages()]
    341 /*
    342 * Try to allocate a page from a zone with a HIGH
    343 * amount of free + inactive_clean pages.
    344 *
    345 * If there is a lot of activity, inactive_target
    346 * will be high and we'll have a good chance of
    347 * finding a page using the HIGH limit.
    348 */
    // first try with PAGES_HIGH; if that fails, retry with PAGES_LOW
    349 page = __alloc_pages_limit(zonelist, order, PAGES_HIGH, direct_reclaim);
    350 if (page)
    351 return page;
    352
    353 /*
    354 * Then try to allocate a page from a zone with more
    355 * than zone->pages_low free + inactive_clean pages.
    356 *
    357 * When the working set is very large and VM activity
    358 * is low, we're most likely to have our allocation
    359 * succeed here.
    360 */
    361 page = __alloc_pages_limit(zonelist, order, PAGES_LOW, direct_reclaim);
    362 if (page)
    363 return page;
    364


    [alloc_pages()>__alloc_pages()>__alloc_pages_limit()]
    213 #define PAGES_MIN 0
    214 #define PAGES_LOW 1
    215 #define PAGES_HIGH 2
    216
    217 /*
    218 * This function does the dirty work for __alloc_pages
    219 * and is separated out to keep the code size smaller.
    220 * (suggested by Davem at 1:30 AM, typed by Rik at 6 AM)
    221 */
    222 static struct page * __alloc_pages_limit(zonelist_t *zonelist,
    223 unsigned long order, int limit, int direct_reclaim)
    224 {
    225 zone_t **zone = zonelist->zones;
    226
    227 for (;;) {
    228 zone_t *z = *(zone++);
    229 unsigned long water_mark;
    230
    231 if (!z)
    232 break;
    233 if (!z->size)
    234 BUG();
    235
    236 /*
    237 * We allocate if the number of free + inactive_clean
    238 * pages is above the watermark.
    239 */
    240 switch (limit) {
    241 default:
    242 case PAGES_MIN: // the limit argument selects which watermark applies
    243 water_mark = z->pages_min;
    244 break;
    245 case PAGES_LOW:
    246 water_mark = z->pages_low;
    247 break;
    248 case PAGES_HIGH:
    249 water_mark = z->pages_high;
    250 }
    251 // if free pages plus reclaimable clean pages exceed the watermark
    252 if (z->free_pages + z->inactive_clean_pages > water_mark) {
    253 struct page *page = NULL;
    254 /* if free pages alone are below pages_min + 8, reclaim directly */
    255 if (direct_reclaim && z->free_pages < z->pages_min + 8)
    256 page = reclaim_page(z); // reclaim a page from the zone's inactive_clean_list
    257 /* If that fails, fall back to rmqueue. */
    258 if (!page)
    259 page = rmqueue(z, order);
    260 if (page)
    261 return page;
    262 }
    263 }
    264
    265 /* Found nothing. */
    266 return NULL;
    267 }
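    A small numeric sketch of the decision (all values made up) shows how the passes differ. With pages_min = 128, pages_low = 256, pages_high = 384, free_pages = 100 and inactive_clean_pages = 200, the PAGES_HIGH pass skips the zone (300 <= 384), the PAGES_LOW pass allocates (300 > 256), and because free pages alone are below pages_min + 8, the page comes from reclaim_page() rather than rmqueue():

    #include <stdio.h>

    /* Made-up zone numbers to trace __alloc_pages_limit()'s decisions. */
    int main(void)
    {
        unsigned long pages_min = 128, pages_low = 256, pages_high = 384;
        unsigned long free_pages = 100, inactive_clean = 200;
        unsigned long avail = free_pages + inactive_clean;

        printf("PAGES_HIGH pass: %s\n", avail > pages_high ? "allocate" : "skip zone");
        printf("PAGES_LOW  pass: %s\n", avail > pages_low  ? "allocate" : "skip zone");
        if (avail > pages_low && free_pages < pages_min + 8)
            printf("free pages very low -> reclaim_page() before rmqueue()\n");
        return 0;
    }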
    If that still does not work, the zones really are short of pages:
    [alloc_pages()>__alloc_pages()]
    365 /*
    366 * OK, none of the zones on our zonelist has lots
    367 * of pages free.
    368 *
    369 * We wake up kswapd, in the hope that kswapd will
    370 * resolve this situation before memory gets tight.
    371 *
    372 * We also yield the CPU, because that:
    373 * - gives kswapd a chance to do something
    374 * - slows down allocations, in particular the
    375 * allocations from the fast allocator that's
    376 * causing the problems ...
    377 * - ... which minimises the impact the "bad guys"
    378 * have on the rest of the system
    379 * - if we don't have __GFP_IO set, kswapd may be
    380 * able to free some memory we can't free ourselves
    381 */
    382 wakeup_kswapd(0); // wake the kernel thread to try to evict some pages
    383 if (gfp_mask & __GFP_WAIT) { // the caller can wait, so yield the CPU once:
    // this gives kswapd a chance to be scheduled and free some pages for us
    384 __set_current_state(TASK_RUNNING);
    385 current->policy |= SCHED_YIELD;
    386 schedule();
    387 }
    388
    389 /*
    390 * After waking up kswapd, we try to allocate a page
    391 * from any zone which isn't critical yet.
    392 *
    393 * Kswapd should, in most situations, bring the situation
    394 * back to normal in no time.
    395 */
    /*
    Whether or not we yielded, call __alloc_pages_limit once more, now
    with the lowest watermark, PAGES_MIN
    */
    396 page = __alloc_pages_limit(zonelist, order, PAGES_MIN, direct_reclaim);
    397 if (page)
    398 return page;
    399
    If that fails again, it matters who is asking for the pages. If it is kswapd or another memory-management thread, it is itself working to free memory and matters more than an ordinary process, so it runs with the PF_MEMALLOC flag set to 1. Let us first follow the path of an ordinary process, i.e. one whose PF_MEMALLOC flag is 0.

    ==================== mm/page_alloc.c 400 477 ====================
    [alloc_pages()>__alloc_pages()]
    400 /*
    401 * Damn, we didn't succeed.
    402 *
    403 * This can be due to 2 reasons:
    404 * - we're doing a higher-order allocation
    405 * --> move pages to the free list until we succeed
    406 * - we're /really/ tight on memory
    407 * --> wait on the kswapd waitqueue until memory is freed
    408 */
    409 if (!(current->flags & PF_MEMALLOC)) {
    410 /*
    411 * Are we dealing with a higher order allocation?
    412 *
    413 * Move pages from the inactive_clean to the free list
    414 * in the hope of creating a large, physically contiguous
    415 * piece of free memory.
    416 */
    417 if (order > 0 && (gfp_mask & __GFP_WAIT)) {
    418 zone = zonelist->zones;
    419 /* First, clean some dirty pages. */
    420 current->flags |= PF_MEMALLOC;
    421 page_launder(gfp_mask, 1); // "launder" dirty pages, i.e. write them out
    422 current->flags &= ~PF_MEMALLOC;
    423 for (;;) {
    424 zone_t *z = *(zone++); // loop over all zones, reclaiming their inactive-clean pages
    425 if (!z)
    426 break;
    427 if (!z->size)
    428 continue;
    // as long as this zone still has inactive-clean pages
    429 while (z->inactive_clean_pages) {
    430 struct page * page;
    431 /* Move one page to the free list. */
    432 page = reclaim_page(z); // reclaim from the inactive-clean list
    433 if (!page)
    434 break;
    435 __free_page(page); // freeing also coalesces free pages into larger blocks
    436 /* Try if the allocation succeeds. */
    437 page = rmqueue(z, order); // then try the request again
    438 if (page)
    439 return page;
    440 }
    441 }
    442 }
    443 /*
    444 * When we arrive here, we are really tight on memory.
    445 *
    446 * We wake up kswapd and sleep until kswapd wakes us
    447 * up again. After that we loop back to the start.
    448 *
    449 * We have to do this because something else might eat
    450 * the memory kswapd frees for us and we need to be
    451 * reliable. Note that we don't loop back for higher
    452 * order allocations since it is possible that kswapd
    453 * simply cannot free a large enough contiguous area
    454 * of memory *ever*.
    455 */
    /*
    Still failing, and the caller insists on getting pages: sleep and wait
    */
    456 if ((gfp_mask & (__GFP_WAIT|__GFP_IO)) == (__GFP_WAIT|__GFP_IO)) {
    457 wakeup_kswapd(1); // wake kswapd; the requesting process sleeps until kswapd has completed a pass and wakes it again
    458 memory_pressure++;
    459 if (!order) // if only a single page is wanted, go back to try_again
    460 goto try_again;
    461 /*
    462 * If __GFP_IO isn't set, we can't wait on kswapd because
    463 * kswapd just might need some IO locks /we/ are holding ...
    464 *
    465 * SUBTLE: The scheduling point above makes sure that
    466 * kswapd does get the chance to free memory we can't
    467 * free ourselves...
    468 */
    469 } else if (gfp_mask & __GFP_WAIT) {
    470 try_to_free_pages(gfp_mask); // the other route: free pages directly (this is the same function kswapd itself calls)
    471 memory_pressure++;
    472 if (!order)
    473 goto try_again;
    474 }
    475
    476 }
    477
    The last resort:
     
    [alloc_pages()>__alloc_pages()]
    478 /*
    479 * Final phase: allocate anything we can!
    480 *
    481 * Higher order allocations, GFP_ATOMIC allocations and
    482 * recursive allocations (PF_MEMALLOC) end up here.
    483 *
    484 * Only recursive allocations can use the very last pages
    485 * in the system, otherwise it would be just too easy to
    486 * deadlock the system...
    487 */
    488 zone = zonelist->zones;
    489 for (;;) {
    490 zone_t *z = *(zone++);
    491 struct page * page = NULL;
    492 if (!z)
    493 break;
    494 if (!z->size)
    495 BUG();
    496
    497 /*
    498 * SUBTLE: direct_reclaim is only possible if the task
    499 * becomes PF_MEMALLOC while looping above. This will
    500 * happen when the OOM killer selects this task for
    501 * instant execution...
    502 */
    503 if (direct_reclaim) {
    504 page = reclaim_page(z);
    505 if (page)
    506 return page;
    507 }
    508
    509 /* XXX: is pages_min/4 a good amount to reserve for this? */
    510 if (z->free_pages < z->pages_min / 4 &&
    511 !(current->flags & PF_MEMALLOC))
    512 continue;
    513 page = rmqueue(z, order);
    514 if (page)
    515 return page;
    516 }
    517
    518 /* No luck.. */
    519 printk(KERN_ERR "__alloc_pages: %lu-order allocation failed.\n", order);
    520 return NULL;
    521 }
    If even this fails, something is seriously wrong with the system. The whole path, in summary:
    pick a node -> free pages are short -> wake the daemons to try to free some ->
    walk every zone: if a zone's free pages exceed the low watermark, call rmqueue() to allocate; otherwise wake the kreclaimd thread to reclaim pages ->
    rmqueue() fails -> try the next zone (per the allocation policy); all zones fail -> lower the watermark requirement and count the inactive-clean pages as well ->
    call __alloc_pages_limit(): if free pages fall below pages_min + 8, reclaim from the inactive-clean list (evict to make room) -> still failing, wake the kernel thread to free pages ->
    still failing, launder the dirty pages and write them out, then retry -> still failing, call the freeing path once more -> finally lower the reserve to pages_min/4 and see whether that satisfies the request ->
    if even that cannot, the system is in trouble.
     








