Linux内核最新的连续内存分配器(CMA)——避免预留大块内存【转】

Linux内核最新的连续内存分配器(CMA)——避免预留大块内存【转】
转自：https://blog.csdn.net/21cnbao/article/details/7309757

在我们使用ARM等嵌入式Linux系统的时候，一个头疼的问题是GPU，Camera，HDMI等都需要预留大量连续内存，这部分内存平时不用，但是一般的做法又必须先预留着。目前，Marek Szyprowski和Michal Nazarewicz实现了一套全新的Contiguous Memory Allocator。通过这套机制，我们可以做到不预留内存，这些内存平时是可用的，只有当需要的时候才被分配给Camera，HDMI等设备。下面分析它的基本代码流程。

声明连续内存

内核启动过程中arch/arm/mm/init.c中的arm_memblock_init()会调用dma_contiguous_reserve(min(arm_dma_limit, arm_lowmem_limit));

该函数位于：drivers/base/dma-contiguous.c
1. /**
2. * dma_contiguous_reserve() - reserve area for contiguous memory handling
3. * @limit: End address of the reserved memory (optional, 0 for any).
4. *
5. * This function reserves memory from early allocator. It should be
6. * called by arch specific code once the early allocator (memblock or bootmem)
7. * has been activated and all other subsystems have already allocated/reserved
8. * memory.
9. */
10. void __init dma_contiguous_reserve(phys_addr_t limit)
11. {
12. unsigned long selected_size = 0;
14. pr_debug("%s(limit %08lx) ", __func__, (unsigned long)limit);
16. if (size_cmdline != -1) {
17. selected_size = size_cmdline;
18. } else {
19. #ifdef CONFIG_CMA_SIZE_SEL_MBYTES
20. selected_size = size_bytes;
21. #elif defined(CONFIG_CMA_SIZE_SEL_PERCENTAGE)
22. selected_size = cma_early_percent_memory();
23. #elif defined(CONFIG_CMA_SIZE_SEL_MIN)
24. selected_size = min(size_bytes, cma_early_percent_memory());
25. #elif defined(CONFIG_CMA_SIZE_SEL_MAX)
26. selected_size = max(size_bytes, cma_early_percent_memory());
27. #endif
28. }
30. if (selected_size) {
31. pr_debug("%s: reserving %ld MiB for global area ", __func__,
32. selected_size / SZ_1M);
34. dma_declare_contiguous(NULL, selected_size, 0, limit);
35. }
36. };
其中的size_bytes定义为：

static const unsigned long size_bytes = CMA_SIZE_MBYTES * SZ_1M;

默认情况下，CMA_SIZE_MBYTES会被定义为16（即size_bytes为16MB），来源于CONFIG_CMA_SIZE_MBYTES=16

->
1. int __init dma_declare_contiguous(struct device *dev, unsigned long size,
2. phys_addr_t base, phys_addr_t limit)
3. {
4. ...
5. /* Reserve memory */
6. if (base) {
7. if (memblock_is_region_reserved(base, size) ||
8. memblock_reserve(base, size) < 0) {
9. base = -EBUSY;
10. goto err;
11. }
12. } else {
13. /*
14. * Use __memblock_alloc_base() since
15. * memblock_alloc_base() panic()s.
16. */
17. phys_addr_t addr = __memblock_alloc_base(size, alignment, limit);
18. if (!addr) {
19. base = -ENOMEM;
20. goto err;
21. } else if (addr + size > ~(unsigned long)0) {
22. memblock_free(addr, size);
23. base = -EINVAL;
25. goto err;
26. } else {
27. base = addr;
28. }
29. }
31. /*
32. * Each reserved area must be initialised later, when more kernel
33. * subsystems (like slab allocator) are available.
34. */
35. r->start = base;
36. r->size = size;
37. r->dev = dev;
38. cma_reserved_count++;
39. pr_info("CMA: reserved %ld MiB at %08lx ", size / SZ_1M,
40. (unsigned long)base);
42. /* Architecture specific contiguous memory fixup. */
43. dma_contiguous_early_fixup(base, size);
44. return 0;
45. err:
46. pr_err("CMA: failed to reserve %ld MiB ", size / SZ_1M);
47. return base;
48. }
由此可见，连续内存区域也是在内核启动的早期，通过__memblock_alloc_base()拿到的。

另外：

drivers/base/dma-contiguous.c里面的core_initcall()会导致cma_init_reserved_areas()被调用：
1. static int __init cma_init_reserved_areas(void)
2. {
3. struct cma_reserved *r = cma_reserved;
4. unsigned i = cma_reserved_count;
6. pr_debug("%s() ", __func__);
8. for (; i; --i, ++r) {
9. struct cma *cma;
10. cma = cma_create_area(PFN_DOWN(r->start),
11. r->size >> PAGE_SHIFT);
12. if (!IS_ERR(cma))
13. dev_set_cma_area(r->dev, cma);
14. }
15. return 0;
16. }
17. core_initcall(cma_init_reserved_areas);
cma_create_area()会调用cma_activate_area()，cma_activate_area()函数则会针对区域内的每个pageblock调用：

init_cma_reserved_pageblock(pfn_to_page(base_pfn));

这个函数则会通过set_pageblock_migratetype(page, MIGRATE_CMA)将该页所在的pageblock设置为MIGRATE_CMA类型：
1. #ifdef CONFIG_CMA
2. /* Free whole pageblock and set it's migration type to MIGRATE_CMA. */
3. void __init init_cma_reserved_pageblock(struct page *page)
4. {
5. unsigned i = pageblock_nr_pages;
6. struct page *p = page;
8. do {
9. __ClearPageReserved(p);
10. set_page_count(p, 0);
11. } while (++p, --i);
13. set_page_refcounted(page);
14. set_pageblock_migratetype(page, MIGRATE_CMA);
15. __free_pages(page, pageblock_order);
16. totalram_pages += pageblock_nr_pages;
17. }
18. #endif
同时其中调用的__free_pages(page, pageblock_order);最终会调用到__free_one_page(page, zone, order, migratetype);
相关的page会被加到MIGRATE_CMA的free_list上面去：

list_add(&page->lru, &zone->free_area[order].free_list[migratetype]);

申请连续内存

申请连续内存仍然使用标准的arch/arm/mm/dma-mapping.c中定义的dma_alloc_coherent()和dma_alloc_writecombine()，这二者会间接调用drivers/base/dma-contiguous.c中的
1. struct page *dma_alloc_from_contiguous(struct device *dev, int count,
2. unsigned int align)
->
1. struct page *dma_alloc_from_contiguous(struct device *dev, int count,
2. unsigned int align)
3. {
4. ...
6. for (;;) {
7. pageno = bitmap_find_next_zero_area(cma->bitmap, cma->count,
8. start, count, mask);
9. if (pageno >= cma->count) {
10. ret = -ENOMEM;
11. goto error;
12. }
14. pfn = cma->base_pfn + pageno;
15. ret = alloc_contig_range(pfn, pfn + count, MIGRATE_CMA);
16. if (ret == 0) {
17. bitmap_set(cma->bitmap, pageno, count);
18. break;
19. } else if (ret != -EBUSY) {
20. goto error;
21. }
22. pr_debug("%s(): memory range at %p is busy, retrying ",
23. __func__, pfn_to_page(pfn));
24. /* try again with a bit different memory target */
25. start = pageno + mask + 1;
26. }
27. ...
29. }
->

int alloc_contig_range(unsigned long start, unsigned long end,

unsigned migratetype)

需要隔离page，隔离page的作用通过代码的注释可以体现：
1. /*
2. * What we do here is we mark all pageblocks in range as
3. * MIGRATE_ISOLATE. Because of the way page allocator work, we
4. * align the range to MAX_ORDER pages so that page allocator
5. * won't try to merge buddies from different pageblocks and
6. * change MIGRATE_ISOLATE to some other migration type.
7. *
8. * Once the pageblocks are marked as MIGRATE_ISOLATE, we
9. * migrate the pages from an unaligned range (ie. pages that
10. * we are interested in). This will put all the pages in
11. * range back to page allocator as MIGRATE_ISOLATE.
12. *
13. * When this is done, we take the pages in range from page
14. * allocator removing them from the buddy system. This way
15. * page allocator will never consider using them.
16. *
17. * This lets us mark the pageblocks back as
18. * MIGRATE_CMA/MIGRATE_MOVABLE so that free pages in the
19. * MAX_ORDER aligned range but not in the unaligned, original
20. * range are put back to page allocator so that buddy can use
21. * them.
22. */
24. ret = start_isolate_page_range(pfn_align_to_maxpage_down(start),
25. pfn_align_to_maxpage_up(end),
26. migratetype);
简单地说，就是把相关的page标记为MIGRATE_ISOLATE，这样buddy系统就不会再使用它们。
1. /*
2. * start_isolate_page_range() -- make page-allocation-type of range of pages
3. * to be MIGRATE_ISOLATE.
4. * @start_pfn: The lower PFN of the range to be isolated.
5. * @end_pfn: The upper PFN of the range to be isolated.
6. * @migratetype: migrate type to set in error recovery.
7. *
8. * Making page-allocation-type to be MIGRATE_ISOLATE means free pages in
9. * the range will never be allocated. Any free pages and pages freed in the
10. * future will not be allocated again.
11. *
12. * start_pfn/end_pfn must be aligned to pageblock_order.
13. * Returns 0 on success and -EBUSY if any part of range cannot be isolated.
14. */
15. int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
16. unsigned migratetype)
17. {
18. unsigned long pfn;
19. unsigned long undo_pfn;
20. struct page *page;
22. BUG_ON((start_pfn) & (pageblock_nr_pages - 1));
23. BUG_ON((end_pfn) & (pageblock_nr_pages - 1));
25. for (pfn = start_pfn;
26. pfn < end_pfn;
27. pfn += pageblock_nr_pages) {
28. page = __first_valid_page(pfn, pageblock_nr_pages);
29. if (page && set_migratetype_isolate(page)) {
30. undo_pfn = pfn;
31. goto undo;
32. }
33. }
34. return 0;
35. undo:
36. for (pfn = start_pfn;
37. pfn < undo_pfn;
38. pfn += pageblock_nr_pages)
39. unset_migratetype_isolate(pfn_to_page(pfn), migratetype);
41. return -EBUSY;
42. }
接下来调用__alloc_contig_migrate_range()进行页面隔离和迁移:
1. static int __alloc_contig_migrate_range(unsigned long start, unsigned long end)
2. {
3. /* This function is based on compact_zone() from compaction.c. */
5. unsigned long pfn = start;
6. unsigned int tries = 0;
7. int ret = 0;
9. struct compact_control cc = {
10. .nr_migratepages = 0,
11. .order = -1,
12. .zone = page_zone(pfn_to_page(start)),
13. .sync = true,
14. };
15. INIT_LIST_HEAD(&cc.migratepages);
17. migrate_prep_local();
19. while (pfn < end || !list_empty(&cc.migratepages)) {
20. if (fatal_signal_pending(current)) {
21. ret = -EINTR;
22. break;
23. }
25. if (list_empty(&cc.migratepages)) {
26. cc.nr_migratepages = 0;
27. pfn = isolate_migratepages_range(cc.zone, &cc,
28. pfn, end);
29. if (!pfn) {
30. ret = -EINTR;
31. break;
32. }
33. tries = 0;
34. } else if (++tries == 5) {
35. ret = ret < 0 ? ret : -EBUSY;
36. break;
37. }
39. ret = migrate_pages(&cc.migratepages,
40. __alloc_contig_migrate_alloc,
41. 0, false, true);
42. }
44. putback_lru_pages(&cc.migratepages);
45. return ret > 0 ? 0 : ret;
46. }
其中的函数migrate_pages()会完成页面的迁移，迁移过程中通过传入的__alloc_contig_migrate_alloc()申请新的page，并将老的page的内容转移给新的page：
1. int migrate_pages(struct list_head *from,
2. new_page_t get_new_page, unsigned long private, bool offlining,
3. bool sync)
4. {
5. int retry = 1;
6. int nr_failed = 0;
7. int pass = 0;
8. struct page *page;
9. struct page *page2;
10. int swapwrite = current->flags & PF_SWAPWRITE;
11. int rc;
13. if (!swapwrite)
14. current->flags |= PF_SWAPWRITE;
16. for(pass = 0; pass < 10 && retry; pass++) {
17. retry = 0;
19. list_for_each_entry_safe(page, page2, from, lru) {
20. cond_resched();
22. rc = unmap_and_move(get_new_page, private,
23. page, pass > 2, offlining,
24. sync);
26. switch(rc) {
27. case -ENOMEM:
28. goto out;
29. case -EAGAIN:
30. retry++;
31. break;
32. case 0:
33. break;
34. default:
35. /* Permanent failure */
36. nr_failed++;
37. break;
38. }
39. }
40. }
41. rc = 0;
42. ...
43. }
其中的unmap_and_move()函数较为关键，它定义在mm/migrate.c中
1. /*
2. * Obtain the lock on page, remove all ptes and migrate the page
3. * to the newly allocated page in newpage.
4. */
5. static int unmap_and_move(new_page_t get_new_page, unsigned long private,
6. struct page *page, int force, bool offlining, bool sync)
7. {
8. int rc = 0;
9. int *result = NULL;
10. struct page *newpage = get_new_page(page, private, &result);
11. int remap_swapcache = 1;
12. int charge = 0;
13. struct mem_cgroup *mem = NULL;
14. struct anon_vma *anon_vma = NULL;
16. ...
18. /* charge against new page */
19. charge = mem_cgroup_prepare_migration(page, newpage, &mem);
20. ...
22. if (PageWriteback(page)) {
23. if (!force || !sync)
24. goto uncharge;
25. wait_on_page_writeback(page);
26. }
27. /*
28. * By try_to_unmap(), page->mapcount goes down to 0 here. In this case,
29. * we cannot notice that anon_vma is freed while we migrates a page.
30. * This get_anon_vma() delays freeing anon_vma pointer until the end
31. * of migration. File cache pages are no problem because of page_lock()
32. * File Caches may use write_page() or lock_page() in migration, then,
33. * just care Anon page here.
34. */
35. if (PageAnon(page)) {
36. /*
37. * Only page_lock_anon_vma() understands the subtleties of
38. * getting a hold on an anon_vma from outside one of its mms.
39. */
40. anon_vma = page_lock_anon_vma(page);
41. if (anon_vma) {
42. /*
43. * Take a reference count on the anon_vma if the
44. * page is mapped so that it is guaranteed to
45. * exist when the page is remapped later
46. */
47. get_anon_vma(anon_vma);
48. page_unlock_anon_vma(anon_vma);
49. } else if (PageSwapCache(page)) {
50. /*
51. * We cannot be sure that the anon_vma of an unmapped
52. * swapcache page is safe to use because we don't
53. * know in advance if the VMA that this page belonged
54. * to still exists. If the VMA and others sharing the
55. * data have been freed, then the anon_vma could
56. * already be invalid.
57. *
58. * To avoid this possibility, swapcache pages get
59. * migrated but are not remapped when migration
60. * completes
61. */
62. remap_swapcache = 0;
63. } else {
64. goto uncharge;
65. }
66. }
68. ...
69. /* Establish migration ptes or remove ptes */
70. try_to_unmap(page, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
72. skip_unmap:
73. if (!page_mapped(page))
74. rc = move_to_new_page(newpage, page, remap_swapcache);
76. if (rc && remap_swapcache)
77. remove_migration_ptes(page, page);
79. /* Drop an anon_vma reference if we took one */
80. if (anon_vma)
81. drop_anon_vma(anon_vma);
83. uncharge:
84. if (!charge)
85. mem_cgroup_end_migration(mem, page, newpage, rc == 0);
86. unlock:
87. unlock_page(page);
89. move_newpage:
90. ...
91. }
通过unmap_and_move()，老的page就被迁移过去新的page。

接下来要回收page，回收page的作用是避免因为拿走了一大块连续内存，而使系统陷入内存饥饿：

->
1. /*
2. * Reclaim enough pages to make sure that contiguous allocation
3. * will not starve the system.
4. */
5. __reclaim_pages(zone, GFP_HIGHUSER_MOVABLE, end-start);
->
1. /*
2. * Trigger memory pressure bump to reclaim some pages in order to be able to
3. * allocate 'count' pages in single page units. Does similar work as
4. *__alloc_pages_slowpath() function.
5. */
6. static int __reclaim_pages(struct zone *zone, gfp_t gfp_mask, int count)
7. {
8. enum zone_type high_zoneidx = gfp_zone(gfp_mask);
9. struct zonelist *zonelist = node_zonelist(0, gfp_mask);
10. int did_some_progress = 0;
11. int order = 1;
12. unsigned long watermark;
14. /*
15. * Increase level of watermarks to force kswapd do his job
16. * to stabilise at new watermark level.
17. */
18. __update_cma_watermarks(zone, count);
20. /* Obey watermarks as if the page was being allocated */
21. watermark = low_wmark_pages(zone) + count;
22. while (!zone_watermark_ok(zone, 0, watermark, 0, 0)) {
23. wake_all_kswapd(order, zonelist, high_zoneidx, zone_idx(zone));
25. did_some_progress = __perform_reclaim(gfp_mask, order, zonelist,
26. NULL);
27. if (!did_some_progress) {
28. /* Exhausted what can be done so it's blamo time */
29. out_of_memory(zonelist, gfp_mask, order, NULL);
30. }
31. }
33. /* Restore original watermark levels. */
34. __update_cma_watermarks(zone, -count);
36. return count;
37. }
释放连续内存

内存释放的时候也比较简单，直接就是：

arch/arm/mm/dma-mapping.c：
```
void dma_free_coherent(struct device *dev, size_t size, void *cpu_addr, dma_addr_t handle)
```
->

arch/arm/mm/dma-mapping.c:
1. static void __free_from_contiguous(struct device *dev, struct page *page,
2. size_t size)
3. {
4. __dma_remap(page, size, pgprot_kernel);
5. dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT);
6. }
->
1. bool dma_release_from_contiguous(struct device *dev, struct page *pages,
2. int count)
3. {
4. ...
5. free_contig_range(pfn, count);
6. ..
8. }
->
1. void free_contig_range(unsigned long pfn, unsigned nr_pages)
2. {
3. for (; nr_pages--; ++pfn)
4. __free_page(pfn_to_page(pfn));
5. }
将page交还给buddy。

内核内存分配的migratetype

内核内存分配的时候，带的标志是GFP_，但是GFP_可以转化为migratetype：
1. static inline int allocflags_to_migratetype(gfp_t gfp_flags)
2. {
3. WARN_ON((gfp_flags & GFP_MOVABLE_MASK) == GFP_MOVABLE_MASK);
5. if (unlikely(page_group_by_mobility_disabled))
6. return MIGRATE_UNMOVABLE;
8. /* Group based on mobility */
9. return (((gfp_flags & __GFP_MOVABLE) != 0) << 1) |
10. ((gfp_flags & __GFP_RECLAIMABLE) != 0);
11. }
之后申请内存的时候，会对比迁移类型匹配的free_list：
1. page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
2. zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET,
3. preferred_zone, migratetype);
另外，笔者也编写了一个测试程序，透过它随时测试CMA的功能：
1. /*
2. * kernel module helper for testing CMA
3. *
4. * Licensed under GPLv2 or later.
5. */
7. #include <linux/module.h>
8. #include <linux/device.h>
9. #include <linux/fs.h>
10. #include <linux/miscdevice.h>
11. #include <linux/dma-mapping.h>
13. #define CMA_NUM 10
14. static struct device *cma_dev;
15. static dma_addr_t dma_phys[CMA_NUM];
16. static void *dma_virt[CMA_NUM];
18. /* any read request will free coherent memory, eg.
19. * cat /dev/cma_test
20. */
21. static ssize_t
22. cma_test_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
23. {
24. int i;
26. for (i = 0; i < CMA_NUM; i++) {
27. if (dma_virt[i]) {
28. dma_free_coherent(cma_dev, (i + 1) * SZ_1M, dma_virt[i], dma_phys[i]);
29. _dev_info(cma_dev, "free virt: %p phys: %p ", dma_virt[i], (void *)dma_phys[i]);
30. dma_virt[i] = NULL;
31. break;
32. }
33. }
34. return 0;
35. }
37. /*
38. * any write request will alloc coherent memory, eg.
39. * echo 0 > /dev/cma_test
40. */
41. static ssize_t
42. cma_test_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos)
43. {
44. int i;
45. int ret;
47. for (i = 0; i < CMA_NUM; i++) {
48. if (!dma_virt[i]) {
49. dma_virt[i] = dma_alloc_coherent(cma_dev, (i + 1) * SZ_1M, &dma_phys[i], GFP_KERNEL);
51. if (dma_virt[i]) {
52. void *p;
53. /* touch every page in the allocated memory */
54. for (p = dma_virt[i]; p < dma_virt[i] + (i + 1) * SZ_1M; p += PAGE_SIZE)
55. *(u32 *)p = 0;
57. _dev_info(cma_dev, "alloc virt: %p phys: %p ", dma_virt[i], (void *)dma_phys[i]);
58. } else {
59. dev_err(cma_dev, "no mem in CMA area ");
60. ret = -ENOMEM;
61. }
62. break;
63. }
64. }
66. return count;
67. }
69. static const struct file_operations cma_test_fops = {
70. .owner = THIS_MODULE,
71. .read = cma_test_read,
72. .write = cma_test_write,
73. };
75. static struct miscdevice cma_test_misc = {
76. .name = "cma_test",
77. .fops = &cma_test_fops,
78. };
80. static int __init cma_test_init(void)
81. {
82. int ret = 0;
84. ret = misc_register(&cma_test_misc);
85. if (unlikely(ret)) {
86. pr_err("failed to register cma test misc device! ");
87. return ret;
88. }
89. cma_dev = cma_test_misc.this_device;
90. cma_dev->coherent_dma_mask = ~0;
91. _dev_info(cma_dev, "registered. ");
93. return ret;
94. }
95. module_init(cma_test_init);
97. static void __exit cma_test_exit(void)
98. {
99. misc_deregister(&cma_test_misc);
100. }
101. module_exit(cma_test_exit);
103. MODULE_LICENSE("GPL");
104. MODULE_AUTHOR("Barry Song <21cnbao@gmail.com>");
105. MODULE_DESCRIPTION("kernel module to help the test of CMA");
106. MODULE_ALIAS("CMA test");
申请内存：
```
# echo 0 > /dev/cma_test
```
释放内存：
```
# cat /dev/cma_test
```
参考链接：

[1] http://www.spinics.net/lists/arm-kernel/msg160854.html

[2] http://www.spinics.net/lists/arm-kernel/msg162063.html

[3] http://lwn.net/Articles/447405/
相关阅读:
1509 加长棒
 51Nod 1158 全是1的最大子矩阵
 P2953 [USACO09OPEN]牛的数字游戏Cow Digit Game
P3384 【模板】树链剖分
 北京集训DAY3
北京集训DAY2
北京集训DAY1
51Nod 1422 沙拉酱前缀二分查找
 51Nod 1109 01组成的N的倍数
 51Nod 1043 幸运号码数位DP
原文地址：https://www.cnblogs.com/sky-heaven/p/9549482.html

最新文章
博弈论
 环套树
 图论
 计算几何
 咂题
 动态树 LCT
可持久化数据结构
 线性基
 概率与期望
 NOI2015 迟来的测试，及时的总结

Linux内核最新的连续内存分配器(CMA)——避免预留大块内存【转】

声明连续内存

申请连续内存

释放连续内存

内核内存分配的migratetype