• SLUB object allocation


    The kmem_cache structure is defined as follows:

    62struct kmem_cache {
    63    struct kmem_cache_cpu __percpu *cpu_slab;
    64    /* Used for retriving partial slabs etc */
    65    unsigned long flags;
    66    unsigned long min_partial;
    67    int size;        /* The size of an object including meta data */
    68    int object_size;    /* The size of an object without meta data */
    69    int offset;        /* Free pointer offset. */
    70    int cpu_partial;    /* Number of per cpu partial objects to keep around */
    71    struct kmem_cache_order_objects oo;
    72
    73    /* Allocation and freeing of slabs */
    74    struct kmem_cache_order_objects max;
    75    struct kmem_cache_order_objects min;
    76    gfp_t allocflags;    /* gfp flags to use on each alloc */
    77    int refcount;        /* Refcount for slab cache destroy */
    78    void (*ctor)(void *);
    79    int inuse;        /* Offset to metadata */
    80    int align;        /* Alignment */
    81    int reserved;        /* Reserved bytes at the end of slabs */
    82    const char *name;    /* Name (only for display!) */
    83    struct list_head list;    /* List of slab caches */
    84    int red_left_pad;    /* Left redzone padding size */
    85#ifdef CONFIG_SYSFS
    86    struct kobject kobj;    /* For sysfs */
    87#endif
    88#ifdef CONFIG_MEMCG_KMEM
    89    struct memcg_cache_params memcg_params;
    90    int max_attr_size; /* for propagation, maximum size of a stored attr */
    91#ifdef CONFIG_SYSFS
    92    struct kset *memcg_kset;
    93#endif
    94#endif
    95
    96#ifdef CONFIG_NUMA
    97    /*
    98     * Defragmentation by allocating from a remote node.
    99     */
    100    int remote_node_defrag_ratio;
    101#endif
    102
    103#ifdef CONFIG_KASAN
    104    struct kasan_cache kasan_info;
    105#endif
    106
    107    struct kmem_cache_node *node[MAX_NUMNODES];
    108};

    kmem_cache_cpu is defined as follows:

    40struct kmem_cache_cpu {
    41    void **freelist;    /* Pointer to next available object */
    42    unsigned long tid;    /* Globally unique transaction id */
    43    struct page *page;    /* The slab from which we are allocating */
    44    struct page *partial;    /* Partially allocated frozen slabs */
    45#ifdef CONFIG_SLUB_STATS
    46    unsigned stat[NR_SLUB_STAT_ITEMS];
    47#endif
    48};

    kmem_cache_node is defined as follows:

    326struct kmem_cache_node {
    327    spinlock_t list_lock;
    328
    329#ifdef CONFIG_SLAB
    330    struct list_head slabs_partial;    /* partial list first, better asm code */
    331    struct list_head slabs_full;
    332    struct list_head slabs_free;
    333    unsigned long free_objects;
    334    unsigned int free_limit;
    335    unsigned int colour_next;    /* Per-node cache coloring */
    336    struct array_cache *shared;    /* shared per node */
    337    struct alien_cache **alien;    /* on other nodes */
    338    unsigned long next_reap;    /* updated without locking */
    339    int free_touched;        /* updated without locking */
    340#endif
    341
    342#ifdef CONFIG_SLUB
    343    unsigned long nr_partial;
    344    struct list_head partial;
    345#ifdef CONFIG_SLUB_DEBUG
    346    atomic_long_t nr_slabs;
    347    atomic_long_t total_objects;
    348    struct list_head full;
    349#endif
    350#endif
    351
    352};
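
    Informally, the three structures hang together as sketched below. This diagram is only an editorial aid, not code or comments from the kernel source:

    /*
     * kmem_cache
     *   ->cpu_slab (per-CPU kmem_cache_cpu)
     *       ->freelist : next free object in the current slab (fast path)
     *       ->page     : the slab currently being allocated from
     *       ->partial  : per-CPU list of frozen, partially used slabs
     *   ->node[nid] (per-node kmem_cache_node)
     *       ->partial  : per-node list of partially used slabs,
     *                    protected by n->list_lock
     */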

    In summary, when SLUB allocates an object it first looks at c->freelist. If that is empty, it transfers the objects of c->page onto the freelist (get_freelist). If that still yields nothing, it looks in c->partial; failing that, in node->partial; failing that, on neighboring NUMA nodes; and if nothing is found anywhere, it allocates a brand-new slab.
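
    For orientation, this whole path sits underneath the ordinary slab API. The sketch below is a hypothetical, minimal usage example (the struct foo type and the cache name are invented for illustration); kmem_cache_alloc() is what eventually reaches the slab_alloc_node() walked through next:

    #include <linux/init.h>
    #include <linux/list.h>
    #include <linux/slab.h>

    struct foo {
        int a;
        struct list_head list;
    };

    static struct kmem_cache *foo_cache;

    static int foo_cache_init(void)
    {
        /* Create a dedicated cache of struct foo objects. */
        foo_cache = kmem_cache_create("foo_cache", sizeof(struct foo),
                          0, SLAB_HWCACHE_ALIGN, NULL);
        if (!foo_cache)
            return -ENOMEM;
        return 0;
    }

    static void foo_use(void)
    {
        /* Fast path pops c->freelist; otherwise __slab_alloc() runs. */
        struct foo *p = kmem_cache_alloc(foo_cache, GFP_KERNEL);

        if (p)
            kmem_cache_free(foo_cache, p);
    }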

    Let's walk through this flow in the code.

    2668static __always_inline void *slab_alloc_node(struct kmem_cache *s,
    2669        gfp_t gfpflags, int node, unsigned long addr)
    2670{
    2671    void *object;
    2672    struct kmem_cache_cpu *c;
    2673    struct page *page;
    2674    unsigned long tid;
    2675
    2676    s = slab_pre_alloc_hook(s, gfpflags);
    2677    if (!s)
    2678        return NULL;
    2679redo:
    2680    /*
    2681     * Must read kmem_cache cpu data via this cpu ptr. Preemption is
    2682     * enabled. We may switch back and forth between cpus while
    2683     * reading from one cpu area. That does not matter as long
    2684     * as we end up on the original cpu again when doing the cmpxchg.
    2685     *
    2686     * We should guarantee that tid and kmem_cache are retrieved on
    2687     * the same cpu. It could be different if CONFIG_PREEMPT so we need
    2688     * to check if it is matched or not.
    2689     */
    2690    do {
    2691        tid = this_cpu_read(s->cpu_slab->tid);
    2692        c = raw_cpu_ptr(s->cpu_slab);
    2693    } while (IS_ENABLED(CONFIG_PREEMPT) &&
    2694         unlikely(tid != READ_ONCE(c->tid)));
    2695
    2696    /*
    2697     * Irqless object alloc/free algorithm used here depends on sequence
    2698     * of fetching cpu_slab's data. tid should be fetched before anything
    2699     * on c to guarantee that object and page associated with previous tid
    2700     * won't be used with current tid. If we fetch tid first, object and
    2701     * page could be one associated with next tid and our alloc/free
    2702     * request will be failed. In this case, we will retry. So, no problem.
    2703     */
    2704    barrier();
    2705
    2706    /*
    2707     * The transaction ids are globally unique per cpu and per operation on
    2708     * a per cpu queue. Thus they can be guarantee that the cmpxchg_double
    2709     * occurs on the right processor and that there was no operation on the
    2710     * linked list in between.
    2711     */
    2712
    2713    object = c->freelist;
    2714    page = c->page;
    2715    if (unlikely(!object || !node_match(page, node))) {
    2716        object = __slab_alloc(s, gfpflags, node, addr, c);
    2717        stat(s, ALLOC_SLOWPATH);
    2718    } else {
    2719        void *next_object = get_freepointer_safe(s, object);

    Lines 2690~2695 guarantee that tid and cpu_slab are read on the same CPU. The barrier at line 2704 enforces the ordering described in the comment above it: tid must be fetched before any other field of c, so that an object and page belonging to a previous tid are never mixed with the current tid. At line 2715, if c->freelist is empty (or the page does not match the requested node), __slab_alloc is called; __slab_alloc disables local interrupts and then calls ___slab_alloc:

    2537static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
    2538              unsigned long addr, struct kmem_cache_cpu *c)
    2539{
    2540    void *freelist;
    2541    struct page *page;
    2542
    2543    page = c->page;
    2544    if (!page)
    2545        goto new_slab;
    2546redo:
    2547
    ...
    2574
    2575    /* must check again c->freelist in case of cpu migration or IRQ */
    2576    freelist = c->freelist;
    2577    if (freelist)
    2578        goto load_freelist;
    2579
    2580    freelist = get_freelist(s, page);
    2581
    2582    if (!freelist) {
    2583        c->page = NULL;
    2584        stat(s, DEACTIVATE_BYPASS);
    2585        goto new_slab;
    2586    }
    2587
    2588    stat(s, ALLOC_REFILL);
    2589
    2590load_freelist:
    2591    /*
    2592     * freelist is pointing to the list of objects to be used.
    2593     * page is pointing to the page from which the objects are obtained.
    2594     * That page must be frozen for per cpu allocations to work.
    2595     */
    2596    VM_BUG_ON(!c->page->frozen);
    2597    c->freelist = get_freepointer(s, freelist);
    2598    c->tid = next_tid(c->tid);
    2599    return freelist;
    2600
    2601new_slab:
    2602
    2603    if (c->partial) {
    2604        page = c->page = c->partial;
    2605        c->partial = page->next;
    2606        stat(s, CPU_PARTIAL_ALLOC);
    2607        c->freelist = NULL;
    2608        goto redo;
    2609    }
    2610
    2611    freelist = new_slab_objects(s, gfpflags, node, &c);
    2612
    2613    if (unlikely(!freelist)) {
    2614        slab_out_of_memory(s, gfpflags, node);
    2615        return NULL;
    2616    }
    2617
    2618    page = c->page;
    2619    if (likely(!kmem_cache_debug(s) && pfmemalloc_match(page, gfpflags)))
    2620        goto load_freelist;
    2621
    2622    /* Only entered in the debug case */
    2623    if (kmem_cache_debug(s) &&
    2624            !alloc_debug_processing(s, page, freelist, addr))
    2625        goto new_slab;    /* Slab failed checks. Next slab needed */
    2626
    2627    deactivate_slab(s, page, get_freepointer(s, freelist));
    2628    c->page = NULL;
    2629    c->freelist = NULL;
    2630    return freelist;
    2631}
    ___slab_alloc is the main allocation path. Because the task may have migrated to another CPU, or an interrupt may have refilled the freelist in the meantime, it checks c->freelist again. If it is still empty, get_freelist grabs all objects from c->page: it sets c->page->freelist to NULL and updates the page's frozen and inuse fields, with inuse set to page->objects, meaning every object of that slab now belongs to this CPU until the slab is used up.
    get_freelist is implemented as follows:
    2494static inline void *get_freelist(struct kmem_cache *s, struct page *page)
    2495{
    2496    struct page new;
    2497    unsigned long counters;
    2498    void *freelist;
    2499
    2500    do {
    2501        freelist = page->freelist;
    2502        counters = page->counters;
    2503
    2504        new.counters = counters;
    2505        VM_BUG_ON(!new.frozen);
    2506
    2507        new.inuse = page->objects;
    2508        new.frozen = freelist != NULL;
    2509
    2510    } while (!__cmpxchg_double_slab(s, page,
    2511        freelist, counters,
    2512        NULL, new.counters,
    2513        "get_freelist"));
    2514
    2515    return freelist;
    2516}
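
    Both get_freelist() above and acquire_slab() below rely on __cmpxchg_double_slab(), which atomically swaps the page's (freelist, counters) pair. The sketch below only illustrates its semantics and is not the kernel implementation, which uses cmpxchg_double()/cmpxchg16b where available and otherwise falls back to taking the slab page lock:

    /*
     * Semantics sketch: replace the (freelist, counters) pair only if both
     * still hold the expected old values, as one atomic step.
     */
    static bool cmpxchg_double_sketch(void **freelist_p, unsigned long *counters_p,
                      void *old_f, unsigned long old_c,
                      void *new_f, unsigned long new_c)
    {
        if (*freelist_p == old_f && *counters_p == old_c) {
            *freelist_p = new_f;
            *counters_p = new_c;
            return true;    /* caller's do/while loop can stop retrying */
        }
        return false;       /* someone else changed the page; retry */
    }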
    If no object can be taken from c->page either, control jumps to the new_slab: label. It first checks whether c->partial has a slab; if so, c->page is pointed at that partial page, c->partial advances to page->next, c->freelist is cleared and the refill is retried (goto redo). If there is no per-CPU partial slab, it falls through to new_slab_objects:
    2442static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags,
    2443            int node, struct kmem_cache_cpu **pc)
    2444{
    2445    void *freelist;
    2446    struct kmem_cache_cpu *c = *pc;
    2447    struct page *page;
    2448
    2449    freelist = get_partial(s, flags, node, c);
    2450
    2451    if (freelist)
    2452        return freelist;
    2453
    2454    page = new_slab(s, flags, node);
    2455    if (page) {
    2456        c = raw_cpu_ptr(s->cpu_slab);
    2457        if (c->page)
    2458            flush_slab(s, c);
    2459
    2460        /*
    2461         * No other reference to the page yet so we can
    2462         * muck around with it freely without cmpxchg
    2463         */
    2464        freelist = page->freelist;
    2465        page->freelist = NULL;
    2466
    2467        stat(s, ALLOC_SLAB);
    2468        c->page = page;
    2469        *pc = c;
    2470    } else
    2471        freelist = NULL;
    2472
    2473    return freelist;
    2474}
    get_partial first searches the node matching the request; if nothing is found there, it looks for objects on the other nearby nodes. get_partial_node:
    1845static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,
    1846                struct kmem_cache_cpu *c, gfp_t flags)
    1847{
    1848    struct page *page, *page2;
    1849    void *object = NULL;
    1850    int available = 0;
    1851    int objects;
    1852
    1853    /*
    1854     * Racy check. If we mistakenly see no partial slabs then we
    1855     * just allocate an empty slab. If we mistakenly try to get a
    1856     * partial slab and there is none available then get_partials()
    1857     * will return NULL.
    1858     */
    1859    if (!n || !n->nr_partial)
    1860        return NULL;
    1861
    1862    spin_lock(&n->list_lock);
    1863    list_for_each_entry_safe(page, page2, &n->partial, lru) {
    1864        void *t;
    1865
    1866        if (!pfmemalloc_match(page, flags))
    1867            continue;
    1868
    1869        t = acquire_slab(s, n, page, object == NULL, &objects);
    1870        if (!t)
    1871            break;
    1872
    1873        available += objects;
    1874        if (!object) {
    1875            c->page = page;
    1876            stat(s, ALLOC_FROM_PARTIAL);
    1877            object = t;
    1878        } else {
    1879            put_cpu_partial(s, page, 0);
    1880            stat(s, CPU_PARTIAL_NODE);
    1881        }
    1882        if (!kmem_cache_has_cpu_partial(s)
    1883            || available > s->cpu_partial / 2)
    1884            break;
    1885
    1886    }
    1887    spin_unlock(&n->list_lock);
    1888    return object;
    1889}

    This function takes several slabs in a row from the node's partial list, freezes them, removes them from the node's partial list and moves them to the per-CPU side, until the number of available objects exceeds s->cpu_partial / 2. The first slab acquired is installed directly as c->page; any further ones are pushed onto c->partial via put_cpu_partial.
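    As a concrete, made-up example: if s->cpu_partial is 6 and each partial slab on the node happens to have 2 free objects, the first slab acquired becomes c->page (available = 2, not yet > 6/2 = 3), the second is parked on the per-CPU partial list via put_cpu_partial (available = 4 > 3), and the loop breaks there.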

    If nothing is found on this node, the search moves on to neighboring nodes; that path is not covered here.

    acquire_slab:

    1799static inline void *acquire_slab(struct kmem_cache *s,
    1800        struct kmem_cache_node *n, struct page *page,
    1801        int mode, int *objects)
    1802{
    1803    void *freelist;
    1804    unsigned long counters;
    1805    struct page new;
    1806
    1807    lockdep_assert_held(&n->list_lock);
    1808
    1809    /*
    1810     * Zap the freelist and set the frozen bit.
    1811     * The old freelist is the list of objects for the
    1812     * per cpu allocation list.
    1813     */
    1814    freelist = page->freelist;
    1815    counters = page->counters;
    1816    new.counters = counters;
    1817    *objects = new.objects - new.inuse;
    1818    if (mode) {
    1819        new.inuse = page->objects;
    1820        new.freelist = NULL;
    1821    } else {
    1822        new.freelist = freelist;
    1823    }
    1824
    1825    VM_BUG_ON(new.frozen);
    1826    new.frozen = 1;
    1827
    1828    if (!__cmpxchg_double_slab(s, page,
    1829            freelist, counters,
    1830            new.freelist, new.counters,
    1831            "acquire_slab"))
    1832        return NULL;
    1833
    1834    remove_partial(n, page);
    1835    WARN_ON(!freelist);
    1836    return freelist;
    1837}

    One small detail here: while object is still NULL (mode == true), acquire_slab sets page->freelist to NULL to mark the slab as in use; once an object has already been found, the slab instead keeps its freelist and put_cpu_partial links it in by pointing page->next at the previous c->partial, so the per-CPU partial list is never broken.

    put_cpu_partial:
    2265static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
    2266{
    2267#ifdef CONFIG_SLUB_CPU_PARTIAL
    2268    struct page *oldpage;
    2269    int pages;
    2270    int pobjects;
    2271
    2272    preempt_disable();
    2273    do {
    2274        pages = 0;
    2275        pobjects = 0;
    2276        oldpage = this_cpu_read(s->cpu_slab->partial);
    2277
    2278        if (oldpage) {
    2279            pobjects = oldpage->pobjects;
    2280            pages = oldpage->pages;
    2281            if (drain && pobjects > s->cpu_partial) {
    2282                unsigned long flags;
    2283                /*
    2284                 * partial array is full. Move the existing
    2285                 * set to the per node partial list.
    2286                 */
    2287                local_irq_save(flags);
    2288                unfreeze_partials(s, this_cpu_ptr(s->cpu_slab));
    2289                local_irq_restore(flags);
    2290                oldpage = NULL;
    2291                pobjects = 0;
    2292                pages = 0;
    2293                stat(s, CPU_PARTIAL_DRAIN);
    2294            }
    2295        }
    2296
    2297        pages++;
    2298        pobjects += page->objects - page->inuse;
    2299
    2300        page->pages = pages;
    2301        page->pobjects = pobjects;
    2302        page->next = oldpage;
    2303
    2304    } while (this_cpu_cmpxchg(s->cpu_slab->partial, oldpage, page)
    2305                                != oldpage);
    2306    if (unlikely(!s->cpu_partial)) {
    2307        unsigned long flags;
    2308
    2309        local_irq_save(flags);
    2310        unfreeze_partials(s, this_cpu_ptr(s->cpu_slab));
    2311        local_irq_restore(flags);
    2312    }
    2313    preempt_enable();
    2314#endif
    2315}
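
    After these prepends, the per-CPU partial list is threaded through page->next, and the newest head page caches the running pages/pobjects totals used by the drain check above. An informal sketch with made-up numbers, not kernel output:

    /*
     * c->partial -> pageB { pages = 2, pobjects = 7, next }
     *                    -> pageA { pages = 1, pobjects = 3, next = NULL }
     *
     * pageA was added first (3 free objects), pageB second (4 free objects),
     * so pageB's counters hold the cumulative totals for the whole list.
     */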

    If no usable object is found on any node, memory is requested from the page allocator and a new slab is created.

    1581static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
    1582{
    1583    struct page *page;
    1584    struct kmem_cache_order_objects oo = s->oo;
    1585    gfp_t alloc_gfp;
    1586    void *start, *p;
    1587    int idx, order;
    1588
    1589    flags &= gfp_allowed_mask;
    1590
    1591    if (gfpflags_allow_blocking(flags))
    1592        local_irq_enable();
    1593
    1594    flags |= s->allocflags;
    1595
    1596    /*
    1597     * Let the initial higher-order allocation fail under memory pressure
    1598     * so we fall-back to the minimum order allocation.
    1599     */
    1600    alloc_gfp = (flags | __GFP_NOWARN | __GFP_NORETRY) & ~__GFP_NOFAIL;
    1601    if ((alloc_gfp & __GFP_DIRECT_RECLAIM) && oo_order(oo) > oo_order(s->min))
    1602        alloc_gfp = (alloc_gfp | __GFP_NOMEMALLOC) & ~__GFP_DIRECT_RECLAIM;
    1603
    1604    page = alloc_slab_page(s, alloc_gfp, node, oo);
    1605    if (unlikely(!page)) {
    1606        oo = s->min;
    1607        alloc_gfp = flags;
    1608        /*
    1609         * Allocation may have failed due to fragmentation.
    1610         * Try a lower order alloc if possible
    1611         */
    1612        page = alloc_slab_page(s, alloc_gfp, node, oo);
    1613        if (unlikely(!page))
    1614            goto out;
    1615        stat(s, ORDER_FALLBACK);
    1616    }
    1617
         ...
    1634    page->objects = oo_objects(oo);
    1635
    1636    order = compound_order(page);
    1637    page->slab_cache = s;
    1638    __SetPageSlab(page);
    1639    if (page_is_pfmemalloc(page))
    1640        SetPageSlabPfmemalloc(page);
    1641
    1642    start = page_address(page);
    1643
    1644    if (unlikely(s->flags & SLAB_POISON))
    1645        memset(start, POISON_INUSE, PAGE_SIZE << order);
    1646
    1647    kasan_poison_slab(page);
    1648
    1649    for_each_object_idx(p, idx, s, start, page->objects) {
    1650        setup_object(s, page, p);
    1651        if (likely(idx < page->objects))
    1652            set_freepointer(s, p, p + s->size);
    1653        else
    1654            set_freepointer(s, p, NULL);
    1655    }
    1656
    1657    page->freelist = fixup_red_left(s, start);
    1658    page->inuse = page->objects;
    1659    page->frozen = 1;
    1660
    1661out:
    1662    if (gfpflags_allow_blocking(flags))
    1663        local_irq_disable();
    1664    if (!page)
    1665        return NULL;
    1666
    1667    mod_zone_page_state(page_zone(page),
    1668        (s->flags & SLAB_RECLAIM_ACCOUNT) ?
    1669        NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
    1670        1 << oo_order(oo));
    1671
    1672    inc_slabs_node(s, page_to_nid(page), page->objects);
    1673
    1674    return page;
    1675}

    It first allocates a compound page of 2^order pages (falling back to the minimum order if that fails), then freezes the slab and initializes its objects: at each object's free-pointer offset it writes the address of the next free object. Back in new_slab_objects, c->page is set to the freshly allocated page and page->freelist is set to NULL to mark it as in use.
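
    In other words, the freelist is an intrusive linked list: each free object stores, at byte offset s->offset inside itself, the address of the next free object. Below is a simplified sketch of the two helpers used above, not the exact kernel code (the real versions also handle debug and hardened variants):

    /* Read the "next free object" pointer stored inside a free object. */
    static inline void *get_freepointer_sketch(struct kmem_cache *s, void *object)
    {
        return *(void **)((char *)object + s->offset);
    }

    /* Store the "next free object" pointer inside a free object. */
    static inline void set_freepointer_sketch(struct kmem_cache *s, void *object, void *fp)
    {
        *(void **)((char *)object + s->offset) = fp;
    }

    /*
     * Resulting layout of a freshly allocated slab (conceptual):
     *
     *   page->freelist -> obj0 -> obj1 -> ... -> objN-1 -> NULL
     *   (each arrow is the pointer stored at object + s->offset)
     */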

    That completes the SLUB object allocation path.

     
     
  • Original article: https://www.cnblogs.com/chaozhu/p/10157241.html