• 重提基数排序


    重提基数排序

    在此之前,我已尝试过两次基数排序的方法:LSDMSD。 
    我的主要改进点在于每次“申请”大块存储器,而不是采用最原始的链表。 
    当然这种形式本质上还是链表,只是每个节点就是一个页面。 
    在存储器申请/释放上,开始时一次申请/结束时一次释放,避免了一次一数字时的malloc/free调用的代价。

    但是,缺点还是存在的,主要在于不够缓存友好。 
    看一下结果就很容易明白缓存友好的重要性了。

    主要数据结构:

    const __int32 TFSI = 1024*1024*500;
    const int PAGEAMOUNT = 4096;
    const int PAGEGRANULAR = PAGEAMOUNT/sizeof(int);
    const int TERMINATOR = -1;
    const int BUCKETSLOTCOUNT = 256;

    typedef struct tagPageList{
    int * PagePtr;
    struct tagPageList * next;
    }PageList;

    typedef struct tagBucket{
    int * currentPagePtr;
    int offset;
    PageList pl;
    PageList * currentPageListItem;
    }Bucket;

    void MakeSure(pmhBool s){
    if (s == pmhFalse) __debugbreak();
    }

    复制代码
    const __int32 TFSI = 1024*1024*500;
    const int PAGEAMOUNT = 4096;
    const int PAGEGRANULAR = PAGEAMOUNT/sizeof(int);
    const int TERMINATOR = -1;
    const int BUCKETSLOTCOUNT = 256;
    
    typedef struct tagPageList{
        int * PagePtr;
        struct tagPageList * next;
    }PageList;
    
    typedef struct tagBucket{
        int * currentPagePtr;
        int offset;
        PageList pl;
        PageList * currentPageListItem;
    }Bucket;
    
    void MakeSure(pmhBool s){
        if (s == pmhFalse) __debugbreak();
    }
    复制代码

    排序函数:

    int LSD_radix_sort_R2(){
    HANDLE heap = NULL;
    Bucket bucket[BUCKETSLOTCOUNT];
    PageList * pageListPool;
    int plpAvailable = 0;
    int * pages = NULL;
    int * pagesAvailable = NULL;

    typedef unsigned short ElementType;
    ElementType * s;

    time_t timeBegin;
    time_t timeEnd;

    //pages = (int * )VirtualAlloc(NULL, (TFSI/PAGEGRANULAR + BUCKETSLOTCOUNT + 8) * PAGEAMOUNT, MEM_COMMIT | MEM_RESERVE, PAGE_READWRITE);
    //int a = GetLastError();
    //pageListPool = (PageList *)VirtualAlloc(NULL, (TFSI/PAGEGRANULAR + 8) * sizeof(PageList), MEM_COMMIT | MEM_RESERVE, PAGE_READWRITE);
    //s = (ElementType *)VirtualAlloc(NULL, TFSI*sizeof(ElementType), MEM_COMMIT | MEM_RESERVE, PAGE_READWRITE);

    heap = HeapCreate(HEAP_NO_SERIALIZE|HEAP_GENERATE_EXCEPTIONS, 1024*1024, 0);
    if (heap != NULL){
    pages = (int * )HeapAlloc(heap, 0, (TFSI/PAGEGRANULAR + BUCKETSLOTCOUNT + 8) * PAGEAMOUNT);
    pageListPool = (PageList *)HeapAlloc(heap, 0, (TFSI/PAGEGRANULAR + 8) * sizeof(PageList));
    s = (ElementType *)HeapAlloc(heap, 0, TFSI*sizeof(ElementType));
    }
    MakeSure(pages != NULL && pageListPool != NULL && s != NULL);

    timeBegin = clock();
    for (int i=0; i<TFSI; i++) s[i] = rand();
    timeEnd = clock();
    printf(" %f(s) consumed in generating numbers", (double)(timeEnd-timeBegin)/CLOCKS_PER_SEC);

    timeBegin = clock();

    for (int t=0; t<sizeof(ElementType); t++){
    FillMemory(pages, (TFSI/PAGEGRANULAR + BUCKETSLOTCOUNT + 8) * PAGEAMOUNT, 0xff);
    SecureZeroMemory(pageListPool, (TFSI/PAGEGRANULAR + 8) * sizeof(PageList));
    pagesAvailable = pages;
    plpAvailable = 0;

    for(int i=0; i<256; i++){
    bucket[i].currentPagePtr = pagesAvailable;
    bucket[i].offset = 0;
    bucket[i].pl.PagePtr = pagesAvailable;
    bucket[i].pl.next = NULL;
    pagesAvailable += PAGEGRANULAR;
    bucket[i].currentPageListItem = &(bucket[i].pl);
    }

    int bucketIdx;
    for (int i=0; i<TFSI; i++){
    bucketIdx = (s[i]>>t*8)&0xff;
    //save(bucketIdx, objIdx[i]);
    bucket[bucketIdx].currentPagePtr[ bucket[bucketIdx].offset ] = s[i];
    bucket[bucketIdx].offset++;
    if (bucket[bucketIdx].offset == PAGEGRANULAR){
    bucket[bucketIdx].currentPageListItem->next = &pageListPool[plpAvailable];
    plpAvailable++;
    bucket[bucketIdx].currentPageListItem->next->PagePtr = pagesAvailable;
    bucket[bucketIdx].currentPageListItem->next->next = NULL;

    bucket[bucketIdx].currentPagePtr = pagesAvailable;
    bucket[bucketIdx].offset = 0;
    pagesAvailable += PAGEGRANULAR;

    bucket[bucketIdx].currentPageListItem = bucket[bucketIdx].currentPageListItem->next;
    }
    }

    //update objIdx index
    int start = 0;
    for (int i=0; i<256; i++){
    PageList * p;
    p = &(bucket[i].pl);
    while (p){
    for (int t=0; t<PAGEGRANULAR; t++){
    int idx = p->PagePtr[t];
    if (idx != TERMINATOR){
    s[start] = idx;
    start++;
    }
    if (idx == TERMINATOR) break;
    }
    p = p->next;
    }
    }
    }

    timeEnd = clock();
    printf(" %f(s) consumed in generating results", (double)(timeEnd-timeBegin)/CLOCKS_PER_SEC);

    HeapFree(heap, 0, pages);
    HeapFree(heap, 0, pageListPool);
    HeapFree(heap, 0, s);
    HeapDestroy(heap);

    return 0;
    }

    复制代码
    int LSD_radix_sort_R2(){
        HANDLE heap = NULL;
        Bucket bucket[BUCKETSLOTCOUNT];
        PageList * pageListPool;
        int plpAvailable = 0;
        int * pages = NULL;
        int * pagesAvailable = NULL;
    
        typedef unsigned short ElementType;
        ElementType * s;
    
        time_t timeBegin;
        time_t timeEnd;
    
        //pages = (int * )VirtualAlloc(NULL, (TFSI/PAGEGRANULAR + BUCKETSLOTCOUNT + 8) * PAGEAMOUNT, MEM_COMMIT | MEM_RESERVE, PAGE_READWRITE);
        //int a = GetLastError();
        //pageListPool = (PageList *)VirtualAlloc(NULL, (TFSI/PAGEGRANULAR + 8) * sizeof(PageList),  MEM_COMMIT | MEM_RESERVE, PAGE_READWRITE);
        //s = (ElementType *)VirtualAlloc(NULL, TFSI*sizeof(ElementType), MEM_COMMIT | MEM_RESERVE, PAGE_READWRITE);
    
        heap = HeapCreate(HEAP_NO_SERIALIZE|HEAP_GENERATE_EXCEPTIONS, 1024*1024, 0);
        if (heap != NULL){
            pages = (int * )HeapAlloc(heap, 0, (TFSI/PAGEGRANULAR + BUCKETSLOTCOUNT + 8) * PAGEAMOUNT);
            pageListPool = (PageList *)HeapAlloc(heap, 0, (TFSI/PAGEGRANULAR + 8) * sizeof(PageList));
            s = (ElementType *)HeapAlloc(heap, 0, TFSI*sizeof(ElementType));
        }
        MakeSure(pages != NULL && pageListPool != NULL && s != NULL);
    
        timeBegin = clock();
        for (int i=0; i<TFSI; i++) s[i] = rand();
        timeEnd = clock();
        printf("
    %f(s) consumed in generating numbers", (double)(timeEnd-timeBegin)/CLOCKS_PER_SEC);
        
        timeBegin = clock();
    
        for (int t=0; t<sizeof(ElementType); t++){
            FillMemory(pages, (TFSI/PAGEGRANULAR + BUCKETSLOTCOUNT + 8) * PAGEAMOUNT, 0xff);
            SecureZeroMemory(pageListPool, (TFSI/PAGEGRANULAR + 8) * sizeof(PageList));
            pagesAvailable = pages;
            plpAvailable = 0;
    
            for(int i=0; i<256; i++){
                bucket[i].currentPagePtr = pagesAvailable;
                bucket[i].offset = 0;
                bucket[i].pl.PagePtr = pagesAvailable;
                bucket[i].pl.next = NULL;
                pagesAvailable += PAGEGRANULAR;
                bucket[i].currentPageListItem = &(bucket[i].pl);
            }
    
            int bucketIdx;
            for (int i=0; i<TFSI; i++){
                bucketIdx = (s[i]>>t*8)&0xff;
                //save(bucketIdx, objIdx[i]);
                bucket[bucketIdx].currentPagePtr[ bucket[bucketIdx].offset ] = s[i];
                bucket[bucketIdx].offset++;
                if (bucket[bucketIdx].offset == PAGEGRANULAR){
                    bucket[bucketIdx].currentPageListItem->next = &pageListPool[plpAvailable];
                    plpAvailable++;
                    bucket[bucketIdx].currentPageListItem->next->PagePtr = pagesAvailable;
                    bucket[bucketIdx].currentPageListItem->next->next = NULL;
                    
                    bucket[bucketIdx].currentPagePtr = pagesAvailable;
                    bucket[bucketIdx].offset = 0;
                    pagesAvailable += PAGEGRANULAR;
                    
                    bucket[bucketIdx].currentPageListItem = bucket[bucketIdx].currentPageListItem->next;
                }
            }
    
            //update objIdx index
            int start = 0;
            for (int i=0; i<256; i++){
                PageList * p;
                p = &(bucket[i].pl);
                while (p){
                    for (int t=0; t<PAGEGRANULAR; t++){
                        int idx = p->PagePtr[t];
                        if (idx != TERMINATOR){
                            s[start] = idx;
                            start++;
                        }
                        if (idx == TERMINATOR) break;
                    }
                    p = p->next;
                }
            }
        }
    
        timeEnd = clock();
        printf("
    %f(s) consumed in generating results", (double)(timeEnd-timeBegin)/CLOCKS_PER_SEC);
        
        HeapFree(heap, 0, pages);
        HeapFree(heap, 0, pageListPool);
        HeapFree(heap, 0, s);
        HeapDestroy(heap);    
        
        return 0;
    }
    复制代码

    随机生成5亿个 short,排序结果如图

    上两次最快的速度是 1亿个 short 4.563秒。
    这次5亿个5.454s,折合一亿个 1.09s,事实上直接运行一亿个数字排序的话可能要少于这个数字。

    近5倍的差距来源在于缓存的利用上面。

    上两次排的都是数字的序号,而不是真正的数字,这样做是因为我要最终完成二维表排序。

    可想而知,这样访问一个数字的话需要两个步骤: 
    1、先取到序号,这需要访问一次数组元素 
    2、按第一步提供的序号,访问排序数组内元素

    这样可真的就是随机访问存储了! 
    缺陷就是很差的存储器访问效率。

    当然,如果使用自索引排序[1](self-indexed sort),必然更快,不过那样我就不能拿来扩展二维表排序了。

    在The Algorithm Design Manual 上面,作者提到Pennysort的一个记录:$760的机器上排序32GB耗时1679s。 
    这个我也写了个大致方法: 
    设置master slave缓冲区, 
    一次排序5亿个整型:cpu 
    缓冲5亿个整型:I/O

    复制代码
    CreateThread fillmaster suspended
    CreateThread radix_sort suspended
    Resume all the threads
    
    FillMaster{
        Load first data-slice to the master;
        while (true){
            FillSlave;
            Radix_sort;
            WaitforMultipleObjects(Fillslave, radix_sort);
            swapPointer(master, slave);
            if (finished) break;
        }
        Radix_sort the last slice;
    }
    复制代码

    大致估计下时间:排序和I/O操作的最长一方决定了一轮的时间。 
    如果划分32GB为32个小文件,大概 32 * I/O时间 就是总共排序32GB的时间。(i/o一般是最慢的) 
    不过读写文件也有多种方式,BSIS_PennySort_2006的描述中提到了 IO完成端口(IO completion port)有着顺序读尽两倍的速度,这么一来,读写文件的速度有望翻倍。 
    不过这部分我就没有继续试验了。

    PennySort的网址可以参考 http://sortbenchmark.org/

    BSIS的描述文本 http://sortbenchmark.org/BSIS-PennySort_2006.pdf

    Computer Organization and Design: The Hardware/Software Interface中的一段话,作者的结论是理解存储器层次原理是理解的当今计算机性能的关键。

     

    Reference: 
    [1]Yingxu Wang.A New Sort Algorithm: Self-Indexed Sort.Communications of ACM SIGPALN, 1996,Vol.31, No.3, March:28-36

     
     
    分类: C/C++
  • 相关阅读:
    [Javascript] Broadcaster + Operator + Listener pattern -- 3 Stop with condition
    分布式事务科普(初识篇)
    分布式事务不理解?一次给你讲清楚!
    分布式事务,有解吗?
    分布式事务精华总结篇,实打实的干货!
    常用的分布式事务解决方案介绍有多少种?
    5种分布式事务解决方案优缺点对比
    Leaf——美团点评分布式ID生成系统
    MySQL分区总结
    互联网公司为啥基本不使用mysql分区表
  • 原文地址:https://www.cnblogs.com/Leo_wl/p/3414218.html
Copyright © 2020-2023  润新知