• C++ 大规模数据排序(100G数据 使用 4G 内存 排序)


    思路很简单,先分段排序,存储到临时文件中,然后合并.

    使用10000个整数来模拟大数据,每次读取100个到内存中.

    #include <stdint.h>
    #include <stdlib.h>
    #include <stdio.h>
    
    enum
    {
        enmMaxFileNameLen = 256,
    };
    
    void SaveArrToFile(int32_t arrInt[], int32_t arrSize, const char *fileName);
    void ReadArrFromFile(int32_t arrInt[], int32_t &arrSize, const int32_t amxArrSize, const char *fileName);
    void ReadArrFromFilePtr(int32_t arrInt[], int32_t &arrSize, const int32_t amxArrSize, FILE *fp);
    void RandomGenArrInt(int32_t arrInt[], int32_t arrSize);
    void RandomGenData(int32_t numberCount);
    void QSort(int32_t arrInt[], int32_t start, int32_t end);
    void BigDataSort(const char *fileName, const int32_t maxNumberInMem);
    int32_t Segment(const char *fileName, const int32_t maxNumberInMem);
    int32_t MergeTwoFile(const char *fileName1, const char *fileName2, const char *fileOut);
    void PrintArrInt(int32_t arr[], int32_t arrSize);
    
    int32_t main()
    {
        RandomGenData(10000);
        BigDataSort("data_10000.txt", 100);
        getchar();
        return 0;
    }
    
    void RandomGenArrInt(int32_t arrInt[], int32_t arrSize)
    {
        for (int32_t i = 0; i < arrSize; ++i)
        {
            arrInt[i] = i + 1;
        }
        for (int32_t i = 1; i <= arrSize; ++i)
        {
            int32_t m = rand() % arrSize;
            int32_t n = rand() % arrSize;
            int32_t tmp = arrInt[m];
            arrInt[m] = arrInt[n];
            arrInt[n] = tmp;
        }
    }
    
    void SaveArrToFile(int32_t arrInt[], int32_t arrSize, const char *fileName)
    {
        FILE *fp = NULL;
        fopen_s(&fp, fileName, "w");
        if (!fp)
        {
            printf("open %s failed!
    ", fileName);
            return;
        }
        for (int32_t i = 0; i < arrSize; ++i)
        {
            fprintf_s(fp, "%d,", arrInt[i]);
        }
        fclose(fp);
        printf("save %s 
    ", fileName);
    }
    
    void RandomGenData(int32_t numberCount)
    {
        int32_t *arr = new int32_t[numberCount];
        RandomGenArrInt(arr, numberCount);
        char fileName[enmMaxFileNameLen] = { 0 };
        sprintf_s(fileName, enmMaxFileNameLen,"data_%d.txt", numberCount);
        SaveArrToFile(arr, numberCount, fileName);
    }
    
    void QSort(int32_t arrInt[], int32_t start, int32_t end)
    {
        if (start >= end){ return; }
        int32_t i = start, j = end;
        int32_t tmp = arrInt[i];
        while (i < j)
        {
            while (i < j && tmp < arrInt[j])
            {
                --j;
            }
            arrInt[i] = arrInt[j];
            while (i < j && tmp >= arrInt[i])
            {
                ++i;
            }
            arrInt[j] = arrInt[i];
        }
        arrInt[i] = tmp;
        QSort(arrInt, start, i - 1);
        QSort(arrInt, i + 1, end);
    }
    
    void ReadArrFromFile(int32_t arrInt[], int32_t &arrSize, const int32_t amxArrSize, const char *fileName)
    {
        arrSize = 0;
        FILE *fp = NULL;
        fopen_s(&fp, fileName, "r");
        if (!fp)
        {
            printf("open %s failed!
    ", fileName);
            return;
        }
        while (arrSize < amxArrSize && !feof(fp))
        {
            fscanf_s(fp, "%d,", &arrInt[arrSize++]);
        }
    }
    
    void ReadArrFromFilePtr(int32_t arrInt[], int32_t &arrSize, const int32_t amxArrSize, FILE *fp)
    {
        arrSize = 0;
        while (arrSize < amxArrSize && !feof(fp))
        {
            fscanf_s(fp, "%d,", &arrInt[arrSize]);
            if (!feof(fp))
            {
                ++arrSize;
            }
        }
    }
    
    void BigDataSort(const char *fileName, const int32_t maxNumberInMem)
    {
        int32_t segFileCount = Segment(fileName, maxNumberInMem);
        int32_t fileIndex = 1;
        char fileName1[enmMaxFileNameLen] = { 0 };
        char fileName2[enmMaxFileNameLen] = { 0 };
        char fileOut[enmMaxFileNameLen] = { 0 };
        while (true)
        {
            sprintf_s(fileName1, "%d.txt", fileIndex++);
            sprintf_s(fileName2, "%d.txt", fileIndex++);
            sprintf_s(fileOut, "%d.txt", segFileCount++);
            int32_t ret = MergeTwoFile(fileName1, fileName2, fileOut);
            if (ret != 0)
            {
                break;
            }
        }
    }
    
    int32_t Segment(const char *fileName, const int32_t maxNumberInMem)
    {
        int32_t *arr = new int32_t[maxNumberInMem];
        FILE *fp = NULL;
        fopen_s(&fp, fileName, "r");
        if (!fp)
        {
            printf("open %s failed!
    ", fileName);
            return 0;
        }
        int32_t tmpFileIndex = 1;
        while (true)
        {
            int32_t arrSize = 0;
            ReadArrFromFilePtr(arr, arrSize, maxNumberInMem, fp);
            if (arrSize == 0)
            {
                break;
            }
            QSort(arr, 0, arrSize - 1);
            char tmpFileName[enmMaxFileNameLen] = { 0 };
            sprintf_s(tmpFileName, enmMaxFileNameLen, "%d.txt", tmpFileIndex++);
            SaveArrToFile(arr, arrSize, tmpFileName);
        }
        fclose(fp);
        delete[] arr;
        return tmpFileIndex;
    }
    
    int32_t MergeTwoFile(const char *fileName1, const char *fileName2, const char *fileOut)
    {
        int32_t ret = 1;
        FILE *fp1 = NULL, *fp2 = NULL, *fpOut = NULL;
        fopen_s(&fp1, fileName1, "r");
        fopen_s(&fp2, fileName2, "r");
        fopen_s(&fpOut, fileOut, "w");
        if (!fileOut)
        {
            printf("open %s failed!
    ", fileOut);
            return ret;
        }
        int32_t val1 = 0, val2 = 0;
        if (fp1){ fscanf_s(fp1, "%d,", &val1); }
        if (fp2){ fscanf_s(fp2, "%d,", &val2); }
        while (fp1 && fp2 && !feof(fp1) && !feof(fp2))
        {
            if (val1 < val2)
            {
                // printf("%d ", val1);
                fprintf_s(fpOut, "%d,", val1);
                fscanf_s(fp1, "%d,", &val1);
            }
            else
            {
                // printf("%d ", val2);
                fprintf_s(fpOut, "%d,", val2);
                fscanf_s(fp2, "%d,", &val2);
            }
            ret = 0;
        }
        while (fp1 && !feof(fp1))
        {
            // printf("%d ", val1);
            fprintf_s(fpOut, "%d,", val1);
            fscanf_s(fp1, "%d,", &val1);
        }
        while (fp2 && !feof(fp2))
        {
            // printf("%d ", val2);
            fprintf_s(fpOut, "%d,", val2);
            fscanf_s(fp2, "%d,", &val2);
        }
        if (fp1){ fclose(fp1); }
        if (fp2){ fclose(fp2); }
        fclose(fpOut);
        printf("save %s 
    ", fileOut);
        return ret;
    }
    
    void PrintArrInt(int32_t arr[], int32_t arrSize)
    {
        for (int32_t i = 0; i < arrSize; ++i)
        {
            printf("%d ", arr[i]);
        }
    }
  • 相关阅读:
    【树形DP】ZJOI2008 骑士
    【博弈论】CF 1215D Ticket Game
    【状态压缩DP】HDU 4352 XHXJ'S LIS
    【纯水题】CF 833A The Meaningless Game
    【不知道怎么分类】NOIP2016 蚯蚓
    【状态压缩DP】SCOI2009 围豆豆
    操作系统总结
    概率问题总结
    C++虚函数原理
    一些baidu面经
  • 原文地址:https://www.cnblogs.com/tangxin-blog/p/6814421.html
Copyright © 2020-2023  润新知