• External sort


    Comparison sorts

    Name            Best         Average      Worst        Memory     Stable
    Bubble sort     O(n)         O(n^2)       O(n^2)       O(1)       Yes
    Insertion sort  O(n)         O(n^2)       O(n^2)       O(1)       Yes
    Merge sort      O(n log n)   O(n log n)   O(n log n)   O(n)       Yes
    Heapsort        O(n log n)   O(n log n)   O(n log n)   O(1)       No
    Quicksort       O(n log n)   O(n log n)   O(n^2)       O(log n)   No

    External sort

    External sorting

    external.cpp:

    #include <bits/stdc++.h>
    #define DBG(x) cerr << #x << " = " << x << endl
    
    using namespace std;
    typedef long long LL;
    
    // Flush threshold for one sorted run: the total number of bytes of line
    // text buffered in memory before the run is sorted and written to disk.
    // Only the string lengths are counted, so real memory use is higher.
    const int MAX_MEM = 128 * 1024 * 1024; // 128MB
    // Capacity in bytes of a fixed-size char buffer used for reading text.
    const int BUF_SIZE = 64;
    
    // Writes `strings` back to back, byte for byte, into the file
    // "<file_no>.txt" in the current working directory and returns that name.
    //
    // No separator is inserted between entries: callers that want one entry
    // per line must pass strings that already end in '\n'.
    //
    // Throws std::runtime_error if the file cannot be created or if any write
    // (including the final flush performed by fclose) fails, instead of
    // dereferencing a null FILE* or silently producing a truncated run.
    std::string write_temp_file(int file_no, const std::vector<std::string> &strings) {
        std::string temp_file = std::to_string(file_no) + ".txt";

        FILE *f_temp = std::fopen(temp_file.c_str(), "w");
        if (f_temp == nullptr) {
            throw std::runtime_error("cannot create temp file: " + temp_file);
        }

        // Nothing between fopen and fclose can throw, so the handle cannot leak.
        bool ok = true;
        for (const auto &s : strings) {
            // fwrite (unlike fputs) does not stop at an embedded NUL byte.
            if (std::fwrite(s.data(), 1, s.size(), f_temp) != s.size()) {
                ok = false;
                break;
            }
        }
        // fclose flushes buffered data; a failure here means data was lost.
        if (std::fclose(f_temp) == EOF) {
            ok = false;
        }
        if (!ok) {
            throw std::runtime_error("failed writing temp file: " + temp_file);
        }

        return temp_file;
    }
    
    vector<string> sort_part_to_disk(const string &input_file) {
        vector<string> temp_files;
    
        FILE *f_input = fopen(input_file.c_str(), "r");
    
        int temp_size = 0;
        int file_no = 0;
        char buf[BUF_SIZE];
        vector<string> strings;
    
        while (fgets(buf, sizeof(buf) - 1, f_input) != NULL) {
            string str = buf;
            temp_size += str.size();
            strings.push_back(str);
    
            if (temp_size >= MAX_MEM) {
                sort(strings.begin(), strings.end());
                string temp_file = write_temp_file(file_no, strings);
                temp_files.push_back(temp_file);
    
                file_no++;
                strings.clear();
                temp_size = 0;
            }
        }
    
        if (!strings.empty()) {
            sort(strings.begin(), strings.end());
            string temp_file = write_temp_file(file_no, strings);
            temp_files.push_back(temp_file);
        }
    
        fclose(f_input);
    
        return temp_files;
    }
    
    // Sorts the lines of `input_file` into `output_file` using an external
    // merge sort: sort_part_to_disk produces sorted run files, which are then
    // combined with a k-way merge.
    //
    // The merge keeps at most one pending line per run in a min-heap. After
    // the smallest line is written, only the run it came from is read again,
    // so memory use during the merge is bounded by the number of runs rather
    // than by the size of the data.
    //
    // Every output line is terminated with '\n'. The run files are left on
    // disk after the merge.
    //
    // Throws std::runtime_error if a run file or the output file cannot be
    // opened, or if writing the output fails.
    void external_sort(const std::string &input_file, const std::string &output_file) {
        const std::vector<std::string> temp_files = sort_part_to_disk(input_file);

        std::vector<std::ifstream> runs;
        runs.reserve(temp_files.size());
        for (const auto &file_name : temp_files) {
            runs.emplace_back(file_name);
            if (!runs.back()) {
                throw std::runtime_error("cannot open temp file: " + file_name);
            }
        }

        std::ofstream out(output_file);
        if (!out) {
            throw std::runtime_error("cannot open output file: " + output_file);
        }

        // (line, index of the run it was read from), smallest line on top.
        using Entry = std::pair<std::string, std::size_t>;
        std::priority_queue<Entry, std::vector<Entry>, std::greater<Entry>> heap;

        // Prime the heap with the first line of every run.
        std::string line;
        for (std::size_t i = 0; i < runs.size(); ++i) {
            if (std::getline(runs[i], line)) {
                heap.emplace(std::move(line), i);
            }
        }

        while (!heap.empty()) {
            const std::size_t src = heap.top().second;
            out << heap.top().first << '\n';
            heap.pop();

            // Refill only from the run that just supplied the minimum; every
            // other run still has its current smallest line in the heap.
            if (std::getline(runs[src], line)) {
                heap.emplace(std::move(line), src);
            }
        }

        out.flush();
        if (!out) {
            throw std::runtime_error("failed writing output file: " + output_file);
        }
    }
    
    // Entry point. Usage: external [input_file [output_file]]
    //
    // With no arguments the program sorts "raw.txt" into "output.txt", as it
    // always has; the optional arguments override those paths.
    // Returns 0 on success and 1 if the sort reports an error.
    int main(int argc, char **argv) {
        const std::string input_file = argc > 1 ? argv[1] : "raw.txt";
        const std::string output_file = argc > 2 ? argv[2] : "output.txt";

        try {
            external_sort(input_file, output_file);
        } catch (const std::exception &e) {
            std::cerr << "external sort failed: " << e.what() << '\n';
            return 1;
        }
        return 0;
    }
    
     torapture@localhost ~/SSD/Codes/acm ls -lh raw.txt
    -rwxrwxrwx  1 torapture  staff   685M  3 30 18:02 raw.txt
    
     torapture@localhost ~/SSD/Codes/acm time ./external
    ./external  617.04s user 17.50s system 94% cpu 11:10.99 total
    
     torapture@localhost ~/SSD/Codes/acm time sort ./raw.txt > sort.txt
    sort ./raw.txt > sort.txt  496.36s user 87.25s system 81% cpu 11:58.01 total
    
     torapture@localhost ~/SSD/Codes/acm ls -lh raw.txt output.txt sort.txt
    -rwxrwxrwx  1 torapture  staff   685M  3 30 18:13 output.txt
    -rwxrwxrwx  1 torapture  staff   685M  3 30 18:02 raw.txt
    -rwxrwxrwx  1 torapture  staff   685M  3 30 18:28 sort.txt
     torapture@localhost ~/SSD/Codes/acm diff output.txt sort.txt
    
  • 相关阅读:
    3
    正确的消费理念和方式
    2
    1
    善待精力,保持体力,保持热情
    为什么不从今天开始呢?
    c++中的新成员
    函数重载分析下
    函数重载分析上
    函数参数的扩展
  • 原文地址:https://www.cnblogs.com/ToRapture/p/12600390.html
Copyright © 2020-2023  润新知