外部二路歸併排序的小嚐試
阿新 • • 發佈:2021-10-16
瞭解了外部排序的入門知識後,打算簡單實踐一下。(雖然不是什麼原理很難的東西,省略寫出焦油坑然後除錯半天的若干過程……)
預設元素資料型別int,使用fstream輸入輸出,測試在本機上跑1e8的資料集
全部讀入主存用時約35.8秒,最高記憶體佔用接近350M
在1e4大小的模擬主存跑用時約497.3秒,最高記憶體佔用接近11M
用時約為全部讀入主存的14倍(497.3秒對35.8秒),記憶體佔用約為其3%(11M對350M),大概適合在嵌入式裝置大吞吐量讀寫場景應用?
(雖然經過測試,可能存在小問題,逃)
1.外部二路排序程式碼
#include <bits/stdc++.h>
#include <cmath>

typedef unsigned long long ull;

// Max main memory: number of ints the simulated main memory may hold at once.
const int MAXMM = 1e4;

// mm    - the simulated main-memory buffer used while building the initial runs.
// ok    - set by way2_merge() once a single merge call has consumed every
//         element from one input, i.e. the data forms one sorted run.
// total - number of elements read by init(); way2_merge() compares against it.
int mm[MAXMM], ok = false, total;

// Builds the initial sorted runs.
//
// Reads "a1.txt" in chunks of at most MAXMM ints, sorts each chunk in mm, and
// writes the chunks alternately to "b1.txt" and "b2.txt" (space separated).
// Returns the number of ints read.
int init()
{
    std::ifstream fin("a1.txt");
    std::ofstream fout1("b1.txt"), fout2("b2.txt");
    std::ofstream *fout = &fout1;
    int cnt = 0;
    while (fin) {
        int i, t;
        for (i = 0, t = 0; i < MAXMM && fin >> t; ++i) {
            mm[i] = t;
        }
        cnt += i;
        std::sort(mm, mm + i);
        for (int j = 0; j < i; ++j) {
            (*fout) << mm[j] << ' ';
        }
        // Alternate the destination file for the next run.
        fout = (fout == &fout1) ? &fout2 : &fout1;
    }
    return cnt;
}

// Merges one run from each input into fout.
//
// in1, in2 - input iterators positioned at the start of the next run of each
//            input; they are advanced past everything that gets written.
// ieof     - the end-of-stream iterator both inputs are compared against.
// fout     - destination stream; values are written followed by a space.
// lim      - maximum number of elements taken from each input (the run length).
//
// Returns 0 without writing anything when both inputs are already exhausted,
// otherwise 1. Also updates the global `ok` and prints how many elements were
// taken from each input.
template<typename it>
int way2_merge(it &in1, it &in2, it &ieof, std::ofstream &fout, int lim = MAXMM)
{
    if (in1 == ieof && in2 == ieof) return false;
    int p1 = 0, p2 = 0;
    while (p1 < lim && in1 != ieof && p2 < lim && in2 != ieof) {
        if (*in1 < *in2) {
            fout << *in1++;
            p1++;
        } else {
            fout << *in2++;
            p2++;
        }
        fout << ' ';
    }
    // At most one of the two runs still has elements left; copy the remainder.
    while (p1 < lim && in1 != ieof) {
        fout << *in1++ << ' ';
        p1++;
    }
    while (p2 < lim && in2 != ieof) {
        fout << *in2++ << ' ';
        p2++;
    }
    // The sort is finished once one input alone supplied every element.
    ok = (p1 == total) || (p2 == total);
    printf("p1: %d, p2: %d\n", p1, p2);
    return 1;
}

// Drives the external two-way merge sort.
//
// After init() has split the input into sorted runs in b1.txt/b2.txt, each pass
// merges pairs of runs from the two input files into the two output files
// (alternating), then swaps the roles of the a*/b* files and doubles the run
// length. The loop stops when way2_merge() reports a single complete run.
int main()
{
    int cnt = init();
    total = cnt;
    if (cnt == 0) {
        // Nothing was read (empty or missing a1.txt). way2_merge() would never
        // set `ok` in that case, so the pass loop below would never terminate.
        return 0;
    }
    typedef const char *cstring;
    cstring s1 = "a1.txt", s2 = "a2.txt", s3 = "b1.txt", s4 = "b2.txt";
    cstring in1 = s3, in2 = s4, out1 = s1, out2 = s2;
    int result;
    int lim = MAXMM; // run length of the current pass: MAXMM * 2^ex
    for (int ex = 0; !ok; ex++) {
        std::ifstream fin1(in1), fin2(in2);
        std::ofstream fout1(out1), fout2(out2);
        std::istream_iterator<int> _in1(fin1), _in2(fin2), ieof;
        do {
            printf("pass %d:\n", ex);
            result = way2_merge(_in1, _in2, ieof, fout1, lim);
            result &= way2_merge(_in1, _in2, ieof, fout2, lim);
        } while (result);
        std::swap(in1, out1);
        std::swap(in2, out2);
        // Double the run length, clamping instead of overflowing int; a limit
        // of INT_MAX already covers any element count an int can represent.
        lim = (lim > INT_MAX / 2) ? INT_MAX : lim * 2;
    }
    return 0;
}
2.全部讀入主存程式碼
#include <algorithm>
#include <cstdio>
#include <cstring>
#include <fstream>
#include <iostream>

// Static buffer large enough for the whole 1e8-element data set.
int arr[int(1e8)];

// Baseline: reads every int from "a.txt" into memory, sorts them, and writes
// them space separated to "b.txt".
int main()
{
    std::ifstream fin("a.txt");
    std::ofstream fout("b.txt");
    const int capacity = int(1e8);
    // Count what was actually read so that a shorter input is not padded with
    // zeros; stop at the buffer capacity.
    int n = 0;
    while (n < capacity && fin >> arr[n]) {
        ++n;
    }
    std::sort(arr, arr + n);
    for (int i = 0; i < n; ++i) {
        // The separator keeps the numbers distinguishable when read back.
        fout << arr[i] << ' ';
    }
    return 0;
}
3.造資料用的
#include <algorithm>
#include <cstdlib>
#include <cstdio>
#include <cstring>
#include <ctime>
#include <fstream>
#include <iostream>

// Number of random values to generate.
const int MAXN = 1e8;

// Test-data generator: fills "a1.txt" with MAXN space-separated rand() values.
// The other three files are opened for writing as well, which leaves them
// empty.
int main()
{
    srand(time(0));
    std::ofstream data("a1.txt");
    std::ofstream scratchA2("a2.txt");
    std::ofstream scratchB1("b1.txt");
    std::ofstream scratchB2("b2.txt");
    int remaining = MAXN;
    while (remaining > 0) {
        data << rand();
        data << ' ';
        --remaining;
    }
    return 0;
}
4. 測試正確性用的
#include <bits/stdc++.h>
// Default number of elements the checked file is expected to contain.
const int MAXN = 1e8;

// Verifies that `filename` holds exactly `total` ints in non-decreasing order.
//
// Reading stops at the first value smaller than its predecessor. The number of
// values accepted up to that point is printed together with `total`.
// Returns true only when no out-of-order value was found and the number of
// values equals `total`; an empty or unreadable file counts as zero values.
bool check(const char *filename, int total = MAXN)
{
    std::ifstream fin(filename);
    int prev = 0, cur = 0;
    // Count the first value only if it could actually be read.
    int cnt = (fin >> prev) ? 1 : 0;
    bool ordered = true;
    while (fin >> cur) {
        if (cur < prev) {
            // An out-of-order value must fail the check even when the sorted
            // prefix before it happens to be exactly `total` long.
            ordered = false;
            break;
        }
        cnt++, prev = cur;
    }
    printf("cnt: %d, total: %d\n", cnt, total);
    return ordered && cnt == total;
}
// Checks that the files `sorted` and `src` contain the same multiset of ints.
//
// Every value read from `sorted` increments a counter and every value read
// from `src` decrements it; the files match when all counters end at zero.
// Values in [0, 32768) use a dense table; any other value (negative, or from a
// platform whose rand() exceeds 32767) goes into a map instead of indexing
// outside the table. Returns false when either file cannot be opened.
bool identify(const char *sorted, const char *src)
{
    std::ifstream fin1(sorted), fin2(src);
    if (!fin1 || !fin2) return false;

    const int DENSE = 32768;
    std::vector<long long> dense(DENSE, 0);
    std::map<int, long long> sparse;

    int tmp;
    while (fin1 >> tmp) {
        if (tmp >= 0 && tmp < DENSE) dense[tmp]++;
        else sparse[tmp]++;
    }
    while (fin2 >> tmp) {
        if (tmp >= 0 && tmp < DENSE) dense[tmp]--;
        else sparse[tmp]--;
    }

    for (int i = 0; i < DENSE; ++i)
        if (dense[i] != 0) return false;
    for (std::map<int, long long>::const_iterator p = sparse.begin(); p != sparse.end(); ++p)
        if (p->second != 0) return false;
    return true;
}
// Runs both verifications and prints one Success/Failure line for each.
int main()
{
    // b1.txt is only examined when a1.txt does not pass the order check.
    bool ordered = check("a1.txt");
    if (!ordered) {
        ordered = check("b1.txt");
    }
    const char *orderVerdict = ordered ? "Success" : "Failure";
    printf("cnt: %s\n", orderVerdict);

    // Compare the contents of a1.txt against a.txt.
    const bool sameContent = identify("a1.txt", "a.txt");
    const char *contentVerdict = sameContent ? "Success" : "Failure";
    printf("identification: %s\n", contentVerdict);
}