1. 程式人生 > 其它 > 外部二路歸併排序的小嘗試

外部二路歸併排序的小嘗試

瞭解了外部排序的入門知識後,打算簡單實踐一下。(雖然不是什麼原理很難的東西,省略寫出焦油坑然後除錯半天的若干過程……)
預設元素資料型別int,使用fstream輸入輸出,測試在本機上跑1e8的資料集
全部讀入主存用時約35.8秒,最高記憶體佔用接近350M
在1e4大小的模擬主存跑用時約497.3秒,最高記憶體佔用接近11M
耗時約為全部讀入主存的14倍,但記憶體佔用僅約3%,大概適合在嵌入式裝置的大吞吐量讀寫場景應用?
(雖然經過測試,可能存在小問題,逃)

1.外部二路排序程式碼

#include <bits/stdc++.h>
#include <cmath>
// Shorthand for unsigned long long.
typedef unsigned long long ull;
// Number of ints held in the simulated main memory (max main memory).
const int MAXMM = 10000;
// Buffer standing in for main memory; one chunk is sorted in it at a time.
int mm[MAXMM];
// Set by way2_merge once a single input supplied every element; main loops until it is non-zero.
int ok = false;
// Number of elements in the data set, assigned in main from init()'s return value.
int total;
// Splits "a1.txt" into sorted runs of at most MAXMM values.
// Each chunk is read into mm, sorted in place, and written (space separated)
// to "b1.txt" and "b2.txt" alternately, starting with "b1.txt".
// Returns the total number of values read.
int init()
{
	std::ifstream fin("a1.txt");
	std::ofstream fout1("b1.txt"), fout2("b2.txt");
	std::ofstream *fout = &fout1;
	int cnt = 0;
	while(fin)
	{
		// Fill the buffer until it is full or extraction fails (end of input).
		int n = 0, t = 0;
		while(n < MAXMM && fin >> t)
		{
			mm[n++] = t;
		}
		// Nothing was read (empty file, or the previous chunk ended exactly
		// at end of input): there is no run to write.
		if(n == 0) break;
		cnt += n;
		std::sort(mm, mm + n);
		for(int j = 0 ; j < n ; ++j)
		{
			(*fout) << mm[j] << ' ';
		}
		// Alternate the destination so consecutive runs land in different files.
		fout = (fout == &fout1) ? &fout2 : &fout1;
	}
	return cnt;
}
// Merges one run from each of two inputs into fout.
//
// in1, in2 : input iterators positioned at the start of the next run of each
//            input; both are advanced past whatever is consumed.
// ieof     : end-of-stream sentinel the inputs are compared against.
// fout     : destination; every value is followed by a single space.
// lim      : maximum number of values taken from each input in this call.
//
// Returns 0 without writing anything when both inputs are already exhausted,
// otherwise 1. As a side effect it sets the global `ok` to whether one input
// alone supplied `total` values in this call, and prints the two counts.
template<typename it>
int way2_merge(it &in1, it &in2, it &ieof, std::ofstream &fout, int lim = MAXMM)
{
	if(in1 == ieof && in2 == ieof) return 0;
	int p1 = 0, p2 = 0;
	// Merge while both runs still have unread values.
	while(p1 < lim && in1 != ieof && p2 < lim && in2 != ieof)
	{
		if(*in1 < *in2)
		{
			fout << *in1 << ' ';
			++in1;
			++p1;
		}
		else
		{
			fout << *in2 << ' ';
			++in2;
			++p2;
		}
	}
	// At most one of the two loops below does any work: drain the rest of
	// whichever run was not finished.
	while(p1 < lim && in1 != ieof)
	{
		fout << *in1 << ' ';
		++in1;
		++p1;
	}
	while(p2 < lim && in2 != ieof)
	{
		fout << *in2 << ' ';
		++in2;
		++p2;
	}
	ok = (p1 == total) || (p2 == total);
	printf("p1: %d, p2: %d\n", p1, p2);
	return 1;
}
// Drives the external two-way merge sort.
// init() produces the initial sorted runs in b1.txt / b2.txt; each pass then
// merges pairs of runs from the two input files into the two output files
// alternately, after which the input and output file names swap roles and the
// run length doubles. The loop stops once way2_merge reports (via `ok`) that
// a single input supplied every element.
int main()
{
	int cnt = init(); total = cnt;
	// With no data `ok` can never become true, so the pass loop below would
	// never terminate.
	if(cnt == 0) return 0;
	typedef const char *cstring;
	cstring s1 = "a1.txt", s2 = "a2.txt", s3 = "b1.txt", s4 = "b2.txt";
	cstring in1 = s3, in2 = s4, out1 = s1, out2 = s2;
	int result;
	int lim = MAXMM; // run length for the current pass (MAXMM doubled once per pass)
	for(int ex = 0; !ok; ex++)
	{
		std::ifstream fin1(in1), fin2(in2);
		std::ofstream fout1(out1), fout2(out2);
		std::istream_iterator<int> _in1(fin1), _in2(fin2), ieof;
		do{
			printf("pass %d:\n", ex);
			result = way2_merge(_in1, _in2, ieof, fout1, lim);
			result &= way2_merge(_in1, _in2, ieof, fout2, lim);
		}
		while(result);
		std::swap(in1, out1);
		std::swap(in2, out2);
		// Double the run length, saturating instead of overflowing int.
		const int cap = std::numeric_limits<int>::max();
		lim = (lim > cap / 2) ? cap : lim * 2;
	}
	return 0;
}

2.全部讀入主存程式碼

#include <algorithm>
#include <cstdio>
#include <cstring>
#include <fstream>
#include <iostream>
// Buffer for the whole data set (room for 1e8 ints).
int arr[int(1e8)];
// Baseline: reads up to 1e8 integers from "a.txt", sorts them in memory and
// writes them, space separated, to "b.txt".
int main()
{
	const int capacity = int(1e8);
	std::ifstream fin("a.txt");
	std::ofstream fout("b.txt");
	// Count what was actually read so a shorter input is not padded with
	// unread slots.
	int count = 0;
	while(count < capacity && fin >> arr[count])
	{
		++count;
	}
	std::sort(arr, arr + count);
	for(int i = 0 ; i < count ; ++i)
	{
		// The separator keeps adjacent numbers distinguishable in the output.
		fout << arr[i] << ' ';
	}
	return 0;
}

3.造資料用的

#include <algorithm>
#include <cstdlib>
#include <cstdio>
#include <cstring>
#include <ctime>
#include <fstream>
#include <iostream>
// Number of random integers to generate.
const int MAXN = 100000000;
// Test-data generator: writes MAXN space-separated rand() values to "a1.txt".
int main()
{
	srand(time(0));
	std::ofstream fout1("a1.txt");
	// These three streams are never written to; opening them leaves the
	// corresponding files empty.
	std::ofstream fout2("a2.txt");
	std::ofstream fout3("b1.txt");
	std::ofstream fout4("b2.txt");
	int remaining = MAXN;
	while(remaining > 0)
	{
		fout1 << rand() << ' ';
		--remaining;
	}
	return 0;
}

4. 測試正確性用的

#include <bits/stdc++.h>
// Expected number of elements in the data set.
const int MAXN = 1e8;
// Counts the leading non-decreasing values in `filename` and reports whether
// that count equals `total`.
// Counting stops at the first value smaller than its predecessor or at the
// first failed read. A file with no readable value counts as 0 elements.
// Prints the count and the expected total.
bool check(const char *filename, int total = MAXN)
{
	std::ifstream fin(filename);
	int prev, cur;
	int cnt = 0;
	// Only count the first element if it was actually read.
	if(fin >> prev)
	{
		cnt = 1;
		while(fin >> cur && prev <= cur)
			cnt++, prev = cur;
	}
	printf("cnt: %d, total: %d\n", cnt, total);
	return cnt == total;
}
// Adds `delta` to the tally of every integer read from `fin`.
// Values in [0, dense.size()) go to the dense table, all others to `sparse`.
static void tally_values(std::ifstream &fin, int delta,
                         std::vector<long long> &dense,
                         std::map<int, long long> &sparse)
{
	const int dense_limit = static_cast<int>(dense.size());
	int value;
	while(fin >> value)
	{
		if(value >= 0 && value < dense_limit)
			dense[value] += delta;
		else
			sparse[value] += delta;
	}
}
// Reports whether the two files contain the same multiset of integers.
// Every value in `sorted` counts +1 and every value in `src` counts -1; the
// files match when all tallies end at zero. Values in [0, 32768) use a dense
// table; any other value (negative or larger) is tallied in a map instead of
// indexing past the end of the table.
// Returns false if either file cannot be opened.
bool identify(const char *sorted, const char *src)
{
	std::ifstream fin1(sorted), fin2(src);
	if(!fin1.is_open() || !fin2.is_open()) return false;
	std::vector<long long> dense(32768, 0);
	std::map<int, long long> sparse;
	tally_values(fin1, 1, dense, sparse);
	tally_values(fin2, -1, dense, sparse);
	for(std::vector<long long>::const_iterator d = dense.begin() ; d != dense.end() ; ++d)
		if(*d != 0) return false;
	for(std::map<int, long long>::const_iterator s = sparse.begin() ; s != sparse.end() ; ++s)
		if(s->second != 0) return false;
	return true;
}
// Runs both verification steps and prints a Success/Failure line for each.
int main()
{
	// "b1.txt" is only examined when the check on "a1.txt" fails.
	const bool ordered = check("a1.txt") || check("b1.txt");
	printf("cnt: %s\n", ordered ? "Success" : "Failure");
	// Compare the contents of "a1.txt" against the source file "a.txt".
	const bool same_values = identify("a1.txt", "a.txt");
	printf("identification: %s\n", same_values ? "Success" : "Failure");
}