從海量資料中找中位數(c語言實現)
阿新 • • 發佈:2019-02-05
題目:5億個int,從中找出第k大的數
演算法:之後補上。。。
實現:
/*
 * Select the k-th value from a large binary file of 32-bit ints using a
 * bounded amount of memory.
 *
 * Phase 1 streams the input and routes every value into one of NBUCKETS
 * bucket files (foo_<n>.dat); bucket n holds the values whose offset from
 * the smallest 32-bit int, shifted right by BUCKET_SHIFT, equals n, so the
 * buckets are ordered by value.  Phase 2 walks the per-bucket counts to find
 * the bucket containing rank k, loads only that bucket and runs quick-select
 * on it.
 *
 * Ranks are 1-based and counted in ascending order (k = 1 selects the
 * minimum); that is what the bucket scan and quick_select implement.
 *
 * Usage: ./a.out <input-file> <k>
 */
#include <assert.h>
#include <errno.h>
#include <fcntl.h>
#include <limits.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/time.h>
#include <sys/types.h>
#include <sys/stat.h>

/* One slice of the 32-bit value range together with its write buffer. */
typedef struct bucket_t {
    int *buf;   /* output buffer: values not yet written to the bucket file */
    int count;  /* total number of values routed to this bucket so far */
    int idx;    /* number of values currently waiting in buf */
} bucket_t;

enum {
    IO_PAGE_BYTES    = 4096,                      /* 4 KiB */
    INPUT_BUF_BYTES  = IO_PAGE_BYTES * 1024,      /* input buffer: 4 MiB */
    BUCKET_BUF_BYTES = IO_PAGE_BYTES * 128,       /* per-bucket buffer: 512 KiB */
    BUCKET_SHIFT     = 22,                        /* each bucket spans 2^22 values */
    NBUCKETS         = 1 << (32 - BUCKET_SHIFT),  /* 1024 buckets */
    BUCKET_NAME_LEN  = 32                         /* room for "foo_<n>.dat" */
};

long get_time_usecs(void);
void write_to_file(bucket_t *bucket, int pos);
int partition(int *a, int s, int t);
int quick_select(int *a, int s, int t, int i);
void swap(int *p, int *q);

/* Format the name of the file backing bucket `pos` into dst. */
static void bucket_filename(char *dst, size_t size, int pos)
{
    snprintf(dst, size, "foo_%d.dat", pos);
}

/* Map a value to its bucket: offset it into [0, 2^32) and keep the top bits. */
static int bucket_index(int value)
{
    return (int)(((int64_t)value + INT64_C(2147483648)) >> BUCKET_SHIFT);
}

/*
 * Read up to `size` bytes from fd, retrying after short reads and EINTR.
 * Returns the number of bytes read (less than `size` only at end of file),
 * or -1 on a read error.
 */
static ssize_t read_full(int fd, void *dst, size_t size)
{
    char *out = dst;
    size_t done = 0;

    while (done < size) {
        ssize_t got = read(fd, out + done, size - done);
        if (got < 0) {
            if (errno == EINTR) {
                continue;
            }
            return -1;
        }
        if (got == 0) {
            break;  /* end of file */
        }
        done += (size_t)got;
    }
    return (ssize_t)done;
}

int main(int argc, char **argv)
{
    long start_usecs = get_time_usecs();

    /* The file format and the bucket arithmetic both assume 32-bit ints. */
    assert(sizeof(int) == 4);

    if (argc < 3) {
        fprintf(stderr, "usage: %s <file> <k>\n", argc > 0 ? argv[0] : "a.out");
        return EXIT_FAILURE;
    }
    const char *input_path = argv[1];

    /* k must be a positive rank; 0 or garbage would index outside the buckets. */
    char *end = NULL;
    errno = 0;
    long parsed = strtol(argv[2], &end, 10);
    if (end == argv[2] || *end != '\0' || errno == ERANGE ||
        parsed < 1 || parsed > INT_MAX) {
        fprintf(stderr, "k must be an integer in [1, %d]\n", INT_MAX);
        return EXIT_FAILURE;
    }
    int k = (int)parsed;

    int fd = open(input_path, O_RDONLY);
    if (fd < 0) {
        fprintf(stderr, "can't open file %s\n", input_path);
        return EXIT_FAILURE;
    }

    int *in_buf = malloc(INPUT_BUF_BYTES);
    bucket_t *bucket = malloc(sizeof *bucket * NBUCKETS);
    if (in_buf == NULL || bucket == NULL) {
        fprintf(stderr, "out of memory\n");
        return EXIT_FAILURE;
    }

    char filename[BUCKET_NAME_LEN];
    for (int b = 0; b < NBUCKETS; b++) {
        bucket[b].buf = malloc(BUCKET_BUF_BYTES);
        if (bucket[b].buf == NULL) {
            fprintf(stderr, "out of memory\n");
            return EXIT_FAILURE;
        }
        bucket[b].idx = 0;
        bucket[b].count = 0;

        /* Bucket files are opened in append mode, so leftovers from an
         * earlier run would silently be mixed into this run's data. */
        bucket_filename(filename, sizeof filename, b);
        if (unlink(filename) != 0 && errno != ENOENT) {
            fprintf(stderr, "can't remove stale file %s: %s\n",
                    filename, strerror(errno));
            return EXIT_FAILURE;
        }
    }

    /* Phase 1: distribute every input value into its bucket file. */
    const int bucket_buf_ints = BUCKET_BUF_BYTES / (int)sizeof(int);
    for (;;) {
        ssize_t bytes = read_full(fd, in_buf, INPUT_BUF_BYTES);
        if (bytes < 0) {
            fprintf(stderr, "read error on %s: %s\n", input_path, strerror(errno));
            return EXIT_FAILURE;
        }

        /* A trailing fragment shorter than one int is ignored. */
        size_t nvalues = (size_t)bytes / sizeof(int);
        for (size_t j = 0; j < nvalues; j++) {
            int element = in_buf[j];
            int pos = bucket_index(element);
            bucket_t *p = &bucket[pos];

            if (p->count == INT_MAX) {
                fprintf(stderr, "too many values in bucket %d\n", pos);
                return EXIT_FAILURE;
            }
            p->buf[p->idx++] = element;
            p->count++;

            /* The bucket's buffer is full: append it to the bucket file. */
            if (p->idx == bucket_buf_ints) {
                write_to_file(p, pos);
                p->idx = 0;
            }
        }

        if ((size_t)bytes < (size_t)INPUT_BUF_BYTES) {
            break;  /* read_full only comes up short at end of file */
        }
    }
    free(in_buf);
    close(fd);

    /* Flush whatever is still buffered in each bucket. */
    for (int b = 0; b < NBUCKETS; b++) {
        if (bucket[b].idx > 0) {
            write_to_file(&bucket[b], b);
            bucket[b].idx = 0;
        }
    }

    /* Phase 2: find the bucket that contains rank k. */
    int64_t below = 0;  /* number of values in the buckets before `target` */
    int target = 0;
    while (target < NBUCKETS && below + bucket[target].count < k) {
        below += bucket[target].count;
        target++;
    }
    if (target == NBUCKETS) {
        fprintf(stderr, "k = %d exceeds the number of values read (%lld)\n",
                k, (long long)below);
        return EXIT_FAILURE;
    }
    int count = bucket[target].count;
    int rank = (int)(k - below);  /* 1-based rank inside the chosen bucket */

    bucket_filename(filename, sizeof filename, target);
    printf("第%d大的數位於檔案%s的第%d大的數\n", k, filename, rank);

    fd = open(filename, O_RDONLY);
    if (fd < 0) {
        fprintf(stderr, "can't open file %s\n", filename);
        return EXIT_FAILURE;
    }

    /* Size the buffer from the bucket's real count: duplicates can make a
     * bucket hold more values than the width of its value range. */
    size_t want = (size_t)count * sizeof(int);
    int *values = malloc(want);
    if (values == NULL) {
        fprintf(stderr, "out of memory\n");
        return EXIT_FAILURE;
    }
    ssize_t got = read_full(fd, values, want);
    close(fd);
    if (got < 0 || (size_t)got != want) {
        fprintf(stderr, "can't read %d values back from %s\n", count, filename);
        return EXIT_FAILURE;
    }

    /* Select over the whole bucket, starting at index 0. */
    int answer = quick_select(values, 0, count - 1, rank);
    printf("第%d大的數 = %d\n", k, answer);

    free(values);
    for (int b = 0; b < NBUCKETS; b++) {
        free(bucket[b].buf);
    }
    free(bucket);

    long end_usecs = get_time_usecs();
    double secs = (double)(end_usecs - start_usecs) / (double)1000000;
    printf("it took %.02f seconds.\n", secs);
    return 0;
}

/*
 * Append the bucket->idx values currently buffered in bucket->buf to the
 * file of bucket number `pos`, creating the file if needed.  The caller is
 * responsible for resetting bucket->idx afterwards.  Any failure terminates
 * the program with a non-zero exit status.
 */
void write_to_file(bucket_t *bucket, int pos)
{
    char filename[BUCKET_NAME_LEN];
    bucket_filename(filename, sizeof filename, pos);

    int fd = open(filename, O_WRONLY | O_CREAT | O_APPEND, 0666);
    if (fd < 0) {
        fprintf(stderr, "can't open file %s\n", filename);
        exit(EXIT_FAILURE);
    }

    const char *src = (const char *)bucket->buf;
    size_t total = (size_t)bucket->idx * sizeof(int);
    size_t done = 0;

    /* write() may transfer fewer bytes than requested; keep going. */
    while (done < total) {
        ssize_t n = write(fd, src + done, total - done);
        if (n < 0 && errno == EINTR) {
            continue;
        }
        if (n <= 0) {
            fprintf(stderr, "idx = %d, bytes = %d, write error\n",
                    bucket->idx, (int)done);
            close(fd);
            exit(EXIT_FAILURE);
        }
        done += (size_t)n;
    }

    if (close(fd) != 0) {
        fprintf(stderr, "can't close file %s\n", filename);
        exit(EXIT_FAILURE);
    }
}

/*
 * Return the current wall-clock time in microseconds.
 * NOTE(review): the result is a long, so it only fits where long is wide
 * enough to hold seconds * 1000000 (true for 64-bit long).
 */
long get_time_usecs(void)
{
    struct timeval now;

    gettimeofday(&now, NULL);
    return (long)now.tv_sec * 1000000L + (long)now.tv_usec;
}

/* Exchange the two ints pointed to by p and q. */
void swap(int *p, int *q)
{
    int tmp = *p;
    *p = *q;
    *q = tmp;
}

/*
 * Partition a[s..t] around the pivot a[t]: afterwards every element smaller
 * than the pivot precedes it and every other element follows it.
 * Returns the pivot's final index.
 */
int partition(int *a, int s, int t)
{
    int j = s;  /* first slot of the "not smaller than the pivot" region */

    for (int i = s; i < t; i++) {
        if (a[i] < a[t]) {
            swap(a + i, a + j);
            j++;
        }
    }
    swap(a + j, a + t);
    return j;
}

/*
 * Return the i-th smallest element (1-based) of a[s..t]; the array is
 * reordered in the process.  Requires 1 <= i <= t - s + 1.
 *
 * Written as a loop rather than recursion so that unlucky pivots (the pivot
 * is always the last element of the range) cannot exhaust the stack.
 */
int quick_select(int *a, int s, int t, int i)
{
    assert(a != NULL && s <= t && i >= 1 && i <= t - s + 1);

    while (s < t) {
        int p = partition(a, s, t);
        int m = p - s + 1;  /* rank of the pivot within a[s..t] */

        if (m == i) {
            return a[p];
        }
        if (m > i) {
            t = p - 1;      /* the answer lies left of the pivot */
        } else {
            s = p + 1;      /* the answer lies right of the pivot */
            i -= m;
        }
    }
    return a[t];
}
執行和測試:
尋找第1111大的整數
dd if=/dev/urandom of=random.dat bs=1M count=1024
gcc main.c
./a.out random.dat 1111