C++ 對TXT 的串並行讀寫
阿新 • • 發佈:2017-07-20
c++ index word 並行 div logs 實現 單詞數 能夠
任務說明:現有36篇文檔,需要逐篇讀入並統計詞頻(字典長度為25,即單詞編號為1~25),希望能夠比較串行與並行讀寫操作的時間差距。
-
串行讀入並統計詞頻
// LoadDocsInUbuntu.cpp
//
// Serial baseline: reads ../data/doc_1.txt .. doc_36.txt, counts how often
// each word id (1..25) occurs in every document, and writes the 25 counts of
// document d to ../result/freqUbuntuSerial_<d>.txt.
#include <iostream>
#include <stdio.h>
#include <vector>
using namespace std;

// Returns 0 on success, 1 if any input or output file cannot be opened.
int main()
{
    const int kNumDocs = 36;    // documents are numbered 1..kNumDocs
    const int kVocabSize = 25;  // word ids are 1..kVocabSize

    char filename[100];
    vector< vector<int> > corpus;

    printf("load data ...\n");
    for (int d = 1; d <= kNumDocs; d++) {
        // d is an int so that it matches the %d conversion (the original
        // passed a size_t to %d, which is a format/argument mismatch).
        snprintf(filename, sizeof(filename), "..//data/doc_%d.txt", d);
        FILE *in = fopen(filename, "r");
        if (in == NULL) {
            fprintf(stderr, "cannot open %s for reading\n", filename);
            return 1;
        }

        vector<int> doc;
        int ff[kVocabSize] = { 0 };
        int word;
        // Compare against 1, not EOF: on a non-numeric token fscanf returns 0
        // without consuming input, which would otherwise loop forever.
        while (fscanf(in, "%d", &word) == 1) {
            if (word < 1 || word > kVocabSize) {
                // Out-of-dictionary id: skip it instead of writing outside ff.
                fprintf(stderr, "%s: ignoring out-of-range word id %d\n",
                        filename, word);
                continue;
            }
            ff[word - 1] += 1;
            doc.push_back(word);
        }
        fclose(in);
        corpus.push_back(doc);

        snprintf(filename, sizeof(filename),
                 "..//result/freqUbuntuSerial_%d.txt", d);
        FILE *out = fopen(filename, "w");
        if (out == NULL) {
            fprintf(stderr, "cannot open %s for writing\n", filename);
            return 1;
        }
        for (int f = 0; f < kVocabSize; f++) {
            fprintf(out, "%d ", ff[f]);
        }
        fclose(out);
    }

    cout << "corpus.size()=" << corpus.size() << endl;
    return 0;
}
-
這裏討論並行有三種思路:一,按照文檔序號進行分組讀入統計等操作;二,在文檔內按單詞數目分組進行統計;三,將統計與讀寫操作並行處理。
針對第一種思路,使用openmp做多線程處理:
// LoadDocsByOpenMP.cpp
//
// Parallel variant: documents doc_1.txt .. doc_360.txt are distributed over
// 4 OpenMP threads. Each iteration counts the word ids (1..25) of one
// document and writes the counts to ../result/freqByOpenMP_<d>.txt.
#include <omp.h>
#include <iostream>
#include <stdio.h>
#include <vector>
#include <stdlib.h>
#include <time.h>
#include <string>
using namespace std;

// Returns 0 on success, 1 if any document could not be read or written.
int main()
{
    const int kNumDocs = 360;   // documents are numbered 1..kNumDocs
    const int kVocabSize = 25;  // word ids are 1..kVocabSize

    // Wall-clock timer. clock() reports CPU time of the whole process, which
    // adds up the time of all threads and therefore hides any speed-up.
    const double start = omp_get_wtime();
    printf("load data ...\n");

    int failures = 0;
    #pragma omp parallel for num_threads(4) reduction(+:failures)
    for (int d = 1; d <= kNumDocs; d++) {
        // Everything below is declared inside the loop body so that each
        // iteration (and thus each thread) owns its own copy. Declaring the
        // name buffers and `word` outside the loop makes them shared, and
        // concurrent sprintf/fscanf calls then overwrite each other.
        char filename[100];
        char resultname[100];
        int ff[kVocabSize] = { 0 };
        int word;

        printf("Hello world, I am %d, docs index %d.\n", omp_get_thread_num(), d);

        snprintf(filename, sizeof(filename), "..//data/doc_%d.txt", d);
        FILE *in = fopen(filename, "r");
        if (in == NULL) {
            fprintf(stderr, "cannot open %s for reading\n", filename);
            failures += 1;
            continue;  // returning out of an OpenMP loop is not allowed
        }
        // Compare against 1, not EOF: on a non-numeric token fscanf returns 0
        // without consuming input, which would otherwise loop forever.
        while (fscanf(in, "%d", &word) == 1) {
            if (word < 1 || word > kVocabSize) {
                // Out-of-dictionary id: skip it instead of writing outside ff.
                fprintf(stderr, "%s: ignoring out-of-range word id %d\n",
                        filename, word);
                continue;
            }
            ff[word - 1] += 1;
        }
        fclose(in);

        snprintf(resultname, sizeof(resultname),
                 "..//result/freqByOpenMP_%d.txt", d);
        FILE *out = fopen(resultname, "w");
        if (out == NULL) {
            fprintf(stderr, "cannot open %s for writing\n", resultname);
            failures += 1;
            continue;
        }
        for (int f = 0; f < kVocabSize; f++) {
            // Print the count itself; the original passed a whole row of the
            // 2-D table (a pointer) to %d and wrote garbage.
            fprintf(out, "%d ", ff[f]);
        }
        fclose(out);
    }

    const double finish = omp_get_wtime();
    cout << "time cost : " << (finish - start) << endl;
    return failures == 0 ? 0 : 1;
}
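但初步比較發現,OpenMP 相對串行讀取的速度並沒有太多提升,反而是當線程數多於系統物理核數的時候,程序時間會加長。需要注意的是,在 Linux 上 clock() 統計的是進程內所有線程累計的 CPU 時間,而不是實際經過的時間,因此比較並行效果時應使用 omp_get_wtime() 等牆上時鐘計時。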
另外兩種實現思路在後續學習中繼續實現。
C++ 對TXT 的串並行讀寫