1. 程式人生 > >C++ 對TXT 的串並行讀寫

C++ 對TXT 的串並行讀寫

c++ index word 並行 div logs 實現 單詞數 能夠

任務說明:有36篇文檔,現在要逐篇讀入並統計詞頻。字典長度為25(單詞以 1~25 的整數編號表示),希望能夠比較串行與並行讀寫操作的時間差距。

  1. 串行讀入並統計詞頻
    // LoadDocsInUbuntu.cpp
    
    //
    
     
    
    #include <iostream>
    
    #include <stdio.h>
    
    #include <vector>
    
     
    
    using namespace std;
    
     
    
    // Serially loads ../data/doc_1.txt .. doc_36.txt, counts how often each
    // word id (1..25) occurs in every document, and writes the 25 counts of
    // document d, space separated, to ../result/freqUbuntuSerial_d.txt.
    // Documents that cannot be opened are reported on stderr and skipped.
    // Returns 0.
    int main()
    {
        const int kNumDocs = 36;    // documents are numbered 1..kNumDocs
        const int kVocabSize = 25;  // valid word ids are 1..kVocabSize

        char filename[100];
        vector< vector<int> > corpus;

        printf("load data ...\n");

        // d is an int so that it matches the "%d" conversion below (the
        // previous size_t argument was undefined behaviour with "%d").
        for (int d = 1; d <= kNumDocs; d++) {
            snprintf(filename, sizeof(filename), "..//data/doc_%d.txt", d);
            FILE *input = fopen(filename, "r");
            if (input == NULL) {
                fprintf(stderr, "cannot open %s for reading\n", filename);
                continue;
            }

            vector<int> doc;
            int ff[kVocabSize] = { 0 };
            int word;

            // Stop on EOF *and* on a token that is not an integer: fscanf
            // returns 0 in the latter case without consuming input, so a
            // "!= EOF" test would spin forever.
            while (fscanf(input, "%d", &word) == 1) {
                if (word < 1 || word > kVocabSize) {
                    // An id outside the dictionary would index past ff[].
                    fprintf(stderr, "%s: ignoring out-of-range word id %d\n",
                            filename, word);
                    continue;
                }
                ff[word - 1] = ff[word - 1] + 1;
                doc.push_back(word);
            }
            corpus.push_back(doc);
            fclose(input);

            snprintf(filename, sizeof(filename),
                     "..//result/freqUbuntuSerial_%d.txt", d);
            FILE *output = fopen(filename, "w");
            if (output == NULL) {
                fprintf(stderr, "cannot open %s for writing\n", filename);
                continue;
            }
            for (int f = 0; f < kVocabSize; f++) {
                fprintf(output, "%d ", ff[f]);
            }
            fclose(output);
        }

        cout << "corpus.size()=" << corpus.size() << endl;
        return 0;
    }
    

  2. 這裏討論並行有三種思路:一,按照文檔序號進行分組讀入統計等操作;二,在文檔內按單詞數目分組進行統計;三,將統計與讀寫操作並行處理。

    針對第一種思路,使用openmp做多線程處理:

    

// LoadDocsByOpenMP.cpp 
//
#include <omp.h>
#include <iostream>
#include <stdio.h>
#include <vector>
#include <stdlib.h> 
#include <time.h> 
#include <string>
using namespace std;

// Loads ../data/doc_1.txt .. doc_360.txt with an OpenMP parallel loop
// (4 threads), counts how often each word id (1..25) occurs in every
// document, and writes the 25 counts of document d, space separated, to
// ../result/freqByOpenMP_d.txt. Prints the elapsed wall-clock time in
// seconds. Documents that cannot be opened are reported on stderr and
// skipped. Returns 0.
int main()
{
	const int kNumDocs = 360;   // documents are numbered 1..kNumDocs
	const int kVocabSize = 25;  // valid word ids are 1..kVocabSize

	// Wall-clock time. clock() reports CPU time accumulated by all threads
	// of the process on POSIX systems, which hides any parallel speed-up.
	const double start = omp_get_wtime();
	printf("load data ...\n");
#pragma omp parallel for num_threads(4)
	for (int d = 1; d <= kNumDocs; d++) {
		// Everything below is declared inside the loop body so that each
		// thread works on its own copy; sharing the name buffers or the
		// scanned word between threads is a data race.
		char filename[100];
		char resultname[100];
		int ff[kVocabSize] = { 0 };
		int word;

		printf("Hello world, I am %d, docs index %d.\n", omp_get_thread_num(), d);
		snprintf(filename, sizeof(filename), "..//data/doc_%d.txt", d);
		FILE *input = fopen(filename, "r");
		if (input == NULL) {
			fprintf(stderr, "cannot open %s for reading\n", filename);
			continue;
		}

		// Stop on EOF and on a non-integer token (fscanf returns 0 there
		// without consuming input, so "!= EOF" would never terminate).
		while (fscanf(input, "%d", &word) == 1) {
			if (word < 1 || word > kVocabSize) {
				// An id outside the dictionary would index past ff[].
				fprintf(stderr, "%s: ignoring out-of-range word id %d\n",
				        filename, word);
				continue;
			}
			ff[word - 1] = ff[word - 1] + 1;
		}
		fclose(input);

		snprintf(resultname, sizeof(resultname), "..//result/freqByOpenMP_%d.txt", d);
		FILE *output = fopen(resultname, "w");
		if (output == NULL) {
			fprintf(stderr, "cannot open %s for writing\n", resultname);
			continue;
		}
		for (int f = 0; f < kVocabSize; f++) {
			// Write the count itself; the earlier code passed a whole row
			// of the table (a pointer) to "%d".
			fprintf(output, "%d ", ff[f]);
		}
		fclose(output);
	}

	const double finish = omp_get_wtime();
	cout << "time cost : " << (finish - start) << endl;
	return 0;
}

但初步比較發現,OpenMP 相對於串行讀取的速度並沒有太多提升;反而當線程數多於系統物理核數時,程序運行時間會加長。(注意:在 Linux 上 clock() 統計的是進程內所有線程的 CPU 時間總和,而非實際經過的時間,因此比較並行耗時應改用 omp_get_wtime() 等牆上時鐘計時。)

另外兩種實現思路在後續學習中繼續實現。

C++ 對TXT 的串並行讀寫