C++讀寫檔案,處理UTF8檔案,處理GBK中文字元
阿新 • • 發佈:2019-01-31
讀檔案
//從檔案中提取詞典 void getLexiconFromTrainData(char* filepath){ maxLength = 0; lexicalItemCount=0; allSentenceCount=0; wordCount=0; ifstream infile; char a; string line; string word; infile.open(filepath);//開啟檔案 if(!infile){ cerr<<"error:unable to open input file: "<<infile<<endl; } //每次讀取一個字元進行處理 do{ infile.get(a); if(infile.eof()) break; cout<<a; getchar(); }while(!infile.eof()); //每次讀取一行進行處理,行與行之間以回車換行分隔 while(getline(infile,line)){ allSentence.push_back(line);//每一行都儲存到vector中 //cout<<line; } cout<<"檔案讀取完畢"<<endl; cout<<"vector長度"<<allSentence.size()<<endl; //每次讀一個詞進行處理,詞和詞之間用空格分開 while( infile> >word ){ //cout << "Read from file: "<< word<< endl; wordCount++; lexicalItem.insert(word); if(word.size()>maxLength){ maxLength = word.size(); cout<<"迄今為止,最長的詞"<<word<<",長度為:"<<word.size()/2<<endl; } //getchar(); } maxLength = maxLength; infile.close(); cout<<"總詞彙量:"<<wordCount<<endl; cout<<"詞典詞數:"<<lexicalItem.size()<<endl; }
寫檔案
處理UTF8檔案//把所有的句子儲存迴文件 void FileIOfunc::saveAllSentenceToFile(char* filepath,vector<vector<string> > resultSentence){ ofstream outfile; stringstream ss; //outfile.open(filepath,ios::app);//以追加方式寫檔案 outfile.open(filepath);//以覆蓋方式寫檔案 //把所有的句子儲存到檔案 for(vector<vector<string> >::iterator oneSentence = resultSentence.begin();oneSentence!=resultSentence.end();oneSentence++){ for(vector<string>::iterator oneWord = (*oneSentence).begin();oneWord!=(*oneSentence).end();oneWord++){ ss<<*oneWord<<" "; } ss<<endl; } outfile<<ss.str(); cout<<"所有的句子儲存迴文件完成"<<endl; outfile.clear(); outfile.close(); }
// Helper for processing UTF-8 encoded text: given the first byte of an encoded
// character, returns how many bytes to advance to reach the next character.
//
// The length is derived from the lead-byte bit pattern only:
//   0xxxxxxx -> 1 (ASCII)
//   110xxxxx -> 2
//   1110xxxx -> 3
//   11110xxx -> 4
// Any other byte (a 10xxxxxx continuation byte, or 11111xxx) cannot start a
// UTF-8 sequence; 1 is returned for it so that a caller walking malformed input
// advances a single byte and can resynchronise instead of skipping past valid
// data. The bytes that follow the lead byte are not inspected or validated.
//
// firstByte: first byte of the character (may be a negative `char` on
//            platforms where char is signed).
// returns:   the byte length, in the range 1..4.
int utf8_char_len(char firstByte)
{
    // Work on the unsigned value so the comparisons below are independent of
    // whether plain char is signed.
    const unsigned char lead = static_cast<unsigned char>(firstByte);

    if (lead < 0x80) {            // 0xxxxxxx: single-byte ASCII
        return 1;
    }
    if ((lead & 0xE0) == 0xC0) {  // 110xxxxx: two-byte sequence
        return 2;
    }
    if ((lead & 0xF0) == 0xE0) {  // 1110xxxx: three-byte sequence
        return 3;
    }
    if ((lead & 0xF8) == 0xF0) {  // 11110xxx: four-byte sequence
        return 4;
    }
    return 1;                     // continuation or invalid lead byte
}
處理GBK中文字元
// Reports whether `str` starts with a byte that this code treats as the
// beginning of a Chinese character.
//
// Returns 1 when the most significant bit of the first byte is set, otherwise
// 0. An empty string yields 0, because its c_str()[0] is the terminating NUL.
// The check only looks at that one bit, so any non-ASCII first byte returns 1.
// The original notes mention both GBK (where a Chinese character occupies two
// bytes) and UTF-8; the high-bit test itself does not distinguish between them.
int SentenceEncoder::isChineseCharacter(string str){
    const unsigned char leadByte = static_cast<unsigned char>(*str.c_str());
    const bool highBitSet = (leadByte & 0x80) != 0;
    return highBitSet ? 1 : 0;
}