1. 程式人生 > >perl 大文本詞頻統計.

perl 大文本詞頻統計.

pre bst geb don nbsp length $2 詞頻統計 int

思路是先設定子文本的最大長度,把大文本分割成多個子文本分別統計,最後再把各部分的結果合併。

詞頻統計的單位是「前一位置的字」與「當前位置的字」組成的二元組合,以該組合作為 hash 的鍵進行計數。

代碼如下

use Encode;   ##編碼解碼
# Driver: count character bigrams of train.txt into sorted temp chunks,
# merge the chunks into bi.txt, remove the chunks, then count single
# characters.

# Print the start time (`time /t` is the Windows cmd.exe form of the command).
system("time /t");

# Number of characters BiCount processes before spilling its counts to a
# temp file.
$g_MaxBiNum=1000000;

BiCount("train.txt");           # fills @BiTmp with the chunk file names
MergeBi(\@BiTmp,"bi.txt");
foreach (@BiTmp){
	unlink($_);                 # chunk files are only intermediate results
}

# Print the end time of the bigram pass.
system("time /t");

# Count single-character frequencies.  The parentheses are required: the sub
# is defined further down the file, so a bare `TrainWordToNum;` here is
# parsed as a string constant and never calls it.
TrainWordToNum();


# BiCount($File)
#
# Counts character bigrams ("previous_current") of $File and spills the
# counts to sorted chunk files named tmp_0, tmp_1, ... in the current
# directory.  A chunk is written each time more than $g_MaxBiNum characters
# have been counted since the previous spill, and once more at end of input
# for whatever is left, so no counts are lost.
#
# Whitespace is removed from every line before counting.  A byte with the
# high bit set is treated as the first byte of a two-byte character (the
# double-byte layout this script assumes for its input); any other byte is a
# one-byte character.  The previous character is carried across line breaks,
# so the last character of one line pairs with the first of the next.
#
# Globals used: $g_MaxBiNum (read), %hashBi (scratch counter, left empty),
# @BiTmp (reset, then filled with the chunk file names in creation order).
# Dies if the input or a chunk file cannot be opened.
sub BiCount
{
	my ($File) = @_;

	my $chunk_base = "tmp";
	my $chunk_id   = 0;
	my $char_count = 0;     # characters counted since the last spill
	my $prev       = "";    # previous character; empty before the first one

	@BiTmp = ();

	# Write the current counts as sorted "bigram<TAB>count" lines to the next
	# chunk file, record its name, and reset the per-chunk state.
	my $flush = sub {
		my $chunk_file = $chunk_base . "_" . $chunk_id;
		open(my $out, '>', $chunk_file)
			or die "BiCount: cannot write '$chunk_file': $!";
		foreach my $bi (sort keys %hashBi) {
			print {$out} "$bi\t$hashBi{$bi}\n";
		}
		close($out)
			or die "BiCount: error closing '$chunk_file': $!";
		push(@BiTmp, $chunk_file);
		print "$chunk_file done!\n";
		%hashBi     = ();
		$char_count = 0;
		$chunk_id++;
	};

	open(my $in, '<', $File)
		or die "BiCount: cannot read '$File': $!";
	while (my $line = <$in>) {
		chomp $line;
		$line =~ s/\s+//g;

		# Walk the line with an offset instead of re-copying the remainder
		# of the string after every character.
		my $end = length($line);
		my $pos = 0;
		while ($pos < $end) {
			my $len = (ord(substr($line, $pos, 1)) & 0x80) ? 2 : 1;
			my $cur = substr($line, $pos, $len);

			$hashBi{$prev . "_" . $cur}++ if $prev ne "";
			$prev = $cur;
			$char_count++;

			$flush->() if $char_count > $g_MaxBiNum;
			$pos += $len;
		}
	}
	close($in);

	# Spill the final, partially filled chunk; without this the tail of the
	# input (or all of a small input) would never reach a chunk file.
	$flush->() if %hashBi;

	return;
}

# MergeBi($RefBiFileList, $Merged)
#
# K-way merge of sorted chunk files into one file.
#
#   $RefBiFileList  array ref of chunk file names; each file holds
#                   "key<TAB>count" lines sorted by key in Perl's default
#                   string order (the format BiCount writes).
#   $Merged         name of the output file; it is overwritten.
#
# For every distinct key the counts from all chunks are added up and one
# "key<TAB>total" line is written, in sorted key order.  Only the current
# line of each chunk is held in memory.  A chunk stops being read at end of
# file or at the first line that does not look like "key<TAB>count".
#
# Dies if a chunk or the output file cannot be opened, or if the output
# cannot be closed cleanly; a silently skipped chunk would produce wrong
# totals.
sub MergeBi
{
	my ($RefBiFileList, $Merged) = @_;

	my %fh_of;      # chunk file name => open read handle
	my %pending;    # key => { chunk file name => count } for each chunk's current line

	# Read the next line of one chunk and queue it under its key.
	my $advance = sub {
		my ($name) = @_;
		my $line = readline($fh_of{$name});
		if (defined $line && $line =~ /(\S+)\t(\d+)/) {
			$pending{$1}{$name} = $2;
		}
	};

	open(my $out, '>', $Merged)
		or die "MergeBi: cannot write '$Merged': $!";

	foreach my $name (@{$RefBiFileList}) {
		open(my $fh, '<', $name)
			or die "MergeBi: cannot read '$name': $!";
		$fh_of{$name} = $fh;
		$advance->($name);
	}

	while (%pending) {
		# The smallest queued key is final: every chunk is sorted, so no
		# chunk can still deliver a smaller one.
		my ($key)   = sort keys %pending;
		my $by_file = delete $pending{$key};

		my $total = 0;
		foreach my $name (keys %{$by_file}) {
			$total += $by_file->{$name};
		}
		print {$out} "$key\t$total\n";

		# Only the chunks that contributed this key move to their next line.
		foreach my $name (keys %{$by_file}) {
			$advance->($name);
		}
	}

	foreach my $name (keys %fh_of) {
		close($fh_of{$name});
	}
	close($out)
		or die "MergeBi: error closing '$Merged': $!";

	return;
}

# TrainWordToNum([$File])
#
# Counts how often each character occurs in $File (default "train.txt") and
# adds the counts to the global %Word2Num.  Every line is decoded from GBK,
# split into single characters, and each character is re-encoded to GBK to
# serve as the hash key, so the keys are GBK byte strings.  The line's
# trailing newline is dropped; every other character, including spaces, is
# counted.
#
# Dies if the file cannot be opened.
sub TrainWordToNum{
	my ($File) = @_;
	$File = "train.txt" unless defined $File;

	open(my $in, '<', $File)
		or die "TrainWordToNum: cannot read '$File': $!";
	while (my $raw = <$in>)
	{
		chomp $raw;
		my $text = decode("GBK", $raw);
		# Split on decoded characters so multi-byte characters stay whole.
		foreach my $char ($text =~ /./g)
		{
			$Word2Num{ encode("GBK", $char) }++;
		}
	}
	close($in);
}

  

perl 大文本詞頻統計.