perl 大文本詞頻統計.
阿新 • • 發佈:2017-05-27
pre bst geb don nbsp length $2 詞頻統計 int
思路是先設定子文本的最大長度,把大文本分割成多個子文本分別統計,最後再把各部分的結果合併.
詞頻統計的對象是二元組(bigram):把前一位置的字和當前位置的字組合成鍵,存入 hash 並累加次數.
代碼如下
use strict;
use warnings;
use Encode;    # decode()/encode() are used by TrainWordToNum for GBK text

# Bigram frequency counter for large texts.
#
# The input is processed in chunks: once more than $g_MaxBiNum characters have
# been counted, the in-memory bigram table is written (sorted by key) to a
# temporary file and cleared.  The sorted temporary files are then merged into
# one result file, summing the counts of identical bigrams.

# Maximum number of characters counted before the bigram table is flushed.
our $g_MaxBiNum = 1000000;

# Names of the temporary bigram files written by BiCount.
our @BiTmp = ();

# Per-character counts filled in by TrainWordToNum.
our %Word2Num;

print scalar(localtime), "\n";    # start time (portable; was "time /t")
BiCount("train.txt");
MergeBi(\@BiTmp, "bi.txt");
foreach my $TmpFile (@BiTmp) {
    unlink($TmpFile) or warn "Cannot remove $TmpFile: $!\n";
}
print scalar(localtime), "\n";    # end time

# Explicit parentheses are required: a bareword used before the sub is
# defined is not parsed as a call.
TrainWordToNum();

# BiCount($File)
#
# Counts character bigrams in $File and writes them to temporary files named
# tmp_0, tmp_1, ...; the names are stored in @BiTmp.  Each output line is
# "<previous char>_<current char>\t<count>", sorted by key.
#
# The input is read as raw bytes.  A byte with the high bit set is taken to
# start a two-byte character, anything else is a one-byte character.
# Whitespace is removed first, and the previous character is carried across
# line boundaries.  Dies if a file cannot be opened or written.
sub BiCount {
    my ($File) = @_;

    my $BiFile = "tmp";    # prefix of the temporary file names
    my %hashBi;            # bigram => count for the current chunk
    my $ZiNum = 0;         # characters counted in the current chunk
    my $ID    = 0;         # index of the next temporary file
    my $H1    = "";        # previous character ("" before the first one)

    @BiTmp = ();

    # Writes the current chunk to the next temporary file and resets the
    # chunk state.
    my $flush = sub {
        my $BiFileTmp = $BiFile . "_" . $ID;
        open(my $Out, '>', $BiFileTmp)
            or die "Cannot write $BiFileTmp: $!";
        foreach my $Bi (sort keys %hashBi) {
            print {$Out} "$Bi\t$hashBi{$Bi}\n";
        }
        close($Out) or die "Cannot close $BiFileTmp: $!";
        push(@BiTmp, $BiFileTmp);
        print "$BiFileTmp done!\n";
        %hashBi = ();
        $ZiNum  = 0;
        $ID++;
    };

    open(my $In, '<', $File) or die "Cannot open $File: $!";
    while (my $Line = <$In>) {
        chomp $Line;
        $Line =~ s/\s+//g;

        # Walk the line by offset instead of repeatedly copying the tail.
        my $Total = length $Line;
        my $Pos   = 0;
        while ($Pos < $Total) {
            my $Len = (ord(substr($Line, $Pos, 1)) & 0x80) ? 2 : 1;

            # A truncated two-byte character at the end of the line must not
            # read past the end of the string.
            $Len = $Total - $Pos if $Len > $Total - $Pos;

            my $H2 = substr($Line, $Pos, $Len);
            if ($H1 ne "") {
                $hashBi{ $H1 . "_" . $H2 }++;
            }
            $H1 = $H2;
            $ZiNum++;
            $flush->() if $ZiNum > $g_MaxBiNum;
            $Pos += $Len;
        }
    }
    close($In);

    # Write the last, partially filled chunk; without this the tail of the
    # input (or the whole input, if it is short) would be lost.
    $flush->() if %hashBi;

    return;
}

# MergeBi($RefBiFileList, $Merged)
#
# Merges the sorted "<key>\t<count>" files listed in the array reference
# $RefBiFileList into the file $Merged, summing the counts of equal keys.
# Every input file must be sorted with Perl's default string sort, as the
# files written by BiCount are.
#
# Reading of a file stops at its first line that does not look like
# "<key>\t<count>".  Dies if a file cannot be opened or written.
sub MergeBi {
    my ($RefBiFileList, $Merged) = @_;

    my %hash;         # key => { file name => count } for each file's current line
    my %handle_of;    # file name => open read handle

    # Reads the next record of one file into %hash.
    my $advance = sub {
        my ($Name) = @_;
        my $Fh     = $handle_of{$Name};
        my $Record = <$Fh>;
        if (defined $Record && $Record =~ /(\S+)\t(\d+)/) {
            $hash{$1}{$Name} = $2;
        }
    };

    open(my $Out, '>', $Merged) or die "Cannot write $Merged: $!";

    foreach my $Name (@{$RefBiFileList}) {
        open(my $Fh, '<', $Name) or die "Cannot open $Name: $!";
        $handle_of{$Name} = $Fh;
        $advance->($Name);
    }

    # %hash holds at most one pending key per input file, so picking the
    # smallest key each round is cheap.
    while (%hash) {
        my ($Smallest) = sort keys %hash;
        my @Names = keys %{ $hash{$Smallest} };

        my $Num = 0;
        $Num += $hash{$Smallest}{$_} for @Names;
        print {$Out} "$Smallest\t$Num\n";

        delete $hash{$Smallest};
        $advance->($_) for @Names;    # only the files that supplied this key
    }

    close($_) for values %handle_of;
    close($Out) or die "Cannot close $Merged: $!";

    return;
}

# TrainWordToNum([$File])
#
# Counts every character of the GBK-encoded file $File (default "train.txt")
# in %Word2Num, keyed by the GBK-encoded character.  The newline is removed
# from each line, other whitespace is counted.  The counts are only stored in
# %Word2Num; nothing is written out here.  Dies if the file cannot be opened.
sub TrainWordToNum {
    my ($File) = @_;
    $File = "train.txt" unless defined $File;

    open(my $In, '<', $File) or die "Cannot open $File: $!";
    while (my $Raw = <$In>) {
        chomp $Raw;
        my $Text = decode("GBK", $Raw);
        foreach my $Char ($Text =~ /./g) {
            $Word2Num{ encode("GBK", $Char) }++;
        }
    }
    close($In);

    return;
}
perl 大文本詞頻統計.