1. 程式人生 > >perl -正向最大匹配 轉自Sighan 提供的FMM程式

perl -正向最大匹配 轉自Sighan 提供的FMM程式

#!/usr/bin/perl -w
# GBK編碼,引數一 詞典  引數二 待分文字

#轉自Sighan 提供的FMM程式 ,不是原創

#所以把人家的宣告都放在下面了

###########################################################################
#                                                                         #
#                               SIGHAN                                    #
#                         Copyright (c) 2003                              #
#                        All Rights Reserved.                             #
#                                                                         #
#  Permission is hereby granted, free of charge, to use and distribute    #
#  this software and its documentation without restriction, including     #
#  without limitation the rights to use, copy, modify, merge, publish,    #
#  distribute, sublicense, and/or sell copies of this work, and to        #
#  permit persons to whom this work is furnished to do so, subject to     #
#  the following conditions:                                              #
#   1. The code must retain the above copyright notice, this list of      #
#      conditions and the following disclaimer.                           #
#   2. Any modifications must be clearly marked as such.                  #
#   3. Original authors' names are not deleted.                           #
#   4. The authors' names are not used to endorse or promote products     #
#      derived from this software without specific prior written          #
#      permission.                                                        #
#                                                                         #
#  SIGHAN AND THE CONTRIBUTORS TO THIS WORK DISCLAIM ALL WARRANTIES       #
#  WITH REGARD TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF      #
#  MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL SIGHAN NOR THE          #
#  CONTRIBUTORS BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL      #
#  DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA     #
#  OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER      #
#  TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR       #
#  PERFORMANCE OF THIS SOFTWARE.                                          #
#                                                                         #
###########################################################################
#                                                                         #
# Author: Richard Sproat ([email protected])                              #
#                                                                         #
###########################################################################


# Usage text shown when the mandatory dictionary argument is missing.
# Text to segment is read from the remaining file arguments, or from
# standard input when none are given.
my $USAGE = "Usage:\t$0 dictionary [text-file ...]\n";


# Without a dictionary there is nothing to match against.  Report the
# problem on STDERR (so it cannot be mistaken for segmentation output in a
# pipeline) and exit with a non-zero status.
if (@ARGV < 1) {
    print STDERR $USAGE;
    exit 1;
}


# Word list: each key is one dictionary entry, the value is always 1.
my %dict;

# Length of the longest dictionary entry.  No encoding layer is applied to
# the file handle below, so this is measured in whatever units length()
# uses for raw input (bytes, unless the environment forces decoding).
my $maxwlen = 0;


# Read the dictionary named by the first command-line argument, one entry
# per line.  Three-argument open keeps characters in the file name from
# being interpreted as an open mode.
open(my $dict_fh, '<', $ARGV[0]) or die "$ARGV[0]: $!\n";
while (my $word = <$dict_fh>) {
    # Strip only the line terminator (LF or CRLF).  chop would also eat the
    # last byte of a final entry that has no trailing newline, and would
    # leave "\r" attached to every entry of a CRLF file.
    $word =~ s/\r?\n\z//;

    # A blank line is not a word; the matcher never looks up "".
    next if $word eq "";

    $dict{$word} = 1;

    my $word_len = length($word);
    $maxwlen = $word_len if $word_len > $maxwlen;
}
close($dict_fh);


# Segment one line of text by forward maximum matching and return the
# result as a string in which every token is followed by "/".
#
# Arguments:
#   $text     - the raw line; all whitespace is removed before matching
#   $dict_ref - reference to a hash whose true-valued keys are the words
#   $max_len  - length of the longest dictionary word (upper bound on the
#               candidate prefix that is tried)
#   $line_no  - line counter, used only in the diagnostic message
#
# At each position the longest prefix found in the dictionary is emitted.
# When no prefix matches, a single character is emitted instead: one byte
# if the first byte is below 0x80, otherwise two bytes (the text is treated
# as a double-byte encoding such as GBK, per the header comment).
sub segment_line {
    my ($text, $dict_ref, $max_len, $line_no) = @_;

    $text =~ s/\s+//g;

    my $out = "";
    while ($text ne "") {
        # Never try a candidate longer than the remaining text.
        my $limit = length($text);
        $limit = $max_len if $max_len < $limit;

        # Try the longest candidate first, shrinking one byte at a time.
        my $take = 0;
        for (my $len = $limit; $len > 0; --$len) {
            if ($dict_ref->{ substr($text, 0, $len) }) {
                $take = $len;
                last;
            }
        }

        if ($take == 0) {
            if ($text =~ /^[\x00-\x7f]/) {
                # Single-byte character.  Control bytes are included here so
                # they cannot swallow the following byte and misalign the
                # double-byte characters after them.
                $take = 1;
            }
            elsif (length($text) >= 2) {
                # Lead byte plus trail byte of a double-byte character.
                $take = 2;
            }
            else {
                # A lone high byte at the end of the line (truncated
                # character).  Emit the byte itself rather than a regex
                # capture variable, which no match has set at this point.
                print STDERR "Oops: shouldn't be here: $line_no\n";
                $take = 1;
            }
        }

        # Four-argument substr removes the token from $text and returns it.
        $out .= substr($text, 0, $take, "") . "/";
    }

    return $out;
}


# Drop the dictionary path so that <> reads only the text to be segmented
# (the remaining file arguments, or standard input when there are none).
shift @ARGV;


# Zero-based count of lines processed so far, reported in diagnostics.
my $line_no = 0;
while (my $line = <>) {
    # chomp, not chop: a final line without a trailing newline must keep
    # its last byte.
    chomp $line;
    print segment_line($line, \%dict, $maxwlen, $line_no), "\n";
    ++$line_no;
}


# All input has been consumed; finish with a success status.
exit 0;