用java寫的詞頻統計
最近做論文,中間用到詞頻統計,這裡是自己用java語言寫的,貼出來共享.
DictionaryInterface.java 檔案定義了詞典介面。
import java.util.Iterator;
/**
 * Contract for a key/value dictionary.
 *
 * <p>Only two members are shown in this listing; the {@code ...} line below stands for
 * members that were left out of the listing.
 */
public interface DictionaryInterface {
/**
 * Associates {@code value} with {@code key}.
 *
 * <p>In the Mydictionary implementation in this file, the value previously stored under
 * {@code key} is returned when the key was already present, and {@code null} otherwise.
 */
public Object add(Object key , Object value);
/**
 * Looks up the value associated with {@code key}.
 *
 * <p>In the Mydictionary implementation in this file, {@code null} is returned when the
 * key is not present.
 */
public Object getValue(Object key);
...
}
Mydictionary.java 檔案是詞典類,其中的內部類 Entry 是詞典成員的型別,包括鍵(即詞)和值(這裡是頻數)。
public class Mydictionary implements DictionaryInterface{
private Entry[] entries;
private int currentSize = 0;
private final static int DEFAULT_MAX_SIZE = 25;
public Mydictionary(){
entries = new Entry[DEFAULT_MAX_SIZE];
currentSize = 0;
}
public Mydictionary(int maxSize){
entries = new Entry[maxSize];
currentSize = 0;
}
public Object add(Object key , Object value){
Object result = null;
int keyIndex = locateIndex(key);
//System.out.println(keyIndex);
if((keyIndex<currentSize)&&key.equals(entries[keyIndex].getKey()))
{
result=entries[keyIndex].getValue();
entries[keyIndex].setValue(value);
}
else
{
if(isArrayFull())
doubleArray();
makeRoom(keyIndex);
entries[keyIndex]= new Entry(key , value);
currentSize ++;
}
return result;
}
private void makeRoom(int keyIndex) {
for(int index = currentSize;index>keyIndex;index--)
entries[index] = entries[index-1];
}
private void doubleArray() {
Entry[] oldList = entries;
int oldSize = oldList.length;
entries = new Entry[2*oldSize];
for(int index = 0;index <oldSize;index++)
{
entries[index]=(Entry) oldList[index];
}
}
private boolean isArrayFull() {
if(currentSize<entries.length)
return false;
else
return true;
}
private int locateIndex(Object key) {
Comparable cKey = (Comparable)key;
int index = 0;
while ((index<currentSize)&&cKey.compareTo(entries[index].getKey())>0)
{
index++;
}
return index;
}
public Object getValue(Object key){
int index = locateIndex(key);
//System.out.println(index);
Object en=null;
if((index<currentSize)&&key.equals(entries[index].getKey())){
en = entries[index].getValue();
}
return en;
}
}
private class Entry implements java.io.Serializable{
private Object key=null;
private Object value=null;
private Entry(Object searchKey,Object dataValue)
{
key = searchKey;
value = dataValue;
}
Object getKey()
{
return key;
}
private Object getValue()
{
return value;
}
private void setValue(Object dataValue)
{
value = dataValue;
}
}
public void display(){
for(int i=0;i<currentSize;i++){
//System.out.println(currentSize);
System.out.println(entries[i].getKey());
System.out.println(entries[i].getValue());
}
}
public boolean save2shuzhu(Object[] w,Object[] f){
for(int i=0;i<currentSize;i++){
w[i]=entries[i].getKey();
f[i]=entries[i].getValue();
}
return false;
}
}
以下的統計類是在中科院 ICTCLAS 中文分詞的結果上進行編寫的。
FrequencyCounter.java 檔案是統計類,包括統計方法和顯示方法。
/**
 * Counts how often each word occurs, keeping the totals in a {@link Mydictionary}
 * keyed by word.
 */
public class FrequencyCounter {

    /** Maps each word ({@code String}) to its occurrence count ({@code Integer}). */
    private final Mydictionary wordTable = new Mydictionary();

    /**
     * Adds every word of {@code sentence} to the running totals.
     *
     * @param sentence the words to count; per the original author's note, a Sentence is a
     *                 list containing words
     */
    public void count(Sentence sentence) {
        for (int i = 0; i < sentence.totalWords(); i++) {
            String word = sentence.getWord(i).getWord();
            Object value = wordTable.getValue(word);
            // A missing entry means this is the first occurrence of the word.
            int wordFrequency = (value == null) ? 1 : ((Integer) value).intValue() + 1;
            wordTable.add(word, Integer.valueOf(wordFrequency));
        }
    }

    /** Prints every word and its count to standard output via {@link Mydictionary#display()}. */
    public void display() {
        wordTable.display();
    }

    /**
     * Entry point. Only constructs a counter; to use it, pass a segmented Sentence to
     * {@link #count(Sentence)} and then call {@link #display()}.
     */
    public static void main(String[] args) {
        FrequencyCounter f = new FrequencyCounter();
    }
}
將此詞頻統計程式與中科院的 ICTCLAS 中文分詞系統相結合,可以作為自然語言處理的工具。