特徵提取-計算tf-idf
阿新 • • 發佈:2019-02-06
用Java實現特徵提取計算tf-idf
(1)計算反文件頻次(IDF)公式如下:$idf = \log_2 \dfrac{D}{df + 1}$,其中 $D$ 為文章總數,$df$ 為包含該詞的文章數。
(2)計算TF-IDF公式如下:
tf-idf=tf*idf
(2)Java程式碼實現
package com.panguoyuan.datamining.first;

import java.io.BufferedReader;
import java.io.FileReader;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;

/**
 * Computes a TF-IDF score for every (article, word) pair of a corpus and prints
 * the pairs ordered by {@link Result}'s natural ordering.
 *
 * <p>The corpus is a text file with one article per line; words inside an
 * article are separated by single spaces. The score is
 * {@code tf * log2(D / (df + 1))}, where {@code tf} is the number of times the
 * word occurs in the article, {@code D} is the number of articles and
 * {@code df} is the number of articles that contain the word.
 *
 * <p>All state lives in the two public static lists below, so the class is not
 * safe for concurrent use.
 */
public class TestTFIDF {

    /** Corpus file read by {@link #main(String[])} when no path argument is given. */
    private static final String DEFAULT_DATA_FILE =
            "F:\\workspace1\\datamining\\data\\test_sohu_news_data";

    /**
     * One word-count map (word -> occurrences) per article, in file order.
     * The raw element type is kept so existing users of this public field
     * continue to compile; every map stored here is a {@code Map<String, Integer>}.
     */
    public static List<Map> articleList = new ArrayList<Map>();

    /** Computed results: article index, word, tf-idf. */
    public static List<Result> listResult = new ArrayList<Result>();

    /**
     * Loads the corpus, computes all TF-IDF scores and prints them.
     *
     * @param args optional; {@code args[0]} overrides the default corpus path
     * @throws Exception if the corpus file cannot be opened or read
     */
    public static void main(String[] args) throws Exception {
        String path = (args != null && args.length > 0) ? args[0] : DEFAULT_DATA_FILE;
        // NOTE(review): FileReader decodes with the platform default charset on
        // older JDKs; confirm that it matches the encoding of the corpus file.
        BufferedReader fr = new BufferedReader(new FileReader(path));
        try {
            initD(fr);
        } finally {
            // Release the file handle even when reading fails.
            fr.close();
        }
        calculateTF_IDF();
        print();
    }

    /** Sorts {@link #listResult} in place and writes one line per result to standard output. */
    private static void print() {
        Collections.sort(listResult);
        for (Result r : listResult) {
            System.out.println("第【" + r.getArticleId() + "】篇文章 :" + r.getWord() + " " + r.getTFIDF());
        }
    }

    /**
     * Appends one {@link Result} to {@link #listResult} for every distinct word
     * of every article in {@link #articleList}.
     *
     * <p>Scores are appended, never cleared, so calling this method twice
     * yields every result twice. A score is zero or negative when the word
     * occurs in at least {@code D - 1} of the {@code D} articles; such results
     * are kept.
     *
     * @throws Exception never thrown by this implementation; declared for
     *         compatibility with existing callers
     */
    @SuppressWarnings("unchecked") // articleList is declared with a raw element type
    public static void calculateTF_IDF() throws Exception {
        int d = articleList.size();
        // Count document frequencies once instead of rescanning every article
        // for every word.
        Map<String, Integer> documentFrequency = countDocumentFrequencies();
        for (int i = 0; i < d; i++) {
            Map<String, Integer> wordMap = articleList.get(i);
            // Read key and value directly; parsing the entry's "key=value"
            // string would break on any word that contains '='.
            for (Map.Entry<String, Integer> entry : wordMap.entrySet()) {
                String word = entry.getKey();
                int tf = entry.getValue();
                int df = documentFrequency.get(word);
                // Divide as double: integer division truncates the ratio and
                // produces log(0) = -Infinity whenever df + 1 > d.
                double idf = log((double) d / (df + 1), 2);
                listResult.add(new Result(i, word, tf * idf));
            }
        }
    }

    /** Builds a map from each word to the number of articles in {@link #articleList} containing it. */
    @SuppressWarnings("unchecked") // articleList is declared with a raw element type
    private static Map<String, Integer> countDocumentFrequencies() {
        Map<String, Integer> documentFrequency = new HashMap<String, Integer>();
        for (Map<String, Integer> wordMap : articleList) {
            for (String word : wordMap.keySet()) {
                Integer seen = documentFrequency.get(word);
                documentFrequency.put(word, seen == null ? 1 : seen + 1);
            }
        }
        return documentFrequency;
    }

    /**
     * Reads the corpus, one article per line, and appends the word-count map
     * of each article to {@link #articleList}. Blank lines are skipped. The
     * reader is not closed.
     *
     * @param br reader positioned at the start of the corpus
     * @throws Exception if reading from {@code br} fails
     */
    public static void initD(BufferedReader br) throws Exception {
        String line;
        // readLine() is the only end-of-input test: probing with read() would
        // swallow the first character of every article.
        while ((line = br.readLine()) != null) {
            if (line.trim().length() == 0) {
                continue;
            }
            articleList.add(getWordCountMap(line));
        }
    }

    /**
     * Returns the number of articles in {@link #articleList} that contain {@code word}.
     *
     * @param word the word to look up
     * @return the document frequency of {@code word}; 0 if no article contains it
     */
    public static int getDF(String word) {
        int count = 0;
        for (int i = 0; i < articleList.size(); i++) {
            if (articleList.get(i).containsKey(word)) {
                count++;
            }
        }
        return count;
    }

    /**
     * Counts how often each word occurs in an article.
     *
     * @param article the article text with words separated by single spaces;
     *        may be {@code null}
     * @return a new map from word to occurrence count; empty when
     *         {@code article} is {@code null} or contains no words
     * @throws Exception never thrown by this implementation; declared for
     *         compatibility with existing callers
     */
    public static Map<String, Integer> getWordCountMap(String article) throws Exception {
        Map<String, Integer> wordMap = new HashMap<String, Integer>();
        if (article == null) {
            return wordMap;
        }
        String[] words = article.split(" ");
        for (int j = 0; j < words.length; j++) {
            String word = words[j];
            if (word.length() == 0) {
                // Consecutive or leading spaces yield empty tokens, which are not words.
                continue;
            }
            Integer count = wordMap.get(word);
            wordMap.put(word, count == null ? 1 : count + 1);
        }
        return wordMap;
    }

    /**
     * Returns the logarithm of {@code value} to the given {@code base}.
     *
     * @param value the number whose logarithm is taken
     * @param base the base of the logarithm
     * @return {@code Math.log(value) / Math.log(base)}
     */
    public static double log(double value, double base) {
        return Math.log(value) / Math.log(base);
    }
}
package com.panguoyuan.datamining.first; public class Result implements Comparable { private int articleId; private String word; private double TFIDF; public Result(int articleId, String word, double tFIDF) { super(); this.articleId = articleId; this.word = word; TFIDF = tFIDF; } public int getArticleId() { return articleId; } public void setArticleId(int articleId) { this.articleId = articleId; } public String getWord() { return word; } public void setWord(String word) { this.word = word; } public double getTFIDF() { return TFIDF; } public void setTFIDF(double tFIDF) { TFIDF = tFIDF; } @Override public int compareTo(Object o) { Result r = (Result)o; if(r.getTFIDF() > this.TFIDF){ return 1; }else if(r.getTFIDF()<this.TFIDF){ return -1; }else{ return 0; } } }
(3)排序後輸出結果