兩篇文章的相似度比較
阿新 • • 發佈:2019-01-29
僅僅考慮兩篇文章的片語,並未考慮文字的語義資訊。
實現原理:
1. 對兩篇文件進行詞頻統計;
2. 利用“TF-IDF和餘弦相似度”原理,計算兩篇文件的相似度。
實現過程:
1. 利用 Lucene 對大量文章建立索引,建立語料庫,來提高 TF-IDF 的準確度。
2. 通過餘弦公式計算出兩篇文章的相似度。
package twodocsimiliary;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StringReader;
import java.text.DecimalFormat;
import java.util.HashMap;
import java.util.Map;

import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.FSDirectory;
import org.wltea.analyzer.core.IKSegmenter;
import org.wltea.analyzer.core.Lexeme;

/**
 * Compares the similarity of two documents via TF-IDF weighting and cosine
 * similarity. Only term occurrences are considered — no semantic information.
 * Document frequencies come from a pre-built Lucene index (the corpus).
 *
 * @author wangss
 * @date Aug 26, 2014
 */
public class comparisontwodoc {

    /** Term-hash (see {@link #ELFHash}) -> term text, used to report shared terms. */
    public static Map<Long, String> words = new HashMap<Long, String>();

    public static void main(String[] args) {
        String path_a = "H:\\a.txt";
        String path_b = "H:\\c.txt";
        String str = readFiles(path_a);
        String str2 = readFiles(path_b);

        Map<Long, Double> tf_a = iniCosine(str);
        Map<Long, Double> tf_b = iniCosine(str2);

        // Cosine similarity: dot(a, b) / (|a| * |b|).
        // FIX: accumulators must be double — the original declared them long,
        // and the compound assignment's implicit narrowing cast truncated each
        // TF-IDF product (e.g. 0.69 * 0.69 -> 0), corrupting the result.
        double molecular = 0;      // dot product (numerator)
        double denominator_a = 0;  // squared norm of document A
        double denominator_b = 0;  // squared norm of document B

        System.out.println("兩篇文件相似的詞有:");
        DecimalFormat df = new DecimalFormat("0.00");
        for (long tfa : tf_a.keySet()) {
            double wa = tf_a.get(tfa);
            Double wb = tf_b.get(tfa);
            denominator_a += wa * wa;
            molecular += wa * (wb == null ? 0 : wb);
            // FIX: the original also tested tf_a.get(tfa) != null, which is
            // always true while iterating tf_a's own key set.
            if (wb != null) { // term occurs in both documents
                System.out.println(words.get(tfa) + " TF-IDF詞頻統計 文件一:"
                        + df.format(wa) + ";文件二:" + df.format(wb));
            }
        }
        for (long tfb : tf_b.keySet()) {
            double wb = tf_b.get(tfb);
            denominator_b += wb * wb;
        }

        double result = 0;
        if (denominator_a != 0 && denominator_b != 0) {
            result = molecular / (Math.sqrt(denominator_a) * Math.sqrt(denominator_b));
        }
        System.out.println("兩篇文件相似度:" + df.format(result * 100) + "%");
    }

    /**
     * Reads a whole GBK-encoded text file into one string (line breaks dropped).
     *
     * @param path_a path of the file to read
     * @return file content with newlines removed, or {@code null} on any error
     *         (the original's best-effort contract is preserved)
     */
    private static String readFiles(String path_a) {
        // FIX: try-with-resources — the original leaked the reader whenever an
        // exception was thrown before its explicit close() call.
        try (BufferedReader bufferedReader = new BufferedReader(
                new InputStreamReader(new FileInputStream(new File(path_a)), "GBK"))) {
            StringBuffer str_a = new StringBuffer();
            String lineTxt;
            while ((lineTxt = bufferedReader.readLine()) != null) {
                str_a.append(lineTxt);
            }
            return str_a.toString();
        } catch (Exception e) {
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Tokenizes {@code str} with the IK analyzer and computes a TF-IDF weight
     * per term, using the Lucene index at H:\testIndex as the corpus for
     * document frequencies.
     *
     * @param str document text; {@code null} yields an empty map
     * @return map from term hash (see {@link #ELFHash}) to TF-IDF weight
     */
    private static Map<Long, Double> iniCosine(String str) {
        Map<Long, Long> tf = new HashMap<Long, Long>();      // term hash -> raw count
        Map<Long, Double> idf = new HashMap<Long, Double>(); // term hash -> IDF, then TF*IDF
        // FIX: guard — readFiles() returns null on failure, and the original
        // then threw NPE from new StringReader(null).
        if (str == null) {
            return idf;
        }
        Reader input = new StringReader(str);
        // true = smart segmentation (affects tokenization precision considerably)
        IKSegmenter iks = new IKSegmenter(input, true);
        Lexeme lexeme = null;
        try {
            IndexReader indexReader = DirectoryReader.open(FSDirectory.open(new File("H:\\testIndex")));
            try {
                int allDocs = indexReader.numDocs(); // corpus size
                // FIX: hoisted out of the tokenization loop — one searcher per
                // reader suffices; the original rebuilt it for every token.
                IndexSearcher indexSearcher = new IndexSearcher(indexReader);
                while ((lexeme = iks.next()) != null) {
                    String lexemeText = lexeme.getLexemeText();
                    long hash = ELFHash(lexemeText);
                    TopDocs topDocs = indexSearcher.search(
                            new TermQuery(new Term("content", lexemeText)), indexReader.maxDoc());
                    int totalHits = topDocs.scoreDocs.length;
                    // FIX: cast to double — the original's integer division
                    // floored allDocs/(totalHits+1) before taking the log,
                    // collapsing most IDF values.  +1 avoids division by zero
                    // for terms absent from the corpus.
                    double log = Math.log((double) allDocs / (totalHits + 1));
                    if (log < 0) {
                        log = 0; // clamp negative IDF (term present in nearly every document)
                    }
                    idf.put(hash, log);
                    tf.put(hash, null == tf.get(hash) ? 1 : tf.get(hash).longValue() + 1);
                    words.put(hash, lexemeText);
                }
                // TF-IDF = term frequency * inverse document frequency.
                for (long m : idf.keySet()) {
                    idf.put(m, tf.get(m) * idf.get(m));
                }
            } finally {
                indexReader.close(); // FIX: the original never closed the index reader
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        return idf;
    }

    /**
     * ELF-style string hash, truncated to a non-negative 31-bit value.
     *
     * @param str string to hash
     * @return hash value in [0, 2^31)
     */
    public static long ELFHash(String str) {
        long hash = 0;
        long x = 0;
        for (int i = 0; i < str.length(); i++) {
            hash = (hash << 4) + str.charAt(i);
            if ((x = hash & 0xF0000000L) != 0) {
                hash ^= (x >> 24);
                hash &= ~x;
            }
        }
        return (hash & 0x7FFFFFFF);
    }
}
對比兩篇文件的相似度,分析結果如下: 樣本說明(字數600左右,來自新浪部落格文章):a.txt分別和b.txt c.txt d.txt進行對比 1. a.txt和b.txt內容一樣; 2. a.txt和c.txt有一半左右內容一樣; 3. a.txt和d.txt是兩篇不同的新浪部落格文章; 對比結果: 1. 兩篇文件(一樣的文件)相似度:100.00%; 2. 兩篇文件(一半類似)相似度:53.25%; 兩篇文件相似詞的詞頻統計如下: “訴說” TF-IDF詞頻統計 文件一:0.69;文件二:0.69 “文壇” TF-IDF詞頻統計 文件一:0.69;文件二:0.69 “