simhash短文字去重
阿新 • • 發佈:2018-12-08
SimHash 的 Java 實現:
import com.hankcs.hanlp.seg.common.Term; import com.hankcs.hanlp.tokenizer.StandardTokenizer; import java.math.BigInteger; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; import org.apache.commons.lang3.StringUtils; import org.jsoup.Jsoup; import org.jsoup.safety.Whitelist; /** * Created by Yangyang Deng on 17-9-7. */ public class SimhashAlgoService { public static void main(String[] args) { SimhashAlgoService simhashAlgoService = new SimhashAlgoService(); String string = "勞斯萊斯女神\n" + "\n" + "這個車標的設計者是英國畫家兼雕刻家查爾斯·賽克斯。20世紀初,經朋友蒙塔古邀請,賽克斯負責為勞斯萊斯設計一尊雕塑車標。當時,已婚的蒙塔古瘋狂地愛著他的女祕書桑頓,懇請賽克斯以桑頓為原型設計車標。所以,賽克斯的最初設計中,雕像是一尊披著長袍的女人將手指放在嘴脣上,象徵著蒙塔古與桑頓之間不能說的祕密情史。這個戀愛故事歷經重重磨難,桑頓身份地位曾是脫衣舞女郎,所以兩人根本無法在一起生活,在得到家庭與蒙塔古妻子的諒解後,兩人最終可以走到一起,不幸的是,後來桑頓在一次乘船旅行中不幸遭遇德軍水雷,永遠沉入了冰冷的大海。\n" + "\n" + "後來,他們這段美好的愛情又略帶悽慘故事就保留在了這個車標上,羅 -羅二人也是蒙塔古的好友,他們得知這件事之後非常感動。後來,他們邀請賽克斯又把它改為雙手如羽翼般向後伸展的形象,也就是今天的“飛天女神”。 1911年,它正式成為勞斯萊斯車的車標。從此,勞斯萊斯的飛天女神車標更是美麗的愛情象徵了!"; // 返回的指紋已經被切分成4段,方便利用指紋作對比。具體對比方式可自行百度。 List<String> fingerPrints = simhashAlgoService.simHash(string,64); System.out.println(fingerPrints); } private StandardTokenizer hanlpService; // 待分詞的文字 private String tokens; // 十進位制的指紋 private BigInteger intSimHash; // 二進位制的指紋 private String strSimHash; // 二進位制指紋的4個子指紋 private String strSimHashA; private String strSimHashB; private String strSimHashC; private String strSimHashD; private Map<String,Integer> wordCount; private int overCount = 5; public BigInteger getIntSimHash(){ return this.intSimHash; } public String getStrSimHash() { return this.strSimHash; } private String getStrSimHashA() { return this.strSimHashA; } private String getStrSimHashB() { return this.strSimHashB; } private String getStrSimHashC() { return this.strSimHashC; } private String getStrSimHashD() { return this.strSimHashD; } // 指紋的長度 private int hashbits = 64; // 停用的詞性 private Map<String,String> stopNatures = new HashMap<String, String>(); // 詞性的權重 private Map<String, Integer> weightOfNature = new 
HashMap<String, Integer>(); public void setTokens(String tokens) { this.tokens = tokens; } public void setHashbits(int hashbits) { this.hashbits = hashbits; } private void setMap() { // 停用詞性為w:標點 this.stopNatures.put("w",""); // 個性化設定詞性權重,這裡將n:名詞設定為2。(預設權重為1) this.weightOfNature.put("n",2); } private String preProcess(String content) { // 若輸入為HTML,下面會過濾掉所有的HTML的tag content = Jsoup.clean(content, Whitelist.none()); content = StringUtils.lowerCase(content); String[] strings = {" ","\n","\\r","\\n","\\t"," "}; for (String s:strings) { content = content.replace(s,""); } return content; } public List<String> simHash(String tokens, int hashbits) { tokens = preProcess(tokens); // cleanResume 刪除簡歷固有文字 this.tokens = cleanResume(tokens); this.hashbits = hashbits; this.wordCount = new HashMap<String, Integer>(); setMap(); // 定義特徵向量/陣列 int[] v = new int[this.hashbits]; // 1、將文字去掉格式後, 分詞. List<Term> termList = StandardTokenizer.segment(this.tokens); for (Term term:termList){ String word = term.word; String nature = term.nature.toString(); // 過濾超頻詞 if (this.wordCount.containsKey(word)) { int count = this.wordCount.get(word); if (count>this.overCount) {continue;} this.wordCount.put(word,count+1); } else { this.wordCount.put(word,1); } // 過濾停用詞性 if (this.stopNatures.containsKey(nature)) {continue;} // 2、將每一個分詞hash為一組固定長度的數列.比如 64bit 的一個整數. BigInteger t = this.hash(word); for (int i = 0; i < this.hashbits; i++) { BigInteger bitmask = new BigInteger("1").shiftLeft(i); // 3、建立一個長度為64的整數陣列(假設要生成64位的數字指紋,也可以是其它數字), // 對每一個分詞hash後的數列進行判斷,如果是1000...1,那麼陣列的第一位和末尾一位加1, // 中間的62位減一,也就是說,逢1加1,逢0減1.一直到把所有的分詞hash數列全部判斷完畢. 
int weight = 1; if (this.weightOfNature.containsKey(nature)) { weight = this.weightOfNature.get(nature); } if (t.and(bitmask).signum() != 0) { // 這裡是計算整個文件的所有特徵的向量和 v[i] += weight; } else { v[i] -= weight; } } } BigInteger fingerprint = new BigInteger("0"); StringBuffer simHashBuffer = new StringBuffer(); for (int i = 0; i < this.hashbits; i++) { // 4、最後對陣列進行判斷,大於0的記為1,小於等於0的記為0,得到一個 64bit 的數字指紋/簽名. if (v[i] >= 0) { fingerprint = fingerprint.add(new BigInteger("1").shiftLeft(i)); simHashBuffer.append("1"); } else { simHashBuffer.append("0"); } } this.strSimHash = simHashBuffer.toString(); this.strSimHashA = simHashBuffer.substring(0,16); this.strSimHashB = simHashBuffer.substring(16,32); this.strSimHashC = simHashBuffer.substring(32,48); this.strSimHashD = simHashBuffer.substring(48,64); this.intSimHash = fingerprint; List<String> simHashList = new ArrayList<String>(); simHashList.add(this.getStrSimHashA()); simHashList.add(this.getStrSimHashB()); simHashList.add(this.getStrSimHashC()); simHashList.add(this.getStrSimHashD()); return simHashList; } private BigInteger hash(String source) { if (source == null || source.length() == 0) { return new BigInteger("0"); } else { /** * 當sourece 的長度過短,會導致hash演算法失效,因此需要對過短的詞補償 */ while (source.length()<3) { source = source+source.charAt(0); } char[] sourceArray = source.toCharArray(); BigInteger x = BigInteger.valueOf(((long) sourceArray[0]) << 7); BigInteger m = new BigInteger("1000003"); BigInteger mask = new BigInteger("2").pow(this.hashbits).subtract(new BigInteger("1")); for (char item : sourceArray) { BigInteger temp = BigInteger.valueOf((long) item); x = x.multiply(m).xor(temp).and(mask); } x = x.xor(new BigInteger(String.valueOf(source.length()))); if (x.equals(new BigInteger("-1"))) { x = new BigInteger("-2"); } return x; } } // 用於計算十進位制的hamming距離 public int hammingDistance(SimhashAlgoService other) { BigInteger x = this.intSimHash.xor(other.intSimHash); int tot = 0; // 統計x中二進位制位數為1的個數 // 
我們想想,一個二進位制數減去1,那麼,從最後那個1(包括那個1)後面的數字全都反了,對吧,然後,n&(n-1)就相當於把後面的數字清0, // 我們看n能做多少次這樣的操作就OK了。 while (x.signum() != 0) { tot += 1; x = x.and(x.subtract(new BigInteger("1"))); } return tot; } // 用於計算二進位制的hamming距離 public int getDistance(String str1, String str2) { int distance; if (str1.length() != str2.length()) { distance = -1; } else { distance = 0; for (int i = 0; i < str1.length(); i++) { if (str1.charAt(i) != str2.charAt(i)) { distance++; } } } return distance; } public List subByDistance(SimhashAlgoService Simhash, int distance) { // 分成幾組來檢查 int numEach = this.hashbits / (distance + 1); List characters = new ArrayList(); StringBuffer buffer = new StringBuffer(); int k = 0; for (int i = 0; i < this.intSimHash.bitLength(); i++) { // 當且僅當設定了指定的位時,返回 true boolean sr = Simhash.intSimHash.testBit(i); if (sr) { buffer.append("1"); } else { buffer.append("0"); } if ((i + 1) % numEach == 0) { // 將二進位制轉為BigInteger BigInteger eachValue = new BigInteger(buffer.toString(), 2); System.out.println("----" + eachValue); buffer.delete(0, buffer.length()); characters.add(eachValue); } } return characters; } // 過濾無關內容 private String cleanResume(String content) { String[] tobeReplace = { "\n","\r","\t","\\n","\\r","\\t" }; for (String s:tobeReplace) { content = content.replace(s,""); } return content; } }
pom檔案依賴:
<!-- Third-party libraries imported by SimhashAlgoService. -->
<dependencies>
<!-- HanLP: provides the StandardTokenizer and Term classes used for word segmentation. -->
<dependency>
<groupId>com.hankcs</groupId>
<artifactId>hanlp</artifactId>
<version>portable-1.3.4</version>
</dependency>
<!-- Apache Commons Lang: provides StringUtils, used to lower-case the input text. -->
<!-- https://mvnrepository.com/artifact/org.apache.commons/commons-lang3 -->
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-lang3</artifactId>
<version>3.4</version>
</dependency>
<!-- jsoup: provides Jsoup.clean and Whitelist, used to strip HTML tags from the input. -->
<!-- https://mvnrepository.com/artifact/org.jsoup/jsoup -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.10.3</version>
</dependency>
</dependencies>