simhash短文字去重

阿新 • • 發佈：2018-12-08

SimHash 的 Java 實現：

import com.hankcs.hanlp.seg.common.Term;
import com.hankcs.hanlp.tokenizer.StandardTokenizer;

import java.math.BigInteger;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.safety.Whitelist;

/**
 * Computes SimHash fingerprints for text and offers helpers for comparing them.
 *
 * <p>Input text is stripped of HTML tags and whitespace, lower-cased, segmented with HanLP's
 * {@code StandardTokenizer}, and every retained word votes on each bit of the fingerprint with a
 * weight that depends on its part of speech.
 *
 * <p>Instances are stateful: {@link #simHash(String, int)} writes its results to instance fields
 * (read back by {@link #getIntSimHash()} and {@link #getStrSimHash()}) without any
 * synchronization.
 *
 * Created by Yangyang Deng on 17-9-7.
 */

public class SimhashAlgoService {

    /** Number of segments the binary fingerprint string is split into. */
    private static final int SEGMENT_COUNT = 4;

    /** Multiplier applied at every step of the per-word hash. */
    private static final BigInteger HASH_MULTIPLIER = BigInteger.valueOf(1000003L);

    /** Substrings removed by {@link #preProcess(String)}; the backslash forms are escaped text. */
    private static final String[] PRE_PROCESS_NOISE = {" ", "\n", "\\r", "\\n", "\\t", "&nbsp;"};

    /** Substrings removed by {@link #cleanResume(String)}; real and escaped control characters. */
    private static final String[] RESUME_NOISE = {"\n", "\r", "\t", "\\n", "\\r", "\\t"};

    /**
     * Demo entry point: fingerprints a sample paragraph with 64 bits and prints the four
     * fingerprint segments.
     */
    public static void main(String[] args) {
        SimhashAlgoService simhashAlgoService = new SimhashAlgoService();
        String sample = "勞斯萊斯女神\n" +
                "\n" +
                "這個車標的設計者是英國畫家兼雕刻家查爾斯·賽克斯。20世紀初，經朋友蒙塔古邀請，賽克斯負責為勞斯萊斯設計一尊雕塑車標。當時，已婚的蒙塔古瘋狂地愛著他的女祕書桑頓，懇請賽克斯以桑頓為原型設計車標。所以，賽克斯的最初設計中，雕像是一尊披著長袍的女人將手指放在嘴脣上，象徵著蒙塔古與桑頓之間不能說的祕密情史。這個戀愛故事歷經重重磨難，桑頓身份地位曾是脫衣舞女郎，所以兩人根本無法在一起生活，在得到家庭與蒙塔古妻子的諒解後，兩人最終可以走到一起，不幸的是，後來桑頓在一次乘船旅行中不幸遭遇德軍水雷，永遠沉入了冰冷的大海。\n" +
                "\n" +
                "後來，他們這段美好的愛情又略帶悽慘故事就保留在了這個車標上，羅 -羅二人也是蒙塔古的好友，他們得知這件事之後非常感動。後來，他們邀請賽克斯又把它改為雙手如羽翼般向後伸展的形象，也就是今天的“飛天女神”。 1911年，它正式成為勞斯萊斯車的車標。從此，勞斯萊斯的飛天女神車標更是美麗的愛情象徵了!";
        // The returned fingerprint is already split into 4 segments so that two fingerprints
        // can be compared segment by segment.
        List<String> fingerPrints = simhashAlgoService.simHash(sample, 64);
        System.out.println(fingerPrints);
    }


    // Not referenced anywhere in this class; segmentation goes through the static
    // StandardTokenizer.segment call instead.
    private StandardTokenizer hanlpService;

    // Cleaned text that is handed to the tokenizer.
    private String tokens;

    // Fingerprint as a non-negative integer (bit i of the number is fingerprint bit i).
    private BigInteger intSimHash;

    // Fingerprint as a string of '0'/'1' characters; character i is fingerprint bit i,
    // i.e. the least significant bit comes first.
    private String strSimHash;

    // The four consecutive segments of strSimHash.
    private String strSimHashA;
    private String strSimHashB;
    private String strSimHashC;
    private String strSimHashD;

    // Occurrences recorded per word during the current simHash run.
    private Map<String,Integer> wordCount;

    // A word is skipped once its recorded count exceeds this value, so each distinct word
    // contributes at most overCount + 1 times.
    private int overCount = 5;

    /** Returns the integer fingerprint from the last {@code simHash} call, or null before it. */
    public BigInteger getIntSimHash(){
        return this.intSimHash;
    }

    /** Returns the '0'/'1' fingerprint string from the last {@code simHash} call, or null. */
    public String getStrSimHash() {
        return this.strSimHash;
    }

    private String getStrSimHashA() {
        return this.strSimHashA;
    }

    private String getStrSimHashB() {
        return this.strSimHashB;
    }

    private String getStrSimHashC() {
        return this.strSimHashC;
    }

    private String getStrSimHashD() {
        return this.strSimHashD;
    }

    // Fingerprint length in bits.
    private int hashbits = 64;

    // Part-of-speech tags whose words are ignored (only the keys are used).
    private Map<String,String> stopNatures = new HashMap<String, String>();

    // Per part-of-speech vote weight; tags that are absent weigh 1.
    private Map<String, Integer> weightOfNature = new HashMap<String, Integer>();


    /** Stores the text field; note that {@code simHash} overwrites it with its own input. */
    public void setTokens(String tokens) {
        this.tokens = tokens;
    }

    /** Stores the fingerprint length; note that {@code simHash} overwrites it with its argument. */
    public void setHashbits(int hashbits) {
        this.hashbits = hashbits;
    }

    /** Registers the stop tag "w" (punctuation) and gives tag "n" (noun) a weight of 2. */
    private void setMap() {
        this.stopNatures.put("w","");
        this.weightOfNature.put("n",2);
    }

    /**
     * Strips all HTML tags (jsoup with an empty whitelist), lower-cases the text and removes
     * every substring listed in {@code PRE_PROCESS_NOISE}, in that order.
     */
    private String preProcess(String content) {
        String cleaned = StringUtils.lowerCase(Jsoup.clean(content, Whitelist.none()));
        for (String noise : PRE_PROCESS_NOISE) {
            cleaned = cleaned.replace(noise, "");
        }
        return cleaned;
    }

    /**
     * Computes the SimHash fingerprint of {@code tokens} and stores it on this instance.
     *
     * @param tokens   raw text, optionally HTML
     * @param hashbits fingerprint length in bits; must be positive
     * @return the '0'/'1' fingerprint string split into four consecutive segments of
     *         {@code hashbits / 4} characters each, the last one also taking any remainder
     *         (for 64 bits: four 16-character segments)
     * @throws IllegalArgumentException if {@code hashbits} is not positive
     */
    public List<String> simHash(String tokens, int hashbits) {
        if (hashbits <= 0) {
            throw new IllegalArgumentException("hashbits must be positive, got " + hashbits);
        }
        this.tokens = cleanResume(preProcess(tokens));
        this.hashbits = hashbits;
        this.wordCount = new HashMap<String, Integer>();
        setMap();

        // 1. Segment the cleaned text and let every retained word vote on every bit.
        int[] votes = accumulateBitVotes(StandardTokenizer.segment(this.tokens));

        // 2. Collapse the votes into the fingerprint.
        // NOTE(review): the original comment described "greater than 0 -> 1, otherwise 0", but
        // the code has always mapped a tie (0) to 1; that is kept so existing fingerprints
        // stay comparable.
        BigInteger fingerprint = BigInteger.ZERO;
        StringBuilder bits = new StringBuilder(this.hashbits);
        for (int i = 0; i < this.hashbits; i++) {
            if (votes[i] >= 0) {
                fingerprint = fingerprint.setBit(i);
                bits.append('1');
            } else {
                bits.append('0');
            }
        }

        // 3. Publish the results. Segment boundaries follow hashbits instead of assuming 64.
        int segmentLength = this.hashbits / SEGMENT_COUNT;
        this.strSimHash = bits.toString();
        this.strSimHashA = this.strSimHash.substring(0, segmentLength);
        this.strSimHashB = this.strSimHash.substring(segmentLength, 2 * segmentLength);
        this.strSimHashC = this.strSimHash.substring(2 * segmentLength, 3 * segmentLength);
        this.strSimHashD = this.strSimHash.substring(3 * segmentLength);
        this.intSimHash = fingerprint;

        List<String> simHashList = new ArrayList<String>(SEGMENT_COUNT);
        simHashList.add(this.getStrSimHashA());
        simHashList.add(this.getStrSimHashB());
        simHashList.add(this.getStrSimHashC());
        simHashList.add(this.getStrSimHashD());
        return simHashList;
    }

    /**
     * Sums the weighted per-bit votes of all retained terms: a set bit in a word's hash adds the
     * word's weight to that position, a clear bit subtracts it.
     */
    private int[] accumulateBitVotes(List<Term> terms) {
        int[] votes = new int[this.hashbits];
        for (Term term : terms) {
            String word = term.word;
            String nature = term.nature.toString();

            // Cap over-frequent words. The count is recorded before the stop-tag filter, so
            // stop-tagged words are counted as well.
            Integer seen = this.wordCount.get(word);
            if (seen == null) {
                this.wordCount.put(word, 1);
            } else if (seen > this.overCount) {
                continue;
            } else {
                this.wordCount.put(word, seen + 1);
            }

            if (this.stopNatures.containsKey(nature)) {
                continue;
            }

            Integer configuredWeight = this.weightOfNature.get(nature);
            int weight = configuredWeight == null ? 1 : configuredWeight;
            BigInteger wordHash = this.hash(word);
            for (int i = 0; i < this.hashbits; i++) {
                votes[i] += wordHash.testBit(i) ? weight : -weight;
            }
        }
        return votes;
    }



    /**
     * Hashes one word to a non-negative integer.
     *
     * <p>Words shorter than three characters are padded by repeating their first character,
     * because very short inputs otherwise hash poorly. The running value is multiplied by
     * 1000003, XORed with each character and masked to {@code hashbits} bits; finally it is
     * XORed with the padded length.
     *
     * @return zero for a null or empty word, otherwise the hash described above
     */
    private BigInteger hash(String source) {
        if (source == null || source.length() == 0) {
            return BigInteger.ZERO;
        }

        StringBuilder padded = new StringBuilder(source);
        while (padded.length() < 3) {
            padded.append(source.charAt(0));
        }
        String word = padded.toString();

        BigInteger mask = BigInteger.ONE.shiftLeft(this.hashbits).subtract(BigInteger.ONE);
        BigInteger x = BigInteger.valueOf(((long) word.charAt(0)) << 7);
        for (int i = 0; i < word.length(); i++) {
            x = x.multiply(HASH_MULTIPLIER).xor(BigInteger.valueOf(word.charAt(i))).and(mask);
        }
        // x is masked (non-negative) and the length is positive, so the result can never be
        // -1; the original special case for -1 was unreachable and has been dropped.
        return x.xor(BigInteger.valueOf(word.length()));
    }

    /**
     * Hamming distance between the integer fingerprints of this instance and {@code other}.
     *
     * <p>Both fingerprints are built by setting bits on zero, so they and their XOR are
     * non-negative and {@code bitCount()} is exactly the number of differing bits.
     * {@code simHash} must have been called on both instances; before that the stored
     * fingerprint is null.
     */
    public int hammingDistance(SimhashAlgoService other) {
        return this.intSimHash.xor(other.intSimHash).bitCount();
    }


    /**
     * Hamming distance between two binary fingerprint strings.
     *
     * @return the number of positions at which the strings differ, or -1 when their lengths
     *         differ
     */
    public int getDistance(String str1, String str2) {
        if (str1.length() != str2.length()) {
            return -1;
        }
        int distance = 0;
        for (int i = 0; i < str1.length(); i++) {
            if (str1.charAt(i) != str2.charAt(i)) {
                distance++;
            }
        }
        return distance;
    }

    /**
     * Splits the fingerprint of {@code other} into groups of {@code hashbits / (distance + 1)}
     * bits, using this instance's {@code hashbits}.
     *
     * <p>Bits are read from bit 0 upwards and appended to a buffer that is then parsed as a
     * binary number, so within each group the lowest fingerprint bit becomes the most
     * significant digit. Bits left over after the last full group are not returned.
     *
     * @param other    instance whose integer fingerprint is split
     * @param distance Hamming distance the grouping is meant for; {@code 0 <= distance < hashbits}
     * @return one value per full group, in ascending bit order
     * @throws IllegalArgumentException if {@code distance} is outside the allowed range
     */
    public List<BigInteger> subByDistance(SimhashAlgoService other, int distance) {
        if (distance < 0 || distance >= this.hashbits) {
            throw new IllegalArgumentException(
                    "distance must be in [0, " + this.hashbits + "), got " + distance);
        }
        int numEach = this.hashbits / (distance + 1);
        List<BigInteger> characters = new ArrayList<BigInteger>();
        StringBuilder buffer = new StringBuilder(numEach);

        // Iterate over the full fingerprint width; relying on bitLength() would drop groups
        // whenever the high-order bits happen to be zero.
        for (int i = 0; i < this.hashbits; i++) {
            buffer.append(other.intSimHash.testBit(i) ? '1' : '0');
            if ((i + 1) % numEach == 0) {
                characters.add(new BigInteger(buffer.toString(), 2));
                buffer.setLength(0);
            }
        }

        return characters;
    }

    /** Removes every substring listed in {@code RESUME_NOISE} from {@code content}. */
    private String cleanResume(String content) {
        String cleaned = content;
        for (String noise : RESUME_NOISE) {
            cleaned = cleaned.replace(noise, "");
        }
        return cleaned;
    }



}

pom.xml 檔案依賴：

<dependencies>
    <!-- HanLP: provides com.hankcs.hanlp.tokenizer.StandardTokenizer and Term -->
    <dependency>
        <groupId>com.hankcs</groupId>
        <artifactId>hanlp</artifactId>
        <version>portable-1.3.4</version>
    </dependency>

    <!-- Apache Commons Lang: provides org.apache.commons.lang3.StringUtils -->
    <dependency>
        <groupId>org.apache.commons</groupId>
        <artifactId>commons-lang3</artifactId>
        <version>3.4</version>
    </dependency>

    <!-- jsoup: provides org.jsoup.Jsoup and org.jsoup.safety.Whitelist -->
    <dependency>
        <groupId>org.jsoup</groupId>
        <artifactId>jsoup</artifactId>
        <version>1.10.3</version>
    </dependency>
</dependencies>

simhash短文字去重

simhash短文字去重

淺談基於simhash的文字去重原理

字符串去重

django的orm獲取字段去重值

使用SimHash進行海量文本去重[轉載]

新華三：字符串不分大小寫去重

怎樣根據某個字段去重，取得單據內碼

字符串、數組去重

使用SimHash進行海量文本去重[轉]

[Algorithm] 使用SimHash進行海量文字去重

火眼金睛演算法，教你海量短文字場景下去重

陣列中巢狀物件，根據物件的某個字對物件進行去重

面試|海量文字去重~simhash

文件去重演算法：SimHash和MinHash

關於SimHash去重原理的理解（能力工場小馬哥）

java-ArrayList中去重復字符串或重復對象、LinkedList集合、泛型、增強for、靜態導入、可變參數、asList()方法、集合嵌套

Scrapy-redis增量爬取以及Simhash相似文件的去重

使用simhash演算法對網頁去重

海量資料去重之SimHash演算法簡介和應用

simhash與Google的網頁去重

simhash短文字去重

相關推薦