using System;
using System.Collections.Generic;
using System.Linq;
using System.Numerics;
using System.Text;

namespace chx
{
    /// <summary>
    /// Computes a SimHash fingerprint for a string. The string is split into
    /// single-character tokens by <see cref="ChxTokenizer"/>, every token is hashed,
    /// and each bit of the fingerprint is decided by a vote across all token hashes.
    /// </summary>
    public class SimHash
    {
        // Multiplier used by the per-token hash.
        private static readonly BigInteger Multiplier = new BigInteger(1000003);

        private readonly String tokens;
        private readonly BigInteger strSimHash;
        private readonly int hashbits = 128;

        /// <summary>
        /// The fingerprint computed at construction time. It is a non-negative value
        /// that uses at most <c>hashbits</c> bits.
        /// </summary>
        public BigInteger StrSimHash
        {
            get { return strSimHash; }
        }

        /// <summary>
        /// Builds the fingerprint of <paramref name="tokens"/> using
        /// <paramref name="hashbits"/> bits.
        /// </summary>
        /// <param name="tokens">Text to fingerprint. Null is treated like an empty string.</param>
        /// <param name="hashbits">Number of bits in the fingerprint; must not be negative.</param>
        /// <exception cref="ArgumentOutOfRangeException">
        /// <paramref name="hashbits"/> is negative.
        /// </exception>
        public SimHash(String tokens, int hashbits)
        {
            if (hashbits < 0)
            {
                throw new ArgumentOutOfRangeException("hashbits", "hashbits must not be negative.");
            }

            this.tokens = tokens;
            this.hashbits = hashbits;
            this.strSimHash = ComputeSimHash();
        }

        /// <summary>
        /// Builds the fingerprint of <paramref name="tokens"/> using the default of 128 bits.
        /// </summary>
        /// <param name="tokens">Text to fingerprint. Null is treated like an empty string.</param>
        public SimHash(String tokens)
        {
            this.tokens = tokens;
            this.strSimHash = ComputeSimHash();
        }

        /// <summary>
        /// Counts the bit positions in which this fingerprint and
        /// <paramref name="other"/> differ. Only the lowest <c>hashbits</c> bits of
        /// this instance are compared.
        /// </summary>
        /// <param name="other">The fingerprint to compare against.</param>
        /// <returns>The number of differing bits.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="other"/> is null.</exception>
        public int HammingDistance(SimHash other)
        {
            if (other == null)
            {
                throw new ArgumentNullException("other");
            }

            BigInteger mask = (BigInteger.One << hashbits) - BigInteger.One;
            BigInteger diff = (strSimHash ^ other.strSimHash) & mask;

            int distance = 0;
            while (!diff.IsZero)
            {
                distance++;
                // diff & (diff - 1) clears the lowest set bit, so the loop runs
                // once per differing bit.
                diff &= diff - BigInteger.One;
            }

            return distance;
        }

        // Tallies one vote per token for every bit position and sets each
        // fingerprint bit whose tally is not negative.
        private BigInteger ComputeSimHash()
        {
            int[] votes = new int[hashbits];

            // The mask is the same for every token, so build it once.
            BigInteger mask = (BigInteger.One << hashbits) - BigInteger.One;

            ChxTokenizer tokenizer = new ChxTokenizer(tokens);
            while (tokenizer.hasMoreTokens())
            {
                BigInteger tokenHash = HashToken(tokenizer.nextToken(), mask);
                for (int i = 0; i < hashbits; i++)
                {
                    if ((tokenHash & (BigInteger.One << i)).IsZero)
                    {
                        votes[i] -= 1;
                    }
                    else
                    {
                        votes[i] += 1;
                    }
                }
            }

            BigInteger fingerprint = BigInteger.Zero;
            for (int i = 0; i < hashbits; i++)
            {
                // A tie (including the no-token case, where every tally is 0) sets the bit.
                if (votes[i] >= 0)
                {
                    fingerprint |= BigInteger.One << i;
                }
            }

            return fingerprint;
        }

        // Hashes a single token: seed from the first character, then a
        // multiply-and-xor step per character truncated by mask, and finally
        // an xor with the token length. Null or empty input hashes to zero.
        private static BigInteger HashToken(string source, BigInteger mask)
        {
            if (string.IsNullOrEmpty(source))
            {
                return BigInteger.Zero;
            }

            BigInteger x = new BigInteger(((long)source[0]) << 7);
            foreach (char item in source)
            {
                x = ((x * Multiplier) ^ new BigInteger((long)item)) & mask;
            }

            // x is non-negative after masking and Length is non-negative, so the
            // result can never be -1; no remapping of that value is required.
            return x ^ new BigInteger(source.Length);
        }
    }

    // A simple tokenizer that splits the text directly into single characters
    // (one UTF-16 char per token). It can be replaced by another tokenizer.
    public class ChxTokenizer
    {
        private readonly string source;
        private readonly int length;
        private int index;

        /// <summary>
        /// Creates a tokenizer over <paramref name="source"/>. Null is treated as
        /// an empty string.
        /// </summary>
        public ChxTokenizer(string source)
        {
            this.source = source ?? "";
            this.index = 0;
            this.length = this.source.Length;
        }

        /// <summary>Returns true while there are characters left to read.</summary>
        public bool hasMoreTokens()
        {
            return index < length;
        }

        /// <summary>
        /// Returns the next one-character token and advances the position.
        /// </summary>
        /// <exception cref="ArgumentOutOfRangeException">
        /// Raised by <c>Substring</c> when no characters are left.
        /// </exception>
        public string nextToken()
        {
            String s = source.Substring(index, 1);
            index++;
            return s;
        }
    }
}


using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;

namespace chx
{
    /// <summary>
    /// Console entry point with a small demonstration of <see cref="SimHash"/>.
    /// </summary>
    class Program
    {
        static void Main(string[] args)
        {
            // NOTE(review): Main has no statements in the source, so Test() is not
            // invoked from here - confirm whether that is intended.
        }

        /// <summary>
        /// Computes the SimHash of three sample sentences and prints each fingerprint.
        /// </summary>
        private static void Test()
        {
            var s1 = "中文分詞太麻煩了,也有些中文分片語件也不錯";
            var hash1 = new SimHash(s1);
            Console.WriteLine("S1.simhash: {0}", hash1.StrSimHash);

            var s2 = "有些中文分詞太麻煩了,也有些中文分片語件也不錯";
            var hash2 = new SimHash(s2);
            // Print the fingerprint of s2 (previously hash1 was printed again).
            Console.WriteLine("S2.simhash: {0}", hash2.StrSimHash);

            var s3 = "有些中文分詞太麻煩了";
            var hash3 = new SimHash(s3);
            // Print the fingerprint of s3 (previously hash1 was printed again).
            Console.WriteLine("S3.simhash: {0}", hash3.StrSimHash);
        }
    }
}



