Skip to content
Snippets Groups Projects
Commit 2c3f9f24 authored by Kristoffer Ström's avatar Kristoffer Ström
Browse files

Add hamming distance calculation to bloom filters

parent 24063341
Branches
No related tags found
1 merge request!1New
......@@ -217,6 +217,11 @@
"ImportPath": "github.com/mtchavez/jenkins",
"Rev": "5a816af6ef21ef401bff5e4b7dd255d63400f497"
},
{
"ImportPath": "github.com/steakknife/hamming",
"Comment": "0.0.2-2-g9ad4a62",
"Rev": "9ad4a620e3d573267a083c892f2b42a39302153b"
},
{
"ImportPath": "github.com/syndtr/goleveldb/leveldb",
"Rev": "87e4e645d80ae9c537e8f2dee52b28036a5dd75e"
......
Copyright (c) 2014 Barry Allard
MIT license
package hamming
// SSE4.x PopCnt is 10x slower
// References: check out Hacker's Delight
// Masks and multipliers for counting the set bits of a uint64 several bit
// fields at a time (see CountBitsUint64).
// NOTE(review): m8, m16, m32 and hff are not referenced by the code visible
// in this file — confirm whether they are still needed.
const (
m1 uint64 = 0x5555555555555555 // binary: 0101... (low bit of every 2-bit pair)
m2 uint64 = 0x3333333333333333 // binary: 00110011... (low 2 bits of every 4-bit group)
m4 uint64 = 0x0f0f0f0f0f0f0f0f // binary: 4 zeros, 4 ones ... (low nibble of every byte)
m8 uint64 = 0x00ff00ff00ff00ff // binary: 8 zeros, 8 ones ...
m16 uint64 = 0x0000ffff0000ffff // binary: 16 zeros, 16 ones ...
m32 uint64 = 0x00000000ffffffff // binary: 32 zeros, 32 ones
hff uint64 = 0xffffffffffffffff // binary: all ones
h01 uint64 = 0x0101010101010101 // the sum of 256 to the power of 0,1,2,...,7; multiplying by it adds all eight bytes into the top byte
)
// table maps every byte value to the number of 1 bits it contains, i.e.
// table[b] is the population count of b.
var table = func() [256]byte {
	var counts [256]byte
	// A value has as many set bits as the value shifted right by one, plus
	// its own lowest bit. counts[0] is already the correct zero.
	for v := 1; v < len(counts); v++ {
		counts[v] = counts[v>>1] + byte(v&1)
	}
	return counts
}()
// Uint64 returns the Hamming distance between x and y: the number of bit
// positions in which the two values differ.
func Uint64(x, y uint64) int {
	// XOR leaves a 1 exactly where the operands disagree.
	differing := x ^ y
	return CountBitsUint64(differing)
}
// Byte returns the Hamming distance between x and y: the number of bit
// positions in which the two bytes differ.
func Byte(x, y byte) int {
	// XOR leaves a 1 exactly where the operands disagree.
	differing := x ^ y
	return CountBitsByte(differing)
}
// CountBitsUint64 returns the number of 1 bits in x.
//
// The count is built up in place: first per 2-bit pair, then per 4-bit
// group, then per byte, and finally the eight byte counts are added together.
func CountBitsUint64(x uint64) int {
	// Each 2-bit pair now holds the count of set bits it contained.
	pairs := x - ((x >> 1) & m1)
	// Each 4-bit group now holds the count of set bits it contained.
	nibbles := (pairs & m2) + ((pairs >> 2) & m2)
	// Each byte now holds the count of set bits it contained.
	octets := (nibbles + (nibbles >> 4)) & m4
	// Multiplying by h01 accumulates every byte into the top byte, which the
	// shift then moves down to the low end.
	return int((octets * h01) >> 56)
}
// CountBitsByte returns the number of 1 bits in x, looked up in the
// precomputed table.
func CountBitsByte(x byte) int {
	count := table[x]
	return int(count)
}
package hamming
import (
"testing"
)
// testCountBitsUint64Case pairs a uint64 input with the bit count expected
// from CountBitsUint64 for that input.
type testCountBitsUint64Case struct {
// x is the value passed to CountBitsUint64.
x uint64
// n is the expected number of 1 bits in x.
n int
}
// testCountBitsByteCase pairs a byte input with the bit count expected from
// CountBitsByte for that input.
type testCountBitsByteCase struct {
// x is the value passed to CountBitsByte.
x byte
// n is the expected number of 1 bits in x.
n int
}
// testCountBitsByteCases lists byte inputs together with their expected
// number of set bits; it drives both the CountBitsByte test and benchmark.
var testCountBitsByteCases = []testCountBitsByteCase{
	{x: 0x00, n: 0},
	{x: 0x01, n: 1},
	{x: 0x02, n: 1},
	{x: 0x03, n: 2},
	{x: 0xaa, n: 4},
	{x: 0x55, n: 4},
	{x: 0x7f, n: 7},
	{x: 0xff, n: 8},
}
// testCountBitsUint64Cases lists uint64 inputs together with their expected
// number of set bits; it drives both the CountBitsUint64 test and benchmark.
var testCountBitsUint64Cases = []testCountBitsUint64Case{
	{x: 0x00, n: 0},
	{x: 0x01, n: 1},
	{x: 0x02, n: 1},
	{x: 0x03, n: 2},
	{x: 0xaa, n: 4},
	{x: 0x55, n: 4},
	{x: 0x7f, n: 7},
	{x: 0xff, n: 8},
	{x: 0xffff, n: 16},
	{x: 0xffffffff, n: 32},
	{x: 0x1ffffffff, n: 33},
	{x: 0x3ffffffff, n: 34},
	{x: 0x7ffffffff, n: 35},
	{x: 0xfffffffff, n: 36},
	{x: 0x3fffffffffffffff, n: 62},
	{x: 0x7fffffffffffffff, n: 63},
	{x: 0xffffffffffffffff, n: 64},
}
// TestCountBitByte checks CountBitsByte against every entry of
// testCountBitsByteCases, aborting at the first mismatch.
func TestCountBitByte(t *testing.T) {
	for _, tc := range testCountBitsByteCases {
		got := CountBitsByte(tc.x)
		if got != tc.n {
			t.Fatal("CountBitsByte(", tc.x, ") = ", got, " != ", tc.n)
		}
		// t.Fatal does not return, so reaching this line means the case passed.
		t.Log("CountBitsByte(", tc.x, ") == ", tc.n)
	}
}
// TestCountBitUint64 checks CountBitsUint64 against every entry of
// testCountBitsUint64Cases, aborting at the first mismatch.
func TestCountBitUint64(t *testing.T) {
	for _, tc := range testCountBitsUint64Cases {
		got := CountBitsUint64(tc.x)
		if got != tc.n {
			t.Fatal("CountBitsUint64(", tc.x, ") = ", got, " != ", tc.n)
		}
		// t.Fatal does not return, so reaching this line means the case passed.
		t.Log("CountBitsUint64(", tc.x, ") == ", tc.n)
	}
}
// benchSinkCountBitsUint64 receives the accumulated benchmark result so the
// compiler cannot discard the CountBitsUint64 calls as dead code.
var benchSinkCountBitsUint64 int

// BenchmarkCountBitsUint64 measures CountBitsUint64 while cycling through the
// inputs of testCountBitsUint64Cases.
func BenchmarkCountBitsUint64(b *testing.B) {
	sum := 0
	j := 0
	for i := 0; i < b.N; i++ {
		// Use the result: a discarded call to a small pure function may be
		// inlined and eliminated, leaving nothing to measure.
		sum += CountBitsUint64(testCountBitsUint64Cases[j].x)
		j++
		if j == len(testCountBitsUint64Cases) {
			j = 0
		}
	}
	benchSinkCountBitsUint64 = sum
}
// benchSinkCountBitsByte receives the accumulated benchmark result so the
// compiler cannot discard the CountBitsByte calls as dead code.
var benchSinkCountBitsByte int

// BenchmarkCountBitsByte measures CountBitsByte while cycling through the
// inputs of testCountBitsByteCases.
func BenchmarkCountBitsByte(b *testing.B) {
	sum := 0
	j := 0
	for i := 0; i < b.N; i++ {
		// Use the result: a discarded call to a small pure function may be
		// inlined and eliminated, leaving nothing to measure.
		sum += CountBitsByte(testCountBitsByteCases[j].x)
		j++
		if j == len(testCountBitsByteCases) {
			j = 0
		}
	}
	benchSinkCountBitsByte = sum
}
......@@ -6,6 +6,7 @@ import (
"errors"
// Non crypto hash, because speed
"github.com/ipfs/go-ipfs/Godeps/_workspace/src/github.com/mtchavez/jenkins"
"github.com/ipfs/go-ipfs/Godeps/_workspace/src/github.com/steakknife/hamming"
"hash"
)
......@@ -13,6 +14,7 @@ type Filter interface {
Add([]byte)
Find([]byte) bool
Merge(Filter) (Filter, error)
HammingDistance(Filter) (int, error)
}
func NewFilter(size int) Filter {
......@@ -100,3 +102,23 @@ func (f *filter) Merge(o Filter) (Filter, error) {
return nfilt, nil
}
// HammingDistance returns the number of bit positions in which the data of f
// and o differ.
//
// o must be a *filter whose filter data has the same length as f's; in any
// other case the distance is reported as 0 together with an error.
func (f *filter) HammingDistance(o Filter) (int, error) {
	other, isFilter := o.(*filter)
	if !isFilter {
		return 0, errors.New("Unsupported filter type")
	}

	if len(f.filter) != len(other.filter) {
		return 0, errors.New("filter lengths must match!")
	}

	// Sum the per-byte distances over the whole filter.
	distance := 0
	for i, b := range f.filter {
		distance += hamming.Byte(b, other.filter[i])
	}

	return distance, nil
}
......@@ -78,3 +78,17 @@ func TestMerge(t *testing.T) {
}
}
}
// TestHamming adds two entries to one of two equally sized filters and checks
// the Hamming distance reported between the populated filter and the empty one.
func TestHamming(t *testing.T) {
	f1 := NewFilter(128)
	f2 := NewFilter(128)

	f1.Add([]byte("no collision"))
	f1.Add([]byte("collision? no!"))

	// Check the error explicitly: on failure HammingDistance returns 0, which
	// would otherwise surface only as a misleading distance mismatch.
	dist, err := f1.HammingDistance(f2)
	if err != nil {
		t.Fatalf("HammingDistance returned an unexpected error: %v", err)
	}

	if dist != 6 {
		t.Fatalf("Should have 6 bit difference, got %d", dist)
	}
}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment