-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathcountmin_test.go
117 lines (93 loc) · 2.87 KB
/
countmin_test.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
package disttopk
import "testing"
import (
"fmt"
"math"
)
/*
func TestCountMinHashSerialize(t *testing.T) {
hash := NewCountMinHash(2, 2)
b, err := GobBytesEncode(hash)
if err != nil {
panic(err)
}
if len(b) != hash.ByteSize() {
t.Error("len is", len(b), "but bytesize is", hash.ByteSize())
}
var obj CountMinHash
err = GobBytesDecode(&obj, b)
if err != nil {
panic(err)
}
if obj.Columns != hash.Columns || obj.Hashes != hash.Hashes {
t.Fail()
}
if len(obj.hasha) != hash.Hashes {
t.Fail()
}
}*/
func TestCountMinHashComp(t *testing.T) {
err_sum := 0.0
for _, eps := range []float64{0.1, 0.01, 0.001, 0.0001} {
for _, n := range []int{1, 10, 100, 1000} {
serialized, compressed, valueBits := RunCountMinHashComp(eps, n)
expected := GetExpected(eps, n, valueBits)
err := float64(expected-(compressed-11)) / float64(expected)
err_sum += err
t.Log(eps, n, "Serialized", serialized, "Compressed", compressed, "Compressed No Overhead", compressed-11, "Expected", expected, "diff", float64(expected-(compressed-11))/float64(expected))
}
}
}
func GetExpected(eps float64, n int, valueBits uint8) int {
//overhead 8 in count min, 9 in count array
hash_array := GetExpectedHashArray(eps, n) + 8 + 9
values := GetExpectedValuesEncoding(n, valueBits)
//println("Bytes", hash_array, values, math.Log2(1.0/eps))
return int(hash_array + values)
}
func GetExpectedHashArray(eps float64, n int) int {
// for k=opt : m = n * 1.44 * log_2(1/eps) = n * 1.44 * 1/ln(2) * ln (1/eps)
return int(math.Ceil((float64(n) * 1.44 * 0.7 * math.Log2(1.0/eps)) / 8.0))
}
func GetExpectedValuesEncoding(n int, valueBits uint8) int {
return int(math.Ceil((float64(n) * float64(valueBits)) / 8))
}
func RunCountMinHashComp(eps float64, n int) (int, int, uint8) {
src := NewSimpleZipfSource(uint32(n), 0.7, 1)
// for k=1 : m ~ n/eps
hash := NewCountMinSketch(1, int(float64(n)/float64(eps)))
list := src.GetList()
for k, item := range list {
score := uint32(item.Score)
//println("score", score, eps, n)
if score == 0 {
panic(fmt.Sprint(score, n, eps, k))
}
// println("score", uint32(item.Score))
hash.Add(IntKeyToByteKey(item.Id), score)
//hash.Add(IntKeyToByteKey(item.Id), 8)
}
//println("Here", eps, n)
ser, err := SerializeObject(hash)
if err != nil {
panic(err)
}
comp := CompressBytes(ser)
return len(ser), len(comp), hash.GetValueBits(0)
}
func RunCountMinHashCompHashArrayOnly(eps float64, n int) (int, int, uint8) {
src := NewSimpleZipfSource(uint32(n), 0.7, 1)
// for k=1 : m ~ n/eps
hash := NewCountMinSketch(1, int(float64(n)/float64(eps)))
list := src.GetList()
for _, item := range list {
hash.Add(IntKeyToByteKey(item.Id), uint32(item.Score))
//hash.Add(IntKeyToByteKey(item.Id), 8)
}
ser, err := SerializeObject(hash)
if err != nil {
panic(err)
}
comp := CompressBytes(ser)
return len(ser), len(comp), hash.GetValueBits(0)
}