@@ -3,17 +3,85 @@ package lshensemble
33import "errors"
44
// errDomainSizeOrder is returned by the bootstrap helpers when a domain
// record arrives with a size smaller than that of the record before it.
var errDomainSizeOrder = errors.New("Domain records must be sorted in ascending order of size")
88
9- func bootstrap (index * LshEnsemble , totalNumDomains int , sortedDomains <- chan * DomainRecord ) error {
9+ func bootstrapOptimalPartitions (domains <- chan * DomainRecord , numPart int ) []Partition {
10+ sizes , counts := computeSizeDistribution (domains )
11+ partitions := optimalPartitions (sizes , counts , numPart )
12+ return partitions
13+ }
14+
15+ func bootstrapOptimal (index * LshEnsemble , sortedDomains <- chan * DomainRecord ) error {
16+ var currPart int
17+ var currSize int
18+ for rec := range sortedDomains {
19+ if currSize > rec .Size {
20+ return errDomainSizeOrder
21+ }
22+ currSize = rec .Size
23+ if currSize > index .Partitions [currPart ].Upper {
24+ currPart ++
25+ }
26+ if currPart >= len (index .Partitions ) ||
27+ ! (index .Partitions [currPart ].Lower <= currSize &&
28+ currSize <= index .Partitions [currPart ].Upper ) {
29+ return errors .New ("Domain records does not match the existing partitions" )
30+ }
31+ index .Add (rec .Key , rec .Signature , currPart )
32+ }
33+ index .Index ()
34+ return nil
35+ }
36+
37+ // BootstrapLshEnsembleOptimal builds an index from domains using optimal
38+ // partitioning.
39+ // The returned index consists of MinHash LSH implemented using LshForest.
40+ // numPart is the number of partitions to create.
41+ // numHash is the number of hash functions in MinHash.
42+ // maxK is the maximum value for the MinHash parameter K - the number of hash
43+ // functions per "band".
44+ // sortedDomainFactory is factory function that returns a DomainRecord channel
45+ // emitting domains in sorted order by their sizes.
46+ func BootstrapLshEnsembleOptimal (numPart , numHash , maxK int ,
47+ sortedDomainFactory func () <- chan * DomainRecord ) (* LshEnsemble , error ) {
48+ partitions := bootstrapOptimalPartitions (sortedDomainFactory (), numPart )
49+ index := NewLshEnsemble (partitions , numHash , maxK )
50+ err := bootstrapOptimal (index , sortedDomainFactory ())
51+ if err != nil {
52+ return nil , err
53+ }
54+ return index , nil
55+ }
56+
57+ // BootstrapLshEnsemblePlusOptimal builds an index from domains using optimal
58+ // partitioning.
59+ // The returned index consists of MinHash LSH implemented using LshForestArray.
60+ // numPart is the number of partitions to create.
61+ // numHash is the number of hash functions in MinHash.
62+ // maxK is the maximum value for the MinHash parameter K - the number of hash
63+ // functions per "band".
64+ // sortedDomainFactory is factory function that returns a DomainRecord channel
65+ // emitting domains in sorted order by their sizes.
66+ func BootstrapLshEnsemblePlusOptimal (numPart , numHash , maxK int ,
67+ sortedDomainFactory func () <- chan * DomainRecord ) (* LshEnsemble , error ) {
68+ partitions := bootstrapOptimalPartitions (sortedDomainFactory (), numPart )
69+ index := NewLshEnsemblePlus (partitions , numHash , maxK )
70+ err := bootstrapOptimal (index , sortedDomainFactory ())
71+ if err != nil {
72+ return nil , err
73+ }
74+ return index , nil
75+ }
76+
77+ func bootstrapEquiDepth (index * LshEnsemble , totalNumDomains int , sortedDomains <- chan * DomainRecord ) error {
1078 numPart := len (index .Partitions )
1179 depth := totalNumDomains / numPart
1280 var currDepth , currPart int
1381 var currSize int
1482 for rec := range sortedDomains {
1583 if currSize > rec .Size {
16- return ErrDomainSizeOrder
84+ return errDomainSizeOrder
1785 }
1886 currSize = rec .Size
1987 index .Add (rec .Key , rec .Signature , currPart )
@@ -29,30 +97,36 @@ func bootstrap(index *LshEnsemble, totalNumDomains int, sortedDomains <-chan *Do
2997 return nil
3098}
3199
32- // BoostrapLshEnsemble builds an index from a channel of domains.
100+ // BootstrapLshEnsembleEquiDepth builds an index from a channel of domains
101+ // using equi-depth partitions -- partitions have approximately the same
102+ // number of domains.
33103// The returned index consists of MinHash LSH implemented using LshForest.
34104// numPart is the number of partitions to create.
35105// numHash is the number of hash functions in MinHash.
36106// maxK is the maximum value for the MinHash parameter K - the number of hash functions per "band".
37107// sortedDomains is a DomainRecord channel emitting domains in sorted order by their sizes.
38- func BootstrapLshEnsemble (numPart , numHash , maxK , totalNumDomains int , sortedDomains <- chan * DomainRecord ) (* LshEnsemble , error ) {
108+ func BootstrapLshEnsembleEquiDepth (numPart , numHash , maxK , totalNumDomains int ,
109+ sortedDomains <- chan * DomainRecord ) (* LshEnsemble , error ) {
39110 index := NewLshEnsemble (make ([]Partition , numPart ), numHash , maxK )
40- err := bootstrap (index , totalNumDomains , sortedDomains )
111+ err := bootstrapEquiDepth (index , totalNumDomains , sortedDomains )
41112 if err != nil {
42113 return nil , err
43114 }
44115 return index , nil
45116}
46117
47- // BoostrapLshEnsemblePlus builds an index from a channel of domains.
118+ // BootstrapLshEnsemblePlusEquiDepth builds an index from a channel of domains
119+ // using equi-depth partitions -- partitions have approximately the same
120+ // number of domains.
48121// The returned index consists of MinHash LSH implemented using LshForestArray.
49122// numPart is the number of partitions to create.
50123// numHash is the number of hash functions in MinHash.
51124// maxK is the maximum value for the MinHash parameter K - the number of hash functions per "band".
52125// sortedDomains is a DomainRecord channel emitting domains in sorted order by their sizes.
53- func BootstrapLshEnsemblePlus (numPart , numHash , maxK , totalNumDomains int , sortedDomains <- chan * DomainRecord ) (* LshEnsemble , error ) {
126+ func BootstrapLshEnsemblePlusEquiDepth (numPart , numHash , maxK ,
127+ totalNumDomains int , sortedDomains <- chan * DomainRecord ) (* LshEnsemble , error ) {
54128 index := NewLshEnsemblePlus (make ([]Partition , numPart ), numHash , maxK )
55- err := bootstrap (index , totalNumDomains , sortedDomains )
129+ err := bootstrapEquiDepth (index , totalNumDomains , sortedDomains )
56130 if err != nil {
57131 return nil , err
58132 }
0 commit comments