Skip to content

Commit c8d99d9

Browse files
rsned authored and dsymonds committed
s2: Add idSetLexicon and sequenceLexicon.
These are used in a number of places in builder and boolean operations. Signed-off-by: David Symonds <dsymonds@golang.org>
1 parent 9a88175 commit c8d99d9

File tree

3 files changed

+355
-2
lines changed

3 files changed

+355
-2
lines changed

README.md

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -141,6 +141,7 @@ Approximately ~40% complete.
141141
conversion methods to and from ST-space, UV-space, and XYZ-space.
142142
* s2wedge_relations
143143
* ShapeIndex
144+
* idSetLexicon, sequenceLexicon
144145

145146
**Mostly Complete** Files that have almost all of the features of the original
146147
C++ code, and are reasonably complete enough to use in live code. Up to date
@@ -185,7 +186,6 @@ started.
185186
* ClosestPointQuery
186187
* EdgeTesselator
187188
* LoopMeasures
188-
* MinDistanceTargets
189189
* PointIndex
190190
* PointRegion
191191
* PointUtil
@@ -194,7 +194,6 @@ started.
194194
* RegionTermIndexer
195195
* RegionUnion
196196
* ShapeIndexRegion - Allows ShapeIndexes to be used as Regions for things like
197-
* lexicon
198197

199198
### Encode/Decode
200199

s2/lexicon.go

Lines changed: 175 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,175 @@
1+
// Copyright 2020 Google Inc. All rights reserved.
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
package s2
16+
17+
import (
18+
"encoding/binary"
19+
"hash/adler32"
20+
"math"
21+
"sort"
22+
)
23+
24+
// TODO(roberts): If any of these are worth making public, change the
25+
// method signatures and type names.
26+
27+
// emptySetID is the ID that idSetLexicon uses for the empty set. It is the
// smallest int32, i.e. the last ID that could ever be generated for a stored
// set. (Non-negative IDs are reserved for singleton sets.)
var emptySetID int32 = math.MinInt32
30+
31+
// idSetLexicon compactly represents a set of non-negative
32+
// integers such as array indices ("ID sets"). It is especially suitable when
33+
// either (1) there are many duplicate sets, or (2) there are many singleton
34+
// or empty sets. See also sequenceLexicon.
35+
//
36+
// Each distinct ID set is mapped to a 32-bit integer. Empty and singleton
37+
// sets take up no additional space; the set itself is represented
38+
// by the unique ID assigned to the set. Duplicate sets are automatically
39+
// eliminated. Note also that ID sets are referred to using 32-bit integers
40+
// rather than pointers.
41+
type idSetLexicon struct {
42+
idSets *sequenceLexicon
43+
}
44+
45+
func newIDSetLexicon() *idSetLexicon {
46+
return &idSetLexicon{
47+
idSets: newSequenceLexicon(),
48+
}
49+
}
50+
51+
// add adds the given set of integers to the lexicon if it is not already
52+
// present, and return the unique ID for this set. The values are automatically
53+
// sorted and duplicates are removed.
54+
//
55+
// The primary difference between this and sequenceLexicon are:
56+
// 1. Empty and singleton sets are represented implicitly; they use no space.
57+
// 2. Sets are represented rather than sequences; the ordering of values is
58+
// not important and duplicates are removed.
59+
// 3. The values must be 32-bit non-negative integers only.
60+
func (l *idSetLexicon) add(ids ...int32) int32 {
61+
// Empty sets have a special ID chosen not to conflict with other IDs.
62+
if len(ids) == 0 {
63+
return emptySetID
64+
}
65+
66+
// Singleton sets are represented by their element.
67+
if len(ids) == 1 {
68+
return ids[0]
69+
}
70+
71+
// Canonicalize the set by sorting and removing duplicates.
72+
//
73+
// Creates a new slice in order to not alter the supplied values.
74+
set := uniqueInt32s(ids)
75+
76+
// Non-singleton sets are represented by the bitwise complement of the ID
77+
// returned by the sequenceLexicon
78+
return ^l.idSets.add(set)
79+
}
80+
81+
// idSet returns the set of integers corresponding to an ID returned by add.
82+
func (l *idSetLexicon) idSet(setID int32) []int32 {
83+
if setID >= 0 {
84+
return []int32{setID}
85+
}
86+
if setID == emptySetID {
87+
return []int32{}
88+
}
89+
90+
return l.idSets.sequence(^setID)
91+
}
92+
93+
func (l *idSetLexicon) clear() {
94+
l.idSets.clear()
95+
}
96+
97+
// sequenceLexicon compactly represents a collection of sequences of values
// (e.g., tuples). It automatically eliminates duplicate sequences, and maps
// the remaining sequences to sequentially increasing integer IDs. See also
// idSetLexicon.
//
// Each distinct sequence is mapped to a 32-bit integer.
type sequenceLexicon struct {
	// values holds the elements of every stored sequence, concatenated in
	// ID order.
	values []int32
	// begins[i] is the offset in values at which sequence i starts. It always
	// carries one extra trailing entry, so sequence i occupies
	// values[begins[i]:begins[i+1]].
	begins []uint32

	// idSet maps a probe key to the ID of the sequence stored under that key.
	// A sequence is stored under its hash, or under the first free key after
	// its hash when that key is already taken by a different sequence.
	idSet map[uint32]int32
}

// newSequenceLexicon returns an empty sequenceLexicon that is ready for use.
func newSequenceLexicon() *sequenceLexicon {
	return &sequenceLexicon{
		begins: []uint32{0},
		idSet:  make(map[uint32]int32),
	}
}

// clear removes all data from the lexicon. IDs handed out afterwards start
// again from zero.
func (l *sequenceLexicon) clear() {
	l.values = nil
	l.begins = []uint32{0}
	l.idSet = make(map[uint32]int32)
}

// add adds the given sequence to the lexicon if it is not already present,
// and returns its ID. IDs are assigned sequentially starting from zero. The
// elements of ids are copied, so the caller keeps ownership of the slice.
func (l *sequenceLexicon) add(ids []int32) int32 {
	// The hash is only 32 bits wide, so different sequences can share a hash.
	// Starting at the hash, probe successive keys until we find either this
	// exact sequence or an unused key. Entries are never removed one at a
	// time, so a probe chain never contains a gap. The key wraps around on
	// overflow.
	key := hashSet(ids)
	for {
		id, taken := l.idSet[key]
		if !taken {
			break
		}
		if l.sequenceEquals(id, ids) {
			return id
		}
		key++
	}

	l.values = append(l.values, ids...)
	l.begins = append(l.begins, uint32(len(l.values)))

	// begins has one more entry than there are sequences, plus the one that
	// was just appended for the new sequence.
	id := int32(len(l.begins)) - 2
	l.idSet[key] = id

	return id
}

// sequenceEquals reports whether the sequence stored under id has exactly the
// same elements, in the same order, as ids.
func (l *sequenceLexicon) sequenceEquals(id int32, ids []int32) bool {
	stored := l.sequence(id)
	if len(stored) != len(ids) {
		return false
	}
	for i, v := range stored {
		if v != ids[i] {
			return false
		}
	}
	return true
}

// sequence returns the original sequence of values for the given ID, which
// must have been returned by add since the last call to clear.
//
// The result shares storage with the lexicon and must not be modified. Its
// capacity is clipped to its length so that appending to it cannot overwrite
// the sequences stored after it.
func (l *sequenceLexicon) sequence(id int32) []int32 {
	lo, hi := l.begins[id], l.begins[id+1]
	return l.values[lo:hi:hi]
}

// size reports the number of value sequences in the lexicon.
func (l *sequenceLexicon) size() int {
	// Subtract one because the list of begins starts out with the first element set to 0.
	return len(l.begins) - 1
}

// hashSet returns a 32-bit hash of this sequence of int32s: the Adler-32
// checksum of the values encoded as little-endian 32-bit words. Distinct
// sequences can share a hash, so callers must not treat it as an identity.
func hashSet(s []int32) uint32 {
	// TODO(roberts): We just need a way to nicely hash all the values down to
	// a 32-bit value. To ensure no unnecessary dependencies we use the core
	// library types available to do this. Is there a better option?
	buf := make([]byte, 4*len(s))
	for i, v := range s {
		binary.LittleEndian.PutUint32(buf[4*i:], uint32(v))
	}
	return adler32.Checksum(buf)
}
161+
162+
// uniqueInt32s returns a new slice holding the distinct values of in, sorted
// in increasing order. The input slice is left untouched; empty input yields
// a nil slice.
func uniqueInt32s(in []int32) []int32 {
	if len(in) == 0 {
		return nil
	}

	// Sort a private copy so the caller's slice is not reordered.
	sorted := make([]int32, len(in))
	copy(sorted, in)
	sort.Slice(sorted, func(a, b int) bool { return sorted[a] < sorted[b] })

	// Equal values are now adjacent, so compact them in place.
	n := 1
	for _, v := range sorted[1:] {
		if v != sorted[n-1] {
			sorted[n] = v
			n++
		}
	}
	return sorted[:n]
}

s2/lexicon_test.go

Lines changed: 179 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,179 @@
1+
// Copyright 2020 Google Inc. All rights reserved.
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
package s2
16+
17+
import (
18+
"math"
19+
"reflect"
20+
"testing"
21+
)
22+
23+
func TestSequenceLexiconAdd(t *testing.T) {
24+
tests := []struct {
25+
have []int32
26+
want int32
27+
}{
28+
{have: []int32{}, want: 0},
29+
{have: []int32{5}, want: 1},
30+
{have: []int32{}, want: 0},
31+
{have: []int32{5, 5}, want: 2},
32+
{have: []int32{5, 0, -3}, want: 3},
33+
{have: []int32{5}, want: 1},
34+
{have: []int32{0x7fffffff}, want: 4},
35+
{have: []int32{5, 0, -3}, want: 3},
36+
{have: []int32{}, want: 0},
37+
}
38+
39+
lex := newSequenceLexicon()
40+
for _, test := range tests {
41+
if got := lex.add(test.have); got != test.want {
42+
t.Errorf("lexicon.add(%v) = %v, want %v", test.have, got, test.want)
43+
}
44+
45+
}
46+
47+
if lex.size() != 5 {
48+
t.Errorf("lexicon.size() = %v, want 5", lex.size())
49+
}
50+
51+
for _, test := range tests {
52+
if got := lex.sequence(test.want); !reflect.DeepEqual(got, test.have) {
53+
t.Errorf("lexicon.sequence(%v) = %v, want %v", test.want, got, test.have)
54+
}
55+
}
56+
}
57+
58+
func TestSequenceLexiconClear(t *testing.T) {
59+
lex := newSequenceLexicon()
60+
61+
if got, want := lex.add([]int32{1}), int32(0); got != want {
62+
t.Errorf("lex.add([]int32{1}) = %v, want %v", got, want)
63+
}
64+
if got, want := lex.add([]int32{2}), int32(1); got != want {
65+
t.Errorf("lex.add(sequence{2}) = %v, want %v", got, want)
66+
}
67+
lex.clear()
68+
if got, want := lex.add([]int32{2}), int32(0); got != want {
69+
t.Errorf("lex.add([]int32{2}) = %v, want %v", got, want)
70+
}
71+
if got, want := lex.add([]int32{1}), int32(1); got != want {
72+
t.Errorf("lex.add([]int32{1}) = %v, want %v", got, want)
73+
}
74+
}
75+
76+
func TestIDSetLexiconSingletonSets(t *testing.T) {
77+
var m int32 = math.MaxInt32
78+
tests := []struct {
79+
have int32
80+
want int32
81+
}{
82+
{5, 5},
83+
{0, 0},
84+
{1, 1},
85+
{m, m},
86+
}
87+
88+
lex := newIDSetLexicon()
89+
// Test adding
90+
for _, test := range tests {
91+
if got := lex.add(test.have); got != test.want {
92+
t.Errorf("lexicon.add(%v) = %v, want %v", test.have, got, test.want)
93+
}
94+
}
95+
96+
// Test recall
97+
for _, test := range tests {
98+
if got := lex.idSet(test.want); !reflect.DeepEqual(got, []int32{test.have}) {
99+
t.Errorf("lexicon.idSet(%v) = %v, want %v", test.want, got, test.have)
100+
}
101+
}
102+
}
103+
104+
func TestIDSetLexiconSetsAreSorted(t *testing.T) {
105+
tests := []struct {
106+
have []int32
107+
want int32
108+
}{
109+
// This test relies on order of test cases to get the expected IDs.
110+
{
111+
have: []int32{2, 5},
112+
want: ^0,
113+
},
114+
{
115+
have: []int32{3, 2, 5},
116+
want: ^1,
117+
},
118+
{
119+
have: []int32{2, 2, 2, 2, 5, 2, 5},
120+
want: ^0,
121+
},
122+
{
123+
have: []int32{2, 5},
124+
want: ^0,
125+
},
126+
{
127+
have: []int32{5, 3, 2, 5},
128+
want: ^1,
129+
},
130+
}
131+
132+
lexicon := newIDSetLexicon()
133+
for _, test := range tests {
134+
if got := lexicon.add(test.have...); got != test.want {
135+
t.Errorf("lexicon.addSet(%v) = %v, want %v", test.have, got, test.want)
136+
}
137+
}
138+
139+
recallTests := []struct {
140+
have int32
141+
want []int32
142+
}{
143+
{
144+
have: ^0,
145+
want: []int32{2, 5},
146+
},
147+
{
148+
have: ^1,
149+
want: []int32{2, 3, 5},
150+
},
151+
}
152+
153+
for _, test := range recallTests {
154+
if got := lexicon.idSet(test.have); !reflect.DeepEqual(got, test.want) {
155+
t.Errorf("lexicon.idSet(%v) = %+v, want %+v", test.have, got, test.want)
156+
}
157+
}
158+
}
159+
160+
func TestIDSetLexiconClear(t *testing.T) {
161+
lex := newIDSetLexicon()
162+
163+
if got, want := lex.add(1, 2), int32(^0); got != want {
164+
t.Errorf("lex.add([]int32{1, 2}) = %v, want %v", got, want)
165+
}
166+
if got, want := lex.add(3, 4), int32(^1); got != want {
167+
t.Errorf("lex.add(sequence{3, 4}) = %v, want %v", got, want)
168+
}
169+
lex.clear()
170+
if got, want := lex.add(3, 4), int32(^0); got != want {
171+
t.Errorf("lex.add([]int32{3, 4}) = %v, want %v", got, want)
172+
}
173+
if got, want := lex.add(1, 2), int32(^1); got != want {
174+
t.Errorf("lex.add([]int32{1, 2}) = %v, want %v", got, want)
175+
}
176+
}
177+
178+
// TODO(roberts): Differences from C++
179+
// Benchmarking methods.

0 commit comments

Comments
 (0)