s2: Add idSetLexicon and sequenceLexicon.

rsned · dsymonds · commit c8d99d922bc4 · 2020-03-19T12:15:08.000+11:00
These are used in a number of places in builder and boolean operations.

Signed-off-by: David Symonds <dsymonds@golang.org>
diff --git a/README.md b/README.md
@@ -141,6 +141,7 @@ Approximately ~40% complete.
     conversion methods to and from ST-space, UV-space, and XYZ-space.
 *   s2wedge_relations
 *   ShapeIndex
+*   idSetLexicon, sequenceLexicon
 
 **Mostly Complete** Files that have almost all of the features of the original
 C++ code, and are reasonably complete enough to use in live code. Up to date
@@ -185,7 +186,6 @@ started.
 *   ClosestPointQuery
 *   EdgeTesselator
 *   LoopMeasures
-*   MinDistanceTargets
 *   PointIndex
 *   PointRegion
 *   PointUtil
@@ -194,7 +194,6 @@ started.
 *   RegionTermIndexer
 *   RegionUnion
 *   ShapeIndexRegion - Allows ShapeIndexes to be used as Regions for things like
-*   lexicon
 
 ### Encode/Decode
 
diff --git a/s2/lexicon.go b/s2/lexicon.go
@@ -0,0 +1,175 @@
+// Copyright 2020 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package s2
+
+import (
+	"encoding/binary"
+	"hash/adler32"
+	"math"
+	"sort"
+)
+
+// TODO(roberts): If any of these are worth making public, change the
+// method signatures and type names.
+
+// emptySetID is the ID of the empty set; as the smallest int32 it is also the
+// last ID that will ever be generated. (Non-negative IDs are reserved for singleton sets.)
+var emptySetID = int32(math.MinInt32)
+
+// idSetLexicon compactly represents a set of non-negative
+// integers such as array indices ("ID sets"). It is especially suitable when
+// either (1) there are many duplicate sets, or (2) there are many singleton
+// or empty sets. See also sequenceLexicon.
+//
+// Each distinct ID set is mapped to a 32-bit integer. Empty and singleton
+// sets take up no additional space; the set itself is represented
+// by the unique ID assigned to the set. Duplicate sets are automatically
+// eliminated. Note also that ID sets are referred to using 32-bit integers
+// rather than pointers.
+type idSetLexicon struct {
+	idSets *sequenceLexicon
+}
+
+func newIDSetLexicon() *idSetLexicon {
+	return &idSetLexicon{
+		idSets: newSequenceLexicon(),
+	}
+}
+
+// add adds the given set of integers to the lexicon if it is not already
+// present, and returns the unique ID for this set. The values are automatically
+// sorted and duplicates are removed.
+//
+// The primary differences between this and sequenceLexicon are:
+// 1. Empty and singleton sets are represented implicitly; they use no space.
+// 2. Sets are represented rather than sequences; the ordering of values is
+//    not important and duplicates are removed.
+// 3. The values must be 32-bit non-negative integers only.
+func (l *idSetLexicon) add(ids ...int32) int32 {
+	// Empty sets have a special ID chosen not to conflict with other IDs.
+	if len(ids) == 0 {
+		return emptySetID
+	}
+
+	// Singleton sets are represented by their element.
+	if len(ids) == 1 {
+		return ids[0]
+	}
+
+	// Canonicalize the set by sorting and removing duplicates.
+	//
+	// Creates a new slice in order to not alter the supplied values.
+	set := uniqueInt32s(ids)
+
+	// Non-singleton sets are represented by the bitwise complement of the ID
+	// returned by the sequenceLexicon
+	return ^l.idSets.add(set)
+}
+
+// idSet returns the set of integers corresponding to an ID returned by add.
+func (l *idSetLexicon) idSet(setID int32) []int32 {
+	if setID >= 0 {
+		return []int32{setID}
+	}
+	if setID == emptySetID {
+		return []int32{}
+	}
+
+	return l.idSets.sequence(^setID)
+}
+
+func (l *idSetLexicon) clear() {
+	l.idSets.clear()
+}
+
+// sequenceLexicon compactly represents a sequence of values (e.g., tuples).
+// It automatically eliminates duplicate slices, and maps the remaining
+// sequences to sequentially increasing integer IDs. See also idSetLexicon.
+//
+// Each distinct sequence is mapped to a 32-bit integer.
+type sequenceLexicon struct {
+	values []int32
+	begins []uint32
+
+	// idSet maps a sequence's hash to its index in the lexicon; add looks sequences up by this hash alone.
+	idSet map[uint32]int32
+}
+
+func newSequenceLexicon() *sequenceLexicon {
+	return &sequenceLexicon{
+		begins: []uint32{0},
+		idSet:  make(map[uint32]int32),
+	}
+}
+
+// clear removes all data from the lexicon.
+func (l *sequenceLexicon) clear() {
+	l.values = nil
+	l.begins = []uint32{0}
+	l.idSet = make(map[uint32]int32)
+}
+
+// add adds the given sequence to the lexicon if it is not already present, and
+// returns its ID. IDs are assigned sequentially starting from zero.
+func (l *sequenceLexicon) add(ids []int32) int32 {
+	if id, ok := l.idSet[hashSet(ids)]; ok {
+		return id
+	}
+	for _, v := range ids {
+		l.values = append(l.values, v)
+	}
+	l.begins = append(l.begins, uint32(len(l.values)))
+
+	id := int32(len(l.begins)) - 2
+	l.idSet[hashSet(ids)] = id
+
+	return id
+}
+
+// sequence returns the original sequence of values for the given ID.
+func (l *sequenceLexicon) sequence(id int32) []int32 {
+	return l.values[l.begins[id]:l.begins[id+1]]
+}
+
+// size reports the number of value sequences in the lexicon.
+func (l *sequenceLexicon) size() int {
+	// Subtract one because the list of begins starts out with the first element set to 0.
+	return len(l.begins) - 1
+}
+
+// hashSet returns a hash of this sequence of int32s.
+func hashSet(s []int32) uint32 {
+	// TODO(roberts): We just need a way to nicely hash all the values down to
+	// a 32-bit value. To ensure no unnecessary dependencies we use the core
+	// library types available to do this. Is there a better option?
+	a := adler32.New()
+	binary.Write(a, binary.LittleEndian, s)
+	return a.Sum32()
+}
+
+// uniqueInt32s returns the sorted and uniqued set of int32s from the input.
+func uniqueInt32s(in []int32) []int32 {
+	var vals []int32
+	m := make(map[int32]bool)
+	for _, i := range in {
+		if m[i] {
+			continue
+		}
+		m[i] = true
+		vals = append(vals, i)
+	}
+	sort.Slice(vals, func(i, j int) bool { return vals[i] < vals[j] })
+	return vals
+}
diff --git a/s2/lexicon_test.go b/s2/lexicon_test.go
@@ -0,0 +1,179 @@
+// Copyright 2020 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package s2
+
+import (
+	"math"
+	"reflect"
+	"testing"
+)
+
+func TestSequenceLexiconAdd(t *testing.T) {
+	tests := []struct {
+		have []int32
+		want int32
+	}{
+		{have: []int32{}, want: 0},
+		{have: []int32{5}, want: 1},
+		{have: []int32{}, want: 0},
+		{have: []int32{5, 5}, want: 2},
+		{have: []int32{5, 0, -3}, want: 3},
+		{have: []int32{5}, want: 1},
+		{have: []int32{0x7fffffff}, want: 4},
+		{have: []int32{5, 0, -3}, want: 3},
+		{have: []int32{}, want: 0},
+	}
+
+	lex := newSequenceLexicon()
+	for _, test := range tests {
+		if got := lex.add(test.have); got != test.want {
+			t.Errorf("lexicon.add(%v) = %v, want %v", test.have, got, test.want)
+		}
+
+	}
+
+	if lex.size() != 5 {
+		t.Errorf("lexicon.size() = %v, want 5", lex.size())
+	}
+
+	for _, test := range tests {
+		if got := lex.sequence(test.want); !reflect.DeepEqual(got, test.have) {
+			t.Errorf("lexicon.sequence(%v) = %v, want %v", test.want, got, test.have)
+		}
+	}
+}
+
+func TestSequenceLexiconClear(t *testing.T) {
+	lex := newSequenceLexicon()
+
+	if got, want := lex.add([]int32{1}), int32(0); got != want {
+		t.Errorf("lex.add([]int32{1}) = %v, want %v", got, want)
+	}
+	if got, want := lex.add([]int32{2}), int32(1); got != want {
+		t.Errorf("lex.add(sequence{2}) = %v, want %v", got, want)
+	}
+	lex.clear()
+	if got, want := lex.add([]int32{2}), int32(0); got != want {
+		t.Errorf("lex.add([]int32{2}) = %v, want %v", got, want)
+	}
+	if got, want := lex.add([]int32{1}), int32(1); got != want {
+		t.Errorf("lex.add([]int32{1}) = %v, want %v", got, want)
+	}
+}
+
+func TestIDSetLexiconSingletonSets(t *testing.T) {
+	var m int32 = math.MaxInt32
+	tests := []struct {
+		have int32
+		want int32
+	}{
+		{5, 5},
+		{0, 0},
+		{1, 1},
+		{m, m},
+	}
+
+	lex := newIDSetLexicon()
+	// Test adding
+	for _, test := range tests {
+		if got := lex.add(test.have); got != test.want {
+			t.Errorf("lexicon.add(%v) = %v, want %v", test.have, got, test.want)
+		}
+	}
+
+	// Test recall
+	for _, test := range tests {
+		if got := lex.idSet(test.want); !reflect.DeepEqual(got, []int32{test.have}) {
+			t.Errorf("lexicon.idSet(%v) = %v, want %v", test.want, got, test.have)
+		}
+	}
+}
+
+func TestIDSetLexiconSetsAreSorted(t *testing.T) {
+	tests := []struct {
+		have []int32
+		want int32
+	}{
+		// This test relies on order of test cases to get the expected IDs.
+		{
+			have: []int32{2, 5},
+			want: ^0,
+		},
+		{
+			have: []int32{3, 2, 5},
+			want: ^1,
+		},
+		{
+			have: []int32{2, 2, 2, 2, 5, 2, 5},
+			want: ^0,
+		},
+		{
+			have: []int32{2, 5},
+			want: ^0,
+		},
+		{
+			have: []int32{5, 3, 2, 5},
+			want: ^1,
+		},
+	}
+
+	lexicon := newIDSetLexicon()
+	for _, test := range tests {
+		if got := lexicon.add(test.have...); got != test.want {
+			t.Errorf("lexicon.addSet(%v) = %v, want %v", test.have, got, test.want)
+		}
+	}
+
+	recallTests := []struct {
+		have int32
+		want []int32
+	}{
+		{
+			have: ^0,
+			want: []int32{2, 5},
+		},
+		{
+			have: ^1,
+			want: []int32{2, 3, 5},
+		},
+	}
+
+	for _, test := range recallTests {
+		if got := lexicon.idSet(test.have); !reflect.DeepEqual(got, test.want) {
+			t.Errorf("lexicon.idSet(%v) = %+v, want %+v", test.have, got, test.want)
+		}
+	}
+}
+
+func TestIDSetLexiconClear(t *testing.T) {
+	lex := newIDSetLexicon()
+
+	if got, want := lex.add(1, 2), int32(^0); got != want {
+		t.Errorf("lex.add([]int32{1, 2}) = %v, want %v", got, want)
+	}
+	if got, want := lex.add(3, 4), int32(^1); got != want {
+		t.Errorf("lex.add(sequence{3, 4}) = %v, want %v", got, want)
+	}
+	lex.clear()
+	if got, want := lex.add(3, 4), int32(^0); got != want {
+		t.Errorf("lex.add([]int32{3, 4}) = %v, want %v", got, want)
+	}
+	if got, want := lex.add(1, 2), int32(^1); got != want {
+		t.Errorf("lex.add([]int32{1, 2}) = %v, want %v", got, want)
+	}
+}
+
+// TODO(roberts): Differences from C++
+// Benchmarking methods.