-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathterm.go
171 lines (145 loc) · 3.59 KB
/
term.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
package query
import (
"fmt"
"math"
"sort"
)
type block struct {
maxDoc int32
maxIdx int
}
type TermQuery struct {
docId int32
cursor int
postings []int32
blocks []block
currentBlockIndex int
currentBlock block
term string
idf float32 // XXX: unnormalized idf
boost float32
}
func computeIDF(N, d int) float32 {
// idf is log(1 + N/D)
// N = total documents in the index
// d = documents matching (len(postings))
return float32(math.Log1p(float64(N) / float64(d)))
}
// splits the postings list into chunks that are binary searched and inside each chunk linearly searching for next advance()
var TERM_CHUNK_SIZE = 4096
// Basic []int32{} that the whole interface works on top
// score is IDF (not tf*idf, just 1*idf, since we dont store the term frequency for now)
// if you dont know totalDocumentsInIndex, which could be the case sometimes, pass any constant > 0
// WARNING: the query *can not* be reused
// WARNING: the query it not thread safe
func Term(totalDocumentsInIndex int, t string, postings []int32) *TermQuery {
q := &TermQuery{
term: t,
cursor: -1,
postings: postings,
docId: NOT_READY,
currentBlock: block{maxIdx: 0, maxDoc: NOT_READY},
idf: computeIDF(totalDocumentsInIndex, len(postings)),
boost: 1,
}
if len(postings) == 0 {
q.idf = 0
return q
}
chunkSize := TERM_CHUNK_SIZE
q.blocks = make([]block, ((len(postings) + chunkSize - 1) / chunkSize)) // ceil
blockIndex := 0
for i := 0; i < len(postings); i += chunkSize {
minIdx := i
maxIdx := (minIdx + chunkSize) - 1
if maxIdx >= len(postings)-1 {
maxIdx = len(postings) - 1
}
q.blocks[blockIndex] = block{
maxDoc: postings[maxIdx],
maxIdx: maxIdx,
}
blockIndex++
}
return q
}
func (t *TermQuery) GetDocId() int32 {
return t.docId
}
func (t *TermQuery) Cost() int {
return len(t.postings) - t.cursor
}
func (t *TermQuery) String() string {
return fmt.Sprintf("%s/%.2f", t.term, t.idf)
}
func (t *TermQuery) Score() float32 {
return t.idf * t.boost
}
func (t *TermQuery) findBlock(target int32) int32 {
if len(t.blocks) == 0 {
return NO_MORE
}
if len(t.blocks)-t.currentBlockIndex < 32 {
for i := t.currentBlockIndex; i < len(t.blocks); i++ {
current := t.blocks[i]
if target <= current.maxDoc {
t.currentBlockIndex = i
t.currentBlock = current
return target
}
}
return NO_MORE
}
found := sort.Search(len(t.blocks)-t.currentBlockIndex, func(i int) bool {
current := t.blocks[i+t.currentBlockIndex]
return target <= current.maxDoc
}) + t.currentBlockIndex
if found < len(t.blocks) {
t.currentBlockIndex = found
t.currentBlock = t.blocks[found]
return target
}
return NO_MORE
}
func (t *TermQuery) Advance(target int32) int32 {
if target > t.currentBlock.maxDoc {
if t.findBlock(target) == NO_MORE {
t.docId = NO_MORE
return NO_MORE
}
}
if t.cursor < 0 {
t.cursor = 0
}
t.docId = NO_MORE
for i := t.cursor; i <= t.currentBlock.maxIdx; i++ {
x := t.postings[i]
if x >= target {
t.cursor = i
t.docId = x
return x
}
}
// invariant, can not happen because at this point we will be within the block
// panic?
return t.docId
}
func (t *TermQuery) Next() int32 {
t.cursor++
if t.cursor >= len(t.postings) {
t.docId = NO_MORE
} else {
t.docId = t.postings[t.cursor]
}
return t.docId
}
func (t *TermQuery) SetBoost(b float32) Query {
t.boost = b
return t
}
func (t *TermQuery) PayloadDecode(p Payload) {
panic("unsupported")
}
func (t *TermQuery) AddSubQuery(Query) Query {
panic("unsupported")
}