Skip to content

Commit 5fc1a53

Browse files
authored
Clean up finder (#11)
* removed PetarDambovaliev machine and added a fork of cloudflare that returns position * removed unused deps * fixed tests and readme * fixed some comments
1 parent 2e5bfe6 commit 5fc1a53

File tree

11 files changed

+131
-139
lines changed

11 files changed

+131
-139
lines changed

README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ https://github.com/anknown/ahocorasick and
3535
https://github.com/petar-dambovaliev/aho-corasick and a regexp implementation for the `RegexEngine`.
3636
But any other library can be used as long as it "implements" the `SubstringEngine` or `RegexEngine` interface.
3737
```go
38-
subEng := &finder.PetarDambovalievEngine{}
38+
subEng := &finder.CloudflareForkEngine{}
3939
rgxEng := &finder.RegexpEngine{}
4040
caseSensitive := true
4141
findthem := finder.NewFinder(subEng, rgxEng, caseSensitive)
@@ -73,7 +73,7 @@ And finally you can check which expressions match on each text.
7373
}
7474
fmt.Printf("----------Text %d case sensitive-----------\n", i)
7575
for _, expRes := range resp {
76-
fmt.Printf("exp: [%s]%s | %v\n", expRes.Tag, expRes.ExpresionStr, expRes.Evaluation)
76+
fmt.Printf("exp %d: [%s]%s\n", expRes.ExpresionIndex, expRes.Tag, expRes.ExpresionStr)
7777
}
7878
}
7979
```

benchmarks/BUILD.bazel

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,6 @@ go_test(
99
"//finder",
1010
"@com_github_anknown_ahocorasick//:ahocorasick",
1111
"@com_github_cloudflare_ahocorasick//:ahocorasick",
12-
"@com_github_petar_dambovaliev_aho_corasick//:aho-corasick",
12+
"@com_github_pedroegsilva_ahocorasick//ahocorasick",
1313
],
1414
)

benchmarks/benchmark_test.go

Lines changed: 40 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -14,9 +14,9 @@ import (
1414

1515
akahocorasick "github.com/anknown/ahocorasick"
1616
cfahocorasick "github.com/cloudflare/ahocorasick"
17+
forkahocorasick "github.com/pedroegsilva/ahocorasick/ahocorasick"
1718
"github.com/pedroegsilva/gofindthem/dsl"
1819
"github.com/pedroegsilva/gofindthem/finder"
19-
pdahocorasick "github.com/petar-dambovaliev/aho-corasick"
2020
)
2121

2222
func init() {
@@ -113,8 +113,8 @@ func BenchmarkAhocorasickAnknownBuild100(b *testing.B) {
113113
BMAnknownBuild(exp100, b)
114114
}
115115

116-
func BenchmarkAhocorasickPetarDambovalievBuild100(b *testing.B) {
117-
BMPetarDambovalievBuild(exp100, b)
116+
func BenchmarkAhocorasickCloudflareForkBuild100(b *testing.B) {
117+
BMCloudflareForkBuild(exp100, b)
118118
}
119119

120120
func BenchmarkAhocorasickCloudFlareSearch100(b *testing.B) {
@@ -133,12 +133,12 @@ func BenchmarkDslWithAnknown100(b *testing.B) {
133133
BMDslSearch([]string{exp100}, &finder.AnknownEngine{}, b)
134134
}
135135

136-
func BenchmarkAhocorasickPetarDambovalievSearch100(b *testing.B) {
137-
BMPetarDambovalievSearch([]string{exp100}, b)
136+
func BenchmarkAhocorasickCloudflareForkSearch100(b *testing.B) {
137+
BMCloudflareForkSearch([]string{exp100}, b)
138138
}
139139

140-
func BenchmarkDslWithPetarDambovaliev100(b *testing.B) {
141-
BMDslSearch([]string{exp100}, &finder.PetarDambovalievEngine{}, b)
140+
func BenchmarkDslWithCloudflareFork100(b *testing.B) {
141+
BMDslSearch([]string{exp100}, &finder.CloudflareForkEngine{}, b)
142142
}
143143

144144
// 10000 terms
@@ -171,8 +171,8 @@ func BenchmarkAhocorasickAnknownBuild10000(b *testing.B) {
171171
BMAnknownBuild(exp10000, b)
172172
}
173173

174-
func BenchmarkAhocorasickPetarDambovalievBuild10000(b *testing.B) {
175-
BMPetarDambovalievBuild(exp10000, b)
174+
func BenchmarkAhocorasickCloudflareForkBuild10000(b *testing.B) {
175+
BMCloudflareForkBuild(exp10000, b)
176176
}
177177

178178
func BenchmarkAhocorasickCloudFlareSearch10000(b *testing.B) {
@@ -191,12 +191,12 @@ func BenchmarkDslWithAnknown10000(b *testing.B) {
191191
BMDslSearch([]string{exp10000}, &finder.AnknownEngine{}, b)
192192
}
193193

194-
func BenchmarkAhocorasickPetarDambovalievSearch10000(b *testing.B) {
195-
BMPetarDambovalievSearch([]string{exp10000}, b)
194+
func BenchmarkAhocorasickCloudflareForkSearch10000(b *testing.B) {
195+
BMCloudflareForkSearch([]string{exp10000}, b)
196196
}
197197

198-
func BenchmarkDslWithPetarDambovaliev10000(b *testing.B) {
199-
BMDslSearch([]string{exp10000}, &finder.PetarDambovalievEngine{}, b)
198+
func BenchmarkDslWithCloudflareFork10000(b *testing.B) {
199+
BMDslSearch([]string{exp10000}, &finder.CloudflareForkEngine{}, b)
200200
}
201201

202202
// dsl specific
@@ -220,12 +220,12 @@ func BenchmarkDslWithAnknown10Exps(b *testing.B) {
220220
BMDslSearch(exps10, &finder.AnknownEngine{}, b)
221221
}
222222

223-
func BenchmarkOnlyPetarDambovaliev10Exps(b *testing.B) {
224-
BMPetarDambovalievSearch(exps10, b)
223+
func BenchmarkOnlyCloudflareFork10Exps(b *testing.B) {
224+
BMCloudflareForkSearch(exps10, b)
225225
}
226226

227-
func BenchmarkDslWithPetarDambovaliev10Exps(b *testing.B) {
228-
BMDslSearch(exps10, &finder.PetarDambovalievEngine{}, b)
227+
func BenchmarkDslWithCloudflareFork10Exps(b *testing.B) {
228+
BMDslSearch(exps10, &finder.CloudflareForkEngine{}, b)
229229
}
230230

231231
func BenchmarkDslWithEmptyEngine100Exps(b *testing.B) {
@@ -248,12 +248,12 @@ func BenchmarkDslWithAnknown100Exps(b *testing.B) {
248248
BMDslSearch(exps100, &finder.AnknownEngine{}, b)
249249
}
250250

251-
func BenchmarkOnlyPetarDambovaliev100Exps(b *testing.B) {
252-
BMPetarDambovalievSearch(exps100, b)
251+
func BenchmarkOnlyCloudflareFork100Exps(b *testing.B) {
252+
BMCloudflareForkSearch(exps100, b)
253253
}
254254

255-
func BenchmarkDslWithPetarDambovaliev100Exps(b *testing.B) {
256-
BMDslSearch(exps100, &finder.PetarDambovalievEngine{}, b)
255+
func BenchmarkDslWithCloudflareFork100Exps(b *testing.B) {
256+
BMDslSearch(exps100, &finder.CloudflareForkEngine{}, b)
257257
}
258258

259259
func BenchmarkDslWithEmptyEngine1000Exps(b *testing.B) {
@@ -276,33 +276,33 @@ func BenchmarkDslWithAnknown1000Exps(b *testing.B) {
276276
BMDslSearch(exps1000, &finder.AnknownEngine{}, b)
277277
}
278278

279-
func BenchmarkOnlyPetarDambovaliev1000Exps(b *testing.B) {
280-
BMPetarDambovalievSearch(exps1000, b)
279+
func BenchmarkOnlyCloudflareFork1000Exps(b *testing.B) {
280+
BMCloudflareForkSearch(exps1000, b)
281281
}
282282

283-
func BenchmarkDslWithPetarDambovaliev1000Exps(b *testing.B) {
284-
BMDslSearch(exps1000, &finder.PetarDambovalievEngine{}, b)
283+
func BenchmarkDslWithCloudflareFork1000Exps(b *testing.B) {
284+
BMDslSearch(exps1000, &finder.CloudflareForkEngine{}, b)
285285
}
286286

287287
func BenchmarkUseCasesDsl(b *testing.B) {
288288
expressions := []string{
289289
`"foo" and "bar"`,
290290
}
291-
BMDslSearch(expressions, &finder.PetarDambovalievEngine{}, b)
291+
BMDslSearch(expressions, &finder.CloudflareForkEngine{}, b)
292292
}
293293

294294
func BenchmarkUseCasesDslWithRegex(b *testing.B) {
295295
expressions := []string{
296296
`r"foo.*bar" and r"bar.*foo"`,
297297
}
298-
BMDslSearch(expressions, &finder.PetarDambovalievEngine{}, b)
298+
BMDslSearch(expressions, &finder.CloudflareForkEngine{}, b)
299299
}
300300

301301
func BenchmarkUseCasesDslWithInord(b *testing.B) {
302302
expressions := []string{
303303
`INORD("foo" and "bar") and INORD("bar" and "foo")`,
304304
}
305-
BMDslSearch(expressions, &finder.PetarDambovalievEngine{}, b)
305+
BMDslSearch(expressions, &finder.CloudflareForkEngine{}, b)
306306
}
307307

308308
func BenchmarkUseCasesRegexOnly(b *testing.B) {
@@ -367,23 +367,16 @@ func BMAnknownBuild(exp string, b *testing.B) {
367367
}
368368
}
369369

370-
func BMPetarDambovalievBuild(exp string, b *testing.B) {
371-
p := dsl.NewParser(strings.NewReader(exp100), true)
370+
func BMCloudflareForkBuild(exp string, b *testing.B) {
371+
p := dsl.NewParser(strings.NewReader(exp), true)
372372
p.Parse()
373-
dict := []string{}
374-
373+
dict := [][]byte{}
375374
for key := range p.GetKeywords() {
376-
dict = append(dict, key)
375+
dict = append(dict, []byte(key))
377376
}
378377

379378
for i := 0; i < b.N; i++ {
380-
builder := pdahocorasick.NewAhoCorasickBuilder(pdahocorasick.Opts{
381-
AsciiCaseInsensitive: true,
382-
MatchOnlyWholeWords: false,
383-
MatchKind: pdahocorasick.LeftMostLongestMatch,
384-
DFA: true,
385-
})
386-
builder.Build(dict)
379+
forkahocorasick.NewMatcher(dict)
387380
}
388381
}
389382

@@ -426,27 +419,22 @@ func BMAnknownSearch(exps []string, b *testing.B) {
426419
}
427420
}
428421

429-
func BMPetarDambovalievSearch(exps []string, b *testing.B) {
422+
func BMCloudflareForkSearch(exps []string, b *testing.B) {
430423
findthem := finder.NewFinder(&finder.EmptyEngine{}, &finder.RegexpEngine{}, true)
431424
for _, exp := range exps {
432425
findthem.AddExpression(exp)
433426
}
434427

435-
dict := []string{}
436-
428+
dict := [][]byte{}
437429
for key := range findthem.GetKeywords() {
438-
dict = append(dict, key)
430+
dict = append(dict, []byte(key))
439431
}
440432

441-
builder := pdahocorasick.NewAhoCorasickBuilder(pdahocorasick.Opts{
442-
AsciiCaseInsensitive: true,
443-
MatchOnlyWholeWords: false,
444-
MatchKind: pdahocorasick.LeftMostLongestMatch,
445-
DFA: true,
446-
})
447-
bld := builder.Build(dict)
433+
m := forkahocorasick.NewMatcher(dict)
434+
435+
content := []byte(randText100000)
448436
for i := 0; i < b.N; i++ {
449-
bld.FindAll(randText100000)
437+
m.MatchAll(content)
450438
}
451439
}
452440

deps.bzl

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -26,10 +26,10 @@ def go_dependencies():
2626
version = "v1.1.0",
2727
)
2828
go_repository(
29-
name = "com_github_petar_dambovaliev_aho_corasick",
30-
importpath = "github.com/petar-dambovaliev/aho-corasick",
31-
sum = "h1:WuXe30Ig5zUIYEHyzsLMBFPP5l0yRQ5IiZScODHwy8g=",
32-
version = "v0.0.0-20210512121028-af76a9ff7276",
29+
name = "com_github_pedroegsilva_ahocorasick",
30+
importpath = "github.com/pedroegsilva/ahocorasick",
31+
sum = "h1:N5egH9vhDB1eGp00uZmi8OFzb3cFrE9dG5MfrjRKcGc=",
32+
version = "v0.1.0",
3333
)
3434
go_repository(
3535
name = "com_github_pmezard_go_difflib",

examples/finder/main.go

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ Mauris feugiat vitae eros et facilisis.
1919
Donec facilisis mattis dignissim.`,
2020
}
2121

22-
subEng := &finder.PetarDambovalievEngine{}
22+
subEng := &finder.CloudflareForkEngine{}
2323
rgxEng := &finder.RegexpEngine{}
2424
caseSensitive := true
2525
findthem := finder.NewFinder(subEng, rgxEng, caseSensitive)
@@ -51,11 +51,11 @@ Donec facilisis mattis dignissim.`,
5151
}
5252
fmt.Printf("----------Text %d case sensitive-----------\n", i)
5353
for _, expRes := range resp {
54-
fmt.Printf("exp: [%s]%s | %v\n", expRes.Tag, expRes.ExpresionStr, expRes.Evaluation)
54+
fmt.Printf("exp %d: [%s]%s\n", expRes.ExpresionIndex, expRes.Tag, expRes.ExpresionStr)
5555
}
5656
}
5757

58-
subEng2 := &finder.PetarDambovalievEngine{}
58+
subEng2 := &finder.CloudflareForkEngine{}
5959
rgxEng2 := &finder.RegexpEngine{}
6060
findthem2 := finder.NewFinder(subEng2, rgxEng2, !caseSensitive)
6161

@@ -74,7 +74,7 @@ Donec facilisis mattis dignissim.`,
7474
}
7575
fmt.Printf("----------Text %d case insensitive-----------\n", i)
7676
for _, expRes := range resp {
77-
fmt.Printf("exp: [%s]%s | %v\n", expRes.Tag, expRes.ExpresionStr, expRes.Evaluation)
77+
fmt.Printf("exp %d: [%s]%s\n", expRes.ExpresionIndex, expRes.Tag, expRes.ExpresionStr)
7878
}
7979
}
8080
}

finder/BUILD.bazel

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ go_library(
1313
"//dsl",
1414
"@com_github_anknown_ahocorasick//:ahocorasick",
1515
"@com_github_cloudflare_ahocorasick//:ahocorasick",
16-
"@com_github_petar_dambovaliev_aho_corasick//:aho-corasick",
16+
"@com_github_pedroegsilva_ahocorasick//ahocorasick",
1717
],
1818
)
1919

finder/finder.go

Lines changed: 39 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -23,9 +23,9 @@ type exprWrapper struct {
2323

2424
// ExpressionResult identifies an expression by its index, its original string and its tag.
2525
type ExpressionResult struct {
26-
ExpresionStr string
27-
Tag string
28-
Evaluation bool
26+
ExpresionIndex int
27+
ExpresionStr string
28+
Tag string
2929
}
3030

3131
// Finder stores the needed information to find the terms and solve the expressions
@@ -62,6 +62,33 @@ func (finder *Finder) AddExpression(expression string) error {
6262
return finder.AddExpressionWithTag(expression, "")
6363
}
6464

65+
// AddExpressions adds the expressions to the finder. It also collects
66+
// and stores the terms from all expressions that are going to be used by
67+
// the substring engine. If an expression is malformed, it returns an error.
68+
func (finder *Finder) AddExpressions(expressions []string) error {
69+
for _, expression := range expressions {
70+
err := finder.AddExpressionWithTag(expression, "")
71+
if err != nil {
72+
return err
73+
}
74+
}
75+
return nil
76+
}
77+
78+
// AddExpressionsWithTag adds the expressions to the finder with the same tag.
79+
// The tag is returned when the text is processed. It also collects
80+
// and stores the terms that are going to be used by the substring engine.
81+
// If an expression is malformed, it returns an error.
82+
func (finder *Finder) AddExpressionsWithTag(expressions []string, tag string) error {
83+
for _, expression := range expressions {
84+
err := finder.AddExpressionWithTag(expression, tag)
85+
if err != nil {
86+
return err
87+
}
88+
}
89+
return nil
90+
}
91+
6592
// AddExpressionWithTag adds the expression to the finder with a tag.
6693
// The tag is returned when the text is processed. It also collects
6794
// and stores the terms that are going to be used by the substring engine
@@ -89,7 +116,7 @@ func (finder *Finder) AddExpressionWithTag(expression string, tag string) error
89116

90117
// ProcessText uses all the unique terms to create the substring engine.
91118
// Searches for matching terms and solves the expressions.
92-
// and returns a map with the expression string as key and its evaluation as value
119+
// It returns a slice of ExpressionResult for all the expressions that were evaluated as true.
93120
func (finder *Finder) ProcessText(text string) (expRes []ExpressionResult, err error) {
94121
if !finder.caseSensitive {
95122
text = strings.ToLower(text)
@@ -149,18 +176,20 @@ func (finder *Finder) addMatchesToSolverMap(matches []*Match, sortedMatchesByKey
149176
}
150177
}
151178

152-
// solveExpressions solves all expressions using the values of the solverMap
179+
// solveExpressions solves every expression against sortedMatchesByKeyword and returns the ones that evaluated to true
153180
func (finder *Finder) solveExpressions(sortedMatchesByKeyword map[string][]int) (expRes []ExpressionResult, err error) {
154-
expRes = make([]ExpressionResult, len(finder.expressions))
181+
expRes = make([]ExpressionResult, 0)
155182
for i, exp := range finder.expressions {
156183
res, err := exp.solverOrd.Solve(sortedMatchesByKeyword)
157184
if err != nil {
158185
return nil, err
159186
}
160-
expRes[i] = ExpressionResult{
161-
Evaluation: res,
162-
Tag: exp.tag,
163-
ExpresionStr: exp.exprString,
187+
if res {
188+
expRes = append(expRes, ExpressionResult{
189+
Tag: exp.tag,
190+
ExpresionStr: exp.exprString,
191+
ExpresionIndex: i,
192+
})
164193
}
165194
}
166195
return

0 commit comments

Comments
 (0)