Skip to content

Commit 720b173

Browse files
authored
Merge pull request #256 from S0obi/feature/use-wordnet-database
feat: use wordnet database for fp-finder
2 parents 4102bb4 + 28a8d4e commit 720b173

File tree

8 files changed

+80
-111
lines changed

8 files changed

+80
-111
lines changed

cmd/util/fp_finder/fp_finder.go

Lines changed: 0 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -6,12 +6,10 @@ package fpFinder
66
import (
77
"fmt"
88
"os"
9-
"strings"
109

1110
"github.com/spf13/cobra"
1211

1312
"github.com/coreruleset/crs-toolchain/v2/cmd/internal"
14-
"github.com/coreruleset/crs-toolchain/v2/configuration"
1513
"github.com/coreruleset/crs-toolchain/v2/util"
1614
)
1715

@@ -37,17 +35,6 @@ from stdin instead.`,
3735
return fmt.Errorf("file %s doesn't exist", filenameArg)
3836
}
3937

40-
// CLI parameter is prioritized, if not provided config file is looked up
41-
// By default will be set to DefaultDictionaryCommitRef
42-
if strings.TrimSpace(englishDictionaryCommitRef) == "" {
43-
dictionaryContext := cmdContext.RootContext().Configuration().Sources.EnglishDictionary
44-
if dictionaryContext.WasCommitRefSet {
45-
englishDictionaryCommitRef = dictionaryContext.CommitRef
46-
} else {
47-
englishDictionaryCommitRef = configuration.DefaultDictionaryCommitRef
48-
}
49-
}
50-
5138
if extendedDictPath != "" && !checkFilePath(extendedDictPath) {
5239
return fmt.Errorf("extended dictionary %s doesn't exist", extendedDictPath)
5340
}
@@ -62,7 +49,6 @@ from stdin instead.`,
6249

6350
func buildFlags(cmd *cobra.Command) {
6451
cmd.Flags().StringVarP(&extendedDictPath, "extended-dictionary", "e", "", "Absolute or relative path to the extended dictionary")
65-
cmd.Flags().StringVarP(&englishDictionaryCommitRef, "english-dictionary-commit-ref", "c", "", "English dictionary commit ref from GitHub https://github.com/dwyl/english-words/blob/master/words_alpha.txt")
6652
}
6753

6854
func checkFilePath(path string) bool {

configuration/configuration.go

Lines changed: 0 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -14,19 +14,9 @@ import (
1414
const DefaultDictionaryCommitRef = "refs/heads/master"
1515

1616
type Configuration struct {
17-
Sources Sources
1817
Patterns Patterns
1918
}
2019

21-
type Sources struct {
22-
EnglishDictionary EnglishDictionary `yaml:"english_dictionary,omitempty"`
23-
}
24-
25-
type EnglishDictionary struct {
26-
CommitRef string `yaml:"commit_ref"`
27-
WasCommitRefSet bool
28-
}
29-
3020
type Patterns struct {
3121
AntiEvasion Pattern `yaml:"anti_evasion"`
3222
AntiEvasionSuffix Pattern `yaml:"anti_evasion_suffix"`
@@ -61,13 +51,5 @@ func New(directory string, filename string) *Configuration {
6151
newConfiguration.Patterns.AntiEvasionNoSpaceSuffix.Unix = strings.TrimSpace(newConfiguration.Patterns.AntiEvasionNoSpaceSuffix.Unix)
6252
newConfiguration.Patterns.AntiEvasionNoSpaceSuffix.Windows = strings.TrimSpace(newConfiguration.Patterns.AntiEvasionNoSpaceSuffix.Windows)
6353

64-
if strings.TrimSpace(newConfiguration.Sources.EnglishDictionary.CommitRef) == "" {
65-
newConfiguration.Sources.EnglishDictionary.CommitRef = DefaultDictionaryCommitRef
66-
newConfiguration.Sources.EnglishDictionary.WasCommitRefSet = false
67-
} else {
68-
newConfiguration.Sources.EnglishDictionary.CommitRef = strings.TrimSpace(newConfiguration.Sources.EnglishDictionary.CommitRef)
69-
newConfiguration.Sources.EnglishDictionary.WasCommitRefSet = true
70-
}
71-
7254
return newConfiguration
7355
}

configuration/configuration_test.go

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -59,12 +59,6 @@ func (s *configurationTestSuite) TestReadingConfiguration() {
5959

6060
func newTestConfiguration() *Configuration {
6161
return &Configuration{
62-
Sources: Sources{
63-
EnglishDictionary: EnglishDictionary{
64-
CommitRef: "refs/heads/master",
65-
WasCommitRefSet: true,
66-
},
67-
},
6862
Patterns: Patterns{
6963
AntiEvasion: Pattern{
7064
Unix: "_av-u_",

go.mod

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ require (
99

1010
require (
1111
github.com/Masterminds/semver/v3 v3.4.0
12+
github.com/coreruleset/wnram v0.1.0
1213
github.com/creativeprojects/go-selfupdate v1.5.0
1314
github.com/google/uuid v1.6.0
1415
github.com/hashicorp/go-getter/v2 v2.2.3

go.sum

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,16 @@
11
code.gitea.io/sdk/gitea v0.21.0 h1:69n6oz6kEVHRo1+APQQyizkhrZrLsTLXey9142pfkD4=
22
code.gitea.io/sdk/gitea v0.21.0/go.mod h1:tnBjVhuKJCn8ibdyyhvUyxrR1Ca2KHEoTWoukNhXQPA=
3-
dario.cat/mergo v1.0.1 h1:Ra4+bf83h2ztPIQYNP99R6m+Y7KfnARDfID+a+vLl4s=
4-
dario.cat/mergo v1.0.1/go.mod h1:uNxQE+84aUszobStD9th8a29P2fMDhsBdgRYvZOxGmk=
53
dario.cat/mergo v1.0.2 h1:85+piFYR1tMbRrLcDwR18y4UKJ3aH1Tbzi24VRW1TK8=
64
dario.cat/mergo v1.0.2/go.mod h1:E/hbnu0NxMFBjpMIE34DRGLWqDy0g5FuKDhCb31ngxA=
75
github.com/42wim/httpsig v1.2.2 h1:ofAYoHUNs/MJOLqQ8hIxeyz2QxOz8qdSVvp3PX/oPgA=
86
github.com/42wim/httpsig v1.2.2/go.mod h1:P/UYo7ytNBFwc+dg35IubuAUIs8zj5zzFIgUCEl55WY=
9-
github.com/Masterminds/semver/v3 v3.3.1 h1:QtNSWtVZ3nBfk8mAOu/B6v7FMJ+NHTIgUPi7rj+4nv4=
10-
github.com/Masterminds/semver/v3 v3.3.1/go.mod h1:4V+yj/TJE1HU9XfppCwVMZq3I84lprf4nC11bSS5beM=
117
github.com/Masterminds/semver/v3 v3.4.0 h1:Zog+i5UMtVoCU8oKka5P7i9q9HgrJeGzI9SA1Xbatp0=
128
github.com/Masterminds/semver/v3 v3.4.0/go.mod h1:4V+yj/TJE1HU9XfppCwVMZq3I84lprf4nC11bSS5beM=
139
github.com/bgentry/go-netrc v0.0.0-20140422174119-9fd32a8b3d3d h1:xDfNPAt8lFiC1UJrqV3uuy861HCTo708pDMbjHHdCas=
1410
github.com/bgentry/go-netrc v0.0.0-20140422174119-9fd32a8b3d3d/go.mod h1:6QX/PXZ00z/TKoufEY6K/a0k6AhaJrQKdFe6OfVXsa4=
1511
github.com/coreos/go-systemd/v22 v22.5.0/go.mod h1:Y58oyj3AT4RCenI/lSvhwexgC+NSVTIJ3seZv2GcEnc=
12+
github.com/coreruleset/wnram v0.1.0 h1:9tNgUX67h8E1WorbiU9OOlgKUJClYjunjBfb7cXPUas=
13+
github.com/coreruleset/wnram v0.1.0/go.mod h1:EeNpPR2NxOfy4x4yQtEQR+Tenn0Qpqcld+kZAzwc5dc=
1614
github.com/cpuguy83/go-md2man/v2 v2.0.6/go.mod h1:oOW0eioCTA6cOiMLiUPZOpcVxMig6NIQQ7OS05n1F4g=
1715
github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E=
1816
github.com/creativeprojects/go-selfupdate v1.5.0 h1:4zuFafc/qGpymx7umexxth2y2lJXoBR49c3uI0Hr+zU=
@@ -90,8 +88,6 @@ github.com/spf13/cobra v1.9.1 h1:CXSaggrXdbHK9CF+8ywj8Amf7PBRmPCOJugH954Nnlo=
9088
github.com/spf13/cobra v1.9.1/go.mod h1:nDyEzZ8ogv936Cinf6g1RU9MRY64Ir93oCnqb9wxYW0=
9189
github.com/spf13/pflag v1.0.6 h1:jFzHGLGAlb3ruxLB8MhbI6A8+AQX/2eW4qeyNZXNp2o=
9290
github.com/spf13/pflag v1.0.6/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg=
93-
github.com/stretchr/testify v1.10.0 h1:Xv5erBjTwe/5IxqUQTdXv5kgmIvbHo3QQyRwhJsOfJA=
94-
github.com/stretchr/testify v1.10.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
9591
github.com/stretchr/testify v1.11.0 h1:ib4sjIrwZKxE5u/Japgo/7SJV3PvgjGiRNAvTVGqQl8=
9692
github.com/stretchr/testify v1.11.0/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U=
9793
github.com/ulikunitz/xz v0.5.12 h1:37Nm15o69RwBkXM0J6A5OlE67RZTfzUxTj8fB3dfcsc=

util/fp_finder.go

Lines changed: 41 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -12,15 +12,21 @@ import (
1212
"slices"
1313
"strings"
1414

15+
"github.com/coreruleset/wnram"
16+
1517
"github.com/coreruleset/crs-toolchain/v2/utils"
1618
)
1719

1820
// FpFinderError is the error type of the false-positive finder.
// It carries no fields; its Error method returns a fixed message.
type FpFinderError struct{}
1921

20-
const dictionaryURLFormat = "https://raw.githubusercontent.com/dwyl/english-words/%s/%s"
21-
const dictionaryBaseFileName = "words_alpha.txt"
22+
const dictionaryURLFormat = "https://wordnetcode.princeton.edu/%s"
23+
const dictionaryBaseFileName = "wn3.1.dict.tar.gz"
2224
const minSize = 3
2325

26+
// WordNet is the single-method view of a WordNet database that the
// false-positive finder depends on. The handle returned by wnram.New is
// passed where this interface is expected, and tests substitute a fake.
type WordNet interface {
27+
// Lookup returns the entries matching criteria. Callers in this package
// treat an empty result as "word not present in WordNet".
Lookup(criteria wnram.Criteria) ([]wnram.Lookup, error)
28+
}
29+
2430
func (t *FpFinderError) Error() string {
2531
return "FpFinder error"
2632
}
@@ -33,41 +39,35 @@ func NewFpFinder() *FpFinder {
3339

3440
func (t *FpFinder) FpFinder(inputFilePath string, extendedDictionaryFilePath string, englishDictionaryCommitRef string) error {
3541
// Get the dictionary path in ~/.crs-toolchain
36-
dictionaryFileName := fmt.Sprintf("%s-%s", englishDictionaryCommitRef, dictionaryBaseFileName)
37-
dictionaryPath, err := utils.GetCacheFilePath(dictionaryFileName)
42+
dictionaryPath, err := utils.GetCacheFilePath(dictionaryBaseFileName)
3843
if err != nil {
3944
logger.Fatal().Err(err).Msg("Error getting dictionary path")
4045
}
4146

4247
// Check if the dictionary exists, if not, download it
4348
if _, err := os.Stat(dictionaryPath); os.IsNotExist(err) {
44-
logger.Debug().Msg("Dictionary file not found. Downloading...")
45-
dictionaryURL := fmt.Sprintf(dictionaryURLFormat, englishDictionaryCommitRef, dictionaryBaseFileName)
46-
if err := utils.DownloadFile(dictionaryPath, dictionaryURL); err != nil {
49+
logger.Debug().Msg("Dictionary folder not found. Downloading...")
50+
dictionaryArchivePath, err := utils.GetCacheFilePath(dictionaryBaseFileName)
51+
if err != nil {
52+
logger.Fatal().Err(err).Msg("Error getting dictionary path")
53+
}
54+
55+
dictionaryURL := fmt.Sprintf(dictionaryURLFormat, dictionaryBaseFileName)
56+
logger.Debug().Msgf("Downloading dictionary from %s to %s", dictionaryURL, dictionaryArchivePath)
57+
if err := utils.DownloadFile(dictionaryArchivePath, dictionaryURL); err != nil {
4758
logger.Fatal().Err(err).Msg("Failed to download dictionary")
4859
}
4960
logger.Debug().Msg("Download complete.")
5061
} else {
51-
logger.Debug().Msg("Dictionary file found, skipping download.")
62+
logger.Debug().Msg("Dictionary folder found, skipping download.")
5263
}
5364

54-
// Load dictionary into memory
55-
englishDict, err := t.loadDictionary(dictionaryPath, minSize)
56-
if err != nil {
57-
logger.Fatal().Err(err).Msg("Failed to load english dictionary")
58-
}
59-
60-
var dict map[string]struct{}
65+
var extendedDict map[string]struct{}
6166
if extendedDictionaryFilePath != "" {
62-
extendedDict, err := t.loadDictionary(extendedDictionaryFilePath, 0)
67+
extendedDict, err = t.loadDictionary(extendedDictionaryFilePath, 0)
6368
if err != nil {
6469
logger.Fatal().Err(err).Msg("Failed to load extended dictionary")
6570
}
66-
67-
// Add words from extendedDictionary
68-
dict = t.mergeDictionaries(englishDict, extendedDict)
69-
} else {
70-
dict = englishDict
7171
}
7272

7373
// Load input file into memory
@@ -76,8 +76,10 @@ func (t *FpFinder) FpFinder(inputFilePath string, extendedDictionaryFilePath str
7676
logger.Fatal().Err(err).Msg("Failed to load input file")
7777
}
7878

79+
wn, _ := wnram.New(dictionaryPath)
80+
7981
// Process words from inputfile, sort the output and remove duplicates
80-
filteredWords := t.processWords(inputFile, dict, minSize)
82+
filteredWords := t.processWords(inputFile, wn, extendedDict, minSize)
8183

8284
for _, str := range filteredWords {
8385
fmt.Println(str)
@@ -102,19 +104,6 @@ func (t *FpFinder) loadDictionary(path string, minWordLength int) (map[string]st
102104
return content, nil
103105
}
104106

105-
func (t *FpFinder) mergeDictionaries(a, b map[string]struct{}) map[string]struct{} {
106-
merged := make(map[string]struct{})
107-
108-
for k := range a {
109-
merged[k] = struct{}{}
110-
}
111-
for k := range b {
112-
merged[k] = struct{}{}
113-
}
114-
115-
return merged
116-
}
117-
118107
func (t *FpFinder) loadInput(path string) ([]string, error) {
119108
if path == "-" {
120109
return t.loadInputFromStdIn()
@@ -158,9 +147,9 @@ func (t *FpFinder) wordsFromInput(reader io.Reader) ([]string, error) {
158147
return content, nil
159148
}
160149

161-
func (t *FpFinder) processWords(inputFile []string, dict map[string]struct{}, minSize int) []string {
150+
func (t *FpFinder) processWords(inputFile []string, wn WordNet, extendedDict map[string]struct{}, minSize int) []string {
162151
// Filter words not in the dictionary
163-
filteredWords := t.filterContent(inputFile, dict, minSize)
152+
filteredWords := t.filterContent(inputFile, wn, extendedDict, minSize)
164153

165154
// Sort words alphabetically (case-insensitive)
166155
slices.SortFunc(filteredWords, func(a, b string) int {
@@ -173,7 +162,7 @@ func (t *FpFinder) processWords(inputFile []string, dict map[string]struct{}, mi
173162
return filteredWords
174163
}
175164

176-
func (t *FpFinder) filterContent(inputFile []string, dict map[string]struct{}, minSize int) []string {
165+
func (t *FpFinder) filterContent(inputFile []string, wn WordNet, extendedDict map[string]struct{}, minSize int) []string {
177166
var commentPattern = regexp.MustCompile(`^\s*#`)
178167
var filteredWords []string
179168
for _, word := range inputFile {
@@ -184,10 +173,21 @@ func (t *FpFinder) filterContent(inputFile []string, dict map[string]struct{}, m
184173
if word == "" || len(word) < minSize {
185174
continue
186175
}
176+
// Check if the word exists in WordNet
177+
found, err := wn.Lookup(wnram.Criteria{Matching: word})
178+
if err != nil {
179+
logger.Fatal().Err(err).Msg("Failed to lookup word in WordNet")
180+
}
187181

188-
// If the word is not in the dictionary, add it to the filtered list
189-
if _, found := dict[word]; !found {
190-
filteredWords = append(filteredWords, word)
182+
// If the word is not in the dictionary and extended dictionary, add it to the filtered list
183+
if len(found) == 0 {
184+
if _, found := extendedDict[word]; !found {
185+
filteredWords = append(filteredWords, word)
186+
} else {
187+
logger.Debug().Msgf("Word '%s' found in extended dictionary", word)
188+
}
189+
} else {
190+
logger.Debug().Msgf("Word '%s' found in WordNet", word)
191191
}
192192
}
193193

util/fp_finder_test.go

Lines changed: 35 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,22 @@ package util
66
import (
77
"testing"
88

9+
"github.com/coreruleset/wnram"
910
"github.com/stretchr/testify/suite"
1011
)
1112

13+
// mockWordNet is a fake implementation of WordNet for tests. It answers
// lookups from an in-memory table.
14+
type mockWordNet struct {
15+
// lookup maps a word (compared against Criteria.Matching) to the canned
// result returned for it.
lookup map[string][]wnram.Lookup
16+
}
}
17+
18+
// Lookup returns the canned result registered under criteria.Matching.
// When the word has no entry it returns a nil slice. The error is always nil.
func (m *mockWordNet) Lookup(criteria wnram.Criteria) ([]wnram.Lookup, error) {
19+
if v, ok := m.lookup[criteria.Matching]; ok {
20+
return v, nil
21+
}
22+
// Unknown word: an empty (nil) result, which callers read as "not found".
return nil, nil
23+
}
24+
1225
// fpFinderTestSuite is the testify suite whose methods hold the FpFinder
// unit tests. It embeds suite.Suite and adds no state of its own.
type fpFinderTestSuite struct {
1326
suite.Suite
1427
}
@@ -27,53 +40,50 @@ func (s *fpFinderTestSuite) TestFpFinder_FilterContent() {
2740
"apple", "banana", "apple", "",
2841
}
2942

30-
dict := map[string]struct{}{
31-
"apple": {},
32-
"dog": {},
43+
extendedDict := map[string]struct{}{}
44+
mockWN := &mockWordNet{
45+
lookup: map[string][]wnram.Lookup{
46+
"apple": {{}}, // fake lookup result
47+
},
3348
}
34-
3549
expected := []string{"banana"}
3650

37-
result := NewFpFinder().filterContent(input, dict, 3)
51+
result := NewFpFinder().filterContent(input, mockWN, extendedDict, 3)
3852
s.Equal(expected, result)
3953
}
4054

4155
func (s *fpFinderTestSuite) TestFpFinder_ProcessWords() {
4256
input := []string{"apple", "banana", "orange", "banana", "pear", "#comment", "banana"}
43-
dict := map[string]struct{}{
44-
"apple": {},
57+
58+
extendedDict := map[string]struct{}{
4559
"orange": {},
4660
}
61+
mockWN := &mockWordNet{
62+
lookup: map[string][]wnram.Lookup{
63+
"apple": {{}}, // fake lookup result
64+
},
65+
}
4766

4867
expected := []string{"banana", "pear"}
4968

50-
result := NewFpFinder().processWords(input, dict, 3)
69+
result := NewFpFinder().processWords(input, mockWN, extendedDict, 3)
5170

5271
s.Equal(expected, result)
5372
}
5473

5574
func (s *fpFinderTestSuite) TestFpFinder_ProcessWords_Sorting() {
5675
input := []string{"pear", "Banana", ".hiddenfruit", "kiwi", "banana", "Apple", ".dotfruit"}
57-
dict := map[string]struct{}{} // empty dictionary, so no filtering
5876

59-
expected := []string{".dotfruit", ".hiddenfruit", "Apple", "Banana", "banana", "kiwi", "pear"}
60-
61-
result := NewFpFinder().processWords(input, dict, 3)
62-
63-
s.Equal(expected, result)
64-
}
77+
extendedDict := map[string]struct{}{}
78+
mockWN := &mockWordNet{
79+
lookup: map[string][]wnram.Lookup{
80+
"": {{}}, // fake lookup result
81+
},
82+
}
6583

66-
func (s *fpFinderTestSuite) TestFpFinder_MergeDictionaries() {
67-
a := map[string]struct{}{"apple": {}, "banana": {}}
68-
b := map[string]struct{}{"cherry": {}, "date": {}}
84+
expected := []string{".dotfruit", ".hiddenfruit", "Apple", "Banana", "banana", "kiwi", "pear"}
6985

70-
expected := map[string]struct{}{
71-
"apple": {},
72-
"banana": {},
73-
"cherry": {},
74-
"date": {},
75-
}
86+
result := NewFpFinder().processWords(input, mockWN, extendedDict, 3)
7687

77-
result := NewFpFinder().mergeDictionaries(a, b)
7888
s.Equal(expected, result)
7989
}

utils/utils.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ func DownloadFile(filepath, url string) error {
2626
request := &getter.Request{
2727
Src: url,
2828
Dst: filepath,
29-
GetMode: getter.ModeFile,
29+
GetMode: getter.ModeAny,
3030
}
3131
client := &getter.Client{
3232
Getters: []getter.Getter{

0 commit comments

Comments
 (0)