Commit 109f631

Merge pull request #2 from S0obi/feature/add-morph-support
feat: add basic morph support
2 parents: e32b438 + e396e5e

6 files changed: 288 additions & 74 deletions

.github/workflows/test.yml

Lines changed: 1 addition & 1 deletion
@@ -27,7 +27,7 @@ jobs:
           cache: true
 
       - name: Run golangci-lint
-        uses: golangci/golangci-lint
+        uses: golangci/golangci-lint-action@4afd733a84b1f43292c63897423277bb7f4313a9 # v8.0.0
 
       - name: Run Go Tests
         run: go test -coverprofile=coverage.out ./...

.gitignore

Lines changed: 2 additions & 1 deletion
@@ -1 +1,2 @@
-coverage.out
+coverage.out
+coverage.html

README.md

Lines changed: 0 additions & 3 deletions
@@ -19,9 +19,6 @@ seconds on a modest laptop.
 * All relation types (Antonyms, Hyponyms, Hypernyms, etc)
 * Iteration of the database
 * Lemmatization
-
-## Missing features
-
 * Morphology - specifically generating a lemma from input text
 
 ## Example Usage

data/noun.exc

Lines changed: 7 additions & 0 deletions
@@ -0,0 +1,7 @@
+children child
+geese goose
+noes no
+stamina stamen
+teeth tooth
+theses thesis
+wolves wolf

wordnet.go

Lines changed: 165 additions & 68 deletions
@@ -5,15 +5,16 @@ import (
 	"os"
 	"path"
 	"path/filepath"
+	"slices"
 	"strings"
-	"time"
 )
 
 // An initialized read-only, in-ram instance of the wordnet database.
 // May safely be shared by multiple threads of execution
 type Handle struct {
-	index map[string][]*cluster
-	db    []*cluster
+	index      map[string][]*cluster
+	db         []*cluster
+	exceptions map[string]string
 }
 
 // The results of a search against the wordnet database
@@ -53,24 +54,39 @@ type PartOfSpeech uint8
 // A set of multiple parts of speech
 type PartOfSpeechList []PartOfSpeech
 
+var suffixes = []string{
+	// Noun suffixes
+	"s", "ses", "xes", "zes", "ches", "shes", "men", "ies",
+	// Verb suffixes
+	"s", "ies", "es", "es", "ed", "ed", "ing", "ing",
+	// Adjective suffixes
+	"er", "est", "er", "est",
+}
+
+var pluralEndings = []string{
+	// Noun endings
+	"", "s", "x", "z", "ch", "sh", "man", "y",
+	// Verb endings
+	"", "y", "e", "", "e", "", "e", "",
+	// Adjective endings
+	"", "", "e", "e",
+}
+
+var offsets = []int{0, 8, 16}
+var counts = []int{8, 8, 4}
+
 func (l PartOfSpeechList) Empty() bool {
 	return len(l) == 0
 }
 
 func (l PartOfSpeechList) Contains(want PartOfSpeech) bool {
-	for _, got := range l {
-		if got == want {
-			return true
-		}
-	}
-	return false
+	return slices.Contains(l, want)
 }
 
 const (
 	Noun PartOfSpeech = iota
 	Verb
 	Adjective
-	// AdjectiveSatellite
 	Adverb
 )
 
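The parallel tables added here encode WordNet-style suffix-detachment rules: rule i strips suffixes[i] and appends pluralEndings[i], while offsets and counts mark which block of rules belongs to each part of speech (nouns 0-7, verbs 8-15, adjectives 16-19; adverbs never reach these tables because MorphWord returns early for them). A minimal standalone sketch of that pairing, not part of the commit; the toy index and the rule walk below are illustrative only:

package main

import (
	"fmt"
	"strings"
)

// The same rule tables as above, repeated so the sketch compiles on its own.
var suffixes = []string{
	"s", "ses", "xes", "zes", "ches", "shes", "men", "ies", // noun rules 0-7
	"s", "ies", "es", "es", "ed", "ed", "ing", "ing", // verb rules 8-15
	"er", "est", "er", "est", // adjective rules 16-19
}

var pluralEndings = []string{
	"", "s", "x", "z", "ch", "sh", "man", "y",
	"", "y", "e", "", "e", "", "e", "",
	"", "", "e", "e",
}

var offsets = []int{0, 8, 16}
var counts = []int{8, 8, 4}

func main() {
	// Toy stand-in for the real index; the package checks h.index instead.
	index := map[string]bool{"box": true, "church": true, "fly": true}

	for _, word := range []string{"boxes", "churches", "flies"} {
		noun := 0 // int(Noun)
		for i := 0; i < counts[noun]; i++ {
			rule := offsets[noun] + i
			if !strings.HasSuffix(word, suffixes[rule]) {
				continue
			}
			// Strip the suffix, append the paired ending, keep it only if known.
			candidate := strings.TrimSuffix(word, suffixes[rule]) + pluralEndings[rule]
			if index[candidate] && candidate != word {
				fmt.Printf("%-9s -> %s (rule %d)\n", word, candidate, rule)
				break
			}
		}
	}
}

Running the sketch prints "boxes -> box (rule 2)", "churches -> church (rule 4)" and "flies -> fly (rule 7)", which is the same strip-and-replace walk MorphWord performs further down in this file.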

@@ -154,12 +170,15 @@ func (w *Lookup) DumpStr() string {
 	s := fmt.Sprintf("Word: %s\n", w.String())
 	s += "Synonyms: "
 	words := []string{}
+
 	for _, w := range w.cluster.words {
 		words = append(words, w.word)
 	}
+
 	s += strings.Join(words, ", ") + "\n"
 	s += fmt.Sprintf("%d semantic relationships\n", len(w.cluster.relations))
 	s += "| " + w.cluster.gloss + "\n"
+
 	return s
 }
 
@@ -211,14 +230,15 @@ func (w *Lookup) Related(r Relation) (relationships []Lookup) {
 // Initialize a new in-ram WordNet databases reading files from the
 // specified directory.
 func New(dir string) (*Handle, error) {
-	cnt := 0
 	type ix struct {
 		index string
 		pos   PartOfSpeech
 	}
+
 	byOffset := map[ix]*cluster{}
+	exceptions := map[string]string{}
+
 	err := filepath.Walk(dir, func(filename string, info os.FileInfo, err error) error {
-		start := time.Now()
 		if err != nil || info.IsDir() {
 			return err
 		}
@@ -227,74 +247,92 @@ func New(dir string) (*Handle, error) {
 		if strings.HasPrefix(path.Base(filename), ".") || strings.HasSuffix(filename, "~") || strings.HasSuffix(filename, "#") {
 			return nil
 		}
-		// read only data files
-		if !strings.HasPrefix(path.Base(filename), "data") {
-			return nil
-		}
 
-		err = inPlaceReadLineFromPath(filename, func(data []byte, line, offset int64) error {
-			cnt++
-			if p, err := parseLine(data, line); err != nil {
-				return fmt.Errorf("%s", err)
-			} else if p != nil {
-				// first, let's identify the cluster
-				index := ix{p.byteOffset, p.pos}
-				c, ok := byOffset[index]
-				if !ok {
-					c = &cluster{}
-					byOffset[index] = c
-				}
-				// now update
-				c.pos = p.pos
-				c.words = p.words
-				c.gloss = p.gloss
-				c.debug = p.byteOffset
-
-				// now let's build relations
-				for _, r := range p.rels {
-					rindex := ix{r.offset, r.pos}
-					rcluster, ok := byOffset[rindex]
+		// read data files
+		if strings.HasPrefix(path.Base(filename), "data") {
+			err = inPlaceReadLineFromPath(filename, func(data []byte, line, offset int64) error {
+				if p, err := parseLine(data, line); err != nil {
+					return fmt.Errorf("%s", err)
+				} else if p != nil {
+					// first, let's identify the cluster
+					index := ix{p.byteOffset, p.pos}
+					c, ok := byOffset[index]
 					if !ok {
-						// create the other side of the relationship
-						rcluster = &cluster{}
-						byOffset[rindex] = rcluster
+						c = &cluster{}
+						byOffset[index] = c
 					}
-					if r.isSemantic {
-						c.relations = append(c.relations, semanticRelation{
-							rel:    r.rel,
-							target: rcluster,
-						})
-					} else {
-						if int(r.source) >= len(c.words) {
-							return fmt.Errorf("%s:%d: error parsing relations, bogus source (words: %d, offset: %d) [%s]", filename, line, r.source, len(c.words), string(data))
+
+					// now update
+					c.pos = p.pos
+					c.words = p.words
+					c.gloss = p.gloss
+					c.debug = p.byteOffset
+
+					// now let's build relations
+					for _, r := range p.rels {
+						rindex := ix{r.offset, r.pos}
+						rcluster, ok := byOffset[rindex]
+						if !ok {
+							// create the other side of the relationship
+							rcluster = &cluster{}
+							byOffset[rindex] = rcluster
+						}
+						if r.isSemantic {
+							c.relations = append(c.relations, semanticRelation{
+								rel:    r.rel,
+								target: rcluster,
+							})
+						} else {
+							if int(r.source) >= len(c.words) {
+								return fmt.Errorf("%s:%d: error parsing relations, bogus source (words: %d, offset: %d) [%s]", filename, line, r.source, len(c.words), string(data))
+							}
+							c.words[r.source].relations = append(c.words[r.source].relations, syntacticRelation{
+								rel:        r.rel,
+								target:     rcluster,
+								wordNumber: r.dest,
+							})
 						}
-						c.words[r.source].relations = append(c.words[r.source].relations, syntacticRelation{
-							rel:        r.rel,
-							target:     rcluster,
-							wordNumber: r.dest,
-						})
 					}
+
 				}
+				return nil
+			})
+
+			return err
+		}
+
+		// read exception files
+		if strings.HasSuffix(path.Base(filename), ".exc") {
+			err = inPlaceReadLineFromPath(filename, func(data []byte, line, offset int64) error {
+				parts := strings.SplitN(string(data), " ", 2)
+				if len(parts) == 2 {
+					exceptions[parts[0]] = parts[1]
+				} else {
+					return fmt.Errorf("malformed exception line %d: %q", line, string(data))
+				}
+				return nil
+			})
+		}
 
-			}
-			return nil
-		})
-		fmt.Printf("%s in %s\n", filename, time.Since(start).String())
 		return err
 	})
+
 	if err != nil {
 		return nil, err
 	}
 
-	// now that we've built up the in ram database, lets' index it
 	h := Handle{
-		db:    make([]*cluster, 0, len(byOffset)),
-		index: make(map[string][]*cluster),
+		db:         make([]*cluster, 0, len(byOffset)),
+		index:      make(map[string][]*cluster),
+		exceptions: exceptions,
 	}
+
+	// now that we've built up the in ram database, lets' index it
 	for _, c := range byOffset {
		if len(c.words) == 0 {
 			return nil, fmt.Errorf("ERROR, internal consistency error -> cluster without words %v", c)
 		}
+
 		// add to the global slice of synsets (supports iteration)
 		h.db = append(h.db, c)
 
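The *.exc files loaded here use WordNet's morphology-exception format: each line is an inflected form followed by one or more space-separated base forms, and this loader keeps everything after the first space as the base (see data/noun.exc above). A minimal sketch of the split the loader performs on one such line, not part of the commit:

package main

import (
	"fmt"
	"strings"
)

func main() {
	// One line from data/noun.exc: inflected form first, base form second.
	line := "geese goose"

	parts := strings.SplitN(line, " ", 2)
	if len(parts) == 2 {
		// The loader stores this as exceptions["geese"] = "goose".
		fmt.Printf("exceptions[%q] = %q\n", parts[0], parts[1])
	}
}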

@@ -324,27 +362,44 @@ func (h *Handle) Lookup(crit Criteria) ([]Lookup, error) {
 	if crit.Matching == "" {
 		return nil, fmt.Errorf("empty string passed as criteria to lookup")
 	}
+
 	searchStr := normalize(crit.Matching)
+
+	// Check if searchStr is a known plural exception
+	// if so, replace it with the singular form
+	if val, ok := h.exceptions[searchStr]; ok {
+		searchStr = val
+	}
+
 	clusters := h.index[searchStr]
-	found := []Lookup{}
-	for _, c := range clusters {
-		if len(crit.POS) > 0 {
-			satisfied := false
-			for _, p := range crit.POS {
-				if p == c.pos {
-					satisfied = true
+	if clusters == nil {
+		// Try to find a baseform (lemma) of the search string
+		for _, pos := range []PartOfSpeech{Noun, Verb, Adjective, Adverb} {
+			if base := h.MorphWord(searchStr, pos); base != "" {
+				clusters = h.index[base]
+				if clusters != nil {
 					break
 				}
 			}
+		}
+	}
+
+	found := []Lookup{}
+
+	for _, c := range clusters {
+		if len(crit.POS) > 0 {
+			satisfied := slices.Contains(crit.POS, c.pos)
 			if !satisfied {
 				continue
 			}
 		}
+
 		found = append(found, Lookup{
 			word:    crit.Matching,
 			cluster: c,
 		})
 	}
+
 	return found, nil
 }
 
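With the exception table and the MorphWord fallback wired into Lookup, inflected forms that are not index headwords can now resolve. A hedged usage sketch; the import path and data directory below are illustrative, not taken from this repository:

package main

import (
	"fmt"
	"log"

	wordnet "github.com/S0obi/wordnet" // illustrative import path
)

func main() {
	wn, err := wordnet.New("./data") // illustrative: directory holding the WordNet files and *.exc lists
	if err != nil {
		log.Fatal(err)
	}

	// "geese" is not a headword; the exception table rewrites it to "goose"
	// before the index lookup (MorphWord covers regular inflections instead).
	results, err := wn.Lookup(wordnet.Criteria{Matching: "geese"})
	if err != nil {
		log.Fatal(err)
	}

	for _, r := range results {
		fmt.Println(r.DumpStr())
	}
}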

@@ -353,13 +408,55 @@ func (h *Handle) Iterate(pos PartOfSpeechList, cb func(Lookup) error) error {
 		if !pos.Empty() && !pos.Contains(c.pos) {
 			continue
 		}
+
 		err := cb(Lookup{
 			word:    c.words[0].word,
 			cluster: c,
 		})
+
 		if err != nil {
 			return err
 		}
 	}
+
 	return nil
 }
+
+// wordbase removes a suffix from 'word' if it matches suffixes[ender], then appends plugalEndings[ender].
+func wordbase(word string, ender int) string {
+	copy := word
+	if strings.HasSuffix(copy, suffixes[ender]) {
+		// Remove the suffix
+		copy = copy[:len(copy)-len(suffixes[ender])]
+		// Append the pluralEndings string
+		copy += pluralEndings[ender]
+	}
+
+	return copy
+}
+
+// Try to find all possible baseforms (lemmas) of individual word in POS.
+func (h *Handle) MorphWord(word string, pos PartOfSpeech) string {
+	if pos == Adverb {
+		// Adverbs are not inflected in WordNet
+		return ""
+	} else if pos == Noun {
+		if strings.HasSuffix(word, "ful") {
+			return word[:len(word)-3]
+		} else if strings.HasSuffix(word, "ss") || len(word) <= 2 {
+			return ""
+		}
+	}
+
+	offset := offsets[int(pos)]
+	count := counts[int(pos)]
+
+	for i := range count {
+		retval := wordbase(word, offset+i)
+		if h.index[retval] != nil && retval != word {
+			return retval
+		}
+	}
+
+	return ""
+}
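MorphWord itself is exported, so callers can ask for a base form directly; offsets[pos] and counts[pos] select which block of detachment rules is tried, and a candidate is only returned when it exists in the index. A short sketch continuing the setup above, again with an illustrative import path and data directory, and assuming the corresponding headwords are present in the loaded index:

package main

import (
	"fmt"
	"log"

	wordnet "github.com/S0obi/wordnet" // illustrative import path
)

func main() {
	wn, err := wordnet.New("./data") // illustrative data directory
	if err != nil {
		log.Fatal(err)
	}

	// Noun rule "xes" -> "x" yields "box", assumed to be an index headword.
	fmt.Println(wn.MorphWord("boxes", wordnet.Noun)) // box

	// Nouns ending in "ss" and all adverbs are never reduced: empty result.
	fmt.Println(wn.MorphWord("glass", wordnet.Noun) == "")     // true
	fmt.Println(wn.MorphWord("quickly", wordnet.Adverb) == "") // true
}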

0 commit comments
