@@ -5,15 +5,16 @@ import (
 	"os"
 	"path"
 	"path/filepath"
+	"slices"
 	"strings"
-	"time"
 )

 // An initialized read-only, in-ram instance of the wordnet database.
 // May safely be shared by multiple threads of execution
 type Handle struct {
-	index map[string][]*cluster
-	db    []*cluster
+	index      map[string][]*cluster
+	db         []*cluster
+	exceptions map[string]string
 }

 // The results of a search against the wordnet database
@@ -53,24 +54,39 @@ type PartOfSpeech uint8
 // A set of multiple parts of speech
 type PartOfSpeechList []PartOfSpeech

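+// Morphological detachment rules, mirroring WordNet's Morphy heuristics:
+// stripping suffixes[i] from a word and appending pluralEndings[i] yields a
+// candidate base form. offsets and counts select the range of rules that
+// applies to each part of speech (noun, verb, adjective).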
+var suffixes = []string{
+	// Noun suffixes
+	"s", "ses", "xes", "zes", "ches", "shes", "men", "ies",
+	// Verb suffixes
+	"s", "ies", "es", "es", "ed", "ed", "ing", "ing",
+	// Adjective suffixes
+	"er", "est", "er", "est",
+}
+
+var pluralEndings = []string{
+	// Noun endings
+	"", "s", "x", "z", "ch", "sh", "man", "y",
+	// Verb endings
+	"", "y", "e", "", "e", "", "e", "",
+	// Adjective endings
+	"", "", "e", "e",
+}
+
+var offsets = []int{0, 8, 16}
+var counts = []int{8, 8, 4}
+
 func (l PartOfSpeechList) Empty() bool {
 	return len(l) == 0
 }

 func (l PartOfSpeechList) Contains(want PartOfSpeech) bool {
-	for _, got := range l {
-		if got == want {
-			return true
-		}
-	}
-	return false
+	return slices.Contains(l, want)
 }

 const (
 	Noun PartOfSpeech = iota
 	Verb
 	Adjective
-	// AdjectiveSatellite
 	Adverb
 )

@@ -154,12 +170,15 @@ func (w *Lookup) DumpStr() string {
 	s := fmt.Sprintf("Word: %s\n", w.String())
 	s += "Synonyms: "
 	words := []string{}
+
 	for _, w := range w.cluster.words {
 		words = append(words, w.word)
 	}
+
 	s += strings.Join(words, ", ") + "\n"
 	s += fmt.Sprintf("%d semantic relationships\n", len(w.cluster.relations))
 	s += "| " + w.cluster.gloss + "\n"
+
 	return s
 }

@@ -211,14 +230,15 @@ func (w *Lookup) Related(r Relation) (relationships []Lookup) {
 // Initialize a new in-ram WordNet database, reading files from the
 // specified directory.
 func New(dir string) (*Handle, error) {
-	cnt := 0
 	type ix struct {
 		index string
 		pos   PartOfSpeech
 	}
+
 	byOffset := map[ix]*cluster{}
+	exceptions := map[string]string{}
+
 	err := filepath.Walk(dir, func(filename string, info os.FileInfo, err error) error {
-		start := time.Now()
 		if err != nil || info.IsDir() {
 			return err
 		}
@@ -227,74 +247,92 @@ func New(dir string) (*Handle, error) {
 		if strings.HasPrefix(path.Base(filename), ".") || strings.HasSuffix(filename, "~") || strings.HasSuffix(filename, "#") {
 			return nil
 		}
-		// read only data files
-		if !strings.HasPrefix(path.Base(filename), "data") {
-			return nil
-		}

-		err = inPlaceReadLineFromPath(filename, func(data []byte, line, offset int64) error {
-			cnt++
-			if p, err := parseLine(data, line); err != nil {
-				return fmt.Errorf("%s", err)
-			} else if p != nil {
-				// first, let's identify the cluster
-				index := ix{p.byteOffset, p.pos}
-				c, ok := byOffset[index]
-				if !ok {
-					c = &cluster{}
-					byOffset[index] = c
-				}
-				// now update
-				c.pos = p.pos
-				c.words = p.words
-				c.gloss = p.gloss
-				c.debug = p.byteOffset
-
-				// now let's build relations
-				for _, r := range p.rels {
-					rindex := ix{r.offset, r.pos}
-					rcluster, ok := byOffset[rindex]
+		// read data files
+		if strings.HasPrefix(path.Base(filename), "data") {
+			err = inPlaceReadLineFromPath(filename, func(data []byte, line, offset int64) error {
+				if p, err := parseLine(data, line); err != nil {
+					return fmt.Errorf("%s", err)
+				} else if p != nil {
+					// first, let's identify the cluster
+					index := ix{p.byteOffset, p.pos}
+					c, ok := byOffset[index]
 					if !ok {
-						// create the other side of the relationship
-						rcluster = &cluster{}
-						byOffset[rindex] = rcluster
+						c = &cluster{}
+						byOffset[index] = c
 					}
-					if r.isSemantic {
-						c.relations = append(c.relations, semanticRelation{
-							rel:    r.rel,
-							target: rcluster,
-						})
-					} else {
-						if int(r.source) >= len(c.words) {
-							return fmt.Errorf("%s:%d: error parsing relations, bogus source (words: %d, offset: %d) [%s]", filename, line, r.source, len(c.words), string(data))
+
+					// now update
+					c.pos = p.pos
+					c.words = p.words
+					c.gloss = p.gloss
+					c.debug = p.byteOffset
+
+					// now let's build relations
+					for _, r := range p.rels {
+						rindex := ix{r.offset, r.pos}
+						rcluster, ok := byOffset[rindex]
+						if !ok {
+							// create the other side of the relationship
+							rcluster = &cluster{}
+							byOffset[rindex] = rcluster
+						}
+						if r.isSemantic {
+							c.relations = append(c.relations, semanticRelation{
+								rel:    r.rel,
+								target: rcluster,
+							})
+						} else {
+							if int(r.source) >= len(c.words) {
+								return fmt.Errorf("%s:%d: error parsing relations, bogus source (words: %d, offset: %d) [%s]", filename, line, r.source, len(c.words), string(data))
+							}
+							c.words[r.source].relations = append(c.words[r.source].relations, syntacticRelation{
+								rel:        r.rel,
+								target:     rcluster,
+								wordNumber: r.dest,
+							})
 						}
-						c.words[r.source].relations = append(c.words[r.source].relations, syntacticRelation{
-							rel:        r.rel,
-							target:     rcluster,
-							wordNumber: r.dest,
-						})
 					}
+
 				}
+				return nil
+			})
+
+			return err
+		}
+
+		// read exception files
+		if strings.HasSuffix(path.Base(filename), ".exc") {
+			err = inPlaceReadLineFromPath(filename, func(data []byte, line, offset int64) error {
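+				// Each line of a WordNet .exc file maps an irregular inflected form
+				// to its base form(s), e.g. "geese goose"; record it for Lookup.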
+				parts := strings.SplitN(string(data), " ", 2)
+				if len(parts) == 2 {
+					exceptions[parts[0]] = parts[1]
+				} else {
+					return fmt.Errorf("malformed exception line %d: %q", line, string(data))
+				}
+				return nil
+			})
+		}

-		}
-		return nil
-	})
-	fmt.Printf("%s in %s\n", filename, time.Since(start).String())
 		return err
 	})
+
 	if err != nil {
 		return nil, err
 	}

-	// now that we've built up the in-ram database, let's index it
 	h := Handle{
-		db:    make([]*cluster, 0, len(byOffset)),
-		index: make(map[string][]*cluster),
+		db:         make([]*cluster, 0, len(byOffset)),
+		index:      make(map[string][]*cluster),
+		exceptions: exceptions,
 	}
+
+	// now that we've built up the in-ram database, let's index it
 	for _, c := range byOffset {
 		if len(c.words) == 0 {
 			return nil, fmt.Errorf("ERROR, internal consistency error -> cluster without words %v", c)
 		}
+
 		// add to the global slice of synsets (supports iteration)
 		h.db = append(h.db, c)

@@ -324,27 +362,44 @@ func (h *Handle) Lookup(crit Criteria) ([]Lookup, error) {
 	if crit.Matching == "" {
 		return nil, fmt.Errorf("empty string passed as criteria to lookup")
 	}
+
 	searchStr := normalize(crit.Matching)
+
+	// Check whether searchStr is a known irregular inflection (from the .exc
+	// exception lists); if so, replace it with its base form.
+	if val, ok := h.exceptions[searchStr]; ok {
+		searchStr = val
+	}
+
 	clusters := h.index[searchStr]
-	found := []Lookup{}
-	for _, c := range clusters {
-		if len(crit.POS) > 0 {
-			satisfied := false
-			for _, p := range crit.POS {
-				if p == c.pos {
-					satisfied = true
+	if clusters == nil {
+		// Try to find a baseform (lemma) of the search string
+		for _, pos := range []PartOfSpeech{Noun, Verb, Adjective, Adverb} {
+			if base := h.MorphWord(searchStr, pos); base != "" {
+				clusters = h.index[base]
+				if clusters != nil {
 					break
 				}
 			}
+		}
+	}
+
+	found := []Lookup{}
+
+	for _, c := range clusters {
+		if len(crit.POS) > 0 {
+			satisfied := slices.Contains(crit.POS, c.pos)
 			if !satisfied {
 				continue
 			}
 		}
+
 		found = append(found, Lookup{
 			word:    crit.Matching,
 			cluster: c,
 		})
 	}
+
 	return found, nil
 }

@@ -353,13 +408,55 @@ func (h *Handle) Iterate(pos PartOfSpeechList, cb func(Lookup) error) error {
 		if !pos.Empty() && !pos.Contains(c.pos) {
 			continue
 		}
+
 		err := cb(Lookup{
 			word:    c.words[0].word,
 			cluster: c,
 		})
+
 		if err != nil {
 			return err
 		}
 	}
+
 	return nil
 }
+
+// wordbase removes a suffix from 'word' if it matches suffixes[ender], then appends pluralEndings[ender].
+func wordbase(word string, ender int) string {
+	copy := word
+	if strings.HasSuffix(copy, suffixes[ender]) {
+		// Remove the suffix
+		copy = copy[:len(copy)-len(suffixes[ender])]
+		// Append the pluralEndings string
+		copy += pluralEndings[ender]
+	}
+
+	return copy
+}
+
+// MorphWord tries to find a base form (lemma) of word for the given part of speech, returning "" if none is found.
+func (h *Handle) MorphWord(word string, pos PartOfSpeech) string {
+	if pos == Adverb {
+		// Adverbs are not inflected in WordNet
+		return ""
+	} else if pos == Noun {
+		if strings.HasSuffix(word, "ful") {
+			return word[:len(word)-3]
+		} else if strings.HasSuffix(word, "ss") || len(word) <= 2 {
+			return ""
+		}
+	}
+
+	offset := offsets[int(pos)]
+	count := counts[int(pos)]
+
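+	// Try each detachment rule registered for this part of speech and return
+	// the first candidate base form that actually exists in the index.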
+	for i := range count {
+		retval := wordbase(word, offset+i)
+		if h.index[retval] != nil && retval != word {
+			return retval
+		}
+	}
+
+	return ""
+}