Merge pull request #2 from thewh1teagle/feat/rename-dirty-to-lexicon

neurlang · web-flow · commit 9474c09b035f · 2025-03-29T19:02:41.000+01:00
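Rename the "dirty" dataset to "lexicon": the train_phonemizer2 flag -dirtytsv becomes -lexicontsv, and phonemizer_multi now reads lexicon.tsv instead of dirty.tsv. The remaining changes are gofmt formatting and import regrouping only.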
diff --git a/cmd/train_phonemizer2/main.go b/cmd/train_phonemizer2/main.go
@@ -1,31 +1,33 @@
 package main
 
 import (
+	"bytes"
 	"encoding/json"
+	"flag"
+	"fmt"
 	"io/ioutil"
-	"bytes"
-)
+	"math/rand"
+	"os"
+	"runtime"
+	"sync/atomic"
+	"time"
 
-import "os"
-import "sync/atomic"
-import "fmt"
-import "runtime"
-import "flag"
-import "math/rand"
-import "time"
+	"github.com/neurlang/classifier/datasets"
+	"github.com/neurlang/classifier/datasets/phonemizer"
+	"github.com/neurlang/classifier/hashtron"
+	"github.com/neurlang/classifier/layer/crossattention"
+	"github.com/neurlang/classifier/layer/sochastic"
+	"github.com/neurlang/classifier/layer/sum"
+	"github.com/neurlang/classifier/net/feedforward"
+	"github.com/neurlang/classifier/parallel"
+	"github.com/neurlang/quaternary"
+)
 
-import "github.com/neurlang/classifier/datasets/phonemizer"
 //import "github.com/neurlang/classifier/layer/majpool2d"
-import "github.com/neurlang/classifier/layer/sum"
-import "github.com/neurlang/classifier/layer/sochastic"
+
 //import "github.com/neurlang/classifier/layer/parity"
-import "github.com/neurlang/classifier/layer/crossattention"
-import "github.com/neurlang/classifier/datasets"
-import "github.com/neurlang/classifier/hashtron"
+
 //import "github.com/neurlang/classifier/learning"
-import "github.com/neurlang/quaternary"
-import "github.com/neurlang/classifier/net/feedforward"
-import "github.com/neurlang/classifier/parallel"
 
 func error_abs(a, b uint32) (out uint32) {
 	xor := a ^ b
@@ -62,7 +64,7 @@ func write_histogram(langjson string, histogram []string) {
 		fmt.Println("Error marshalling JSON:", err)
 		return
 	}
-	
+
 	updatedData = bytes.ReplaceAll(updatedData, []byte(`"],"`), []byte("\"],\n\""))
 
 	// Step 5: Write the updated JSON back to the file
@@ -75,7 +77,7 @@ func write_histogram(langjson string, histogram []string) {
 }
 
 func main() {
-	dirtytsv := flag.String("dirtytsv", "", "dirty tsv dataset for the language")
+	lexicontsv := flag.String("lexicontsv", "", "lexicon tsv dataset for the language")
 	learntsv := flag.String("learntsv", "", "learn tsv dataset for the language")
 	langjson := flag.String("langjson", "", "language.json for the language to write histogram")
 	premodulo := flag.Int("premodulo", 0, "premodulo")
@@ -92,7 +94,7 @@ func main() {
 
 	var improved_success_rate = 0
 
-	if dirtytsv == nil || *dirtytsv == "" {
+	if lexicontsv == nil || *lexicontsv == "" {
 		println("clean tsv is mandatory")
 		return
 	}
@@ -106,15 +108,15 @@ func main() {
 	}
 
 	histogram := phonemizer.NewHistogram(*learntsv, reverse != nil && *reverse)
-	
+
 	if langjson != nil && *langjson != "" {
 		write_histogram(*langjson, histogram)
 	}
-	
+
 	fmt.Println(histogram)
 
-	data := phonemizer.SplitAreg(phonemizer.NewDatasetAreg(*learntsv, *dirtytsv, reverse != nil && *reverse, histogram))
-	
+	data := phonemizer.SplitAreg(phonemizer.NewDatasetAreg(*learntsv, *lexicontsv, reverse != nil && *reverse, histogram))
+
 	if len(data) == 0 {
 		println("it looks like no data for this language, or language is unambiguous (no model needed)")
 		return
@@ -123,7 +125,7 @@ func main() {
 	const fanout1 = 16
 	const fanout2 = 2
 	const fanout3 = 3
-	
+
 	var net feedforward.FeedforwardNetwork
 	net.NewLayer(fanout1*fanout2, 0)
 	for i := 0; i < fanout3; i++ {
@@ -134,10 +136,9 @@ func main() {
 	}
 	net.NewCombiner(sochastic.MustNew(fanout1*fanout2, 32, fanout3))
 	net.NewLayer(fanout1*fanout2, 0)
-	net.NewCombiner(sum.MustNew([]uint{fanout1*fanout2}, 0))
+	net.NewCombiner(sum.MustNew([]uint{fanout1 * fanout2}, 0))
 	net.NewLayer(1, 0)
 
-
 	trainWorst := func(worst int) func() {
 		var tally = new(datasets.Tally)
 		tally.Init()
@@ -148,7 +149,7 @@ func main() {
 		if minpremodulo != nil && *minpremodulo > 0 && maxpremodulo != nil && *maxpremodulo > 0 {
 			const span = 50 * 50
 			value := (100 - improved_success_rate) * (100 - improved_success_rate)
-			premodulo := value * ( *minpremodulo - *maxpremodulo ) / span + *maxpremodulo
+			premodulo := value*(*minpremodulo-*maxpremodulo)/span + *maxpremodulo
 			//println(improved_success_rate, premodulo)
 			if premodulo < 2 {
 				premodulo = 2
@@ -161,17 +162,17 @@ func main() {
 			rand.Shuffle(len(data), func(i, j int) { data[i], data[j] = data[j], data[i] })
 			parts = *part
 		}
-		
+
 		parallel.ForEach(len(data)/parts, 1000, func(jjj int) {
 			{
-					var io = data[jjj]
-					
-					io.Dimension = fanout1
+				var io = data[jjj]
 
-					net.Tally4(&io, worst, tally, nil)
+				io.Dimension = fanout1
+
+				net.Tally4(&io, worst, tally, nil)
 			}
 		})
-		
+
 		if !tally.GetImprovementPossible() {
 			return nil
 		}
@@ -194,7 +195,7 @@ func main() {
 		tally.Free()
 		runtime.GC()
 
-		return func(){
+		return func() {
 			*ptr = backup
 		}
 	}
@@ -212,17 +213,17 @@ func main() {
 				io.Dimension = fanout1
 
 				var predicted = net.Infer2(&io) & 1
-						
+
 				h.MustPutUint16(j, predicted)
-				
+
 				if predicted == io.Output() {
 					percent.Add(1)
 				}
 				errsum.Add(uint64(error_abs(uint32(predicted), uint32(io.Output()))))
 			}
 		})
-		success := 100 * int(percent.Load()) / (len(data)/parts)
-		println("[success rate]", success, "%", "with", uint64(parts) * errsum.Load(), "errors")
+		success := 100 * int(percent.Load()) / (len(data) / parts)
+		println("[success rate]", success, "%", "with", uint64(parts)*errsum.Load(), "errors")
 
 		if dstmodel == nil || *dstmodel == "" {
 			err := net.WriteZlibWeightsToFile("output." + fmt.Sprint(success) + ".json.t.lzw")
diff --git a/datasets/phonemizer_multi/multi.go b/datasets/phonemizer_multi/multi.go
@@ -1,20 +1,20 @@
 package phonemizer_multi
 
-import "github.com/jbarham/primegen"
-import (
-	"github.com/neurlang/classifier/hash"
-	"encoding/json"
-	"sort"
-	"strconv"
-)
 import (
 	"bufio"
+	"encoding/json"
 	"fmt"
 	"os"
+	"sort"
+	"strconv"
 	"strings"
-	//"encoding/json"
+
+	"github.com/jbarham/primegen"
+	"github.com/neurlang/classifier/hash"
 )
 
+//"encoding/json"
+
 var Primes []uint32
 
 func init() {
@@ -45,15 +45,15 @@ func (t *Token) Len() int {
 
 func (s *Sample) V1(dim, pos int) SampleSentence {
 	return SampleSentence{
-		Sample: s,
-		position: pos,
+		Sample:    s,
+		position:  pos,
 		dimension: dim,
 	}
 }
 
 type SampleSentence struct {
-	Sample *Sample
-	position int
+	Sample    *Sample
+	position  int
 	dimension int
 }
 
@@ -66,13 +66,13 @@ func (s *SampleSentence) Len() int {
 
 type SampleSentenceIO struct {
 	SampleSentence *SampleSentence
-	choice int
+	choice         int
 }
 
 func (s *SampleSentence) IO(n int) (ret *SampleSentenceIO) {
 	return &SampleSentenceIO{
 		SampleSentence: s,
-		choice: n,
+		choice:         n,
 	}
 }
 
@@ -85,14 +85,14 @@ func (s *SampleSentenceIO) Feature(n int) (ret uint32) {
 	if s.Parity() == 1 {
 		ret = 1 << 31
 	}
-	if n % 3 == 0 {
-		for ; pos < len((s.SampleSentence.Sample.Sentence)); pos += (s.SampleSentence.dimension/3) {
+	if n%3 == 0 {
+		for ; pos < len((s.SampleSentence.Sample.Sentence)); pos += (s.SampleSentence.dimension / 3) {
 			ret += uint32(s.SampleSentence.Sample.Sentence[pos].Homograph)
 		}
 		return
 
 	}
-	for ; pos < len((s.SampleSentence.Sample.Sentence)); pos += (s.SampleSentence.dimension/3) {
+	for ; pos < len((s.SampleSentence.Sample.Sentence)); pos += (s.SampleSentence.dimension / 3) {
 		if pos < s.SampleSentence.position {
 			ret += uint32(s.SampleSentence.Sample.Sentence[pos].Solution)
 		} else if pos == s.SampleSentence.position {
@@ -113,7 +113,7 @@ func (s *SampleSentenceIO) Parity() (ret uint16) {
 	return uint16(len(s.SampleSentence.Sample.Sentence) & 1)
 }
 func (s *SampleSentenceIO) Output() (ret uint16) {
-	if (s.SampleSentence.Sample.Sentence[s.SampleSentence.position].Choices[s.choice][0] == s.SampleSentence.Sample.Sentence[s.SampleSentence.position].Solution) {
+	if s.SampleSentence.Sample.Sentence[s.SampleSentence.position].Choices[s.choice][0] == s.SampleSentence.Sample.Sentence[s.SampleSentence.position].Solution {
 		return 1
 	}
 	return 0
@@ -159,7 +159,6 @@ func loop(filename string, do func(string, string, string)) {
 	}
 }
 
-
 func addTags(bag map[uint32]string, tags ...string) map[uint32]string {
 	for _, v := range tags {
 		bag[hash.StringHash(0, v)] = v
@@ -202,13 +201,12 @@ func serializeTags(tags map[uint32]string) (key uint32, ret string) {
 	return
 }
 
-
 func NewDataset(dir string) (ret []Sample) {
 
 	var tags = make(map[uint32]string)
 	var m = make(map[string]map[string]uint32)
 
-	loop(dir + string(os.PathSeparator) + "dirty.tsv", func(src string, dst, tag string) {
+	loop(dir+string(os.PathSeparator)+"lexicon.tsv", func(src string, dst, tag string) {
 		if _, ok := m[src]; !ok {
 			m[src] = make(map[string]uint32)
 		}
@@ -232,7 +230,7 @@ func NewDataset(dir string) (ret []Sample) {
 		}
 	})
 
-	loop(dir + string(os.PathSeparator) + "multi.tsv", func(src string, dst, _ string) {
+	loop(dir+string(os.PathSeparator)+"multi.tsv", func(src string, dst, _ string) {
 		srcv := strings.Split(src, " ")
 		dstv := strings.Split(dst, " ")
 		if len(srcv) != len(dstv) {
@@ -249,7 +247,7 @@ func NewDataset(dir string) (ret []Sample) {
 				fmt.Println("ERROR: Word not in dict:", srcv[i], dstv[i])
 				t := Token{
 					Homograph: hash.StringHash(0, srcv[i]),
-					Solution: 0,
+					Solution:  0,
 				}
 				s.Sentence = append(s.Sentence, t)
 				continue
@@ -284,8 +282,8 @@ func NewDataset(dir string) (ret []Sample) {
 			}
 			t := Token{
 				Homograph: hash.StringHash(0, srcv[i]),
-				Solution: sol,
-				Choices: array,
+				Solution:  sol,
+				Choices:   array,
 			}
 			s.Sentence = append(s.Sentence, t)
 		}