Skip to content

Commit ab736f0

Browse files
Added substitution commands and moved gap commands
1 parent 460bba9 commit ab736f0

File tree

7 files changed

+225
-62
lines changed

7 files changed

+225
-62
lines changed

align/align.go

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ type Alignment interface {
4343
IterateAll(it func(name string, sequence []rune, comment string))
4444
NbSequences() int
4545
Length() int
46+
Mutate(rate float64) // Adds uniform substitutions in the alignment (~sequencing errors)
4647
ShuffleSequences()
4748
ShuffleSites(rate float64, roguerate float64) []string
4849
SimulateRogue(prop float64, proplen float64) ([]string, []string)
@@ -400,6 +401,40 @@ func (a *align) AddGaps(lenprop float64, prop float64) {
400401
}
401402
}
402403

404+
// Add substitutions uniformly to the alignment
405+
// if rate < 0 : does nothing
406+
// if rate > 1 : rate=1
407+
func (a *align) Mutate(rate float64) {
408+
if rate <= 0 {
409+
return
410+
}
411+
if rate > 1 {
412+
rate = 1
413+
}
414+
r := 0.0
415+
newchar := 0
416+
leng := a.Length()
417+
nb := a.NbSequences()
418+
// We take a random position in the sequences between min and max
419+
for i := 0; i < nb; i++ {
420+
seq := a.seqs[i]
421+
for j := 0; j < leng; j++ {
422+
r = rand.Float64()
423+
// We mutate only if rand is <= rate
424+
// taking a random nucleotide or amino acid uniformly
425+
if r <= rate {
426+
if a.Alphabet() == AMINOACIDS {
427+
newchar = rand.Intn(len(stdaminoacid))
428+
seq.sequence[j] = stdaminoacid[newchar]
429+
} else {
430+
newchar = rand.Intn(len(stdnucleotides))
431+
seq.sequence[j] = stdnucleotides[newchar]
432+
}
433+
}
434+
}
435+
}
436+
}
437+
403438
// Simulate rogue taxa in the alignment:
404439
// take the proportion prop of sequences as rogue taxa => R
405440
// For each t in R

cmd/addgaps.go

Lines changed: 10 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,38 +1,34 @@
11
package cmd
22

33
import (
4+
"math/rand"
5+
46
"github.com/spf13/cobra"
57
)
68

7-
var gaprate float64
89
var gapnbseqs float64
910

1011
// rogueCmd represents the rogue command
1112
var addgapsCmd = &cobra.Command{
1213
Use: "gaps",
13-
Short: "Adds random (uniformly) gaps",
14-
Long: `Adds random (uniformly) gaps on an input alignment.
14+
Short: "Adds gaps uniformly in an input alignment",
15+
Long: `Adds gaps uniformly in an input alignment.
1516
1617
Example:
17-
goalign shuffle gaps -i align.fa -n 0.5 -r 0.5
18-
18+
goalign mutate gaps -i align.fa -n 0.5 -r 0.5
1919
`,
2020
Run: func(cmd *cobra.Command, args []string) {
21-
22-
f := openWriteFile(shuffleOutput)
23-
nameFile := openWriteFile(rogueNameFile)
21+
rand.Seed(mutateSeed)
22+
f := openWriteFile(mutateOutput)
2423
for al := range rootaligns {
25-
al.AddGaps(gaprate, gapnbseqs)
24+
al.AddGaps(mutateRate, gapnbseqs)
2625
writeAlign(al, f)
2726
}
28-
nameFile.Close()
2927
f.Close()
3028
},
3129
}
3230

3331
func init() {
34-
shuffleCmd.AddCommand(addgapsCmd)
35-
36-
addgapsCmd.PersistentFlags().Float64VarP(&gapnbseqs, "prop-seq", "n", 0.5, "Proportion of the sequences to add gaps")
37-
addgapsCmd.PersistentFlags().Float64VarP(&gaprate, "gap-rate", "r", 0.5, "Proportion of gaps to add per sequences")
32+
mutateCmd.AddCommand(addgapsCmd)
33+
addgapsCmd.PersistentFlags().Float64VarP(&gapnbseqs, "prop-seq", "n", 0.5, "Proportion of the sequences in which to add gaps")
3834
}

cmd/gaps.go

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
package cmd
2+
3+
import (
4+
"math/rand"
5+
6+
"github.com/spf13/cobra"
7+
)
8+
9+
// mutateCmd represents the mutate command
10+
var gapsCmd = &cobra.Command{
11+
Use: "snvs",
12+
Short: "Adds substitutions uniformly in an input alignment",
13+
Long: `Adds substitutions uniformly in an input alignment.
14+
15+
if rate <= 0 : does nothing
16+
if rate > 1 : then rate = 1
17+
`,
18+
Run: func(cmd *cobra.Command, args []string) {
19+
rand.Seed(mutateSeed)
20+
f := openWriteFile(mutateOutput)
21+
for al := range rootaligns {
22+
al.Mutate(mutateRate)
23+
writeAlign(al, f)
24+
}
25+
f.Close()
26+
},
27+
}
28+
29+
func init() {
30+
mutateCmd.AddCommand(gapsCmd)
31+
}

cmd/mutate.go

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
package cmd
2+
3+
import (
4+
"time"
5+
6+
"github.com/spf13/cobra"
7+
)
8+
9+
var mutateOutput string
10+
var mutateRate float64
11+
var mutateSeed int64
12+
13+
// mutateCmd represents the mutate command
14+
var mutateCmd = &cobra.Command{
15+
Use: "mutate",
16+
Short: "Adds substitutions (~sequencing errors), or gaps, uniformly in an input alignment",
17+
Long: `Adds substitutions (~sequencing error), or gaps, uniformly in an input alignment.
18+
`,
19+
}
20+
21+
func init() {
22+
RootCmd.AddCommand(mutateCmd)
23+
mutateCmd.PersistentFlags().Float64VarP(&mutateRate, "rate", "r", 0.1, "Mutation rate per nucleotide/amino acid")
24+
mutateCmd.PersistentFlags().StringVarP(&mutateOutput, "output", "o", "stdout", "Mutated alignment output file")
25+
mutateCmd.PersistentFlags().Int64VarP(&mutateSeed, "seed", "s", time.Now().UTC().UnixNano(), "Initial Random Seed")
26+
}

docs/commands/mutate.md

Lines changed: 120 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,120 @@
1+
# Goalign: toolkit and api for alignment manipulation
2+
3+
## Commands
4+
5+
### mutate
6+
This command adds different type of noises in an input alignment, with these sub-commands:
7+
* `goalign mutate gaps` : Adds a given proportion of gaps to a given proportion of the sequences randomly in the input alignment (uniformly).
8+
* `goalign mutate snvs`: Substitute nucleotides/aminoacids by random (uniform) nucleotides/aminoacids with a given rate.
9+
10+
#### Usage
11+
* General command:
12+
```
13+
Usage:
14+
goalign mutate [command]
15+
16+
Available Commands:
17+
gaps Adds gaps uniformly in an input alignment
18+
snvs Adds substitutions uniformly in an input alignment
19+
20+
Flags:
21+
-h, --help help for mutate
22+
-o, --output string Mutated alignment output file (default "stdout")
23+
-r, --rate float Mutation rate per nucleotide/amino acid (default 0.1)
24+
-s, --seed int Initial Random Seed (default 1497950022161439493)
25+
26+
Global Flags:
27+
-i, --align string Alignment input file (default "stdin")
28+
-p, --phylip Alignment is in phylip? False=Fasta
29+
```
30+
31+
* gaps command:
32+
```
33+
Usage:
34+
goalign shuffle gaps [flags]
35+
36+
Flags:
37+
-r, --gap-rate float Proportion of gaps to add per sequences (default 0.5)
38+
-n, --prop-seq float Proportion of the sequences to add gaps in (default 0.5)
39+
40+
Global Flags:
41+
-i, --align string Alignment input file (default "stdin")
42+
-o, --output string Shuffled alignment output file (default "stdout")
43+
-p, --phylip Alignment is in phylip? False=Fasta
44+
-s, --seed int Initial Random Seed (default: number of nanoseconds elapsed since January 1, 1970 UTC)
45+
```
46+
47+
* snvs command:
48+
```
49+
Usage:
50+
goalign shuffle recomb [flags]
51+
52+
Flags:
53+
-l, --prop-length float Proportion of length of sequences to recombine (default 0.5)
54+
-n, --prop-seq float Proportion of the sequences to recombine (default 0.5)
55+
56+
Global Flags:
57+
-i, --align string Alignment input file (default "stdin")
58+
-o, --output string Shuffled alignment output file (default "stdout")
59+
-p, --phylip Alignment is in phylip? False=Fasta
60+
-s, --seed int Initial Random Seed (default 1495021613485404707)
61+
```
62+
63+
#### Examples
64+
* Generating a random (uniform) alignment and adding 20% gaps to 50% of the sequences:
65+
```
66+
goalign random -l 20 -s10| goalign mutate gaps -n 0.5 -r 0.2 -s10
67+
```
68+
69+
Should give:
70+
```
71+
>Seq0000
72+
GATTAATTTGCCGTAGGCCA
73+
>Seq0001
74+
G-ATCTGAAGA-CG-A-ACT
75+
>Seq0002
76+
TTAAGTTTT-AC--CTAA-G
77+
>Seq0003
78+
GAGAGGACTAGTTCATACTT
79+
>Seq0004
80+
TT-AAACA-TTTTA-A-CGA
81+
>Seq0005
82+
TGTCGGACCTAAGTATTGAG
83+
>Seq0006
84+
TAC-A-G-TGTATT-CAGCG
85+
>Seq0007
86+
GTGGAGAGGTCTATTTTTCC
87+
>Seq0008
88+
GGTTGAAG-ACT-TA-AGC-
89+
>Seq0009
90+
GTAAAGGGTATGGCCATGTG
91+
```
92+
93+
* Generating a random (uniform) alignment and adding 10% substitutions :
94+
```
95+
goalign random -l 20 -s10| goalign mutate snvs -r 0.1 -s 10
96+
```
97+
98+
Should give:
99+
```
100+
>Seq0000
101+
GATTAATTTCCCGTAGGCCA
102+
>Seq0001
103+
GAATCTGAATATCGAACTAT
104+
>Seq0002
105+
TTAAGTTTTCACTTCTAATG
106+
>Seq0003
107+
GAGAGGACTAGTTCATAATT
108+
>Seq0004
109+
TTTTAACACTTTTACATCGA
110+
>Seq0005
111+
TGTCGGACCTAAGTTTTGTG
112+
>Seq0006
113+
TGCAACGATGTACTCCAGCG
114+
>Seq0007
115+
GTGGAGAGGTCTATTTTTGC
116+
>Seq0008
117+
GGTTAAAGGACTCTATAGCT
118+
>Seq0009
119+
GAAAAGGGTATGGCCATGTG
120+
```

docs/commands/shuffle.md

Lines changed: 0 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@
44

55
### shuffle
66
This command adds different type of noises in an input alignment, with these sub-commands:
7-
* `goalign shuffle gaps` : Adds a given proportion of gaps to a given proportion of the sequences randomly in the input alignment (uniformly).
87
* `goalign shuffle recomb`:Recombine a given proportion of the length of a given proportion of the sequences with other sequences (copy/paste).
98
Example:
109
```
@@ -40,7 +39,6 @@ Usage:
4039
goalign shuffle [command]
4140
4241
Available Commands:
43-
gaps Adds random (uniformly) gaps
4442
recomb Recombine sequences in the input alignment
4543
rogue Simulate rogue taxa
4644
seqs Shuffles sequence order in alignment
@@ -57,22 +55,6 @@ Global Flags:
5755
-p, --phylip Alignment is in phylip? False=Fasta
5856
```
5957

60-
* gaps command:
61-
```
62-
Usage:
63-
goalign shuffle gaps [flags]
64-
65-
Flags:
66-
-r, --gap-rate float Proportion of gaps to add per sequences (default 0.5)
67-
-n, --prop-seq float Proportion of the sequences to add gaps in (default 0.5)
68-
69-
Global Flags:
70-
-i, --align string Alignment input file (default "stdin")
71-
-o, --output string Shuffled alignment output file (default "stdout")
72-
-p, --phylip Alignment is in phylip? False=Fasta
73-
-s, --seed int Initial Random Seed (default: number of nanoseconds elapsed since January 1, 1970 UTC)
74-
```
75-
7658
* recomb command:
7759
```
7860
Usage:
@@ -152,35 +134,6 @@ Global Flags:
152134
```
153135

154136
#### Examples
155-
* Generating a random (uniform) alignment and adding gaps:
156-
```
157-
goalign random -l 20 -s10| goalign shuffle gaps -n 0.5 -r 0.2 -s10
158-
```
159-
160-
Should give:
161-
```
162-
>Seq0000
163-
GATTAATTTGCCGTAGGCCA
164-
>Seq0001
165-
G-ATCTGAAGA-CG-A-ACT
166-
>Seq0002
167-
TTAAGTTTT-AC--CTAA-G
168-
>Seq0003
169-
GAGAGGACTAGTTCATACTT
170-
>Seq0004
171-
TT-AAACA-TTTTA-A-CGA
172-
>Seq0005
173-
TGTCGGACCTAAGTATTGAG
174-
>Seq0006
175-
TAC-A-G-TGTATT-CAGCG
176-
>Seq0007
177-
GTGGAGAGGTCTATTTTTCC
178-
>Seq0008
179-
GGTTGAAG-ACT-TA-AGC-
180-
>Seq0009
181-
GTAAAGGGTATGGCCATGTG
182-
```
183-
184137
* Recombining 0.25% of the sequences on 0.5% of their length
185138
```
186139
# Writing alignment

docs/index.md

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,9 @@ Command | Subcommand |
4646
-- | pssm | Computes and prints a Position specific scoring matrix
4747
[concat](commands/concat.md) ([api](api/concat.md)) | | Concatenates a set of alignment
4848
[divide](commands/divide.md) ([api](api/divide.md)) | | Divide an input alignment in several output files
49+
[mutate](commands/mutate.md) ([api](api/mutate.md)) | | Adds substitutions (~sequencing errors), or gaps, uniformly in an input alignment
50+
-- | gaps | Adds gaps uniformly in an input alignment
51+
-- | snvs | Adds substitutions uniformly in an input alignment
4952
[random](commands/random.md) ([api](api/random.md)) | | Generate random sequences
5053
[reformat](commands/reformat.md) ([api](api/reformat.md)) | | Reformats input alignment into phylip of fasta format
5154
-- | fasta | Reformats an input alignment into Fasta
@@ -57,7 +60,6 @@ Command | Subcommand |
5760
-- | seqs | Samples a subset of sequences from the input alignment
5861
-- | sites | Take a random subalignment
5962
[shuffle](commands/shuffle.md) ([api](api/shuffle.md)) | | A set of commands to shuffle an alignment
60-
-- | gaps | Adds random (uniformly) gaps on an input alignment
6163
-- | recomb | Recombines sequences in the input alignment (copy/paste)
6264
-- | rogue | Simulates rogue taxa
6365
-- | seqs | Shuffles sequence order in alignment

0 commit comments

Comments
 (0)