Skip to content

Commit da05073

Browse files
committed
ENH Make split_contigs into a subcommand
1 parent b3c7d5b commit da05073

File tree

4 files changed

+91
-69
lines changed

4 files changed

+91
-69
lines changed

SemiBin/main.py

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -102,6 +102,29 @@ def parse_args(args, is_semibin2):
102102
default=0,
103103
dest='min_length')
104104

105+
split_contigs = subparsers.add_parser('split_contigs',
106+
help = 'Split contigs to generate data (only for strobealign-aemb pipeline)')
107+
108+
split_contigs.add_argument('-i', '--input-fasta',
109+
required=True,
110+
help='Path to the input fasta file.',
111+
dest='contig_fasta',
112+
default=None, )
113+
114+
split_contigs.add_argument('-o', '--output',
115+
required=True,
116+
help='Output directory (will be created if non-existent)',
117+
dest='output',
118+
default=None
119+
)
120+
121+
split_contigs.add_argument('-m', '--min-len',
122+
required=False,
123+
type=int,
124+
help='Discard sequences below this length (default:0)',
125+
default=0,
126+
dest='min_length')
127+
105128
generate_sequence_features_single.add_argument('--kmer',
106129
required=False,
107130
help='Just output data.csv with k-mer features.',
@@ -1360,6 +1383,26 @@ def multi_easy_binning(logger, args, device):
13601383
new_path = os.path.join(args.output, 'bins', new_file)
13611384
shutil.copyfile(original_path, new_path)
13621385

1386+
1387+
def split_contigs(logger, contig_fasta, *, output, min_length):
1388+
from .utils import possibly_compressed_write
1389+
os.makedirs(output, exist_ok=True)
1390+
1391+
oname = f"{output}/split_contigs.fna.gz"
1392+
with possibly_compressed_write(oname) as ofile:
1393+
for h, seq in fasta_iter(contig_fasta):
1394+
if len(seq) < min_length:
1395+
continue
1396+
half = len(seq) // 2
1397+
h1 = h + "_1"
1398+
seq1 = seq[:half]
1399+
h2 = h + "_2"
1400+
seq2 = seq[half:]
1401+
ofile.write(f">{h1}\n{seq1}\n")
1402+
ofile.write(f">{h2}\n{seq2}\n")
1403+
return oname
1404+
1405+
13631406
def main2(args=None, is_semibin2=True):
13641407
import tempfile
13651408

@@ -1573,6 +1616,11 @@ def main2(args=None, is_semibin2=True):
15731616
from .utils import concatenate_fasta
15741617
ofname = concatenate_fasta(args.contig_fasta, args.min_length, args.output, args.separator, args.output_compression)
15751618
logger.info(f'Concatenated contigs written to {ofname}')
1619+
elif args.cmd == 'split_contigs':
1620+
if not is_semibin2:
1621+
logger.error('Command `split_contigs` is not available in SemiBin1. Please upgrade to SemiBin2.')
1622+
oname = split_contigs(logger, args.contig_fasta, output=args.output, min_length=args.min_length)
1623+
logger.info(f'Split contigs written to {oname}')
15761624

15771625
else:
15781626
logger.error(f'Could not handle subcommand {arg.cmd}')

docs/usage.md

Lines changed: 13 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -426,22 +426,22 @@ Strobealign-aemb is a fast abundance estimation method for metagenomic binning.
426426
As strobealign-aemb can not provide the mapping information for every position of the contig, so we can not run SemiBin2 with strobealign-aemb in binning modes where samples used smaller 5 and need to split the contigs to generate the must-link constratints.
427427

428428

429-
1. Split the fasta files
429+
1. Split the fasta files using the `split_contigs` subcommand:
430430
```bash
431-
python script/generate_split.py -c contig.fa -o output
431+
SemiBin2 split_contigs -i contig.fa -o output
432432
```
433-
2. Map reads using [strobealign-aemb](https://github.com/ksahlin/strobealign) to generate the abundance information
433+
2. Map reads using [strobealign-aemb](https://github.com/ksahlin/strobealign) to generate the abundance information. Note that version 0.13 (or newer) is required
434434
```bash
435-
strobealign --aemb output/split.fa read1_1.fq read1_2.fq -R 6 > sample1.txt
436-
strobealign --aemb output/split.fa read2_1.fq read2_2.fq -R 6 > sample2.txt
437-
strobealign --aemb output/split.fa read3_1.fq read3_2.fq -R 6 > sample3.txt
438-
strobealign --aemb output/split.fa read4_1.fq read4_2.fq -R 6 > sample4.txt
439-
strobealign --aemb output/split.fa read5_1.fq read5_2.fq -R 6 > sample5.txt
435+
strobealign --aemb output/split_contigs.fna.gz read1_1.fq read1_2.fq -R 6 > sample1.tsv
436+
strobealign --aemb output/split_contigs.fna.gz read2_1.fq read2_2.fq -R 6 > sample2.tsv
437+
strobealign --aemb output/split_contigs.fna.gz read3_1.fq read3_2.fq -R 6 > sample3.tsv
438+
strobealign --aemb output/split_contigs.fna.gz read4_1.fq read4_2.fq -R 6 > sample4.tsv
439+
strobealign --aemb output/split_contigs.fna.gz read5_1.fq read5_2.fq -R 6 > sample5.tsv
440440
```
441-
3. Run SemiBin2 (like running SemiBin with BAM files)
441+
3. Run SemiBin2 (running SemiBin with BAM files)
442442
```bash
443-
SemiBin2 generate_sequence_features_single -i contig.fa -a *.txt -o output
444-
SemiBin2 generate_sequence_features_multi -i contig.fa -a *.txt -s : -o output
445-
SemiBin2 single_easy_bin -i contig.fa -a *.txt -o output
446-
SemiBin2 multi_easy_bin i contig.fa -a *.txt -s : -o output
443+
SemiBin2 generate_sequence_features_single -i contig.fa -a *.tsv -o output
444+
SemiBin2 generate_sequence_features_multi -i contig.fa -a *.tsv -s : -o output
445+
SemiBin2 single_easy_bin -i contig.fa -a *.tsv -o output
446+
SemiBin2 multi_easy_bin i contig.fa -a *.tsv -s : -o output
447447

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
from os import path
2+
import subprocess
3+
import tempfile
4+
import gzip
5+
6+
tmpdir = '/tmp/SemiBin2'
7+
with tempfile.TemporaryDirectory() as tmpdir:
8+
with open(f'{tmpdir}/S1.fna', 'w') as f:
9+
f.write('>C1\n')
10+
for i in range(1000):
11+
f.write('ATCG')
12+
f.write('\n')
13+
f.write('>C2\n')
14+
for i in range(100):
15+
f.write('GTAA')
16+
subprocess.check_call(
17+
['SemiBin2', 'split_contigs',
18+
'-i', f'{tmpdir}/S1.fna',
19+
'-o', f'{tmpdir}/output',
20+
'--min-len', '1000',])
21+
assert path.exists(f'{tmpdir}/output/split_contigs.fna.gz')
22+
23+
with gzip.open(f'{tmpdir}/output/split_contigs.fna.gz', 'rt') as f:
24+
assert f.readline() == '>C1_1\n'
25+
part1 = f.readline()
26+
assert f.readline() == '>C1_2\n'
27+
part2 = f.readline()
28+
assert len(part1) + len(part2) == 4002 # 4000 + 2 newlines
29+
assert not f.read(1)
30+

script/generate_split.py

Lines changed: 0 additions & 56 deletions
This file was deleted.

0 commit comments

Comments
 (0)