Skip to content

Commit 67dc748

Browse files
committed
added circular sequence fetching script for newer spades outputs
1 parent a424c4f commit 67dc748

File tree

6 files changed

+103
-9
lines changed

6 files changed

+103
-9
lines changed

get_simple_cycs.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -62,17 +62,20 @@ def parse_user_input():
6262
cycs = set([])
6363
fp = open(fastg_name, 'r')
6464

65-
simple_cycs_ofile = fp.name.replace(".fastg", ".simple_cycs.fasta")
65+
simple_cycs_ofile = fp.name.replace(".fastg", ".simple.cycs.fasta")
6666
f_cycs_out = open(simple_cycs_ofile, 'w')
67-
filt_graph_ofile = fp.name.replace(".fastg", ".simple_cycs_filtered.fastg")
67+
filt_graph_ofile = fp.name.replace(".fastg", ".simple.cycs_filtered.fastg")
6868
f_graph_out = open(filt_graph_ofile, 'w')
6969

7070

7171
#### potential issue: output fastg
7272
#### may contain links to nodes that have been removed
7373
#### may be potential hazard downstream
74-
74+
cnt = 0
7575
for name,seq,qual in readfq(fp):
76+
cnt+=1
77+
if cnt%2!=1:
78+
continue
7679
orig_name = name
7780
name = re.sub('[:,]'," ", name[:-1])
7881
name_parts = name.split(" ")

paper/get_spades_cyc_rr_contigs.py

Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,91 @@
1+
import argparse, sys, re, os
2+
sys.path.insert(0, '../recycle/')
3+
from recycle.utils import *
4+
import numpy as np
5+
6+
7+
def parse_user_input():
8+
parser = argparse.ArgumentParser(description='gets cycles from spades contigs output. Outputs these to separate fasta file')
9+
parser.add_argument('-g','--graph', help='Input (SPAdes 3.60+) FASTG to process', required=True, type=str)
10+
parser.add_argument('-p','--paths', help='Input (SPAdes 3.60+) contigs.paths file to process', required=True, type=str)
11+
parser.add_argument('-s','--sequences', help='Input (SPAdes 3.60+) contigs.fasta sequences to process', required=True, type=str)
12+
parser.add_argument('-m','--length', help='Minimum cycle length to keep (shorter cycles put in new graph file; default = 1000)',
13+
required=False, type=int, default=1000)
14+
parser.add_argument('-k','--max_k', help='integer reflecting maximum k value used by the assembler',
15+
required=True, type=int, default=55)
16+
return parser.parse_args()
17+
18+
def get_cyc_labels_from_paths_file(paths):
19+
""" scans lines of paths file
20+
when length greater than 2 and
21+
first node same as last, add label of path
22+
to returned set; add only forward strand versions
23+
"""
24+
labels = set({})
25+
fp = open(paths, 'r')
26+
lines = fp.read()
27+
fp.close()
28+
lines = re.sub(';\n',",", lines)
29+
lines = lines.split("\n")
30+
last_label = ''
31+
for ind in range(len(lines)):
32+
line = lines[ind].rstrip()
33+
if (ind % 4 == 2 or ind % 4 == 3):
34+
continue
35+
if ind%4==0:
36+
last_label = line
37+
else:
38+
path = line.split(",")
39+
if len(path)>1 and path[0]==path[-1]:
40+
labels.add(last_label)
41+
return labels
42+
43+
def update_paths_dict(paths_dict, labels, seqs):
44+
fs = open(seqs, 'r')
45+
for name,seq,qual in readfq(fs):
46+
if name in labels:
47+
paths_dict[name] = seq
48+
return seqs
49+
50+
51+
52+
args = parse_user_input()
53+
fastg = args.graph
54+
paths = args.paths
55+
seqs = args.sequences
56+
min_length = args.length
57+
max_k = args.max_k
58+
fg = open(fastg, 'r')
59+
files_dir = os.path.dirname(fg.name)
60+
61+
G = get_fastg_digraph(fastg)
62+
SEQS = get_fastg_seqs_dict(fastg,G)
63+
long_self_loops = get_long_self_loops(G, min_length, SEQS)
64+
final_paths_dict = {}
65+
path_count = 0
66+
67+
for nd in long_self_loops:
68+
name = get_spades_type_name(path_count, nd,
69+
SEQS, G, get_cov_from_spades_name(nd[0]))
70+
final_paths_dict[name] = nd
71+
path_count += 1
72+
73+
path_labels_set = get_cyc_labels_from_paths_file(paths)
74+
update_paths_dict(final_paths_dict, path_labels_set, seqs)
75+
76+
# output - fasta of cyc sequences
77+
fp = open(paths, 'r')
78+
(root,ext) = os.path.splitext(fp.name)
79+
fasta_ofile = root + ext.replace(".paths", ".cycs.fasta")
80+
f_cycs_fasta = open(fasta_ofile, 'w')
81+
82+
for p in final_paths_dict.keys():
83+
if p not in path_labels_set:
84+
# cycle = False b/c want last k-mer to match first
85+
seq = get_seq_from_path(final_paths_dict[p], SEQS, max_k_val=max_k, cycle=False)
86+
else:
87+
seq = final_paths_dict[p]
88+
#
89+
# print ""
90+
if len(seq)>=min_length:
91+
f_cycs_fasta.write(">" + p + "\n" + seq + "\n")

paper/nucmer.sim.mk

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,14 @@
11
# creates a report on the number of plasmids reported
22
# and their alignment relative to some reference
33
REF_CNT = 100
4-
CV = 0.25
4+
CV = 0.5
55
PARENT_DIR = /home/nasheran/rozovr/recycle_paper_data
66
INPUT_DIR = $(PARENT_DIR)/ref_$(REF_CNT)
77
# TWO_STEP_ASSEM = 0
88
# REPEAT_RES_1 = 0
99
# REPEAT_RES_2 = 0
1010
REF = $(PARENT_DIR)/plasmids_sim_ref_$(REF_CNT).cln.fasta
11-
INPUT = $(INPUT_DIR)/assembly_graph
11+
INPUT = $(INPUT_DIR)/contigs
1212
# ifeq ($(REPEAT_RES_1),0)
1313
# INPUT1 = before_rr
1414
# else
@@ -36,7 +36,7 @@ all: $(INPUT).cycs.dbl.fasta $(INPUT).nucmer.delta $(INPUT).nucmer$(CV).summary
3636

3737
# 1) concat output cycles to self:
3838
$(INPUT).cycs.dbl.fasta: $(INPUT).cycs.fasta
39-
python ~/recycle/paper/concat_seq_to_self.py -i $(INPUT).cycs.fasta -m 0
39+
python ~/recycle/paper/concat_seq_to_self.py -i $(INPUT).cycs.fasta -m 55
4040

4141
# 2) run nucmer
4242
$(INPUT).nucmer.delta: $(INPUT).cycs.dbl.fasta

paper/tally_hits.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,9 @@
1+
import argparse, sys
2+
sys.path.insert(0, '../recycle/')
13
from recycle.utils import *
2-
import argparse
34
import numpy as np
45

6+
57
def count_property_range_hits(prop, node_dict, hits):
68
""" picks which values to use in tuples based on property
79
counts vals having min_val <= val < max_val

recycle/utils.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -80,11 +80,9 @@ def get_fastg_seqs_dict(fastg_name, G):
8080
"""
8181
fp = open(fastg_name, 'r')
8282
seqs = {}
83-
# nodes = G.nodes()
8483
for name,seq,qual in readfq(fp):
8584
name_parts = re.sub('[:,]'," ", name[:-1]).split()
8685
node = name_parts[0]
87-
# if node in nodes:
8886
seqs[node] = seq
8987
return seqs
9088

recycle/utils.pyc

0 Bytes
Binary file not shown.

0 commit comments

Comments
 (0)