|
| 1 | +import argparse, sys, re, os |
| 2 | +sys.path.insert(0, '../recycle/') |
| 3 | +from recycle.utils import * |
| 4 | +import numpy as np |
| 5 | + |
| 6 | + |
| 7 | +def parse_user_input(): |
| 8 | + parser = argparse.ArgumentParser(description='gets cycles from spades contigs output. Outputs these to separate fasta file') |
| 9 | + parser.add_argument('-g','--graph', help='Input (SPAdes 3.60+) FASTG to process', required=True, type=str) |
| 10 | + parser.add_argument('-p','--paths', help='Input (SPAdes 3.60+) contigs.paths file to process', required=True, type=str) |
| 11 | + parser.add_argument('-s','--sequences', help='Input (SPAdes 3.60+) contigs.fasta sequences to process', required=True, type=str) |
| 12 | + parser.add_argument('-m','--length', help='Minimum cycle length to keep (shorter cycles put in new graph file; default = 1000)', |
| 13 | + required=False, type=int, default=1000) |
| 14 | + parser.add_argument('-k','--max_k', help='integer reflecting maximum k value used by the assembler', |
| 15 | + required=True, type=int, default=55) |
| 16 | + return parser.parse_args() |
| 17 | + |
| 18 | +def get_cyc_labels_from_paths_file(paths): |
| 19 | + """ scans lines of paths file |
| 20 | + when length greater than 2 and |
| 21 | + first node same as last, add label of path |
| 22 | + to returned set; add only forward strand versions |
| 23 | + """ |
| 24 | + labels = set({}) |
| 25 | + fp = open(paths, 'r') |
| 26 | + lines = fp.read() |
| 27 | + fp.close() |
| 28 | + lines = re.sub(';\n',",", lines) |
| 29 | + lines = lines.split("\n") |
| 30 | + last_label = '' |
| 31 | + for ind in range(len(lines)): |
| 32 | + line = lines[ind].rstrip() |
| 33 | + if (ind % 4 == 2 or ind % 4 == 3): |
| 34 | + continue |
| 35 | + if ind%4==0: |
| 36 | + last_label = line |
| 37 | + else: |
| 38 | + path = line.split(",") |
| 39 | + if len(path)>1 and path[0]==path[-1]: |
| 40 | + labels.add(last_label) |
| 41 | + return labels |
| 42 | + |
| 43 | +def update_paths_dict(paths_dict, labels, seqs): |
| 44 | + fs = open(seqs, 'r') |
| 45 | + for name,seq,qual in readfq(fs): |
| 46 | + if name in labels: |
| 47 | + paths_dict[name] = seq |
| 48 | + return seqs |
| 49 | + |
| 50 | + |
| 51 | + |
| 52 | +args = parse_user_input() |
| 53 | +fastg = args.graph |
| 54 | +paths = args.paths |
| 55 | +seqs = args.sequences |
| 56 | +min_length = args.length |
| 57 | +max_k = args.max_k |
| 58 | +fg = open(fastg, 'r') |
| 59 | +files_dir = os.path.dirname(fg.name) |
| 60 | + |
| 61 | +G = get_fastg_digraph(fastg) |
| 62 | +SEQS = get_fastg_seqs_dict(fastg,G) |
| 63 | +long_self_loops = get_long_self_loops(G, min_length, SEQS) |
| 64 | +final_paths_dict = {} |
| 65 | +path_count = 0 |
| 66 | + |
| 67 | +for nd in long_self_loops: |
| 68 | + name = get_spades_type_name(path_count, nd, |
| 69 | + SEQS, G, get_cov_from_spades_name(nd[0])) |
| 70 | + final_paths_dict[name] = nd |
| 71 | + path_count += 1 |
| 72 | + |
| 73 | +path_labels_set = get_cyc_labels_from_paths_file(paths) |
| 74 | +update_paths_dict(final_paths_dict, path_labels_set, seqs) |
| 75 | + |
| 76 | +# output - fasta of cyc sequences |
| 77 | +fp = open(paths, 'r') |
| 78 | +(root,ext) = os.path.splitext(fp.name) |
| 79 | +fasta_ofile = root + ext.replace(".paths", ".cycs.fasta") |
| 80 | +f_cycs_fasta = open(fasta_ofile, 'w') |
| 81 | + |
| 82 | +for p in final_paths_dict.keys(): |
| 83 | + if p not in path_labels_set: |
| 84 | + # cycle = False b/c want last k-mer to match first |
| 85 | + seq = get_seq_from_path(final_paths_dict[p], SEQS, max_k_val=max_k, cycle=False) |
| 86 | + else: |
| 87 | + seq = final_paths_dict[p] |
| 88 | + # |
| 89 | + # print "" |
| 90 | + if len(seq)>=min_length: |
| 91 | + f_cycs_fasta.write(">" + p + "\n" + seq + "\n") |
0 commit comments