Merge pull request #6 from xzt41/master

xzt41 · web-flow · commit fd8f8cea2bc9 · 2018-03-30T23:46:44.000-04:00
update to 1.2.10
diff --git a/ChangeLog.rst b/ChangeLog.rst
@@ -4,6 +4,12 @@ Changes
 todo in future versions:
 - Supporting for *de novo* GTF file (no annotated start and stop codons).
 
+v1.2.10 (2018.3.25)
+-------------------
+- Add "-i" option to metaplots command for multiple input files
+- Add support for non-stranded library sequencing
+- Update the documentation
+
 v1.2.9 (2018.3.5)
 -----------------
 - Small bug fixed in metaplots.
@@ -35,7 +41,6 @@ v1.2.5 (2017.5.12)
 v1.2.4 (2017.5.4)
 -----------------
 - Add support for outputting ORF results in gtf format.
-- Add support for outputting ORF results in gtf format.
 - Fix a bug where some ORFs' genomic coordinates are wrong.
 - Other optimizations on code and documents.
 
diff --git a/README.rst b/README.rst
@@ -159,6 +159,8 @@ Please make sure the path and file name are correct.
       STAR --runThreadN 8 --runMode genomeGenerate --genomeDir <hg19_STARindex>
       --genomeFastaFiles <hg19_genome.fa> --sjdbGTFfile <gencode.v19.annotation.gtf>
 
+   .. _STAR:
+
    (2). Alignment:
 
    .. code-block:: bash
@@ -186,17 +188,22 @@ Please make sure the path and file name are correct.
    This step will generate two files: a PDF file which plots the aggregate profiles of the distance from the 5'-end
    of reads to the annotated start codons (or stop codons). The P-site config file, which defines the read lengths with
    strong 3-nt periodicity and the associated P-site locations for each length. Users can check the P-site periodicity
-   or modify this file according the plots in PDF file.
+   or modify this file according to the plots in the PDF file. If you have multiple BAM files to be used together for
+   ORF prediction in the next step, use the "-i" argument to specify a text file that contains the names of these BAM
+   files (one per line).
+
+   .. _RiboCode:
 
    (3). Detecting translated ORFs using the ribosome-profiling data:
 
    .. code-block:: bash
 
-      RiboCode -a <RiboCode_annot> -c <config.txt> -l no -o <RiboCode_ORFs_result>
+      RiboCode -a <RiboCode_annot> -c <config.txt> -l no -g -o <RiboCode_ORFs_result>
 
 
    Using the config file generated by last step to specify the information of the bam file and P-site parameters,
-   or refer to the example file `config.txt`_ in data folder.
+   or refer to the example file `config.txt`_ in data folder. The "gtf" or "bed" format file of predicted ORFs can
+   be obtained by adding the "-g" or "-b" argument to this command.
 
    **Explanation of final result files**
 
@@ -260,6 +267,8 @@ Please make sure the path and file name are correct.
       ORFcount -g <RiboCode_ORFs_result.gtf> -r <ribo-seq genomic mapping file> -f 15 -l 5 -e 100 -m 26 -M 34 -o <ORF.counts>
 
    The reads aligned to first 15 codons and last 5 codons of ORFs which length longer than 100 nt will be excluded.
+   The "RiboCode_ORFs_result.gtf" file can be generated by the `RiboCode`_ command. The "ribo-seq genomic mapping file"
+   is the genome-wide mapping file produced by the `STAR`_ alignment step.
 
 
 Recipes (FAQ):
@@ -316,11 +325,11 @@ Xudong Xing (xudonxing_bioinf[at]sina.com)
 
 .. _HEK293 dataset: https://trace.ncbi.nlm.nih.gov/Traces/sra/?run=SRR1630831
 
-.. _config.txt: https://github.com/xzt41/RiboCode/blob/master/data/config.txt
+.. _config.txt: https://github.com/xryanglab/RiboCode/blob/master/data/config.txt
 
-.. _rRNA.fa: https://github.com/xzt41/RiboCode/blob/master/data/rRNA.fa
+.. _rRNA.fa: https://github.com/xryanglab/RiboCode/blob/master/data/rRNA.fa
 
-.. _GTF_update.rst: https://github.com/xzt41/RiboCode/blob/master/data/GTF_update.rst
+.. _GTF_update.rst: https://github.com/xryanglab/RiboCode/blob/master/data/GTF_update.rst
 
 .. _UNC Bioinformatics Utilities: https://github.com/mozack/ubu
 
diff --git a/RiboCode/RPF_count_ORF.py b/RiboCode/RPF_count_ORF.py
@@ -10,13 +10,13 @@
 import pysam
 import sys
 
-def readGTF(gtfFile):
+def readGTF(gtfFile,stranded):
 	gtf = HTSeq.GFF_Reader(gtfFile)
 	start_codon_sites = {}
 	stop_codon_sites = {}
 	counts = {}
 
-	ORF_features = HTSeq.GenomicArrayOfSets("auto",stranded="yes")
+	ORF_features = HTSeq.GenomicArrayOfSets("auto",stranded !="no")
 	# for i,f in enumerate(gtf):
 	for f in gtf:
 		# i += 1
@@ -87,7 +87,7 @@ def count_reads(start_codon_sites,stop_codon_sites,ORF_features,counts,map_file,
 		if read_len > max_read:
 			too_long += 1
 			continue
-		if stranded == "yes":
+		if stranded != "reverse":
 			iv_seq = (co.ref_iv for co in r.cigar if co.type =="M" and co.size >0)
 		else:
 			iv_seq = (invert_strand(co.ref_iv) for co in r.cigar if co.type=="M" and co.size>0)
@@ -140,7 +140,7 @@ def main():
 	args = parsing_ORF_count()
 	verboseprint("Reading the GTF file ...")
 
-	start_codon_sites,stop_codon_sites,ORF_features,counts = readGTF(args.gtf_file)
+	start_codon_sites,stop_codon_sites,ORF_features,counts = readGTF(args.gtf_file,args.stranded)
 	counts["__too_low_quality"] = 0
 	counts["__not_aligned"] = 0
 	counts["__too_short(<%i)" % args.min_read] = 0
diff --git a/RiboCode/__init__.py b/RiboCode/__init__.py
@@ -1,4 +1,4 @@
 #!/usr/bin/env python
 # -*- coding:UTF-8 -*-
 __author__ = 'Zhengtao Xiao'
-__version__ = "1.2.9"
+__version__ = "1.2.10"
diff --git a/RiboCode/loadconfig.py b/RiboCode/loadconfig.py
@@ -44,13 +44,18 @@ def _parsing(self):
 	  				                  Use commas to separate read lengths and P-site offsets,\
 					                  e.g. 28,29 11,12\n")
 					sys.exit()
-				if stranded not in ["yes", "reverse"]:
+				if stranded not in ["yes", "reverse","no"]:
 					sys.stderr.write("Error, pleas check your config file\n, \
-					                  the stranded should be yes or reverse.\
+					                  the stranded should be yes ,no, or reverse.\
 					                  reverse means reversed strand interpretation\n")
 					sys.exit()
 				else:
-					stranded = True if stranded == "yes" else False
+					if stranded == "yes":
+						stranded = True
+					elif stranded == "reverse":
+						stranded = False
+					else:
+						stranded = None
 				if samplename in samplenames:
 					sys.stderr.wirte("Error, pls check you config file\n, bam file name is duplicated: %s.\n" % samplename)
 					sys.exit()
diff --git a/RiboCode/metaplots.py b/RiboCode/metaplots.py
@@ -184,47 +184,54 @@ def meta_analysis(gene_dict,transcript_dict,args):
 		                 "If no any start codons and stop codons are annotated in GTF file \n" +
 		                 ",please skip this step and create a config file to specify the P-site based on other evidence." )
 		sys.exit()
-	# read bam file
-	distance_to_start_count,distance_to_stop_count,length_counter = readTranscriptBam(
-		args.rpf_mapping_file,filter_tids,transcript_dict,args.stranded,args.minLength,args.maxLength)
-	# predefine the psite
-	pre_psite_dict = {}
-	total_reads = sum(length_counter.values())
+	# read bam files
 	fout = open(args.outname + "_pre_config.txt", "w")
-	fout.write("#read_length\tproportion(per total mapped reads)\tpredicted_psite\tf0_sum\tf1_sum\tf2_sum\tf0_percent\tpvalue1\tpvalue2\tpvalue_combined\n")
-	for l,d in distance_to_start_count.items():
-		if d.sum() < 10:
-			continue
-		mask_max_psite = d[:50].argmax()
-		predict_psite,others = _predict_psite(l,d,args.frame0_percent,args.pvalue1_cutoff,args.pvalue2_cutoff)
-		if predict_psite:
-			if mask_max_psite != predict_psite:
-				sys.stderr.write("Warning:The predicted P-site location(%i) for length %i is not the highest peak(%i),\
-				                 please confirm according metagene plots.\n" % (50-predict_psite,l,50-mask_max_psite))
-			read_percent = '{:.2%}'.format(length_counter[l] / total_reads)
-			pre_psite_dict[l] = predict_psite
-			f0sum,f1sum,f2sum,f0_percent,pv1,pv2,pv = others
-			fout.write("# " + "\t".join(map(str,[l,read_percent,50-predict_psite,f0sum,f1sum,f2sum,
-			                                     '{:.2%}'.format(f0_percent),pv1,pv2,pv])) + "\n")
-
-	#print the psite lines
-	fout.write("\n")
-	if pre_psite_dict:
-		fout.write("# " + "\t".join(["SampleName","AlignmentFile","Stranded(yes/reverse)","P-siteReadLength","P-siteLocations"]) + "\n")
-		stranded = "yes" if args.stranded is True else "reverse"
-		pre_psite_len = list(map(str,sorted(pre_psite_dict.keys())))
-		pre_psite_loc = list(map(str,[-pre_psite_dict[i]+50 for i in sorted(pre_psite_dict.keys())]))
-		sampleName = os.path.splitext(os.path.basename(args.rpf_mapping_file))[0]
-		fout.write("\t".join(map(str,[sampleName,args.rpf_mapping_file,stranded,",".join(pre_psite_len),",".join(pre_psite_loc)])) + "\n")
-		fout.close()
-		distancePlot(distance_to_start_count,distance_to_stop_count,pre_psite_dict,length_counter,args.outname)
-		#lengthDistribution(length_counter,args.outname)
-	else:
-		distancePlot(distance_to_start_count,distance_to_stop_count,pre_psite_dict,length_counter,args.outname)
-		sys.stderr.write("No obviously periodicity were detected from alignment reads in annotated start codons,\n" +
-		                 "it could be due to poor quality sequencing.\n" +
-		                 "Please check the metagene plots and try again by lowering the value of frame0_percent")
-
+	for r in args.rpf_mapping_file:
+		fout.write("\n#%s\n" % r)
+		fout.write("#read_length\tproportion(per total mapped reads)\tpredicted_psite\tf0_sum\tf1_sum\tf2_sum\tf0_percent\tpvalue1\tpvalue2\tpvalue_combined\n")
+		distance_to_start_count,distance_to_stop_count,length_counter = readTranscriptBam(
+			r,filter_tids,transcript_dict,args.stranded,args.minLength,args.maxLength)
+		# predefine the psite
+		pre_psite_dict = {}
+		total_reads = sum(length_counter.values())
+
+		for l,d in distance_to_start_count.items():
+			if d.sum() < 10:
+				continue
+			mask_max_psite = d[:50].argmax()
+			predict_psite,others = _predict_psite(l,d,args.frame0_percent,args.pvalue1_cutoff,args.pvalue2_cutoff)
+			if predict_psite:
+				if mask_max_psite != predict_psite:
+					sys.stderr.write("Warning:The predicted P-site location(%i) for length %i is not the highest peak(%i),\
+					                 please confirm according metagene plots.\n" % (50-predict_psite,l,50-mask_max_psite))
+				read_percent = '{:.2%}'.format(length_counter[l] / total_reads)
+				pre_psite_dict[l] = predict_psite
+				f0sum,f1sum,f2sum,f0_percent,pv1,pv2,pv = others
+				fout.write("# " + "\t".join(map(str,[l,read_percent,50-predict_psite,f0sum,f1sum,f2sum,
+				                                     '{:.2%}'.format(f0_percent),pv1,pv2,pv])) + "\n")
+		sampleName = os.path.splitext(os.path.basename(r))[0]
+		#print the psite lines
+		fout.write("\n")
+		if pre_psite_dict:
+			fout.write("# " + "\t".join(["SampleName","AlignmentFile","Stranded(yes/reverse)","P-siteReadLength","P-siteLocations"]) + "\n")
+			if args.stranded is True:
+				stranded = "yes"
+			elif args.stranded is False:
+				stranded = "reverse"
+			else:
+				stranded = "no"
+			pre_psite_len = list(map(str,sorted(pre_psite_dict.keys())))
+			pre_psite_loc = list(map(str,[-pre_psite_dict[i]+50 for i in sorted(pre_psite_dict.keys())]))
+
+			fout.write("\t".join(map(str,[sampleName,r,stranded,",".join(pre_psite_len),",".join(pre_psite_loc)])) + "\n")
+			distancePlot(distance_to_start_count,distance_to_stop_count,pre_psite_dict,length_counter,args.outname + sampleName)
+			#lengthDistribution(length_counter,args.outname)
+		else:
+			distancePlot(distance_to_start_count,distance_to_stop_count,pre_psite_dict,length_counter,args.outname + sampleName)
+			sys.stderr.write("No obviously periodicity are detected from bam file %s,\n" % r +
+			                 "it could be due to poor quality sequencing.\n" +
+			                 "Please check the metagene plots and try again by lowering the value of frame0_percent\n")
+	fout.close()
 
 def main():
 	verboseprint("Create metaplot file and predict the P-site locations ...")
@@ -234,6 +241,5 @@ def main():
 	meta_analysis(gene_dict,transcript_dict,args)
 	verboseprint("Complete prediction of the P-site locations")
 
-
 if __name__ == "__main__":
 	main()
diff --git a/RiboCode/parsing_opts.py b/RiboCode/parsing_opts.py
@@ -9,6 +9,17 @@
 import os
 from .__init__ import __version__
 
+def read_file(f):
+	if not os.path.exists(f):
+		raise  ValueError("Error, file %s not found!\n" % f)
+	fList = []
+	with open(f) as fin:
+		for line in fin:
+			if line.startswith("#") or line.strip() == "":
+				continue
+			fList.append(line.strip())
+	return fList
+
 def parsing_gtf_update():
 	parser = argparse.ArgumentParser(
 		description="This script is designed for preparing the appropriate GTF file from \
@@ -59,9 +70,12 @@ def parsing_metaplots():
 	)
 	parser.add_argument("-a","--annot_dir",dest="annot_dir",required=True,type=str,
 	                    help="transcripts annotation directory, generated by prepare_transcripts.")
-	parser.add_argument("-r","--rpf_mapping_file",dest="rpf_mapping_file",required=True,type=str,
+	group = parser.add_mutually_exclusive_group(required=True)
+	group.add_argument("-r","--rpf_mapping_file",dest="rpf_mapping_file",required=False,type=str,
 	                    help="ribo-seq BAM/SAM file aligned to the transcriptome.")
-	parser.add_argument("-s","--stranded",dest="stranded",required=False,type=str,choices=["yes","reverse"],
+	group.add_argument("-i","--input_file",dest="rpf_mapping_file",required=False,type=read_file,
+	                    help="the file list the ribo-seq BAM/SAM files aligned to the transcriptome.")
+	parser.add_argument("-s","--stranded",dest="stranded",required=False,type=str,choices=["yes","reverse","no"],
 	                    default="yes",help="whether the data is strand-specific, \
 	                    reverse means reversed strand interpretation.(default: yes)")
 	parser.add_argument("-m","--minimum-length",dest="minLength",required=False,type=int,default=24,
@@ -86,9 +100,17 @@ def parsing_metaplots():
 		raise ValueError("minimum length must be <= maximum length (currently %d and %d, respectively)" % (args.minLength, args.maxLength))
 	if args.minLength <= 0 or  args.maxLength <=0:
 		raise ValueError("minimum length or maximum length must be larger than 0.")
-	if not os.path.exists(args.rpf_mapping_file):
-		raise  ValueError("Error, the rpf mapping file not found: %s\n" % args.rpf_mapping_file)
-	args.stranded = True if args.stranded == "yes" else False
+	if type(args.rpf_mapping_file) is str:
+		if not os.path.exists(args.rpf_mapping_file):
+			raise  ValueError("Error, the rpf mapping file not found: %s\n" % args.rpf_mapping_file)
+		args.rpf_mapping_file = [args.rpf_mapping_file]
+
+	if args.stranded == "yes":
+		args.stranded = True
+	elif args.stranded == "reverse":
+		args.stranded = False
+	else:
+		args.stranded = None
 	args.pvalue1_cutoff = float(args.pvalue1_cutoff)
 	args.pvalue2_cutoff = float(args.pvalue2_cutoff)
 	args.frame0_percent = float(args.frame0_percent)
@@ -111,9 +133,6 @@ def parsing_ribo():
 	                          If set to no , the position of start codon will be automatically determined by program.", type=str)
 	parser.add_argument("-p","--pval-cutoff",dest="pval_cutoff",default=0.05,required=False,
 	                    help="P-value cutoff for ORF filtering, default 0.05", type=float)
-	parser.add_argument("--stranded","--stranded",dest="stranded",required=False,type=str,choices=["yes","reverse"],
-	                    default="yes",help="whether the data is strand-specific, \
-	                    reverse means reversed strand interpretation.(default: yes)")
 	parser.add_argument("-s","--start_codon",default="ATG",type=str,dest="start_codon",
 	                    help="The canonical start codon. default: ATG")
 	parser.add_argument("-A","--alt_start_codons",default="",type=str,dest="alternative_start_codons",
@@ -128,12 +147,12 @@ def parsing_ribo():
 	# parser.add_argument("-P","--parallel_num",dest="parallel_num",default=1,required=False,
 	#                     help="the number of threads to read the alignment file(s), \
 	#                     the optimal value is the number of alignment files, default=1",type=int)
-	parser.add_argument("-o","--output-name",dest="output_name",default="final_result",required=False,
-	                    help="output file name, default: final_result", type=str)
 	parser.add_argument("-g","--output-gtf",dest="output_gtf",action='store_true',default=False,required=False,
 	                    help="output the gtf file of predicted ORFs")
 	parser.add_argument("-b","--output-bed",dest="output_bed",action='store_true',default=False,required=False,
 	                    help="output the bed file of predicted ORFs")
+	parser.add_argument("-o","--output-name",dest="output_name",default="final_result",required=False,
+	                    help="output file name, default: final_result", type=str)
 	parser.add_argument('-V',"--version",action="version",version=__version__)
 	args = parser.parse_args()
 
@@ -174,7 +193,7 @@ def parsing_ORF_count():
 		description="This script is designed for calculating the number of reads mapping to ORF with the alignment files \
 		in SAM/BAM format (aligned to genome) and a feature file in GTF format"
 	)
-	parser.add_argument("-s","--stranded",dest="stranded",required=False,type=str,choices=["yes","reverse"],
+	parser.add_argument("-s","--stranded",dest="stranded",required=False,type=str,choices=["yes","reverse","no"],
 	                    default="yes",help="whether the data is strand-specific, \
 	                    reverse means reversed strand interpretation. (default: yes)")
 	parser.add_argument("-a","--minaqual",dest="min_quality",required=False,type=int,
@@ -228,9 +247,12 @@ def parsing_ribo_onestep():
 	                          please refer: https://en.wikipedia.org/wiki/GENCODE')
 	parser.add_argument("-f","--fasta",dest="genomeFasta",required=True,type=str,
 	                    help="The genome sequences file in fasta format.")
-	parser.add_argument("-r","--rpf_mapping_file",dest="rpf_mapping_file",required=True,type=str,
+	group = parser.add_mutually_exclusive_group(required=True)
+	group.add_argument("-r","--rpf_mapping_file",dest="rpf_mapping_file",required=True,type=str,
 	                    help="ribo-seq BAM/SAM file aligned to the transcriptome.")
-	parser.add_argument("-stranded","--stranded",dest="stranded",required=False,type=str,choices=["yes","reverse"],
+	group.add_argument("-i","--input_file",dest="rpf_mapping_file",required=False,type=read_file,
+	                    help="the file list the ribo-seq BAM/SAM files aligned to the transcriptome.")
+	parser.add_argument("-stranded","--stranded",dest="stranded",required=False,type=str,choices=["yes","reverse","no"],
 	                    default="yes",help="whether the data is strand-specific, \
 	                    reverse means reversed strand interpretation.(default: yes)")
 	parser.add_argument("-m","--minimum-length",dest="minLength",required=False,type=int,default=24,
@@ -266,9 +288,16 @@ def parsing_ribo_onestep():
 	                    help="output the bed file of predicted ORFs")
 	parser.add_argument('-V',"--version",action="version",version=__version__)
 	args = parser.parse_args()
-	if not os.path.exists(args.rpf_mapping_file):
-		raise  ValueError("Error, the rpf mapping file not found: %s\n" % args.rpf_mapping_file)
-	args.stranded = True if args.stranded == "yes" else False
+	if type(args.rpf_mapping_file) is str:
+		if not os.path.exists(args.rpf_mapping_file):
+			raise  ValueError("Error, the rpf mapping file not found: %s\n" % args.rpf_mapping_file)
+		args.rpf_mapping_file = [args.rpf_mapping_file]
+	if args.stranded == "yes":
+		args.stranded = True
+	elif args.stranded == "reverse":
+		args.stranded = False
+	else:
+		args.stranded = None
 	if args.minLength > args.maxLength:
 		raise ValueError("minimum length must be <= maximum length (currently %d and %d, respectively)" % (args.minLength, args.maxLength))
 	if args.minLength <= 0 or  args.maxLength <=0: