Skip to content

Commit 0f3e1a0

Browse files
committed
Added post-processing scripts
1 parent 4169e07 commit 0f3e1a0

File tree

8 files changed

+1942
-1
lines changed

8 files changed

+1942
-1
lines changed

Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina/README.md

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,8 @@ The current GeneLab Illumina amplicon sequencing data processing pipeline (AmpIl
2626
5a. [Main outputs](#5a-main-outputs)
2727
5b. [Resource logs](#5b-resource-logs)
2828

29+
6. [Post Processing](#6-post-processing)
30+
2931
<br>
3032

3133
---
@@ -136,7 +138,6 @@ nextflow run main.nf -resume -profile conda --csv_file SE_file.csv --target_regi
136138
* `-resume` - Resumes workflow execution using previously cached results
137139
* `-profile` – Specifies the configuration profile(s) to load, `singularity` instructs nextflow to set up and use singularity for all software called in the workflow
138140
* `--target_region` – Specifies the amplicon target region to be analyzed, 16S, 18S or ITS.
139-
140141
141142
*Required only if you would like to pull and process data directly from OSDR*
142143
@@ -181,3 +182,22 @@ Standard nextflow resource usage logs are also produced as follows:
181182
182183
> Further details about these logs can also be found within [this Nextflow documentation page](https://www.nextflow.io/docs/latest/tracing.html#execution-report).
183184
185+
<br>
186+
187+
---
188+
189+
### 6. Post Processing
190+
191+
For options and detailed help on how to run the post-processing workflow, run the following command:
192+
193+
```bash
194+
nextflow run post_processing.nf --help
195+
```
196+
197+
To generate a README file, a protocols file, an md5sums table and a file association table after running the processing workflow successfully, modify and set the parameters in [post_processing.config](workflow_code/post_processing.config) then run the following command:
198+
199+
```bash
200+
nextflow -C post_processing.config run post_processing.nf -resume -profile slurm,singularity
201+
```
202+
203+
The outputs of the run will be in a directory called `Post_Processing` by default.

Amplicon/Illumina/Workflow_Documentation/NF_AmpIllumina/workflow_code/bin/GL-gen-amplicon-file-associations-table

Lines changed: 534 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 196 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,196 @@
1+
#!/usr/bin/env python
2+
3+
"""
4+
This is a program for generating a README.txt file for GeneLab processed amplicon datasets.
5+
"""
6+
7+
import os
8+
import sys
9+
import argparse
10+
import textwrap
11+
import zipfile
12+
import re
13+
14+
15+
# ---------------------------------------------------------------------------
# Command-line interface.
# NOTE: this runs at import time -- with no arguments at all the script prints
# its help text and exits 0; otherwise arguments are parsed immediately below.
# ---------------------------------------------------------------------------
parser = argparse.ArgumentParser(description = "This program generates the corresponding README file for GeneLab processed amplicon dataset. It is intended to \
be run before running `GL-validate-processed-amplicon-data` and after processing_info.zip has been created.")

required = parser.add_argument_group('required arguments')
# The dataset accession is the only required argument; everything else has defaults.
required.add_argument("-g", "--GLDS-ID", help = 'GLDS ID (e.g. "GLDS-69")', action = "store", required = True)
parser.add_argument("--output", help = 'Name of output file (default: "README.txt", with appended prefix if one is provided)', default = "README.txt")
parser.add_argument("--name", help = 'Name of individual who performed the processing (default: "Michael D. Lee")', default = "Michael D. Lee")
parser.add_argument("--email", help = 'Email address of individual who performed the processing (default: "Mike.Lee@nasa.gov")', default = "Mike.Lee@nasa.gov")
parser.add_argument("--protocol_ID", help = 'Protocol document ID followed (default: assay dependent)', default = "GL-DPPD-7104-B")
parser.add_argument("--assay_suffix", help = "Genelab assay suffix", action = "store", default = "_GLAmpSeq")
# When primers were already removed upstream, there is no Trimmed_Sequence_Data
# directory to describe in the README (see write_amplicon_body).
parser.add_argument("--primers-already-trimmed", help = "Add this flag if primers were trimmed prior to GeneLab processing, \
therefore there are no trimmed sequence data", action = "store_true")
parser.add_argument("--processing_zip_file", help = "Specifies the location of processing_info.zip",
                    action = "store", default = "processing_info.zip")
# An empty string (the default) means raw reads are not listed in the README.
parser.add_argument("--raw-reads-dir", help = "Specifies the location of the raw reads directory if they are to be included", action = "store", default = "")
parser.add_argument("--fastqc_dir", help = "Specifies the location of fastqc and multiqc reports directory",
                    action = "store", default = "FastQC_Outputs/")
parser.add_argument("--filtered_reads_dir", help = "Specifies the location of the filtered reads directory",
                    action = "store", default = "Filtered_Sequence_Data/")
parser.add_argument("--trimmed_reads_dir", help = "Specifies the location of the trimmed reads directory",
                    action = "store", default = "Trimmed_Sequence_Data/")
parser.add_argument("--final_outputs_dir", help = "Specifies the location of the final outputs directory",
                    action = "store", default = "Final_Outputs/")


# With no CLI arguments, show usage and exit cleanly instead of erroring on
# the missing required -g/--GLDS-ID.
if len(sys.argv)==1:
    parser.print_help(sys.stderr)
    sys.exit(0)

args = parser.parse_args()
45+
46+
# ANSI escape templates, keyed by color name, used to highlight messages
# when writing to an interactive terminal.
tty_colors = {
    'green' : '\033[0;32m%s\033[0m',
    'yellow' : '\033[0;33m%s\033[0m',
    'red' : '\033[0;31m%s\033[0m'
}


### Functions ###
def color_text(text, color='green'):
    """Return *text* wrapped in ANSI color codes when stdout is a terminal.

    When stdout is redirected (not a TTY), the text is returned unchanged so
    log files stay free of escape sequences.
    """
    if not sys.stdout.isatty():
        return text

    return tty_colors[color] % text
60+
61+
62+
def wprint(text):
    """Print *text* wrapped at 80 columns with a four-space hanging indent."""
    wrapped = textwrap.fill(text, width=80, initial_indent="    ",
                            subsequent_indent="    ", break_on_hyphens=False)
    print(wrapped)
67+
68+
69+
def report_failure(message, color="yellow"):
    """Print *message* (color-highlighted when on a TTY) and exit with status 1."""
    print()
    wprint(color_text(message, color))
    print("\nREADME-generation failed.\n")

    sys.exit(1)
75+
76+
77+
def check_for_file_and_contents(file_path):
    """Abort via report_failure() unless *file_path* exists and is non-empty.

    Used by get_processing_zip_contents() before opening the archive.
    """
    if not os.path.exists(file_path):
        report_failure(f"The expected file '{file_path}' does not exist.")

    if os.path.getsize(file_path) == 0:
        report_failure(f"The file '{file_path}' is empty.")
84+
85+
86+
def get_processing_zip_contents(processing_zip_file):
    """Return a sorted list of member names inside processing_info.zip.

    These names are listed in the README so readers know what the archive
    holds. Exits with a readable message if the zip is missing or empty.
    """
    check_for_file_and_contents(processing_zip_file)

    with zipfile.ZipFile(processing_zip_file) as zip_obj:
        return sorted(zip_obj.namelist())
98+
99+
100+
def write_header(output, GLDS_ID, name, email, protocol_ID):
    """Write the boxed README banner to the *output* file handle.

    The banner identifies the dataset accession, its OSDR URL, the person who
    performed the processing, and the protocol document followed. Each row is
    left-padded to 77 characters and closed with '##' at the right edge so the
    box lines up at 80 columns.
    """
    row = "{:<77} {:>0}".format
    bar = "################################################################################\n"

    for line in (bar,
                 row("## This directory holds processed data for NASA " + str(GLDS_ID), "##\n"),
                 row("## https://genelab-data.ndc.nasa.gov/genelab/accession/" + str(GLDS_ID) + "/", "##\n"),
                 row("##", "##\n"),
                 row("## Processed by " + str(name) + " (" + str(email) + ")", "##\n"),
                 row("## Based on " + str(protocol_ID), "##\n"),
                 bar,
                 "\nSummary of contents:\n\n"):
        output.write(line)
112+
113+
114+
def write_amplicon_body(output, output_file, assay_suffix, processing_zip_file,
                        processing_zip_contents, fastqc_dir, raw_reads_dir,
                        trimmed_reads_dir, filtered_reads_dir, final_outputs_dir, primers_already_trimmed):
    """Write the per-directory summary section of the README.

    Lists each output directory with a short description, the expected
    glob-style filenames under the final outputs directory, and an indented
    tree of everything inside the processing-info zip archive. Directories
    whose inclusion is conditional (raw reads, primer-trimmed reads) are
    skipped when not applicable.
    """
    entry = "    {:<41} {:>0}".format        # top-level directory/file rows
    sub_entry = "        {:<37} {:>0}".format  # rows nested under Final_Outputs

    # This README file itself
    output.write(entry("- " + str(output_file), "- this file\n\n"))

    # FastQC/MultiQC reports
    output.write(entry("- " + str(fastqc_dir), "- multiQC summary reports of FastQC runs\n\n"))

    # Raw reads are only listed when a directory was provided
    if raw_reads_dir != "":
        output.write(entry("- " + str(raw_reads_dir), "- initial read fastq files\n\n"))

    # Primer-trimmed reads exist only when GeneLab performed the trimming
    if not primers_already_trimmed:
        output.write(entry("- " + str(trimmed_reads_dir), "- primer-trimmed fastq files\n\n"))

    # Quality-filtered reads
    output.write(entry("- " + str(filtered_reads_dir), "- quality-filtered fastq files\n\n"))

    # Primary outputs and the files expected inside them
    output.write(entry("- " + str(final_outputs_dir), "- primary output files (may or may not have additional prefix)\n"))
    for pattern, description in (
            (f"- *{assay_suffix}.fasta", "- fasta file of recovered sequences\n"),
            (f"- *counts{assay_suffix}.tsv", "- count table of sequences across samples\n"),
            (f"- *taxonomy{assay_suffix}.tsv", "- assigned taxonomy of recovered sequences\n"),
            (f"- *taxonomy-and-count{assay_suffix}.tsv", "- combined table of counts and taxonomy\n"),
            (f"- *taxonomy-and-count{assay_suffix}.biom.zip", "- biom-formatted output of counts and taxonomy\n"),
            (f"- *read-count-tracking{assay_suffix}.tsv", "- read counts at each processing step\n\n")):
        output.write(sub_entry(pattern, description))

    # Processing info: the zip itself, then an indented tree of its members
    output.write(entry("- " + str(processing_zip_file), "- zip archive holding info related to processing\n"))
    for item in processing_zip_contents:

        depth = item.count("/")

        if depth >= 1 and not item.endswith("/"):
            # A file inside the archive: show only its basename.
            shown = re.sub(r'^.*/', '', str(item))
        elif depth > 1:
            # A nested directory entry: drop the leading top-level component.
            shown = re.sub(r'^[^/]*/', '', str(item))
        else:
            shown = str(item)

        # Directory entries carry a trailing '/', which inflates the count by one.
        if item.endswith('/'):
            depth -= 1

        output.write("        " + " " * (depth * 4) + "- " + shown + "\n")

    output.write("\n")
167+
168+
169+
170+
def main():
    """Assemble the README: list the zip contents, then write header and body."""
    # Normalize all path-like arguments to plain strings up front.
    assay_suffix = str(args.assay_suffix)
    fastqc_dir = str(args.fastqc_dir)
    filtered_reads_dir = str(args.filtered_reads_dir)
    processing_zip_file = str(args.processing_zip_file)
    output_file = str(args.output)
    raw_reads_dir = str(args.raw_reads_dir)
    trimmed_reads_dir = str(args.trimmed_reads_dir)
    final_outputs_dir = str(args.final_outputs_dir)

    # Listing the archive also validates that it exists and is non-empty.
    zip_contents = get_processing_zip_contents(processing_zip_file)

    with open(output_file, "w") as out_handle:
        write_header(out_handle, args.GLDS_ID, args.name, args.email, args.protocol_ID)

        write_amplicon_body(out_handle, output_file, assay_suffix, processing_zip_file,
                            zip_contents, fastqc_dir, raw_reads_dir,
                            trimmed_reads_dir, filtered_reads_dir, final_outputs_dir,
                            args.primers_already_trimmed)


if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)