Skip to content

Commit ecb72e8

Browse files
Merge pull request #117 from olabiyi/DEV_Metagenomics_Illumina_NF_conversion
Metagenomics Nextflow workflow: Added post-processing workflow
2 parents 5adb13f + 34b63a7 commit ecb72e8

File tree

11 files changed

+2507
-5
lines changed

11 files changed

+2507
-5
lines changed

Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/README.md

Lines changed: 24 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,8 @@ The current GeneLab Illumina metagenomics sequencing data processing pipeline (M
2929
5a. [Main outputs](#5a-main-outputs)
3030
5b. [Resource logs](#5b-resource-logs)
3131

32+
6. [Post Processing](#6-post-processing)
33+
3234
<br>
3335

3436
---
@@ -140,7 +142,8 @@ nextflow run main.nf -resume -profile conda --csv_file SE_file.csv --conda.qc <p
140142
* `-resume` - Resumes workflow execution using previously cached results
141143
142144
* `-profile` – Specifies the configuration profile(s) to load, `singularity` instructs nextflow to setup and use singularity for all software called in the workflow
143-
*Required only if you would like to pull and process data directly from OSDR*
145+
146+
*Required only if you would like to pull and process data directly from OSDR*
144147
145148
* `--GLDS_accession` – A Genelab / OSD accession number e.g. OSD-574.
146149
@@ -178,3 +181,23 @@ Standard nextflow resource usage logs are also produced as follows:
178181
- Resource_Usage/execution_trace_{timestamp}.txt (an execution tracing file that contains information about each process executed in the workflow, including: submission time, start time, completion time, cpu and memory used, machine-readable output)
179182
180183
> Further details about these logs can also found within [this Nextflow documentation page](https://www.nextflow.io/docs/latest/tracing.html#execution-report).
184+
185+
<br>
186+
187+
---
188+
189+
### 6. Post Processing
190+
191+
For options and detailed help on how to run the post-processing workflow, run the following command:
192+
193+
```bash
194+
nextflow run post_processing.nf --help
195+
```
196+
197+
To generate a README file, a protocols file, an md5sums table and a file association table after running the processing workflow successfully, modify and set the parameters in [post_processing.config](workflow_code/post_processing.config) then run the following command:
198+
199+
```bash
200+
nextflow -C post_processing.config run post_processing.nf -resume -profile slurm,singularity
201+
```
202+
203+
The outputs of the run will be in a directory called `Post_Processing` by default.

Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/bin/GL-gen-metagenomics-file-associations-table

Lines changed: 641 additions & 0 deletions
Large diffs are not rendered by default.

Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/bin/GL-gen-processed-metagenomics-readme

Lines changed: 267 additions & 0 deletions
Large diffs are not rendered by default.

Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/bin/GL-validate-processed-metagenomics-data

Lines changed: 800 additions & 0 deletions
Large diffs are not rendered by default.

Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/bin/clean-paths.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,4 +21,4 @@ sed -E 's|.*/GLDS_Datasets/(.+)|\1|g' ${1} \
2121
| sed -E 's|.+/miniconda.+/envs/[^/]*/||g' \
2222
| sed -E 's|/[^ ]*/GLDS-|GLDS-|g' \
2323
| sed -E 's|/[a-z]{6}/[^ ]*|<path-removed-for-security-purposes>|g' \
24-
| sed -E "s|${ROOT_DIR}||g" > t && mv t ${1}
24+
| sed -E "s|${ROOT_DIR}||g" > t && mv t ${1}

Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/main.nf

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -80,7 +80,7 @@ if (params.help) {
8080
println(" --genes_dir [PATH] Specifies where predicted genes from the assemblies will be published. Default: ../Assembly-based_Processing/predicted-genes/.")
8181
println(" --annotations_and_tax_dir [PATH] Contig taxonomy and annotation directory. Default: ../Assembly-based_Processing/annotations-and-taxonomy/.")
8282
println(" --mapping_dir [PATH] Read mapping to assembly directory. Default: ../Assembly-based_Processing/read-mapping/.")
83-
println(" --combined_output_dir [PATH] Assembly summuries and reports across samples directory. Default: ../Assembly-based_Processing/combined-outputs/.")
83+
println(" --combined_output_dir [PATH] Assembly summaries and reports across samples directory. Default: ../Assembly-based_Processing/combined-outputs/.")
8484
println(" --bins_dir [PATH] Assembly bins directory. Default: ../Assembly-based_Processing/bins/.")
8585
println(" --MAGs_dir [PATH] Meta assembled genomes (MAGs) directory. Default: ../Assembly-based_Processing/MAGs/.")
8686
println(" --read_based_dir [PATH] Read-based analysis outputs directory. Default: ../Read-based_Processing/.")
Lines changed: 283 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,283 @@
1+
#!/usr/bin/env nextflow
2+
nextflow.enable.dsl = 2
3+
4+
// Purges GeneLab-internal filesystem paths from the raw and filtered MultiQC
// report zips so no local path information is shipped with the released data.
// For each zip: unzip, run clean-paths.sh over every contained file, re-zip.
process CLEAN_FASTQC_PATHS {

    tag "Purging genelab paths from MultiQC zip files in ${params.directories.FastQC_Outputs}"

    input:
        // Staged FastQC outputs directory containing the two MultiQC report zips
        path(FastQC_Outputs_dir)
    output:
        // Copy of the input directory with path-purged, re-zipped reports
        path("${OUT_DIR}"), emit: clean_dir
    script:
        // Output directory reuses the staged input's base name
        OUT_DIR = "${FastQC_Outputs_dir.baseName}"
        """
        WORKDIR=`pwd`
        # Rename the staged input so OUT_DIR (same basename) can be created alongside it
        mv ${FastQC_Outputs_dir} FastQC_Outputs_dir

        [ -d ${OUT_DIR}/ ] || mkdir ${OUT_DIR}/ && \\
        cp -r FastQC_Outputs_dir/* ${OUT_DIR}/

        # versions.txt is excluded from the released FastQC outputs
        [ -f ${OUT_DIR}/versions.txt ] && rm -rf ${OUT_DIR}/versions.txt

        # Copy the helper script locally so it can be invoked from subdirectories
        cat `which clean-paths.sh` > \${WORKDIR}/clean-paths.sh
        chmod +x \${WORKDIR}/clean-paths.sh

        echo "Purging paths from multiqc outputs"
        cd \${WORKDIR}/${OUT_DIR}/
        echo "Cleaning raw multiqc files with path info"
        unzip raw_multiqc${params.assay_suffix}_report.zip && rm raw_multiqc${params.assay_suffix}_report.zip
        cd raw_multiqc_report/raw_multiqc_data/

        # No reason not to just run it on all
        echo "Purging paths in all raw QC files..."
        find . -type f -exec bash \${WORKDIR}/clean-paths.sh '{}' ${params.baseDir} \\;
        cd \${WORKDIR}/${OUT_DIR}/

        echo "Re-zipping up raw multiqc"
        zip -r raw_multiqc${params.assay_suffix}_report.zip raw_multiqc_report/ && rm -rf raw_multiqc_report/

        echo "Cleaning filtered multiqc files with path info..."
        unzip filtered_multiqc${params.assay_suffix}_report.zip && rm filtered_multiqc${params.assay_suffix}_report.zip
        cd filtered_multiqc_report/filtered_multiqc_data/


        # No reason not to just run it on all
        echo "Purging paths in all filtered QC files..."
        find . -type f -exec bash \${WORKDIR}/clean-paths.sh '{}' ${params.baseDir} \\;
        cd \${WORKDIR}/${OUT_DIR}/


        echo "Re-zipping up filtered multiqc..."
        zip -r filtered_multiqc${params.assay_suffix}_report.zip filtered_multiqc_report/ && rm -rf filtered_multiqc_report/
        cd \${WORKDIR}

        echo "Purging paths from multiqc outputs completed successfully..."

        echo "Done! Paths purged successfully."
        """
}
59+
60+
// Collects the supplied processing files/directories into a processing_info/
// folder, strips local filesystem paths from every file, and zips the result
// for release on OSDR.
process PACKAGE_PROCESSING_INFO {

    tag "Purging file paths and zipping processing info"

    input:
        // List of file and directory path strings to bundle together
        val(files_and_dirs)
    output:
        // Single zip archive of the path-purged processing info
        path("processing_info${params.assay_suffix}.zip"), emit: zip

    script:
        """
        # Copy the helper script locally so it is executable from this work dir
        cat `which clean-paths.sh` > clean-paths.sh
        chmod +x ./clean-paths.sh
        [ -d processing_info/ ] || mkdir processing_info/ && \\
        cp -r ${files_and_dirs.join(" ")} processing_info/

        echo "Purging file paths"
        find processing_info/ -type f -exec bash ./clean-paths.sh '{}' ${params.baseDir} \\;

        # Purge file paths and then zip
        zip -r processing_info${params.assay_suffix}.zip processing_info/
        """
}
83+
84+
85+
// Generates the dataset README by invoking the GL-gen-processed-metagenomics-readme
// script from bin/ with the accession, contact details, and the labels/paths of
// every output directory produced by the processing workflow.
process GENERATE_README {

    // bin/ scripts are not guaranteed executable after checkout
    beforeScript "chmod +x ${baseDir}/bin/*"
    tag "Generating README for ${OSD_accession}"
    input:
        // Metadata and directory labels passed through to the readme generator
        tuple val(name), val(email), val(output_prefix),
              val(OSD_accession), val(protocol_id),
              val(FastQC_Outputs), val(Filtered_Sequence_Data),
              val(Read_Based_Processing), val(Assembly_Based_Processing),
              val(Assemblies), val(Genes), val(Annotations_And_Tax),
              val(Mapping), val(Combined_Output)
        // Zipped processing info produced by PACKAGE_PROCESSING_INFO
        path(processing_info)
        path(Bins)
        path(MAGS)
    output:
        path("README${params.assay_suffix}.txt"), emit: readme

    script:
        """
        GL-gen-processed-metagenomics-readme \\
            --output 'README${params.assay_suffix}.txt' \\
            --GLDS-ID '${OSD_accession}' \\
            --output-prefix '${output_prefix}' \\
            --name '${name}' \\
            --email '${email}' \\
            --protocol_ID '${protocol_id}' \\
            --assay_suffix '${params.assay_suffix}' \\
            --processing_zip_file '${processing_info}' \\
            --fastqc_dir '${FastQC_Outputs}' \\
            --filtered_reads_dir '${Filtered_Sequence_Data}' \\
            --read_based_dir '${Read_Based_Processing}' \\
            --assembly_based_dir '${Assembly_Based_Processing}' \\
            --assemblies_dir '${Assemblies}' \\
            --genes_dir '${Genes}' \\
            --annotations_and_tax_dir '${Annotations_And_Tax}' \\
            --mapping_dir '${Mapping}' \\
            --bins_dir '${Bins}' \\
            --MAGs_dir '${MAGS}' \\
            --combined_output_dir '${Combined_Output}' ${params.readme_extra}
        """
}
127+
128+
129+
// Runs automated validation and verification of the processed outputs via the
// GL-validate-processed-metagenomics-data script, checking file suffixes and
// directory contents against the supplied sample IDs; emits a validation log.
process VALIDATE_PROCESSING {

    tag "Running automated validation and verification...."

    input:
        // Labels
        tuple val(GLDS_accession), val(V_V_guidelines_link), val(output_prefix),
              val(target_files), val(assay_suffix), val(log_dir_basename),
              val(raw_suffix), val(raw_R1_suffix), val(raw_R2_suffix),
              val(filtered_suffix), val(filtered_R1_suffix), val(filtered_R2_suffix)
        // Directory paths
        tuple path(Filtered_Sequence_Data), path(Read_Based),
              path(Assembly_Based), path(Assemblies), path(Mapping),
              path(Genes), path(Annotation_And_Tax), path(Bins),
              path(MAGS), path(Combined_Output), path(FastQC_Outputs)
        // File paths
        path(sample_ids_file)
        path(README)
        path(processing_info)

    output:
        // Validation log named after the accession
        path("${GLDS_accession}_${output_prefix}metagenomics-validation.log"), emit: log

    script:
        """
        GL-validate-processed-metagenomics-data \\
            --output '${GLDS_accession}_${output_prefix}metagenomics-validation.log' \\
            --GLDS-ID '${GLDS_accession}' \\
            --readme '${README}' \\
            --sample-IDs-file '${sample_ids_file}' \\
            --V_V_guidelines_link '${V_V_guidelines_link}' \\
            --processing_zip_file '${processing_info}' \\
            --output-prefix '${output_prefix}' \\
            --zip_targets '${target_files}' \\
            --assay_suffix '${assay_suffix}' \\
            --raw_suffix '${raw_suffix}' \\
            --raw_R1_suffix '${raw_R1_suffix}' \\
            --raw_R2_suffix '${raw_R2_suffix}' \\
            --filtered_suffix '${filtered_suffix}' \\
            --filtered_R1_suffix '${filtered_R1_suffix}' \\
            --filtered_R2_suffix '${filtered_R2_suffix}' \\
            --logs_dir_basename '${log_dir_basename}' \\
            --fastqc_dir ${FastQC_Outputs} \\
            --filtered_reads_dir ${Filtered_Sequence_Data} \\
            --read_based_dir ${Read_Based} \\
            --assembly_based_dir ${Assembly_Based} \\
            --assemblies_dir ${Assemblies} \\
            --genes_dir ${Genes} \\
            --annotations_and_tax_dir ${Annotation_And_Tax} \\
            --mapping_dir ${Mapping} \\
            --bins_dir ${Bins} \\
            --MAGs_dir ${MAGS} \\
            --combined_output_dir ${Combined_Output} ${params.validation_extra}
        """
}
184+
185+
186+
// Builds the file-association table for curation by invoking
// GL-gen-metagenomics-file-associations-table. Accepts either an ISA zip or an
// assay table as input metadata, selected by whether params.files.assay_table
// is set; emits a TSV mapping sample names to released file names.
process GENERATE_CURATION_TABLE {

    // bin/ scripts are not guaranteed executable after checkout
    beforeScript "chmod +x ${baseDir}/bin/*"
    tag "Generating a file association table for curation..."

    input:
        // GeneLab accession and Suffixes
        tuple val(GLDS_accession), val(output_prefix), val(assay_suffix),
              val(raw_suffix), val(raw_R1_suffix), val(raw_R2_suffix),
              val(filtered_suffix), val(filtered_R1_suffix), val(filtered_R2_suffix)
        // File labels
        tuple val(processing_zip_file), val(readme)
        // Directory labels as paths - these paths are utilized as mere labels by the script
        tuple path(raw_reads_dir), path(filtered_reads_dir), path(read_based_dir),
              path(assembly_based_dir), path(annotation_and_tax_dir), path(combined_output_dir)
        // Directory paths
        tuple path(Assemblies), path(Genes), path(Mapping),
              path(Bins), path(MAGS), path(FastQC_Outputs)
        path(assay_table)
        path(runsheet)

    output:
        path("${GLDS_accession}_${output_prefix}-associated-file-names.tsv"), emit: curation_table

    script:
        // When no explicit assay table is configured, the staged file is an ISA zip
        def INPUT_TABLE = "${params.files.assay_table}" == "" ? "--isa-zip ${assay_table}" : "--assay-table ${assay_table}"
        """
        GL-gen-metagenomics-file-associations-table ${INPUT_TABLE} \\
            --runsheet '${runsheet}' \\
            --output '${GLDS_accession}_${output_prefix}-associated-file-names.tsv' \\
            --GLDS-ID '${GLDS_accession}' \\
            --output-prefix '${output_prefix}' \\
            --assay_suffix '${assay_suffix}' \\
            --raw_suffix '${raw_suffix}' \\
            --raw_R1_suffix '${raw_R1_suffix}' \\
            --raw_R2_suffix '${raw_R2_suffix}' \\
            --filtered_suffix '${filtered_suffix}' \\
            --filtered_R1_suffix '${filtered_R1_suffix}' \\
            --filtered_R2_suffix '${filtered_R2_suffix}' \\
            --processing_zip_file '${processing_zip_file}' \\
            --readme '${readme}' \\
            --fastqc_dir '${FastQC_Outputs}' \\
            --assemblies_dir '${Assemblies}' \\
            --genes_dir '${Genes}' \\
            --mapping_dir '${Mapping}' \\
            --bins_dir '${Bins}' \\
            --MAGs_dir '${MAGS}' \\
            --raw_reads_dir '${raw_reads_dir}' \\
            --filtered_reads_dir '${filtered_reads_dir}' \\
            --read_based_dir '${read_based_dir}' \\
            --assembly_based_dir '${assembly_based_dir}' \\
            --annotations_and_tax_dir '${annotation_and_tax_dir}' \\
            --combined_output_dir '${combined_output_dir}' ${params.file_association_extra}
        """
}
241+
242+
243+
// Computes md5 checksums for every file to be released on OSDR. Copies the
// supplied directories plus the processing zip and README into processing/,
// then emits a TSV with columns: File Path, File Name, md5 (versions.txt excluded).
process GENERATE_MD5SUMS {

    tag "Generating md5sums for the files to be released on OSDR..."

    input:
        path(processing_info)
        path(README)
        // List of output directory path strings to include in the checksum table
        val(dirs)

    output:
        path("processed_md5sum${params.assay_suffix}.tsv"), emit: md5sum
    script:
        """
        mkdir processing/ && \\
        cp -r ${dirs.join(" ")} ${processing_info} ${README} \\
              processing/

        # Generate md5sums
        # find -L follows staged symlinks; awk strips the processing/ prefix from
        # the path column and splits out the bare file name
        find -L processing/ -type f -exec md5sum '{}' \\; |
        awk -v OFS='\\t' 'BEGIN{OFS="\\t"; printf "File Path\\tFile Name\\tmd5\\n"} \\
            {N=split(\$2,a,"/"); sub(/processing\\//, "", \$2); print \$2,a[N],\$1}' \\
            | grep -v "versions.txt" > processed_md5sum${params.assay_suffix}.tsv
        """
}
267+
268+
269+
// Writes the analysis protocol text by running generate_protocol.sh with the
// recorded software versions file and the protocol ID.
process GENERATE_PROTOCOL {

    // bin/ scripts are not guaranteed executable after checkout
    beforeScript "chmod +x ${baseDir}/bin/*"
    tag "Generating your analysis protocol..."

    input:
        // Software versions file collected from the processing workflow
        path(software_versions)
        val(protocol_id)
    output:
        path("protocol.txt")
    script:
        """
        generate_protocol.sh ${software_versions} ${protocol_id} > protocol.txt
        """
}

Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/modules/summarize_MAG.nf

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ leave empty if wanting to use memory, the default, put in quotes the path to a d
1717
already exists if wanting to use disk space
1818
*/
1919

20-
params.gtdb_tk_scratch_location = ""
20+
//params.gtdb_tk_scratch_location = ""
2121

2222
/* Retrieve MAGS.
2323
Filters checkm results based on estimate completion, redundancy, and

Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/nextflow.config

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -167,7 +167,7 @@ executor.queueSize = 20
167167
Note that relative paths such as '~/' and '../' are not expanded
168168
by nextflow's evaluation of files, so don't use that.
169169
*/
170-
params.DB_ROOT = ("${baseDir}".split("/")[0..-2]).join('/') + "/Reference_DBs"
170+
params.DB_ROOT = "${baseDir.getParent()}/Reference_DBs"
171171

172172
// Mount Humann databases to their predefined locations in the Biobakery container being used
173173
if(params.database.chocophlan_dir == null ||

0 commit comments

Comments
 (0)