#!/usr/bin/env nextflow

nextflow.enable.dsl = 2
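/*
 * The processes below assume a populated params scope supplied by the
 * pipeline's configuration. A minimal sketch of the entries referenced in
 * this file (values are illustrative assumptions, not taken from this script):
 *
 *   params {
 *       assay_suffix           = "_GLmetagenomics"  // hypothetical value
 *       baseDir                = "${projectDir}"    // root path to purge from outputs
 *       readme_extra           = ""
 *       validation_extra       = ""
 *       file_association_extra = ""
 *       directories {
 *           FastQC_Outputs = "FastQC_Outputs"
 *       }
 *       files {
 *           assay_table = ""  // empty => an ISA zip is supplied instead
 *       }
 *   }
 */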
process CLEAN_FASTQC_PATHS {

    tag "Purging GeneLab paths from MultiQC zip files in ${params.directories.FastQC_Outputs}"

    input:
        path(FastQC_Outputs_dir)
    output:
        path("${OUT_DIR}"), emit: clean_dir
    script:
        OUT_DIR = "${FastQC_Outputs_dir.baseName}"
        """
        WORKDIR=\$(pwd)

        # Rename the staged input so its basename is free for the cleaned output directory
        mv ${FastQC_Outputs_dir} FastQC_Outputs_dir

        mkdir -p ${OUT_DIR}/
        cp -r FastQC_Outputs_dir/* ${OUT_DIR}/

        # Remove versions.txt if present (rm -f avoids a non-zero exit when it is absent)
        rm -f ${OUT_DIR}/versions.txt

        # Stage a local, executable copy of clean-paths.sh
        cat \$(which clean-paths.sh) > \${WORKDIR}/clean-paths.sh
        chmod +x \${WORKDIR}/clean-paths.sh

        echo "Purging paths from MultiQC outputs..."
        cd \${WORKDIR}/${OUT_DIR}/

        echo "Cleaning raw MultiQC files with path info..."
        unzip raw_multiqc${params.assay_suffix}_report.zip && rm raw_multiqc${params.assay_suffix}_report.zip
        cd raw_multiqc_report/raw_multiqc_data/

        # No reason not to just run it on all files
        echo "Purging paths in all raw QC files..."
        find . -type f -exec bash \${WORKDIR}/clean-paths.sh '{}' ${params.baseDir} \\;
        cd \${WORKDIR}/${OUT_DIR}/

        echo "Re-zipping raw MultiQC report..."
        zip -r raw_multiqc${params.assay_suffix}_report.zip raw_multiqc_report/ && rm -rf raw_multiqc_report/

        echo "Cleaning filtered MultiQC files with path info..."
        unzip filtered_multiqc${params.assay_suffix}_report.zip && rm filtered_multiqc${params.assay_suffix}_report.zip
        cd filtered_multiqc_report/filtered_multiqc_data/

        echo "Purging paths in all filtered QC files..."
        find . -type f -exec bash \${WORKDIR}/clean-paths.sh '{}' ${params.baseDir} \\;
        cd \${WORKDIR}/${OUT_DIR}/

        echo "Re-zipping filtered MultiQC report..."
        zip -r filtered_multiqc${params.assay_suffix}_report.zip filtered_multiqc_report/ && rm -rf filtered_multiqc_report/
        cd \${WORKDIR}

        echo "Done! Paths purged successfully."
        """
}
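
/*
 * clean-paths.sh lives elsewhere on the PATH (e.g., in bin/) and is not shown
 * in this file. Based on how it is invoked above (one file plus a base
 * directory per call), a minimal sketch of the assumed behavior might be:
 *
 *   #!/usr/bin/env bash
 *   # Usage: clean-paths.sh <file> <base_dir>
 *   # Strip the absolute base directory prefix from any paths embedded in <file>.
 *   FILE="$1"
 *   BASE_DIR="$2"
 *   sed -i "s|${BASE_DIR}/||g" "${FILE}"
 *
 * The real script may do more (e.g., substitute a placeholder string); only
 * its CLI shape is taken from the calls above.
 */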

process PACKAGE_PROCESSING_INFO {

    tag "Purging file paths and zipping processing info"

    input:
        val(files_and_dirs)
    output:
        path("processing_info${params.assay_suffix}.zip"), emit: zip

    script:
        """
        # Stage a local, executable copy of clean-paths.sh
        cat \$(which clean-paths.sh) > clean-paths.sh
        chmod +x ./clean-paths.sh

        mkdir -p processing_info/
        cp -r ${files_and_dirs.join(" ")} processing_info/

        echo "Purging file paths..."
        find processing_info/ -type f -exec bash ./clean-paths.sh '{}' ${params.baseDir} \\;

        # Zip the cleaned processing info
        zip -r processing_info${params.assay_suffix}.zip processing_info/
        """
}


process GENERATE_README {

    beforeScript "chmod +x ${baseDir}/bin/*"
    tag "Generating README for ${OSD_accession}"

    input:
        tuple val(name), val(email), val(output_prefix),
              val(OSD_accession), val(protocol_id),
              val(FastQC_Outputs), val(Filtered_Sequence_Data),
              val(Read_Based_Processing), val(Assembly_Based_Processing),
              val(Assemblies), val(Genes), val(Annotations_And_Tax),
              val(Mapping), val(Combined_Output)
        path(processing_info)
        path(Bins)
        path(MAGS)
    output:
        path("README${params.assay_suffix}.txt"), emit: readme

    script:
        """
        GL-gen-processed-metagenomics-readme \\
            --output 'README${params.assay_suffix}.txt' \\
            --GLDS-ID '${OSD_accession}' \\
            --output-prefix '${output_prefix}' \\
            --name '${name}' \\
            --email '${email}' \\
            --protocol_ID '${protocol_id}' \\
            --assay_suffix '${params.assay_suffix}' \\
            --processing_zip_file '${processing_info}' \\
            --fastqc_dir '${FastQC_Outputs}' \\
            --filtered_reads_dir '${Filtered_Sequence_Data}' \\
            --read_based_dir '${Read_Based_Processing}' \\
            --assembly_based_dir '${Assembly_Based_Processing}' \\
            --assemblies_dir '${Assemblies}' \\
            --genes_dir '${Genes}' \\
            --annotations_and_tax_dir '${Annotations_And_Tax}' \\
            --mapping_dir '${Mapping}' \\
            --bins_dir '${Bins}' \\
            --MAGs_dir '${MAGS}' \\
            --combined_output_dir '${Combined_Output}' ${params.readme_extra}
        """
}


process VALIDATE_PROCESSING {

    tag "Running automated validation and verification..."

    input:
        // Labels
        tuple val(GLDS_accession), val(V_V_guidelines_link), val(output_prefix),
              val(target_files), val(assay_suffix), val(log_dir_basename),
              val(raw_suffix), val(raw_R1_suffix), val(raw_R2_suffix),
              val(filtered_suffix), val(filtered_R1_suffix), val(filtered_R2_suffix)
        // Directory paths
        tuple path(Filtered_Sequence_Data), path(Read_Based),
              path(Assembly_Based), path(Assemblies), path(Mapping),
              path(Genes), path(Annotation_And_Tax), path(Bins),
              path(MAGS), path(Combined_Output), path(FastQC_Outputs)
        // File paths
        path(sample_ids_file)
        path(README)
        path(processing_info)

    output:
        path("${GLDS_accession}_${output_prefix}metagenomics-validation.log"), emit: log

    script:
        """
        GL-validate-processed-metagenomics-data \\
            --output '${GLDS_accession}_${output_prefix}metagenomics-validation.log' \\
            --GLDS-ID '${GLDS_accession}' \\
            --readme '${README}' \\
            --sample-IDs-file '${sample_ids_file}' \\
            --V_V_guidelines_link '${V_V_guidelines_link}' \\
            --processing_zip_file '${processing_info}' \\
            --output-prefix '${output_prefix}' \\
            --zip_targets '${target_files}' \\
            --assay_suffix '${assay_suffix}' \\
            --raw_suffix '${raw_suffix}' \\
            --raw_R1_suffix '${raw_R1_suffix}' \\
            --raw_R2_suffix '${raw_R2_suffix}' \\
            --filtered_suffix '${filtered_suffix}' \\
            --filtered_R1_suffix '${filtered_R1_suffix}' \\
            --filtered_R2_suffix '${filtered_R2_suffix}' \\
            --logs_dir_basename '${log_dir_basename}' \\
            --fastqc_dir '${FastQC_Outputs}' \\
            --filtered_reads_dir '${Filtered_Sequence_Data}' \\
            --read_based_dir '${Read_Based}' \\
            --assembly_based_dir '${Assembly_Based}' \\
            --assemblies_dir '${Assemblies}' \\
            --genes_dir '${Genes}' \\
            --annotations_and_tax_dir '${Annotation_And_Tax}' \\
            --mapping_dir '${Mapping}' \\
            --bins_dir '${Bins}' \\
            --MAGs_dir '${MAGS}' \\
            --combined_output_dir '${Combined_Output}' ${params.validation_extra}
        """
}


process GENERATE_CURATION_TABLE {

    beforeScript "chmod +x ${baseDir}/bin/*"
    tag "Generating a file association table for curation..."

    input:
        // GeneLab accession and suffixes
        tuple val(GLDS_accession), val(output_prefix), val(assay_suffix),
              val(raw_suffix), val(raw_R1_suffix), val(raw_R2_suffix),
              val(filtered_suffix), val(filtered_R1_suffix), val(filtered_R2_suffix)
        // File labels
        tuple val(processing_zip_file), val(readme)
        // Directory labels passed as paths - the script uses these only as labels
        tuple path(raw_reads_dir), path(filtered_reads_dir), path(read_based_dir),
              path(assembly_based_dir), path(annotation_and_tax_dir), path(combined_output_dir)
        // Directory paths
        tuple path(Assemblies), path(Genes), path(Mapping),
              path(Bins), path(MAGS), path(FastQC_Outputs)
        path(assay_table)
        path(runsheet)

    output:
        path("${GLDS_accession}_${output_prefix}-associated-file-names.tsv"), emit: curation_table

    script:
        // If no assay table was supplied, the staged file is an ISA zip
        def INPUT_TABLE = "${params.files.assay_table}" == "" ? "--isa-zip ${assay_table}" : "--assay-table ${assay_table}"
        """
        GL-gen-metagenomics-file-associations-table ${INPUT_TABLE} \\
            --runsheet '${runsheet}' \\
            --output '${GLDS_accession}_${output_prefix}-associated-file-names.tsv' \\
            --GLDS-ID '${GLDS_accession}' \\
            --output-prefix '${output_prefix}' \\
            --assay_suffix '${assay_suffix}' \\
            --raw_suffix '${raw_suffix}' \\
            --raw_R1_suffix '${raw_R1_suffix}' \\
            --raw_R2_suffix '${raw_R2_suffix}' \\
            --filtered_suffix '${filtered_suffix}' \\
            --filtered_R1_suffix '${filtered_R1_suffix}' \\
            --filtered_R2_suffix '${filtered_R2_suffix}' \\
            --processing_zip_file '${processing_zip_file}' \\
            --readme '${readme}' \\
            --fastqc_dir '${FastQC_Outputs}' \\
            --assemblies_dir '${Assemblies}' \\
            --genes_dir '${Genes}' \\
            --mapping_dir '${Mapping}' \\
            --bins_dir '${Bins}' \\
            --MAGs_dir '${MAGS}' \\
            --raw_reads_dir '${raw_reads_dir}' \\
            --filtered_reads_dir '${filtered_reads_dir}' \\
            --read_based_dir '${read_based_dir}' \\
            --assembly_based_dir '${assembly_based_dir}' \\
            --annotations_and_tax_dir '${annotation_and_tax_dir}' \\
            --combined_output_dir '${combined_output_dir}' ${params.file_association_extra}
        """
}


process GENERATE_MD5SUMS {

    tag "Generating md5sums for the files to be released on OSDR..."

    input:
        path(processing_info)
        path(README)
        val(dirs)

    output:
        path("processed_md5sum${params.assay_suffix}.tsv"), emit: md5sum
    script:
        """
        mkdir -p processing/
        cp -r ${dirs.join(" ")} ${processing_info} ${README} processing/

        # Generate md5sums for every file, following symlinks, then reshape
        # each record into: relative path (processing/ stripped), base name, md5
        find -L processing/ -type f -exec md5sum '{}' \\; |
            awk 'BEGIN{OFS="\\t"; printf "File Path\\tFile Name\\tmd5\\n"} \\
                 {N=split(\$2,a,"/"); sub(/processing\\//, "", \$2); print \$2,a[N],\$1}' \\
            | grep -v "versions.txt" > processed_md5sum${params.assay_suffix}.tsv
        """
}
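
/*
 * The resulting TSV has one header row plus one row per released file, e.g.
 * (illustrative values only, not from a real run):
 *
 *   File Path                        File Name        md5
 *   FastQC_Outputs/raw_fastqc.zip    raw_fastqc.zip   d41d8cd98f00b204e9800998ecf8427e
 *
 * "File Path" keeps the path relative to the temporary processing/ directory,
 * while "File Name" is just the base name; rows for versions.txt are dropped.
 */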


process GENERATE_PROTOCOL {

    beforeScript "chmod +x ${baseDir}/bin/*"
    tag "Generating your analysis protocol..."

    input:
        path(software_versions)
        val(protocol_id)
    output:
        path("protocol.txt")
    script:
        """
        generate_protocol.sh ${software_versions} ${protocol_id} > protocol.txt
        """
}
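
/*
 * These processes are expected to be wired together by a workflow defined
 * elsewhere in the pipeline. A minimal sketch of the assumed ordering
 * (channel names and wiring are illustrative, not taken from this file):
 *
 *   workflow PACKAGE_RELEASE {
 *       CLEAN_FASTQC_PATHS( fastqc_outputs_ch )
 *       PACKAGE_PROCESSING_INFO( files_and_dirs_ch )
 *       GENERATE_README( readme_meta_ch, PACKAGE_PROCESSING_INFO.out.zip,
 *                        bins_ch, mags_ch )
 *       VALIDATE_PROCESSING( labels_ch, dirs_ch, sample_ids_ch,
 *                            GENERATE_README.out.readme,
 *                            PACKAGE_PROCESSING_INFO.out.zip )
 *       GENERATE_MD5SUMS( PACKAGE_PROCESSING_INFO.out.zip,
 *                         GENERATE_README.out.readme, release_dirs_ch )
 *       GENERATE_PROTOCOL( software_versions_ch, protocol_id_ch )
 *   }
 */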