Add nf-schema parameter validation and nextflow_schema.json (runsheet schema not yet included)

torres-alexis · torres-alexis · commit 91114a0d0774 · 2025-02-18T17:14:57.000-08:00
diff --git a/RNAseq/Workflow_Documentation/NF_RCP/workflow_code/nextflow.config b/RNAseq/Workflow_Documentation/NF_RCP/workflow_code/nextflow.config
@@ -6,6 +6,11 @@
 ----------------------------------------------------------------------------------------
 */
 
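+// Plugins (nf-schema provides the validateParameters / paramsSummaryLog functions included in workflows/rnaseq_microbes.nf)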
+plugins {
+  id 'nf-schema@2.2.0'
+}
+
 // Global default params, used in configs
 params {
     mode                        = 'default' // Set to 'microbes' for processing microbes (Bowtie2)
@@ -139,7 +144,7 @@ dag {
 manifest {
     name            = 'NASA GeneLab Data Processing RNA-Seq Pipeline'
     homePage        = 'https://github.com/nasa/GeneLab_Data_Processing/tree/master/RNAseq'
-    description     = 'RNA-Seq Pipeline for Document GL-DPPD-7101-G.'
+    description     = 'RNA-Seq Pipeline for Documents GL-DPPD-7101-G and GL-DPPD-7115.'
     mainScript      = 'main.nf'
     nextflowVersion = '!>=24.04.4'
     version         = '2.0.0'
diff --git a/RNAseq/Workflow_Documentation/NF_RCP/workflow_code/nextflow_schema.json b/RNAseq/Workflow_Documentation/NF_RCP/workflow_code/nextflow_schema.json
@@ -0,0 +1,271 @@
+{
+  "$schema": "https://json-schema.org/draft/2020-12/schema",
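+  "$id": "https://raw.githubusercontent.com/nasa/GeneLab_Data_Processing/master/RNAseq/Workflow_Documentation/NF_RCP/workflow_code/nextflow_schema.json",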
+  "title": "NASA GeneLab Data Processing RNA-Seq Pipeline parameters",
+  "description": "RNA-Seq Pipeline for Documents GL-DPPD-7101-G and GL-DPPD-7115.",
+  "type": "object",
+  "properties": {
+      "outdir": {
+          "type": "string",
+          "format": "directory-path",
+          "description": "The output directory where the results will be saved. You have to use absolute paths to storage on Cloud infrastructure.",
+          "fa_icon": "fas fa-folder-open",
+          "default": "./results",
+          "hidden": true
+      },
+      "api_url": {
+          "type": "string",
+          "default": "https://osdr.nasa.gov/osdr/data/search?ffield=Study+Assay+Technology+Type&fvalue=RNA+Sequencing+%28RNA-Seq%29&size=2000",
+          "hidden": true
+      }
+  },
+  "$defs": {
+      "qc": {
+          "title": "QC",
+          "type": "object",
+          "description": "",
+          "default": "",
+          "properties": {
+              "validate_params": {
+                  "type": "boolean",
+                  "description": "Boolean whether to validate parameters against the schema at runtime",
+                  "default": true,
+                  "fa_icon": "fas fa-check-square",
+                  "hidden": true
+              },
+              "multiqc_config": {
+                  "type": "string",
+                  "default": "${projectDir}/conf/multiqc.config",
+                  "description": "Path to the MultiQC config file.",
+                  "format": "file-path"
+              },
+              "rseqc_sample_count": {
+                  "type": "integer",
+                  "default": 15000000,
+                  "description": "Number of read-pairs used for RSeQC infer_experiment.py and inner_distance.py"
+              },
+              "max_flag_code": {
+                  "type": "integer",
+                  "default": 80,
+                  "description": "TO DO: implement",
+                  "hidden": true
+              },
+              "skip_vv": {
+                  "type": "boolean",
+                  "description": "TO DO:  Skip VV modules.",
+                  "hidden": true
+              }
+          },
+          "required": [
+              "rseqc_sample_count"
+          ]
+      },
+      "boilerplate": {
+          "title": "Boilerplate",
+          "type": "object",
+          "description": "",
+          "default": "",
+          "properties": {
+              "help": {
+                  "type": ["boolean", "string"],
+                  "description": "Display help menu and exit."
+              },
+              "version": {
+                  "type": "boolean",
+                  "description": "Display version and exit.",
+                  "fa_icon": "fas fa-question-circle",
+                  "hidden": true
+              },
+              "email": {
+                  "type": "string",
+                  "hidden": true,
+                  "description": "TO DO: implement"
+              }
+          }
+      },
+      "local_storage": {
+          "title": "Local Storage",
+          "type": "object",
+          "description": "",
+          "default": "",
+          "properties": {
+              "derived_store_path": {
+                  "type": "string",
+                  "default": "./DerivedReferences",
+                  "description": "Location where the derived reference files will be stored.",
+                  "format": "directory-path"
+              },
+              "reference_store_path": {
+                  "type": "string",
+                  "default": "./References",
+                  "description": "Location where the reference fasta and gtf will be stored.",
+                  "format": "directory-path"
+              }
+          },
+          "required": [
+              "derived_store_path",
+              "reference_store_path"
+          ]
+      },
+      "configs": {
+          "title": "Configs",
+          "type": "object",
+          "description": "",
+          "default": "",
+          "properties": {
+              "publish_dir_mode": {
+                  "type": "string",
+                  "default": "link",
+                  "enum": [
+                      "copy",
+                      "copyNoFollow",
+                      "link",
+                      "move",
+                      "relink",
+                      "symlink"
+                  ],
+                  "description": "Nextflow publishdir mode."
+              }
+          },
+          "required": [
+              "publish_dir_mode"
+          ]
+      },
+      "input": {
+          "title": "Input",
+          "type": "object",
+          "description": "",
+          "default": "",
+          "properties": {
+              "accession": {
+                  "type": "string",
+                  "description": "Input OSD or GLDS identifier as 'OSD-#' or 'GLDS-#' if processing an OSDR dataset.",
+                  "pattern": "^(OSD|GLDS)-[0-9]+$"
+              },
+              "runsheet_path": {
+                  "type": "string",
+                  "description": "Path to the input runsheet.",
+                  "format": "file-path"
+              },
+              "isa_archive_path": {
+                  "type": "string",
+                  "description": "Path to the ISA.zip for an OSDR dataset. '--accession' must also be used.",
+                  "format": "file-path"
+              },
+              "mode": {
+                  "type": "string",
+                  "default": "default",
+                  "description": "Specifies whether to use the default Eukaryotes workflow or the Prokaryotes workflow with '--mode microbes'.",
+                  "enum": [
+                      "default",
+                      "microbes"
+                  ]
+              },
+              "output_suffix": {
+                  "type": "string",
+                  "default": "_GLbulkRNAseq",
+                  "description": "Specifies a string that should be used to label the output file names."
+              }
+          },
+          "required": [
+              "mode",
+              "output_suffix"
+          ]
+      },
+      "references": {
+          "title": "References",
+          "type": "object",
+          "description": "By default, reference_table points to GeneLab Reference Annotations Table GL-DPPD-7110-A_annotations.csv which defines reference FASTA and GTF files and associated GeneLab-generated gene annotations.",
+          "default": "",
+          "properties": {
+              "reference_table": {
+                  "type": "string",
+                  "default": "https://raw.githubusercontent.com/nasa/GeneLab_Data_Processing/refs/heads/master/GeneLab_Reference_Annotations/Pipeline_GL-DPPD-7110_Versions/GL-DPPD-7110-A/GL-DPPD-7110-A_annotations.csv",
+                  "description": "Path or URL to the GeneLab Reference Annotations Table (GL-DPPD-7110-A_annotations.csv by default)."
+              },
+              "reference_fasta": {
+                  "type": "string",
+                  "description": "Specifies an alternative reference FASTA file to use in place of the one listed in GL-DPPD-7110-A_annotations.csv."
+              },
+              "reference_gtf": {
+                  "type": "string",
+                  "description": "Specifies an alternative reference GTF file to use in place of the one listed in GL-DPPD-7110-A_annotations.csv."
+              },
+              "reference_source": {
+                  "type": "string",
+                  "description": "Specifies the source of the alternative reference files. (e.g. \"NCBI\" or \"ENSEMBL\")."
+              },
+              "reference_version": {
+                  "type": "string",
+                  "description": "Specifies the version of the alternative reference files used, if available."
+              }
+          },
+          "required": [
+              "reference_table"
+          ]
+      },
+      "debugging_options": {
+          "title": "Debugging options",
+          "type": "object",
+          "description": "Parameters used to reduce the size of inputs or reference data",
+          "default": "",
+          "properties": {
+              "stage_local": {
+                  "type": "boolean",
+                  "default": true,
+                  "description": "Break the workflow after pulling reads into channels."
+              },
+              "limit_samples_to": {
+                  "type": "integer",
+                  "description": "Only use the first n samples for the analysis"
+              },
+              "genome_subsample": {
+                  "type": "string",
+                  "description": "Subsample the reference genome to the specified region / chromosome."
+              },
+              "force_single_end": {
+                  "type": "boolean",
+                  "description": "Only use Read 1 for each sample, even if the original data is paired-end."
+              },
+              "truncate_to": {
+                  "type": "integer",
+                  "description": "Only use the specified number of reads from each input file."
+              },
+              "use_dummy_gene_counts": {
+                  "type": "boolean",
+                  "description": "Generate random gene counts during DGE. This should be enabled when using '--truncate_to'."
+              },
+              "dp_tools_plugin": {
+                  "type": "string",
+                  "description": "Alternative dp_tools plugin."
+              }
+          }
+      }
+  },
+  "allOf": [
+      {
+          "$ref": "#/$defs/qc"
+      },
+      {
+          "$ref": "#/$defs/boilerplate"
+      },
+      {
+          "$ref": "#/$defs/local_storage"
+      },
+      {
+          "$ref": "#/$defs/configs"
+      },
+      {
+          "$ref": "#/$defs/input"
+      },
+      {
+          "$ref": "#/$defs/references"
+      },
+      {
+          "$ref": "#/$defs/debugging_options"
+      }
+  ],
+  "required": [
+      "outdir"
+  ]
+}
diff --git a/RNAseq/Workflow_Documentation/NF_RCP/workflow_code/workflows/rnaseq_microbes.nf b/RNAseq/Workflow_Documentation/NF_RCP/workflow_code/workflows/rnaseq_microbes.nf
@@ -61,6 +61,7 @@ include { SOFTWARE_VERSIONS } from '../modules/software_versions.nf'
 include { MD5SUM as RAW_MD5SUM } from '../modules/md5sum.nf' addParams(md5sumLabel:"raw")
 include { MD5SUM as PROCESSED_MD5SUM } from '../modules/md5sum.nf' addParams(md5sumLabel:"processed")
 
+include { validateParameters; paramsSummaryLog; samplesheetToList } from 'plugin/nf-schema'
 
 include { VV_RAW_READS;
     VV_TRIMMED_READS;
@@ -97,6 +98,7 @@ workflow RNASEQ_MICROBES {
         derived_store_path
     main:
         publishdir = "results" // default path passed to publishDir, updated below to "GLDS-#" if processing an OSDR dataset
+        ch_isa_versions = Channel.empty()  // Initialize empty channel for ISA versions
 
         // Set up runsheet
         if ( runsheet_path == null ) {
@@ -113,6 +115,7 @@ workflow RNASEQ_MICROBES {
             //Convert ISA archive to runsheet
             ISA_TO_RUNSHEET( osd_accession, glds_accession, isa_archive, dp_tools_plugin )
             runsheet_path = ISA_TO_RUNSHEET.out.runsheet
+            ch_isa_versions = ISA_TO_RUNSHEET.out.versions  // Capture version if ISA_TO_RUNSHEET runs
         }
 
         PARSE_RUNSHEET( runsheet_path )
@@ -303,7 +306,7 @@ workflow RNASEQ_MICROBES {
 
         // Mix in versions from each process
         ch_software_versions = ch_software_versions
-            | mix(ISA_TO_RUNSHEET.out.versions)
+            | mix(ch_isa_versions)  // Use the stored versions channel
             | mix(GTF_TO_PRED.out.versions)
             | mix(PRED_TO_BED.out.versions)
             | mix(RAW_FASTQC.out.versions)
@@ -336,6 +339,11 @@ workflow RNASEQ_MICROBES {
         | concat (RAW_READS_MULTIQC.out.zipped_report) // to do: reimplement zip output w/ cleaned paths
         | collect)
 
+        // Validate input parameters
+        validateParameters()
+
+        // Print summary of supplied parameters
+        log.info paramsSummaryLog(workflow)
         // PROCESSED_MD5SUM(x
         // | concat(y)
         // | collect