add draft runsheet schema check

torres-alexis · torres-alexis · commit a5b15a0782f4 · 2025-02-18T18:18:21.000-08:00
diff --git a/RNAseq/Workflow_Documentation/NF_RCP/examples/runsheet/README.md b/RNAseq/Workflow_Documentation/NF_RCP/examples/runsheet/README.md
@@ -18,7 +18,7 @@
 | Sample Name | string | Sample Name, added as a prefix to sample-specific processed data output files. Should not include spaces or weird characters. | Mmus_BAL-TAL_LRTN_BSL_Rep1_B7 |
 | has_ERCC | bool | Set to True if ERCC spike-ins are included in the samples. This ensures ERCC normalized DGE is performed in addition to standard DGE. | True |
 | paired_end | bool | Set to True if the samples were sequenced as paired-end. If set to False, samples are assumed to be single-end. | False |
-| organism | string | Species name used to map to the appropriate gene annotations file. Supported species can be found in the `species` column of the [GL-DPPD-7110_annotations.csv](../../../../../GeneLab_Reference_Annotations/Pipeline_GL-DPPD-7110_Versions/GL-DPPD-7110/GL-DPPD-7110_annotations.csv) file. | Mus musculus |
+| organism | string | Species name used to map to the appropriate gene annotations file. Supported species can be found in the `species` column of the [GL-DPPD-7110-A_annotations.csv](../../../../../GeneLab_Reference_Annotations/Pipeline_GL-DPPD-7110_Versions/GL-DPPD-7110-A/GL-DPPD-7110-A_annotations.csv) file. | Mus musculus |
 | read1_path | string (url or local path) | Location of the raw reads file. For paired-end data, this specifies the forward reads fastq.gz file. | /my/data/sample_1.fastq.gz |
 | read2_path | string (url or local path) | Location of the raw reads file. For paired-end data, this specifies the reverse reads fastq.gz file. For single-end data, this column should be omitted. | /my/data/sample_2.fastq.gz |
 | Factor Value[<name, e.g. Spaceflight>] | string | A set of one or more columns specifying the experimental group the sample belongs to. In the simplest form, a column named 'Factor Value[group]' is sufficient. | Space Flight |
diff --git a/RNAseq/Workflow_Documentation/NF_RCP/workflow_code/conf/runsheet_schema.json b/RNAseq/Workflow_Documentation/NF_RCP/workflow_code/conf/runsheet_schema.json
@@ -0,0 +1,61 @@
+{
+    "$schema": "https://json-schema.org/draft/2020-12/schema",
+    "title": "RNAseq Runsheet Schema",
+    "description": "Schema for validating RNAseq processing runsheets",
+    "type": "array",
+    "minItems": 2,
+    "items": {
+        "type": "object",
+        "properties": {
+            "Sample Name": {
+                "type": "string",
+                "description": "Sample Name, added as a prefix to sample-specific processed data output files. Should not include spaces or weird characters."
+            },
+            "has_ERCC": {
+                "type": "boolean",
+                "description": "Set to True if ERCC spike-ins are included in the samples. This ensures ERCC normalized DGE is performed in addition to standard DGE."
+            },
+            "paired_end": {
+                "type": "boolean",
+                "description": "Set to True if the samples were sequenced as paired-end. If set to False, samples are assumed to be single-end."
+            },
+            "organism": {
+                "type": "string",
+                "description": "The organism of the samples. This is used to select the appropriate annotation files."
+            },
+            "read1_path": {
+                "type": "string",
+                "description": "Location of the raw reads file. For paired-end data, this specifies the forward reads fastq.gz file."
+            },
+            "read2_path": {
+                "type": "string",
+                "description": "Location of the raw reads file. For paired-end data, this specifies the reverse reads fastq.gz file. For single-end data, this column should be omitted."
+            },
+            "Original Sample Name": {
+                "type": "string",
+                "description": "Used to map the sample name that will be used for processing to the original sample name. This is often identical except in cases where the original name includes spaces or weird characters."
+            }
+        },
+        "patternProperties": {
+            "^Factor Value\\[.*\\]$": {
+                "type": "string",
+                "description": "Experimental factor values. Column names should be 'Factor Value[factor_name]' where factor_name is the experimental factor (e.g., Spaceflight, Time, Treatment), and values are the corresponding conditions"
+            }
+        },
+        "required": [
+            "Sample Name",
+            "has_ERCC",
+            "paired_end",
+            "organism",
+            "read1_path"
+        ],
+        "if": {
+            "properties": {
+                "paired_end": { "const": true }
+            }
+        },
+        "then": {
+            "required": ["read2_path"]
+        }
+    }
+}
diff --git a/RNAseq/Workflow_Documentation/NF_RCP/workflow_code/nextflow_schema.json b/RNAseq/Workflow_Documentation/NF_RCP/workflow_code/nextflow_schema.json
@@ -145,7 +145,8 @@
               "runsheet_path": {
                   "type": "string",
                   "description": "Path to the input runsheet.",
-                  "format": "file-path"
+                  "format": "file-path",
+                  "schema": "./conf/runsheet_schema.json"
               },
               "isa_archive_path": {
                   "type": "string",
diff --git a/RNAseq/Workflow_Documentation/NF_RCP/workflow_code/workflows/rnaseq_microbes.nf b/RNAseq/Workflow_Documentation/NF_RCP/workflow_code/workflows/rnaseq_microbes.nf
@@ -118,6 +118,16 @@ workflow RNASEQ_MICROBES {
             ch_isa_versions = ISA_TO_RUNSHEET.out.versions  // Capture version if ISA_TO_RUNSHEET runs
         }
 
+        // Validate input parameters against the pipeline's parameter schema
+        validateParameters()
+
+        // Print summary of supplied parameters
+        log.info paramsSummaryLog(workflow)
+        // PROCESSED_MD5SUM(x
+        // | concat(y)
+        // | collect 
+        // )
+        
         PARSE_RUNSHEET( runsheet_path )
 
         samples = PARSE_RUNSHEET.out.samples
@@ -339,15 +349,7 @@ workflow RNASEQ_MICROBES {
         | concat (RAW_READS_MULTIQC.out.zipped_report) // to do: reimplement zip output w/ cleaned paths
         | collect)
 
-        // Validate input parameters
-        validateParameters()
 
-        // Print summary of supplied parameters
-        log.info paramsSummaryLog(workflow)
-        // PROCESSED_MD5SUM(x
-        // | concat(y)
-        // | collect 
-        // )
 
     emit:
         RAW_MD5SUM.out.md5sums