nasa
diff --git a/‎RNAseq/Pipeline_GL-DPPD-7101_Versions/GL-DPPD-7101-G.md
Lines changed: 9 additions & 7 deletions b/‎RNAseq/Pipeline_GL-DPPD-7101_Versions/GL-DPPD-7101-G.md
Lines changed: 9 additions & 7 deletions
diff --git a/‎RNAseq/Pipeline_GL-DPPD-7115_Versions/GL-DPPD-7115.md
Lines changed: 6 additions & 2 deletions b/‎RNAseq/Pipeline_GL-DPPD-7115_Versions/GL-DPPD-7115.md
Lines changed: 6 additions & 2 deletions
diff --git a/‎RNAseq/Workflow_Documentation/NF_RCP/README.md
Lines changed: 4 additions & 0 deletions b/‎RNAseq/Workflow_Documentation/NF_RCP/README.md
Lines changed: 4 additions & 0 deletions
diff --git a/‎RNAseq/Workflow_Documentation/NF_RCP/workflow_code/bin/dge_deseq2.Rmd
Lines changed: 3 additions & 3 deletions b/‎RNAseq/Workflow_Documentation/NF_RCP/workflow_code/bin/dge_deseq2.Rmd
Lines changed: 3 additions & 3 deletions
diff --git a/‎RNAseq/Workflow_Documentation/NF_RCP/workflow_code/bin/generate_md5sums.py
Lines changed: 66 additions & 5 deletions b/‎RNAseq/Workflow_Documentation/NF_RCP/workflow_code/bin/generate_md5sums.py
Lines changed: 66 additions & 5 deletions
diff --git a/‎RNAseq/Workflow_Documentation/NF_RCP/workflow_code/bin/generate_protocol.py
Lines changed: 9 additions & 9 deletions b/‎RNAseq/Workflow_Documentation/NF_RCP/workflow_code/bin/generate_protocol.py
Lines changed: 9 additions & 9 deletions
diff --git a/‎RNAseq/Workflow_Documentation/NF_RCP/workflow_code/bin/parse_multiqc.py
Lines changed: 18 additions & 3 deletions b/‎RNAseq/Workflow_Documentation/NF_RCP/workflow_code/bin/parse_multiqc.py
Lines changed: 18 additions & 3 deletions
diff --git a/‎RNAseq/Workflow_Documentation/NF_RCP/workflow_code/bin/sort_into_subdirectories_by_sample.py
100644100755 b/‎RNAseq/Workflow_Documentation/NF_RCP/workflow_code/bin/sort_into_subdirectories_by_sample.py
100644100755
@@ -89,7 +89,7 @@ DESeq2 Analysis Workflow
 
 - Added parallel rRNA-removed DGE analysis:
   - Create filtered RSEM count files with rRNA features removed:
-    - {sample}_rRNA_removed.genes.results
+    - {sample}_rRNArm.genes.results
   - Normalize rRNA-removed counts
   - Perform DGE analysis using rRNA-removed counts
   - Output additional set of rRNA-removed counts and DGE results
@@ -310,7 +310,7 @@ fastqc -o /path/to/trimmed_fastqc/output/directory *.fastq.gz
 ### 2c. Compile Trimmed Data QC  
 
 ```bash
-multiqc --interactive -n trimmed_multiqc_GLbulkRNAseq -o /path/to/trimmed_multiqc/output/directory /path/to/directory/containing/trimmed_fastqc/files
+multiqc --interactive -n trimmed_multiqc_GLbulkRNAseq -o /path/to/trimmed_multiqc/output/directory /path/to/directory/containing/trimmed_fastqc/files /path/to/directory/containing/trimming_reports
 
 zip -r trimmed_multiqc_GLbulkRNAseq_data.zip trimmed_multiqc_GLbulkRNAseq_data
 ```
@@ -320,11 +320,13 @@ zip -r trimmed_multiqc_GLbulkRNAseq_data.zip trimmed_multiqc_GLbulkRNAseq_data
 - `--interactive` – force reports to use interactive plots
 - `-n` – prefix name for output files
 - `-o` – the output directory to store results
-- `/path/to/directory/containing/trimmed_fastqc/files` – the directory holding the output data from the FastQC run, provided as a positional argument
+- `/path/to/directory/containing/trimmed_fastqc/files` – the directory holding the output data from the fastqc run, provided as a positional argument
+- `/path/to/directory/containing/trimming_reports` – the directory containing the trimming reports from the trim/filter step, provided as a positional argument
 
 **Input Data:**
 
 - *fastqc.zip (FastQC data, output from [Step 2b](#2b-trimmed-data-qc))
+- *trimming_report.txt (trimming report, output from [Step 2a](#2a-trimfilter-raw-data))
 
 **Output Data:**
 
@@ -471,7 +473,7 @@ STAR --twopassMode Basic \
   - SJ.out.tab
 - *_STARtmp (directory containing the following:)
   - BAMsort (directory containing subdirectories that are empty – this was the location for temp files that were automatically removed after successful completion)
-- **\*Unmapped.out.mate1.fastq.gz, \*Unmapped.out.mate2.fastq.gz** (unmapped and partially mapped reads in fastq format)
+- **\*Unmapped.out.mate1, \*Unmapped.out.mate2** (unmapped and partially mapped reads in fastq format)
 
 <br>
 
@@ -1074,7 +1076,7 @@ grep -E 'gene_biotype "rRNA"|gene_type "rRNA"|gbkey "rRNA"' /path/to/annotation/
 ### Filter out rRNA entries ###
 awk 'NR==FNR {ids[$1]=1; next} !($1 in ids)' \
     *rrna_ensembl_ids.txt \
-    *.genes.results > *_rRNA_removed.genes.results
+    *.genes.results > *_rRNArm.genes.results
 
 ### Count removed rRNA entries ###
 rRNA_count=$(awk 'NR==FNR {ids[$1]=1; next} $1 in ids' \
@@ -1088,7 +1090,7 @@ echo "*: ${rRNA_count} rRNA entries removed." > *_rRNA_counts.txt
 - *rrna_ensembl_ids.txt (file containing list of gene IDs with rRNA features, output from [Step 8di](#8di-extract-rrna-gene-ids-from-gtf))
 
 **Output Data:**
-- **\*rRNA_removed.genes.results** (RSEM gene counts with rRNA entries removed)
+- **\*rRNArm.genes.results** (RSEM gene counts with rRNA entries removed)
 - *rRNA_counts.txt (Summary of number of rRNA entries removed)
 
 <br>
@@ -1099,7 +1101,7 @@ echo "*: ${rRNA_count} rRNA entries removed." > *_rRNA_counts.txt
 
 > Note: DGE Analysis is performed twice with different sets of input files:
 > 1. Using RSEM genes.results files (*genes.results, output from [Step 8a](#8a-count-aligned-reads-with-rsem))
-> 2. Using rRNA-removed RSEM genes.results files (*rRNA_removed.genes.results, output from [Step 8dii](#8dii-filter-rrna-genes-from-rsem-genes-results))
+> 2. Using rRNA-removed RSEM genes.results files (*rRNArm.genes.results, output from [Step 8dii](#8dii-filter-rrna-genes-from-rsem-genes-results))
 
 <br>
 
 
@@ -246,7 +246,7 @@ fastqc -o /path/to/trimmed_fastqc/output/directory *.fastq.gz
 ### 2c. Compile Trimmed Data QC  
 
 ```bash
-multiqc --interactive -n trimmed_multiqc_GLbulkRNAseq -o /path/to/trimmed_multiqc/output/directory /path/to/directory/containing/trimmed_fastqc/files
+multiqc --interactive -n trimmed_multiqc_GLbulkRNAseq -o /path/to/trimmed_multiqc/output/directory /path/to/directory/containing/trimmed_fastqc/files /path/to/directory/containing/trimming_reports
 
 zip -r trimmed_multiqc_GLbulkRNAseq_data.zip trimmed_multiqc_GLbulkRNAseq_data
 ```
@@ -257,10 +257,12 @@ zip -r trimmed_multiqc_GLbulkRNAseq_data.zip trimmed_multiqc_GLbulkRNAseq_data
 - `-n` – prefix name for output files
 - `-o` – the output directory to store results
 - `/path/to/directory/containing/trimmed_fastqc/files` – the directory holding the output data from the fastqc run, provided as a positional argument
+- `/path/to/directory/containing/trimming_reports` – the directory containing the trimming reports from the trim/filter step, provided as a positional argument
 
 **Input Data:**
 
 - *fastqc.zip (FastQC data, output from [Step 2b](#2b-trimmed-data-qc))
+- *trimming_report.txt (trimming report, output from [Step 2a](#2a-trimfilter-raw-data))
 
 **Output Data:**
 
@@ -350,7 +352,9 @@ bowtie2 -x /path/to/bowtie2/index \
 
 - *\.sam (alignments in SAM format)
 - **\*.bowtie2.log** (log file containing alignment statistics)
-- **\*.unmapped.fastq.gz** (unmapped reads in FASTQ format)
+- Unmapped reads (unmapped reads in FASTQ format)
+    - **\*.unmapped.fastq.gz** (single-end)
+    - **\*.unmapped.fastq.1.gz, \*.unmapped.fastq.2.gz** (paired-end)
 
 <br>
 
 
@@ -104,6 +104,8 @@ We recommend installing Singularity on a system wide level as per the associated
 
 > Note: Singularity is also available through [Anaconda](https://anaconda.org/conda-forge/singularity).
 
+> Note: Alternatively, Docker can be used in place of Singularity. See the [Docker CE installation documentation](https://docs.docker.com/engine/install/).
+
 <br>
 
 ---
@@ -150,6 +152,8 @@ export NXF_SINGULARITY_CACHEDIR=$(pwd)/singularity
 While in the location containing the `NF_RCP_2.0.0` directory that was downloaded in [step 2](#2-download-the-workflow-files), you are now able to run the workflow. Below are four examples of how to run the NF_RCP workflow:
 > Note: Nextflow commands use both single hyphen arguments (e.g. -help) that denote general nextflow arguments and double hyphen arguments (e.g. --reference_version) that denote workflow specific parameters.  Take care to use the proper number of hyphens for each argument.
 
+> Note: To use Docker instead of Singularity, use `-profile docker` in the Nextflow run command. Nextflow will automatically pull images as needed.
+
 <br>
 
 #### 4a. Approach 1: Run the workflow on a GeneLab RNAseq dataset with automatic retrieval of reference fasta and gtf files
 
@@ -137,8 +137,8 @@ if (params$microbes) {
         full.names = TRUE
     )
 
-    # Remove "_rRNA_removed" from filenames for matching
-    clean_filenames <- sub("_rRNA_removed", "", basename(files))
+    # Remove "_rRNArm" from filenames for matching
+    clean_filenames <- sub("_rRNArm", "", basename(files))
 
     samples <- rownames(study)
 
@@ -331,7 +331,7 @@ write.csv(
 write.csv(
     VSTCounts,
     file = file.path(params$output_directory,
-                     paste0("VST_Normalized_Counts",
+                     paste0("VST_Counts",
                            params$output_filename_label, params$output_filename_suffix, ".csv"))
 )
 ```
 
@@ -24,12 +24,12 @@ def calculate_md5(filepath):
         return "ERROR"
 
 def is_raw_file(filepath):
-    """Check if file is a raw FASTQ or raw multiqc file."""
+    """Check if file is a raw FASTQ or raw multiqc file (zip or html)."""
     # Match raw fastq files but not trimming reports
     if "/Fastq/" in filepath and filepath.endswith("raw.fastq.gz") and "_raw.fastq.gz_" not in filepath:
         return True
-    # Match raw multiqc reports
-    if "raw_multiqc" in filepath and filepath.endswith(".zip"):
+    # Match raw multiqc reports (zip or html)
+    if "raw_multiqc" in filepath and (filepath.endswith(".zip") or filepath.endswith(".html")):
         return True
     return False
 
@@ -38,11 +38,57 @@ def should_include(filepath, outdir):
     # Skip files in VV_Logs
     if "/VV_Logs/" in filepath:
         return False
-    
     # Skip files in GeneLab except for qc_metrics
     if "/GeneLab/" in filepath and not filepath.endswith("qc_metrics" + args.assay_suffix + ".csv"):
         return False
-        
+    # Skip any files with 'fastqc' in the path or filename (case-insensitive)
+    if 'fastqc' in filepath.lower():
+        return False
+    # STAR: Only keep specific outputs
+    star_keep_patterns = [
+        re.compile(r"_Aligned\.toTranscriptome\.out\.bam$"),
+        re.compile(r"_Log\.final\.out$"),
+        re.compile(r"_SJ\.out\.tab$"),
+        re.compile(r"_Unmapped\.out\.mate1$"),
+        re.compile(r"_Unmapped\.out\.mate2$")
+    ]
+    basename = os.path.basename(filepath)
+    if any(pat.search(basename) for pat in star_keep_patterns):
+        return True
+    # If it's a STAR output but not in the keep list, filter it out
+    star_output_keywords = [
+        "Aligned.sortedByCoord.out.bam", "ReadsPerGene.out.tab", "Log.out", "Log.progress.out"
+    ]
+    if any(keyword in basename for keyword in star_output_keywords):
+        return False
+    # Skip STAR outputs whose basename is exactly one of these fixed names (case-insensitive)
+    fixed_star_files = [
+        "Log.final.out", "SJ.out.tab", "sjdbList.out.tab", "sjdbInfo.txt"
+    ]
+    if basename.lower() in [f.lower() for f in fixed_star_files]:
+        return False
+    # RSeQC: Only keep MultiQC reports (zip or html)
+    if "RSeQC_Analyses" in filepath:
+        if "MultiQC_Reports" not in filepath:
+            return False
+        # In MultiQC_Reports, only allow .zip or .html
+        if not (filepath.endswith('.zip') or filepath.endswith('.html')):
+            return False
+    # RSEM: Only keep .genes.results and .isoforms.results (including _rRNArm variants)
+    if basename.endswith('.genes.results') or basename.endswith('.isoforms.results'):
+        return True
+    rsem_other_patterns = ['.cnt', '.model', '.theta']
+    if any(basename.endswith(ext) for ext in rsem_other_patterns):
+        return False
+    # Skip ISA.zip
+    if filepath.endswith("ISA.zip"):
+        return False
+    # Skip STAR_NumNonZeroGenes_GLbulkRNAseq.csv and RSEM_NumNonZeroGenes_GLbulkRNAseq.csv
+    if os.path.basename(filepath) in [
+        "STAR_NumNonZeroGenes_GLbulkRNAseq.csv",
+        "RSEM_NumNonZeroGenes_GLbulkRNAseq.csv"
+    ]:
+        return False
     return True
 
 def main():
@@ -100,5 +146,20 @@ def main():
     print(f"Added {raw_count} files to {raw_md5_file}")
     print(f"Added {processed_count} files to {processed_md5_file}")
 
+    def dedup_file(filename):
+        seen = set()
+        lines = []
+        with open(filename, 'r') as f:
+            for line in f:
+                key = line.split('\t', 1)[0]  # dedup on the first tab-separated field (presumably the file basename; verify against the writer)
+                if key not in seen:
+                    seen.add(key)
+                    lines.append(line)
+        with open(filename, 'w') as f:
+            f.writelines(lines)
+
+    dedup_file(raw_md5_file)
+    dedup_file(processed_md5_file)
+
 if __name__ == "__main__":
     main()
@@ -277,14 +277,14 @@ def generate_protocol_content(args, software_versions):
 
     # Define organism to annotation package mapping using scientific names
     organism_annotation_packages = {
-        "Arabidopsis thaliana": ("org.At.tair.db", "3.19.1"),
-        "Caenorhabditis elegans": ("org.Ce.eg.db", "3.19.1"),
-        "Drosophila melanogaster": ("org.Dm.eg.db", "3.19.1"),
-        "Danio rerio": ("org.Dr.eg.db", "3.19.1"),
-        "Homo sapiens": ("org.Hs.eg.db", "3.19.1"),
-        "Mus musculus": ("org.Mm.eg.db", "3.19.1"),
-        "Rattus norvegicus": ("org.Rn.eg.db", "3.19.1"),
-        "Saccharomyces cerevisiae": ("org.Sc.sgd.db", "3.19.1")
+        "arabidopsis_thaliana": ("org.At.tair.db", "3.19.1"),
+        "caenorhabditis_elegans": ("org.Ce.eg.db", "3.19.1"),
+        "drosophila_melanogaster": ("org.Dm.eg.db", "3.19.1"),
+        "danio_rerio": ("org.Dr.eg.db", "3.19.1"),
+        "homo_sapiens": ("org.Hs.eg.db", "3.19.1"),
+        "mus_musculus": ("org.Mm.eg.db", "3.19.1"),
+        "rattus_norvegicus": ("org.Rn.eg.db", "3.19.1"),
+        "saccharomyces_cerevisiae": ("org.Sc.sgd.db", "3.19.1")
     }
 
     # List of organisms that use custom annotation packages via AnnotationForge
@@ -318,7 +318,7 @@ def generate_protocol_content(args, software_versions):
     # Format the organism name for lookup
     organism_formatted = ""
     if hasattr(args, 'organism') and args.organism:
-        organism_formatted = args.organism.replace('_', ' ').title()
+        organism_formatted = args.organism.replace(' ', '_').replace('-', '_').lower()
 
     # Build gene annotations sentence
     gene_annotations_text = f"Gene annotations were assigned using the custom annotation tables generated in-house as detailed in GL-DPPD-7110-A (https://github.com/nasa/GeneLab_Data_Processing/blob/master/GeneLab_Reference_Annotations/Pipeline_GL-DPPD-7110_Versions/GL-DPPD-7110-A/GL-DPPD-7110-A.md), with STRINGdb (version {stringdb_version}) and PANTHER.db (version {pantherdb_version})"
 
@@ -86,15 +86,30 @@ def main(osd_num, paired_end, assay_suffix, mode):
         'num_uniquely_aligned', 'pct_uniquely_aligned', 'pct_multi_aligned', 'pct_filtered', 'pct_unalignable'
     ]
 
+    # Make a set of fieldnames for fast lookup
+    fieldnames_set = set(fieldnames)
+
     output_filename = f'qc_metrics{assay_suffix}.csv'
     with open(output_filename, mode='w', newline='') as f:
         writer = csv.DictWriter(f, fieldnames=fieldnames)
         writer.writeheader()
 
         for sample in samples:
-            fields = {k: v for d in [i[sample] for i in multiqc_data if sample in i] for k, v in d.items()}
-
-            writer.writerow({'osd_num': 'OSD-' + osd_num, 'sample': sample, **metadata, **fields})
+            # Collect all fields for this sample
+            all_fields = {}
+            for data_source in multiqc_data:
+                if sample in data_source:
+                    for k, v in data_source[sample].items():
+                        # Only keep fields that are in the fieldnames list
+                        if k in fieldnames_set:
+                            all_fields[k] = v
+                        else:
+                            # Optionally add debug output to see which fields are being skipped
+                            # print(f"Skipping field not in fieldnames: {k}")
+                            pass
+            
+            # Write the row with filtered fields
+            writer.writerow({'osd_num': 'OSD-' + osd_num, 'sample': sample, **metadata, **all_fields})
 
 
 def get_metadata(osd_num):