nasa
diff --git a/‎RNAseq/Pipeline_GL-DPPD-7101_Versions/GL-DPPD-7101-G.md
Lines changed: 9 additions & 5 deletions b/‎RNAseq/Pipeline_GL-DPPD-7101_Versions/GL-DPPD-7101-G.md
Lines changed: 9 additions & 5 deletions
diff --git a/‎RNAseq/Pipeline_GL-DPPD-7115_Versions/GL-DPPD-7115.md
Lines changed: 6 additions & 3 deletions b/‎RNAseq/Pipeline_GL-DPPD-7115_Versions/GL-DPPD-7115.md
Lines changed: 6 additions & 3 deletions
diff --git a/‎RNAseq/Workflow_Documentation/NF_RCP/CHANGELOG.md
Lines changed: 12 additions & 4 deletions b/‎RNAseq/Workflow_Documentation/NF_RCP/CHANGELOG.md
Lines changed: 12 additions & 4 deletions
diff --git a/‎RNAseq/Workflow_Documentation/NF_RCP/workflow_code/bin/add_gene_annotations.Rmd
Lines changed: 11 additions & 7 deletions b/‎RNAseq/Workflow_Documentation/NF_RCP/workflow_code/bin/add_gene_annotations.Rmd
Lines changed: 11 additions & 7 deletions
diff --git a/‎RNAseq/Workflow_Documentation/NF_RCP/workflow_code/bin/generate_protocol.py
Lines changed: 76 additions & 9 deletions b/‎RNAseq/Workflow_Documentation/NF_RCP/workflow_code/bin/generate_protocol.py
Lines changed: 76 additions & 9 deletions
diff --git a/‎RNAseq/Workflow_Documentation/NF_RCP/workflow_code/bin/software_versions.py
Lines changed: 10 additions & 3 deletions b/‎RNAseq/Workflow_Documentation/NF_RCP/workflow_code/bin/software_versions.py
Lines changed: 10 additions & 3 deletions
diff --git a/‎RNAseq/Workflow_Documentation/NF_RCP/workflow_code/bin/sort_into_subdirectories.py
Lines changed: 2 additions & 1 deletion b/‎RNAseq/Workflow_Documentation/NF_RCP/workflow_code/bin/sort_into_subdirectories.py
Lines changed: 2 additions & 1 deletion
diff --git a/‎RNAseq/Workflow_Documentation/NF_RCP/workflow_code/bin/sort_into_subdirectories_by_sample.py
Lines changed: 0 additions & 29 deletions b/‎RNAseq/Workflow_Documentation/NF_RCP/workflow_code/bin/sort_into_subdirectories_by_sample.py
Lines changed: 0 additions & 29 deletions
@@ -59,9 +59,7 @@ Software Updates:
 | scipy             | 1.9.1         | 1.15.1  |
 
 STAR Alignment
-- Added unaligned reads FASTQ output file(s) via STAR `-outReadsUnmapped Fastq`:
-  - {sample}_Unmapped.out.mate1
-  - {sample}_Unmapped.out.mate2
+- Added unaligned reads FASTQ output file(s) via STAR `--outReadsUnmapped Fastx`
 
 RSeQC Analysis
 - Updated inner_distance.py invocation to use a lower minimum value to account for longer read lengths
@@ -420,9 +418,14 @@ STAR --twopassMode Basic \
  --outSAMheaderHD @HD VN:1.4 SO:coordinate \
  --outFileNamePrefix /path/to/STAR/output/directory/<sample_id> \
  --outReadsUnmapped Fastx \
+ --genomeLoad NoSharedMemory \
  --readFilesIn /path/to/trimmed_forward_reads \
  /path/to/trimmed_reverse_reads # only needed for PE studies
 
+mv <sample_id>_Unmapped.out.mate1 <sample_id>_R1_unmapped.fastq  # Only needed for PE studies
+mv <sample_id>_Unmapped.out.mate2 <sample_id>_R2_unmapped.fastq  # Only needed for PE studies
+# mv <sample_id>_Unmapped.out.mate1 <sample_id>_unmapped.fastq    # Only needed for SE studies
+gzip *_unmapped.fastq
 ```
 
 **Parameter Definitions:**
@@ -448,7 +451,8 @@ STAR --twopassMode Basic \
 - `--quantMode` – specifies the type(s) of quantification desired; the `TranscriptomeSAM` option instructs STAR to output a separate sam/bam file containing alignments to the transcriptome and the `GeneCounts` option instructs STAR to output a tab delimited file containing the number of reads per gene
 - `--outSAMheaderHD` – indicates a header line for the sam/bam file
 - `--outFileNamePrefix` – specifies the path to and prefix for the output file names; for GeneLab the prefix is the sample id
-- `outReadsUnmapped` - specifies how to output unmapped and partially mapped reads (where only one mate of a paired-end read is mapped); the `Fastx` option outputs unmapped reads in separate fasta/fastq files named Unmapped.out.mate1 and Unmapped.out.mate2
+- `--outReadsUnmapped` – specifies how to output unmapped and partially mapped reads (where only one mate of a paired-end read is mapped); the `Fastx` option outputs unmapped reads in separate fastq files
+- `--genomeLoad` – controls how the genome index is loaded into memory; `NoSharedMemory` specifies that each job will have its own private copy of the genome rather than using shared memory. This is the only option compatible with `--twopassMode Basic`.
 - `--readFilesIn` – path to input read 1 (forward read) and read 2 (reverse read); for paired-end reads, read 1 and read 2 should be separated by a space; for single-end reads only read 1 should be indicated
 
 **Input Data:**
@@ -473,7 +477,7 @@ STAR --twopassMode Basic \
   - SJ.out.tab
 - *_STARtmp (directory containing the following:)
   - BAMsort (directory containing subdirectories that are empty – this was the location for temp files that were automatically removed after successful completion)
-- **\*Unmapped.out.mate1, \*Unmapped.out.mate2** (unmapped and partially mapped reads in fastq format)
+- **\*unmapped.fastq.gz** (unmapped and partially mapped reads)
 
 <br>
 
 
@@ -327,6 +327,11 @@ bowtie2 -x /path/to/bowtie2/index \
  # --un-gz <sample_id>.unmapped.fastq.gz \     # For single-end data
  -S /path/to/bowtie2/output/directory/<sample_id>.sam \
  2> /path/to/bowtie2/output/directory/<sample_id>.bowtie2.log
+
+# Rename unmapped reads
+mv <sample_id>.unmapped.fastq.1.gz <sample_id>_R1_unmapped.fastq.gz  # For paired-end data
+mv <sample_id>.unmapped.fastq.2.gz <sample_id>_R2_unmapped.fastq.gz  # For paired-end data
+# mv <sample_id>.unmapped.fastq.gz <sample_id>_unmapped.fastq.gz      # For single-end data
 ```
 
 **Parameter Definitions:**
@@ -352,9 +357,7 @@ bowtie2 -x /path/to/bowtie2/index \
 
 - *\.sam (alignments in SAM format)
 - **\*.bowtie2.log** (log file containing alignment statistics)
-- Unmapped reads (unmapped reads in FASTQ format)
-    - **\*.unmapped.fastq.gz** (single-end)
-    -  **\*.unmapped.fastq.1.gz, .unmapped.fastq.2.gz** (paired-end)
+- **\*unmapped.fastq.gz** (unmapped and partially mapped reads)
 
 <br>
 
 
@@ -20,6 +20,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
   - Separate results are generated for rRNA-removed DGE analysis, with new output directories:  
     - `04-DESeq2_NormCounts_rRNArm/`  
     - `05-DESeq2_DGE_rRNArm/`
+- Added reference table support for Pseudomonas aeruginosa [#37](https://github.com/nasa/GeneLab_Data_Processing/issues/37)
+- Added V&V check for adapter content removal using FastQC/MultiQC reports from trimmed reads [#42](https://github.com/nasa/GeneLab_Data_Processing/issues/42)
+- Added generation of a CSV file summarizing parsed metrics from tool logs and MultiQC reports [#84](https://github.com/nasa/GeneLab_Data_Processing/issues/84)
 
 ### Changed
 
@@ -55,14 +58,19 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
   - Bacteria: Ensembl bacteria release 59
 - Added "_GLbulkRNAseq" suffix to output files
 - RSeQC inner_distance minimum value now dynamically set based on read length
-- DESeq2 analysis now handles technical replicates
+- DESeq2 analysis now handles technical replicates [#32](https://github.com/nasa/GeneLab_Data_Processing/issues/32)
 - MultiQC reports replaced with separate data zip and html files
+- Increased default memory allocation for the STAR alignment process to 40GB [#36](https://github.com/nasa/GeneLab_Data_Processing/issues/36)
+
+### Fixed
+
+- DGE validation script (`vv_dge_deseq2.py`) error with all-integer sample names [#112](https://github.com/nasa/GeneLab_Data_Processing/issues/112)
+- The `--accession` parameter (formerly `--gldsAccession`) is now optional for runsheet-based workflows; if omitted, outputs default to the 'results' directory [#35](https://github.com/nasa/GeneLab_Data_Processing/issues/35)
 
 ### Removed
 
 - ERCC-normalized DGE analysis and associated output files
-- GeneLab visualization output tables
-
+- GeneLab visualization output tables [#41](https://github.com/nasa/GeneLab_Data_Processing/issues/41)
 
 ## [1.0.4](https://github.com/nasa/GeneLab_Data_Processing/tree/NF_RCP-F_1.0.4/RNAseq/Workflow_Documentation/NF_RCP-F) - 2024-02-08
 
@@ -76,7 +84,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Changed
 
-- TrimGalore! will now use autodetect for adaptor type
+- TrimGalore! will now use autodetect for adaptor type [#20](https://github.com/nasa/GeneLab_Data_Processing/issues/20)
 - V&V migrated from dp_tools version 1.1.8 to 1.3.4 including:
   - Migration of V&V protocol code to this codebase instead of dp_tools
   - Fix for sample wise checks reusing same sample
 
@@ -35,13 +35,17 @@ suppressMessages(library(tibble))
 ```{r, load-annotation-table}
 ### Read in annotation table for the appropriate organism ###
 
-annot <- read.table(
-            params$annotation_file_path,
-            sep = "\t",
-            header = TRUE,
-            quote = "",
-            comment.char = "",
-        )
+if (is.null(params$annotation_file_path) || params$annotation_file_path == "" || params$annotation_file_path == "null") {
+    annot <- tibble::tibble()
+} else {
+    annot <- read.table(
+        params$annotation_file_path,
+        sep = "\t",
+        header = TRUE,
+        quote = "",
+        comment.char = ""
+    )
+}
 ```
 
 ```{r, load-table-to-annotate}
 
@@ -1,4 +1,4 @@
-#!/usr/bin/env python3
+#!/usr/bin/env python
 """
 This script generates a protocol text file for GeneLab RNA-seq data processing.
 It reads software versions from a YAML file and incorporates other parameters.
@@ -9,6 +9,8 @@
 import os
 import sys
 from datetime import datetime
+import pandas as pd
+import re
 
 def parse_args():
     parser = argparse.ArgumentParser(description='Generate protocol file for GeneLab RNA-seq pipeline')
@@ -38,6 +40,8 @@ def parse_args():
                         help='Path to the reference genome FASTA file')
     parser.add_argument('--reference_gtf', required=True,
                         help='Path to the reference genome GTF file')
+    parser.add_argument('--runsheet', required=False,
+                        help='Path to the runsheet CSV file')
     return parser.parse_args()
 
 def read_software_versions(yaml_file):
@@ -267,8 +271,54 @@ def generate_protocol_content(args, software_versions):
         # For standard workflow (include tximport)
         description += f"The runsheet was generated with dp_tools (version {dp_tools_version}) and the runsheet and quantification data were imported to R (version {r_version}) with tximport (version {tximport_version}) and normalized with DESeq2 (version {deseq2_version}) median of ratios method. "
 
+    # Parse runsheet for technical replicate handling
+    tech_rep_sentence = ""
+    if hasattr(args, 'runsheet') and args.runsheet and os.path.exists(args.runsheet):
+        try:
+            runsheet_df = pd.read_csv(args.runsheet)
+            if 'Sample Name' in runsheet_df.columns:
+                # Remove whitespace and NA
+                sample_names = runsheet_df['Sample Name'].dropna().astype(str).tolist()
+                # Remove trailing/leading whitespace
+                sample_names = [s.strip() for s in sample_names]
+                # Remove empty
+                sample_names = [s for s in sample_names if s]
+                # Find base names (remove _techrepN if present)
+                base_names = [re.sub(r'_techrep\d+$', '', s) for s in sample_names]
+                from collections import Counter
+                base_counts = Counter(base_names)
+                n_reps = list(base_counts.values())
+                unique_n = set(n_reps)
+                if all(x == 1 for x in n_reps):
+                    # No technical replicates at all
+                    tech_rep_sentence = ""
+                elif len(unique_n) == 1 and list(unique_n)[0] > 1:
+                    # All samples have the same number of tech reps
+                    tech_rep_sentence = ("Counts from all technical replicates for each sample were summed using DESeq2's collapseReplicates function. "
+                                         "These collapsed counts were then used for count normalization and differential expression analysis. ")
+                elif len(unique_n) > 1 and min(unique_n) > 1:
+                    # All samples have tech reps, but unequal number
+                    tech_rep_sentence = ("For each sample, counts from the first n technical replicates were summed using DESeq2's collapseReplicates function. "
+                                         "These collapsed counts were then used for count normalization and differential expression analysis. ")
+                else:
+                    # Some samples have tech reps, some don't
+                    tech_rep_sentence = ("For samples with technical replicates, only the first replicate was used for count normalization and differential expression analysis. ")
+        except Exception as e:
+            tech_rep_sentence = ""
+    # If no runsheet, leave tech_rep_sentence as empty
+    
+    # Add ERCC normalization sentence if ERCC spike-ins were used
+    if args.has_ercc.lower() == "true":
+        description += ("The data were normalized twice, each time using a different size factor. "
+                        "The first used non-ERCC genes for size factor estimation, and the second used only ERCC group B genes to estimate the size factor. "
+                        "Both sets of normalized gene counts were subject to differential expression analysis. ")
+    else:
+        description += "Normalized gene counts were subject to differential expression analysis. "
+    # Add tech rep sentence
+    if tech_rep_sentence:
+        description += tech_rep_sentence
     # Add differential expression analysis sentence
-    description += f"Normalized gene counts were subject to differential expression analysis. Differential expression analysis was performed in R (version {r_version}) using DESeq2 (version {deseq2_version}); all groups were compared pairwise using the Wald test and the likelihood ratio test was used to generate the F statistic p-value. "
+    description += f"Differential expression analysis was performed in R (version {r_version}) using DESeq2 (version {deseq2_version}); all groups were compared pairwise using the Wald test and the likelihood ratio test was used to generate the F statistic p-value. "
 
     # Add gene annotations section
     # Define versions for annotation packages
@@ -321,15 +371,32 @@ def generate_protocol_content(args, software_versions):
         organism_formatted = args.organism.replace(' ', '_').replace('-', '_').lower()
 
     # Build gene annotations sentence
-    gene_annotations_text = f"Gene annotations were assigned using the custom annotation tables generated in-house as detailed in GL-DPPD-7110-A (https://github.com/nasa/GeneLab_Data_Processing/blob/master/GeneLab_Reference_Annotations/Pipeline_GL-DPPD-7110_Versions/GL-DPPD-7110-A/GL-DPPD-7110-A.md), with STRINGdb (version {stringdb_version}) and PANTHER.db (version {pantherdb_version})"
-    
-    # Add organism-specific annotation package if it's an officially supported organism
+    annotation_sources = []
+    # Only include STRINGdb if not in organisms_without_string
+    if organism_formatted not in organisms_without_string:
+        annotation_sources.append(f"STRINGdb (version {stringdb_version})")
+    # Only include PANTHER.db if not in organisms_without_panther
+    if organism_formatted not in organisms_without_panther:
+        annotation_sources.append(f"PANTHER.db (version {pantherdb_version})")
+    # Custom annotation package
+    custom_pkg = None
+    if hasattr(args, 'organism') and args.organism and args.organism in organisms_with_custom_annotations:
+        custom_pkg = "a custom annotation package generated in-house using AnnotationForge"
+        annotation_sources.append(custom_pkg)
+    # org.*.eg.db package
     if organism_formatted and organism_formatted in organism_annotation_packages:
         package_name, package_version = organism_annotation_packages[organism_formatted]
-        gene_annotations_text += f", and {package_name} (version {package_version})"
-    
-    # Complete the gene annotations sentence
-    gene_annotations_text += "."
+        annotation_sources.append(f"{package_name} (version {package_version})")
+    # Build the sentence
+    base_text = ("Gene annotations were assigned using the custom annotation tables generated in-house as detailed in GL-DPPD-7110-A "
+                 "(https://github.com/nasa/GeneLab_Data_Processing/blob/GL_RefAnnotTable-A_1.1.0/GeneLab_Reference_Annotations/Pipeline_GL-DPPD-7110_Versions/GL-DPPD-7110-A/GL-DPPD-7110-A.md)")
+    if annotation_sources:
+        if len(annotation_sources) == 1:
+            gene_annotations_text = f"{base_text}, with {annotation_sources[0]}."
+        else:
+            gene_annotations_text = f"{base_text}, with {', '.join(annotation_sources[:-1])}, and {annotation_sources[-1]}."
+    else:
+        gene_annotations_text = f"{base_text}."
     description += gene_annotations_text
 
     # Add ERCC assessment sentence if ERCC spike-ins were used
 
@@ -22,6 +22,7 @@
 
 CONFIG = {
     "rnaseq": [
+        ["NF_RCP", "https://github.com/nasa/GeneLab_Data_Processing/tree/master/RNAseq"],
         ["Nextflow", "https://github.com/nextflow-io/nextflow"],
         ["dp_tools", "https://github.com/J-81/dp_tools"],
         ["FastQC", "https://www.bioinformatics.babraham.ac.uk/projects/fastqc/"],
@@ -74,12 +75,16 @@ def compare_versions(v1, v2):
         # Fallback for non-standard version strings
         return str(v1) > str(v2)
 
-def main(versions_json_path: Path, output_path: Path, assay: str = 'rnaseq'):
+def main(versions_json_path: Path, output_path: Path, assay: str = 'rnaseq', workflow: str = None, workflow_version: str = None):
     software_urls = {name: url for name, url in CONFIG[assay]}
     known_names = CONFIG[assay]
 
     processed_versions = {}
 
+    # Add workflow version if provided
+    if workflow and workflow_version:
+        processed_versions[workflow] = workflow_version
+    
     with versions_json_path.open() as f:
         data = yaml.safe_load(f)
         # Flatten nested structure
@@ -155,7 +160,9 @@ def main(versions_json_path: Path, output_path: Path, assay: str = 'rnaseq'):
     @click.argument('input', type=click.Path(exists=True))
     @click.argument('output', type=click.Path())
     @click.option('--assay', type=click.Choice(['rnaseq']), default='rnaseq')
-    def cli(input, output, assay):
-        main(Path(input), Path(output), assay)
+    @click.option('--workflow', type=str, help='Workflow name')
+    @click.option('--workflow_version', type=str, help='Workflow version')
+    def cli(input, output, assay, workflow, workflow_version):
+        main(Path(input), Path(output), assay, workflow, workflow_version)
 
     cli()
@@ -24,13 +24,14 @@
 # For a given directory, sort all files into {sample: str, [files: str]}
 files_by_sample = dict()
 for sample in samples:
+    sample = str(sample)  # Add this line before the path is constructed
     pattern = f"{sample}{args.glob_suffix}"
     print(f"Looking for files matching: {pattern}")
     files_for_this_sample = list(Path(args.from_dir).glob(pattern))
 
     # Move files
     for file in files_for_this_sample:
-        dest = Path(args.to_dir) / sample / file.name
+        dest = Path(args.to_dir) / str(sample) / file.name
         print(f"Moving {file} to {dest}")
         dest.parent.mkdir( parents=True, exist_ok=True )
         shutil.move(file, dest)