Skip to content

Commit cf6bde3

Browse files
committed
pipeline doc edits
1 parent d003cae commit cf6bde3

File tree

4 files changed

+47
-44
lines changed

4 files changed

+47
-44
lines changed

RNAseq/Pipeline_GL-DPPD-7101_Versions/GL-DPPD-7101-G.md

Lines changed: 41 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -54,9 +54,9 @@ Software Updates:
5454
| dp_tools | 1.18*, 1.3.4* | 1.3.5 |
5555
| pandas | 1.5.0 | 2.2.3 |
5656
| seaborn | 0.12.0 | 0.13.2 |
57-
| matplotlib | 3.6.0 | 3.8.3 - DOES NOT MATCH VERSION SPECIFIED IN SOFTWARE TABLE BELOW |
58-
| numpy | 1.23.3 | 1.26.4 - DOES NOT MATCH VERSION SPECIFIED IN SOFTWARE TABLE BELOW |
59-
| scipy | 1.9.1 | 1.14.1 - DOES NOT MATCH VERSION SPECIFIED IN SOFTWARE TABLE BELOW |
57+
| matplotlib | 3.6.0 | 3.10.0 |
58+
| numpy | 1.23.3 | 2.2.1 |
59+
| scipy | 1.9.1 | 1.15.1 |
6060

6161
STAR Alignment
6262
- Added unaligned reads FASTQ output file(s) via STAR `--outReadsUnmapped Fastx`:
@@ -227,7 +227,7 @@ zip -r raw_multiqc_GLbulkRNAseq_report.zip raw_multiqc_GLbulkRNAseq_report
227227
- `--interactive` – force reports to use interactive plots
228228
- `-n` – prefix name for output files
229229
- `-o` – the output directory to store results
230-
- `/path/to/directory/containing/raw_fastqc/files` – the directory holding the output data from the fastqc run, provided as a positional argument
230+
- `/path/to/directory/containing/raw_fastqc/files` – the directory holding the output data from the FastQC run, provided as a positional argument
231231

232232
**Input Data:**
233233

@@ -236,8 +236,8 @@ zip -r raw_multiqc_GLbulkRNAseq_report.zip raw_multiqc_GLbulkRNAseq_report
236236
**Output Data:**
237237

238238
* **raw_multiqc_GLbulkRNAseq_report.zip** (zip containing the following)
239-
* **raw_multiqc_GLbulkRNAseq.html** (multiqc output html summary)
240-
* **raw_multiqc_GLbulkRNAseq_data** (directory containing multiqc output data)
239+
* **raw_multiqc_GLbulkRNAseq.html** (MultiQC output html summary)
240+
* **raw_multiqc_GLbulkRNAseq_data** (directory containing MultiQC output data)
241241

242242
<br>
243243

@@ -317,7 +317,7 @@ zip -r trimmed_multiqc_GLbulkRNAseq_report.zip /path/to/trimmed_multiqc/output/t
317317
- `--interactive` – force reports to use interactive plots
318318
- `-n` – prefix name for output files
319319
- `-o` – the output directory to store results
320-
- `/path/to/directory/containing/trimmed_fastqc/files` – the directory holding the output data from the fastqc run, provided as a positional argument
320+
- `/path/to/directory/containing/trimmed_fastqc/files` – the directory holding the output data from the FastQC run, provided as a positional argument
321321

322322
**Input Data:**
323323

@@ -326,8 +326,8 @@ zip -r trimmed_multiqc_GLbulkRNAseq_report.zip /path/to/trimmed_multiqc/output/t
326326
**Output Data:**
327327

328328
* **trimmed_multiqc_GLbulkRNAseq_report.zip** (zip containing the following)
329-
* **trimmed_multiqc_GLbulkRNAseq.html** (multiqc output html summary)
330-
* **trimmed_multiqc_GLbulkRNAseq_data** (directory containing multiqc output data)
329+
* **trimmed_multiqc_GLbulkRNAseq.html** (MultiQC output html summary)
330+
* **trimmed_multiqc_GLbulkRNAseq_data** (directory containing MultiQC output data)
331331

332332
<br>
333333

@@ -495,8 +495,8 @@ zip -r align_multiqc_GLbulkRNAseq_report.zip /path/to/align_multiqc/output/align
495495
**Output Data:**
496496

497497
* **align_multiqc_GLbulkRNAseq_report.zip** (zip containing the following)
498-
* **align_multiqc_GLbulkRNAseq.html** (multiqc output html summary)
499-
* **align_multiqc_GLbulkRNAseq_data** (directory containing multiqc output data)
498+
* **align_multiqc_GLbulkRNAseq.html** (MultiQC output html summary)
499+
* **align_multiqc_GLbulkRNAseq_data** (directory containing MultiQC output data)
500500

501501
<br>
502502

@@ -708,8 +708,8 @@ zip -r infer_exp_multiqc_GLbulkRNAseq_report.zip /path/to/infer_exp_multiqc/outp
708708
**Output Data:**
709709

710710
* **infer_exp_multiqc_GLbulkRNAseq_report.zip** (zip containing the following)
711-
* **infer_exp_multiqc_GLbulkRNAseq.html** (multiqc output html summary)
712-
* **infer_exp_multiqc_GLbulkRNAseq_data** (directory containing multiqc output data)
711+
* **infer_exp_multiqc_GLbulkRNAseq.html** (MultiQC output html summary)
712+
* **infer_exp_multiqc_GLbulkRNAseq_data** (directory containing MultiQC output data)
713713

714714
<br>
715715

@@ -764,8 +764,8 @@ zip -r genebody_cov_multiqc_GLbulkRNAseq_report.zip /path/to/genebody_cov_multiq
764764
**Output Data:**
765765

766766
* **genebody_cov_multiqc_GLbulkRNAseq_report.zip** (zip containing the following)
767-
* **genebody_cov_multiqc_GLbulkRNAseq.html** (multiqc output html summary)
768-
* **genebody_cov_multiqc_GLbulkRNAseq_data** (directory containing multiqc output data)
767+
* **genebody_cov_multiqc_GLbulkRNAseq.html** (MultiQC output html summary)
768+
* **genebody_cov_multiqc_GLbulkRNAseq_data** (directory containing MultiQC output data)
769769

770770
<br>
771771

@@ -776,7 +776,7 @@ inner_distance.py -r /path/to/annotation/BED/file \
776776
-i /path/to/*Aligned.sortedByCoord_sorted.out.bam \
777777
-k 15000000 \
778778
-l -(max read length) \
779-
-u 350 \ [SHOULD WE INCREASE THIS VALUE?]
779+
-u 350 \
780780
-o /path/to/inner_distance/output/directory
781781
```
782782

@@ -826,8 +826,8 @@ zip -r inner_dist_multiqc_GLbulkRNAseq_report.zip /path/to/align_multiqc/output/
826826
**Output Data:**
827827

828828
* **inner_dist_multiqc_GLbulkRNAseq_report.zip** (zip containing the following)
829-
* **inner_dist_multiqc_GLbulkRNAseq.html** (multiqc output html summary)
830-
* **inner_dist_multiqc_GLbulkRNAseq_data** (directory containing multiqc output data)
829+
* **inner_dist_multiqc_GLbulkRNAseq.html** (MultiQC output html summary)
830+
* **inner_dist_multiqc_GLbulkRNAseq_data** (directory containing MultiQC output data)
831831

832832
<br>
833833

@@ -879,8 +879,8 @@ zip -r read_dist_multiqc_GLbulkRNAseq_report.zip /path/to/read_dist_multiqc/outp
879879
**Output Data:**
880880

881881
* **read_dist_multiqc_GLbulkRNAseq_report.zip** (zip containing the following)
882-
* **read_dist_multiqc_GLbulkRNAseq.html** (multiqc output html summary)
883-
* **read_dist_multiqc_GLbulkRNAseq_data** (directory containing multiqc output data)
882+
* **read_dist_multiqc_GLbulkRNAseq.html** (MultiQC output html summary)
883+
* **read_dist_multiqc_GLbulkRNAseq_data** (directory containing MultiQC output data)
884884

885885
<br>
886886

@@ -996,8 +996,8 @@ zip -r RSEM_count_multiqc_GLbulkRNAseq_report.zip /path/to/raw_multiqc/output/RS
996996
**Output Data:**
997997

998998
* **RSEM_count_multiqc_GLbulkRNAseq_report.zip** (zip containing the following)
999-
* **RSEM_count_multiqc_GLbulkRNAseq.html** (multiqc output html summary)
1000-
* **RSEM_count_multiqc_GLbulkRNAseq_data** (directory containing multiqc output data)
999+
* **RSEM_count_multiqc_GLbulkRNAseq.html** (MultiQC output html summary)
1000+
* **RSEM_count_multiqc_GLbulkRNAseq_data** (directory containing MultiQC output data)
10011001

10021002
<br>
10031003

@@ -1101,7 +1101,7 @@ echo "*: ${rRNA_count} rRNA entries removed." > *_rRNA_counts.txt
11011101

11021102
## 9. Normalize Read Counts and Perform Differential Gene Expression Analysis
11031103

1104-
> **Note:** DGE Analysis is performed twice with different sets of input files:
1104+
> Note: DGE Analysis is performed twice with different sets of input files:
11051105
> 1. Using RSEM genes.results files (*genes.results, output from [Step 8a](#8a-count-aligned-reads-with-rsem))
11061106
> 2. Using rRNA-removed RSEM genes.results files (*rRNA_removed.genes.results, output from [Step 8dii](#8dii-filter-rrna-genes-from-rsem-genes-results))
11071107
@@ -1219,11 +1219,13 @@ setwd(file.path(work_dir))
12191219
### Pull all factors for each sample in the study from the runsheet created in Step 9a ###
12201220

12211221
compare_csv_from_runsheet <- function(runsheet_path) {
1222-
df = read.csv(runsheet_path)
1223-
# get only Factor Value columns
1224-
factors = as.data.frame(df[,grep("Factor.Value", colnames(df), ignore.case=TRUE)])
1225-
colnames(factors) = paste("factor",1:dim(factors)[2], sep= "_")
1226-
result = data.frame(sample_id = df[,c("Sample.Name")], factors)
1222+
df <- read.csv(runsheet_path)
1223+
factors <- df %>%
1224+
select(matches("Factor.Value", ignore.case = TRUE)) %>%
1225+
rename_with(~ paste0("factor_", seq_along(.)))
1226+
result <- df %>%
1227+
select(sample_id = Sample.Name) %>%
1228+
bind_cols(factors)
12271229
return(result)
12281230
}
12291231

@@ -1235,20 +1237,19 @@ compare_csv <- compare_csv_from_runsheet(runsheet_path)
12351237

12361238
### Create data frame containing all samples and respective factors ###
12371239

1238-
study <- as.data.frame(compare_csv[,2:dim(compare_csv)[2]])
1239-
colnames(study) <- colnames(compare_csv)[2:dim(compare_csv)[2]]
1240-
rownames(study) <- compare_csv[,1]
1240+
study <- compare_csv[, -1, drop=FALSE] # Exclude sample_id
1241+
rownames(study) <- compare_csv$sample_id
12411242

12421243

12431244
### Format groups and indicate the group that each sample belongs to ###
12441245

1245-
if (dim(study) >= 2){
1246-
group<-apply(study,1,paste,collapse = " & ") ## concatenate multiple factors into one condition per sample
1247-
} else{
1248-
group<-study[,1]
1246+
group <- if (ncol(study) >= 2) {
1247+
apply(study, 1, paste, collapse = " & ")
1248+
} else {
1249+
study[[1]]
12491250
}
1250-
group_names <- paste0("(",group,")",sep = "") ## human readable group names
1251-
group <- sub("^BLOCKER_", "", make.names(paste0("BLOCKER_", group))) # group naming compatible with R models, this maintains the default behaviour of make.names with the exception that 'X' is never prepended to group names
1251+
group_names <- paste0("(", group, ")") ## human readable group names
1252+
group <- sub("^BLOCKER_", "", make.names(paste0("BLOCKER_", group))) # group naming compatible with R models, this maintains the default behaviour of make.names with the exception that 'X' is never prepended to group names
12521253
names(group) <- group_names
12531254
rm(group_names)
12541255

@@ -1492,6 +1493,9 @@ write.csv(txi.rsem$counts,
14921493
write.csv(normCounts,
14931494
file.path(norm_output, "Normalized_Counts_GLbulkRNAseq.csv"))
14941495

1496+
write.csv(VSTCounts,
1497+
file.path(norm_output, "VST_Counts_GLbulkRNAseq.csv"))
1498+
14951499
### Export sample grouping and contrasts tables ###
14961500
write.csv(sampleTable,
14971501
file.path(DGE_output, "SampleTable_GLbulkRNAseq.csv"))

RNAseq/Workflow_Documentation/NF_RCP/CHANGELOG.md

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -42,9 +42,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
4242
- dp_tools 1.3.5
4343
- pandas 2.2.3
4444
- seaborn 0.13.2
45-
- matplotlib 3.8.3
46-
- numpy 1.26.4
47-
- scipy 1.14.1
45+
- matplotlib 3.10.0
46+
- numpy 2.2.1
47+
- scipy 1.15.1
4848
- Updated [Ensembl Reference Files](../../../GeneLab_Reference_Annotations/Pipeline_GL-DPPD-7110_Versions/GL-DPPD-7110-A/GL-DPPD-7110-A_annotations.csv) now use:
4949
- Animals: Ensembl release 112
5050
- Plants: Ensembl plants release 59

RNAseq/Workflow_Documentation/NF_RCP/workflow_code/bin/deseq2_dge.Rmd

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -180,8 +180,8 @@ if (params$microbes) {
180180
files <- files[reordering]
181181
names(files) <- samples
182182
txi.rsem <- tximport(files, type = "rsem", txIn = FALSE, txOut = FALSE)
183-
if ((dim(txi.rsem$counts)[2] == nrow(study)) == FALSE) {
184-
stop(sprintf("Assert statement: '%s' was False: Sample count mismatch after comparing imported gene results and runsheet", deparse(quote(dim(txi.rsem$counts)[2] == nrow(study)))))
183+
if (dim(txi.rsem$counts)[2] != nrow(study)) {
184+
stop("Sample count mismatch between imported gene results and runsheet")
185185
}
186186
## Add 1 to genes with lengths of zero - needed to make DESeqDataSet object
187187
print(sprintf("DEBUG: %s: Converting %d zero length genes to 1-length of %d genes (%f %% total)", Sys.time(), length(txi.rsem$length[txi.rsem$length == 0]), length(txi.rsem$length), length(txi.rsem$length[txi.rsem$length == 0])/length(txi.rsem$length)))

RNAseq/Workflow_Documentation/NF_RCP/workflow_code/modules/rseqc.nf

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -64,10 +64,9 @@ process INNER_DISTANCE {
6464

6565
script:
6666
def log_fname = "${ meta.id }.inner_distance_freq.txt"
67-
def max_length = Math.max(150, max_read_length)
6867

6968
"""
70-
inner_distance.py -r ${ genome_bed } -i ${ bam_file } -k ${ params.rseqc_sample_count } -l -${ max_length } -u 350 -o ${ meta.id }
69+
inner_distance.py -r ${ genome_bed } -i ${ bam_file } -k ${ params.rseqc_sample_count } -l -${ max_read_length } -u 350 -o ${ meta.id }
7170
7271
# VERSIONS
7372
echo '"${task.process}":' > versions.yml

0 commit comments

Comments
 (0)