Skip to content

Commit 6471d7a

Browse files
committed
clean up dge step
1 parent 7ecd205 commit 6471d7a

File tree

2 files changed

+27
-30
lines changed

2 files changed

+27
-30
lines changed

RNAseq/Workflow_Documentation/NF_RCP/workflow_code/bin/deseq2_dge.Rmd renamed to RNAseq/Workflow_Documentation/NF_RCP/workflow_code/bin/dge_deseq2.Rmd

Lines changed: 24 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -115,16 +115,16 @@ rm(contrast.names)
115115
```
116116

117117
### 4. Load Gene Counts
118-
```{r load-gene-counts }
119-
##### Import Bowtie 2 or RSEM raw (gene) count data #####
118+
```{r load-gene-counts}
119+
##### Import FeatureCounts or RSEM count data #####
120120
if (params$microbes) {
121121
# For microbes, look for FeatureCounts TSV file
122122
if (!file.exists(params$input_counts)) {
123123
stop(paste("FeatureCounts file not found at:", params$input_counts))
124124
}
125125
# Load featureCounts data with tab separator
126126
featurecounts <- read.csv(params$input_counts, header = TRUE, sep = "\t", skip = 1)
127-
# Create counts matrix: remove metadata columns from featurecounts table, remove possible .bam from column names
127+
# Create counts matrix: remove metadata columns, remove possible .bam from column names
128128
row.names(featurecounts) <- gsub("-", ".", featurecounts$Geneid)
129129
counts <- featurecounts[,-c(1:6)]
130130
colnames(counts) <- gsub("\\.bam$", "", colnames(counts))
@@ -138,36 +138,45 @@ if (params$microbes) {
138138
full.names = TRUE
139139
)
140140
samples = rownames(study)
141-
# Reorder files based on sample names without specifying "Rsem_gene_counts/" folder
141+
# Reorder files based on sample names
142142
# Match each sample by the literal suffix "<sample>.genes.results" (not a regex,
# so "." or other metacharacters in sample names are taken verbatim) and require
# exactly one file per sample instead of letting sapply() return a list/matrix.
reordering <- vapply(samples, function(x) {
  hits <- which(endsWith(files, paste0(x, ".genes.results")))
  if (length(hits) != 1) {
    stop(sprintf("Expected exactly one '%s.genes.results' file, found %d", x, length(hits)))
  }
  hits
}, integer(1))
143143
files <- files[reordering]
144144
names(files) <- samples
145145
txi.rsem <- tximport(files, type = "rsem", txIn = FALSE, txOut = FALSE)
146146
if (dim(txi.rsem$counts)[2] != nrow(study)) {
147147
stop("Sample count mismatch between imported gene results and runsheet")
148148
}
149-
## Add 1 to genes with lengths of zero - needed to make DESeqDataSet object
150-
print(sprintf("DEBUG: %s: Converting %d zero length genes to 1-length of %d genes (%f %% total)", Sys.time(), length(txi.rsem$length[txi.rsem$length == 0]), length(txi.rsem$length), length(txi.rsem$length[txi.rsem$length == 0])/length(txi.rsem$length)))
151-
txi.rsem$length[txi.rsem$length == 0] <- 1
152149
}
150+
##### Apply debug options if enabled #####
153151
if (params$DEBUG_MODE_LIMIT_GENES) {
154152
if (params$microbes) {
155-
counts <- counts[, 1:150]
156-
print(sprintf("DEBUG: %s: Limiting analysis to last 150 genes", Sys.time()))
153+
# head() keeps at most the first 150 genes; unlike counts[1:150, ] it does not
# pad the table with NA rows when fewer than 150 genes are present.
counts <- head(counts, 150)
157154
} else {
158-
txi.rsem$counts <- txi.rsem$counts[, 1:150]
159-
print(sprintf("DEBUG: %s: Limiting analysis to last 150 genes", Sys.time()))
155+
# Trim every per-gene matrix of the tximport list, not only counts, so that
# abundance, counts and length stay row-aligned after limiting to 150 genes.
# NOTE(review): presumably required by the DESeqDataSet construction that
# consumes txi.rsem later in this file - confirm against that chunk.
txi.rsem[c("abundance", "counts", "length")] <- lapply(
  txi.rsem[c("abundance", "counts", "length")],
  head,
  n = 150
)
160156
}
157+
print(sprintf("DEBUG: %s: Limiting analysis to first 150 genes", Sys.time()))
161158
}
162159
if (params$DEBUG_MODE_ADD_DUMMY_COUNTS) {
163160
set.seed(1)
164161
if (params$microbes) {
165-
counts <- counts + matrix(sample( 0:5000, NROW(counts)*NCOL(counts), replace=TRUE),nrow=NROW(counts))
162+
counts <- counts + matrix(
163+
sample(0:5000, NROW(counts)*NCOL(counts), replace=TRUE),
164+
nrow=NROW(counts)
165+
)
166166
} else {
167-
txi.rsem$counts <- txi.rsem$counts + matrix(sample( 0:5000, NROW(txi.rsem$counts)*NCOL(txi.rsem$counts), replace=TRUE),nrow=NROW(txi.rsem$counts))
167+
txi.rsem$counts <- txi.rsem$counts + matrix(
168+
sample(0:5000, NROW(txi.rsem$counts)*NCOL(txi.rsem$counts), replace=TRUE),
169+
nrow=NROW(txi.rsem$counts)
170+
)
168171
}
169172
print(sprintf("DEBUG: %s: Replacing original counts with random values from 0 to 5000", Sys.time()))
170173
}
174+
if (params$microbes) {
175+
} else {
176+
## Add 1 to genes with lengths of zero - needed to make DESeqDataSet object
177+
# Log how many zero-length genes are about to be set to length 1. The share is
# multiplied by 100 so the value actually is the percentage the message labels it as.
print(sprintf(
  "DEBUG: %s: Converting %d zero length genes to 1-length of %d genes (%f %% total)",
  Sys.time(),
  length(txi.rsem$length[txi.rsem$length == 0]),
  length(txi.rsem$length),
  100 * length(txi.rsem$length[txi.rsem$length == 0]) / length(txi.rsem$length)
))
178+
txi.rsem$length[txi.rsem$length == 0] <- 1
179+
}
171180
```
172181

173182
```{r create-sample-table}
@@ -281,12 +290,6 @@ dds <- DESeq(dds, parallel = TRUE, BPPARAM = BPPARAM)
281290
```{r output-counts-related-files}
282291
normCounts <- as.data.frame(counts(dds, normalized = TRUE))
283292
VSTCounts <- as.data.frame(assay(vst(dds)))
284-
unnormalized_counts_filename <- if (params$microbes) {
285-
"FeatureCounts_Unnormalized_Counts_GLbulkRNAseq.csv"
286-
} else {
287-
"RSEM_Unnormalized_Counts_GLbulkRNAseq.csv"
288-
}
289-
normalized_counts_filename <- paste0("Normalized_Counts", params$output_filename_suffix, ".csv")
290293
write.csv(
291294
if (params$microbes) {
292295
counts
@@ -312,12 +315,7 @@ write.csv(
312315

313316
```{r prep-counts-for-dge}
314317
## Add 1 to all counts to avoid issues with log transformation
315-
print(sprintf("DEBUG: %s Printing head of: '%s' below", Sys.time(), 'normCounts'))
316-
print(head(normCounts), quote = TRUE)
317-
print(sprintf("DEBUG: %s: Adding 1 to all normalized counts to avoid issues with log transformation", Sys.time()))
318318
normCounts <- normCounts + 1
319-
print(sprintf("DEBUG: %s Printing head of: '%s' below", Sys.time(), 'normCounts'))
320-
print(head(normCounts), quote = TRUE)
321319
## output table 1 will be used to generate computer-readable DGE table,
322320
## which is used to create GeneLab visualization plots
323321
output_table <- tibble::rownames_to_column(normCounts, var = params$gene_id_type)
@@ -334,7 +332,7 @@ lrt_pvalues <- res_lrt@listData$padj
334332

335333
```{r wald-test-iteration}
336334
## Iterate through Wald Tests to generate pairwise comparisons of all groups
337-
compute_contrast <- function(i) {
335+
compute_contrast <- function(dds, i) {
338336
res <- results(
339337
dds,
340338
contrast = c("condition", contrasts[1, i], contrasts[2, i]),
@@ -349,8 +347,7 @@ compute_contrast <- function(i) {
349347
)
350348
return(res_df)
351349
}
352-
# Use bplapply to compute results in parallel
353-
res_list <- bplapply(1:dim(contrasts)[2], compute_contrast, BPPARAM = BPPARAM)
350+
# Run one Wald-test extraction per contrast column in parallel. seq_len() gives
# an empty index set when there are no contrasts, where 1:0 would visit c(1, 0).
res_list <- bplapply(seq_len(ncol(contrasts)), function(i) compute_contrast(dds, i), BPPARAM = BPPARAM)
354351
# Combine the list of data frames into a single data frame
355352
res_df <- do.call(cbind, res_list)
356353
# Combine with the existing output_table

RNAseq/Workflow_Documentation/NF_RCP/workflow_code/modules/dge_deseq2.nf

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -24,12 +24,12 @@ process DGE_DESEQ2 {
2424
script:
2525
def output_filename_suffix = params.output_suffix ?: ""
2626
def microbes = params.mode == 'microbes' ? 'TRUE' : 'FALSE'
27-
def dge_rmd_file = "${projectDir}/bin/deseq2_dge.Rmd"
28-
def debug_dummy_counts = params.use_dummy_gene_counts ? 'TRUE' : 'FALSE'
27+
def dge_rmd_file = "${projectDir}/bin/dge_deseq2.Rmd"
28+
def debug_dummy_counts = params.use_dummy_gene_counts ? 'TRUE' : 'FALSE'
2929

3030
"""
3131
Rscript -e "rmarkdown::render('${dge_rmd_file}',
32-
output_file = 'DESeq2_DGE.html',
32+
output_file = 'DGE_DESeq2.html',
3333
output_dir = '\${PWD}',
3434
params = list(
3535
cpus = ${task.cpus},

0 commit comments

Comments
 (0)