Skip to content

Commit 7ecd205

Browse files
committed
Clarify DGE code
1 parent e6aa936 commit 7ecd205

File tree

4 files changed

+70
-138
lines changed

4 files changed

+70
-138
lines changed

RNAseq/Pipeline_GL-DPPD-7101_Versions/GL-DPPD-7101-G.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1192,7 +1192,7 @@ organism <- "organism_that_samples_were_derived_from"
11921192

11931193
runsheet_path="/path/to/directory/containing/runsheet.csv/file" ## This is the runsheet created in Step 9a above
11941194
work_dir="/path/to/working/directory/where/script/is/executed/from"
1195-
counts_dir="/path/to/directory/containing/RSEM/counts/files"
1195+
input_counts="/path/to/directory/containing/RSEM/counts/files"
11961196
norm_output="/path/to/normalized/counts/output/directory"
11971197
DGE_output="/path/to/DGE/output/directory"
11981198

@@ -1297,7 +1297,7 @@ rm(contrast.names)
12971297
```R
12981298
### Import RSEM gene count data ###
12991299
files <- list.files(
1300-
path = counts_dir,
1300+
path = input_counts,
13011301
pattern = ".genes.results",
13021302
full.names = TRUE
13031303
)

RNAseq/Pipeline_GL-DPPD-7115_Versions/GL-DPPD-7115.md

Lines changed: 15 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -986,7 +986,6 @@ dpt-isa-to-runsheet --accession GLDS-### \
986986

987987
```R
988988
### Install and load required packages ###
989-
990989
if (!require("BiocManager", quietly = TRUE))
991990
install.packages("BiocManager")
992991

@@ -1016,33 +1015,22 @@ library(DESeq2)
10161015
library(BiocParallel)
10171016

10181017
### Define which organism is used in the study - this should be consistent with the species name in the "species" column of the GL-DPPD-7110-A_annotations.csv file ###
1019-
10201018
organism <- "organism_that_samples_were_derived_from"
10211019

1022-
10231020
### Define the location of the input data and where the output data will be printed to ###
1024-
1025-
runsheet_path="/path/to/directory/containing/runsheet.csv/file" ## This is the runsheet created in Step 9a above
1021+
runsheet_path="/path/to/directory/containing/runsheet.csv/file"
10261022
work_dir="/path/to/working/directory/where/script/is/executed/from"
1027-
counts_dir="/path/to/directory/containing/FeatureCounts/counts/file"
10281023
norm_output="/path/to/normalized/counts/output/directory"
10291024
DGE_output="/path/to/DGE/output/directory"
10301025

1031-
10321026
### Pull in the GeneLab annotation table (GL-DPPD-7110-A_annotations.csv) file ###
1033-
10341027
org_table_link <- "https://raw.githubusercontent.com/nasa/GeneLab_Data_Processing/master/GeneLab_Reference_Annotations/Pipeline_GL-DPPD-7110_Versions/GL-DPPD-7110-A/GL-DPPD-7110-A_annotations.csv"
1035-
10361028
org_table <- read.table(org_table_link, sep = ",", header = TRUE)
10371029

1038-
10391030
### Define the link to the GeneLab annotation table for the organism of interest ###
1040-
10411031
annotations_link <- org_table[org_table$species == organism, "genelab_annots_link"]
10421032

1043-
10441033
### Set your working directory to the directory where you will execute your DESeq2 script from ###
1045-
10461034
setwd(file.path(work_dir))
10471035

10481036
```
@@ -1064,7 +1052,6 @@ setwd(file.path(work_dir))
10641052

10651053
```R
10661054
### Pull all factors for each sample in the study from the runsheet created in Step 9a ###
1067-
10681055
compare_csv_from_runsheet <- function(runsheet_path) {
10691056
df <- read.csv(runsheet_path)
10701057
factors <- df %>%
@@ -1076,20 +1063,14 @@ compare_csv_from_runsheet <- function(runsheet_path) {
10761063
return(result)
10771064
}
10781065

1079-
10801066
### Load metadata from runsheet csv file ###
1081-
10821067
compare_csv <- compare_csv_from_runsheet(runsheet_path)
10831068

1084-
10851069
### Create data frame containing all samples and respective factors ###
1086-
10871070
study <- compare_csv[, -1, drop=FALSE] # Exclude sample_id
10881071
rownames(study) <- compare_csv$sample_id
10891072

1090-
10911073
### Format groups and indicate the group that each sample belongs to ###
1092-
10931074
group <- if (ncol(study) >= 2) {
10941075
apply(study, 1, paste, collapse = " & ")
10951076
} else {
@@ -1100,16 +1081,13 @@ group <- sub("^BLOCKER_", "", make.names(paste0("BLOCKER_", group))) # group nam
11001081
names(group) <- group_names
11011082
rm(group_names)
11021083

1103-
11041084
### Format contrasts table, defining pairwise comparisons for all groups ###
1105-
11061085
contrast.names <- combn(levels(factor(names(group))),2) ## generate matrix of pairwise group combinations for comparison
11071086
contrasts <- apply(contrast.names, MARGIN=2, function(col) sub("^BLOCKER_", "", make.names(paste0("BLOCKER_", stringr::str_sub(col, 2, -2))))) # limited make.names call for each group (also removes leading parentheses)
11081087
contrast.names <- c(paste(contrast.names[1,],contrast.names[2,],sep = "v"),paste(contrast.names[2,],contrast.names[1,],sep = "v")) ## format combinations for output table files names
11091088
contrasts <- cbind(contrasts,contrasts[c(2,1),])
11101089
colnames(contrasts) <- contrast.names
11111090
rm(contrast.names)
1112-
11131091
```
11141092

11151093
**Input Data:**
@@ -1128,32 +1106,18 @@ rm(contrast.names)
11281106

11291107
```R
11301108
### Import FeatureCounts data ###
1131-
counts_file <- "/path/to/FeatureCounts_GLbulkRNAseq.tsv"
1132-
1133-
# Load featureCounts data
1134-
featurecounts_data <- read.csv(file = counts_file,
1135-
header = TRUE,
1136-
sep = "\t",
1137-
skip = 1,
1138-
stringsAsFactors = FALSE,
1139-
check.names = FALSE)
1140-
1141-
# Identify metadata columns and sample columns
1142-
metadata_cols <- c("Geneid", "Chr", "Start", "End", "Strand", "Length")
1143-
sample_cols <- setdiff(colnames(featurecounts_data), metadata_cols)
1144-
1145-
# Remove the ".bam" suffix from sample columns
1146-
sample_cols <- sub("\\.bam$", "", sample_cols)
1147-
1148-
# Reorder sample columns to match the sample order in the study
1149-
samples <- rownames(study)
1150-
sample_col_indices <- match(samples, sample_cols)
1151-
1152-
# Create counts matrix
1153-
counts <- featurecounts_data[, sample_col_indices, drop = FALSE]
1154-
counts <- as.data.frame(lapply(counts, as.numeric))
1155-
colnames(counts) <- samples
1156-
rownames(counts) <- featurecounts_data$Geneid
1109+
input_counts <- "/path/to/FeatureCounts_GLbulkRNAseq.tsv"
1110+
1111+
### Load featureCounts data ###
1112+
featurecounts <- read.csv(params$input_counts, header = TRUE, sep = "\t", skip = 1)
1113+
1114+
### Create counts matrix: remove metadata columns from featurecounts table, remove bam file extension from column names ###
1115+
row.names(featurecounts) <- gsub("-", ".", featurecounts$Geneid)
1116+
counts <- featurecounts[,-c(1:6)]
1117+
colnames(counts) <- gsub("\\.bam$", "", colnames(counts))
1118+
1119+
### Reorder counts columns to match runsheet ###
1120+
counts <- counts[, rownames(study)]
11571121
```
11581122

11591123
**Input Data:**
@@ -1224,10 +1188,10 @@ handle_technical_replicates <- function(sampleTable) {
12241188
return(new_sampleTable)
12251189
}
12261190

1227-
# Apply the technical replicate handling
1191+
### Apply the technical replicate handling to the sample table ###
12281192
sampleTable <- handle_technical_replicates(sampleTable)
12291193

1230-
# Update the counts matrix to match the new sample table
1194+
### Remove columns from the counts matrix to match the sample table if necessary ###
12311195
counts <- counts[, rownames(sampleTable)]
12321196

12331197
### Build dds object ###

0 commit comments

Comments
 (0)