@@ -986,7 +986,6 @@ dpt-isa-to-runsheet --accession GLDS-### \
986
986
987
987
``` R
988
988
# ## Install and load required packages ###
989
-
990
989
if (! require(" BiocManager" , quietly = TRUE ))
991
990
install.packages(" BiocManager" )
992
991
@@ -1016,33 +1015,22 @@ library(DESeq2)
1016
1015
library(BiocParallel )
1017
1016
1018
1017
# ## Define which organism is used in the study - this should be consistent with the species name in the "species" column of the GL-DPPD-7110-A_annotations.csv file ###
1019
-
1020
1018
organism <- " organism_that_samples_were_derived_from"
1021
1019
1022
-
1023
1020
# ## Define the location of the input data and where the output data will be printed to ###
1024
-
1025
- runsheet_path = " /path/to/directory/containing/runsheet.csv/file" # # This is the runsheet created in Step 9a above
1021
+ runsheet_path = " /path/to/directory/containing/runsheet.csv/file"
1026
1022
work_dir = " /path/to/working/directory/where/script/is/executed/from"
1027
- counts_dir = " /path/to/directory/containing/FeatureCounts/counts/file"
1028
1023
norm_output = " /path/to/normalized/counts/output/directory"
1029
1024
DGE_output = " /path/to/DGE/output/directory"
1030
1025
1031
-
1032
1026
# ## Pull in the GeneLab annotation table (GL-DPPD-7110-A_annotations.csv) file ###
1033
-
1034
1027
org_table_link <- " https://raw.githubusercontent.com/nasa/GeneLab_Data_Processing/master/GeneLab_Reference_Annotations/Pipeline_GL-DPPD-7110_Versions/GL-DPPD-7110-A/GL-DPPD-7110-A_annotations.csv"
1035
-
1036
1028
org_table <- read.table(org_table_link , sep = " ," , header = TRUE )
1037
1029
1038
-
1039
1030
# ## Define the link to the GeneLab annotation table for the organism of interest ###
1040
-
1041
1031
annotations_link <- org_table [org_table $ species == organism , " genelab_annots_link" ]
1042
1032
1043
-
1044
1033
# ## Set your working directory to the directory where you will execute your DESeq2 script from ###
1045
-
1046
1034
setwd(file.path(work_dir ))
1047
1035
1048
1036
```
@@ -1064,7 +1052,6 @@ setwd(file.path(work_dir))
1064
1052
1065
1053
``` R
1066
1054
# ## Pull all factors for each sample in the study from the runsheet created in Step 9a ###
1067
-
1068
1055
compare_csv_from_runsheet <- function (runsheet_path ) {
1069
1056
df <- read.csv(runsheet_path )
1070
1057
factors <- df %> %
@@ -1076,20 +1063,14 @@ compare_csv_from_runsheet <- function(runsheet_path) {
1076
1063
return (result )
1077
1064
}
1078
1065
1079
-
1080
1066
# ## Load metadata from runsheet csv file ###
1081
-
1082
1067
compare_csv <- compare_csv_from_runsheet(runsheet_path )
1083
1068
1084
-
1085
1069
# ## Create data frame containing all samples and respective factors ###
1086
-
1087
1070
study <- compare_csv [, - 1 , drop = FALSE ] # Exclude sample_id
1088
1071
rownames(study ) <- compare_csv $ sample_id
1089
1072
1090
-
1091
1073
# ## Format groups and indicate the group that each sample belongs to ###
1092
-
1093
1074
group <- if (ncol(study ) > = 2 ) {
1094
1075
apply(study , 1 , paste , collapse = " & " )
1095
1076
} else {
@@ -1100,16 +1081,13 @@ group <- sub("^BLOCKER_", "", make.names(paste0("BLOCKER_", group))) # group nam
1100
1081
names(group ) <- group_names
1101
1082
rm(group_names )
1102
1083
1103
-
1104
1084
# ## Format contrasts table, defining pairwise comparisons for all groups ###
1105
-
1106
1085
contrast.names <- combn(levels(factor (names(group ))),2 ) # # generate matrix of pairwise group combinations for comparison
1107
1086
contrasts <- apply(contrast.names , MARGIN = 2 , function (col ) sub(" ^BLOCKER_" , " " , make.names(paste0(" BLOCKER_" , stringr :: str_sub(col , 2 , - 2 ))))) # limited make.names call for each group (also removes leading parentheses)
1108
1087
contrast.names <- c(paste(contrast.names [1 ,],contrast.names [2 ,],sep = " v" ),paste(contrast.names [2 ,],contrast.names [1 ,],sep = " v" )) # # format combinations for output table files names
1109
1088
contrasts <- cbind(contrasts ,contrasts [c(2 ,1 ),])
1110
1089
colnames(contrasts ) <- contrast.names
1111
1090
rm(contrast.names )
1112
-
1113
1091
```
1114
1092
1115
1093
** Input Data:**
@@ -1128,32 +1106,18 @@ rm(contrast.names)
1128
1106
1129
1107
``` R
1130
1108
# ## Import FeatureCounts data ###
1131
- counts_file <- " /path/to/FeatureCounts_GLbulkRNAseq.tsv"
1132
-
1133
- # Load featureCounts data
1134
- featurecounts_data <- read.csv(file = counts_file ,
1135
- header = TRUE ,
1136
- sep = " \t " ,
1137
- skip = 1 ,
1138
- stringsAsFactors = FALSE ,
1139
- check.names = FALSE )
1140
-
1141
- # Identify metadata columns and sample columns
1142
- metadata_cols <- c(" Geneid" , " Chr" , " Start" , " End" , " Strand" , " Length" )
1143
- sample_cols <- setdiff(colnames(featurecounts_data ), metadata_cols )
1144
-
1145
- # Remove the ".bam" suffix from sample columns
1146
- sample_cols <- sub(" \\ .bam$" , " " , sample_cols )
1147
-
1148
- # Reorder sample columns to match the sample order in the study
1149
- samples <- rownames(study )
1150
- sample_col_indices <- match(samples , sample_cols )
1151
-
1152
- # Create counts matrix
1153
- counts <- featurecounts_data [, sample_col_indices , drop = FALSE ]
1154
- counts <- as.data.frame(lapply(counts , as.numeric ))
1155
- colnames(counts ) <- samples
1156
- rownames(counts ) <- featurecounts_data $ Geneid
1109
+ input_counts <- " /path/to/FeatureCounts_GLbulkRNAseq.tsv"
1110
+
1111
+ # ## Load featureCounts data ###
1112
+ featurecounts <- read.csv(input_counts, header = TRUE, sep = "\t", skip = 1, check.names = FALSE) # read from the input_counts path defined above; check.names = FALSE keeps sample column names unmangled so they can be matched to rownames(study)
1113
+
1114
+ # ## Create counts matrix: remove metadata columns from featurecounts table, remove bam file extension from column names ###
1115
+ row.names(featurecounts ) <- gsub(" -" , " ." , featurecounts $ Geneid )
1116
+ counts <- featurecounts [,- c(1 : 6 )]
1117
+ colnames(counts ) <- gsub(" \\ .bam$" , " " , colnames(counts ))
1118
+
1119
+ # ## Reorder counts columns to match runsheet ###
1120
+ counts <- counts [, rownames(study )]
1157
1121
```
1158
1122
1159
1123
** Input Data:**
@@ -1224,10 +1188,10 @@ handle_technical_replicates <- function(sampleTable) {
1224
1188
return (new_sampleTable )
1225
1189
}
1226
1190
1227
- # Apply the technical replicate handling
1191
+ # ## Apply the technical replicate handling to the sample table ###
1228
1192
sampleTable <- handle_technical_replicates(sampleTable )
1229
1193
1230
- # Update the counts matrix to match the new sample table
1194
+ # ## Remove columns from the counts matrix to match the sample table if necessary ###
1231
1195
counts <- counts [, rownames(sampleTable )]
1232
1196
1233
1197
# ## Build dds object ###
0 commit comments