Skip to content

Commit 7c1195d

Browse files
authored
initial commit
1 parent a5d29db commit 7c1195d

12 files changed

+1431
-0
lines changed

OIC_meta.Rproj

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
Version: 1.0

RestoreWorkspace: Default
SaveWorkspace: No
AlwaysSaveHistory: No

EnableCodeIndexing: Yes
UseSpacesForTab: Yes
NumSpacesForTab: 2
Encoding: UTF-8

RnwWeave: Sweave
LaTeX: pdfLaTeX

R/Flowchart.R

Lines changed: 141 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,141 @@
# Meta-analysis of the rodent object-in-context task: Get flowchart numbers
# Written by Milou Sep
#
# Downloads the search-hit, screening, and data-extraction documents from OSF,
# compares the PMID lists of the two literature searches, and reproduces the
# counts reported in the PRISMA flowchart. The expected count for each
# expression is noted in an inline comment so the script doubles as a check.

library(dplyr)
library(osfr) # to interact with the Open Science Framework
# instructions: https://github.com/CenterForOpenScience/osfr
# Authenticate to OSF via osf_auth("PAT") in the console
# (see http://centerforopenscience.github.io/osfr/articles/auth.html;
# a PAT can be generated on OSF).
library(readxl)

# retrieve OSF files ------------------------------------------------------
# search hits
osf_retrieve_file("dvwhn") %>% osf_download(path = "data", conflicts = "overwrite") # search hits, search string thesis MV
osf_retrieve_file("umz5k") %>% osf_download(path = "data", conflicts = "overwrite") # search hits, new search string (25.5.20)
# screening
osf_retrieve_file("j7sfm") %>% osf_download(path = "data", conflicts = "overwrite") # screening, search string thesis MV
osf_retrieve_file("bz3sj") %>% osf_download(path = "data", conflicts = "overwrite") # screening, new search string
# data extraction file
osf_retrieve_file("qadch") %>% osf_download(path = "data", conflicts = "overwrite")

# Load data ---------------------------------------------------------------
# search documents (PMIDs)
search1.thesis <- read.table("data/hits.search.thesis.MV.txt", quote = "\"", comment.char = "")
search2.meta <- read.table("data/hits.new.search.meta.oic.v25.5.20.txt", quote = "\"", comment.char = "")
# screening documents (semicolon-separated; empty cells read as NA)
screening.search1 <- read.csv2("data/Screening S1 thesis search PMIDs.csv", na.strings = c(""))
screening.search2 <- read.csv2("data/Screening S2 new.in.new.search.PMIDs.csv", na.strings = c(""))
# data extraction file
data <- read_excel("data/280121_Data_Extraction_RoB.xlsx", sheet = "Extraction_converted", na = c("N/A", "#VALUE!"))

# Compare PMIDs -----------------------------------------------------------
not.in.new <- search1.thesis$V1[!search1.thesis$V1 %in% search2.meta$V1] # only in the thesis search
new.in.new <- search2.meta$V1[!search2.meta$V1 %in% search1.thesis$V1]   # only in the new search
old.in.new <- search2.meta$V1[search2.meta$V1 %in% search1.thesis$V1]    # in both searches

# Count hits and screened papers
nrow(search2.meta)      # 253
nrow(search1.thesis)    # 54
nrow(screening.search1) # 54
length(new.in.new)      # 219
nrow(screening.search2) # 219 (so correct)
length(not.in.new)      # 20
length(old.in.new)      # 34
# new in new + old in new
219 + 34 # = 253
# not in new + old in new
20 + 34 # = 54

# Counts for Flowchart ----------------------------------------------------
# Total
screening.search1 %>% filter(PMID %in% old.in.new) %>% nrow() # 34
screening.search2 %>% filter(PMID %in% new.in.new) %>% nrow() # 219
34 + 219 # = 253

# Counts search 1 (thesis) ------------------------------------------------
# str(screening.search1)

screening.search1 %>%
  filter(PMID %in% old.in.new) %>%
  filter(inclusie_july2020 == "yes") %>% nrow() # 30 included

screening.search1 %>%
  filter(PMID %in% old.in.new) %>%
  filter(full.text.checked.july2020 == "no") %>% # nrow() # 26
  filter(inclusie_july2020 == "yes") %>% nrow() # 26 included without full text check

screening.search1 %>%
  filter(PMID %in% old.in.new) %>%
  filter(full.text.checked.july2020 == "yes") %>% # nrow() # 8 full text checked
  filter(inclusie_july2020 == "yes") # of that 4 included (so 4 excluded)

screening.search1 %>%
  filter(PMID %in% old.in.new) %>%
  filter(inclusie_july2020 == "no?" | inclusie_july2020 == "no") # 4 excluded (reason for all: no OIC)

# Counts search 2 (new) ---------------------------------------------------
# str(screening.search2)

# total
screening.search2 %>% filter(PMID %in% new.in.new) %>%
  filter(final.inclusion.screening == "Yes") %>% nrow() # 10 included

# included without full text check
screening.search2 %>% filter(PMID %in% new.in.new) %>% # View()
  filter(full.text.checked.MS == "no") %>% # nrow() # 140 full text NOT checked
  filter(final.inclusion.screening == "Yes") # of that 10 included

# excluded without full text check in s2 (+ reasons for exclusion)
screening.search2 %>% filter(PMID %in% new.in.new) %>% # View()
  filter(full.text.checked.MS == "no") %>%
  filter(final.inclusion.screening == "No") %>% # nrow() # 130
  mutate(Reason.for.exclusion = factor(Reason.for.exclusion)) %>%
  select(Reason.for.exclusion) %>% table()

# included after full text check
screening.search2 %>% filter(PMID %in% new.in.new) %>%
  filter(full.text.checked.MS == "yes" | is.na(full.text.checked.MS)) %>% # nrow() # 79 full text checked
  filter(final.inclusion.screening == "Yes") # of that 0 included

# excluded after full text check in s2 (+ reason for exclusion)
screening.search2 %>% filter(PMID %in% new.in.new) %>%
  filter(full.text.checked.MS == "yes" | is.na(full.text.checked.MS)) %>%
  filter(final.inclusion.screening == "No") %>% # nrow() # 79 excluded
  mutate(Reason.for.exclusion = factor(Reason.for.exclusion)) %>%
  select(Reason.for.exclusion) %>% table()

# Unique inclusions thesis search + new search ----------------------------
30 + 10 # total 40 inclusions following screening

# Compare to numbers in data extraction file ------------------------------
data %>% distinct(PMID) %>% nrow() # 41 studies

# Difference inclusions vs data extraction: snowballing -------------------
inclusions.S1 <- screening.search1 %>% filter(inclusie_july2020 == "yes") %>% select(PMID)
inclusions.S2 <- screening.search2 %>% filter(final.inclusion.screening == "Yes") %>% select(PMID)
inclusions <- c(inclusions.S1, inclusions.S2) %>% unlist()

data %>% filter(!PMID %in% inclusions) # 1 identified via snowballing

# To check: all inclusions from the screening files are in the dataset
inclusions[!inclusions %in% data$PMID]

# Total inclusions --------------------------------------------------------
# Of these 41 studies, 4 studies are included in the systematic review but
# excluded from the meta-analyses due to missing data.
# Also see the prepare_data.rmd script.

# save file local ---------------------------------------------------------
# Note these files are also on OSF (https://osf.io/d9eu4/)
write.table(not.in.new, file = "processed_data/not.in.new.search.csv", sep = ';', row.names = FALSE)
write.table(new.in.new, file = "processed_data/new.in.new.search.csv", sep = ';', row.names = FALSE)
write.table(old.in.new, file = "processed_data/old.in.new.search.csv", sep = ';', row.names = FALSE)

R/MetaForest.Rmd

Lines changed: 148 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,148 @@
1+
---
2+
title: "Meta-analysis of the rodent object-in-context task: Random forest-based Meta-analysis (MetaForest)"
3+
author: "Milou Sep"
4+
date: "`r format(Sys.time(), '%d %B, %Y')`"
5+
output: html_document
6+
---
7+
8+
# Prepare environment
9+
```{r setup, include=FALSE}
10+
rm(list = ls()) #clean environment
11+
# packages
12+
library(dplyr)
13+
library(caret) #for cross-validation
14+
library(metaforest) #for exploratory analysis
15+
# to check which packages are loaded
16+
# (.packages())
17+
```
18+
19+
# Metaforest
20+
https://www.rdocumentation.org/packages/metaforest/versions/0.1.3
21+
22+
## Data preparation
23+
```{r metaforest preparation}
24+
readRDS("processed_data/data_with_effect_size.RDS") ->data6
25+
str(data6)
26+
27+
# create dataset for MetaForest
28+
data6 %>%
29+
select( -c(PMID, EXP_group,
30+
SmellCntrl, # only 1 level, so non-informative
31+
nC,
32+
# other outcome variables
33+
DR, DR_A, DR_B, SEM, SEM_DR_A, SEM_DR_B, SD, label,
34+
no.Affiliation, # captures mostly the same variation as `PMID` variable (no.Affiliation: 29 levels, PMID: 37 levels)
35+
Program # captures mostly the same variation as in the 'scoring' variable.
36+
)) %>%
37+
droplevels() -> datForest
38+
is.na(datForest) %>% any()
39+
```
40+
41+
## Save data for WS follow-up plots
42+
```{r save data WS plots}
43+
data6 %>% select(nC, ID, PMID)->sample.sizes
44+
full_join(sample.sizes, datForest, by="ID")->datForest_for_WS_Plot
45+
saveRDS(datForest_for_WS_Plot, "processed_data/datForest_for_WS_Plot.RDS")
46+
```
47+
48+
## MetaForest: Tuning
49+
Note, tuning assumes that convergence is reached. To set-up metaforest, also see: https://www.rdocumentation.org/packages/metaforest/versions/0.1.3/topics/ModelInfo_mf
50+
```{r metaforest tuning}
51+
set.seed(94372)
52+
ModelInfo_mf()$notes # information on method from MetaForest
53+
# Set up 9-fold grouped cross-validation
54+
fit_control <- trainControl(method = "cv", number = 9, index = groupKFold(datForest$ID, k = 9))
55+
# Set up a custom tuning grid for the three tuning parameters of MetaForest
56+
rf_grid <- expand.grid(whichweights = c("random", "fixed", "unif"),
57+
mtry = c(2, 4, 6),
58+
min.node.size = c(2, 4, 6))
59+
```
60+
61+
## MetaForest: Train Model
62+
Cross-validated clustered MetaForest
63+
```{r MetaForest training}
64+
set.seed(94372)
65+
# Train the model
66+
cv.mf.cluster <- train(y = datForest$yi, x = datForest[names(datForest) !="yi"],
67+
study = "ID",
68+
method = ModelInfo_mf(), #= the method from MetaForest. (https://rdrr.io/cran/metaforest/man/MetaForest.html)
69+
trControl = fit_control,
70+
tuneGrid = rf_grid,
71+
num.trees = 600,
72+
data=datForest)
73+
# Save the model
74+
saveRDS(cv.mf.cluster,file="processed_data/fitted.MetaForest.RDS")
75+
```
76+
77+
To load saved model directly
78+
```{r eval=FALSE, include=FALSE}
79+
readRDS("processed_data/fitted.MetaForest.RDS")->cv.mf.cluster
80+
```
81+
82+
## MetaForest: Inspect model
83+
84+
### Tuning parameters
85+
#### Show model: The final values used for the model were whichweights = unif, mtry = 4 and min.node.size = 2
86+
```{r inspect model 1}
87+
cv.mf.cluster
88+
```
89+
#### Plot grid parameter models
90+
```{r inspect model 2}
91+
plot(cv.mf.cluster)
92+
```
93+
94+
### Model Summary
95+
```{r inspect model 3}
96+
summary(cv.mf.cluster)
97+
```
98+
99+
### R(oob)
100+
https://towardsdatascience.com/what-is-out-of-bag-oob-score-in-random-forest-a7fa23d710
101+
```{r inspect model 4}
102+
cv.mf.cluster$finalModel
103+
```
104+
105+
### Cross validated R2 with SD
106+
details for the "best" model: unif, mtry = 4 and min.node.size = 2
107+
```{r inspect model 5}
108+
cv.mf.cluster$results[which(cv.mf.cluster$results$whichweights == "unif" &
109+
cv.mf.cluster$results$mtry == 4 &
110+
cv.mf.cluster$results$min.node.size == 2),]
111+
```
112+
113+
### Convergence plot
114+
Good model convergence
115+
```{r convergence plot}
116+
jpeg(file="results/metaforest_convergencePlot.jpeg", height = 3000, width = 3000, res=500)
117+
plot(cv.mf.cluster$finalModel)
118+
dev.off()
119+
```
120+
121+
## Variable importance (based on metaForest)
122+
```{r variable importance plot}
123+
tiff(file="results/metaforest_varImportance.tiff", height=1500, width=1500, pointsize=8, res=300)
124+
VarImpPlot(cv.mf.cluster$finalModel)+
125+
geom_hline(yintercept=11.5) +
126+
annotate(geom="text", x=0.0017, y=12.5, label="Top 50%", color="darkblue")
127+
dev.off()
128+
```
129+
130+
## Export important variables
131+
```{r metaforest varimportance scores}
132+
varImp(cv.mf.cluster) ->imp.scores
133+
134+
# Export variable importance scores (ordered) to csv
135+
imp <- as.data.frame(imp.scores$importance)
136+
imp <- data.frame(overall = imp$Overall,
137+
names = rownames(imp))
138+
write.csv2(imp[order(imp$overall,decreasing = T),],"results/important_variables_metaforest.csv")
139+
140+
# Select top 50% most important variables
141+
imp[order(imp$overall,decreasing = T),] %>%
142+
slice_head(.,n=nrow(.) / 2) -> important.vars
143+
important.vars$names ->variable
144+
variable
145+
146+
# Save top 50% most important variables
147+
saveRDS(variable,"processed_data/important_variables.RDS")
148+
```

0 commit comments

Comments
 (0)