-
Notifications
You must be signed in to change notification settings - Fork 5
Description
I have used the new purrr to parallelise fitting of TPCs, but have also changed fundamentally how multiple models are fit. Instead of fitting each model using a separate map call, we write a custom function of all the models we want to fit that we then pass to purrr. This makes it much easier to play ball with how purrr wants to parallelise. And I think should be the default way we try and fit going forwards. Maybe having an argument for models to fit and then returning a dataframe with them all. It may of course allow people to think less, but this seems to be the best way going forwards and avoids code repeats. Probably quite similar to #77.
Proof of principle code is here. It goes from 48s to 14s, a pretty massive speed up when you have LOADS of curves. This approach could replace all of the vignettes we have, at least the model fitting ones.
Open to thoughts @fwimp
# if librarian is not installed, install it
if (!requireNamespace("librarian", quietly = TRUE)) {
install.packages("librarian")
}
# load packages
librarian::shelf(
tidyverse,
rTPC,
nls.multstart,
progressr,
microbenchmark,
mirai
)
## ---------------------------
# load in data
data("chlorella_tpc")
d <- chlorella_tpc
# just some of the models, edit as wanted
d2 <- filter(d, curve_id <= 60)
# ever more complex uses of parallelisation of purrr and usign rtpc
#----------------------------------------------#
# 1. just fit a single model to the dataset ####
#----------------------------------------------#
# write a function to fit a single model
# fit single rTPC
fit_gaussian <- function(d, ...) {
# get start values and fit model
start_vals <- rTPC::get_start_vals(
d$temp,
d$rate,
model_name = 'gaussian_1987'
)
lower <- rTPC::get_lower_lims(d$temp, d$rate, model_name = 'gaussian_1987')
upper <- rTPC::get_upper_lims(d$temp, d$rate, model_name = 'gaussian_1987')
# fit model
mod <- nls.multstart::nls_multstart(
rate ~ rTPC::gaussian_1987(temp = temp, rmax, topt, a),
data = d,
iter = c(4, 4, 4),
start_lower = start_vals - 10,
start_upper = start_vals + 10,
lower = rTPC::get_lower_lims(d$temp, d$rate, model_name = 'gaussian_1987'),
upper = rTPC::get_upper_lims(d$temp, d$rate, model_name = 'gaussian_1987'),
supp_errors = 'Y',
convergence_count = FALSE
)
return(mod)
}
# fit gaussian without parallelisation
start_normal <- Sys.time()
d_mods <- d2 %>%
nest(data = c(temp, rate)) %>%
mutate(
mods = map(
data,
~ fit_gaussian(d = .x)
),
.progress = TRUE
)
end_normal <- Sys.time()
print(end_normal - start_normal)
# create a parallel plan
start_time_parallel <- Sys.time()
# try parallelisation
daemons(3)
d_mods1 <- d2 %>%
nest(data = c(temp, rate)) %>%
mutate(
mods = map(
data,
in_parallel(
\(x) fit_gaussian(d = x),
fit_gaussian = fit_gaussian
),
.progress = TRUE
)
)
daemons(0)
end_time_parallel <- Sys.time()
# print time taken
print(end_time_parallel - start_time_parallel)
#------------------------------------------#
# 2. fit multiple models to the dataset ####
#------------------------------------------#
# instead of running multiple map calls within a single mutate call, we can include a single model fitting function to fit the models we want
# fit single rTPC
fit_TPCs <- function(d, ...) {
# get start values and fit model
start_vals <- rTPC::get_start_vals(
d$temp,
d$rate,
model_name = 'gaussian_1987'
)
# fit model
mod <- nls.multstart::nls_multstart(
rate ~ rTPC::gaussian_1987(temp = temp, rmax, topt, a),
data = d,
iter = c(4, 4, 4),
start_lower = start_vals - 10,
start_upper = start_vals + 10,
lower = rTPC::get_lower_lims(d$temp, d$rate, model_name = 'gaussian_1987'),
upper = rTPC::get_upper_lims(d$temp, d$rate, model_name = 'gaussian_1987'),
supp_errors = 'Y',
convergence_count = FALSE,
...
)
# get start values and fit model
start_vals <- rTPC::get_start_vals(
d$temp,
d$rate,
model_name = 'boatman_2017'
)
# fit model
mod1 <- nls.multstart::nls_multstart(
rate ~ rTPC::boatman_2017(temp = temp, rmax, tmin, tmax, a, b),
data = d,
iter = c(4, 4, 4, 4, 4),
start_lower = start_vals - 10,
start_upper = start_vals + 10,
lower = rTPC::get_lower_lims(d$temp, d$rate, model_name = 'boatman_2017'),
upper = rTPC::get_upper_lims(d$temp, d$rate, model_name = 'boatman_2017'),
supp_errors = 'Y',
convergence_count = FALSE,
...
)
# get start values and fit model
start_vals <- rTPC::get_start_vals(
d$temp,
d$rate,
model_name = 'sharpeschoolhigh_1981'
)
# fit model
mod2 <- nls.multstart::nls_multstart(
rate ~
rTPC::sharpeschoolhigh_1981(temp = temp, r_tref, e, eh, th, tref = 20),
data = d,
iter = c(3, 3, 3, 3),
start_lower = start_vals - 10,
start_upper = start_vals + 10,
lower = rTPC::get_lower_lims(
d$temp,
d$rate,
model_name = 'sharpeschoolhigh_1981'
),
upper = rTPC::get_upper_lims(
d$temp,
d$rate,
model_name = 'sharpeschoolhigh_1981'
),
supp_errors = 'Y',
convergence_count = FALSE,
...
)
return(tibble::tibble(
gaussian = list(mod),
boatman = list(mod1),
sharpeschoolhigh = list(mod2)
))
}
# fit all models without parallelisation
start_normal <- Sys.time()
d_mods <- d2 %>%
nest(data = c(temp, rate)) %>%
mutate(
mods = map(
data,
~ fit_TPCs(d = .x),
.progress = TRUE
)
) %>%
unnest(mods)
end_normal <- Sys.time()
print(end_normal - start_normal)
# create a parallel plan
start_time_parallel <- Sys.time()
# try parallelisation
daemons(6)
d_mods1 <- d2 %>%
nest(data = c(temp, rate)) %>%
mutate(
mods = map(
data,
in_parallel(
\(x) fit_TPCs(d = x),
fit_TPCs = fit_TPCs
),
.progress = TRUE
)
) %>%
unnest(mods)
daemons(0)
end_time_parallel <- Sys.time()
print(end_time_parallel - start_time_parallel)