Parallelising rTPC

I have used the new **purrr** to parallelise fitting of TPCs, but have also changed fundamentally how multiple models are fit. Instead of fitting each model using a separate **map** call, we write a custom function of all the models we want to fit that we then pass to **purrr**. This makes it much easier to play ball with how **purrr** wants to parallelise. And I think should be the default way we try and fit going forwards. Maybe having an argument for models to fit and then returning a dataframe with them all. It may of course allow people to think less, but this seems to be the best way going forwards and avoids code repeats. Probably quite similar to https://github.com/padpadpadpad/rTPC/issues/77.

Proof of principle code is here. It goes from 48s to 14s, a pretty massive speed up when you have LOADS of curves. This approach could replace all of the vignettes we have, at least the model fitting ones.

Open to thoughts @fwimp 

```r
# if librarian is not installed, install it
if (!requireNamespace("librarian", quietly = TRUE)) {
  install.packages("librarian")
}
# load packages
librarian::shelf(
  tidyverse,
  rTPC,
  nls.multstart,
  progressr,
  microbenchmark,
  mirai
)

## ---------------------------

# load in data
data("chlorella_tpc")

d <- chlorella_tpc

# just some of the models, edit as wanted
d2 <- filter(d, curve_id <= 60)

# ever more complex uses of parallelisation of purrr and usign rtpc

#----------------------------------------------#
# 1. just fit a single model to the dataset ####
#----------------------------------------------#

# write a function to fit a single model

# fit single rTPC
fit_gaussian <- function(d, ...) {
  # get start values and fit model
  start_vals <- rTPC::get_start_vals(
    d$temp,
    d$rate,
    model_name = 'gaussian_1987'
  )
  lower <- rTPC::get_lower_lims(d$temp, d$rate, model_name = 'gaussian_1987')
  upper <- rTPC::get_upper_lims(d$temp, d$rate, model_name = 'gaussian_1987')

  # fit model
  mod <- nls.multstart::nls_multstart(
    rate ~ rTPC::gaussian_1987(temp = temp, rmax, topt, a),
    data = d,
    iter = c(4, 4, 4),
    start_lower = start_vals - 10,
    start_upper = start_vals + 10,
    lower = rTPC::get_lower_lims(d$temp, d$rate, model_name = 'gaussian_1987'),
    upper = rTPC::get_upper_lims(d$temp, d$rate, model_name = 'gaussian_1987'),
    supp_errors = 'Y',
    convergence_count = FALSE
  )

  return(mod)
}

# fit gaussian without parallelisation
start_normal <- Sys.time()

d_mods <- d2 %>%
  nest(data = c(temp, rate)) %>%
  mutate(
    mods = map(
      data,
      ~ fit_gaussian(d = .x)
    ),
    .progress = TRUE
  )

end_normal <- Sys.time()

print(end_normal - start_normal)

# create a parallel plan
start_time_parallel <- Sys.time()

# try parallelisation
daemons(3)

d_mods1 <- d2 %>%
  nest(data = c(temp, rate)) %>%
  mutate(
    mods = map(
      data,
      in_parallel(
        \(x) fit_gaussian(d = x),
        fit_gaussian = fit_gaussian
      ),
      .progress = TRUE
    )
  )


daemons(0)

end_time_parallel <- Sys.time()

# print time taken
print(end_time_parallel - start_time_parallel)

#------------------------------------------#
# 2. fit multiple models to the dataset ####
#------------------------------------------#

# instead of running multiple map calls within a single mutate call, we can include a single model fitting function to fit the models we want

# fit single rTPC
fit_TPCs <- function(d, ...) {
  # get start values and fit model
  start_vals <- rTPC::get_start_vals(
    d$temp,
    d$rate,
    model_name = 'gaussian_1987'
  )

  # fit model
  mod <- nls.multstart::nls_multstart(
    rate ~ rTPC::gaussian_1987(temp = temp, rmax, topt, a),
    data = d,
    iter = c(4, 4, 4),
    start_lower = start_vals - 10,
    start_upper = start_vals + 10,
    lower = rTPC::get_lower_lims(d$temp, d$rate, model_name = 'gaussian_1987'),
    upper = rTPC::get_upper_lims(d$temp, d$rate, model_name = 'gaussian_1987'),
    supp_errors = 'Y',
    convergence_count = FALSE,
    ...
  )

  # get start values and fit model
  start_vals <- rTPC::get_start_vals(
    d$temp,
    d$rate,
    model_name = 'boatman_2017'
  )

  # fit model
  mod1 <- nls.multstart::nls_multstart(
    rate ~ rTPC::boatman_2017(temp = temp, rmax, tmin, tmax, a, b),
    data = d,
    iter = c(4, 4, 4, 4, 4),
    start_lower = start_vals - 10,
    start_upper = start_vals + 10,
    lower = rTPC::get_lower_lims(d$temp, d$rate, model_name = 'boatman_2017'),
    upper = rTPC::get_upper_lims(d$temp, d$rate, model_name = 'boatman_2017'),
    supp_errors = 'Y',
    convergence_count = FALSE,
    ...
  )

  # get start values and fit model
  start_vals <- rTPC::get_start_vals(
    d$temp,
    d$rate,
    model_name = 'sharpeschoolhigh_1981'
  )

  # fit model
  mod2 <- nls.multstart::nls_multstart(
    rate ~
      rTPC::sharpeschoolhigh_1981(temp = temp, r_tref, e, eh, th, tref = 20),
    data = d,
    iter = c(3, 3, 3, 3),
    start_lower = start_vals - 10,
    start_upper = start_vals + 10,
    lower = rTPC::get_lower_lims(
      d$temp,
      d$rate,
      model_name = 'sharpeschoolhigh_1981'
    ),
    upper = rTPC::get_upper_lims(
      d$temp,
      d$rate,
      model_name = 'sharpeschoolhigh_1981'
    ),
    supp_errors = 'Y',
    convergence_count = FALSE,
    ...
  )

  return(tibble::tibble(
    gaussian = list(mod),
    boatman = list(mod1),
    sharpeschoolhigh = list(mod2)
  ))
}

# fit all models without parallelisation
start_normal <- Sys.time()

d_mods <- d2 %>%
  nest(data = c(temp, rate)) %>%
  mutate(
    mods = map(
      data,
      ~ fit_TPCs(d = .x),
      .progress = TRUE
    )
  ) %>%
  unnest(mods)

end_normal <- Sys.time()

print(end_normal - start_normal)

# create a parallel plan
start_time_parallel <- Sys.time()

# try parallelisation
daemons(6)

d_mods1 <- d2 %>%
  nest(data = c(temp, rate)) %>%
  mutate(
    mods = map(
      data,
      in_parallel(
        \(x) fit_TPCs(d = x),
        fit_TPCs = fit_TPCs
      ),
      .progress = TRUE
    )
  ) %>%
  unnest(mods)

daemons(0)

end_time_parallel <- Sys.time()

print(end_time_parallel - start_time_parallel)

``` 

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Parallelising rTPC #78

Metadata

Assignees

Labels

Projects

Milestone

Relationships

Development

Parallelising rTPC #78

Description

Metadata

Metadata

Assignees

Labels

Projects

Milestone

Relationships

Development

Issue actions