Skip to content

Parallelising rTPC #78

@padpadpadpad

Description

@padpadpadpad

I have used the new purrr to parallelise fitting of TPCs, but have also changed fundamentally how multiple models are fit. Instead of fitting each model using a separate map call, we write a custom function of all the models we want to fit that we then pass to purrr. This makes it much easier to play ball with how purrr wants to parallelise. And I think should be the default way we try and fit going forwards. Maybe having an argument for models to fit and then returning a dataframe with them all. It may of course allow people to think less, but this seems to be the best way going forwards and avoids code repeats. Probably quite similar to #77.

Proof of principle code is here. It goes from 48s to 14s, a pretty massive speed up when you have LOADS of curves. This approach could replace all of the vignettes we have, at least the model fitting ones.

Open to thoughts @fwimp

# if librarian is not installed, install it
if (!requireNamespace("librarian", quietly = TRUE)) {
  install.packages("librarian")
}
# load packages
librarian::shelf(
  tidyverse,
  rTPC,
  nls.multstart,
  progressr,
  microbenchmark,
  mirai
)

## ---------------------------

# load in data
data("chlorella_tpc")

d <- chlorella_tpc

# just some of the models, edit as wanted
d2 <- filter(d, curve_id <= 60)

# ever more complex uses of parallelisation of purrr and usign rtpc

#----------------------------------------------#
# 1. just fit a single model to the dataset ####
#----------------------------------------------#

# write a function to fit a single model

# fit single rTPC
fit_gaussian <- function(d, ...) {
  # get start values and fit model
  start_vals <- rTPC::get_start_vals(
    d$temp,
    d$rate,
    model_name = 'gaussian_1987'
  )
  lower <- rTPC::get_lower_lims(d$temp, d$rate, model_name = 'gaussian_1987')
  upper <- rTPC::get_upper_lims(d$temp, d$rate, model_name = 'gaussian_1987')

  # fit model
  mod <- nls.multstart::nls_multstart(
    rate ~ rTPC::gaussian_1987(temp = temp, rmax, topt, a),
    data = d,
    iter = c(4, 4, 4),
    start_lower = start_vals - 10,
    start_upper = start_vals + 10,
    lower = rTPC::get_lower_lims(d$temp, d$rate, model_name = 'gaussian_1987'),
    upper = rTPC::get_upper_lims(d$temp, d$rate, model_name = 'gaussian_1987'),
    supp_errors = 'Y',
    convergence_count = FALSE
  )

  return(mod)
}

# fit gaussian without parallelisation
start_normal <- Sys.time()

d_mods <- d2 %>%
  nest(data = c(temp, rate)) %>%
  mutate(
    mods = map(
      data,
      ~ fit_gaussian(d = .x)
    ),
    .progress = TRUE
  )

end_normal <- Sys.time()

print(end_normal - start_normal)

# create a parallel plan
start_time_parallel <- Sys.time()

# try parallelisation
daemons(3)

d_mods1 <- d2 %>%
  nest(data = c(temp, rate)) %>%
  mutate(
    mods = map(
      data,
      in_parallel(
        \(x) fit_gaussian(d = x),
        fit_gaussian = fit_gaussian
      ),
      .progress = TRUE
    )
  )


daemons(0)

end_time_parallel <- Sys.time()

# print time taken
print(end_time_parallel - start_time_parallel)

#------------------------------------------#
# 2. fit multiple models to the dataset ####
#------------------------------------------#

# instead of running multiple map calls within a single mutate call, we can include a single model fitting function to fit the models we want

# fit single rTPC
fit_TPCs <- function(d, ...) {
  # get start values and fit model
  start_vals <- rTPC::get_start_vals(
    d$temp,
    d$rate,
    model_name = 'gaussian_1987'
  )

  # fit model
  mod <- nls.multstart::nls_multstart(
    rate ~ rTPC::gaussian_1987(temp = temp, rmax, topt, a),
    data = d,
    iter = c(4, 4, 4),
    start_lower = start_vals - 10,
    start_upper = start_vals + 10,
    lower = rTPC::get_lower_lims(d$temp, d$rate, model_name = 'gaussian_1987'),
    upper = rTPC::get_upper_lims(d$temp, d$rate, model_name = 'gaussian_1987'),
    supp_errors = 'Y',
    convergence_count = FALSE,
    ...
  )

  # get start values and fit model
  start_vals <- rTPC::get_start_vals(
    d$temp,
    d$rate,
    model_name = 'boatman_2017'
  )

  # fit model
  mod1 <- nls.multstart::nls_multstart(
    rate ~ rTPC::boatman_2017(temp = temp, rmax, tmin, tmax, a, b),
    data = d,
    iter = c(4, 4, 4, 4, 4),
    start_lower = start_vals - 10,
    start_upper = start_vals + 10,
    lower = rTPC::get_lower_lims(d$temp, d$rate, model_name = 'boatman_2017'),
    upper = rTPC::get_upper_lims(d$temp, d$rate, model_name = 'boatman_2017'),
    supp_errors = 'Y',
    convergence_count = FALSE,
    ...
  )

  # get start values and fit model
  start_vals <- rTPC::get_start_vals(
    d$temp,
    d$rate,
    model_name = 'sharpeschoolhigh_1981'
  )

  # fit model
  mod2 <- nls.multstart::nls_multstart(
    rate ~
      rTPC::sharpeschoolhigh_1981(temp = temp, r_tref, e, eh, th, tref = 20),
    data = d,
    iter = c(3, 3, 3, 3),
    start_lower = start_vals - 10,
    start_upper = start_vals + 10,
    lower = rTPC::get_lower_lims(
      d$temp,
      d$rate,
      model_name = 'sharpeschoolhigh_1981'
    ),
    upper = rTPC::get_upper_lims(
      d$temp,
      d$rate,
      model_name = 'sharpeschoolhigh_1981'
    ),
    supp_errors = 'Y',
    convergence_count = FALSE,
    ...
  )

  return(tibble::tibble(
    gaussian = list(mod),
    boatman = list(mod1),
    sharpeschoolhigh = list(mod2)
  ))
}

# fit all models without parallelisation
start_normal <- Sys.time()

d_mods <- d2 %>%
  nest(data = c(temp, rate)) %>%
  mutate(
    mods = map(
      data,
      ~ fit_TPCs(d = .x),
      .progress = TRUE
    )
  ) %>%
  unnest(mods)

end_normal <- Sys.time()

print(end_normal - start_normal)

# create a parallel plan
start_time_parallel <- Sys.time()

# try parallelisation
daemons(6)

d_mods1 <- d2 %>%
  nest(data = c(temp, rate)) %>%
  mutate(
    mods = map(
      data,
      in_parallel(
        \(x) fit_TPCs(d = x),
        fit_TPCs = fit_TPCs
      ),
      .progress = TRUE
    )
  ) %>%
  unnest(mods)

daemons(0)

end_time_parallel <- Sys.time()

print(end_time_parallel - start_time_parallel)

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions