Simulate values of summary statistics across a parameter grid

For a given data frame (each column a parameter of a slendr model function) simulates values of given population genetic statistics

Usage

simulate_grid(
  model,
  grid,
  functions,
  replicates,
  sequence_length,
  recombination_rate,
  mutation_rate = 0,
  data = NULL,
  format = c("ts", "files"),
  packages = NULL,
  file = NULL,
  engine = NULL,
  model_args = NULL,
  engine_args = NULL,
  strict = TRUE
)

Arguments

model: A slendr model generating function
grid: A data frame object containing parameter grid such as one produced by tidyr::expand_grid or base::expand.grid
functions: A named list of summary statistic functions to apply on simulated tree sequences
replicates: How many simulation replicates to run for each parameter combination?
sequence_length: Amount of sequence to simulate using slendr (in base pairs). Ignored when custom simulations scripts are provided.
recombination_rate: Recombination rate to use for the simulation
mutation_rate: Mutation rate to use for the simulation
data: A named list of data-generating functions. The names then represent all possible arguments of simulated summary statistic functions.
format: In which format will the model generate results to be used for computing simulated summary statistics?
packages: A character vector with package names used by user-defined summary statistic functions. Only relevant when parallelization is set up using future::plan() to make sure that the parallelized tree-sequence summary statistic functions have all of their packages available.
file: If not NULL, a path where to save the data frame with simulated grid results. If this path is set, the results data frame is returned but invisibly.
engine: Which simulation engine to use? Values "msprime" and "slim" will use one of the built-in slendr simulation back ends. Which engine will be used is determined by the nature of the model. If engine = NULL, then spatial slendr models will by default use the "slim" back end, non-spatial models will use the "msprime" back end, and custom user-defined model scripts will use the "custom" engine. Setting this argument explicitly will change the back ends (where appropriate). Setting this argument for custom simulation script has no effect.
model_args: Optional (non-prior) arguments for the slendr model generating function. Setting this argument for custom simulation script has no effect.
engine_args: Optional arguments for the slendr simulation back end. Setting this argument for custom simulation script has no effect.
strict: Should parameter combinations leading to invalid slendr models be treated as an error? Default is TRUE. If set to FALSE, invalid simulations will be simply dropped, with an informative message.

Value

If file != NULL, returns a data frame with simulated grid results. Otherwise does not return anything, saving an object to an .rds file instead.

A data frame object with the results of parameter grid simulations, with values of each summary statistic stored in a list-column

Examples

slendr::check_dependencies(python = TRUE, quit = TRUE)

library(demografr)

library(slendr)
init_env(quiet = TRUE)

##################################################
# define a model

model <- function(Ne_A, Ne_B, Ne_C, Ne_D, T_AB, T_BC, T_CD, gf_BC) {
  A <- population("A", time = 1,    N = Ne_A)
  B <- population("B", time = T_AB, N = Ne_B, parent = A)
  C <- population("C", time = T_BC, N = Ne_C, parent = B)
  D <- population("D", time = T_CD, N = Ne_D, parent = C)

  gf <- gene_flow(from = B, to = C, start = 9000, end = 9301, rate = gf_BC)

  model <- compile_model(
    populations = list(A, B, C, D), gene_flow = gf,
    generation_time = 1, simulation_length = 10000,
    direction = "forward", serialize = FALSE
  )

  samples <- schedule_sampling(
    model, times = 10000,
    list(A, 25), list(B, 25), list(C, 25), list(D, 25),
    strict = TRUE
  )

  # when a specific sampling schedule is to be used, both model and samples
  # must be returned by the function
  return(list(model, samples))
}

##################################################
# set up parameter grid

grid <- expand.grid(
  Ne_A  = c(1000, 3000),
  Ne_B  = c(100,  1500),
  Ne_C  = c(5000, 10000),
  Ne_D  = c(2000, 7000),

  T_AB  = c(100,    3000),
  T_BC  = c(4000, 6000),
  T_CD  = c(7000, 10000),

  gf_BC = 0.1
)

# let's make the grid a little smaller just for this example
grid <- grid[1:10, ]

##################################################
# prepare a list of simulated summary statistics

compute_diversity <- function(ts) {
  samples <- ts_names(ts, split = "pop")
  ts_diversity(ts, sample_sets = samples)
}
compute_divergence <- function(ts) {
  samples <- ts_names(ts, split = "pop")
  ts_divergence(ts, sample_sets = samples)
}
compute_f4 <- function(ts) {
  samples <- ts_names(ts, split = "pop")
  A <- samples["A"]; B <- samples["B"]
  C <- samples["C"]; D <- samples["D"]
  ts_f4(ts, A, B, C, D)
}

functions <- list(
  diversity  = compute_diversity,
  divergence = compute_divergence,
  f4         = compute_f4
)

##################################################
# simulate data from a single model run
# (step #1 of one replicate of a grid simulation)

ts <- simulate_model(model, grid, sequence_length = 1e6, recombination_rate = 1e-8)

##################################################
# simulate data from a single model run
# (step #2 of one replicate of a grid simulation)

summarise_data(ts, functions)
#> $diversity
#> # A tibble: 4 × 2
#>   set   diversity
#>   <chr>     <dbl>
#> 1 A             0
#> 2 B             0
#> 3 C             0
#> 4 D             0
#> 
#> $divergence
#> # A tibble: 6 × 3
#>   x     y     divergence
#>   <chr> <chr>      <dbl>
#> 1 A     B              0
#> 2 A     C              0
#> 3 A     D              0
#> 4 B     C              0
#> 5 B     D              0
#> 6 C     D              0
#> 
#> $f4
#> # A tibble: 1 × 5
#>   W     X     Y     Z        f4
#>   <chr> <chr> <chr> <chr> <dbl>
#> 1 A     B     C     D         0
#> 

#
# we're skipping the remaining steps because they are extremely
# computationally intensive for the scope of this example
#

##################################################
# set up paralelization

# library(future)
# plan(multisession, workers = availableCores())

##################################################
# simulate data across the parameter grid

data <- simulate_grid(
  model, grid, functions, replicates = 1,
  sequence_length = 1e6, recombination_rate = 1e-8, mutation_rate = 1e-8
)

# the results (summary statistic values simulated across the parameter grid)
# are present in list-columns in the produced data frame (in this example,
# columns `diversity`, `divergence`, `f4`)
data
#> # A tibble: 10 × 12
#>      rep  Ne_A  Ne_B  Ne_C  Ne_D  T_AB  T_BC  T_CD gf_BC diversity divergence
#>    <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <list>    <list>    
#>  1     1  1000   100  5000  2000   100  4000  7000   0.1 <tibble>  <tibble>  
#>  2     1  3000   100  5000  2000   100  4000  7000   0.1 <tibble>  <tibble>  
#>  3     1  1000  1500  5000  2000   100  4000  7000   0.1 <tibble>  <tibble>  
#>  4     1  3000  1500  5000  2000   100  4000  7000   0.1 <tibble>  <tibble>  
#>  5     1  1000   100 10000  2000   100  4000  7000   0.1 <tibble>  <tibble>  
#>  6     1  3000   100 10000  2000   100  4000  7000   0.1 <tibble>  <tibble>  
#>  7     1  1000  1500 10000  2000   100  4000  7000   0.1 <tibble>  <tibble>  
#>  8     1  3000  1500 10000  2000   100  4000  7000   0.1 <tibble>  <tibble>  
#>  9     1  1000   100  5000  7000   100  4000  7000   0.1 <tibble>  <tibble>  
#> 10     1  3000   100  5000  7000   100  4000  7000   0.1 <tibble>  <tibble>  
#> # ℹ 1 more variable: f4 <list>

# for easier data analysis, each statistic can be unnested
tidyr::unnest(data, diversity)
#> # A tibble: 40 × 13
#>      rep  Ne_A  Ne_B  Ne_C  Ne_D  T_AB  T_BC  T_CD gf_BC set    diversity
#>    <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>      <dbl>
#>  1     1  1000   100  5000  2000   100  4000  7000   0.1 A     0.0000563 
#>  2     1  1000   100  5000  2000   100  4000  7000   0.1 B     0.00000244
#>  3     1  1000   100  5000  2000   100  4000  7000   0.1 C     0.0000923 
#>  4     1  1000   100  5000  2000   100  4000  7000   0.1 D     0.0000725 
#>  5     1  3000   100  5000  2000   100  4000  7000   0.1 A     0.000136  
#>  6     1  3000   100  5000  2000   100  4000  7000   0.1 B     0.00000795
#>  7     1  3000   100  5000  2000   100  4000  7000   0.1 C     0.0000887 
#>  8     1  3000   100  5000  2000   100  4000  7000   0.1 D     0.0000750 
#>  9     1  1000  1500  5000  2000   100  4000  7000   0.1 A     0.0000481 
#> 10     1  1000  1500  5000  2000   100  4000  7000   0.1 B     0.0000544 
#> # ℹ 30 more rows
#> # ℹ 2 more variables: divergence <list>, f4 <list>
tidyr::unnest(data, divergence)
#> # A tibble: 60 × 14
#>      rep  Ne_A  Ne_B  Ne_C  Ne_D  T_AB  T_BC  T_CD gf_BC diversity x     y    
#>    <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <list>    <chr> <chr>
#>  1     1  1000   100  5000  2000   100  4000  7000   0.1 <tibble>  A     B    
#>  2     1  1000   100  5000  2000   100  4000  7000   0.1 <tibble>  A     C    
#>  3     1  1000   100  5000  2000   100  4000  7000   0.1 <tibble>  A     D    
#>  4     1  1000   100  5000  2000   100  4000  7000   0.1 <tibble>  B     C    
#>  5     1  1000   100  5000  2000   100  4000  7000   0.1 <tibble>  B     D    
#>  6     1  1000   100  5000  2000   100  4000  7000   0.1 <tibble>  C     D    
#>  7     1  3000   100  5000  2000   100  4000  7000   0.1 <tibble>  A     B    
#>  8     1  3000   100  5000  2000   100  4000  7000   0.1 <tibble>  A     C    
#>  9     1  3000   100  5000  2000   100  4000  7000   0.1 <tibble>  A     D    
#> 10     1  3000   100  5000  2000   100  4000  7000   0.1 <tibble>  B     C    
#> # ℹ 50 more rows
#> # ℹ 2 more variables: divergence <dbl>, f4 <list>
tidyr::unnest(data, f4)
#> # A tibble: 10 × 16
#>      rep  Ne_A  Ne_B  Ne_C  Ne_D  T_AB  T_BC  T_CD gf_BC diversity divergence
#>    <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <list>    <list>    
#>  1     1  1000   100  5000  2000   100  4000  7000   0.1 <tibble>  <tibble>  
#>  2     1  3000   100  5000  2000   100  4000  7000   0.1 <tibble>  <tibble>  
#>  3     1  1000  1500  5000  2000   100  4000  7000   0.1 <tibble>  <tibble>  
#>  4     1  3000  1500  5000  2000   100  4000  7000   0.1 <tibble>  <tibble>  
#>  5     1  1000   100 10000  2000   100  4000  7000   0.1 <tibble>  <tibble>  
#>  6     1  3000   100 10000  2000   100  4000  7000   0.1 <tibble>  <tibble>  
#>  7     1  1000  1500 10000  2000   100  4000  7000   0.1 <tibble>  <tibble>  
#>  8     1  3000  1500 10000  2000   100  4000  7000   0.1 <tibble>  <tibble>  
#>  9     1  1000   100  5000  7000   100  4000  7000   0.1 <tibble>  <tibble>  
#> 10     1  3000   100  5000  7000   100  4000  7000   0.1 <tibble>  <tibble>  
#> # ℹ 5 more variables: W <chr>, X <chr>, Y <chr>, Z <chr>, f4 <dbl>