
Simulate values of summary statistics across a parameter grid
Source:R/simulate_grid.R
simulate_grid.Rd
For a given data frame (each column a parameter of a slendr model function) simulates values of given population genetic statistics
Usage
simulate_grid(
model,
grid,
functions,
replicates,
sequence_length,
recombination_rate,
mutation_rate = 0,
data = NULL,
format = c("ts", "files"),
packages = NULL,
file = NULL,
engine = NULL,
model_args = NULL,
engine_args = NULL,
strict = TRUE
)
Arguments
- model
A slendr model generating function
- grid
A data frame object containing parameter grid such as one produced by
tidyr::expand_grid
orbase::expand.grid
- functions
A named list of summary statistic functions to apply on simulated tree sequences
- replicates
How many simulation replicates to run for each parameter combination?
- sequence_length
Amount of sequence to simulate using slendr (in base pairs). Ignored when custom simulations scripts are provided.
- recombination_rate
Recombination rate to use for the simulation
- mutation_rate
Mutation rate to use for the simulation
- data
A named list of data-generating functions. The names then represent all possible arguments of simulated summary statistic functions.
- format
In which format will the model generate results to be used for computing simulated summary statistics?
- packages
A character vector with package names used by user-defined summary statistic functions. Only relevant when parallelization is set up using
future::plan()
to make sure that the parallelized tree-sequence summary statistic functions have all of their packages available.- file
If not
NULL
, a path where to save the data frame with simulated grid results. If this path is set, the results data frame is returned but invisibly.- engine
Which simulation engine to use? Values "msprime" and "slim" will use one of the built-in slendr simulation back ends. Which engine will be used is determined by the nature of the
model
. Ifengine = NULL
, then spatial slendr models will by default use the "slim" back end, non-spatial models will use the "msprime" back end, and custom user-defined model scripts will use the "custom" engine. Setting this argument explicitly will change the back ends (where appropriate). Setting this argument for custom simulation script has no effect.- model_args
Optional (non-prior) arguments for the slendr model generating function. Setting this argument for custom simulation script has no effect.
- engine_args
Optional arguments for the slendr simulation back end. Setting this argument for custom simulation script has no effect.
- strict
Should parameter combinations leading to invalid slendr models be treated as an error? Default is
TRUE
. If set toFALSE
, invalid simulations will be simply dropped, with an informative message.
Value
If file != NULL
, returns a data frame with simulated grid results. Otherwise
does not return anything, saving an object to an .rds file instead.
A data frame object with the results of parameter grid simulations, with values of each summary statistic stored in a list-column
Examples
slendr::check_dependencies(python = TRUE, quit = TRUE)
library(demografr)
library(slendr)
init_env(quiet = TRUE)
##################################################
# define a model
model <- function(Ne_A, Ne_B, Ne_C, Ne_D, T_AB, T_BC, T_CD, gf_BC) {
A <- population("A", time = 1, N = Ne_A)
B <- population("B", time = T_AB, N = Ne_B, parent = A)
C <- population("C", time = T_BC, N = Ne_C, parent = B)
D <- population("D", time = T_CD, N = Ne_D, parent = C)
gf <- gene_flow(from = B, to = C, start = 9000, end = 9301, rate = gf_BC)
model <- compile_model(
populations = list(A, B, C, D), gene_flow = gf,
generation_time = 1, simulation_length = 10000,
direction = "forward", serialize = FALSE
)
samples <- schedule_sampling(
model, times = 10000,
list(A, 25), list(B, 25), list(C, 25), list(D, 25),
strict = TRUE
)
# when a specific sampling schedule is to be used, both model and samples
# must be returned by the function
return(list(model, samples))
}
##################################################
# set up parameter grid
grid <- expand.grid(
Ne_A = c(1000, 3000),
Ne_B = c(100, 1500),
Ne_C = c(5000, 10000),
Ne_D = c(2000, 7000),
T_AB = c(100, 3000),
T_BC = c(4000, 6000),
T_CD = c(7000, 10000),
gf_BC = 0.1
)
# let's make the grid a little smaller just for this example
grid <- grid[1:10, ]
##################################################
# prepare a list of simulated summary statistics
compute_diversity <- function(ts) {
samples <- ts_names(ts, split = "pop")
ts_diversity(ts, sample_sets = samples)
}
compute_divergence <- function(ts) {
samples <- ts_names(ts, split = "pop")
ts_divergence(ts, sample_sets = samples)
}
compute_f4 <- function(ts) {
samples <- ts_names(ts, split = "pop")
A <- samples["A"]; B <- samples["B"]
C <- samples["C"]; D <- samples["D"]
ts_f4(ts, A, B, C, D)
}
functions <- list(
diversity = compute_diversity,
divergence = compute_divergence,
f4 = compute_f4
)
##################################################
# simulate data from a single model run
# (step #1 of one replicate of a grid simulation)
ts <- simulate_model(model, grid, sequence_length = 1e6, recombination_rate = 1e-8)
##################################################
# simulate data from a single model run
# (step #2 of one replicate of a grid simulation)
summarise_data(ts, functions)
#> $diversity
#> # A tibble: 4 × 2
#> set diversity
#> <chr> <dbl>
#> 1 A 0
#> 2 B 0
#> 3 C 0
#> 4 D 0
#>
#> $divergence
#> # A tibble: 6 × 3
#> x y divergence
#> <chr> <chr> <dbl>
#> 1 A B 0
#> 2 A C 0
#> 3 A D 0
#> 4 B C 0
#> 5 B D 0
#> 6 C D 0
#>
#> $f4
#> # A tibble: 1 × 5
#> W X Y Z f4
#> <chr> <chr> <chr> <chr> <dbl>
#> 1 A B C D 0
#>
#
# we're skipping the remaining steps because they are extremely
# computationally intensive for the scope of this example
#
##################################################
# set up paralelization
# library(future)
# plan(multisession, workers = availableCores())
##################################################
# simulate data across the parameter grid
data <- simulate_grid(
model, grid, functions, replicates = 1,
sequence_length = 1e6, recombination_rate = 1e-8, mutation_rate = 1e-8
)
# the results (summary statistic values simulated across the parameter grid)
# are present in list-columns in the produced data frame (in this example,
# columns `diversity`, `divergence`, `f4`)
data
#> # A tibble: 10 × 12
#> rep Ne_A Ne_B Ne_C Ne_D T_AB T_BC T_CD gf_BC diversity divergence
#> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <list> <list>
#> 1 1 1000 100 5000 2000 100 4000 7000 0.1 <tibble> <tibble>
#> 2 1 3000 100 5000 2000 100 4000 7000 0.1 <tibble> <tibble>
#> 3 1 1000 1500 5000 2000 100 4000 7000 0.1 <tibble> <tibble>
#> 4 1 3000 1500 5000 2000 100 4000 7000 0.1 <tibble> <tibble>
#> 5 1 1000 100 10000 2000 100 4000 7000 0.1 <tibble> <tibble>
#> 6 1 3000 100 10000 2000 100 4000 7000 0.1 <tibble> <tibble>
#> 7 1 1000 1500 10000 2000 100 4000 7000 0.1 <tibble> <tibble>
#> 8 1 3000 1500 10000 2000 100 4000 7000 0.1 <tibble> <tibble>
#> 9 1 1000 100 5000 7000 100 4000 7000 0.1 <tibble> <tibble>
#> 10 1 3000 100 5000 7000 100 4000 7000 0.1 <tibble> <tibble>
#> # ℹ 1 more variable: f4 <list>
# for easier data analysis, each statistic can be unnested
tidyr::unnest(data, diversity)
#> # A tibble: 40 × 13
#> rep Ne_A Ne_B Ne_C Ne_D T_AB T_BC T_CD gf_BC set diversity
#> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <dbl>
#> 1 1 1000 100 5000 2000 100 4000 7000 0.1 A 0.0000563
#> 2 1 1000 100 5000 2000 100 4000 7000 0.1 B 0.00000244
#> 3 1 1000 100 5000 2000 100 4000 7000 0.1 C 0.0000923
#> 4 1 1000 100 5000 2000 100 4000 7000 0.1 D 0.0000725
#> 5 1 3000 100 5000 2000 100 4000 7000 0.1 A 0.000136
#> 6 1 3000 100 5000 2000 100 4000 7000 0.1 B 0.00000795
#> 7 1 3000 100 5000 2000 100 4000 7000 0.1 C 0.0000887
#> 8 1 3000 100 5000 2000 100 4000 7000 0.1 D 0.0000750
#> 9 1 1000 1500 5000 2000 100 4000 7000 0.1 A 0.0000481
#> 10 1 1000 1500 5000 2000 100 4000 7000 0.1 B 0.0000544
#> # ℹ 30 more rows
#> # ℹ 2 more variables: divergence <list>, f4 <list>
tidyr::unnest(data, divergence)
#> # A tibble: 60 × 14
#> rep Ne_A Ne_B Ne_C Ne_D T_AB T_BC T_CD gf_BC diversity x y
#> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <list> <chr> <chr>
#> 1 1 1000 100 5000 2000 100 4000 7000 0.1 <tibble> A B
#> 2 1 1000 100 5000 2000 100 4000 7000 0.1 <tibble> A C
#> 3 1 1000 100 5000 2000 100 4000 7000 0.1 <tibble> A D
#> 4 1 1000 100 5000 2000 100 4000 7000 0.1 <tibble> B C
#> 5 1 1000 100 5000 2000 100 4000 7000 0.1 <tibble> B D
#> 6 1 1000 100 5000 2000 100 4000 7000 0.1 <tibble> C D
#> 7 1 3000 100 5000 2000 100 4000 7000 0.1 <tibble> A B
#> 8 1 3000 100 5000 2000 100 4000 7000 0.1 <tibble> A C
#> 9 1 3000 100 5000 2000 100 4000 7000 0.1 <tibble> A D
#> 10 1 3000 100 5000 2000 100 4000 7000 0.1 <tibble> B C
#> # ℹ 50 more rows
#> # ℹ 2 more variables: divergence <dbl>, f4 <list>
tidyr::unnest(data, f4)
#> # A tibble: 10 × 16
#> rep Ne_A Ne_B Ne_C Ne_D T_AB T_BC T_CD gf_BC diversity divergence
#> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <list> <list>
#> 1 1 1000 100 5000 2000 100 4000 7000 0.1 <tibble> <tibble>
#> 2 1 3000 100 5000 2000 100 4000 7000 0.1 <tibble> <tibble>
#> 3 1 1000 1500 5000 2000 100 4000 7000 0.1 <tibble> <tibble>
#> 4 1 3000 1500 5000 2000 100 4000 7000 0.1 <tibble> <tibble>
#> 5 1 1000 100 10000 2000 100 4000 7000 0.1 <tibble> <tibble>
#> 6 1 3000 100 10000 2000 100 4000 7000 0.1 <tibble> <tibble>
#> 7 1 1000 1500 10000 2000 100 4000 7000 0.1 <tibble> <tibble>
#> 8 1 3000 1500 10000 2000 100 4000 7000 0.1 <tibble> <tibble>
#> 9 1 1000 100 5000 7000 100 4000 7000 0.1 <tibble> <tibble>
#> 10 1 3000 100 5000 7000 100 4000 7000 0.1 <tibble> <tibble>
#> # ℹ 5 more variables: W <chr>, X <chr>, Y <chr>, Z <chr>, f4 <dbl>