This is a core function for ABC inference using demografr. It generates simulation replicates and computes summary statistic for the next step of an inference procedure, which is the ABC estimation itself.
Usage
simulate_abc(
model,
priors,
functions,
observed,
iterations,
sequence_length,
recombination_rate,
mutation_rate = 0,
data = NULL,
format = c("ts", "files"),
file = NULL,
packages = NULL,
attempts = 1000,
engine = NULL,
model_args = NULL,
engine_args = NULL
)
Arguments
- model
Either a slendr model generating function (in which case
engine
must be either "msprime" or "slim", i.e. one of the two of slendr's simulation back ends), or a path to a custom user-defined SLiM or msprime script (in which caseengine
must be "custom").- priors
A list of prior distributions to use for sampling of model parameters
- functions
A named list of summary statistic functions to apply on simulated tree sequences
- observed
A named list of observed summary statistics
- iterations
How many simulation replicates to run?
- sequence_length
Amount of sequence to simulate using slendr (in numbers of basepairs)
- recombination_rate
Recombination rate to use for the simulation
- mutation_rate
Mutation rate to use for the simulation
- data
A named list of data-generating functions. The names then represent all possible arguments of simulated summary statistic functions.
- format
In which format will the model generate results to be used for computing simulated summary statistics?
- file
If not
NULL
, a path where to save the data frame with simulated grid results- packages
A character vector with package names used by user-defined summary statistic functions. Only relevant when parallelization is set up using
future::plan()
to make sure that the parallelized tree-sequence summary statistic functions have all of their packages available.- attempts
Maximum number of attempts to generate prior values for a valid demographic model (default is 1000)
- engine
Which simulation engine to use? Values "msprime" and "slim" will use one of the built-in slendr simulation back ends. Which engine will be used is determined by the nature of the
model
. Ifengine = NULL
, then spatial slendr models will by default use the "slim" back end, non-spatial models will use the "msprime" back end, and custom user-defined model scripts will use the "custom" engine. Setting this argument explicitly will change the back ends (where appropriate). Setting this argument for custom simulation script has no effect.- model_args
Optional (non-prior) arguments for the slendr model generating function. Setting this argument for custom simulation script has no effect.
- engine_args
Optional arguments for the slendr simulation back end. Setting this argument for custom simulation script has no effect.
Value
A list object of the class demografr_abc_sims
containing the results
of ABC simulations, sampled parameters, priors, and tree-sequence summary statistics
Examples
slendr::check_dependencies(python = TRUE, quit = TRUE)
library(demografr)
library(slendr)
init_env(quiet = TRUE)
##################################################
# define a model
model <- function(Ne_A, Ne_B, Ne_C, Ne_D, T_AB, T_BC, T_CD, gf_BC) {
A <- population("A", time = 1, N = Ne_A)
B <- population("B", time = T_AB, N = Ne_B, parent = A)
C <- population("C", time = T_BC, N = Ne_C, parent = B)
D <- population("D", time = T_CD, N = Ne_D, parent = C)
gf <- gene_flow(from = B, to = C, start = 9000, end = 9301, rate = gf_BC)
model <- compile_model(
populations = list(A, B, C, D), gene_flow = gf,
generation_time = 1, simulation_length = 10000,
direction = "forward", serialize = FALSE
)
samples <- schedule_sampling(
model, times = 10000,
list(A, 25), list(B, 25), list(C, 25), list(D, 25),
strict = TRUE
)
# when a specific sampling schedule is to be used, both model and samples
# must be returned by the function
return(list(model, samples))
}
##################################################
# set up priors
priors <- list(
Ne_A ~ runif(1000, 3000),
Ne_B ~ runif(100, 1500),
Ne_C ~ runif(5000, 10000),
Ne_D ~ runif(2000, 7000),
T_AB ~ runif(1, 4000),
T_BC ~ runif(3000, 9000),
T_CD ~ runif(5000, 10000),
gf_BC ~ runif(0, 0.3)
)
##################################################
# prepare a list of empirical summary statistics
observed_diversity <- read.table(
system.file("examples/basics_diversity.tsv", package = "demografr"),
header = TRUE
)
observed_divergence <- read.table(
system.file("examples/basics_divergence.tsv", package = "demografr"),
header = TRUE
)
observed_f4 <- read.table(
system.file("examples/basics_f4.tsv", package = "demografr"),
header = TRUE
)
observed <- list(
diversity = observed_diversity,
divergence = observed_divergence,
f4 = observed_f4
)
##################################################
# prepare a list of simulated summary statistics
compute_diversity <- function(ts) {
samples <- ts_names(ts, split = "pop")
ts_diversity(ts, sample_sets = samples)
}
compute_divergence <- function(ts) {
samples <- ts_names(ts, split = "pop")
ts_divergence(ts, sample_sets = samples)
}
compute_f4 <- function(ts) {
samples <- ts_names(ts, split = "pop")
A <- samples["A"]; B <- samples["B"]
C <- samples["C"]; D <- samples["D"]
ts_f4(ts, A, B, C, D)
}
functions <- list(
diversity = compute_diversity,
divergence = compute_divergence,
f4 = compute_f4
)
##################################################
# simulate data from a single model run
# (step #1 of a single ABC replicate simulation)
ts <- simulate_model(model, priors, sequence_length = 1e6, recombination_rate = 1e-8)
##################################################
# simulate data from a single model run
# (step #2 of a single ABC replicate simulation)
summarise_data(ts, functions)
#> $diversity
#> # A tibble: 4 × 2
#> set diversity
#> <chr> <dbl>
#> 1 A 0
#> 2 B 0
#> 3 C 0
#> 4 D 0
#>
#> $divergence
#> # A tibble: 6 × 3
#> x y divergence
#> <chr> <chr> <dbl>
#> 1 A B 0
#> 2 A C 0
#> 3 A D 0
#> 4 B C 0
#> 5 B D 0
#> 6 C D 0
#>
#> $f4
#> # A tibble: 1 × 5
#> W X Y Z f4
#> <chr> <chr> <chr> <chr> <dbl>
#> 1 A B C D 0
#>
##################################################
# validate all components of the ABC inference pipeline
validate_abc(model, priors, functions, observed,
sequence_length = 1e6, recombination_rate = 1e-8)
#> ======================================================================
#> Testing sampling of each prior parameter:
#> - Ne_A ✅
#> - Ne_B ✅
#> - Ne_C ✅
#> - Ne_D ✅
#> - T_AB ✅
#> - T_BC ✅
#> - T_CD ✅
#> - gf_BC ✅
#> ---------------------------------------------------------------------
#> The model is a slendr function
#> ---------------------------------------------------------------------
#> Checking the return statement of the model function... ✅
#> ---------------------------------------------------------------------
#> Checking the presence of required model function arguments...---------------------------------------------------------------------
#> Simulating tree sequence from the given model... ✅
#> ---------------------------------------------------------------------
#> Computing user-defined summary functions:
#> - diversity ✅
#> - divergence ✅
#> - f4 ✅
#> ---------------------------------------------------------------------
#> Checking the format of simulated summary statistics:
#> - diversity [data frame] ✅
#> - divergence [data frame] ✅
#> - f4 [data frame] ✅
#> ======================================================================
#> No issues have been found in the ABC setup!
#
# we're skipping the remaining steps because they are extremely
# computationally intensive for the scope of this example
#
##################################################
# set up paralelization
# library(future)
# plan(multisession, workers = availableCores())
##################################################
# simulate data (summary statistics)
# data <- simulate_abc(
# model, priors, functions, observed, iterations = 10000,
# sequence_length = 10e6, recombination_rate = 1e-8, mutation_rate = 1e-8
# )
##################################################
# perform ABC inference
# abc <- run_abc(data, engine = "abc", tol = 0.01, method = "neuralnet")