Skip to contents

Apply summary statistic functions to the simulated data

Usage

summarise_data(data, functions)

Arguments

data

Data generated by simulate_model. Either a named list (when custom data-generating functions were used on user-defined simulation output files) or an object of the class slendr_ts (when a standard tree-sequence-producing simulation was ran by slendr itself).

functions

A named list of data-generating functions

Value

A named list of data frames, one summary statistic for each

Examples

slendr::check_dependencies(python = TRUE, quit = TRUE)

library(demografr)

library(slendr)
init_env(quiet = TRUE)

##################################################
# define a model

model <- function(Ne_A, Ne_B, Ne_C, Ne_D, T_AB, T_BC, T_CD, gf_BC) {
  A <- population("A", time = 1,    N = Ne_A)
  B <- population("B", time = T_AB, N = Ne_B, parent = A)
  C <- population("C", time = T_BC, N = Ne_C, parent = B)
  D <- population("D", time = T_CD, N = Ne_D, parent = C)

  gf <- gene_flow(from = B, to = C, start = 9000, end = 9301, rate = gf_BC)

  model <- compile_model(
    populations = list(A, B, C, D), gene_flow = gf,
    generation_time = 1, simulation_length = 10000,
    direction = "forward", serialize = FALSE
  )

  samples <- schedule_sampling(
    model, times = 10000,
    list(A, 25), list(B, 25), list(C, 25), list(D, 25),
    strict = TRUE
  )

  # when a specific sampling schedule is to be used, both model and samples
  # must be returned by the function
  return(list(model, samples))
}

##################################################
# set up priors

priors <- list(
  Ne_A  ~ runif(1000, 3000),
  Ne_B  ~ runif(100,  1500),
  Ne_C  ~ runif(5000, 10000),
  Ne_D  ~ runif(2000, 7000),

  T_AB  ~ runif(1,    4000),
  T_BC  ~ runif(3000, 9000),
  T_CD  ~ runif(5000, 10000),

  gf_BC ~ runif(0, 0.3)
)

##################################################
# prepare a list of empirical summary statistics

observed_diversity <- read.table(
  system.file("examples/basics_diversity.tsv", package = "demografr"),
  header = TRUE
)
observed_divergence <- read.table(
  system.file("examples/basics_divergence.tsv", package = "demografr"),
  header = TRUE
)
observed_f4  <- read.table(
  system.file("examples/basics_f4.tsv", package = "demografr"),
  header = TRUE
)
observed <- list(
  diversity  = observed_diversity,
  divergence = observed_divergence,
  f4         = observed_f4

)

##################################################
# prepare a list of simulated summary statistics

compute_diversity <- function(ts) {
  samples <- ts_names(ts, split = "pop")
  ts_diversity(ts, sample_sets = samples)
}
compute_divergence <- function(ts) {
  samples <- ts_names(ts, split = "pop")
  ts_divergence(ts, sample_sets = samples)
}
compute_f4 <- function(ts) {
  samples <- ts_names(ts, split = "pop")
  A <- samples["A"]; B <- samples["B"]
  C <- samples["C"]; D <- samples["D"]
  ts_f4(ts, A, B, C, D)
}

functions <- list(
  diversity  = compute_diversity,
  divergence = compute_divergence,
  f4         = compute_f4
)

##################################################
# simulate data from a single model run
# (step #1 of a single ABC replicate simulation)

ts <- simulate_model(model, priors, sequence_length = 1e6, recombination_rate = 1e-8)

##################################################
# simulate data from a single model run
# (step #2 of a single ABC replicate simulation)

summarise_data(ts, functions)
#> $diversity
#> # A tibble: 4 × 2
#>   set   diversity
#>   <chr>     <dbl>
#> 1 A             0
#> 2 B             0
#> 3 C             0
#> 4 D             0
#> 
#> $divergence
#> # A tibble: 6 × 3
#>   x     y     divergence
#>   <chr> <chr>      <dbl>
#> 1 A     B              0
#> 2 A     C              0
#> 3 A     D              0
#> 4 B     C              0
#> 5 B     D              0
#> 6 C     D              0
#> 
#> $f4
#> # A tibble: 1 × 5
#>   W     X     Y     Z        f4
#>   <chr> <chr> <chr> <chr> <dbl>
#> 1 A     B     C     D         0
#> 

##################################################
# validate all components of the ABC inference pipeline

validate_abc(model, priors, functions, observed,
             sequence_length = 1e6, recombination_rate = 1e-8)
#> ======================================================================
#> Testing sampling of each prior parameter:
#>   - Ne_A ✅
#>   - Ne_B ✅
#>   - Ne_C ✅
#>   - Ne_D ✅
#>   - T_AB ✅
#>   - T_BC ✅
#>   - T_CD ✅
#>   - gf_BC ✅
#> ---------------------------------------------------------------------
#> The model is a slendr function
#> ---------------------------------------------------------------------
#> Checking the return statement of the model function... ✅
#> ---------------------------------------------------------------------
#> Checking the presence of required model function arguments...---------------------------------------------------------------------
#> Simulating tree sequence from the given model... ✅
#> ---------------------------------------------------------------------
#> Computing user-defined summary functions:
#>   - diversity ✅
#>   - divergence ✅
#>   - f4 ✅
#> ---------------------------------------------------------------------
#> Checking the format of simulated summary statistics:
#>   - diversity [data frame] ✅
#>   - divergence [data frame] ✅
#>   - f4 [data frame] ✅
#> ======================================================================
#> No issues have been found in the ABC setup!

#
# we're skipping the remaining steps because they are extremely
# computationally intensive for the scope of this example
#

##################################################
# set up paralelization

# library(future)
# plan(multisession, workers = availableCores())

##################################################
# simulate data (summary statistics)

# data <- simulate_abc(
#   model, priors, functions, observed, iterations = 10000,
#   sequence_length = 10e6, recombination_rate = 1e-8, mutation_rate = 1e-8
# )

##################################################
# perform ABC inference

# abc <- run_abc(data, engine = "abc", tol = 0.01, method = "neuralnet")