⚠️⚠️⚠️

Note: The demografr R package is still under active development. As a result, its documentation is in a draft stage at best. Typos, inconsistencies, and other issues are unfortunately expected.

⚠️⚠️⚠️

Load empirical summary statistics computed from the observed data:

observed_diversity <- read.table(system.file("examples/observed_diversity.tsv", package = "demografr"), header = TRUE)
observed_divergence <- read.table(system.file("examples/observed_divergence.tsv", package = "demografr"), header = TRUE)
observed_f4  <- read.table(system.file("examples/observed_f4.tsv", package = "demografr"), header = TRUE)

observed <- list(
  diversity  = observed_diversity,
  divergence = observed_divergence,
  f4         = observed_f4
)

Specify the scaffold model function (each function argument corresponds to one parameter to be inferred):

model <- function(Ne_A, Ne_B, Ne_C, Ne_D, T_AB, T_BC, T_CD, gf_BC) {
  popA <- population("popA", time = 1,    N = Ne_A)
  popB <- population("popB", time = T_AB, N = Ne_B, parent = popA)
  popC <- population("popC", time = T_BC, N = Ne_C, parent = popB)
  popD <- population("popD", time = T_CD, N = Ne_D, parent = popC)

  gf <- gene_flow(from = popB, to = popC, start = 9000, end = 9301, rate = gf_BC)

  model <- compile_model(
    populations = list(popA, popB, popC, popD), gene_flow = gf,
    generation_time = 1, simulation_length = 10000,
    direction = "forward", serialize = FALSE
  )

  samples <- schedule_sampling(
    model, times = 10000,
    list(popA, 25), list(popB, 25), list(popC, 25), list(popD, 25),
    strict = TRUE
  )

  return(list(model, samples))
}

Specify tree-sequence summary statistics (one function per one observed statistic, as defined above):

compute_diversity <- function(ts) {
  samples <- ts_names(ts, split = "pop")
  ts_diversity(ts, sample_sets = samples)
}
compute_divergence <- function(ts) {
  samples <- ts_names(ts, split = "pop")
  ts_divergence(ts, sample_sets = samples)
}
compute_f4 <- function(ts) {
  samples <- ts_names(ts, split = "pop")
  A <- samples["popA"]; B <- samples["popB"]
  C <- samples["popC"]; D <- samples["popD"]
  rbind(
    ts_f4(ts, A, B, C, D),
    ts_f4(ts, A, C, B, D),
    ts_f4(ts, A, D, B, C)
  )
}

functions <- list(
  diversity  = compute_diversity,
  divergence = compute_divergence,
  f4         = compute_f4
)
bounds <- list(
  Ne...  ~ between(1, 10000),
  T...   ~ between(1, 10000),
  gf_BC  ~ between(0, 1)
)

result <- run_ga(
  model, functions, observed, bounds,
  sequence_length = 50e6, recombination_rate = 1e-8, mutation_rate = 1e-8,
  iterations = 500, parallel = TRUE
)
result <- readRDS(result_path)
plot(result)

Checking the fitness of the best solution:

colnames(result@solution) <- names(formals(model))
result@solution
result@fitnessValue
compute_fitness(
  result@solution, model, functions, observed,
  sequence_length = 25e6, recombination_rate = 1e-8, mutation_rate = 1e-8
)

Fitness value for the true parameter vector:

params <- list(Ne_A = 2000, Ne_B = 800, Ne_C = 9000, Ne_D = 4000, T_AB = 2000, T_BC = 6000, T_CD = 8000, gf = 0.1)

do.call(model, params)[[1]] %>% plot_model()

results <-
  parallel::mclapply(
    1:100,
    function(i)
      compute_fitness(
        params, model, functions, observed, statistics = TRUE,
        sequence_length = 50e6, recombination_rate = 1e-8, mutation_rate = 1e-8
      )[[1]] %>% dplyr::mutate(rep = i),
    mc.cores = 100
  ) %>%
  do.call(rbind, .)
p_stats <- lapply(names(functions), function(stat) {
  results %>%
    dplyr::filter(stat == !!stat) %>%
    ggplot() +
    geom_density(aes(error, color = id)) +
    guides(color = guide_legend("statistic")) +
    facet_wrap(~ stat, scales = "free")
})
cowplot::plot_grid(plotlist = p_stats)

Results from the perturbed model:

params_ <- list(Ne_A = 1000, Ne_B = 1000, Ne_C = 1000, Ne_D = 1000, T_AB = 1000, T_BC = 5000, T_CD = 8000, gf = 0.1)

observed_ <- simulate_grid(
  model, as.data.frame(params_), functions, replicates = 1,
  sequence_length = 50e6, recombination_rate = 1e-8, mutation_rate = 1e-8
) %>%
  {
    list(
      diversity = .$diversity[[1]],
      divergence = .$divergence[[1]],
      f4 = .$f4[[1]]
    )
  }

results_ <-
  parallel::mclapply(
    1:100,
    function(i)
      compute_fitness(
        params_, model, functions, observed_, statistics = TRUE,
        sequence_length = 50e6, recombination_rate = 1e-8, mutation_rate = 1e-8
      )[[1]] %>% dplyr::mutate(rep = i),
    mc.cores = 100
  ) %>%
  do.call(rbind, .)

p_stats_ <- lapply(names(functions), function(stat) {
  stat_result <- results_ %>% dplyr::filter(stat == !!stat)
  ggplot(stat_result) +
    geom_density(aes(simulated, color = id)) +
    geom_vline(aes(xintercept = observed, color = id), linetype = 2, alpha = 0.5) +
    geom_vline(xintercept = 0, linetype = 2, alpha = 0.5) +
    guides(color = guide_legend("statistic")) +
    facet_wrap(~ stat, scales = "free")
})

cowplot::plot_grid(plotlist = c(
  p_stats_,
  list(plot_model(do.call(model, params_)[[1]], proportions = TRUE))
))