vignettes/vignette-07-optimization.Rmd
vignette-07-optimization.Rmd
⚠️⚠️⚠️
Note: The demografr R package is still under active development. As a result, its documentation is in a draft stage at best. Typos, inconsistencies, and other issues are unfortunately expected.
⚠️⚠️⚠️
Load empirical summary statistics computed from the observed data:
observed_diversity <- read.table(system.file("examples/observed_diversity.tsv", package = "demografr"), header = TRUE)
observed_divergence <- read.table(system.file("examples/observed_divergence.tsv", package = "demografr"), header = TRUE)
observed_f4 <- read.table(system.file("examples/observed_f4.tsv", package = "demografr"), header = TRUE)
observed <- list(
diversity = observed_diversity,
divergence = observed_divergence,
f4 = observed_f4
)
Specify the scaffold model function (each function argument corresponds to one parameter to be inferred):
model <- function(Ne_A, Ne_B, Ne_C, Ne_D, T_AB, T_BC, T_CD, gf_BC) {
popA <- population("popA", time = 1, N = Ne_A)
popB <- population("popB", time = T_AB, N = Ne_B, parent = popA)
popC <- population("popC", time = T_BC, N = Ne_C, parent = popB)
popD <- population("popD", time = T_CD, N = Ne_D, parent = popC)
gf <- gene_flow(from = popB, to = popC, start = 9000, end = 9301, rate = gf_BC)
model <- compile_model(
populations = list(popA, popB, popC, popD), gene_flow = gf,
generation_time = 1, simulation_length = 10000,
direction = "forward", serialize = FALSE
)
samples <- schedule_sampling(
model, times = 10000,
list(popA, 25), list(popB, 25), list(popC, 25), list(popD, 25),
strict = TRUE
)
return(list(model, samples))
}
Specify tree-sequence summary statistics (one function per one observed statistic, as defined above):
compute_diversity <- function(ts) {
samples <- ts_names(ts, split = "pop")
ts_diversity(ts, sample_sets = samples)
}
compute_divergence <- function(ts) {
samples <- ts_names(ts, split = "pop")
ts_divergence(ts, sample_sets = samples)
}
compute_f4 <- function(ts) {
samples <- ts_names(ts, split = "pop")
A <- samples["popA"]; B <- samples["popB"]
C <- samples["popC"]; D <- samples["popD"]
rbind(
ts_f4(ts, A, B, C, D),
ts_f4(ts, A, C, B, D),
ts_f4(ts, A, D, B, C)
)
}
functions <- list(
diversity = compute_diversity,
divergence = compute_divergence,
f4 = compute_f4
)
bounds <- list(
Ne... ~ between(1, 10000),
T... ~ between(1, 10000),
gf_BC ~ between(0, 1)
)
result <- run_ga(
model, functions, observed, bounds,
sequence_length = 50e6, recombination_rate = 1e-8, mutation_rate = 1e-8,
iterations = 500, parallel = TRUE
)
result <- readRDS(result_path)
plot(result)
Checking the fitness of the best solution:
result@fitnessValue
compute_fitness(
result@solution, model, functions, observed,
sequence_length = 25e6, recombination_rate = 1e-8, mutation_rate = 1e-8
)
Fitness value for the true parameter vector:
params <- list(Ne_A = 2000, Ne_B = 800, Ne_C = 9000, Ne_D = 4000, T_AB = 2000, T_BC = 6000, T_CD = 8000, gf = 0.1)
do.call(model, params)[[1]] %>% plot_model()
results <-
parallel::mclapply(
1:100,
function(i)
compute_fitness(
params, model, functions, observed, statistics = TRUE,
sequence_length = 50e6, recombination_rate = 1e-8, mutation_rate = 1e-8
)[[1]] %>% dplyr::mutate(rep = i),
mc.cores = 100
) %>%
do.call(rbind, .)
p_stats <- lapply(names(functions), function(stat) {
results %>%
dplyr::filter(stat == !!stat) %>%
ggplot() +
geom_density(aes(error, color = id)) +
guides(color = guide_legend("statistic")) +
facet_wrap(~ stat, scales = "free")
})
cowplot::plot_grid(plotlist = p_stats)
Results from the perturbed model:
params_ <- list(Ne_A = 1000, Ne_B = 1000, Ne_C = 1000, Ne_D = 1000, T_AB = 1000, T_BC = 5000, T_CD = 8000, gf = 0.1)
observed_ <- simulate_grid(
model, as.data.frame(params_), functions, replicates = 1,
sequence_length = 50e6, recombination_rate = 1e-8, mutation_rate = 1e-8
) %>%
{
list(
diversity = .$diversity[[1]],
divergence = .$divergence[[1]],
f4 = .$f4[[1]]
)
}
results_ <-
parallel::mclapply(
1:100,
function(i)
compute_fitness(
params_, model, functions, observed_, statistics = TRUE,
sequence_length = 50e6, recombination_rate = 1e-8, mutation_rate = 1e-8
)[[1]] %>% dplyr::mutate(rep = i),
mc.cores = 100
) %>%
do.call(rbind, .)
p_stats_ <- lapply(names(functions), function(stat) {
stat_result <- results_ %>% dplyr::filter(stat == !!stat)
ggplot(stat_result) +
geom_density(aes(simulated, color = id)) +
geom_vline(aes(xintercept = observed, color = id), linetype = 2, alpha = 0.5) +
geom_vline(xintercept = 0, linetype = 2, alpha = 0.5) +
guides(color = guide_legend("statistic")) +
facet_wrap(~ stat, scales = "free")
})
cowplot::plot_grid(plotlist = c(
p_stats_,
list(plot_model(do.call(model, params_)[[1]], proportions = TRUE))
))