Admixture tracts and dating

WIP: Extracting ancestry tracts, chromosome painting, and using admixture tracts and LD patterns for dating admixture events.

library(slendr)
init_env()
The interface to all required Python modules has been activated.
library(cowplot)

# helper function for plotting tracts simulated later
plot_tracts <- function(tracts, ind) {
  ind_tracts <- filter(tracts, name %in% ind) %>%
    mutate(haplotype = paste0(name, "\nhap. ", haplotype))
  ind_tracts$haplotype <- factor(ind_tracts$haplotype, levels = unique(ind_tracts$haplotype[order(ind_tracts$node_id)]))

  ggplot(ind_tracts) +
    geom_rect(aes(xmin = left, xmax = right, ymin = 1, ymax = 2, fill = name), linewidth = 1) +
    labs(x = "coordinate along a chromosome [bp]") +
    theme_bw() +
    theme(
      legend.position = "none",
      axis.text.y = element_blank(),
      axis.ticks = element_blank(),
      panel.border = element_blank(),
      panel.grid = element_blank()
    ) +
    facet_grid(haplotype ~ .) +
    expand_limits(x = 0) +
    scale_x_continuous(labels = scales::comma)
}

popZ <- population("popZ", time = 3000, N = 5000)
popX <- population("popX", time = 1500, N = 5000, parent = popZ)
popY <- population("popY", time = 1500, N = 5000, parent = popZ)

gf <- gene_flow(from = popX, to = popY, rate = 0.2, start = 800, end = 799)

model <- compile_model(list(popZ, popX, popY), generation_time = 1, gene_flow = gf)

schedule <- rbind(
  schedule_sampling(model, times = seq(1500, 0, by = -100), list(popY, 50)),
  schedule_sampling(model, times = 0, list(popX, 10), list(popZ, 10))
)

# Use the function plot_model() to make sure that the model and the sampling schedule
# are defined correctly (there's no such thing as too many sanity checks when doing research)
plot_model(model, proportions = TRUE, samples = schedule)

ts <- msprime(model, samples = schedule, sequence_length = 100e6, recombination_rate = 1e-8)


library(dplyr)

Attaching package: 'dplyr'
The following objects are masked from 'package:stats':

    filter, lag
The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union
library(ggplot2)

tracts <- ts_tracts(ts, census = 800)

PopAncestry summary
Number of ancestral populations:    3
Number of sample chromosomes:       1540
Number of ancestors:            57374
Total length of genomes:        154000000000.000000
Ancestral coverage:             84000000000.000000
tracts
# A tibble: 56,039 × 10
   name     haplotype  time pop   source_pop    left  right length source_pop_id
   <chr>        <int> <dbl> <chr> <fct>        <dbl>  <dbl>  <dbl>         <dbl>
 1 popY_351         1   700 popY  popX        9.67e6 1.02e7 5.06e5             1
 2 popY_351         1   700 popY  popX        1.04e7 1.05e7 1.81e5             1
 3 popY_351         1   700 popY  popX        1.40e7 1.65e7 2.52e6             1
 4 popY_351         1   700 popY  popX        2.13e7 2.24e7 1.05e6             1
 5 popY_351         1   700 popY  popX        2.49e7 2.49e7 8.01e3             1
 6 popY_351         1   700 popY  popX        3.12e7 3.17e7 5.03e5             1
 7 popY_351         1   700 popY  popX        4.28e7 4.54e7 2.58e6             1
 8 popY_351         1   700 popY  popX        5.10e7 5.10e7 3.81e3             1
 9 popY_351         1   700 popY  popX        6.18e7 6.53e7 3.45e6             1
10 popY_351         1   700 popY  popX        6.86e7 6.96e7 1.02e6             1
# ℹ 56,029 more rows
# ℹ 1 more variable: node_id <dbl>
sample_times <- ts_samples(ts) %>% select(name, time)

# adding haplotype ID and time (automatically done by the dev slendr)
tracts <- tracts %>% dplyr::group_by(name) %>% dplyr::mutate(haplotype = dplyr::dense_rank(node_id)) %>% dplyr::ungroup() %>% inner_join(sample_times) 
Joining with `by = join_by(name, time)`
# Select sampled individuals from different ages (remember, we recorded oldest individuals
# at 1000 generations ago, the youngest individuals at 0 generations ago). Use the function
# plot_tracts to visualize their ancestry tracts. What do you see in terms of the number of
# tracts and the length of tracts across these individuals? Can you eyeball some distinctive
# pattern?

subset_inds <- tracts %>% group_by(time) %>% distinct(name) %>% slice_sample(n = 1) %>% pull(name)
subset_inds
[1] "popY_733" "popY_681" "popY_627" "popY_597" "popY_502" "popY_453" "popY_417"
[8] "popY_369"
plot_tracts(tracts, subset_inds)

# It looks like the older individual is, the closer they lived to the start of the admixture event,
# and the longer the tracts they carry will be. Compute the average length of a ancestry tracts in each
# sample age group and visualize the length distribution of these tracts based on their age:

tracts %>%
  group_by(time) %>%
  summarise(mean(length))
# A tibble: 8 × 2
   time `mean(length)`
  <dbl>          <dbl>
1     0        159443.
2   100        180717.
3   200        213505.
4   300        256409.
5   400        306106.
6   500        409661.
7   600        603636.
8   700       1228810.
ggplot(tracts, aes(length, color = factor(time))) +
  geom_density() +
  coord_cartesian(xlim = c(0, 3e6)) +
  theme_bw()

# Now, try to work backwards. Assuming you have the following distribution of tract lengths...

tracts <- filter(tracts, time == 0, length <= 1e6)
bins <- hist(tracts$length, breaks = 50, plot = FALSE)
length <- bins$mids
density <- bins$density

plot(length, density)

lambda_mle <- 1 / mean(tracts$length)
lambda_mle / 1e-8
[1] 632.9559
y_mle <- dexp(length, rate = lambda_mle)
lines(length, y_mle, lty = 2, col = "darkgreen", lwd = 2)

nls_res <- nls(density ~ SSasymp(length, Asym, R0, lrc))
nls_res
Nonlinear regression model
  model: density ~ SSasymp(length, Asym, R0, lrc)
   data: parent.frame()
      Asym         R0        lrc 
-1.975e-08  6.136e-06 -1.202e+01 
 residual sum-of-squares: 3.138e-13

Number of iterations to convergence: 0 
Achieved convergence tolerance: 3.622e-07
lambda_nls <- exp(unname(coef(nls_res)["lrc"]))
lambda_nls / 1e-8
[1] 601.8527
y_nls <- predict(nls_res, newdata = data.frame(length = length))
lines(length, y_nls, lty = 2, col = "purple", lwd = 2)

legend("topright", fill = c("darkgreen", "purple"),
       legend = c(paste("MLE, t =", round(lambda_mle / 1e-8, 1), "generations ago"),
                  paste("MLE, t =", round(lambda_nls / 1e-8, 1), "generations ago")))