Admixture tracts and dating

WIP: Extracting ancestry tracts, chromosome painting, and using admixture tracts and LD patterns for dating admixture events.

library(slendr)
init_env()

The interface to all required Python modules has been activated.

library(cowplot)

# helper function for plotting tracts simulated later
plot_tracts <- function(tracts, ind) {
  ind_tracts <- filter(tracts, name %in% ind) %>%
    mutate(haplotype = paste0(name, "\nhap. ", haplotype))
  ind_tracts$haplotype <- factor(ind_tracts$haplotype, levels = unique(ind_tracts$haplotype[order(ind_tracts$node_id)]))

  ggplot(ind_tracts) +
    geom_rect(aes(xmin = left, xmax = right, ymin = 1, ymax = 2, fill = name), linewidth = 1) +
    labs(x = "coordinate along a chromosome [bp]") +
    theme_bw() +
    theme(
      legend.position = "none",
      axis.text.y = element_blank(),
      axis.ticks = element_blank(),
      panel.border = element_blank(),
      panel.grid = element_blank()
    ) +
    facet_grid(haplotype ~ .) +
    expand_limits(x = 0) +
    scale_x_continuous(labels = scales::comma)
}

popZ <- population("popZ", time = 3000, N = 5000)
popX <- population("popX", time = 1500, N = 5000, parent = popZ)
popY <- population("popY", time = 1500, N = 5000, parent = popZ)

gf <- gene_flow(from = popX, to = popY, rate = 0.2, start = 800, end = 799)

model <- compile_model(list(popZ, popX, popY), generation_time = 1, gene_flow = gf)

schedule <- rbind(
  schedule_sampling(model, times = seq(1500, 0, by = -100), list(popY, 50)),
  schedule_sampling(model, times = 0, list(popX, 10), list(popZ, 10))
)

# Use the function plot_model() to make sure that the model and the sampling schedule
# are defined correctly (there's no such thing as too many sanity checks when doing research)
plot_model(model, proportions = TRUE, samples = schedule)

ts <- msprime(model, samples = schedule, sequence_length = 100e6, recombination_rate = 1e-8)


library(dplyr)


Attaching package: 'dplyr'

The following objects are masked from 'package:stats':

    filter, lag

The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union

library(ggplot2)

tracts <- ts_tracts(ts, census = 800)


PopAncestry summary
Number of ancestral populations:    3
Number of sample chromosomes:       1540
Number of ancestors:            56552
Total length of genomes:        154000000000.000000
Ancestral coverage:             84000000000.000000

tracts

# A tibble: 54,011 × 10
   name     haplotype  time pop   source_pop    left  right length source_pop_id
   <chr>        <int> <dbl> <chr> <fct>        <dbl>  <dbl>  <dbl>         <dbl>
 1 popY_351         1   700 popY  popX        2.05e6 3.56e6 1.50e6             1
 2 popY_351         1   700 popY  popX        6.48e6 8.41e6 1.93e6             1
 3 popY_351         1   700 popY  popX        1.60e7 1.66e7 5.34e5             1
 4 popY_351         1   700 popY  popX        2.47e7 2.49e7 1.37e5             1
 5 popY_351         1   700 popY  popX        2.72e7 3.10e7 3.74e6             1
 6 popY_351         1   700 popY  popX        3.67e7 3.90e7 2.27e6             1
 7 popY_351         1   700 popY  popX        3.97e7 4.03e7 6.40e5             1
 8 popY_351         1   700 popY  popX        4.21e7 4.75e7 5.34e6             1
 9 popY_351         1   700 popY  popX        4.84e7 4.86e7 2.27e5             1
10 popY_351         1   700 popY  popX        4.87e7 4.92e7 4.37e5             1
# ℹ 54,001 more rows
# ℹ 1 more variable: node_id <dbl>

sample_times <- ts_samples(ts) %>% select(name, time)

# adding haplotype ID and time (automatically done by the dev slendr)
tracts <- tracts %>% dplyr::group_by(name) %>% dplyr::mutate(haplotype = dplyr::dense_rank(node_id)) %>% dplyr::ungroup() %>% inner_join(sample_times)

Joining with `by = join_by(name, time)`

# Select sampled individuals from different ages (remember, we recorded oldest individuals
# at 1000 generations ago, the youngest individuals at 0 generations ago). Use the function
# plot_tracts to visualize their ancestry tracts. What do you see in terms of the number of
# tracts and the length of tracts across these individuals? Can you eyeball some distinctive
# pattern?

subset_inds <- tracts %>% group_by(time) %>% distinct(name) %>% slice_sample(n = 1) %>% pull(name)
subset_inds

[1] "popY_720" "popY_697" "popY_602" "popY_579" "popY_520" "popY_477" "popY_422"
[8] "popY_395"

plot_tracts(tracts, subset_inds)

# It looks like the older individual is, the closer they lived to the start of the admixture event,
# and the longer the tracts they carry will be. Compute the average length of a ancestry tracts in each
# sample age group and visualize the length distribution of these tracts based on their age:

tracts %>%
  group_by(time) %>%
  summarise(mean(length))

# A tibble: 8 × 2
   time `mean(length)`
  <dbl>          <dbl>
1     0        164765.
2   100        190394.
3   200        211716.
4   300        250258.
5   400        318672.
6   500        408389.
7   600        623592.
8   700       1236032.

ggplot(tracts, aes(length, color = factor(time))) +
  geom_density() +
  coord_cartesian(xlim = c(0, 3e6)) +
  theme_bw()

# Now, try to work backwards. Assuming you have the following distribution of tract lengths...

tracts <- filter(tracts, time == 0, length <= 1e6)
bins <- hist(tracts$length, breaks = 50, plot = FALSE)
length <- bins$mids
density <- bins$density

plot(length, density)

lambda_mle <- 1 / mean(tracts$length)
lambda_mle / 1e-8

[1] 619.544

y_mle <- dexp(length, rate = lambda_mle)
lines(length, y_mle, lty = 2, col = "darkgreen", lwd = 2)

nls_res <- nls(density ~ SSasymp(length, Asym, R0, lrc))
nls_res

Nonlinear regression model
  model: density ~ SSasymp(length, Asym, R0, lrc)
   data: parent.frame()
      Asym         R0        lrc 
 3.298e-08  6.342e-06 -1.194e+01 
 residual sum-of-squares: 8.903e-13

Number of iterations to convergence: 0 
Achieved convergence tolerance: 2.816e-07

lambda_nls <- exp(unname(coef(nls_res)["lrc"]))
lambda_nls / 1e-8

[1] 650.94

y_nls <- predict(nls_res, newdata = data.frame(length = length))
lines(length, y_nls, lty = 2, col = "purple", lwd = 2)

legend("topright", fill = c("darkgreen", "purple"),
       legend = c(paste("MLE, t =", round(lambda_mle / 1e-8, 1), "generations ago"),
                  paste("MLE, t =", round(lambda_nls / 1e-8, 1), "generations ago")))