```{r setup, include = FALSE}
knitr::opts_chunk$set(
collapse = TRUE,
comment = "#>"
)
options(width=100)
```
In this vignette we explore to what extent the peak predictions are
sensitive to the spatial correlation in typical genomic data. We also
demonstrate how to efficiently compute coverage profiles from raw
aligned read data using the `data.table` package.
First we load a data set with one row for every genomic region with a
unique aligned read, and compute mean read size in bases.
```{r}
library(data.table)
data(ChIPreads, package="PeakSegDisk", envir=environment())
(experiments <- ChIPreads[, .(
mean.bases=mean(chromEnd-chromStart),
median.bases=median(chromEnd-chromStart),
chromStart=min(chromStart)
), by=list(experiment)])
```
From the table above it is clear that the average read size is about
100 for these two experiments.
Below, for each of the two data sets, we compute a data table with two
representations of these aligned reads: count each read at each
aligned base in the read (this induces spatial correlation), or just
the end/last base of the read (this has no spatial correlation).
```{r}
end.counts <- ChIPreads[, list(
count=.N #ignores dup reads, sum(count) would not.
), by=list(experiment, chrom, chromEnd)]
(aligned.dt <- rbind(
ChIPreads[, .(
bases.counted="each", experiment, chrom,
chromStart, chromEnd,
count=1)], #ignore duplicate reads.
end.counts[, .(
bases.counted="end", experiment, chrom,
chromStart=chromEnd-1L, chromEnd,
count)]))
```
Each row of the data table above describes how one read should be
counted in order to compute an aligned read profile.
```{r}
aligned.dt[, {
as.list(quantile(chromEnd-chromStart))
}, by=.(bases.counted, experiment)]
```
The table above shows that when we only count the end of each base, we
only count the read at one position. When we count the read at each
position, that means from 20 to 115 bases, depending on the read.
Next we compute a count profile for each base in these genomic
regions.
```{r}
(seq.dt <- aligned.dt[, {
event.dt <- rbind(
data.table(count, pos=chromStart+1L),
data.table(count=-count, pos=chromEnd+1L))
edge.vec <- event.dt[, {
as.integer(seq(min(pos), max(pos), l=100))
}]
event.bins <- rbind(
event.dt,
data.table(count=0L, pos=edge.vec))
total.dt <- event.bins[, .(
count=sum(count)
), by=list(pos)][order(pos)]
total.dt[, cum := cumsum(count)]
total.dt[, bin.i := cumsum(pos %in% edge.vec)]
## it is somewhat confusing because total.dt pos is the first base
## with cum, and cum goes all the way up to but not including the
## pos of the next row.
total.dt[, data.table(
chromStart=pos[-.N]-1L,
chromEnd=pos[-1]-1L,
count=cum[-.N],
bin.i=bin.i[-.N])]
}, by=list(bases.counted, experiment, chrom)])
```
In contrast to the previous data tables, the table above contains no
information about individual reads. Each row represents how many
reads, `count`, have been counted at all positions between
`chromStart+1` and `chromEnd`. We plot these aligned read counts
below:
```{r}
if(require(ggplot2)){
gg.data <- ggplot()+
theme_bw()+
theme(panel.spacing=grid::unit(0, "lines"))+
facet_grid(
bases.counted ~ experiment,
scales="free",
labeller=label_both)+
geom_step(aes(
chromStart/1e3, count, color=data.type),
data=data.table(seq.dt, data.type="exact"))+
scale_color_manual(values=c(
exact="black",
bins="red",
model="deepskyblue"
))+
scale_x_continuous("Position on hg19 chrom (kb = kilo bases)")
print(gg.data)
}
```
It is clear from the plot above that, for each experiment (left:H3K36,
right:H3K4), the profiles in the top and bottom plots have peaks in
similar regions.
Next we compute mean profile in bins, using the `bin.i` column we
created earlier, which assigns each genomic region to one of 99 bins.
```{r}
bin.dt <- seq.dt[, {
bases <- chromEnd - chromStart
data.table(
binStart=min(chromStart),
binEnd=max(chromEnd),
mean.count=sum(count*bases)/sum(bases),
bases=sum(bases)
)}, by=list(bases.counted, experiment, bin.i)]
if(require(ggplot2)){
gg.bins <- gg.data+
geom_step(aes(
binStart/1e3, mean.count, color=data.type),
alpha=0.75,
linewidth=1,
data=data.table(bin.dt, data.type="bins"))+
scale_y_log10("Aligned DNA sequence reads (log scale)")
print(gg.bins)
}
```
The mean counts in each bin appear in the plot above as a red line.
Next we compute the optimal segmentation model with 2 peaks for each
data set.
```{r}
if(interactive() && requireNamespace("future"))future::plan("multisession")
segs.dt <- seq.dt[, {
data.dir <- file.path(tempdir(), bases.counted, experiment)
dir.create(data.dir, showWarnings=FALSE, recursive=TRUE)
coverage.bedGraph <- file.path(data.dir, "coverage.bedGraph")
fwrite(
.SD[, .(chrom, chromStart, chromEnd, count)],
coverage.bedGraph,
sep="\t",
quote=FALSE,
col.names=FALSE)
fit <- PeakSegDisk::sequentialSearch_dir(data.dir, 2L, verbose=1)
data.table(fit$segments, data.type="model")
}, by=list(bases.counted, experiment)]
changes.dt <- segs.dt[, {
.SD[-1]
}, by=list(bases.counted, experiment, data.type)]
if(require(ggplot2)){
gg.model <- gg.bins+
geom_segment(aes(
chromStart/1e3, mean,
xend=chromEnd/1e3, yend=mean,
color=data.type),
data=segs.dt)+
geom_vline(aes(
xintercept=chromEnd/1e3,
color=data.type),
data=changes.dt)
print(gg.model)
}
```
The plot above shows that the predicted peaks (blue) of the models
occur in similar positions, using either the each or end
representations of the data.
Next we compute the difference between predicted peak positions:
```{r, fig.width=10}
peaks.dt <- segs.dt[status=="peak"]
peaks.dt[, peak.i := rep(1:2, l=.N)]
peak.pos.tall <- melt(
peaks.dt,
measure.vars=c("chromStart", "chromEnd"))
peak.pos.wide <- dcast(
peak.pos.tall,
experiment + variable + peak.i ~ bases.counted)
peak.pos.wide[, diff.bases := abs(each-end)]
read.size.panel <- "each"
bases.max.dt <- seq.dt[, .(max.count=max(count)), by=list(bases.counted)]
read.size.y <- bases.max.dt[
read.size.panel, max.count, on=list(bases.counted)]
diff.panel <- "end"
diff.y <- bases.max.dt[
diff.panel, max.count, on=list(bases.counted)]
diff.y <- Inf
diff.vjust <- 1.1
if(require(ggplot2)){
gg.model+
geom_text(aes(
chromStart/1e3, read.size.y, label=sprintf(
"Median read size:\n%.0f bases",
median.bases)),
hjust=0,
vjust=1,
data=data.table(experiments, bases.counted=read.size.panel))+
geom_text(aes(
end/1e3, diff.y,
label=diff.bases,
color=data.type),
data=data.table(
bases.counted=diff.panel,
data.type="model",
peak.pos.wide),
vjust=diff.vjust,
hjust=0)+
geom_text(aes(
chromStart/1e3, diff.y,
label="Peak position\ndifference in bases:",
color=data.type,
),
hjust=0,
vjust=diff.vjust,
data=data.table(
data.type="model",
bases.counted=diff.panel,
experiments["H3K36me3", on=list(experiment)]))
}
```
In the plot above the blue numbers show the differences (in bases)
between the top and bottom panel peak predictions. It is clear that
the peak predictions are highly consistent between the each/end
representations, with variation on the order of read size (100 bases).
Overall these results indicate that the peak detection algorithm is
highly robust to the spatial correlation that is present in typical
ChIP-seq coverage profile data.