Skip to content

Commit 1d1c8bc

Browse files
committed
Improve the quality of the knee point detection, again.
1 parent 93a01ff commit 1d1c8bc

File tree

4 files changed

+75
-145
lines changed

4 files changed

+75
-145
lines changed

NAMESPACE

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,7 @@ importClassesFrom(DelayedArray,DelayedArray)
4747
importClassesFrom(Matrix,CsparseMatrix)
4848
importClassesFrom(Matrix,dgCMatrix)
4949
importClassesFrom(SparseArray,COO_SparseArray)
50+
importClassesFrom(SparseArray,SVT_SparseMatrix)
5051
importClassesFrom(SparseArray,SparseArray)
5152
importFrom(BiocGenerics,match)
5253
importFrom(BiocParallel,SerialParam)
@@ -85,7 +86,6 @@ importFrom(S4Vectors,extractROWS)
8586
importFrom(S4Vectors,make_zero_col_DFrame)
8687
importFrom(S4Vectors,metadata)
8788
importFrom(SingleCellExperiment,SingleCellExperiment)
88-
importFrom(SparseArray,SVT_SparseArray)
8989
importFrom(SparseArray,nzvals)
9090
importFrom(SparseArray,nzwhich)
9191
importFrom(SummarizedExperiment,assay)
@@ -134,6 +134,5 @@ importFrom(stats,rpois)
134134
importFrom(stats,runif)
135135
importFrom(utils,head)
136136
importFrom(utils,read.delim)
137-
importFrom(utils,tail)
138137
importFrom(utils,write.table)
139138
useDynLib(DropletUtils)

R/barcodeRanks.R

Lines changed: 62 additions & 75 deletions
Original file line numberDiff line numberDiff line change
@@ -5,18 +5,13 @@
55
#' @param m A numeric matrix-like object containing UMI counts, where columns represent barcoded droplets and rows represent genes.
66
#' Alternatively, a \linkS4class{SummarizedExperiment} containing such a matrix.
77
#' @param lower A numeric scalar specifying the lower bound on the total UMI count,
8-
#' at or below which all barcodes are assumed to correspond to empty droplets.
9-
#' @param fit.bounds A numeric vector of length 2, specifying the lower and upper bounds on the total UMI count
10-
#' from which to obtain a section of the curve for spline fitting.
11-
#' @param exclude.from An integer scalar specifying the number of highest ranking barcodes to exclude from spline fitting.
12-
#' Ignored if \code{fit.bounds} is specified.
8+
#' at or below which all barcodes are assumed to correspond to empty droplets and excluded from knee/inflection point identification.
9+
#' @param exclude.from An integer scalar specifying the number of highest ranking barcodes to exclude from knee/inflection point identification.
10+
#' @param fit.bounds,df Deprecated and ignored.
1311
#' @param assay.type Integer or string specifying the assay containing the count matrix.
14-
#' @param df Deprecated and ignored.
1512
#' @param ... For the generic, further arguments to pass to individual methods.
1613
#'
1714
#' For the SummarizedExperiment method, further arguments to pass to the ANY method.
18-
#'
19-
#' For the ANY method, further arguments to pass to \code{\link{smooth.spline}}.
2015
#' @param BPPARAM A \linkS4class{BiocParallelParam} object specifying how parallelization should be performed.
2116
#'
2217
#' @details
@@ -27,27 +22,17 @@
2722
#' To help create this plot, the \code{barcodeRanks} function will compute these ranks for all barcodes in \code{m}.
2823
#' Barcodes with the same total count receive the same average rank to avoid problems with discrete runs of the same total.
2924
#'
30-
#' The function will also identify the inflection and knee points on the curve for downstream use,
25+
#' The function will also identify the inflection and knee points on the curve for downstream use.
3126
#' Both of these points correspond to a sharp transition between two components of the total count distribution,
3227
#' presumably reflecting the difference between empty droplets with little RNA and cell-containing droplets with much more RNA.
3328
#' \itemize{
34-
#' \item The inflection point is computed as the point on the rank/total curve where the first derivative is minimized.
35-
#' The derivative is computed directly from all points on the curve with total counts greater than \code{lower}.
36-
#' This avoids issues with erratic behaviour of the curve at lower totals.
37-
#' \item The knee point is defined as the point on the curve that is furthest from the straight line drawn between the \code{fit.bounds} locations on the curve.
38-
#' We used to minimize the signed curvature to identify the knee point but this relies on the second derivative,
39-
#' which was too unstable even after smoothing.
29+
#' \item The inflection point is defined as the point on the log-rank/log-total curve where the first derivative is minimized.
30+
#' If multiple inflection points are present, we choose the point that immediately follows the knee point.
31+
#' \item To find the knee point, we draw a diagonal line that passes through the inflection point in the log-rank/log-total curve.
32+
#' The knee point is defined as the location on the curve that is above and most distant from this line.
4033
#' }
41-
#'
42-
#' If \code{fit.bounds} is not specified, the lower bound is automatically set to the inflection point
43-
#' as this should lie below the knee point on typical curves.
44-
#' The upper bound is set to the point at which the first derivative is closest to zero,
45-
#' i.e., the \dQuote{plateau} region before the knee point.
46-
#' The first \code{exclude.from} barcodes with the highest totals are ignored in this process
47-
#' to avoid spuriously large numerical derivatives from unstable parts of the curve with low point density.
48-
#'
49-
#' Note that only points with total counts above \code{lower} will be considered for curve fitting,
50-
#' regardless of how \code{fit.bounds} is defined.
34+
#' Only points with total counts above \code{lower} will be considered for knee/inflection point identification.
35+
#' Similarly, the first \code{exclude.from} points will be ignored to avoid instability at the start of the curve.
5136
#'
5237
#' @return
5338
#' A \linkS4class{DataFrame} where each row corresponds to a column of \code{m}, and containing the following fields:
@@ -86,10 +71,10 @@
8671
#' @name barcodeRanks
8772
NULL
8873

89-
#' @importFrom utils tail
74+
#' @importFrom utils head
9075
#' @importFrom Matrix colSums
9176
#' @importFrom S4Vectors DataFrame metadata<-
92-
.barcode_ranks <- function(m, lower=100, fit.bounds=NULL, exclude.from=50, df=20, ..., BPPARAM=SerialParam()) {
77+
.barcode_ranks <- function(m, lower=100, exclude.from=50, fit.bounds=NULL, df=20, ..., BPPARAM=SerialParam()) {
9378
old <- .parallelize(BPPARAM)
9479
on.exit(setAutoBPPARAM(old))
9580

@@ -101,46 +86,63 @@ NULL
10186
run.totals <- stuff$values
10287

10388
keep <- run.totals > lower
104-
if (sum(keep)<3) {
89+
keep[run.rank <= exclude.from] <- FALSE
90+
if (sum(keep) < 2L) {
10591
stop("insufficient unique points for computing knee/inflection points")
10692
}
93+
10794
y <- log10(run.totals[keep])
10895
x <- log10(run.rank[keep])
109-
110-
# Numerical differentiation to identify bounds for spline fitting.
111-
edge.out <- .find_curve_bounds(x=x, y=y, exclude.from=exclude.from)
112-
left.edge <- edge.out["left"]
113-
right.edge <- edge.out["right"]
114-
115-
# As an aside: taking the right edge to get the total for the inflection point.
116-
# We use the numerical derivative as the spline is optimized for the knee.
117-
inflection <- 10^(y[right.edge])
118-
119-
# We restrict curve fitting to this region, thereby simplifying the shape of the curve.
120-
# This allows us to get a decent fit with low df for stable differentiation.
121-
if (is.null(fit.bounds)) {
122-
new.keep <- left.edge:right.edge
123-
} else {
124-
new.keep <- which(y > log10(fit.bounds[1]) & y < log10(fit.bounds[2]))
96+
deriv <- diff(y) / diff(x)
97+
98+
# Initial inflection point is defined as the minima in the first derivative.
99+
infl.index <- which.min(deriv)
100+
101+
# Heuristically drawing a diagonal line (gradient -1) from the initial inflection point.
102+
# The knee is defined as the point on the curve with the maximum distance from that line.
103+
# The -1 is more or less pulled out of thin air based on what most curves look like;
104+
if (infl.index > 1) {
105+
infl.x <- x[infl.index]
106+
infl.y <- y[infl.index]
107+
left.of.infl.x <- head(x, infl.index) # only considering points to the left of the inflection.
108+
left.of.infl.y <- head(y, infl.index)
109+
110+
.find_knee <- function(gradient) {
111+
intercept <- infl.y - gradient * infl.x
112+
relative.dist <- left.of.infl.y - gradient * left.of.infl.x - intercept # vertical vs perpendicular distance is the same, relatively.
113+
knee.index <- which.max(relative.dist)
114+
if (relative.dist[knee.index] <= 0) { # if it's not above the line, we failed to find the knee.
115+
NULL
116+
} else {
117+
knee.index
118+
}
119+
}
120+
121+
knee.index <- .find_knee(-1)
122+
123+
# If there's nothing above the line with a fixed gradient, we fall back to an empirical gradient from the start of the curve.
124+
# This is more sensitive to the number of real cells, which stretches out the plateau and causes a leftward shift in the knee point.
125+
# But, at least we'll get something approximating a knee point.
126+
if (is.null(knee.index)) {
127+
gradient <- (infl.y - y[1]) / (infl.x - x[1])
128+
knee.index <- .find_knee(gradient)
129+
130+
# If there's still nothing, we just set the knee index to the inflection point.
131+
if (is.null(knee.index)) {
132+
knee.index <- infl.index
133+
}
134+
}
125135
}
126136

127-
# Using the maximum distance to identify the knee point.
128-
if (length(new.keep) >= 4) {
129-
curx <- x[new.keep]
130-
cury <- y[new.keep]
131-
xbounds <- curx[c(1L, length(new.keep))]
132-
ybounds <- cury[c(1L, length(new.keep))]
133-
gradient <- diff(ybounds)/diff(xbounds)
134-
intercept <- ybounds[1] - xbounds[1] * gradient
135-
above <- which(cury >= curx * gradient + intercept)
136-
dist <- abs(gradient * curx[above] - cury[above] + intercept)/sqrt(gradient^2 + 1)
137-
knee <- 10^(cury[above[which.max(dist)]])
138-
} else {
139-
# Sane fallback upon overly aggressive filtering by 'exclude.from', 'lower'.
140-
knee <- 10^(y[new.keep[1]])
141-
}
137+
# Refining the inflection point to the interval immediately following the knee point.
138+
# This aims to protect against curves with multiple inflection points.
139+
up.to <- findInterval(x[knee.index] + 1, x)
140+
new.infl.index <- knee.index + which.min(deriv[knee.index:up.to]) - 1L
141+
infl.index <- new.infl.index
142+
143+
knee <- 10^y[knee.index]
144+
inflection <- 10^y[infl.index]
142145

143-
# Returning a whole stack of useful stats.
144146
out <- DataFrame(
145147
rank=.reorder(run.rank, stuff$lengths, o),
146148
total=.reorder(run.totals, stuff$lengths, o)
@@ -156,21 +158,6 @@ NULL
156158
return(out)
157159
}
158160

159-
.find_curve_bounds <- function(x, y, exclude.from)
160-
# The upper/lower bounds are defined at the plateau and inflection, respectively.
161-
# Some exclusion of the LHS points avoids problems with discreteness.
162-
{
163-
d1n <- diff(y)/diff(x)
164-
165-
skip <- min(length(d1n) - 1, sum(x <= log10(exclude.from)))
166-
d1n <- tail(d1n, length(d1n) - skip)
167-
168-
right.edge <- which.min(d1n)
169-
left.edge <- which.max(d1n[seq_len(right.edge)])
170-
171-
c(left=left.edge, right=right.edge) + skip
172-
}
173-
174161
#' @export
175162
#' @rdname barcodeRanks
176163
setGeneric("barcodeRanks", function(m, ...) standardGeneric("barcodeRanks"))

man/barcodeRanks.Rd

Lines changed: 12 additions & 28 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

tests/testthat/test-misc.R

Lines changed: 0 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -36,46 +36,6 @@ test_that("barcodeRanks runs to completion", {
3636
expect_error(barcodeRanks(my.counts[0,]), "insufficient")
3737
})
3838

39-
test_that("barcodeRanks' excluder works correctly", {
40-
brout <- barcodeRanks(my.counts)
41-
keep <- brout$total >= 100 & !duplicated(brout$total)
42-
x <- log10(brout$rank[keep])
43-
y <- log10(brout$total[keep])
44-
45-
o <- order(x)
46-
x <- x[o]
47-
y <- y[o]
48-
49-
# Compares correctly to a reference.
50-
edge.out <- DropletUtils:::.find_curve_bounds(x=x, y=y, exclude.from=100)
51-
ref.out <- DropletUtils:::.find_curve_bounds(x=tail(x, -100), y=tail(y, -100), exclude.from=0)
52-
expect_identical(edge.out, ref.out+100)
53-
54-
edge.outx <- DropletUtils:::.find_curve_bounds(x=x, y=y, exclude.from=200)
55-
ref.outx <- DropletUtils:::.find_curve_bounds(x=tail(x, -200), y=tail(y, -200), exclude.from=0)
56-
expect_false(identical(edge.outx, ref.outx+200))
57-
58-
# Proper edge behavior.
59-
edge.out2 <- DropletUtils:::.find_curve_bounds(x=x, y=y, exclude.from=0)
60-
expect_identical(edge.out[2], edge.out2[2])
61-
expect_false(identical(edge.out[1], edge.out2[1]))
62-
63-
edge.out3 <- DropletUtils:::.find_curve_bounds(x=x, y=y, exclude.from=Inf)
64-
expect_identical(unname(edge.out3[1]), length(y)-1)
65-
expect_identical(unname(edge.out3[2]), length(y)-1)
66-
67-
# Works properly when put together.
68-
ref <- barcodeRanks(my.counts)
69-
brout <- barcodeRanks(my.counts, exclude.from=0)
70-
expect_false(identical(ref, brout))
71-
72-
brout2 <- barcodeRanks(my.counts, exclude.from=200)
73-
expect_false(identical(ref, brout2))
74-
75-
brout3 <- barcodeRanks(my.counts, exclude.from=Inf)
76-
expect_false(identical(ref, brout2))
77-
})
78-
7939
test_that("defaultDrops runs to completion", {
8040
out <- defaultDrops(my.counts)
8141

0 commit comments

Comments
 (0)