Improve the quality of the knee point detection, again.

LTLA · LTLA · commit 1d1c8bc0730f · 2025-08-08T15:52:43.000-07:00
diff --git a/NAMESPACE b/NAMESPACE
@@ -47,6 +47,7 @@ importClassesFrom(DelayedArray,DelayedArray)
 importClassesFrom(Matrix,CsparseMatrix)
 importClassesFrom(Matrix,dgCMatrix)
 importClassesFrom(SparseArray,COO_SparseArray)
+importClassesFrom(SparseArray,SVT_SparseMatrix)
 importClassesFrom(SparseArray,SparseArray)
 importFrom(BiocGenerics,match)
 importFrom(BiocParallel,SerialParam)
@@ -85,7 +86,6 @@ importFrom(S4Vectors,extractROWS)
 importFrom(S4Vectors,make_zero_col_DFrame)
 importFrom(S4Vectors,metadata)
 importFrom(SingleCellExperiment,SingleCellExperiment)
-importFrom(SparseArray,SVT_SparseArray)
 importFrom(SparseArray,nzvals)
 importFrom(SparseArray,nzwhich)
 importFrom(SummarizedExperiment,assay)
@@ -134,6 +134,5 @@ importFrom(stats,rpois)
 importFrom(stats,runif)
 importFrom(utils,head)
 importFrom(utils,read.delim)
-importFrom(utils,tail)
 importFrom(utils,write.table)
 useDynLib(DropletUtils)
diff --git a/R/barcodeRanks.R b/R/barcodeRanks.R
@@ -5,18 +5,13 @@
 #' @param m A numeric matrix-like object containing UMI counts, where columns represent barcoded droplets and rows represent genes.
 #' Alternatively, a \linkS4class{SummarizedExperiment} containing such a matrix.
 #' @param lower A numeric scalar specifying the lower bound on the total UMI count, 
-#' at or below which all barcodes are assumed to correspond to empty droplets.
-#' @param fit.bounds A numeric vector of length 2, specifying the lower and upper bounds on the total UMI count
-#' from which to obtain a section of the curve for spline fitting.
-#' @param exclude.from An integer scalar specifying the number of highest ranking barcodes to exclude from spline fitting.
-#' Ignored if \code{fit.bounds} is specified.
+#' at or below which all barcodes are assumed to correspond to empty droplets and excluded from knee/inflection point identification.
+#' @param exclude.from An integer scalar specifying the number of highest ranking barcodes to exclude from knee/inflection point identification.
+#' @param fit.bounds,df Deprecated and ignored.
 #' @param assay.type Integer or string specifying the assay containing the count matrix.
-#' @param df Deprecated and ignored.
 #' @param ... For the generic, further arguments to pass to individual methods.
 #'
 #' For the SummarizedExperiment method, further arguments to pass to the ANY method.
-#'
-#' For the ANY method, further arguments to pass to \code{\link{smooth.spline}}.
 #' @param BPPARAM A \linkS4class{BiocParallelParam} object specifying how parallelization should be performed.
 #' 
 #' @details
@@ -27,27 +22,17 @@
 #' To help create this plot, the \code{barcodeRanks} function will compute these ranks for all barcodes in \code{m}.
 #' Barcodes with the same total count receive the same average rank to avoid problems with discrete runs of the same total.
 #' 
-#' The function will also identify the inflection and knee points on the curve for downstream use, 
+#' The function will also identify the inflection and knee points on the curve for downstream use.
 #' Both of these points correspond to a sharp transition between two components of the total count distribution, 
 #' presumably reflecting the difference between empty droplets with little RNA and cell-containing droplets with much more RNA.
 #' \itemize{
-#' \item The inflection point is computed as the point on the rank/total curve where the first derivative is minimized.
-#' The derivative is computed directly from all points on the curve with total counts greater than \code{lower}.
-#' This avoids issues with erratic behaviour of the curve at lower totals.
-#' \item The knee point is defined as the point on the curve that is furthest from the straight line drawn between the \code{fit.bounds} locations on the curve.
-#' We used to minimize the signed curvature to identify the knee point but this relies on the second derivative,
-#' which was too unstable even after smoothing.
+#' \item The inflection point is defined as the point on the log-rank/log-total curve where the first derivative is minimized.
+#' If multiple inflection points are present, we choose the one that immediately follows the knee point, i.e., the minimum of the first derivative within one log10-unit of rank after the knee point.
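+#' \item To find the knee point, we draw a diagonal line with a gradient of -1 that passes through the initial inflection point (the global minimum of the first derivative) in the log-rank/log-total curve.
+#' The knee point is defined as the location on the curve, to the left of that inflection point, that is above and most distant from this line. If no point lies above the line, the gradient is instead taken from the first retained point to the inflection point; if that also fails, the knee point is set to the inflection point.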
 #' }
-#'
-#' If \code{fit.bounds} is not specified, the lower bound is automatically set to the inflection point
-#' as this should lie below the knee point on typical curves.
-#' The upper bound is set to the point at which the first derivative is closest to zero, 
-#' i.e., the \dQuote{plateau} region before the knee point.
-#' The first \code{exclude.from} barcodes with the highest totals are ignored in this process 
-#' to avoid spuriously large numerical derivatives from unstable parts of the curve with low point density.
-#'
-#' Note that only points with total counts above \code{lower} will be considered for curve fitting,
-#' regardless of how \code{fit.bounds} is defined.
+#' Only points with total counts above \code{lower} will be considered for knee/inflection point identification.
+#' Similarly, the \code{exclude.from} highest-ranking barcodes will be ignored to avoid instability at the start of the curve.
 #' 
 #' @return
 #' A \linkS4class{DataFrame} where each row corresponds to a column of \code{m}, and containing the following fields:
@@ -86,10 +71,10 @@
 #' @name barcodeRanks
 NULL
 
-#' @importFrom utils tail
+#' @importFrom utils head
 #' @importFrom Matrix colSums
 #' @importFrom S4Vectors DataFrame metadata<-
-.barcode_ranks <- function(m, lower=100, fit.bounds=NULL, exclude.from=50, df=20, ..., BPPARAM=SerialParam()) {
+.barcode_ranks <- function(m, lower=100, exclude.from=50, fit.bounds=NULL, df=20, ..., BPPARAM=SerialParam()) {
     old <- .parallelize(BPPARAM)
     on.exit(setAutoBPPARAM(old))
 
@@ -101,46 +86,63 @@ NULL
     run.totals <- stuff$values
 
     keep <- run.totals > lower
-    if (sum(keep)<3) { 
+    keep[run.rank <= exclude.from] <- FALSE
+    if (sum(keep) < 2L) { 
         stop("insufficient unique points for computing knee/inflection points")
     } 
+
     y <- log10(run.totals[keep])
     x <- log10(run.rank[keep])
-    
-    # Numerical differentiation to identify bounds for spline fitting.
-    edge.out <- .find_curve_bounds(x=x, y=y, exclude.from=exclude.from) 
-    left.edge <- edge.out["left"]
-    right.edge <- edge.out["right"]
-
-    # As an aside: taking the right edge to get the total for the inflection point.
-    # We use the numerical derivative as the spline is optimized for the knee.
-    inflection <- 10^(y[right.edge])
-
-    # We restrict curve fitting to this region, thereby simplifying the shape of the curve.
-    # This allows us to get a decent fit with low df for stable differentiation.
-    if (is.null(fit.bounds)) {
-        new.keep <- left.edge:right.edge
-    } else {
-        new.keep <- which(y > log10(fit.bounds[1]) & y < log10(fit.bounds[2]))
+    deriv <- diff(y) / diff(x)
+
+    # The initial inflection point is defined as the minimum of the first derivative.
+    infl.index <- which.min(deriv)
+
+    # Heuristically drawing a diagonal line (gradient -1) from the initial inflection point.
+    # The knee is defined as the point on the curve with the maximum distance from that line.
+    # The -1 is more or less pulled out of thin air based on what most curves look like.
+    if (infl.index > 1) {
+        infl.x <- x[infl.index]
+        infl.y <- y[infl.index] 
+        left.of.infl.x <- head(x, infl.index) # only considering points to the left of the inflection.
+        left.of.infl.y <- head(y, infl.index)
+
+        .find_knee <- function(gradient) {
+            intercept <- infl.y - gradient * infl.x
+            relative.dist <- left.of.infl.y - gradient * left.of.infl.x - intercept # vertical vs perpendicular distance is the same, relatively.
+            knee.index <- which.max(relative.dist)
+            if (relative.dist[knee.index] <= 0) { # if it's not above the line, we failed to find the knee.
+                NULL
+            } else {
+                knee.index
+            }
+        }
+
+        knee.index <- .find_knee(-1)
+
+        # If there's nothing above the line with a fixed gradient, we fall back to an empirical gradient from the start of the curve.
+        # This is more sensitive to the number of real cells, which stretches out the plateau and causes a leftward shift in the knee point.
+        # But, at least we'll get something approximating a knee point.
+        if (is.null(knee.index)) {
+            gradient <- (infl.y - y[1]) / (infl.x - x[1])
+            knee.index <- .find_knee(gradient)
+
+            # If there's still nothing, we just set the knee index to the inflection point.
+            if (is.null(knee.index)) {
+                knee.index <- infl.index
+            }
+        }
     }
 
-    # Using the maximum distance to identify the knee point.
-    if (length(new.keep) >= 4) {
-        curx <- x[new.keep]
-        cury <- y[new.keep]
-        xbounds <- curx[c(1L, length(new.keep))]
-        ybounds <- cury[c(1L, length(new.keep))]
-        gradient <- diff(ybounds)/diff(xbounds)
-        intercept <- ybounds[1] - xbounds[1] * gradient
-        above <- which(cury >= curx * gradient + intercept)
-        dist <- abs(gradient * curx[above] - cury[above] + intercept)/sqrt(gradient^2 + 1)
-        knee <- 10^(cury[above[which.max(dist)]])
-    } else {
-        # Sane fallback upon overly aggressive filtering by 'exclude.from', 'lower'.
-        knee <- 10^(y[new.keep[1]]) 
-    }
+    # Refining the inflection point to the interval immediately following the knee point. 
+    # This aims to protect against curves with multiple inflection points.
+    up.to <- findInterval(x[knee.index] + 1, x)
+    new.infl.index <- knee.index + which.min(deriv[knee.index:up.to]) - 1L
+    infl.index <- new.infl.index
+
+    knee <- 10^y[knee.index]
+    inflection <- 10^y[infl.index]
 
-    # Returning a whole stack of useful stats.
     out <- DataFrame(
         rank=.reorder(run.rank, stuff$lengths, o), 
         total=.reorder(run.totals, stuff$lengths, o)
@@ -156,21 +158,6 @@ NULL
     return(out)
 }
 
-.find_curve_bounds <- function(x, y, exclude.from) 
-# The upper/lower bounds are defined at the plateau and inflection, respectively.
-# Some exclusion of the LHS points avoids problems with discreteness.
-{
-    d1n <- diff(y)/diff(x)
-
-    skip <- min(length(d1n) - 1, sum(x <= log10(exclude.from)))
-    d1n <- tail(d1n, length(d1n) - skip)
-
-    right.edge <- which.min(d1n)
-    left.edge <- which.max(d1n[seq_len(right.edge)])
-
-    c(left=left.edge, right=right.edge) + skip
-}
-
 #' @export
 #' @rdname barcodeRanks
 setGeneric("barcodeRanks", function(m, ...) standardGeneric("barcodeRanks"))
diff --git a/man/barcodeRanks.Rd b/man/barcodeRanks.Rd
diff --git a/tests/testthat/test-misc.R b/tests/testthat/test-misc.R
@@ -36,46 +36,6 @@ test_that("barcodeRanks runs to completion", {
     expect_error(barcodeRanks(my.counts[0,]), "insufficient")
 })
 
-test_that("barcodeRanks' excluder works correctly", {
-    brout <- barcodeRanks(my.counts)
-    keep <- brout$total >= 100 & !duplicated(brout$total)
-    x <- log10(brout$rank[keep])
-    y <- log10(brout$total[keep])
-
-    o <- order(x)
-    x <- x[o]
-    y <- y[o]
-
-    # Compares correctly to a reference.
-    edge.out <- DropletUtils:::.find_curve_bounds(x=x, y=y, exclude.from=100) 
-    ref.out <- DropletUtils:::.find_curve_bounds(x=tail(x, -100), y=tail(y, -100), exclude.from=0) 
-    expect_identical(edge.out, ref.out+100)
-
-    edge.outx <- DropletUtils:::.find_curve_bounds(x=x, y=y, exclude.from=200) 
-    ref.outx <- DropletUtils:::.find_curve_bounds(x=tail(x, -200), y=tail(y, -200), exclude.from=0) 
-    expect_false(identical(edge.outx, ref.outx+200))
-
-    # Proper edge behavior.
-    edge.out2 <- DropletUtils:::.find_curve_bounds(x=x, y=y, exclude.from=0) 
-    expect_identical(edge.out[2], edge.out2[2])
-    expect_false(identical(edge.out[1], edge.out2[1]))
-
-    edge.out3 <- DropletUtils:::.find_curve_bounds(x=x, y=y, exclude.from=Inf)
-    expect_identical(unname(edge.out3[1]), length(y)-1)
-    expect_identical(unname(edge.out3[2]), length(y)-1)
-
-    # Works properly when put together. 
-    ref <- barcodeRanks(my.counts)
-    brout <- barcodeRanks(my.counts, exclude.from=0)
-    expect_false(identical(ref, brout))
-
-    brout2 <- barcodeRanks(my.counts, exclude.from=200)
-    expect_false(identical(ref, brout2))
-
-    brout3 <- barcodeRanks(my.counts, exclude.from=Inf)
-    expect_false(identical(ref, brout2))
-})
-
 test_that("defaultDrops runs to completion", {
     out <- defaultDrops(my.counts)