|
11 | 11 | #' @param exclude.from An integer scalar specifying the number of highest ranking barcodes to exclude from spline fitting.
|
12 | 12 | #' Ignored if \code{fit.bounds} is specified.
|
13 | 13 | #' @param assay.type Integer or string specifying the assay containing the count matrix.
|
14 |
| -#' @param df Integer scalar specifying the number of degrees of freedom, to pass to \code{\link{smooth.spline}}. |
| 14 | +#' @param df Deprecated and ignored. |
15 | 15 | #' @param ... For the generic, further arguments to pass to individual methods.
|
16 | 16 | #'
|
17 | 17 | #' For the SummarizedExperiment method, further arguments to pass to the ANY method.
|
|
34 | 34 | #' \item The inflection point is computed as the point on the rank/total curve where the first derivative is minimized.
|
35 | 35 | #' The derivative is computed directly from all points on the curve with total counts greater than \code{lower}.
|
36 | 36 | #' This avoids issues with erratic behaviour of the curve at lower totals.
|
37 |
| -#' \item The knee point is defined as the point on the curve where the signed curvature is minimized. |
38 |
| -#' This requires calculation of the second derivative, which is much more sensitive to noise in the curve. |
39 |
| -#' To overcome this, a smooth spline is fitted to the log-total counts against the log-rank using \code{\link{smooth.spline}}. |
40 |
| -#' Derivatives are then calculated from the fitted spline using \code{\link{predict}}. |
| 37 | +#' \item The knee point is defined as the point on the curve that is furthest from the straight line drawn between the \code{fit.bounds} locations on the curve. |
| 38 | +#' We used to minimize the signed curvature to identify the knee point but this relies on the second derivative, |
| 39 | +#' which was too unstable even after smoothing. |
41 | 40 | #' }
|
42 | 41 | #'
|
43 |
| -#' @section Details on curve fitting: |
44 |
| -#' We supply a relatively low default setting of \code{df} to avoid overfitting the spline, |
45 |
| -#' as this results in unstability in the higher derivatives (and thus the curvature). |
46 |
| -#' \code{df} and other arguments to \code{\link{smooth.spline}} can be tuned |
47 |
| -#' if the estimated knee point is not at an appropriate location. |
48 |
| -#' We also restrict the fit to lie within the bounds defined by \code{fit.bounds} to focus on the region containing the knee point. |
49 |
| -#' This allows us to obtain an accurate fit with low \code{df} rather than attempting to model the entire curve. |
50 |
| -#' |
51 | 42 | #' If \code{fit.bounds} is not specified, the lower bound is automatically set to the inflection point
|
52 | 43 | #' as this should lie below the knee point on typical curves.
|
53 | 44 | #' The upper bound is set to the point at which the first derivative is closest to zero,
|
|
63 | 54 | #' \describe{
|
64 | 55 | #' \item{\code{rank}:}{Numeric, the rank of each barcode (averaged across ties).}
|
65 | 56 | #' \item{\code{total}:}{Numeric, the total counts for each barcode.}
|
66 |
| -#' \item{\code{fitted}:}{Numeric, the fitted value from the spline for each barcode. |
67 |
| -#' This is \code{NA} for points with \code{x} outside of \code{fit.bounds}.} |
68 | 57 | #' }
|
69 | 58 | #'
|
70 | 59 | #' The metadata contains \code{knee}, a numeric scalar containing the total count at the knee point;
|
|
85 | 74 | #' # Making a plot.
|
86 | 75 | #' plot(br.out$rank, br.out$total, log="xy", xlab="Rank", ylab="Total")
|
87 | 76 | #' o <- order(br.out$rank)
|
88 |
| -#' lines(br.out$rank[o], br.out$fitted[o], col="red") |
89 | 77 | #' abline(h=metadata(br.out)$knee, col="dodgerblue", lty=2)
|
90 | 78 | #' abline(h=metadata(br.out)$inflection, col="forestgreen", lty=2)
|
91 | 79 | #' legend("bottomleft", lty=2, col=c("dodgerblue", "forestgreen"),
|
|
98 | 86 | #' @name barcodeRanks
|
99 | 87 | NULL
|
100 | 88 |
|
101 |
| -#' @importFrom stats smooth.spline predict fitted |
102 | 89 | #' @importFrom utils tail
|
103 | 90 | #' @importFrom Matrix colSums
|
104 | 91 | #' @importFrom S4Vectors DataFrame metadata<-
|
@@ -134,21 +121,20 @@ NULL
|
134 | 121 | if (is.null(fit.bounds)) {
|
135 | 122 | new.keep <- left.edge:right.edge
|
136 | 123 | } else {
|
137 |
| - new.keep <- y > log10(fit.bounds[1]) & y < log10(fit.bounds[2]) |
| 124 | + new.keep <- which(y > log10(fit.bounds[1]) & y < log10(fit.bounds[2])) |
138 | 125 | }
|
139 | 126 |
|
140 |
| - # Smoothing to avoid error multiplication upon differentiation. |
141 |
| - # Minimizing the signed curvature and returning the total for the knee point. |
142 |
| - fitted.vals <- rep(NA_real_, length(keep)) |
143 |
| - |
| 127 | + # Using the maximum distance to identify the knee point. |
144 | 128 | if (length(new.keep) >= 4) {
|
145 |
| - fit <- smooth.spline(x[new.keep], y[new.keep], df=df, ...) |
146 |
| - fitted.vals[keep][new.keep] <- 10^fitted(fit) |
147 |
| - |
148 |
| - d1 <- predict(fit, deriv=1)$y |
149 |
| - d2 <- predict(fit, deriv=2)$y |
150 |
| - curvature <- d2/(1 + d1^2)^1.5 |
151 |
| - knee <- 10^(y[new.keep][which.min(curvature)]) |
| 129 | + curx <- x[new.keep] |
| 130 | + cury <- y[new.keep] |
| 131 | + xbounds <- curx[c(1L, length(new.keep))] |
| 132 | + ybounds <- cury[c(1L, length(new.keep))] |
| 133 | + gradient <- diff(ybounds)/diff(xbounds) |
| 134 | + intercept <- ybounds[1] - xbounds[1] * gradient |
| 135 | + above <- which(cury >= curx * gradient + intercept) |
| 136 | + dist <- abs(gradient * curx[above] - cury[above] + intercept)/sqrt(gradient^2 + 1) |
| 137 | + knee <- 10^(cury[above[which.max(dist)]]) |
152 | 138 | } else {
|
153 | 139 | # Sane fallback upon overly aggressive filtering by 'exclude.from', 'lower'.
|
154 | 140 | knee <- 10^(y[new.keep[1]])
|
|
157 | 143 | # Returning a whole stack of useful stats.
|
158 | 144 | out <- DataFrame(
|
159 | 145 | rank=.reorder(run.rank, stuff$lengths, o),
|
160 |
| - total=.reorder(run.totals, stuff$lengths, o), |
161 |
| - fitted=.reorder(fitted.vals, stuff$lengths, o) |
| 146 | + total=.reorder(run.totals, stuff$lengths, o) |
162 | 147 | )
|
163 | 148 | rownames(out) <- colnames(m)
|
164 | 149 | metadata(out) <- list(knee=knee, inflection=inflection)
|
|
0 commit comments