Use same density estimation code in stat_density and stat_ydensity.

hadley · hadley · commit d58b00a58691 · 2015-06-12T11:06:56.000-05:00
Fixes #972
diff --git a/R/position-collide.r b/R/position-collide.r
@@ -67,11 +67,10 @@ pos_stack <- function(df, width) {
     heights <- c(0, cumsum(y))
   }
 
-  within(df, {
-    ymin <- heights[-n]
-    ymax <- heights[-1]
-    y <- ymax
-  })
+  df$ymin <- heights[-n]
+  df$ymax <- heights[-1]
+  df$y <- df$ymax
+  df
 }
 
 # Stack overlapping intervals and set height to 1.
diff --git a/R/stat-density.r b/R/stat-density.r
@@ -6,10 +6,12 @@
 #' @param adjust see \code{\link{density}} for details
 #' @param kernel kernel used for density estimation, see
 #'   \code{\link{density}} for details
-#' @param trim if \code{TRUE}, the default, densities are trimmed to the
-#'   actual range of the data.  If \code{FALSE}, they are extended by the
-#'   default 3 bandwidths (as specified by the \code{cut} parameter to
-#'   \code{\link{density}})
+#' @param trim This parameter only matters if you are displaying multiple
+#'   densities in one plot. If \code{FALSE}, the default, each density is
+#'   computed on the full range of the data. If \code{TRUE}, each density
+#'   is computed over the range of that group: this typically means the
+#'   estimated x values will not line-up, and hence you won't be able to
+#'   stack density values.
 #' @param na.rm If \code{FALSE} (the default), removes missing values with
 #'    a warning.  If \code{TRUE} silently removes missing values.
 #' @inheritParams stat_identity
@@ -100,25 +102,48 @@ StatDensity <- proto(Stat, {
     data <- remove_missing(data, na.rm, "x", name = "stat_density",
       finite = TRUE)
 
-    n <- nrow(data)
-    if (n < 3) return(data.frame())
-    if (is.null(data$weight)) data$weight <- rep(1, n) / n
+    if (trim) {
+      range <- range(data$x, na.rm = TRUE)
+    } else {
+      range <- scale_dimension(scales$x, c(0, 0))
+    }
 
-    range <- scale_dimension(scales$x, c(0, 0))
-    xgrid <- seq(range[1], range[2], length=200)
-
-    dens <- stats::density(data$x, adjust=adjust, kernel=kernel, weight=data$weight, from=range[1], to=range[2])
-    densdf <- as.data.frame(dens[c("x","y")])
-
-    densdf$scaled <- densdf$y / max(densdf$y, na.rm = TRUE)
-    if (trim) densdf <- subset(densdf, x > min(data$x, na.rm = TRUE) & x < max(data$x, na.rm = TRUE))
-
-    densdf$count <- densdf$y * n
-    rename(densdf, c(y = "density"), warn_missing = FALSE)
+    compute_density(data$x, data$w, from = range[1], to = range[2],
+      adjust = adjust, kernel = kernel)
   }
 
   default_geom <- function(.) GeomArea
   default_aes <- function(.) aes(y = ..density.., fill=NA)
   required_aes <- c("x")
 
 })
+
+compute_density <- function(x, w, from, to, bw = "nrd0", adjust = 1,
+                            kernel = "gaussian") {
+  n <- length(x)
+  if (is.null(w)) {
+    w <- rep(1 / n, n)
+  }
+
+  # if less than 3 points, spread density evenly over points
+  if (n < 3) {
+    return(data.frame(
+      x = x,
+      density = w / sum(w),
+      scaled = w / max(w),
+      count = 1,
+      n = n
+    ))
+  }
+
+  dens <- stats::density(x, weight = w, bw = bw, adjust = adjust,
+    kernel = kernel, from = from, to = to)
+
+  data.frame(
+    x = dens$x,
+    density = dens$y,
+    scaled =  dens$y / max(dens$y, na.rm = TRUE),
+    count =   dens$y * n,
+    n = n
+  )
+}
diff --git a/R/stat-ydensity.r b/R/stat-ydensity.r
@@ -5,8 +5,6 @@
 #'
 #' @inheritParams stat_density
 #' @inheritParams stat_identity
-#' @param trim If \code{TRUE} (default), trim the tails of the violins
-#'   to the range of the data. If \code{FALSE}, don't trim the tails.
 #' @param scale if "area" (default), all violins have the same area (before trimming
 #'   the tails). If "count", areas are scaled proportionally to the number of
 #'   observations. If "width", all violins have the same maximum width.
@@ -60,52 +58,29 @@ StatYdensity <- proto(Stat, {
   }
 
   calculate <- function(., data, scales, width=NULL, adjust=1, kernel="gaussian",
-                        trim=TRUE, na.rm = FALSE, ...) {
+                        trim = FALSE, na.rm = FALSE, ...) {
+    data <- remove_missing(data, na.rm, "x", name = "stat_density",
+      finite = TRUE)
 
-    n <- nrow(data)
-
-    # if less than 3 points, return a density of 1 everywhere
-    if (n < 3) {
-      return(data.frame(data, density = 1, scaled = 1, count = 1))
-    }
-
-    # initialize weights if they are not supplied by the user
-    if (is.null(data$weight)) { data$weight <- rep(1, n) / n }
-
-    # compute the density
-    dens <- stats::density(data$y, adjust = adjust, kernel = kernel,
-      weight = data$weight, n = 200)
-
-    # NB: stat_density restricts to the scale range, here we leave that
-    # free so violins can extend the y scale
-    densdf <- data.frame(y = dens$x, density = dens$y)
-
-    # scale density to a maximum of 1
-    densdf$scaled <- densdf$density / max(densdf$density, na.rm = TRUE)
-
-    # trim density outside of the data range
     if (trim) {
-      densdf <- subset(densdf, y > min(data$y, na.rm = TRUE) & y < max(data$y, na.rm = TRUE))
+      range <- range(data$y, na.rm = TRUE)
+    } else {
+      range <- scale_dimension(scales$y, c(0, 0))
     }
-    # NB: equivalently, we could have used these bounds in the from and
-    # to arguments of density()
-
-    # scale density by the number of observations
-    densdf$count <- densdf$density * n
-    # record the number of observations to be able to scale the density later
-    densdf$n <- n
-
-    # coordinate on the x axis
-    densdf$x <- if (is.factor(data$x)) data$x[1] else mean(range(data$x))
+    dens <- compute_density(data$y, data$w, from = range[1], to = range[2],
+      adjust = adjust, kernel = kernel)
 
-    # width of the bounding box of the violin plot on the x axis for continuous x
-    if (length(unique(data$x)) > 1) { width <- diff(range(data$x)) * 0.9 }
-    densdf$width <- width
+    dens$y <- dens$x
+    dens$x <- mean(range(data$x))
 
-    densdf
+    # Compute width if x has multiple values
+    if (length(unique(data$x)) > 1) {
+      width <- diff(range(data$x)) * 0.9
+    }
+    dens$width <- width
+    dens
   }
 
   default_geom <- function(.) GeomViolin
   required_aes <- c("x", "y")
-
 })
diff --git a/inst/tests/test-stat-density.R b/inst/tests/test-stat-density.R
@@ -0,0 +1,13 @@
+context("stat_density") # and stat_ydensity
+
+test_that("compute_density succeeds when variance is zero", {
+  dens <- compute_density(rep(0, 10), NULL, from = 0.5, to = 0.5)
+  expect_equal(dens$n, rep(10, 512))
+})
+
+test_that("compute_density returns useful df when <3 values", {
+  dens <- compute_density(c(1, 2), NULL, from = 0, to = 0)
+
+  expect_equal(nrow(dens), 2)
+  expect_equal(names(dens), c("x", "density", "scaled", "count", "n"))
+})
diff --git a/man/stat_density.Rd b/man/stat_density.Rd
@@ -26,10 +26,12 @@ on this layer}
 \item{kernel}{kernel used for density estimation, see
 \code{\link{density}} for details}
 
-\item{trim}{if \code{TRUE}, the default, densities are trimmed to the
-actual range of the data.  If \code{FALSE}, they are extended by the
-default 3 bandwidths (as specified by the \code{cut} parameter to
-\code{\link{density}})}
+\item{trim}{This parameter only matters if you are displaying multiple
+densities in one plot. If \code{FALSE}, the default, each density is
+computed on the full range of the data. If \code{TRUE}, each density
+is computed over the range of that group: this typically means the
+estimated x values will not line-up, and hence you won't be able to
+stack density values.}
 
 \item{na.rm}{If \code{FALSE} (the default), removes missing values with
 a warning.  If \code{TRUE} silently removes missing values.}
diff --git a/man/stat_ydensity.Rd b/man/stat_ydensity.Rd
@@ -26,8 +26,12 @@ on this layer}
 \item{kernel}{kernel used for density estimation, see
 \code{\link{density}} for details}
 
-\item{trim}{If \code{TRUE} (default), trim the tails of the violins
-to the range of the data. If \code{FALSE}, don't trim the tails.}
+\item{trim}{This parameter only matters if you are displaying multiple
+densities in one plot. If \code{FALSE}, the default, each density is
+computed on the full range of the data. If \code{TRUE}, each density
+is computed over the range of that group: this typically means the
+estimated x values will not line-up, and hence you won't be able to
+stack density values.}
 
 \item{scale}{if "area" (default), all violins have the same area (before trimming
 the tails). If "count", areas are scaled proportionally to the number of

Original file line number	Diff line number	Diff line change
`@@ -67,11 +67,10 @@ pos_stack <- function(df, width) {`
`67`	`67`	`heights <- c(0, cumsum(y))`
`68`	`68`	`}`
`69`	`69`
`70`		`- within(df, {`
`71`		`- ymin <- heights[-n]`
`72`		`- ymax <- heights[-1]`
`73`		`- y <- ymax`
`74`		`- })`
	`70`	`+ df$ymin <- heights[-n]`
	`71`	`+ df$ymax <- heights[-1]`
	`72`	`+ df$y <- df$ymax`
	`73`	`+ df`
`75`	`74`	`}`
`76`	`75`
`77`	`76`	`# Stack overlapping intervals and set height to 1.`