Skip to content

Commit 6d171ae

Browse files
authored
Switch to the eminem library for more efficient parsing of Matrix Market files. (#122)
This avoids unnecessary memory allocations for large matrices, especially if we use a two-pass approach to avoid an intermediate vector of vectors. We support multi-threaded parsing without relying on BiocParallel, and we allow direct reading into dgCMatrices or SVT_SparseMatrices to avoid R-level fiddling.
1 parent 113692c commit 6d171ae

File tree

10 files changed

+592
-75
lines changed

10 files changed

+592
-75
lines changed

DESCRIPTION

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
Package: DropletUtils
2-
Version: 1.29.0
3-
Date: 2024-12-10
2+
Version: 1.29.1
3+
Date: 2025-05-29
44
Title: Utilities for Handling Single-Cell Droplet Data
55
Authors@R: c(
66
person("Aaron", "Lun", role = "aut"),
@@ -59,10 +59,11 @@ VignetteBuilder: knitr
5959
LinkingTo:
6060
Rcpp,
6161
beachmat,
62+
assorthead,
6263
Rhdf5lib,
6364
BH,
6465
dqrng,
6566
scuttle
66-
SystemRequirements: C++11, GNU make
67+
SystemRequirements: C++17, GNU make
6768
RoxygenNote: 7.3.2
6869
Encoding: UTF-8

NAMESPACE

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,6 @@ importFrom(GenomicRanges,GRanges)
6969
importFrom(HDF5Array,TENxMatrix)
7070
importFrom(IRanges,IRanges)
7171
importFrom(Matrix,colSums)
72-
importFrom(Matrix,readMM)
7372
importFrom(Matrix,rowMeans)
7473
importFrom(Matrix,rowSums)
7574
importFrom(Matrix,sparseMatrix)
@@ -86,6 +85,7 @@ importFrom(S4Vectors,extractROWS)
8685
importFrom(S4Vectors,make_zero_col_DFrame)
8786
importFrom(S4Vectors,metadata)
8887
importFrom(SingleCellExperiment,SingleCellExperiment)
88+
importFrom(SparseArray,SVT_SparseArray)
8989
importFrom(SparseArray,nzvals)
9090
importFrom(SparseArray,nzwhich)
9191
importFrom(SummarizedExperiment,assay)
@@ -96,6 +96,7 @@ importFrom(edgeR,goodTuringProportions)
9696
importFrom(edgeR,q2qnbinom)
9797
importFrom(methods,as)
9898
importFrom(methods,is)
99+
importFrom(methods,new)
99100
importFrom(rhdf5,H5Dclose)
100101
importFrom(rhdf5,H5Dget_space)
101102
importFrom(rhdf5,H5Dopen)

R/RcppExports.R

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,3 +41,7 @@ montecarlo_pval <- function(totalval, totallen, prob, ambient, iterations, alpha
4141
.Call('_DropletUtils_montecarlo_pval', PACKAGE = 'DropletUtils', totalval, totallen, prob, ambient, iterations, alpha, seeds, streams)
4242
}
4343

44+
read_mm <- function(path, two_pass, class_name, threads) {
45+
.Call('_DropletUtils_read_mm', PACKAGE = 'DropletUtils', path, two_pass, class_name, threads)
46+
}
47+

R/read10xCounts.R

Lines changed: 50 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -13,15 +13,22 @@
1313
#' @param sample.names A character vector of length equal to \code{samples}, containing the sample names to store in the column metadata of the output object.
1414
#' If \code{NULL}, the file paths in \code{samples} are used directly.
1515
#' @param col.names A logical scalar indicating whether the columns of the output object should be named with the cell barcodes.
16-
#' @param row.names String specifying whether to use Ensembl IDs ("ID") or gene symbols ("Symbol") as row names. If using symbols, the Ensembl ID will be appended to disambiguate in case the same symbol corresponds to multiple Ensembl IDs.
16+
#' @param row.names String specifying whether to use Ensembl IDs ("ID") or gene symbols ("Symbol") as row names.
17+
#' For symbols, the Ensembl ID will be appended to disambiguate rows where the same symbol corresponds to multiple Ensembl IDs.
1718
#' @param type String specifying the type of 10X format to read data from.
1819
#' @param version String specifying the version of the 10X format to read data from.
1920
#' @param delayed Logical scalar indicating whether sparse matrices should be wrapped in \linkS4class{DelayedArray}s before combining.
2021
#' Only relevant for multiple \code{samples}.
2122
#' @param genome String specifying the genome if \code{type="HDF5"} and \code{version='2'}.
22-
#' @param compressed Logical scalar indicating whether the text files are compressed for \code{type="sparse"} or \code{"prefix"}.
23+
#' @param compressed Logical scalar indicating whether the text files are compressed for \code{type="mtx"} or \code{"prefix"}.
2324
#' @param intersect.genes Logical scalar indicating whether to take the intersection of common genes across all samples.
2425
#' If \code{FALSE}, differences in gene information across samples will cause an error to be raised.
26+
#' @param mtx.two.pass Logical scalar indicating whether to use a two-pass approach for loading data from a Matrix Market file.
27+
#' This reduces peak memory usage at the cost of some additional runtime.
28+
#' Only relevant when \code{type="mtx"} or \code{type="prefix"}.
29+
#' @param mtx.class String specifying the class of the output matrix when \code{type="mtx"} or \code{type="prefix"}.
30+
#' @param mtx.threads Integer scalar specifying the number of threads to use for reading Matrix Market files.
31+
#' Only relevant when \code{type="mtx"} or \code{type="prefix"}.
2532
#' @param BPPARAM A \linkS4class{BiocParallelParam} object specifying how loading should be parallelized for multiple \code{samples}.
2633
#'
2734
#' @return A \linkS4class{SingleCellExperiment} object containing count data for each gene (row) and cell (column) across all \code{samples}.
@@ -47,34 +54,31 @@
4754
#' If \code{type="auto"}, the format of the input file is automatically detected for each \code{samples} based on whether it ends with \code{".h5"}.
4855
#' If so, \code{type} is set to \code{"HDF5"}; otherwise it is set to \code{"sparse"}.
4956
#' \itemize{
50-
#' \item If \code{type="sparse"}, count data are loaded as a \linkS4class{dgCMatrix} object.
51-
#' This is a conventional column-sparse compressed matrix format produced by the CellRanger pipeline,
52-
#' consisting of a (possibly Gzipped) MatrixMarket text file (\code{"matrix.mtx"})
57+
#' \item If \code{type="mtx"} (or its older alias \code{"sparse"}), count data are assumed to be stored in a directory.
58+
#' This should contain a (possibly Gzipped) MatrixMarket text file (\code{"matrix.mtx"})
5359
#' with additional tab-delimited files for barcodes (\code{"barcodes.tsv"})
5460
#' and gene annotation (\code{"features.tsv"} for version 3 or \code{"genes.tsv"} for version 2).
55-
#' \item If \code{type="prefix"}, count data are also loaded as a \linkS4class{dgCMatrix} object.
56-
#' This assumes the same three-file structure for each sample as described for \code{type="sparse"},
57-
#' but each sample is defined here by a prefix in the file names rather than by being a separate directory.
58-
#' For example, if the \code{samples} entry is \code{"xyx_"},
59-
#' the files are expected to be \code{"xyz_matrix.mtx"}, \code{"xyz_barcodes.tsv"}, etc.
60-
#' \item If \code{type="HDF5"}, count data are assumed to follow the 10X sparse HDF5 format for large data sets.
61+
#' \item If \code{type="prefix"}, count data are assumed to follow same three-file structure for each sample as described for \code{type="mtx"}.
62+
#' However, each sample is defined by a prefix in the file names rather than by being stored a separate directory.
63+
#' For example, if the \code{samples} entry is \code{"xyx_"}, the files are expected to be \code{"xyz_matrix.mtx"}, \code{"xyz_barcodes.tsv"}, etc.
64+
#' \item If \code{type="hdf5"} (or its older alias \code{"HDF5"}), count data are assumed to follow the 10X sparse HDF5 format for large data sets.
6165
#' It is loaded as a \linkS4class{TENxMatrix} object, which is a stub object that refers back to the file in \code{samples}.
6266
#' Users may need to set \code{genome} if it cannot be automatically determined when \code{version="2"}.
6367
#' }
6468
#'
65-
#' When \code{type="sparse"} or \code{"prefix"} and \code{compressed=NULL},
69+
#' When \code{type="mtx"} or \code{"prefix"} and \code{compressed=NULL},
6670
#' the function will automatically search for both the unzipped and Gzipped versions of the files.
6771
#' This assumes that the compressed files have an additional \code{".gz"} suffix.
6872
#' We can restrict to only compressed or uncompressed files by setting \code{compressed=TRUE} or \code{FALSE}, respectively.
6973
#'
7074
#' CellRanger 3.0 introduced a major change in the format of the output files for both \code{type}s.
7175
#' If \code{version="auto"}, the version of the format is automatically detected from the supplied paths.
72-
#' For \code{type="sparse"}, this is based on whether there is a \code{"features.tsv.gz"} file in the directory.
76+
#' For \code{type="mtx"}, this is based on whether there is a \code{"features.tsv.gz"} file in the directory.
7377
#' For \code{type="HDF5"}, this is based on whether there is a top-level \code{"matrix"} group with a \code{"matrix/features"} subgroup in the file.
7478
#'
7579
#' Matrices are combined by column if multiple \code{samples} were specified.
7680
#' This will throw an error if the gene information is not consistent across \code{samples}.
77-
#' For \code{type="sparse"} or \code{"prefix"}, users can set \code{delayed=TRUE} to save memory during the combining process.
81+
#' For \code{type="mtx"} or \code{"prefix"}, users can set \code{delayed=TRUE} to save memory during the combining process.
7882
#' This also avoids integer overflow for very large datasets.
7983
#'
8084
#' If \code{col.names=TRUE} and \code{length(sample)==1}, each column is named by the cell barcode.
@@ -94,7 +98,7 @@
9498
#'
9599
#' @examples
96100
#' # Mocking up some 10X genomics output.
97-
#' example(write10xCounts)
101+
#' example(write10xCounts, echo=FALSE)
98102
#'
99103
#' # Reading it in.
100104
#' sce10x <- read10xCounts(tmpdir)
@@ -133,12 +137,15 @@ read10xCounts <- function(samples,
133137
sample.names=names(samples),
134138
col.names=FALSE,
135139
row.names = c("id", "symbol"),
136-
type=c("auto", "sparse", "HDF5", "prefix"),
140+
type=c("auto", "mtx", "hdf5", "prefix", "sparse", "HDF5"),
137141
delayed=FALSE,
138142
version=c("auto", "2", "3"),
139143
genome=NULL,
140144
compressed=NULL,
141145
intersect.genes=FALSE,
146+
mtx.two.pass=FALSE,
147+
mtx.class=c("CsparseMatrix", "SVT_SparseMatrix"),
148+
mtx.threads=1,
142149
BPPARAM=SerialParam())
143150
{
144151
type <- match.arg(type)
@@ -148,8 +155,17 @@ read10xCounts <- function(samples,
148155
sample.names <- samples
149156
}
150157

151-
load.out <- bplapply(samples, FUN=.tenx_loader, type=type, version=version,
152-
genome=genome, compressed=compressed, BPPARAM=BPPARAM)
158+
load.out <- bplapply(samples,
159+
FUN=.tenx_loader,
160+
type=type,
161+
version=version,
162+
genome=genome,
163+
compressed=compressed,
164+
mtx.two.pass=mtx.two.pass,
165+
mtx.class=match.arg(mtx.class),
166+
mtx.threads=mtx.threads,
167+
BPPARAM=BPPARAM
168+
)
153169

154170
nsets <- length(samples)
155171
full_data <- vector("list", nsets)
@@ -218,24 +234,24 @@ read10xCounts <- function(samples,
218234
SingleCellExperiment(list(counts = full_data), rowData = gene_info, colData = cell_info, metadata=list(Samples=samples))
219235
}
220236

221-
.tenx_loader <- function(run, type, version, genome, compressed) {
237+
.tenx_loader <- function(run, type, version, genome, compressed, mtx.two.pass, mtx.class, mtx.threads) {
222238
cur.type <- .type_chooser(run, type)
223-
if (cur.type=="sparse") {
224-
.read_from_sparse(run, version=version, compressed=compressed)
239+
if (cur.type=="mtx") {
240+
.read_from_sparse(run, version=version, is.prefix=FALSE, compressed=compressed, mtx.two.pass=mtx.two.pass, mtx.class=mtx.class, mtx.threads=mtx.threads)
225241
} else if (cur.type=="prefix") {
226-
.read_from_sparse(run, version=version, is.prefix=TRUE, compressed=compressed)
242+
.read_from_sparse(run, version=version, is.prefix=TRUE, compressed=compressed, mtx.two.pass=mtx.two.pass, mtx.class=mtx.class, mtx.threads=mtx.threads)
227243
} else {
228244
.read_from_hdf5(run, genome=genome, version=version)
229245
}
230246
}
231247

232-
#' @importFrom methods as
248+
#' @importFrom methods as new
233249
#' @importClassesFrom Matrix dgCMatrix
234-
#' @importFrom Matrix readMM
235250
#' @importFrom utils read.delim head
236251
#' @importFrom IRanges IRanges
237252
#' @importFrom S4Vectors mcols<-
238-
.read_from_sparse <- function(path, version, is.prefix=FALSE, compressed=NULL) {
253+
#' @importClassesFrom SparseArray SVT_SparseMatrix
254+
.read_from_sparse <- function(path, version, is.prefix, compressed, mtx.two.pass, mtx.class, mtx.threads) {
239255
FUN <- if (is.prefix) paste0 else file.path
240256

241257
if (version=="auto") {
@@ -279,8 +295,16 @@ read10xCounts <- function(samples,
279295
gene.info <- gr
280296
}
281297

298+
raw_mat <- read_mm(matrix.loc, two_pass=mtx.two.pass, class_name=mtx.class, threads=mtx.threads)
299+
if (mtx.class == "CsparseMatrix") {
300+
# Don't use sparseMatrix as this seems to do an unnecessary roundtrip through the triplet form.
301+
mat <- new("dgCMatrix", Dim=raw_mat$dim, i=raw_mat$contents$i, x=raw_mat$contents$x, p=raw_mat$contents$p)
302+
} else {
303+
mat <- new("SVT_SparseMatrix", SVT=raw_mat$contents$list, dim=raw_mat$dim, type=raw_mat$contents$type)
304+
}
305+
282306
list(
283-
mat=as(readMM(matrix.loc), "CsparseMatrix"),
307+
mat=mat,
284308
cell.names=readLines(barcode.loc),
285309
gene.info=gene.info
286310
)

R/write10xCounts.R

Lines changed: 29 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -4,27 +4,27 @@
44
#' in the format produced by the CellRanger software suite.
55
#'
66
#' @param x A sparse numeric matrix of UMI counts.
7-
#' @param path A string containing the path to the output directory (for \code{type="sparse"}) or file (for \code{type="HDF5"}).
7+
#' @param path A string containing the path to the output directory (for \code{type="mtx"}) or file (for \code{type="hdf5"}).
88
#' @param barcodes A character vector of cell barcodes, one per column of \code{x}.
99
#' @param gene.id A character vector of gene identifiers, one per row of \code{x}.
1010
#' @param gene.symbol A character vector of gene symbols, one per row of \code{x}.
1111
#' @param gene.type A character vector of gene types, expanded to one per row of \code{x}.
1212
#' Only used when \code{version="3"}.
1313
#' @param overwrite A logical scalar specifying whether \code{path} should be overwritten if it already exists.
1414
#' @param type String specifying the type of 10X format to save \code{x} to.
15-
#' This is either a directory containing a sparse matrix with row/column annotation (\code{"sparse"})
16-
#' or a HDF5 file containing the same information (\code{"HDF5"}).
17-
#' @param genome String specifying the genome for storage when \code{type="HDF5"}.
15+
#' This is either a directory containing a sparse matrix with row/column annotation (\code{"mtx"}, or its older alias \code{"sparse"})
16+
#' or a HDF5 file containing the same information (\code{"hdf5"}, or its older alias \code{"HDF5"}).
17+
#' @param genome String specifying the genome for storage when \code{type="hdf5"}.
1818
#' This can be a character vector with one genome per feature if \code{version="3"}.
1919
#' @param version String specifying the version of the CellRanger format to produce.
2020
#' @param chemistry,original.gem.groups,library.ids
21-
#' Strings containing metadata attributes to be added to the HDF5 file for \code{type="HDF5"}.
21+
#' Strings containing metadata attributes to be added to the HDF5 file for \code{type="hdf5"}.
2222
#' Their interpretation is not formally documented and is left to the user's imagination.
2323
#'
2424
#' @details
2525
#' This function will try to automatically detect the desired format based on whether \code{path} ends with \code{".h5"}.
26-
#' If so, it assumes that \code{path} specifies a HDF5 file path and sets \code{type="HDF5"}.
27-
#' Otherwise it will set \code{type="sparse"} under the assumption that \code{path} specifies a path to a directory.
26+
#' If so, it assumes that \code{path} specifies a HDF5 file path and sets \code{type="hdf5"}.
27+
#' Otherwise it will set \code{type="mtx"} under the assumption that \code{path} specifies a path to a directory.
2828
#'
2929
#' Note that there were major changes in the output format for CellRanger version 3.0 to account for non-gene features such as antibody or CRISPR tags.
3030
#' Users can switch to this new format using \code{version="3"}.
@@ -35,11 +35,11 @@
3535
#' We recommend against doing so routinely due to CellRanger's dependence on undocumented metadata attributes that may change without notice.
3636
#'
3737
#' @return
38-
#' For \code{type="sparse"}, a directory is produced at \code{path}.
38+
#' For \code{type="mtx"}, a directory is produced at \code{path}.
3939
#' If \code{version="2"}, this will contain the files \code{"matrix.mtx"}, \code{"barcodes.tsv"} and \code{"genes.tsv"}.
4040
#' If \code{version="3"}, it will instead contain \code{"matrix.mtx.gz"}, \code{"barcodes.tsv.gz"} and \code{"features.tsv.gz"}.
4141
#'
42-
#' For \code{type="HDF5"}, a HDF5 file is produced at \code{path} containing data in column-sparse format.
42+
#' For \code{type="hdf5"}, a HDF5 file is produced at \code{path} containing data in column-sparse format.
4343
#' If \code{version="2"}, data are stored in the HDF5 group named \code{genome}.
4444
#' If \code{version="3"}, data are stored in the group \code{"matrix"}.
4545
#'
@@ -54,8 +54,7 @@
5454
#' @examples
5555
#' # Mocking up some count data.
5656
#' library(Matrix)
57-
#' my.counts <- matrix(rpois(1000, lambda=5), ncol=10, nrow=100)
58-
#' my.counts <- as(my.counts, "CsparseMatrix")
57+
#' my.counts <- abs(rsparsematrix(100, 10, 0.2) * 10)
5958
#' cell.ids <- paste0("BARCODE-", seq_len(ncol(my.counts)))
6059
#'
6160
#' ngenes <- nrow(my.counts)
@@ -91,9 +90,19 @@
9190
#' \url{https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/advanced/h5_matrices}
9291
#'
9392
#' @export
94-
write10xCounts <- function(path, x, barcodes=colnames(x), gene.id=rownames(x), gene.symbol=gene.id, gene.type="Gene Expression",
95-
overwrite=FALSE, type=c("auto", "sparse", "HDF5"), genome="unknown", version=c("2", "3"),
96-
chemistry="Single Cell 3' v3", original.gem.groups=1L, library.ids="custom")
93+
write10xCounts <- function(path,
94+
x,
95+
barcodes=colnames(x),
96+
gene.id=rownames(x),
97+
gene.symbol=gene.id,
98+
gene.type="Gene Expression",
99+
overwrite=FALSE,
100+
type=c("auto", "mtx", "hdf5", "sparse", "HDF5"),
101+
genome="unknown",
102+
version=c("2", "3"),
103+
chemistry="Single Cell 3' v3",
104+
original.gem.groups=1L,
105+
library.ids="custom")
97106
{
98107
# Doing all the work on a temporary location next to 'path', as we have permissions there.
99108
# This avoids problems with 'path' already existing.
@@ -113,7 +122,7 @@ write10xCounts <- function(path, x, barcodes=colnames(x), gene.id=rownames(x), g
113122
# Determining what format to save in.
114123
version <- match.arg(version)
115124
type <- .type_chooser(path, match.arg(type))
116-
if (type=="sparse") {
125+
if (type == "mtx") {
117126
.write_sparse(temp.path, x, barcodes, gene.id, gene.symbol, gene.type, version=version)
118127
} else {
119128
.write_hdf5(temp.path, genome, x, barcodes, gene.id, gene.symbol, gene.type, version=version)
@@ -223,7 +232,11 @@ write10xCounts <- function(path, x, barcodes=colnames(x), gene.id=rownames(x), g
223232

224233
.type_chooser <- function(path, type) {
225234
if (type=="auto") {
226-
type <- if (grepl("\\.h5", path)) "HDF5" else "sparse"
235+
type <- if (grepl("\\.h5", path)) "hdf5" else "mtx"
236+
} else if (type == "HDF5") {
237+
type <- "hdf5"
238+
} else if (type == "sparse") {
239+
type <- "mtx"
227240
}
228241
type
229242
}

0 commit comments

Comments
 (0)