Skip to content

Switch to the eminem library for more efficient parsing of Matrix Market files. #122

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 7 commits into from
May 29, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 4 additions & 3 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Package: DropletUtils
Version: 1.29.0
Date: 2024-12-10
Version: 1.29.1
Date: 2025-05-29
Title: Utilities for Handling Single-Cell Droplet Data
Authors@R: c(
person("Aaron", "Lun", role = "aut"),
Expand Down Expand Up @@ -59,10 +59,11 @@ VignetteBuilder: knitr
LinkingTo:
Rcpp,
beachmat,
assorthead,
Rhdf5lib,
BH,
dqrng,
scuttle
SystemRequirements: C++11, GNU make
SystemRequirements: C++17, GNU make
RoxygenNote: 7.3.2
Encoding: UTF-8
3 changes: 2 additions & 1 deletion NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,6 @@ importFrom(GenomicRanges,GRanges)
importFrom(HDF5Array,TENxMatrix)
importFrom(IRanges,IRanges)
importFrom(Matrix,colSums)
importFrom(Matrix,readMM)
importFrom(Matrix,rowMeans)
importFrom(Matrix,rowSums)
importFrom(Matrix,sparseMatrix)
Expand All @@ -86,6 +85,7 @@ importFrom(S4Vectors,extractROWS)
importFrom(S4Vectors,make_zero_col_DFrame)
importFrom(S4Vectors,metadata)
importFrom(SingleCellExperiment,SingleCellExperiment)
importFrom(SparseArray,SVT_SparseArray)
importFrom(SparseArray,nzvals)
importFrom(SparseArray,nzwhich)
importFrom(SummarizedExperiment,assay)
Expand All @@ -96,6 +96,7 @@ importFrom(edgeR,goodTuringProportions)
importFrom(edgeR,q2qnbinom)
importFrom(methods,as)
importFrom(methods,is)
importFrom(methods,new)
importFrom(rhdf5,H5Dclose)
importFrom(rhdf5,H5Dget_space)
importFrom(rhdf5,H5Dopen)
Expand Down
4 changes: 4 additions & 0 deletions R/RcppExports.R
Original file line number Diff line number Diff line change
Expand Up @@ -41,3 +41,7 @@ montecarlo_pval <- function(totalval, totallen, prob, ambient, iterations, alpha
.Call('_DropletUtils_montecarlo_pval', PACKAGE = 'DropletUtils', totalval, totallen, prob, ambient, iterations, alpha, seeds, streams)
}

read_mm <- function(path, two_pass, class_name, threads) {
.Call('_DropletUtils_read_mm', PACKAGE = 'DropletUtils', path, two_pass, class_name, threads)
}

76 changes: 50 additions & 26 deletions R/read10xCounts.R
Original file line number Diff line number Diff line change
Expand Up @@ -13,15 +13,22 @@
#' @param sample.names A character vector of length equal to \code{samples}, containing the sample names to store in the column metadata of the output object.
#' If \code{NULL}, the file paths in \code{samples} are used directly.
#' @param col.names A logical scalar indicating whether the columns of the output object should be named with the cell barcodes.
#' @param row.names String specifying whether to use Ensembl IDs ("ID") or gene symbols ("Symbol") as row names. If using symbols, the Ensembl ID will be appended to disambiguate in case the same symbol corresponds to multiple Ensembl IDs.
#' @param row.names String specifying whether to use Ensembl IDs ("ID") or gene symbols ("Symbol") as row names.
#' For symbols, the Ensembl ID will be appended to disambiguate rows where the same symbol corresponds to multiple Ensembl IDs.
#' @param type String specifying the type of 10X format to read data from.
#' @param version String specifying the version of the 10X format to read data from.
#' @param delayed Logical scalar indicating whether sparse matrices should be wrapped in \linkS4class{DelayedArray}s before combining.
#' Only relevant for multiple \code{samples}.
#' @param genome String specifying the genome if \code{type="HDF5"} and \code{version='2'}.
#' @param compressed Logical scalar indicating whether the text files are compressed for \code{type="sparse"} or \code{"prefix"}.
#' @param compressed Logical scalar indicating whether the text files are compressed for \code{type="mtx"} or \code{"prefix"}.
#' @param intersect.genes Logical scalar indicating whether to take the intersection of common genes across all samples.
#' If \code{FALSE}, differences in gene information across samples will cause an error to be raised.
#' @param mtx.two.pass Logical scalar indicating whether to use a two-pass approach for loading data from a Matrix Market file.
#' This reduces peak memory usage at the cost of some additional runtime.
#' Only relevant when \code{type="mtx"} or \code{type="prefix"}.
#' @param mtx.class String specifying the class of the output matrix when \code{type="mtx"} or \code{type="prefix"}.
#' @param mtx.threads Integer scalar specifying the number of threads to use for reading Matrix Market files.
#' Only relevant when \code{type="mtx"} or \code{type="prefix"}.
#' @param BPPARAM A \linkS4class{BiocParallelParam} object specifying how loading should be parallelized for multiple \code{samples}.
#'
#' @return A \linkS4class{SingleCellExperiment} object containing count data for each gene (row) and cell (column) across all \code{samples}.
Expand All @@ -47,34 +54,31 @@
#' If \code{type="auto"}, the format of the input file is automatically detected for each \code{samples} based on whether it ends with \code{".h5"}.
#' If so, \code{type} is set to \code{"HDF5"}; otherwise it is set to \code{"sparse"}.
#' \itemize{
#' \item If \code{type="sparse"}, count data are loaded as a \linkS4class{dgCMatrix} object.
#' This is a conventional column-sparse compressed matrix format produced by the CellRanger pipeline,
#' consisting of a (possibly Gzipped) MatrixMarket text file (\code{"matrix.mtx"})
#' \item If \code{type="mtx"} (or its older alias \code{"sparse"}), count data are assumed to be stored in a directory.
#' This should contain a (possibly Gzipped) MatrixMarket text file (\code{"matrix.mtx"})
#' with additional tab-delimited files for barcodes (\code{"barcodes.tsv"})
#' and gene annotation (\code{"features.tsv"} for version 3 or \code{"genes.tsv"} for version 2).
#' \item If \code{type="prefix"}, count data are also loaded as a \linkS4class{dgCMatrix} object.
#' This assumes the same three-file structure for each sample as described for \code{type="sparse"},
#' but each sample is defined here by a prefix in the file names rather than by being a separate directory.
#' For example, if the \code{samples} entry is \code{"xyx_"},
#' the files are expected to be \code{"xyz_matrix.mtx"}, \code{"xyz_barcodes.tsv"}, etc.
#' \item If \code{type="HDF5"}, count data are assumed to follow the 10X sparse HDF5 format for large data sets.
#' \item If \code{type="prefix"}, count data are assumed to follow same three-file structure for each sample as described for \code{type="mtx"}.
#' However, each sample is defined by a prefix in the file names rather than by being stored a separate directory.
#' For example, if the \code{samples} entry is \code{"xyx_"}, the files are expected to be \code{"xyz_matrix.mtx"}, \code{"xyz_barcodes.tsv"}, etc.
#' \item If \code{type="hdf5"} (or its older alias \code{"HDF5"}), count data are assumed to follow the 10X sparse HDF5 format for large data sets.
#' It is loaded as a \linkS4class{TENxMatrix} object, which is a stub object that refers back to the file in \code{samples}.
#' Users may need to set \code{genome} if it cannot be automatically determined when \code{version="2"}.
#' }
#'
#' When \code{type="sparse"} or \code{"prefix"} and \code{compressed=NULL},
#' When \code{type="mtx"} or \code{"prefix"} and \code{compressed=NULL},
#' the function will automatically search for both the unzipped and Gzipped versions of the files.
#' This assumes that the compressed files have an additional \code{".gz"} suffix.
#' We can restrict to only compressed or uncompressed files by setting \code{compressed=TRUE} or \code{FALSE}, respectively.
#'
#' CellRanger 3.0 introduced a major change in the format of the output files for both \code{type}s.
#' If \code{version="auto"}, the version of the format is automatically detected from the supplied paths.
#' For \code{type="sparse"}, this is based on whether there is a \code{"features.tsv.gz"} file in the directory.
#' For \code{type="mtx"}, this is based on whether there is a \code{"features.tsv.gz"} file in the directory.
#' For \code{type="HDF5"}, this is based on whether there is a top-level \code{"matrix"} group with a \code{"matrix/features"} subgroup in the file.
#'
#' Matrices are combined by column if multiple \code{samples} were specified.
#' This will throw an error if the gene information is not consistent across \code{samples}.
#' For \code{type="sparse"} or \code{"prefix"}, users can set \code{delayed=TRUE} to save memory during the combining process.
#' For \code{type="mtx"} or \code{"prefix"}, users can set \code{delayed=TRUE} to save memory during the combining process.
#' This also avoids integer overflow for very large datasets.
#'
#' If \code{col.names=TRUE} and \code{length(sample)==1}, each column is named by the cell barcode.
Expand All @@ -94,7 +98,7 @@
#'
#' @examples
#' # Mocking up some 10X genomics output.
#' example(write10xCounts)
#' example(write10xCounts, echo=FALSE)
#'
#' # Reading it in.
#' sce10x <- read10xCounts(tmpdir)
Expand Down Expand Up @@ -133,12 +137,15 @@ read10xCounts <- function(samples,
sample.names=names(samples),
col.names=FALSE,
row.names = c("id", "symbol"),
type=c("auto", "sparse", "HDF5", "prefix"),
type=c("auto", "mtx", "hdf5", "prefix", "sparse", "HDF5"),
delayed=FALSE,
version=c("auto", "2", "3"),
genome=NULL,
compressed=NULL,
intersect.genes=FALSE,
mtx.two.pass=FALSE,
mtx.class=c("CsparseMatrix", "SVT_SparseMatrix"),
mtx.threads=1,
BPPARAM=SerialParam())
{
type <- match.arg(type)
Expand All @@ -148,8 +155,17 @@ read10xCounts <- function(samples,
sample.names <- samples
}

load.out <- bplapply(samples, FUN=.tenx_loader, type=type, version=version,
genome=genome, compressed=compressed, BPPARAM=BPPARAM)
load.out <- bplapply(samples,
FUN=.tenx_loader,
type=type,
version=version,
genome=genome,
compressed=compressed,
mtx.two.pass=mtx.two.pass,
mtx.class=match.arg(mtx.class),
mtx.threads=mtx.threads,
BPPARAM=BPPARAM
)

nsets <- length(samples)
full_data <- vector("list", nsets)
Expand Down Expand Up @@ -218,24 +234,24 @@ read10xCounts <- function(samples,
SingleCellExperiment(list(counts = full_data), rowData = gene_info, colData = cell_info, metadata=list(Samples=samples))
}

.tenx_loader <- function(run, type, version, genome, compressed) {
.tenx_loader <- function(run, type, version, genome, compressed, mtx.two.pass, mtx.class, mtx.threads) {
cur.type <- .type_chooser(run, type)
if (cur.type=="sparse") {
.read_from_sparse(run, version=version, compressed=compressed)
if (cur.type=="mtx") {
.read_from_sparse(run, version=version, is.prefix=FALSE, compressed=compressed, mtx.two.pass=mtx.two.pass, mtx.class=mtx.class, mtx.threads=mtx.threads)
} else if (cur.type=="prefix") {
.read_from_sparse(run, version=version, is.prefix=TRUE, compressed=compressed)
.read_from_sparse(run, version=version, is.prefix=TRUE, compressed=compressed, mtx.two.pass=mtx.two.pass, mtx.class=mtx.class, mtx.threads=mtx.threads)
} else {
.read_from_hdf5(run, genome=genome, version=version)
}
}

#' @importFrom methods as
#' @importFrom methods as new
#' @importClassesFrom Matrix dgCMatrix
#' @importFrom Matrix readMM
#' @importFrom utils read.delim head
#' @importFrom IRanges IRanges
#' @importFrom S4Vectors mcols<-
.read_from_sparse <- function(path, version, is.prefix=FALSE, compressed=NULL) {
#' @importClassesFrom SparseArray SVT_SparseMatrix
.read_from_sparse <- function(path, version, is.prefix, compressed, mtx.two.pass, mtx.class, mtx.threads) {
FUN <- if (is.prefix) paste0 else file.path

if (version=="auto") {
Expand Down Expand Up @@ -279,8 +295,16 @@ read10xCounts <- function(samples,
gene.info <- gr
}

raw_mat <- read_mm(matrix.loc, two_pass=mtx.two.pass, class_name=mtx.class, threads=mtx.threads)
if (mtx.class == "CsparseMatrix") {
# Don't use sparseMatrix as this seems to do an unnecessary roundtrip through the triplet form.
mat <- new("dgCMatrix", Dim=raw_mat$dim, i=raw_mat$contents$i, x=raw_mat$contents$x, p=raw_mat$contents$p)
} else {
mat <- new("SVT_SparseMatrix", SVT=raw_mat$contents$list, dim=raw_mat$dim, type=raw_mat$contents$type)
}

list(
mat=as(readMM(matrix.loc), "CsparseMatrix"),
mat=mat,
cell.names=readLines(barcode.loc),
gene.info=gene.info
)
Expand Down
45 changes: 29 additions & 16 deletions R/write10xCounts.R
Original file line number Diff line number Diff line change
Expand Up @@ -4,27 +4,27 @@
#' in the format produced by the CellRanger software suite.
#'
#' @param x A sparse numeric matrix of UMI counts.
#' @param path A string containing the path to the output directory (for \code{type="sparse"}) or file (for \code{type="HDF5"}).
#' @param path A string containing the path to the output directory (for \code{type="mtx"}) or file (for \code{type="hdf5"}).
#' @param barcodes A character vector of cell barcodes, one per column of \code{x}.
#' @param gene.id A character vector of gene identifiers, one per row of \code{x}.
#' @param gene.symbol A character vector of gene symbols, one per row of \code{x}.
#' @param gene.type A character vector of gene types, expanded to one per row of \code{x}.
#' Only used when \code{version="3"}.
#' @param overwrite A logical scalar specifying whether \code{path} should be overwritten if it already exists.
#' @param type String specifying the type of 10X format to save \code{x} to.
#' This is either a directory containing a sparse matrix with row/column annotation (\code{"sparse"})
#' or a HDF5 file containing the same information (\code{"HDF5"}).
#' @param genome String specifying the genome for storage when \code{type="HDF5"}.
#' This is either a directory containing a sparse matrix with row/column annotation (\code{"mtx"}, or its older alias \code{"sparse"})
#' or a HDF5 file containing the same information (\code{"hdf5"}, or its older alias \code{"HDF5"}).
#' @param genome String specifying the genome for storage when \code{type="hdf5"}.
#' This can be a character vector with one genome per feature if \code{version="3"}.
#' @param version String specifying the version of the CellRanger format to produce.
#' @param chemistry,original.gem.groups,library.ids
#' Strings containing metadata attributes to be added to the HDF5 file for \code{type="HDF5"}.
#' Strings containing metadata attributes to be added to the HDF5 file for \code{type="hdf5"}.
#' Their interpretation is not formally documented and is left to the user's imagination.
#'
#' @details
#' This function will try to automatically detect the desired format based on whether \code{path} ends with \code{".h5"}.
#' If so, it assumes that \code{path} specifies a HDF5 file path and sets \code{type="HDF5"}.
#' Otherwise it will set \code{type="sparse"} under the assumption that \code{path} specifies a path to a directory.
#' If so, it assumes that \code{path} specifies a HDF5 file path and sets \code{type="hdf5"}.
#' Otherwise it will set \code{type="mtx"} under the assumption that \code{path} specifies a path to a directory.
#'
#' Note that there were major changes in the output format for CellRanger version 3.0 to account for non-gene features such as antibody or CRISPR tags.
#' Users can switch to this new format using \code{version="3"}.
Expand All @@ -35,11 +35,11 @@
#' We recommend against doing so routinely due to CellRanger's dependence on undocumented metadata attributes that may change without notice.
#'
#' @return
#' For \code{type="sparse"}, a directory is produced at \code{path}.
#' For \code{type="mtx"}, a directory is produced at \code{path}.
#' If \code{version="2"}, this will contain the files \code{"matrix.mtx"}, \code{"barcodes.tsv"} and \code{"genes.tsv"}.
#' If \code{version="3"}, it will instead contain \code{"matrix.mtx.gz"}, \code{"barcodes.tsv.gz"} and \code{"features.tsv.gz"}.
#'
#' For \code{type="HDF5"}, a HDF5 file is produced at \code{path} containing data in column-sparse format.
#' For \code{type="hdf5"}, a HDF5 file is produced at \code{path} containing data in column-sparse format.
#' If \code{version="2"}, data are stored in the HDF5 group named \code{genome}.
#' If \code{version="3"}, data are stored in the group \code{"matrix"}.
#'
Expand All @@ -54,8 +54,7 @@
#' @examples
#' # Mocking up some count data.
#' library(Matrix)
#' my.counts <- matrix(rpois(1000, lambda=5), ncol=10, nrow=100)
#' my.counts <- as(my.counts, "CsparseMatrix")
#' my.counts <- abs(rsparsematrix(100, 10, 0.2) * 10)
#' cell.ids <- paste0("BARCODE-", seq_len(ncol(my.counts)))
#'
#' ngenes <- nrow(my.counts)
Expand Down Expand Up @@ -91,9 +90,19 @@
#' \url{https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/advanced/h5_matrices}
#'
#' @export
write10xCounts <- function(path, x, barcodes=colnames(x), gene.id=rownames(x), gene.symbol=gene.id, gene.type="Gene Expression",
overwrite=FALSE, type=c("auto", "sparse", "HDF5"), genome="unknown", version=c("2", "3"),
chemistry="Single Cell 3' v3", original.gem.groups=1L, library.ids="custom")
write10xCounts <- function(path,
x,
barcodes=colnames(x),
gene.id=rownames(x),
gene.symbol=gene.id,
gene.type="Gene Expression",
overwrite=FALSE,
type=c("auto", "mtx", "hdf5", "sparse", "HDF5"),
genome="unknown",
version=c("2", "3"),
chemistry="Single Cell 3' v3",
original.gem.groups=1L,
library.ids="custom")
{
# Doing all the work on a temporary location next to 'path', as we have permissions there.
# This avoids problems with 'path' already existing.
Expand All @@ -113,7 +122,7 @@ write10xCounts <- function(path, x, barcodes=colnames(x), gene.id=rownames(x), g
# Determining what format to save in.
version <- match.arg(version)
type <- .type_chooser(path, match.arg(type))
if (type=="sparse") {
if (type == "mtx") {
.write_sparse(temp.path, x, barcodes, gene.id, gene.symbol, gene.type, version=version)
} else {
.write_hdf5(temp.path, genome, x, barcodes, gene.id, gene.symbol, gene.type, version=version)
Expand Down Expand Up @@ -223,7 +232,11 @@ write10xCounts <- function(path, x, barcodes=colnames(x), gene.id=rownames(x), g

.type_chooser <- function(path, type) {
if (type=="auto") {
type <- if (grepl("\\.h5", path)) "HDF5" else "sparse"
type <- if (grepl("\\.h5", path)) "hdf5" else "mtx"
} else if (type == "HDF5") {
type <- "hdf5"
} else if (type == "sparse") {
type <- "mtx"
}
type
}
Loading
Loading