Add method to convert genotype probabilities to calls #419

Merged 2 commits on Dec 22, 2020
12 changes: 12 additions & 0 deletions docs/api.rst
@@ -7,6 +7,17 @@ This page provides an auto-generated summary of sgkit's API.
IO/imports
==========

BGEN
-----

.. currentmodule:: sgkit.io.bgen
.. autosummary::
:toctree: generated/

bgen_to_zarr
read_bgen
rechunk_bgen

PLINK
-----

@@ -42,6 +53,7 @@ Methods
.. autosummary::
:toctree: generated/

convert_probability_to_call
count_call_alleles
count_cohort_alleles
count_variant_alleles
2 changes: 2 additions & 0 deletions sgkit/__init__.py
@@ -19,6 +19,7 @@
variant_stats,
)
from .stats.association import gwas_linear_regression
from .stats.conversion import convert_probability_to_call
from .stats.hwe import hardy_weinberg_test
from .stats.pc_relate import pc_relate
from .stats.pca import pca
@@ -39,6 +40,7 @@
"DIM_SAMPLE",
"DIM_VARIANT",
"create_genotype_call_dataset",
"convert_probability_to_call",
"count_variant_alleles",
"count_call_alleles",
"count_cohort_alleles",
74 changes: 32 additions & 42 deletions sgkit/io/bgen/bgen_reader.py
@@ -250,15 +250,15 @@ def read_bgen(
-------
A dataset containing the following variables:

- :data:`sgkit.variables.variant_id` (variants)
- :data:`sgkit.variables.variant_contig` (variants)
- :data:`sgkit.variables.variant_position` (variants)
- :data:`sgkit.variables.variant_allele` (variants)
- :data:`sgkit.variables.sample_id` (samples)
- :data:`sgkit.variables.call_dosage` (variants, samples)
- :data:`sgkit.variables.call_dosage_mask` (variants, samples)
- :data:`sgkit.variables.call_genotype_probability` (variants, samples, genotypes)
- :data:`sgkit.variables.call_genotype_probability_mask` (variants, samples, genotypes)
- :data:`sgkit.variables.variant_id_spec` (variants)
- :data:`sgkit.variables.variant_contig_spec` (variants)
- :data:`sgkit.variables.variant_position_spec` (variants)
- :data:`sgkit.variables.variant_allele_spec` (variants)
- :data:`sgkit.variables.sample_id_spec` (samples)
- :data:`sgkit.variables.call_dosage_spec` (variants, samples)
- :data:`sgkit.variables.call_dosage_mask_spec` (variants, samples)
- :data:`sgkit.variables.call_genotype_probability_spec` (variants, samples, genotypes)
- :data:`sgkit.variables.call_genotype_probability_mask_spec` (variants, samples, genotypes)

"""
if isinstance(chunks, tuple) and len(chunks) != 3:
@@ -445,30 +445,30 @@ def rechunk_bgen(

Parameters
----------
ds : Dataset
ds
Dataset to rechunk, typically the result from `read_bgen`.
output : Union[PathType, MutableMapping[str, bytes]]
output
Zarr store or path to directory in file system.
chunk_length : int
chunk_length
Length (number of variants) of chunks in which data are stored, by default 10_000.
chunk_width : int
chunk_width
Width (number of samples) to use when storing chunks in output, by default 1_000.
compressor : Optional[Any]
compressor
Zarr compressor; no compression is used when set to None.
probability_dtype : DType
probability_dtype
Data type used to encode genotype probabilities, must be either uint8 or uint16.
Setting this parameter results in a loss of precision. If None, probabilities
will not be altered when stored.
max_mem : str
max_mem
The amount of memory (in bytes) that workers are allowed to use. A string
(e.g. 100MB) can also be used.
pack : bool
pack
Whether or not to optimize variable representations by removing unnecessary
dimensions and elements. This includes storing 2 genotypes instead of 3, omitting
dosage and collapsing the genotype probability mask to 2 dimensions. All of
the above are restored in the resulting Dataset at the expense of extra
computations on read.
tempdir : Optional[PathType]
tempdir
Temporary directory where intermediate files are stored. The default None means
use the system default temporary directory.

@@ -538,58 +538,48 @@ def bgen_to_zarr(
pack: bool = True,
tempdir: Optional[PathType] = None,
) -> Dataset:
"""Rechunk BGEN dataset as Zarr.
"""Convert a BGEN file to a Zarr on-disk store.

This function will use the algorithm https://rechunker.readthedocs.io/en/latest/
to rechunk certain fields in a provided Dataset for better downstream performance.
Depending on the system memory available (and the `max_mem` setting) this
rechunking may occur without the need of any intermediate data store. Otherwise,
approximately as much disk space is required as was needed to store the original
BGEN data. Experiments show that this Zarr representation is ~20% larger even
with all available optimizations and fairly aggressive compression (i.e. the
default `clevel` 7).

Note that this function is not evaluated lazily. The rechunking algorithm
will run inline so calls to it may be slow. The resulting Dataset is
generated based on the final, serialized Zarr data.
This function is a convenience for calling :func:`read_bgen` followed by
:func:`rechunk_bgen`.

Parameters
----------
input : PathType
input
Path to local BGEN dataset.
output : Union[PathType, MutableMapping[str, bytes]]
output
Zarr store or path to directory in file system.
region : Optional[Mapping[Hashable, Any]]
region
Indexers on dataset dimensions used to define a subset of data to convert.
Must be None or a dict with keys matching dimension names and values
equal to integers or slice objects. This is passed directly to `Dataset.isel`
so it has the same semantics.
chunk_length : int
chunk_length
Length (number of variants) of chunks in which data are stored, by default 10_000.
chunk_width : int
chunk_width
Width (number of samples) to use when storing chunks in output, by default 1_000.
temp_chunk_length : int
temp_chunk_length
Length of chunks used in the raw BGEN read, by default 100. This defines the vertical
chunking (i.e. in the variants dimension) used when reading the raw data, and because
there is no horizontal chunking at this phase (i.e. in the samples dimension), this
value should be much smaller than the target `chunk_length`.
compressor : Optional[Any]
compressor
Zarr compressor, by default Blosc + zstd with compression level 7. No compression
is used when set to None.
probability_dtype : DType
probability_dtype
Data type used to encode genotype probabilities, must be either uint8 or uint16.
Setting this parameter results in a loss of precision. If None, probabilities
will not be altered when stored.
max_mem : str
max_mem
The amount of memory (in bytes) that workers are allowed to use. A string
(e.g. 100MB) can also be used.
pack : bool
pack
Whether or not to optimize variable representations by removing unnecessary
dimensions and elements. This includes storing 2 genotypes instead of 3, omitting
dosage and collapsing the genotype probability mask to 2 dimensions. All of
the above are restored in the resulting Dataset at the expense of extra
computations on read.
tempdir : Optional[PathType]
tempdir
Temporary directory where intermediate files are stored. The default None means
use the system default temporary directory.

128 changes: 128 additions & 0 deletions sgkit/stats/conversion.py
@@ -0,0 +1,128 @@
import dask.array as da
import numpy as np
from numba import guvectorize
from xarray import Dataset

from sgkit import variables
from sgkit.typing import ArrayLike
from sgkit.utils import conditional_merge_datasets


@guvectorize(  # type: ignore
    [
        "void(float64[:], uint8[:], float64, int8[:])",
        "void(float32[:], uint8[:], float64, int8[:])",
    ],
    "(p),(k),()->(k)",
    nopython=True,
    cache=True,
)
def _convert_probability_to_call(
    gp: ArrayLike, _: ArrayLike, threshold: float, out: ArrayLike
) -> None:  # pragma: no cover
    """Generalized U-function for converting genotype probabilities to hard calls

    Parameters
    ----------
    gp
        Genotype probabilities of shape (genotypes,) containing unphased, biallelic
        probabilities in the order homozygous reference, heterozygous, homozygous alternate.
    _
        Dummy variable of type `uint8` and shape (ploidy,) used to define
        the ploidy of the resulting array
    threshold
        Probability threshold that must be met or exceeded by at least one genotype
        probability in order for any calls to be made -- all values will be -1 (missing)
        otherwise. Setting this value to 0 or less disables any effect it has.
    out
        Hard calls array of shape (ploidy,).
    """
    # Ignore singleton array inputs used for metadata inference by dask
    if gp.shape[0] == 1 and out.shape[0] == 1:
        return
    if gp.shape[0] != 3 or out.shape[0] != 2:
        raise NotImplementedError(
            "Hard call conversion only supported for diploid, biallelic genotypes."
        )
    out[:] = -1  # (ploidy,)
    # Return no call if any probability is absent
    if np.any(np.isnan(gp)):
        return
    i = np.argmax(gp)
    # Return no call if max probability does not exceed threshold
    if threshold > 0 and gp[i] < threshold:
        return
    # Return no call if max probability is not unique
    if (gp[i] == gp).sum() > 1:
        return
    # Homozygous reference
    if i == 0:
        out[:] = 0
    # Heterozygous
    elif i == 1:
        out[0] = 1
        out[1] = 0
    # Homozygous alternate
    else:
        out[:] = 1


def convert_probability_to_call(
Collaborator: Do you want to add this to the public API docs?

Collaborator: That should do it (you can check by generating the docs).

    ds: Dataset,
    call_genotype_probability: str = variables.call_genotype_probability,
    threshold: float = 0.9,
    merge: bool = True,
) -> Dataset:
"""

Parameters
----------
ds
Dataset containing genotype probabilities, such as from :func:`sgkit.io.bgen.read_bgen`.
call_genotype_probability
Genotype probability variable to be converted as defined by
:data:`sgkit.variables.call_genotype_probability_spec`.
threshold
Probability threshold in [0, 1] that must be met or exceeded by at least one genotype
probability in order for any calls to be made -- all values will be -1 (missing)
otherwise. Setting this value to less than or equal to 0 disables any effect it has.
Default value is 0.9.
merge
If True (the default), merge the input dataset and the computed
output variables into a single dataset, otherwise return only
the computed output variables.
See :ref:`dataset_merge` for more details.

Returns
-------
A dataset containing the following variables:

- `call_genotype` (variants, samples, ploidy): Converted hard calls.
Defined by :data:`sgkit.variables.call_genotype_spec`.

- `call_genotype_mask` (variants, samples, ploidy): Mask for converted hard calls.
Defined by :data:`sgkit.variables.call_genotype_mask_spec`.
"""
    if not (0 <= threshold <= 1):
        raise ValueError(f"Threshold must be float in [0, 1], not {threshold}.")
    variables.validate(
        ds, {call_genotype_probability: variables.call_genotype_probability_spec}
    )
    if ds.dims["genotypes"] != 3:
        raise NotImplementedError(
            f"Hard call conversion only supported for diploid, biallelic genotypes; "
            f"num genotypes in provided probabilities array = {ds.dims['genotypes']}."
        )
    GP = da.asarray(ds[call_genotype_probability])
    # Remove chunking in genotypes dimension, if present
    if len(GP.chunks[2]) > 1:
        GP = GP.rechunk((None, None, -1))
Collaborator: In general I'm wary of rechunk operations that are hidden from the user. But perhaps this one is OK and you have run at scale? Alternatively, we could fail if the dataset is chunked in this dimension and get the user to explicitly rechunk.

Collaborator (author): I think it's OK here since the rechunk is across the genotypes dimension, which can only be of size 3 currently. We may want to rethink that when supporting non-diploid data, if/when it becomes necessary.

    K = da.empty(2, dtype=np.uint8)
    GT = _convert_probability_to_call(GP, K, threshold)
    new_ds = Dataset(
        {
            variables.call_genotype: (("variants", "samples", "ploidy"), GT),
            variables.call_genotype_mask: (("variants", "samples", "ploidy"), GT < 0),
        }
    )
    return conditional_merge_datasets(ds, variables.validate(new_ds), merge)