HWE Test Implementation (#76)

eric-czech · web-flow · commit 305ce19542f7 · 2020-08-17T05:18:25.000-04:00
* HWE exact test implementation for scalar genotype counts

* Formatting

* Formatting

* Adding more tests

* Adding tests for full coverage

* Refactoring tests to match new conventions

* Cleaning up docs

* Fix typo in test name

* Update variable names for new convention
diff --git a/requirements.txt b/requirements.txt
@@ -2,4 +2,5 @@ numpy
 xarray
 dask[array]
 scipy
-zarr
+numba
+zarr
diff --git a/setup.cfg b/setup.cfg
@@ -58,7 +58,7 @@ ignore =
 profile = black
 default_section = THIRDPARTY
 known_first_party = sgkit
-known_third_party = dask,fire,glow,hail,hypothesis,invoke,numpy,pandas,pkg_resources,pyspark,pytest,setuptools,sgkit_plink,xarray,yaml,zarr
+known_third_party = dask,fire,glow,hail,hypothesis,invoke,numba,numpy,pandas,pkg_resources,pyspark,pytest,setuptools,sgkit_plink,xarray,yaml,zarr
 multi_line_output = 3
 include_trailing_comma = True
 force_grid_wrap = 0
@@ -71,6 +71,8 @@ ignore_missing_imports = True
 ignore_missing_imports = True
 [mypy-pandas.*]
 ignore_missing_imports = True
+[mypy-numba.*]
+ignore_missing_imports = True
 [mypy-pytest.*]
 ignore_missing_imports = True
 [mypy-statsmodels.*]
diff --git a/sgkit/__init__.py b/sgkit/__init__.py
@@ -10,6 +10,7 @@
 from .io.vcfzarr_reader import read_vcfzarr
 from .stats.aggregation import count_alleles
 from .stats.association import gwas_linear_regression
+from .stats.hwe import hardy_weinberg_test
 from .stats.regenie import regenie
 
 __all__ = [
@@ -24,4 +25,5 @@
     "gwas_linear_regression",
     "read_vcfzarr",
     "regenie",
+    "hardy_weinberg_test",
 ]
diff --git a/sgkit/stats/association.py b/sgkit/stats/association.py
@@ -164,24 +164,13 @@ def gwas_linear_regression(
     -------
     :class:`xarray.Dataset`
         Dataset containing (N = num variants, O = num traits):
-            beta : (N, O) array-like
+            variant_beta : (N, O) array-like
                 Beta values associated with each variant and trait
-            t_value : (N, O) array-like
+            variant_t_value : (N, O) array-like
                 T statistics for each beta
-            p_value : (N, O) array-like
+            variant_p_value : (N, O) array-like
                 P values as float in [0, 1]
 
-    Warnings
-    --------
-    Regression statistics from this implementation are only valid when an
-    intercept is present. The `add_intercept` flag is a convenience for adding one
-    when not already present, but there is currently no parameterization for
-    intercept-free regression.
-
-    Additionally, both covariate and trait arrays will be rechunked to have blocks
-    along the sample (row) dimension but not the column dimension (i.e.
-    they must be tall and skinny).
-
     References
     ----------
     - [1] Hastie, Trevor, Robert Tibshirani, and Jerome Friedman. 2009. The Elements
@@ -192,7 +181,6 @@ def gwas_linear_regression(
         Bayesian Mixed-Model Analysis Increases Association Power in Large Cohorts.”
         Nature Genetics 47 (3): 284–90.
 
-
     """
     if isinstance(covariates, str):
         covariates = [covariates]
diff --git a/sgkit/stats/hwe.py b/sgkit/stats/hwe.py
@@ -0,0 +1,178 @@
+from typing import Hashable, Optional
+
+import dask.array as da
+import numpy as np
+import xarray as xr
+from numba import njit
+from numpy import ndarray
+from xarray import Dataset
+
+
+def hardy_weinberg_p_value(obs_hets: int, obs_hom1: int, obs_hom2: int) -> float:
+    """Exact test for HWE as described in Wigginton et al. 2005 [1].
+
+    Parameters
+    ----------
+    obs_hets : int
+        Number of heterozygotes with minor variant.
+    obs_hom1 : int
+        Number of reference/major homozygotes.
+    obs_hom2 : int
+        Number of alternate/minor homozygotes.
+
+    Returns
+    -------
+    float
+        P value in [0, 1]
+
+    References
+    ----------
+    - [1] Wigginton, Janis E., David J. Cutler, and Goncalo R. Abecasis. 2005.
+        “A Note on Exact Tests of Hardy-Weinberg Equilibrium.” American Journal of
+        Human Genetics 76 (5): 887–93.
+
+    Raises
+    ------
+    ValueError
+        If any observed counts are negative.
+    """
+    if obs_hom1 < 0 or obs_hom2 < 0 or obs_hets < 0:
+        raise ValueError("Observed genotype counts must be positive")
+
+    obs_homc = obs_hom2 if obs_hom1 < obs_hom2 else obs_hom1
+    obs_homr = obs_hom1 if obs_hom1 < obs_hom2 else obs_hom2
+    obs_mac = 2 * obs_homr + obs_hets
+    obs_n = obs_hets + obs_homc + obs_homr
+    het_probs = np.zeros(obs_mac + 1, dtype=np.float64)
+
+    if obs_n == 0:
+        return np.nan  # type: ignore[no-any-return]
+
+    # Identify distribution midpoint
+    mid = int(obs_mac * (2 * obs_n - obs_mac) / (2 * obs_n))
+    if (obs_mac & 1) ^ (mid & 1):
+        mid += 1
+    het_probs[mid] = 1.0
+    prob_sum = het_probs[mid]
+
+    # Integrate downward from distribution midpoint
+    curr_hets = mid
+    curr_homr = int((obs_mac - mid) / 2)
+    curr_homc = obs_n - curr_hets - curr_homr
+    while curr_hets > 1:
+        het_probs[curr_hets - 2] = (
+            het_probs[curr_hets]
+            * curr_hets
+            * (curr_hets - 1.0)
+            / (4.0 * (curr_homr + 1.0) * (curr_homc + 1.0))
+        )
+        prob_sum += het_probs[curr_hets - 2]
+        curr_homr += 1
+        curr_homc += 1
+        curr_hets -= 2
+
+    # Integrate upward from distribution midpoint
+    curr_hets = mid
+    curr_homr = int((obs_mac - mid) / 2)
+    curr_homc = obs_n - curr_hets - curr_homr
+    while curr_hets <= obs_mac - 2:
+        het_probs[curr_hets + 2] = (
+            het_probs[curr_hets]
+            * 4.0
+            * curr_homr
+            * curr_homc
+            / ((curr_hets + 2.0) * (curr_hets + 1.0))
+        )
+        prob_sum += het_probs[curr_hets + 2]
+        curr_homr -= 1
+        curr_homc -= 1
+        curr_hets += 2
+
+    if prob_sum <= 0:  # pragma: no cover
+        return np.nan  # type: ignore[no-any-return]
+    het_probs = het_probs / prob_sum
+    p = het_probs[het_probs <= het_probs[obs_hets]].sum()
+    p = max(min(1.0, p), 0.0)
+
+    return p  # type: ignore[no-any-return]
+
+
+# Benchmarks show ~25% improvement w/ fastmath on large (~10M) counts
+hardy_weinberg_p_value_jit = njit(hardy_weinberg_p_value, fastmath=True)
+
+
+def hardy_weinberg_p_value_vec(
+    obs_hets: ndarray, obs_hom1: ndarray, obs_hom2: ndarray
+) -> ndarray:
+    arrs = [obs_hets, obs_hom1, obs_hom2]
+    if len(set(map(len, arrs))) != 1:
+        raise ValueError("All arrays must have same length")
+    if list(set(map(lambda x: x.ndim, arrs))) != [1]:
+        raise ValueError("All arrays must be 1D")
+    n = len(obs_hets)
+    p = np.empty(n, dtype=np.float64)
+    for i in range(n):
+        p[i] = hardy_weinberg_p_value_jit(obs_hets[i], obs_hom1[i], obs_hom2[i])
+    return p
+
+
+hardy_weinberg_p_value_vec_jit = njit(hardy_weinberg_p_value_vec, fastmath=True)
+
+
+def hardy_weinberg_test(
+    ds: Dataset, genotype_counts: Optional[Hashable] = None
+) -> Dataset:
+    """Exact test for HWE as described in Wigginton et al. 2005 [1].
+
+    Parameters
+    ----------
+    ds : Dataset
+        Dataset containing genotype calls or precomputed genotype counts.
+    genotype_counts : Optional[Hashable], optional
+        Name of variable containing precomputed genotype counts, by default
+        None. If not provided, these counts will be computed automatically
+        from genotype calls. If present, must correspond to an (`N`, 3) array
+        where `N` is equal to the number of variants and the 3 columns contain
+        heterozygous, homozygous reference, and homozygous alternate counts
+        (in that order) across all samples for a variant.
+
+    Warnings
+    --------
+    This function is only applicable to diploid, biallelic datasets.
+
+    Returns
+    -------
+    Dataset
+        Dataset containing (N = num variants):
+        variant_hwe_p_value : (N,) ArrayLike
+            P values from HWE test for each variant as float in [0, 1].
+
+    References
+    ----------
+    - [1] Wigginton, Janis E., David J. Cutler, and Goncalo R. Abecasis. 2005.
+        “A Note on Exact Tests of Hardy-Weinberg Equilibrium.” American Journal of
+        Human Genetics 76 (5): 887–93.
+
+    Raises
+    ------
+    NotImplementedError
+        * If ploidy of provided dataset != 2
+        * If maximum number of alleles in provided dataset != 2
+    """
+    if ds.dims["ploidy"] != 2:
+        raise NotImplementedError("HWE test only implemented for diploid genotypes")
+    if ds.dims["alleles"] != 2:
+        raise NotImplementedError("HWE test only implemented for biallelic genotypes")
+    # Use precomputed genotype counts if provided
+    if genotype_counts is not None:
+        obs = list(da.asarray(ds[genotype_counts]).T)
+    # Otherwise compute genotype counts from calls
+    else:
+        # TODO: Use API genotype counting function instead, e.g.
+        # https://github.com/pystatgen/sgkit/issues/29#issuecomment-656691069
+        M = ds["call_genotype_mask"].any(dim="ploidy")
+        AC = xr.where(M, -1, ds["call_genotype"].sum(dim="ploidy"))  # type: ignore[no-untyped-call]
+        cts = [1, 0, 2]  # arg order: hets, hom1, hom2
+        obs = [da.asarray((AC == ct).sum(dim="samples")) for ct in cts]
+    p = da.map_blocks(hardy_weinberg_p_value_vec_jit, *obs)
+    return xr.Dataset({"variant_hwe_p_value": ("variants", p)})
diff --git a/sgkit/tests/test_hwe.py b/sgkit/tests/test_hwe.py
diff --git a/sgkit/tests/test_hwe/sim_01.csv b/sgkit/tests/test_hwe/sim_01.csv

-Original file line number
+Diff line change
 xarray
 dask[array]
 scipy
 -zarr
 +numba
 +zarr