Skip to content

Commit 305ce19

Browse files
authored
HWE Test Implementation (#76)
* HWE exact test implementation for scalar genotype counts * Formatting * Formatting * Adding more tests * Adding tests for full coverage * Refactoring tests to match new conventions * Cleaning up docs * Fix typo in test name * Update variable names for new convention
1 parent 1e635b4 commit 305ce19

File tree

7 files changed

+548
-17
lines changed

7 files changed

+548
-17
lines changed

requirements.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,4 +2,5 @@ numpy
22
xarray
33
dask[array]
44
scipy
5-
zarr
5+
numba
6+
zarr

setup.cfg

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,7 @@ ignore =
5858
profile = black
5959
default_section = THIRDPARTY
6060
known_first_party = sgkit
61-
known_third_party = dask,fire,glow,hail,hypothesis,invoke,numpy,pandas,pkg_resources,pyspark,pytest,setuptools,sgkit_plink,xarray,yaml,zarr
61+
known_third_party = dask,fire,glow,hail,hypothesis,invoke,numba,numpy,pandas,pkg_resources,pyspark,pytest,setuptools,sgkit_plink,xarray,yaml,zarr
6262
multi_line_output = 3
6363
include_trailing_comma = True
6464
force_grid_wrap = 0
@@ -71,6 +71,8 @@ ignore_missing_imports = True
7171
ignore_missing_imports = True
7272
[mypy-pandas.*]
7373
ignore_missing_imports = True
74+
[mypy-numba.*]
75+
ignore_missing_imports = True
7476
[mypy-pytest.*]
7577
ignore_missing_imports = True
7678
[mypy-statsmodels.*]

sgkit/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
from .io.vcfzarr_reader import read_vcfzarr
1111
from .stats.aggregation import count_alleles
1212
from .stats.association import gwas_linear_regression
13+
from .stats.hwe import hardy_weinberg_test
1314
from .stats.regenie import regenie
1415

1516
__all__ = [
@@ -24,4 +25,5 @@
2425
"gwas_linear_regression",
2526
"read_vcfzarr",
2627
"regenie",
28+
"hardy_weinberg_test",
2729
]

sgkit/stats/association.py

Lines changed: 3 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -164,24 +164,13 @@ def gwas_linear_regression(
164164
-------
165165
:class:`xarray.Dataset`
166166
Dataset containing (N = num variants, O = num traits):
167-
beta : (N, O) array-like
167+
variant_beta : (N, O) array-like
168168
Beta values associated with each variant and trait
169-
t_value : (N, O) array-like
169+
variant_t_value : (N, O) array-like
170170
T statistics for each beta
171-
p_value : (N, O) array-like
171+
variant_p_value : (N, O) array-like
172172
P values as float in [0, 1]
173173
174-
Warnings
175-
--------
176-
Regression statistics from this implementation are only valid when an
177-
intercept is present. The `add_intercept` flag is a convenience for adding one
178-
when not already present, but there is currently no parameterization for
179-
intercept-free regression.
180-
181-
Additionally, both covariate and trait arrays will be rechunked to have blocks
182-
along the sample (row) dimension but not the column dimension (i.e.
183-
they must be tall and skinny).
184-
185174
References
186175
----------
187176
- [1] Hastie, Trevor, Robert Tibshirani, and Jerome Friedman. 2009. The Elements
@@ -192,7 +181,6 @@ def gwas_linear_regression(
192181
Bayesian Mixed-Model Analysis Increases Association Power in Large Cohorts.”
193182
Nature Genetics 47 (3): 284–90.
194183
195-
196184
"""
197185
if isinstance(covariates, str):
198186
covariates = [covariates]

sgkit/stats/hwe.py

Lines changed: 178 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,178 @@
1+
from typing import Hashable, Optional
2+
3+
import dask.array as da
4+
import numpy as np
5+
import xarray as xr
6+
from numba import njit
7+
from numpy import ndarray
8+
from xarray import Dataset
9+
10+
11+
def hardy_weinberg_p_value(obs_hets: int, obs_hom1: int, obs_hom2: int) -> float:
    """Exact test for HWE as described in Wigginton et al. 2005 [1].

    Parameters
    ----------
    obs_hets : int
        Number of heterozygotes with minor variant.
    obs_hom1 : int
        Number of reference/major homozygotes.
    obs_hom2 : int
        Number of alternate/minor homozygotes.

    Returns
    -------
    float
        P value in [0, 1], or NaN when all three counts are zero.

    Raises
    ------
    ValueError
        If any observed counts are negative.

    References
    ----------
    - [1] Wigginton, Janis E., David J. Cutler, and Goncalo R. Abecasis. 2005.
        “A Note on Exact Tests of Hardy-Weinberg Equilibrium.” American Journal of
        Human Genetics 76 (5): 887–93.
    """
    if obs_hom1 < 0 or obs_hom2 < 0 or obs_hets < 0:
        # Zero counts are valid input, so the constraint is non-negativity
        raise ValueError("Observed genotype counts must be non-negative")

    # The test is symmetric in the two homozygote classes: relabel them as
    # the more common (homc) and the rarer (homr) homozygote
    obs_homc = obs_hom2 if obs_hom1 < obs_hom2 else obs_hom1
    obs_homr = obs_hom1 if obs_hom1 < obs_hom2 else obs_hom2
    obs_mac = 2 * obs_homr + obs_hets
    obs_n = obs_hets + obs_homc + obs_homr

    # No observations at all: the test is undefined
    if obs_n == 0:
        return np.nan  # type: ignore[no-any-return]

    # het_probs[i] accumulates the (initially unnormalized) probability of
    # observing i heterozygotes given obs_n and obs_mac
    het_probs = np.zeros(obs_mac + 1, dtype=np.float64)

    # Identify distribution midpoint; exact integer arithmetic is used so the
    # result does not depend on float rounding for very large counts
    mid = obs_mac * (2 * obs_n - obs_mac) // (2 * obs_n)
    # The heterozygote count must share parity with obs_mac because
    # obs_mac = 2 * homr + hets
    if (obs_mac & 1) ^ (mid & 1):
        mid += 1
    het_probs[mid] = 1.0
    prob_sum = het_probs[mid]

    # Integrate downward from distribution midpoint: each step turns two
    # heterozygotes into one homozygote of each kind
    curr_hets = mid
    curr_homr = (obs_mac - mid) // 2
    curr_homc = obs_n - curr_hets - curr_homr
    while curr_hets > 1:
        het_probs[curr_hets - 2] = (
            het_probs[curr_hets]
            * curr_hets
            * (curr_hets - 1.0)
            / (4.0 * (curr_homr + 1.0) * (curr_homc + 1.0))
        )
        prob_sum += het_probs[curr_hets - 2]
        curr_homr += 1
        curr_homc += 1
        curr_hets -= 2

    # Integrate upward from distribution midpoint: each step turns one
    # homozygote of each kind into two heterozygotes
    curr_hets = mid
    curr_homr = (obs_mac - mid) // 2
    curr_homc = obs_n - curr_hets - curr_homr
    while curr_hets <= obs_mac - 2:
        het_probs[curr_hets + 2] = (
            het_probs[curr_hets]
            * 4.0
            * curr_homr
            * curr_homc
            / ((curr_hets + 2.0) * (curr_hets + 1.0))
        )
        prob_sum += het_probs[curr_hets + 2]
        curr_homr -= 1
        curr_homc -= 1
        curr_hets += 2

    if prob_sum <= 0:  # pragma: no cover
        return np.nan  # type: ignore[no-any-return]
    het_probs = het_probs / prob_sum

    # P value: total probability of all outcomes no more likely than the
    # observed heterozygote count
    p = het_probs[het_probs <= het_probs[obs_hets]].sum()
    # Clip to [0, 1] to absorb floating point error in the summation
    p = max(min(1.0, p), 0.0)

    return p  # type: ignore[no-any-return]
98+
99+
100+
# Benchmarks show ~25% improvement w/ fastmath on large (~10M) counts
101+
# Numba-compiled variant of the scalar exact test; this is the function the
# vectorized wrapper calls for each element.
hardy_weinberg_p_value_jit = njit(hardy_weinberg_p_value, fastmath=True)
102+
103+
104+
def hardy_weinberg_p_value_vec(
    obs_hets: ndarray, obs_hom1: ndarray, obs_hom2: ndarray
) -> ndarray:
    """Apply the scalar HWE exact test element-wise to arrays of counts.

    Parameters
    ----------
    obs_hets : ndarray
        1D array of heterozygote counts.
    obs_hom1 : ndarray
        1D array of counts for the first homozygote class.
    obs_hom2 : ndarray
        1D array of counts for the second homozygote class.

    Returns
    -------
    ndarray
        float64 array with one P value per input position.

    Raises
    ------
    ValueError
        * If the arrays do not all have the same length
        * If any array is not one-dimensional
    """
    size = len(obs_hets)
    # Length is validated before dimensionality
    if len(obs_hom1) != size or len(obs_hom2) != size:
        raise ValueError("All arrays must have same length")
    if obs_hets.ndim != 1 or obs_hom1.ndim != 1 or obs_hom2.ndim != 1:
        raise ValueError("All arrays must be 1D")
    result = np.empty(size, dtype=np.float64)
    # Explicit index loop: this function is itself compiled with numba
    for idx in range(size):
        result[idx] = hardy_weinberg_p_value_jit(
            obs_hets[idx], obs_hom1[idx], obs_hom2[idx]
        )
    return result
117+
118+
119+
# Numba-compiled variant of the vectorized test; this is the function that
# hardy_weinberg_test maps over blocks of genotype counts.
hardy_weinberg_p_value_vec_jit = njit(hardy_weinberg_p_value_vec, fastmath=True)
120+
121+
122+
def hardy_weinberg_test(
    ds: Dataset, genotype_counts: Optional[Hashable] = None
) -> Dataset:
    """Exact test for HWE as described in Wigginton et al. 2005 [1].

    Parameters
    ----------
    ds : Dataset
        Dataset containing genotype calls or precomputed genotype counts.
    genotype_counts : Optional[Hashable], optional
        Name of variable containing precomputed genotype counts, by default
        None. If not provided, these counts will be computed automatically
        from genotype calls. If present, must correspond to an (`N`, 3) array
        where `N` is equal to the number of variants and the 3 columns contain
        heterozygous, homozygous reference, and homozygous alternate counts
        (in that order) across all samples for a variant.

    Warnings
    --------
    This function is only applicable to diploid, biallelic datasets.

    Returns
    -------
    Dataset
        Dataset containing (N = num variants):
        variant_hwe_p_value : (N,) ArrayLike
            P values from HWE test for each variant as float in [0, 1].

    References
    ----------
    - [1] Wigginton, Janis E., David J. Cutler, and Goncalo R. Abecasis. 2005.
        “A Note on Exact Tests of Hardy-Weinberg Equilibrium.” American Journal of
        Human Genetics 76 (5): 887–93.

    Raises
    ------
    NotImplementedError
        * If ploidy of provided dataset != 2
        * If maximum number of alleles in provided dataset != 2
    """
    if ds.dims["ploidy"] != 2:
        raise NotImplementedError("HWE test only implemented for diploid genotypes")
    if ds.dims["alleles"] != 2:
        raise NotImplementedError("HWE test only implemented for biallelic genotypes")

    if genotype_counts is None:
        # No precomputed counts: derive them from the genotype calls.
        # TODO: Use API genotype counting function instead, e.g.
        # https://github.com/pystatgen/sgkit/issues/29#issuecomment-656691069
        # A call is treated as missing if any of its alleles is masked
        missing = ds["call_genotype_mask"].any(dim="ploidy")
        # Per-call allele sum; missing calls get -1 so that they match none
        # of the genotype classes counted below
        allele_sum = xr.where(missing, -1, ds["call_genotype"].sum(dim="ploidy"))  # type: ignore[no-untyped-call]
        # Allele sums in argument order: hets (1), hom1 (0), hom2 (2)
        obs = [
            da.asarray((allele_sum == target).sum(dim="samples"))
            for target in (1, 0, 2)
        ]
    else:
        # Precomputed counts: transpose so each column becomes one argument
        obs = list(da.asarray(ds[genotype_counts]).T)

    p_values = da.map_blocks(hardy_weinberg_p_value_vec_jit, *obs)
    return xr.Dataset({"variant_hwe_p_value": ("variants", p_values)})

0 commit comments

Comments
 (0)