Skip to content

Commit 3c31253

Browse files
committed
Don't use to_haplotype_calls for Garud H
1 parent b372fc2 commit 3c31253

File tree

3 files changed

+47
-45
lines changed

3 files changed

+47
-45
lines changed

sgkit/stats/popgen.py

Lines changed: 19 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -6,13 +6,12 @@
66
from numba import guvectorize
77
from xarray import Dataset
88

9-
from sgkit import to_haplotype_calls
109
from sgkit.stats.utils import assert_array_shape
1110
from sgkit.typing import ArrayLike
1211
from sgkit.utils import (
1312
conditional_merge_datasets,
1413
define_variable_if_absent,
15-
hash_columns,
14+
hash_array,
1615
)
1716
from sgkit.window import has_windows, window_statistic
1817

@@ -693,13 +692,13 @@ def pbs(
693692
N_GARUD_H_STATS = 4 # H1, H12, H123, H2/H1
694693

695694

696-
def _Garud_h(k: ArrayLike) -> ArrayLike:
695+
def _Garud_h(haplotypes: ArrayLike) -> ArrayLike:
697696
# find haplotype counts (sorted in descending order)
698-
counts = sorted(collections.Counter(k.tolist()).values(), reverse=True)
697+
counts = sorted(collections.Counter(haplotypes.tolist()).values(), reverse=True)
699698
counts = np.array(counts)
700699

701700
# find haplotype frequencies
702-
n = k.shape[0]
701+
n = haplotypes.shape[0]
703702
f = counts / n
704703

705704
# compute H1
@@ -719,19 +718,20 @@ def _Garud_h(k: ArrayLike) -> ArrayLike:
719718

720719

721720
def _Garud_h_cohorts(
722-
ht: ArrayLike, sample_cohort: ArrayLike, n_cohorts: int
721+
gt: ArrayLike, sample_cohort: ArrayLike, n_cohorts: int
723722
) -> ArrayLike:
724-
k = hash_columns(ht) # hash haplotypes
723+
# transpose to hash columns (haplotypes)
724+
haplotypes = hash_array(gt.transpose()).transpose().flatten()
725725
arr = np.empty((n_cohorts, N_GARUD_H_STATS))
726726
for c in range(n_cohorts):
727-
arr[c, :] = _Garud_h(k[sample_cohort == c])
727+
arr[c, :] = _Garud_h(haplotypes[sample_cohort == c])
728728
return arr
729729

730730

731731
def Garud_h(
732732
ds: Dataset,
733733
*,
734-
call_haplotype: Hashable = variables.call_haplotype,
734+
call_genotype: Hashable = variables.call_genotype,
735735
merge: bool = True,
736736
) -> Dataset:
737737
"""Compute the H1, H12, H123 and H2/H1 statistics for detecting signatures
@@ -745,11 +745,10 @@ def Garud_h(
745745
----------
746746
ds
747747
Genotype call dataset.
748-
call_haplotype
749-
Call haplotype variable to use or calculate. Defined by
750-
:data:`sgkit.variables.call_haplotype_spec`.
751-
If the variable is not present in ``ds``, it will be computed
752-
using :func:`to_haplotype_calls`.
748+
call_genotype
749+
Input variable name holding call_genotype as defined by
750+
:data:`sgkit.variables.call_genotype_spec`.
751+
Must be present in ``ds``.
753752
merge
754753
If True (the default), merge the input dataset and the computed
755754
output variables into a single dataset, otherwise return only
@@ -814,12 +813,9 @@ def Garud_h(
814813
if ds.dims["ploidy"] != 2:
815814
raise NotImplementedError("Garud H only implemented for diploid genotypes")
816815

817-
ds = define_variable_if_absent(
818-
ds, variables.call_haplotype, call_haplotype, to_haplotype_calls
819-
)
820-
variables.validate(ds, {call_haplotype: variables.call_haplotype_spec})
816+
variables.validate(ds, {call_genotype: variables.call_genotype_spec})
821817

822-
ht = ds[call_haplotype]
818+
gt = ds[call_genotype]
823819

824820
# convert sample cohorts to haplotype layout
825821
sc = ds.sample_cohort.values
@@ -828,14 +824,13 @@ def Garud_h(
828824

829825
if has_windows(ds):
830826
gh = window_statistic(
831-
ht,
832-
lambda ht: _Garud_h_cohorts(ht, hsc, n_cohorts),
827+
gt,
828+
lambda gt: _Garud_h_cohorts(gt, hsc, n_cohorts),
833829
ds.window_start.values,
834830
ds.window_stop.values,
835831
dtype=np.float64,
836832
# first chunks dimension is windows, computed in window_statistic
837833
chunks=(-1, n_cohorts, N_GARUD_H_STATS),
838-
new_axis=2, # 2d -> 3d
839834
)
840835
n_windows = ds.window_start.shape[0]
841836
assert_array_shape(gh, n_windows, n_cohorts, N_GARUD_H_STATS)
@@ -861,9 +856,9 @@ def Garud_h(
861856
)
862857
else:
863858
# TODO: note this materializes all the data, so windowless should be discouraged/not supported
864-
ht = ht.values
859+
gt = gt.values
865860

866-
gh = _Garud_h_cohorts(ht, sample_cohort=hsc, n_cohorts=n_cohorts)
861+
gh = _Garud_h_cohorts(gt, sample_cohort=hsc, n_cohorts=n_cohorts)
867862
assert_array_shape(gh, n_cohorts, N_GARUD_H_STATS)
868863

869864
new_ds = Dataset(

sgkit/tests/test_utils.py

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
check_array_like,
1414
define_variable_if_absent,
1515
encode_array,
16-
hash_columns,
16+
hash_array,
1717
max_str_len,
1818
merge_datasets,
1919
split_array_chunks,
@@ -211,21 +211,21 @@ def test_split_array_chunks__raise_on_n_lte_0():
211211
split_array_chunks(0, 0)
212212

213213

214-
@given(st.integers(1, 50), st.integers(2, 50))
214+
@given(st.integers(2, 50), st.integers(1, 50))
215215
@settings(deadline=None) # avoid problem with numba jit compilation
216-
def test_hash_columns(n_rows, n_cols):
217-
# construct an array with random repeated columns
218-
x = np.random.randint(-2, 10, size=(n_rows, n_cols // 2))
219-
cols = np.random.choice(x.shape[1], n_cols, replace=True)
220-
x = x[:, cols]
216+
def test_hash_array(n_rows, n_cols):
217+
# construct an array with random repeated rows
218+
x = np.random.randint(-2, 10, size=(n_rows // 2, n_cols))
219+
rows = np.random.choice(x.shape[0], n_rows, replace=True)
220+
x = x[rows, :]
221221

222222
# find unique row counts (exact method)
223223
_, expected_inverse, expected_counts = np.unique(
224-
x, axis=1, return_inverse=True, return_counts=True
224+
x, axis=0, return_inverse=True, return_counts=True
225225
)
226226

227227
# hash rows, then find unique row counts using the hash values
228-
h = hash_columns(x)
228+
h = hash_array(x)
229229
_, inverse, counts = np.unique(h, return_inverse=True, return_counts=True)
230230

231231
# counts[inverse] gives the count for each row in x

sgkit/utils.py

Lines changed: 19 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
import warnings
22
from typing import Any, Callable, Hashable, List, Optional, Set, Tuple, Union
33

4-
import numba
54
import numpy as np
5+
from numba import guvectorize
66
from xarray import Dataset
77

88
from .typing import ArrayLike, DType
@@ -274,9 +274,19 @@ def max_str_len(a: ArrayLike) -> ArrayLike:
274274
return lens.max()
275275

276276

277-
@numba.njit(nogil=True, cache=True) # type: ignore
278-
def hash_columns(x: ArrayLike) -> ArrayLike:
279-
"""Hash columns of ``x`` using the DJBX33A hash function.
277+
@guvectorize( # type: ignore
278+
[
279+
"void(int8[:], int64[:])",
280+
"void(int16[:], int64[:])",
281+
"void(int32[:], int64[:])",
282+
"void(int64[:], int64[:])",
283+
],
284+
"(n)->()",
285+
nopython=True,
286+
cache=True,
287+
)
288+
def hash_array(x: ArrayLike, out: ArrayLike) -> None:
289+
"""Hash entries of ``x`` using the DJBX33A hash function.
280290
281291
This is ~5 times faster than calling ``tobytes()`` followed
282292
by ``hash()`` on array values. This function also does not
@@ -286,15 +296,12 @@ def hash_columns(x: ArrayLike) -> ArrayLike:
286296
Parameters
287297
----------
288298
x
289-
Array of shape (m, n) and type integer.
299+
1D array of type integer.
290300
291301
Returns
292302
-------
293-
Array containing hash values of shape (n,) and type int64.
303+
Array containing a single hash value of type int64.
294304
"""
295-
h = np.empty((x.shape[1]), dtype=np.int64)
296-
for j in range(x.shape[1]):
297-
h[j] = 5381
298-
for i in range(x.shape[0]):
299-
h[j] = h[j] * 33 + x[i, j]
300-
return h
305+
out[0] = 5381
306+
for i in range(x.shape[0]):
307+
out[0] = out[0] * 33 + x[i]

0 commit comments

Comments
 (0)