Garud H should only support windowed datasets

tomwhite · tomwhite · commit 3b5194f01191 · 2020-11-12T10:23:43.000Z
diff --git a/sgkit/stats/popgen.py b/sgkit/stats/popgen.py
@@ -813,6 +813,9 @@ def Garud_h(
     if ds.dims["ploidy"] != 2:
         raise NotImplementedError("Garud H only implemented for diploid genotypes")
 
+    if not has_windows(ds):
+        raise ValueError("Dataset must be windowed for Garud_h")
+
     variables.validate(ds, {call_genotype: variables.call_genotype_spec})
 
     gt = ds[call_genotype]
@@ -822,51 +825,36 @@ def Garud_h(
     hsc = np.stack((sc, sc), axis=1).ravel()  # TODO: assumes diploid
     n_cohorts = sc.max() + 1  # 0-based indexing
 
-    if has_windows(ds):
-        gh = window_statistic(
-            gt,
-            lambda gt: _Garud_h_cohorts(gt, hsc, n_cohorts),
-            ds.window_start.values,
-            ds.window_stop.values,
-            dtype=np.float64,
-            # first chunks dimension is windows, computed in window_statistic
-            chunks=(-1, n_cohorts, N_GARUD_H_STATS),
-        )
-        n_windows = ds.window_start.shape[0]
-        assert_array_shape(gh, n_windows, n_cohorts, N_GARUD_H_STATS)
-        new_ds = Dataset(
-            {
-                variables.stat_Garud_h1: (
-                    ("windows", "cohorts"),
-                    gh[:, :, 0],
-                ),
-                variables.stat_Garud_h12: (
-                    ("windows", "cohorts"),
-                    gh[:, :, 1],
-                ),
-                variables.stat_Garud_h123: (
-                    ("windows", "cohorts"),
-                    gh[:, :, 2],
-                ),
-                variables.stat_Garud_h2_h1: (
-                    ("windows", "cohorts"),
-                    gh[:, :, 3],
-                ),
-            }
-        )
-    else:
-        # TODO: note this materializes all the data, so windowless should be discouraged/not supported
-        gt = gt.values
-
-        gh = _Garud_h_cohorts(gt, sample_cohort=hsc, n_cohorts=n_cohorts)
-        assert_array_shape(gh, n_cohorts, N_GARUD_H_STATS)
+    gh = window_statistic(
+        gt,
+        lambda gt: _Garud_h_cohorts(gt, hsc, n_cohorts),
+        ds.window_start.values,
+        ds.window_stop.values,
+        dtype=np.float64,
+        # first chunks dimension is windows, computed in window_statistic
+        chunks=(-1, n_cohorts, N_GARUD_H_STATS),
+    )
+    n_windows = ds.window_start.shape[0]
+    assert_array_shape(gh, n_windows, n_cohorts, N_GARUD_H_STATS)
+    new_ds = Dataset(
+        {
+            variables.stat_Garud_h1: (
+                ("windows", "cohorts"),
+                gh[:, :, 0],
+            ),
+            variables.stat_Garud_h12: (
+                ("windows", "cohorts"),
+                gh[:, :, 1],
+            ),
+            variables.stat_Garud_h123: (
+                ("windows", "cohorts"),
+                gh[:, :, 2],
+            ),
+            variables.stat_Garud_h2_h1: (
+                ("windows", "cohorts"),
+                gh[:, :, 3],
+            ),
+        }
+    )
 
-        new_ds = Dataset(
-            {
-                variables.stat_Garud_h1: gh[:, 0],
-                variables.stat_Garud_h12: gh[:, 1],
-                variables.stat_Garud_h123: gh[:, 2],
-                variables.stat_Garud_h2_h1: gh[:, 3],
-            }
-        )
     return conditional_merge_datasets(ds, variables.validate(new_ds), merge)
diff --git a/sgkit/tests/test_popgen.py b/sgkit/tests/test_popgen.py
@@ -405,46 +405,29 @@ def test_pbs__windowed(sample_size, n_cohorts, chunks):
 
 @pytest.mark.parametrize(
     "n_variants, n_samples, n_contigs, n_cohorts",
-    [(3, 5, 1, 1), (3, 5, 1, 2)],
+    [(3, 5, 1, 1)],
 )
-@pytest.mark.parametrize("chunks", [(-1, -1), (2, -1)])
-def test_Garud_h(n_variants, n_samples, n_contigs, n_cohorts, chunks):
+def test_Garud_h__no_windows(n_variants, n_samples, n_contigs, n_cohorts):
     # We can't use msprime since it doesn't generate diploid data, and Garud uses phased data
     ds = simulate_genotype_call_dataset(
         n_variant=n_variants, n_sample=n_samples, n_contig=n_contigs
     )
-    ds = ds.chunk(dict(zip(["variants", "samples"], chunks)))
     subsets = np.array_split(ds.samples.values, n_cohorts)
     sample_cohorts = np.concatenate(
         [np.full_like(subset, i) for i, subset in enumerate(subsets)]
     )
     ds["sample_cohort"] = xr.DataArray(sample_cohorts, dims="samples")
 
-    gh = Garud_h(ds)
-    h1 = gh.stat_Garud_h1.values
-    h12 = gh.stat_Garud_h12.values
-    h123 = gh.stat_Garud_h123.values
-    h2_h1 = gh.stat_Garud_h2_h1.values
-
-    # scikit-allel
-    for c in range(n_cohorts):
-        gt = ds.call_genotype.values[:, sample_cohorts == c, :]
-        ska_gt = allel.GenotypeArray(gt)
-        ska_ha = ska_gt.to_haplotypes()
-        ska_h = allel.garud_h(ska_ha)
-
-        np.testing.assert_allclose(h1[c], ska_h[0])
-        np.testing.assert_allclose(h12[c], ska_h[1])
-        np.testing.assert_allclose(h123[c], ska_h[2])
-        np.testing.assert_allclose(h2_h1[c], ska_h[3])
+    with pytest.raises(ValueError, match="Dataset must be windowed for Garud_h"):
+        Garud_h(ds)
 
 
 @pytest.mark.parametrize(
     "n_variants, n_samples, n_contigs, n_cohorts",
     [(9, 5, 1, 1), (9, 5, 1, 2)],
 )
 @pytest.mark.parametrize("chunks", [(-1, -1), (5, -1)])
-def test_Garud_h__windowed(n_variants, n_samples, n_contigs, n_cohorts, chunks):
+def test_Garud_h(n_variants, n_samples, n_contigs, n_cohorts, chunks):
     ds = simulate_genotype_call_dataset(
         n_variant=n_variants, n_sample=n_samples, n_contig=n_contigs
     )