From f20e9291fde8758df301b15d9c3cac09a6a5d168 Mon Sep 17 00:00:00 2001 From: Tom White Date: Mon, 14 Sep 2020 14:55:44 +0100 Subject: [PATCH 1/8] Cohort grouping for popgen Use guvectorize'd implementation of count_cohort_alleles suggested by Eric Czech. Fix test_Tajimas_D Avoid divide by zero error Remove check for number of samples, since it is now handled by where clause Return dataset for diversity Return dataset for divergence Return dataset for Fst Return dataset for Tajimas_D Cohort coordinates Use xarray's combine_nested to build grid of cohort combinations Use conditional_merge_datasets Fix Fst implementation for n_cohorts > 2, and parameterize tests for n_cohorts. Return values in upper and lower diagonal array. Vectorized divergence Vectorized diversity pairs Add doc for count_cohort_alleles --- sgkit/stats/aggregation.py | 80 ++++++++++++ sgkit/stats/popgen.py | 217 +++++++++++++++++++++----------- sgkit/tests/test_aggregation.py | 21 ++++ sgkit/tests/test_popgen.py | 70 ++++++++--- 4 files changed, 298 insertions(+), 90 deletions(-) diff --git a/sgkit/stats/aggregation.py b/sgkit/stats/aggregation.py index 0cc428a4f..d666097cb 100644 --- a/sgkit/stats/aggregation.py +++ b/sgkit/stats/aggregation.py @@ -7,6 +7,7 @@ from typing_extensions import Literal from xarray import Dataset +from sgkit.stats.utils import assert_array_shape from sgkit.typing import ArrayLike from sgkit.utils import conditional_merge_datasets @@ -51,6 +52,38 @@ def count_alleles(g: ArrayLike, _: ArrayLike, out: ArrayLike) -> None: out[a] += 1 +# n = samples, c = cohorts, k = alleles +@guvectorize( # type: ignore + [ + "void(uint8[:, :], int32[:], uint8[:], int32[:,:])", + "void(uint8[:, :], int64[:], uint8[:], int32[:,:])", + ], + "(n, k),(n),(c)->(c,k)", + nopython=True, +) +def _count_cohort_alleles( + ac: ArrayLike, cohorts: ArrayLike, _: ArrayLike, out: ArrayLike +) -> None: + """Generalized U-function for computing per cohort allele counts. + + Parameters + ---------- + ac + Allele counts of shape (samples, alleles) containing per-sample allele counts. + cohorts + Cohort indexes for samples of shape (samples,). + _ + Dummy variable of type `uint8` and shape (cohorts,) used to + define the number of cohorts. + out + Allele counts with shape (cohorts, alleles) and values corresponding to + the number of non-missing occurrences of each allele in each cohort. + """ + out[:, :] = 0 # (cohorts, alleles) + for i in range(ac.shape[0]): + out[cohorts[i]] += ac[i] + + def count_call_alleles(ds: Dataset, merge: bool = True) -> Dataset: """Compute per sample allele counts from genotype calls. @@ -167,6 +200,53 @@ def count_variant_alleles(ds: Dataset, merge: bool = True) -> Dataset: return conditional_merge_datasets(ds, new_ds, merge) +def count_cohort_alleles(ds: Dataset, merge: bool = True) -> Dataset: + """Compute per cohort allele counts from genotype calls. + + Parameters + ---------- + ds + Genotype call dataset such as from + `sgkit.create_genotype_call_dataset`. + merge + (optional) + If True (the default), merge the input dataset and the computed + output variables into a single dataset, otherwise return only + the computed output variables. + See :ref:`dataset_merge` for more details. + + Returns + ------- + Dataset containing variable `call_allele_count` of allele counts with + shape (variants, cohorts, alleles) and values corresponding to + the number of non-missing occurrences of each allele. + """ + + n_variants = ds.dims["variants"] + n_alleles = ds.dims["alleles"] + + ds = count_call_alleles(ds) + AC, SC = da.asarray(ds.call_allele_count), da.asarray(ds.sample_cohort) + n_cohorts = SC.max().compute() + 1 # 0-based indexing + C = da.empty(n_cohorts, dtype=np.uint8) + + G = da.asarray(ds["call_genotype"]) + shape = (G.chunks[0], n_cohorts, n_alleles) + + AC = da.map_blocks(_count_cohort_alleles, AC, SC, C, chunks=shape, dtype=np.int32) + assert_array_shape( + AC, n_variants, n_cohorts * AC.numblocks[0], n_alleles * AC.numblocks[1] + ) + + # Stack the blocks and sum across them + # (which will only work because each chunk is guaranteed to have same size) + AC = da.stack([AC.blocks[:, i] for i in range(AC.numblocks[1])]).sum(axis=0) + assert_array_shape(AC, n_variants, n_cohorts, n_alleles) + + new_ds = Dataset({"cohort_allele_count": (("variants", "cohorts", "alleles"), AC)}) + return conditional_merge_datasets(ds, new_ds, merge) + + def _swap(dim: Dimension) -> Dimension: return "samples" if dim == "variants" else "variants" diff --git a/sgkit/stats/popgen.py b/sgkit/stats/popgen.py index 416732bb2..82340b0d7 100644 --- a/sgkit/stats/popgen.py +++ b/sgkit/stats/popgen.py @@ -2,17 +2,20 @@ import dask.array as da import numpy as np -import xarray as xr -from xarray import DataArray, Dataset +from numba import guvectorize +from xarray import Dataset -from .aggregation import count_variant_alleles +from sgkit.stats.utils import assert_array_shape +from sgkit.typing import ArrayLike +from sgkit.utils import conditional_merge_datasets + +from .aggregation import count_cohort_alleles, count_variant_alleles def diversity( - ds: Dataset, - allele_counts: Hashable = "variant_allele_count", -) -> DataArray: - """Compute diversity from allele counts. + ds: Dataset, allele_counts: Hashable = "cohort_allele_count", merge: bool = True +) -> Dataset: + """Compute diversity from cohort allele counts. Because we're not providing any arguments on windowing, etc, we return the total over the whole region. Maybe this isn't @@ -28,98 +31,168 @@ def diversity( ds Genotype call dataset. allele_counts - allele counts to use or calculate. + cohort allele counts to use or calculate. Returns ------- diversity value. """ - if len(ds.samples) < 2: - return xr.DataArray(np.nan) if allele_counts not in ds: - ds_new = count_variant_alleles(ds) - else: - ds_new = ds - ac = ds_new[allele_counts] - an = ac.sum(axis=1) + ds = count_cohort_alleles(ds) + ac = ds[allele_counts] + an = ac.sum(axis=2) n_pairs = an * (an - 1) / 2 - n_same = (ac * (ac - 1) / 2).sum(axis=1) + n_same = (ac * (ac - 1) / 2).sum(axis=2) n_diff = n_pairs - n_same - pi = n_diff / n_pairs - return pi.sum() # type: ignore[no-any-return] + # replace zeros to avoid divide by zero error + n_pairs_na = n_pairs.where(n_pairs != 0) + pi = n_diff / n_pairs_na + pi_sum = pi.sum(axis=0, skipna=False) + new_ds = Dataset( + { + "stat_diversity": ( + "cohorts", + pi_sum, + ) + } + ) + return conditional_merge_datasets(ds, new_ds, merge) + + +# c = cohorts, k = alleles +@guvectorize( # type: ignore + ["void(int64[:, :], int64[:], float64[:,:])"], + "(c, k),(c)->(c,c)", + nopython=True, +) +def _divergence(ac: ArrayLike, an: ArrayLike, out: ArrayLike) -> None: + """Generalized U-function for computing divergence. + + Parameters + ---------- + ac + Allele counts of shape (cohorts, alleles) containing per-cohort allele counts. + an + Allele totals of shape (cohorts,) containing per-cohort allele totals. + out + Pairwise divergence stats with shape (cohorts, cohorts), where the entry at + (i, j) is the divergence between cohort i and cohort j. + """ + out[:, :] = np.nan # (cohorts, cohorts) + n_cohorts = ac.shape[0] + n_alleles = ac.shape[1] + # calculate the divergence for each cohort pair + for i in range(n_cohorts): + for j in range(i + 1, n_cohorts): + n_pairs = an[i] * an[j] + n_same = 0 + for k in range(n_alleles): + n_same += ac[i, k] * ac[j, k] + n_diff = n_pairs - n_same + div = n_diff / n_pairs + out[i, j] = div + out[j, i] = div def divergence( - ds1: Dataset, - ds2: Dataset, - allele_counts: Hashable = "variant_allele_count", -) -> DataArray: - """Compute divergence between two genotype call datasets. + ds: Dataset, allele_counts: Hashable = "cohort_allele_count", merge: bool = True +) -> Dataset: + """Compute divergence between pairs of cohorts. Parameters ---------- - ds1 - Genotype call dataset. - ds2 + ds Genotype call dataset. allele_counts - allele counts to use or calculate. + cohort allele counts to use or calculate. Returns ------- - divergence value between the two datasets. + divergence value between pairs of cohorts. """ - if allele_counts not in ds1: - ds1_new = count_variant_alleles(ds1) - else: - ds1_new = ds1 - ac1 = ds1_new[allele_counts] - if allele_counts not in ds2: - ds2_new = count_variant_alleles(ds2) - else: - ds2_new = ds2 - ac2 = ds2_new[allele_counts] - an1 = ds1_new[allele_counts].sum(axis=1) - an2 = ds2_new[allele_counts].sum(axis=1) - n_pairs = an1 * an2 - n_same = (ac1 * ac2).sum(axis=1) - n_diff = n_pairs - n_same - div = n_diff / n_pairs - return div.sum() # type: ignore[no-any-return] + if allele_counts not in ds: + ds = count_cohort_alleles(ds) + ac = ds[allele_counts] + an = ac.sum(axis=2) + + n_variants = ds.dims["variants"] + n_cohorts = ds.dims["cohorts"] + ac = da.asarray(ac) + an = da.asarray(an) + shape = (ac.chunks[0], n_cohorts, n_cohorts) + d = da.map_blocks(_divergence, ac, an, chunks=shape, dtype=np.float64) + assert_array_shape(d, n_variants, n_cohorts, n_cohorts) + + d_sum = d.sum(axis=0) + assert_array_shape(d_sum, n_cohorts, n_cohorts) + + new_ds = Dataset({"stat_divergence": (("cohorts_a", "cohorts_b"), d_sum)}) + return conditional_merge_datasets(ds, new_ds, merge) + + +# c = cohorts +@guvectorize( # type: ignore + ["void(float64[:], float64[:,:])"], + "(c)->(c,c)", + nopython=True, +) +def _pairwise_sum(d: ArrayLike, out: ArrayLike) -> None: + """Generalized U-function for computing pairwise sums of diversity. + + Parameters + ---------- + ac + Diversity values of shape (cohorts,). + out + Pairwise diversity stats with shape (cohorts, cohorts), where the entry at + (i, j) is the sum of the diversities for cohort i and cohort j. + """ + n_cohorts = d.shape[0] + # calculate the divergence for each cohort pair + for i in range(n_cohorts): + for j in range(n_cohorts): + out[i, j] = d[i] + d[j] def Fst( - ds1: Dataset, - ds2: Dataset, - allele_counts: Hashable = "variant_allele_count", -) -> DataArray: - """Compute Fst between two genotype call datasets. + ds: Dataset, allele_counts: Hashable = "cohort_allele_count", merge: bool = True +) -> Dataset: + """Compute Fst between pairs of cohorts. Parameters ---------- - ds1 - Genotype call dataset. - ds2 + ds Genotype call dataset. allele_counts - allele counts to use or calculate. + cohort allele counts to use or calculate. Returns ------- - fst value between the two datasets. + Fst value between pairs of cohorts. """ - total_div = diversity(ds1) + diversity(ds2) - gs = divergence(ds1, ds2) - den = total_div + 2 * gs # type: ignore[operator] - fst = 1 - (2 * total_div / den) - return fst # type: ignore[no-any-return] + if allele_counts not in ds: + ds = count_cohort_alleles(ds) + n_cohorts = ds.dims["cohorts"] + div = diversity(ds, allele_counts, merge=False).stat_diversity + assert_array_shape(div, n_cohorts) + + # calculate diversity pairs + div = da.asarray(div) + shape = (n_cohorts, n_cohorts) + div_pairs = da.map_blocks(_pairwise_sum, div, chunks=shape, dtype=np.float64) + assert_array_shape(div_pairs, n_cohorts, n_cohorts) + + gs = divergence(ds, allele_counts, merge=False).stat_divergence + den = div_pairs + 2 * gs + fst = 1 - (2 * div_pairs / den) + new_ds = Dataset({"stat_Fst": fst}) + return conditional_merge_datasets(ds, new_ds, merge) def Tajimas_D( - ds: Dataset, - allele_counts: Hashable = "variant_allele_count", -) -> DataArray: + ds: Dataset, allele_counts: Hashable = "variant_allele_count", merge: bool = True +) -> Dataset: """Compute Tajimas' D for a genotype call dataset. Parameters @@ -135,10 +208,8 @@ def Tajimas_D( """ if allele_counts not in ds: - ds_new = count_variant_alleles(ds) - else: - ds_new = ds - ac = ds_new[allele_counts] + ds = count_variant_alleles(ds) + ac = ds[allele_counts] # count segregating S = ((ac > 0).sum(axis=1) > 1).sum() @@ -153,7 +224,7 @@ def Tajimas_D( theta = S / a1 # calculate diversity - div = diversity(ds_new) + div = diversity(ds).stat_diversity # N.B., both theta estimates are usually divided by the number of # (accessible) bases but here we want the absolute difference @@ -170,8 +241,10 @@ def Tajimas_D( d_stdev = np.sqrt((e1 * S) + (e2 * S * (S - 1))) if d_stdev == 0: - return xr.DataArray(np.nan) + D = np.nan + else: + # finally calculate Tajima's D + D = d / d_stdev - # finally calculate Tajima's D - D = d / d_stdev - return D # type: ignore[no-any-return] + new_ds = Dataset({"stat_Tajimas_D": D}) + return conditional_merge_datasets(ds, new_ds, merge) diff --git a/sgkit/tests/test_aggregation.py b/sgkit/tests/test_aggregation.py index 8951f842b..b8aafa50a 100644 --- a/sgkit/tests/test_aggregation.py +++ b/sgkit/tests/test_aggregation.py @@ -7,6 +7,7 @@ from sgkit.stats.aggregation import ( count_call_alleles, + count_cohort_alleles, count_variant_alleles, variant_stats, ) @@ -209,6 +210,26 @@ def test_count_call_alleles__chunked(): xr.testing.assert_equal(ac1, ac2) # type: ignore[no-untyped-call] +def test_count_cohort_alleles__multi_variant_multi_sample(): + ds = get_dataset( + [ + [[0, 0], [0, 0], [0, 0]], + [[0, 0], [0, 0], [0, 1]], + [[1, 1], [0, 1], [1, 0]], + [[1, 1], [1, 1], [1, 1]], + ] + ) + ds["sample_cohort"] = xr.DataArray(np.array([0, 1, 1]), dims="samples") + ds = count_cohort_alleles(ds) + ac = ds["cohort_allele_count"] + np.testing.assert_equal( + ac, + np.array( + [[[2, 0], [4, 0]], [[2, 0], [3, 1]], [[0, 2], [2, 2]], [[0, 2], [0, 4]]] + ), + ) + + @pytest.mark.parametrize("precompute_variant_allele_count", [False, True]) def test_variant_stats(precompute_variant_allele_count): ds = get_dataset( diff --git a/sgkit/tests/test_popgen.py b/sgkit/tests/test_popgen.py index 478f4cae0..181136fdd 100644 --- a/sgkit/tests/test_popgen.py +++ b/sgkit/tests/test_popgen.py @@ -1,6 +1,9 @@ +import itertools + import msprime # type: ignore import numpy as np import pytest +import xarray as xr from sgkit import Fst, Tajimas_D, create_genotype_call_dataset, divergence, diversity @@ -37,32 +40,60 @@ def ts_to_dataset(ts, samples=None): def test_diversity(size): ts = msprime.simulate(size, length=100, mutation_rate=0.05, random_seed=42) ds = ts_to_dataset(ts) # type: ignore[no-untyped-call] - div = diversity(ds).compute() + sample_cohorts = np.full_like(ts.samples(), 0) + ds["sample_cohort"] = xr.DataArray(sample_cohorts, dims="samples") + ds = ds.assign_coords({"cohorts": ["co_0"]}) + ds = diversity(ds) + div = ds["stat_diversity"].sel(cohorts="co_0").values ts_div = ts.diversity(span_normalise=False) np.testing.assert_allclose(div, ts_div) -@pytest.mark.parametrize("size", [2, 3, 10, 100]) -def test_divergence(size): +@pytest.mark.parametrize( + "size, n_cohorts", + [(2, 2), (3, 2), (3, 3), (10, 2), (10, 3), (10, 4), (100, 2), (100, 3), (100, 4)], +) +def test_divergence(size, n_cohorts): ts = msprime.simulate(size, length=100, mutation_rate=0.05, random_seed=42) - subset_1 = ts.samples()[: ts.num_samples // 2] - subset_2 = ts.samples()[ts.num_samples // 2 :] - ds1 = ts_to_dataset(ts, subset_1) # type: ignore[no-untyped-call] - ds2 = ts_to_dataset(ts, subset_2) # type: ignore[no-untyped-call] - div = divergence(ds1, ds2).compute() - ts_div = ts.divergence([subset_1, subset_2], span_normalise=False) + subsets = np.array_split(ts.samples(), n_cohorts) + ds = ts_to_dataset(ts) # type: ignore[no-untyped-call] + sample_cohorts = np.concatenate( + [np.full_like(subset, i) for i, subset in enumerate(subsets)] + ) + ds["sample_cohort"] = xr.DataArray(sample_cohorts, dims="samples") + cohort_names = [f"co_{i}" for i in range(n_cohorts)] + ds = ds.assign_coords({"cohorts_a": cohort_names, "cohorts_b": cohort_names}) + ds = divergence(ds) + div = ds["stat_divergence"].values + + ts_div = np.full([n_cohorts, n_cohorts], np.nan) + for i, j in itertools.combinations(range(n_cohorts), 2): + ts_div[i, j] = ts.divergence([subsets[i], subsets[j]], span_normalise=False) + ts_div[j, i] = ts_div[i, j] np.testing.assert_allclose(div, ts_div) -@pytest.mark.parametrize("size", [2, 3, 10, 100]) -def test_Fst(size): +@pytest.mark.parametrize( + "size, n_cohorts", + [(2, 2), (3, 2), (3, 3), (10, 2), (10, 3), (10, 4), (100, 2), (100, 3), (100, 4)], +) +def test_Fst(size, n_cohorts): ts = msprime.simulate(size, length=100, mutation_rate=0.05, random_seed=42) - subset_1 = ts.samples()[: ts.num_samples // 2] - subset_2 = ts.samples()[ts.num_samples // 2 :] - ds1 = ts_to_dataset(ts, subset_1) # type: ignore[no-untyped-call] - ds2 = ts_to_dataset(ts, subset_2) # type: ignore[no-untyped-call] - fst = Fst(ds1, ds2).compute() - ts_fst = ts.Fst([subset_1, subset_2]) + subsets = np.array_split(ts.samples(), n_cohorts) + ds = ts_to_dataset(ts) # type: ignore[no-untyped-call] + sample_cohorts = np.concatenate( + [np.full_like(subset, i) for i, subset in enumerate(subsets)] + ) + ds["sample_cohort"] = xr.DataArray(sample_cohorts, dims="samples") + cohort_names = [f"co_{i}" for i in range(n_cohorts)] + ds = ds.assign_coords({"cohorts_a": cohort_names, "cohorts_b": cohort_names}) + ds = Fst(ds) + fst = ds["stat_Fst"].values + + ts_fst = np.full([n_cohorts, n_cohorts], np.nan) + for i, j in itertools.combinations(range(n_cohorts), 2): + ts_fst[i, j] = ts.Fst([subsets[i], subsets[j]]) + ts_fst[j, i] = ts_fst[i, j] np.testing.assert_allclose(fst, ts_fst) @@ -70,6 +101,9 @@ def test_Fst(size): def test_Tajimas_D(size): ts = msprime.simulate(size, length=100, mutation_rate=0.05, random_seed=42) ds = ts_to_dataset(ts) # type: ignore[no-untyped-call] + sample_cohorts = np.full_like(ts.samples(), 0) + ds["sample_cohort"] = xr.DataArray(sample_cohorts, dims="samples") + ds = Tajimas_D(ds) + d = ds["stat_Tajimas_D"].compute() ts_d = ts.Tajimas_D() - d = Tajimas_D(ds).compute() np.testing.assert_allclose(d, ts_d) From 5f969fb005859d24b1ea9f871d90d5ba51b5a187 Mon Sep 17 00:00:00 2001 From: Tom White Date: Mon, 28 Sep 2020 11:44:24 +0100 Subject: [PATCH 2/8] Call two cohorts dimensions 'cohorts_0' and 'cohorts_1' --- sgkit/stats/popgen.py | 2 +- sgkit/tests/test_popgen.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/sgkit/stats/popgen.py b/sgkit/stats/popgen.py index 82340b0d7..dae992d14 100644 --- a/sgkit/stats/popgen.py +++ b/sgkit/stats/popgen.py @@ -127,7 +127,7 @@ def divergence( d_sum = d.sum(axis=0) assert_array_shape(d_sum, n_cohorts, n_cohorts) - new_ds = Dataset({"stat_divergence": (("cohorts_a", "cohorts_b"), d_sum)}) + new_ds = Dataset({"stat_divergence": (("cohorts_0", "cohorts_1"), d_sum)}) return conditional_merge_datasets(ds, new_ds, merge) diff --git a/sgkit/tests/test_popgen.py b/sgkit/tests/test_popgen.py index 181136fdd..80fb56c4a 100644 --- a/sgkit/tests/test_popgen.py +++ b/sgkit/tests/test_popgen.py @@ -62,7 +62,7 @@ def test_divergence(size, n_cohorts): ) ds["sample_cohort"] = xr.DataArray(sample_cohorts, dims="samples") cohort_names = [f"co_{i}" for i in range(n_cohorts)] - ds = ds.assign_coords({"cohorts_a": cohort_names, "cohorts_b": cohort_names}) + ds = ds.assign_coords({"cohorts_0": cohort_names, "cohorts_1": cohort_names}) ds = divergence(ds) div = ds["stat_divergence"].values @@ -86,7 +86,7 @@ def test_Fst(size, n_cohorts): ) ds["sample_cohort"] = xr.DataArray(sample_cohorts, dims="samples") cohort_names = [f"co_{i}" for i in range(n_cohorts)] - ds = ds.assign_coords({"cohorts_a": cohort_names, "cohorts_b": cohort_names}) + ds = ds.assign_coords({"cohorts_0": cohort_names, "cohorts_1": cohort_names}) ds = Fst(ds) fst = ds["stat_Fst"].values From a66f910303356167ca393f4897e3db915b7221bf Mon Sep 17 00:00:00 2001 From: Tom White Date: Thu, 1 Oct 2020 11:22:59 +0100 Subject: [PATCH 3/8] Change dataset variable accessors to use shorter form --- sgkit/stats/aggregation.py | 2 +- sgkit/tests/test_aggregation.py | 2 +- sgkit/tests/test_popgen.py | 8 ++++---- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/sgkit/stats/aggregation.py b/sgkit/stats/aggregation.py index d666097cb..692b23023 100644 --- a/sgkit/stats/aggregation.py +++ b/sgkit/stats/aggregation.py @@ -230,7 +230,7 @@ def count_cohort_alleles(ds: Dataset, merge: bool = True) -> Dataset: n_cohorts = SC.max().compute() + 1 # 0-based indexing C = da.empty(n_cohorts, dtype=np.uint8) - G = da.asarray(ds["call_genotype"]) + G = da.asarray(ds.call_genotype) shape = (G.chunks[0], n_cohorts, n_alleles) AC = da.map_blocks(_count_cohort_alleles, AC, SC, C, chunks=shape, dtype=np.int32) diff --git a/sgkit/tests/test_aggregation.py b/sgkit/tests/test_aggregation.py index b8aafa50a..add8db26a 100644 --- a/sgkit/tests/test_aggregation.py +++ b/sgkit/tests/test_aggregation.py @@ -221,7 +221,7 @@ def test_count_cohort_alleles__multi_variant_multi_sample(): ) ds["sample_cohort"] = xr.DataArray(np.array([0, 1, 1]), dims="samples") ds = count_cohort_alleles(ds) - ac = ds["cohort_allele_count"] + ac = ds.cohort_allele_count np.testing.assert_equal( ac, np.array( diff --git a/sgkit/tests/test_popgen.py b/sgkit/tests/test_popgen.py index 80fb56c4a..e7e264256 100644 --- a/sgkit/tests/test_popgen.py +++ b/sgkit/tests/test_popgen.py @@ -44,7 +44,7 @@ def test_diversity(size): ds["sample_cohort"] = xr.DataArray(sample_cohorts, dims="samples") ds = ds.assign_coords({"cohorts": ["co_0"]}) ds = diversity(ds) - div = ds["stat_diversity"].sel(cohorts="co_0").values + div = ds.stat_diversity.sel(cohorts="co_0").values ts_div = ts.diversity(span_normalise=False) np.testing.assert_allclose(div, ts_div) @@ -64,7 +64,7 @@ def test_divergence(size, n_cohorts): cohort_names = [f"co_{i}" for i in range(n_cohorts)] ds = ds.assign_coords({"cohorts_0": cohort_names, "cohorts_1": cohort_names}) ds = divergence(ds) - div = ds["stat_divergence"].values + div = ds.stat_divergence.values ts_div = np.full([n_cohorts, n_cohorts], np.nan) for i, j in itertools.combinations(range(n_cohorts), 2): @@ -88,7 +88,7 @@ def test_Fst(size, n_cohorts): cohort_names = [f"co_{i}" for i in range(n_cohorts)] ds = ds.assign_coords({"cohorts_0": cohort_names, "cohorts_1": cohort_names}) ds = Fst(ds) - fst = ds["stat_Fst"].values + fst = ds.stat_Fst.values ts_fst = np.full([n_cohorts, n_cohorts], np.nan) for i, j in itertools.combinations(range(n_cohorts), 2): @@ -104,6 +104,6 @@ def test_Tajimas_D(size): sample_cohorts = np.full_like(ts.samples(), 0) ds["sample_cohort"] = xr.DataArray(sample_cohorts, dims="samples") ds = Tajimas_D(ds) - d = ds["stat_Tajimas_D"].compute() + d = ds.stat_Tajimas_D.compute() ts_d = ts.Tajimas_D() np.testing.assert_allclose(d, ts_d) From 7fd35afabcfdc919111cdc9646cf0846c4b50686 Mon Sep 17 00:00:00 2001 From: Tom White Date: Thu, 1 Oct 2020 11:24:51 +0100 Subject: [PATCH 4/8] Remove spurious 'optional' description in doc --- sgkit/stats/aggregation.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/sgkit/stats/aggregation.py b/sgkit/stats/aggregation.py index 692b23023..f83934808 100644 --- a/sgkit/stats/aggregation.py +++ b/sgkit/stats/aggregation.py @@ -93,7 +93,6 @@ def count_call_alleles(ds: Dataset, merge: bool = True) -> Dataset: Genotype call dataset such as from `sgkit.create_genotype_call_dataset`. merge - (optional) If True (the default), merge the input dataset and the computed output variables into a single dataset, otherwise return only the computed output variables. @@ -209,7 +208,6 @@ def count_cohort_alleles(ds: Dataset, merge: bool = True) -> Dataset: Genotype call dataset such as from `sgkit.create_genotype_call_dataset`. merge - (optional) If True (the default), merge the input dataset and the computed output variables into a single dataset, otherwise return only the computed output variables. From c414bb6533c22e646831ab868c50c42502b6a835 Mon Sep 17 00:00:00 2001 From: Tom White Date: Thu, 1 Oct 2020 11:26:35 +0100 Subject: [PATCH 5/8] Add float32 overload to _pairwise_sum gufunc --- sgkit/stats/popgen.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/sgkit/stats/popgen.py b/sgkit/stats/popgen.py index dae992d14..de70df99b 100644 --- a/sgkit/stats/popgen.py +++ b/sgkit/stats/popgen.py @@ -133,7 +133,10 @@ def divergence( # c = cohorts @guvectorize( # type: ignore - ["void(float64[:], float64[:,:])"], + [ + "void(float32[:], float32[:,:])", + "void(float64[:], float64[:,:])", + ], "(c)->(c,c)", nopython=True, ) From 0a735bed7df27544e57029d63e60816894a618f7 Mon Sep 17 00:00:00 2001 From: Tom White Date: Thu, 1 Oct 2020 12:19:23 +0100 Subject: [PATCH 6/8] Test chunking across variants --- sgkit/stats/aggregation.py | 2 +- sgkit/tests/test_popgen.py | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/sgkit/stats/aggregation.py b/sgkit/stats/aggregation.py index f83934808..35c69de4b 100644 --- a/sgkit/stats/aggregation.py +++ b/sgkit/stats/aggregation.py @@ -233,7 +233,7 @@ def count_cohort_alleles(ds: Dataset, merge: bool = True) -> Dataset: AC = da.map_blocks(_count_cohort_alleles, AC, SC, C, chunks=shape, dtype=np.int32) assert_array_shape( - AC, n_variants, n_cohorts * AC.numblocks[0], n_alleles * AC.numblocks[1] + AC, n_variants, n_cohorts * AC.numblocks[1], n_alleles * AC.numblocks[2] ) # Stack the blocks and sum across them diff --git a/sgkit/tests/test_popgen.py b/sgkit/tests/test_popgen.py index e7e264256..4f9398ddf 100644 --- a/sgkit/tests/test_popgen.py +++ b/sgkit/tests/test_popgen.py @@ -37,9 +37,11 @@ def ts_to_dataset(ts, samples=None): @pytest.mark.parametrize("size", [2, 3, 10, 100]) -def test_diversity(size): +@pytest.mark.parametrize("chunks", [(-1, -1), (10, -1)]) +def test_diversity(size, chunks): ts = msprime.simulate(size, length=100, mutation_rate=0.05, random_seed=42) ds = ts_to_dataset(ts) # type: ignore[no-untyped-call] + ds = ds.chunk(dict(zip(["variants", "samples"], chunks))) sample_cohorts = np.full_like(ts.samples(), 0) ds["sample_cohort"] = xr.DataArray(sample_cohorts, dims="samples") ds = ds.assign_coords({"cohorts": ["co_0"]}) From 80c1f107bdc89bac70fb532a562b709a7337d1d5 Mon Sep 17 00:00:00 2001 From: Tom White Date: Thu, 1 Oct 2020 14:48:53 +0100 Subject: [PATCH 7/8] Fix Fst usage in Getting Started guide --- docs/getting_started.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/getting_started.rst b/docs/getting_started.rst index df4ccb99a..2eac3e5e8 100644 --- a/docs/getting_started.rst +++ b/docs/getting_started.rst @@ -255,6 +255,7 @@ Sgkit functions are compatible with this idiom by default and this example shows Xarray and Pandas operations in a single pipeline: .. ipython:: python + :okwarning: import sgkit as sg ds = sg.simulate_genotype_call_dataset(n_variant=100, n_sample=50, missing_pct=.1) @@ -276,10 +277,9 @@ Xarray and Pandas operations in a single pipeline: # Assign a "cohort" variable that splits samples into two groups .assign(sample_cohort=np.repeat([0, 1], ds.dims['samples'] // 2)) # Compute Fst between the groups - # TODO: Refactor based on https://github.com/pystatgen/sgkit/pull/260 - .pipe(lambda ds: sg.Fst(*(g[1] for g in ds.groupby('sample_cohort')))) - # Extract the single Fst value from the resulting array - .item(0) + .pipe(sg.Fst) + # Extract the Fst values for cohort pairs + .stat_Fst.values ) This is possible because sgkit functions nearly always take a ``Dataset`` as the first argument, create new From 3c6bf4bf8977df106a006b88b10e74d0ae7b59a3 Mon Sep 17 00:00:00 2001 From: Tom White Date: Thu, 1 Oct 2020 14:54:22 +0100 Subject: [PATCH 8/8] Add warnings about not supporting datasets chunked by samples --- sgkit/stats/popgen.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/sgkit/stats/popgen.py b/sgkit/stats/popgen.py index de70df99b..aedd73c70 100644 --- a/sgkit/stats/popgen.py +++ b/sgkit/stats/popgen.py @@ -36,6 +36,11 @@ def diversity( Returns ------- diversity value. + + Warnings + -------- + This method does not currently support datasets that are chunked along the + samples dimension. """ if allele_counts not in ds: ds = count_cohort_alleles(ds) @@ -109,6 +114,11 @@ def divergence( Returns ------- divergence value between pairs of cohorts. + + Warnings + -------- + This method does not currently support datasets that are chunked along the + samples dimension. """ if allele_counts not in ds: @@ -173,6 +183,11 @@ def Fst( Returns ------- Fst value between pairs of cohorts. + + Warnings + -------- + This method does not currently support datasets that are chunked along the + samples dimension. """ if allele_counts not in ds: ds = count_cohort_alleles(ds) @@ -209,6 +224,10 @@ def Tajimas_D( ------- Tajimas' D value. + Warnings + -------- + This method does not currently support datasets that are chunked along the + samples dimension. """ if allele_counts not in ds: ds = count_variant_alleles(ds)