Skip to content

Sample summary stats #29 #417

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Dec 3, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions docs/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ Methods
gwas_linear_regression
hardy_weinberg_test
regenie
sample_stats
variant_stats
Tajimas_D
pc_relate
Expand Down Expand Up @@ -88,7 +89,13 @@ Variables
variables.loco_prediction_spec
variables.meta_prediction_spec
variables.pc_relate_phi_spec
variables.sample_call_rate_spec
variables.sample_id_spec
variables.sample_n_called_spec
variables.sample_n_het_spec
variables.sample_n_hom_alt_spec
variables.sample_n_hom_ref_spec
variables.sample_n_non_ref_spec
variables.sample_pcs_spec
variables.stat_divergence_spec
variables.stat_diversity_spec
Expand Down
2 changes: 2 additions & 0 deletions sgkit/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
count_call_alleles,
count_cohort_alleles,
count_variant_alleles,
sample_stats,
variant_stats,
)
from .stats.association import gwas_linear_regression
Expand Down Expand Up @@ -48,6 +49,7 @@
"read_vcfzarr",
"regenie",
"hardy_weinberg_test",
"sample_stats",
"variant_stats",
"diversity",
"divergence",
Expand Down
72 changes: 72 additions & 0 deletions sgkit/stats/aggregation.py
Original file line number Diff line number Diff line change
Expand Up @@ -470,3 +470,75 @@ def variant_stats(
]
)
return conditional_merge_datasets(ds, variables.validate(new_ds), merge)


def sample_stats(
ds: Dataset,
*,
call_genotype_mask: Hashable = variables.call_genotype_mask,
call_genotype: Hashable = variables.call_genotype,
variant_allele_count: Hashable = variables.variant_allele_count,
merge: bool = True,
) -> Dataset:
"""Compute quality control sample statistics from genotype calls.

Parameters
----------
ds
Dataset containing genotype calls.
call_genotype
Input variable name holding call_genotype.
Defined by :data:`sgkit.variables.call_genotype_spec`.
Must be present in ``ds``.
call_genotype_mask
Input variable name holding call_genotype_mask.
Defined by :data:`sgkit.variables.call_genotype_mask_spec`
Must be present in ``ds``.
variant_allele_count
Input variable name holding variant_allele_count,
as defined by :data:`sgkit.variables.variant_allele_count_spec`.
If the variable is not present in ``ds``, it will be computed
using :func:`count_variant_alleles`.
merge
If True (the default), merge the input dataset and the computed
output variables into a single dataset, otherwise return only
the computed output variables.
See :ref:`dataset_merge` for more details.

Returns
-------
A dataset containing the following variables:

- :data:`sgkit.variables.sample_n_called_spec` (samples):
The number of variants with called genotypes.
- :data:`sgkit.variables.sample_call_rate_spec` (samples):
The fraction of variants with called genotypes.
- :data:`sgkit.variables.sample_n_het_spec` (samples):
The number of variants with heterozygous calls.
- :data:`sgkit.variables.sample_n_hom_ref_spec` (samples):
The number of variants with homozygous reference calls.
- :data:`sgkit.variables.sample_n_hom_alt_spec` (samples):
The number of variants with homozygous alternate calls.
- :data:`sgkit.variables.sample_n_non_ref_spec` (samples):
The number of variants that are not homozygous reference calls.
"""
variables.validate(
ds,
{
call_genotype: variables.call_genotype_spec,
call_genotype_mask: variables.call_genotype_mask_spec,
},
)
new_ds = xr.merge(
[
call_rate(ds, dim="variants", call_genotype_mask=call_genotype_mask),
count_genotypes(
ds,
dim="variants",
call_genotype=call_genotype,
call_genotype_mask=call_genotype_mask,
merge=False,
),
]
)
return conditional_merge_datasets(ds, variables.validate(new_ds), merge)
19 changes: 18 additions & 1 deletion sgkit/tests/test_aggregation.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
count_call_alleles,
count_cohort_alleles,
count_variant_alleles,
sample_stats,
variant_stats,
)
from sgkit.testing import simulate_genotype_call_dataset
Expand Down Expand Up @@ -263,7 +264,6 @@ def test_variant_stats(precompute_variant_allele_count):
np.testing.assert_equal(vs["variant_n_hom_alt"], np.array([0, 1, 0, 0]))
np.testing.assert_equal(vs["variant_n_het"], np.array([1, 1, 2, 0]))
np.testing.assert_equal(vs["variant_n_non_ref"], np.array([1, 2, 2, 0]))
np.testing.assert_equal(vs["variant_n_non_ref"], np.array([1, 2, 2, 0]))
np.testing.assert_equal(
vs["variant_allele_count"], np.array([[1, 1], [1, 3], [2, 2], [2, 0]])
)
Expand All @@ -272,3 +272,20 @@ def test_variant_stats(precompute_variant_allele_count):
vs["variant_allele_frequency"],
np.array([[0.5, 0.5], [0.25, 0.75], [0.5, 0.5], [1, 0]]),
)


@pytest.mark.parametrize("precompute_variant_allele_count", [False, True])
def test_sample_stats(precompute_variant_allele_count):
ds = get_dataset(
[[[1, 0], [-1, -1]], [[1, 0], [1, 1]], [[0, 1], [1, 0]], [[-1, -1], [0, 0]]]
)
if precompute_variant_allele_count:
ds = count_variant_alleles(ds)
ss = sample_stats(ds)

np.testing.assert_equal(ss["sample_n_called"], np.array([3, 3]))
np.testing.assert_equal(ss["sample_call_rate"], np.array([0.75, 0.75]))
np.testing.assert_equal(ss["sample_n_hom_ref"], np.array([0, 1]))
np.testing.assert_equal(ss["sample_n_hom_alt"], np.array([0, 1]))
np.testing.assert_equal(ss["sample_n_het"], np.array([3, 1]))
np.testing.assert_equal(ss["sample_n_non_ref"], np.array([3, 2]))
24 changes: 24 additions & 0 deletions sgkit/variables.py
Original file line number Diff line number Diff line change
Expand Up @@ -263,10 +263,34 @@ def _check_field(
ArrayLikeSpec("pc_relate_phi", ndim=2, kind="f")
)
"""PC Relate kinship coefficient matrix."""
sample_call_rate, sample_call_rate_spec = SgkitVariables.register_variable(
ArrayLikeSpec("sample_call_rate", ndim=1, kind="f")
)
"""The fraction of variants with called genotypes."""
sample_id, sample_id_spec = SgkitVariables.register_variable(
ArrayLikeSpec("sample_id", kind={"S", "U", "O"}, ndim=1)
)
"""The unique identifier of the sample."""
sample_n_called, sample_n_called_spec = SgkitVariables.register_variable(
ArrayLikeSpec("sample_n_called", ndim=1, kind="i")
)
"""The number of variants with called genotypes."""
sample_n_het, sample_n_het_spec = SgkitVariables.register_variable(
ArrayLikeSpec("sample_n_het", ndim=1, kind="i")
)
"""The number of variants with heterozygous calls."""
sample_n_hom_alt, sample_n_hom_alt_spec = SgkitVariables.register_variable(
ArrayLikeSpec("sample_n_hom_alt", ndim=1, kind="i")
)
"""The number of variants with homozygous alternate calls."""
sample_n_hom_ref, sample_n_hom_ref_spec = SgkitVariables.register_variable(
ArrayLikeSpec("sample_n_hom_ref", ndim=1, kind="i")
)
"""The number of variants with homozygous reference calls."""
sample_n_non_ref, sample_n_non_ref_spec = SgkitVariables.register_variable(
ArrayLikeSpec("sample_n_non_ref", ndim=1, kind="i")
)
"""The number of variants that are not homozygous reference calls."""
sample_pcs, sample_pcs_spec = SgkitVariables.register_variable(
ArrayLikeSpec("sample_pcs", ndim=2, kind="f")
)
Expand Down