6
6
from numba import guvectorize
7
7
from xarray import Dataset
8
8
9
- from sgkit import to_haplotype_calls
10
9
from sgkit .stats .utils import assert_array_shape
11
10
from sgkit .typing import ArrayLike
12
11
from sgkit .utils import (
13
12
conditional_merge_datasets ,
14
13
define_variable_if_absent ,
15
- hash_columns ,
14
+ hash_array ,
16
15
)
17
16
from sgkit .window import has_windows , window_statistic
18
17
@@ -693,13 +692,13 @@ def pbs(
693
692
N_GARUD_H_STATS = 4 # H1, H12, H123, H2/H1
694
693
695
694
696
- def _Garud_h (k : ArrayLike ) -> ArrayLike :
695
+ def _Garud_h (haplotypes : ArrayLike ) -> ArrayLike :
697
696
# find haplotype counts (sorted in descending order)
698
- counts = sorted (collections .Counter (k .tolist ()).values (), reverse = True )
697
+ counts = sorted (collections .Counter (haplotypes .tolist ()).values (), reverse = True )
699
698
counts = np .array (counts )
700
699
701
700
# find haplotype frequencies
702
- n = k .shape [0 ]
701
+ n = haplotypes .shape [0 ]
703
702
f = counts / n
704
703
705
704
# compute H1
@@ -719,19 +718,20 @@ def _Garud_h(k: ArrayLike) -> ArrayLike:
719
718
720
719
721
720
def _Garud_h_cohorts (
722
- ht : ArrayLike , sample_cohort : ArrayLike , n_cohorts : int
721
+ gt : ArrayLike , sample_cohort : ArrayLike , n_cohorts : int
723
722
) -> ArrayLike :
724
- k = hash_columns (ht ) # hash haplotypes
723
+ # transpose to hash columns (haplotypes)
724
+ haplotypes = hash_array (gt .transpose ()).transpose ().flatten ()
725
725
arr = np .empty ((n_cohorts , N_GARUD_H_STATS ))
726
726
for c in range (n_cohorts ):
727
- arr [c , :] = _Garud_h (k [sample_cohort == c ])
727
+ arr [c , :] = _Garud_h (haplotypes [sample_cohort == c ])
728
728
return arr
729
729
730
730
731
731
def Garud_h (
732
732
ds : Dataset ,
733
733
* ,
734
- call_haplotype : Hashable = variables .call_haplotype ,
734
+ call_genotype : Hashable = variables .call_genotype ,
735
735
merge : bool = True ,
736
736
) -> Dataset :
737
737
"""Compute the H1, H12, H123 and H2/H1 statistics for detecting signatures
@@ -745,11 +745,10 @@ def Garud_h(
745
745
----------
746
746
ds
747
747
Genotype call dataset.
748
- call_haplotype
749
- Call haplotype variable to use or calculate. Defined by
750
- :data:`sgkit.variables.call_haplotype_spec`.
751
- If the variable is not present in ``ds``, it will be computed
752
- using :func:`to_haplotype_calls`.
748
+ call_genotype
749
+ Input variable name holding call_genotype as defined by
750
+ :data:`sgkit.variables.call_genotype_spec`.
751
+ Must be present in ``ds``.
753
752
merge
754
753
If True (the default), merge the input dataset and the computed
755
754
output variables into a single dataset, otherwise return only
@@ -814,12 +813,9 @@ def Garud_h(
814
813
if ds .dims ["ploidy" ] != 2 :
815
814
raise NotImplementedError ("Garud H only implemented for diploid genotypes" )
816
815
817
- ds = define_variable_if_absent (
818
- ds , variables .call_haplotype , call_haplotype , to_haplotype_calls
819
- )
820
- variables .validate (ds , {call_haplotype : variables .call_haplotype_spec })
816
+ variables .validate (ds , {call_genotype : variables .call_genotype_spec })
821
817
822
- ht = ds [call_haplotype ]
818
+ gt = ds [call_genotype ]
823
819
824
820
# convert sample cohorts to haplotype layout
825
821
sc = ds .sample_cohort .values
@@ -828,14 +824,13 @@ def Garud_h(
828
824
829
825
if has_windows (ds ):
830
826
gh = window_statistic (
831
- ht ,
832
- lambda ht : _Garud_h_cohorts (ht , hsc , n_cohorts ),
827
+ gt ,
828
+ lambda gt : _Garud_h_cohorts (gt , hsc , n_cohorts ),
833
829
ds .window_start .values ,
834
830
ds .window_stop .values ,
835
831
dtype = np .float64 ,
836
832
# first chunks dimension is windows, computed in window_statistic
837
833
chunks = (- 1 , n_cohorts , N_GARUD_H_STATS ),
838
- new_axis = 2 , # 2d -> 3d
839
834
)
840
835
n_windows = ds .window_start .shape [0 ]
841
836
assert_array_shape (gh , n_windows , n_cohorts , N_GARUD_H_STATS )
@@ -861,9 +856,9 @@ def Garud_h(
861
856
)
862
857
else :
863
858
# TODO: note this materializes all the data, so windowless should be discouraged/not supported
864
- ht = ht .values
859
+ gt = gt .values
865
860
866
- gh = _Garud_h_cohorts (ht , sample_cohort = hsc , n_cohorts = n_cohorts )
861
+ gh = _Garud_h_cohorts (gt , sample_cohort = hsc , n_cohorts = n_cohorts )
867
862
assert_array_shape (gh , n_cohorts , N_GARUD_H_STATS )
868
863
869
864
new_ds = Dataset (
0 commit comments