Skip to content

Commit 2274196

Browse files
authored
Update gnomad_gks methods for VRS 2.0.1 and CohortAlleleFrequencyStudyResult 1.0.1 (#818)
1 parent 6ad4625 commit 2274196

File tree

4 files changed

+717
-161
lines changed

4 files changed

+717
-161
lines changed

gnomad/resources/grch38/gnomad.py

Lines changed: 46 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
# noqa: D100
22

33
import logging
4-
from typing import Optional
4+
from typing import Any, Dict, List, Optional
55

66
import hail as hl
77

@@ -604,8 +604,6 @@ def add_grpMaxFAF95_v4(ht: hl.Table) -> hl.Table:
604604
"""
605605
Add a grpMaxFAF95 struct with 'grpmax' and 'grpmax_gen_anc'.
606606
607-
Also includes a jointGrpMaxFAF95 annotation using the v4 fafmax and joint_fafmax structures.
608-
609607
:param ht: Input hail table.
610608
:return: Annotated hail table.
611609
"""
@@ -618,10 +616,6 @@ def add_grpMaxFAF95_v4(ht: hl.Table) -> hl.Table:
618616
grpmax=fafmax_field.faf95_max,
619617
grpmax_gen_anc=fafmax_field.faf95_max_gen_anc,
620618
),
621-
jointGrpMaxFAF95=hl.struct(
622-
grpmax=ht.joint_fafmax.faf95_max,
623-
grpmax_gen_anc=ht.joint_fafmax.faf95_max_gen_anc,
624-
),
625619
)
626620
return ht
627621

@@ -637,23 +631,27 @@ def gnomad_gks(
637631
skip_checkpoint: bool = False,
638632
skip_coverage: bool = False,
639633
custom_coverage_ht: hl.Table = None,
640-
) -> list:
634+
skip_joint_metrics: bool = False,
635+
custom_joint_ht: hl.Table = None,
636+
) -> List[Dict[str, Any]]:
641637
"""
642638
Perform gnomad GKS annotations on a range of variants at once.
643639
644640
:param locus_interval: Hail IntervalExpression of locus<reference_genome>.
645641
e.g. hl.locus_interval('chr1', 6424776, 6461367, reference_genome="GRCh38")
646642
:param version: String of version of gnomAD release to use.
647-
:param data_type: String of either "exomes" or "genomes" for the type of reads that are desired.
648-
:param by_gen_anc_group: Boolean to pass to obtain frequency information for each cohort.
649-
:param by_sex: Boolean to pass to return frequency information for each cohort split by chromosomal sex.
643+
:param data_type: String of either "exomes" or "genomes" for the type of reads that are desired. Default is "genomes".
644+
:param by_gen_anc_group: Boolean to pass to obtain frequency information for each cohort. Default is False.
645+
:param by_sex: Boolean to pass to return frequency information for each cohort split by chromosomal sex. Default is False.
650646
:param vrs_only: Boolean to pass for only VRS info to be returned
651-
(will not include allele frequency information).
652-
:param custom_ht: Table to use instead of what public_release() method would return for the version.
647+
(will not include allele frequency information). Default is False.
648+
:param custom_ht: Table to use instead of what public_release() method would return for the version. Default is None.
653649
:param skip_checkpoint: Bool to pass to skip checkpointing selected fields
654-
(checkpointing may be desirable for large datasets by reducing data copies across the cluster).
655-
:param skip_coverage: Bool to pass to skip adding coverage statistics.
656-
:param custom_coverage_ht: Custom table to use for coverage statistics instead of the release coverage table.
650+
(checkpointing may be desirable for large datasets by reducing data copies across the cluster). Default is False.
651+
:param skip_coverage: Bool to pass to skip adding coverage statistics. Default is False.
652+
:param custom_coverage_ht: Custom table to use for coverage statistics instead of the release coverage table. Default is None.
653+
:param skip_joint_metrics: Bool to pass to skip adding joint FAF metrics from the joint table. Default is False.
654+
:param custom_joint_ht: Custom joint table to use instead of the public release joint table. Default is None.
657655
:return: List of dictionaries containing VRS information
658656
(and freq info split by ancestry groups and sex if desired) for specified variant.
659657
"""
@@ -664,6 +662,11 @@ def gnomad_gks(
664662
"gnomad_gks() is currently only implemented for gnomAD v4."
665663
)
666664

665+
if data_type not in ["exomes", "genomes"]:
666+
raise NotImplementedError(
667+
"gnomad_gks() is currently only implemented for gnomAD exomes and genomes."
668+
)
669+
667670
# Read public_release table if no custom table provided.
668671
if custom_ht:
669672
ht = custom_ht
@@ -693,10 +696,28 @@ def gnomad_gks(
693696
ht = ht.annotate(mean_depth=coverage_ht[ht.locus].mean)
694697
ht = ht.annotate(fraction_cov_over_20=coverage_ht[ht.locus].over_20)
695698

696-
# Retrieve genetic ancestry groups from the imported GEN_ANC_NAMES dictionary.
697-
grps_list = (
698-
list(GEN_ANC_NAMES[high_level_version][data_type]) if by_gen_anc_group else None
699-
)
699+
# Join joint table to add joint FAF metrics if requested.
700+
if not skip_joint_metrics:
701+
if custom_joint_ht:
702+
joint_ht = custom_joint_ht
703+
else:
704+
joint_ht = hl.read_table(public_release("joint").versions[version].path)
705+
ht = ht.annotate(
706+
jointGrpMaxFAF95=hl.struct(
707+
grpmax=joint_ht[ht.key].joint.fafmax.faf95_max,
708+
grpmax_gen_anc=joint_ht[ht.key].joint.fafmax.faf95_max_gen_anc,
709+
)
710+
)
711+
712+
# Determine which genetic ancestry groups to include (list of group IDs).
713+
if by_gen_anc_group:
714+
if data_type not in GEN_ANC_GROUPS[high_level_version]:
715+
raise NotImplementedError(
716+
f"No genetic ancestry group list configured for {high_level_version} {data_type}."
717+
)
718+
grps_list = list(GEN_ANC_GROUPS[high_level_version][data_type])
719+
else:
720+
grps_list = None
700721

701722
# Throw warnings if contradictory arguments are passed.
702723
if by_gen_anc_group and vrs_only:
@@ -760,7 +781,9 @@ def gnomad_gks(
760781
# then return list
761782
outputs = []
762783
for variant in variant_list:
763-
vrs_variant = add_gks_vrs(variant.locus, variant.vrs)
784+
# After the update to ga4gh.vrs 2.2.0, we now return
785+
# information for the reference as well as alternate allele
786+
vrs_variants = add_gks_vrs(variant.locus, variant.vrs)
764787

765788
out = {
766789
"locus": {
@@ -769,7 +792,7 @@ def gnomad_gks(
769792
"reference_genome": variant.locus.reference_genome.name,
770793
},
771794
"alleles": variant.alleles,
772-
"gks_vrs_variant": vrs_variant,
795+
"gks_vrs_variants": vrs_variants,
773796
}
774797

775798
if not vrs_only:
@@ -784,7 +807,7 @@ def gnomad_gks(
784807
)
785808

786809
# Assign existing VRS information to "focusAllele" key
787-
va_freq_dict["focusAllele"] = vrs_variant
810+
va_freq_dict["focusAllele"] = vrs_variants[1]
788811
out["gks_va_freq"] = va_freq_dict
789812

790813
# Append variant dictionary to list of outputs

0 commit comments

Comments
 (0)