11# noqa: D100
22
33import logging
4- from typing import Optional
4+ from typing import Any , Dict , List , Optional
55
66import hail as hl
77
@@ -604,8 +604,6 @@ def add_grpMaxFAF95_v4(ht: hl.Table) -> hl.Table:
604604 """
605605 Add a grpMaxFAF95 struct with 'grpmax' and 'grpmax_gen_anc'.
606606
607- Also includes a jointGrpMaxFAF95 annotation using the v4 fafmax and joint_fafmax structures.
608-
609607 :param ht: Input hail table.
610608 :return: Annotated hail table.
611609 """
@@ -618,10 +616,6 @@ def add_grpMaxFAF95_v4(ht: hl.Table) -> hl.Table:
618616 grpmax = fafmax_field .faf95_max ,
619617 grpmax_gen_anc = fafmax_field .faf95_max_gen_anc ,
620618 ),
621- jointGrpMaxFAF95 = hl .struct (
622- grpmax = ht .joint_fafmax .faf95_max ,
623- grpmax_gen_anc = ht .joint_fafmax .faf95_max_gen_anc ,
624- ),
625619 )
626620 return ht
627621
@@ -637,23 +631,27 @@ def gnomad_gks(
637631 skip_checkpoint : bool = False ,
638632 skip_coverage : bool = False ,
639633 custom_coverage_ht : hl .Table = None ,
640- ) -> list :
634+ skip_joint_metrics : bool = False ,
635+ custom_joint_ht : hl .Table = None ,
636+ ) -> List [Dict [str , Any ]]:
641637 """
642638 Perform gnomad GKS annotations on a range of variants at once.
643639
644640 :param locus_interval: Hail IntervalExpression of locus<reference_genome>.
645641 e.g. hl.locus_interval('chr1', 6424776, 6461367, reference_genome="GRCh38")
646642 :param version: String of version of gnomAD release to use.
647- :param data_type: String of either "exomes" or "genomes" for the type of reads that are desired.
648- :param by_gen_anc_group: Boolean to pass to obtain frequency information for each cohort.
649- :param by_sex: Boolean to pass to return frequency information for each cohort split by chromosomal sex.
643+ :param data_type: String of either "exomes" or "genomes" for the type of reads that are desired. Default is "genomes".
644+ :param by_gen_anc_group: Boolean to pass to obtain frequency information for each cohort. Default is False.
645+ :param by_sex: Boolean to pass to return frequency information for each cohort split by chromosomal sex. Default is False.
650646 :param vrs_only: Boolean to pass for only VRS info to be returned
651- (will not include allele frequency information).
652- :param custom_ht: Table to use instead of what public_release() method would return for the version.
647+ (will not include allele frequency information). Default is False.
648+ :param custom_ht: Table to use instead of what public_release() method would return for the version. Default is None.
653649 :param skip_checkpoint: Bool to pass to skip checkpointing selected fields
654- (checkpointing may be desirable for large datasets by reducing data copies across the cluster).
655- :param skip_coverage: Bool to pass to skip adding coverage statistics.
656- :param custom_coverage_ht: Custom table to use for coverage statistics instead of the release coverage table.
650+ (checkpointing may be desirable for large datasets by reducing data copies across the cluster). Default is False.
651+ :param skip_coverage: Bool to pass to skip adding coverage statistics. Default is False.
652+ :param custom_coverage_ht: Custom table to use for coverage statistics instead of the release coverage table. Default is None.
653+ :param skip_joint_metrics: Bool to pass to skip adding joint FAF metrics from the joint table. Default is False.
654+ :param custom_joint_ht: Custom joint table to use instead of the public release joint table. Default is None.
657655 :return: List of dictionaries containing VRS information
658656 (and freq info split by ancestry groups and sex if desired) for specified variant.
659657 """
@@ -664,6 +662,11 @@ def gnomad_gks(
664662 "gnomad_gks() is currently only implemented for gnomAD v4."
665663 )
666664
665+ if data_type not in ["exomes" , "genomes" ]:
666+ raise NotImplementedError (
667+ "gnomad_gks() is currently only implemented for gnomAD exomes and genomes."
668+ )
669+
667670 # Read public_release table if no custom table provided.
668671 if custom_ht :
669672 ht = custom_ht
@@ -693,10 +696,28 @@ def gnomad_gks(
693696 ht = ht .annotate (mean_depth = coverage_ht [ht .locus ].mean )
694697 ht = ht .annotate (fraction_cov_over_20 = coverage_ht [ht .locus ].over_20 )
695698
696- # Retrieve genetic ancestry groups from the imported GEN_ANC_NAMES dictionary.
697- grps_list = (
698- list (GEN_ANC_NAMES [high_level_version ][data_type ]) if by_gen_anc_group else None
699- )
699+ # Join joint table to add joint FAF metrics if requested.
700+ if not skip_joint_metrics :
701+ if custom_joint_ht :
702+ joint_ht = custom_joint_ht
703+ else :
704+ joint_ht = hl .read_table (public_release ("joint" ).versions [version ].path )
705+ ht = ht .annotate (
706+ jointGrpMaxFAF95 = hl .struct (
707+ grpmax = joint_ht [ht .key ].joint .fafmax .faf95_max ,
708+ grpmax_gen_anc = joint_ht [ht .key ].joint .fafmax .faf95_max_gen_anc ,
709+ )
710+ )
711+
712+ # Determine which genetic ancestry groups to include (list of group IDs).
713+ if by_gen_anc_group :
714+ if data_type not in GEN_ANC_GROUPS [high_level_version ]:
715+ raise NotImplementedError (
716+ f"No genetic ancestry group list configured for { high_level_version } { data_type } ."
717+ )
718+ grps_list = list (GEN_ANC_GROUPS [high_level_version ][data_type ])
719+ else :
720+ grps_list = None
700721
701722 # Throw warnings if contradictory arguments are passed.
702723 if by_gen_anc_group and vrs_only :
@@ -760,7 +781,9 @@ def gnomad_gks(
760781 # then return list
761782 outputs = []
762783 for variant in variant_list :
763- vrs_variant = add_gks_vrs (variant .locus , variant .vrs )
784+ # After the update to ga4gh.vrs 2.2.0, we now return
785+ # information for the reference as well as alternate allele
786+ vrs_variants = add_gks_vrs (variant .locus , variant .vrs )
764787
765788 out = {
766789 "locus" : {
@@ -769,7 +792,7 @@ def gnomad_gks(
769792 "reference_genome" : variant .locus .reference_genome .name ,
770793 },
771794 "alleles" : variant .alleles ,
772- "gks_vrs_variant " : vrs_variant ,
795+ "gks_vrs_variants " : vrs_variants ,
773796 }
774797
775798 if not vrs_only :
@@ -784,7 +807,7 @@ def gnomad_gks(
784807 )
785808
786809 # Assign existing VRS information to "focusAllele" key
787- va_freq_dict ["focusAllele" ] = vrs_variant
810+ va_freq_dict ["focusAllele" ] = vrs_variants [ 1 ]
788811 out ["gks_va_freq" ] = va_freq_dict
789812
790813 # Append variant dictionary to list of outputs
0 commit comments