Skip to content
Open
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
122 changes: 109 additions & 13 deletions src/stats.rs
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,57 @@ pub trait GlobalStatistic {
fn as_raw(&self) -> f64;
}

/// A [`GlobalStatistic`], with the following additional guarantees:
/// - A "component" of the statistic is computable for each site.
/// - The computation of a component can be done from site data alone and no other context.
/// - The statistic can be computed over some number of sites by performing some composition operation
/// over zero or more components.
/// - This composition remains correct under reordering (commutation and association).
///
/// It is also possible to implement this trait if a sufficient *portion* of computation can be done
/// under the above conditions.
/// Such types can use [`GlobalStatistic::as_raw`] to perform inexpensive finalizing computations
/// if needed.
/// For an example of this use case, see [`TajimaD`].
///
/// Types which implement this trait may also use it to easily implement [`GlobalStatistic::add_site`]:
/// ```
/// use popgen::iter::SiteCounts;
/// use popgen::stats::{GlobalStatistic, SiteComposable};
///
/// // a very simple statistic
/// #[derive(Default)] // to define the result over 0 sites
/// struct NumSites(u64);
///
/// impl SiteComposable for NumSites {
/// type Component = u64;
///
/// fn component_from(site: SiteCounts) -> Self::Component {
/// 1
/// }
///
/// fn add_component(&mut self, component: Self::Component) {
/// self.0 += component;
/// }
/// }
///
/// impl GlobalStatistic for NumSites {
/// fn add_site(&mut self, site: SiteCounts) {
/// self.add_component(Self::component_from(site))
/// }
///
/// fn as_raw(&self) -> f64 {
/// self.0 as f64
/// }
/// }
/// ```
pub trait SiteComposable: Default + GlobalStatistic {
type Component;

fn component_from(site: SiteCounts) -> Self::Component;
fn add_component(&mut self, component: Self::Component);
}

/// The expected number of differences between two samples over all sites, the "expected pairwise diversity".
///
/// This is the sum of [`Pi`] over all sites.
Expand All @@ -37,6 +88,18 @@ pub struct GlobalPi(f64);

impl GlobalStatistic for GlobalPi {
fn add_site(&mut self, site: SiteCounts) {
self.add_component(Self::component_from(site))
}

fn as_raw(&self) -> f64 {
self.0
}
}

impl SiteComposable for GlobalPi {
type Component = f64;

fn component_from(site: SiteCounts) -> f64 {
// technically should divide both by two here and below but it cancels out
let num_pairs = {
let count: i64 = site.counts.iter().sum();
Expand All @@ -45,12 +108,11 @@ impl GlobalStatistic for GlobalPi {

// the number of pairs where the two samples are homozygous, summed over every genotype
let num_homozygous_pairs: Count = site.counts.iter().map(|count| count * (count - 1)).sum();

self.0 += 1f64 - (num_homozygous_pairs as f64 / num_pairs as f64)
1f64 - (num_homozygous_pairs as f64 / num_pairs as f64)
}

fn as_raw(&self) -> f64 {
self.0
fn add_component(&mut self, component: Self::Component) {
self.0 += component;
}
}

Expand All @@ -67,6 +129,18 @@ pub struct WattersonTheta(f64);

impl GlobalStatistic for WattersonTheta {
fn add_site(&mut self, site: SiteCounts) {
self.add_component(Self::component_from(site))
}

fn as_raw(&self) -> f64 {
self.0
}
}

impl SiteComposable for WattersonTheta {
type Component = f64;

fn component_from(site: SiteCounts) -> Self::Component {
// trying our very hardest to encourage optimization and SIMD here
// also optimizing with the typical two-element slice in mind
let mut iter = site.counts.chunks_exact(2);
Expand All @@ -88,14 +162,15 @@ impl GlobalStatistic for WattersonTheta {

if num_variants > 1 {
let harmonic = (1..total_samples).map(|i| 1f64 / i as f64).sum::<f64>();
self.0 += (num_variants - 1) as f64 / harmonic;
(num_variants - 1) as f64 / harmonic
} else {
// then this site isn't actually polymorphic; meh
0f64
}
}

fn as_raw(&self) -> f64 {
self.0
fn add_component(&mut self, component: Self::Component) {
self.0 += component;
}
}

Expand All @@ -111,12 +186,7 @@ pub struct TajimaD {

impl GlobalStatistic for TajimaD {
fn add_site(&mut self, site: SiteCounts) {
self.k_hat.add_site(site.clone());
self.theta.add_site(site.clone());

self.num_sites += 1;
// this is not perfect but that's fine
self.num_samples = max(self.num_samples, site.total_alleles as usize);
self.add_component(Self::component_from(site))
}

fn as_raw(&self) -> f64 {
Expand Down Expand Up @@ -159,6 +229,32 @@ impl GlobalStatistic for TajimaD {
}
}

impl SiteComposable for TajimaD {
// (k_hat, theta, total_alleles)
type Component = (f64, f64, usize);

fn component_from(site: SiteCounts) -> Self::Component {
let k_hat_component = GlobalPi::component_from(site.clone());
let theta_component = WattersonTheta::component_from(site.clone());

(
k_hat_component,
theta_component,
site.total_alleles() as usize,
)
}

fn add_component(&mut self, component: Self::Component) {
let (k_hat_component, theta_component, total_alleles) = component;
self.k_hat.add_component(k_hat_component);
self.theta.add_component(theta_component);

self.num_sites += 1;
// this is not perfect but that's fine
self.num_samples = max(self.num_samples, total_alleles);
}
}

/// Fixation statistics as in [Charlesworth (1998)](https://doi.org/10.1093/oxfordjournals.molbev.a025953).
///
/// Construction of this type from an arbitrary collection of [`MultiSiteCounts`] is not sound,
Expand Down
Loading