Improve docs+dtype handling and remove cachetools

eric-czech · eric-czech · commit 0dc57afce6ae · 2020-10-09T06:08:18.000-04:00
diff --git a/requirements-dev.txt b/requirements-dev.txt
@@ -14,7 +14,6 @@ partd
 fsspec
 bed-reader
 git+https://github.com/pangeo-data/rechunker.git
-cachetools
 cbgen
 cyvcf2; platform_system != "Windows"
 yarl
diff --git a/setup.cfg b/setup.cfg
@@ -59,7 +59,6 @@ vcf =
     yarl
 bgen =
     git+https://github.com/pangeo-data/rechunker.git
-    cachetools
     cbgen
 
 [coverage:report]
@@ -125,8 +124,6 @@ ignore_missing_imports = True
 ignore_missing_imports = True
 [mypy-rechunker.*]
 ignore_missing_imports = True
-[mypy-cachetools.*]
-ignore_missing_imports = True
 [mypy-bed_reader.*]
 ignore_missing_imports = True
 [mypy-sphinx.*]
diff --git a/sgkit/io/bgen/bgen_reader.py b/sgkit/io/bgen/bgen_reader.py
@@ -2,7 +2,6 @@
 import logging
 import tempfile
 from pathlib import Path
-from threading import RLock
 from typing import (
     Any,
     Dict,
@@ -22,7 +21,6 @@
 import pandas as pd
 import xarray as xr
 import zarr
-from cachetools import LRUCache, cached
 from cbgen import bgen_file, bgen_metafile
 from rechunker import api as rechunker_api
 from xarray import Dataset
@@ -85,7 +83,8 @@ def __init__(
                 self.partition_size = mf.partition_size
 
         self.shape = (self.n_variants, self.n_samples, 3)
-        self.dtype = dtype
+        self.dtype = np.dtype(dtype)
+        self.precision = 64 if self.dtype.itemsize >= 8 else 32
         self.ndim = 3
 
     def __getitem__(self, idx: Any) -> np.ndarray:
@@ -135,7 +134,7 @@ def __getitem__(self, idx: Any) -> np.ndarray:
         with bgen_file(self.path) as bgen:
             res = None
             for i, vaddr in enumerate(all_vaddr):
-                probs = bgen.read_probability(vaddr, precision=32)[idx[1]]
+                probs = bgen.read_probability(vaddr, precision=self.precision)[idx[1]]
                 assert len(probs.shape) == 2 and probs.shape[1] == 3
                 if res is None:
                     res = np.zeros((len(all_vaddr), len(probs), 3), dtype=self.dtype)
@@ -144,10 +143,6 @@ def __getitem__(self, idx: Any) -> np.ndarray:
             return np.squeeze(res, axis=squeeze_dims)
 
 
-cache = LRUCache(maxsize=3)
-lock = RLock()
-
-
 def _split_alleles(allele_ids: bytes) -> List[bytes]:
     alleles = allele_ids.split(b",")
     if len(alleles) != 2:
@@ -157,7 +152,6 @@ def _split_alleles(allele_ids: bytes) -> List[bytes]:
     return alleles
 
 
-@cached(cache, lock=lock)  # type: ignore[misc]
 def _read_metafile_partition(path: Path, partition: int) -> pd.DataFrame:
     with bgen_metafile(path) as mf:
         part = mf.read_partition(partition)
@@ -243,17 +237,42 @@ def read_bgen(
         be read multiple times when False.
     contig_dtype
         Data type for contig names, by default "str".
-        This may be an integer type, but this will fail if any of the contig names cannot be
-        converted to integers.
+        This may also be an integer type (e.g. "int"), but will fail if any of the contig names
+        cannot be converted to integers.
     gp_dtype
         Data type for genotype probabilities, by default "float32".
 
     Warnings
     --------
     Only bi-allelic, diploid BGEN files are currently supported.
+
+    Returns
+    -------
+    A dataset containing the following variables:
+
+    - :data:`sgkit.variables.variant_id` (variants)
+    - :data:`sgkit.variables.variant_contig` (variants)
+    - :data:`sgkit.variables.variant_position` (variants)
+    - :data:`sgkit.variables.variant_allele` (variants)
+    - :data:`sgkit.variables.sample_id` (samples)
+    - :data:`sgkit.variables.call_dosage` (variants, samples)
+    - :data:`sgkit.variables.call_dosage_mask` (variants, samples)
+    - :data:`sgkit.variables.call_genotype_probability` (variants, samples, genotypes)
+    - :data:`sgkit.variables.call_genotype_probability_mask` (variants, samples, genotypes)
+
     """
     if isinstance(chunks, tuple) and len(chunks) != 3:
-        raise ValueError(f"Chunks must be tuple with 3 items, not {chunks}")
+        raise ValueError(f"`chunks` must be tuple with 3 items, not {chunks}")
+    if not np.issubdtype(gp_dtype, np.floating):
+        raise ValueError(
+            f"`gp_dtype` must be a floating point data type, not {gp_dtype}"
+        )
+    if not np.issubdtype(contig_dtype, np.integer) and np.dtype(
+        contig_dtype
+    ).kind not in {"U", "S"}:
+        raise ValueError(
+            f"`contig_dtype` must be of string or int type, not {contig_dtype}"
+        )
 
     path = Path(path)
     sample_path = Path(sample_path) if sample_path else path.with_suffix(".sample")
diff --git a/sgkit/tests/io/bgen/test_bgen_reader.py b/sgkit/tests/io/bgen/test_bgen_reader.py
@@ -116,6 +116,35 @@ def test_read_bgen__gp_dtype(shared_datadir, dtype):
     assert ds["call_dosage"].dtype == dtype
 
 
+@pytest.mark.parametrize("dtype", ["c8", "i8", "str"])
+def test_read_bgen__invalid_gp_dtype(shared_datadir, dtype):
+    path = shared_datadir / "example.bgen"
+    with pytest.raises(
+        ValueError, match="`gp_dtype` must be a floating point data type"
+    ):
+        read_bgen(path, gp_dtype=dtype)
+
+
+@pytest.mark.parametrize("dtype", ["U", "S", "u1", "u2", "i8", "int"])
+def test_read_bgen__contig_dtype(shared_datadir, dtype):
+    path = shared_datadir / "example.bgen"
+    ds = read_bgen(path, contig_dtype=dtype)
+    dtype = np.dtype(dtype)
+    if dtype.kind in {"U", "S"}:
+        assert ds["variant_contig"].dtype == np.int64
+    else:
+        assert ds["variant_contig"].dtype == dtype
+
+
+@pytest.mark.parametrize("dtype", ["c8", "M", "f4"])
+def test_read_bgen__invalid_contig_dtype(shared_datadir, dtype):
+    path = shared_datadir / "example.bgen"
+    with pytest.raises(
+        ValueError, match="`contig_dtype` must be of string or int type"
+    ):
+        read_bgen(path, contig_dtype=dtype)
+
+
 @pytest.mark.parametrize("chunks", CHUNKS)
 def test_read_bgen__fancy_index(shared_datadir, chunks):
     path = shared_datadir / "example.bgen"
@@ -165,7 +194,7 @@ def test_split_alleles__raise_on_multiallelic():
 
 def test_read_bgen__invalid_chunks(shared_datadir):
     path = shared_datadir / "example.bgen"
-    with pytest.raises(ValueError, match="Chunks must be tuple with 3 items"):
+    with pytest.raises(ValueError, match="`chunks` must be tuple with 3 items"):
         read_bgen(path, chunks=(100, -1))  # type: ignore[arg-type]
 
 
diff --git a/sgkit/variables.py b/sgkit/variables.py
@@ -303,9 +303,13 @@ def _check_field(
 )
 """The number of samples with heterozygous calls."""
 variant_contig, variant_contig_spec = SgkitVariables.register_variable(
-    ArrayLikeSpec("variant_contig", kind="i", ndim=1)
+    ArrayLikeSpec("variant_contig", kind={"i", "u"}, ndim=1)
 )
-"""The (index of the) contig for each variant."""
+"""
+Index corresponding to contig name for each variant. In some less common scenarios,
+this may also be equivalent to the contig names if the data generating process used
+contig names that were also integers.
+"""
 variant_hwe_p_value, variant_hwe_p_value_spec = SgkitVariables.register_variable(
     ArrayLikeSpec("variant_hwe_p_value", kind="f")
 )

Original file line number	Diff line number	Diff line change
`@@ -303,9 +303,13 @@ def _check_field(`
`303`	`303`	`)`
`304`	`304`	`"""The number of samples with heterozygous calls."""`
`305`	`305`	`variant_contig, variant_contig_spec = SgkitVariables.register_variable(`
`306`		`- ArrayLikeSpec("variant_contig", kind="i", ndim=1)`
	`306`	`+ ArrayLikeSpec("variant_contig", kind={"i", "u"}, ndim=1)`
`307`	`307`	`)`
`308`		`-"""The (index of the) contig for each variant."""`
	`308`	`+"""`
	`309`	`+Index corresponding to contig name for each variant. In some less common scenarios,`
	`310`	`+this may also be equivalent to the contig names if the data generating process used`
	`311`	`+contig names that were also integers.`
	`312`	`+"""`
`309`	`313`	`variant_hwe_p_value, variant_hwe_p_value_spec = SgkitVariables.register_variable(`
`310`	`314`	`ArrayLikeSpec("variant_hwe_p_value", kind="f")`
`311`	`315`	`)`