Skip to content

Commit 0dc57af

Browse files
committed
Improve docs+dtype handling and remove cachetools
1 parent 53ca999 commit 0dc57af

File tree

5 files changed

+67
-19
lines changed

5 files changed

+67
-19
lines changed

requirements-dev.txt

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,6 @@ partd
1414
fsspec
1515
bed-reader
1616
git+https://github.com/pangeo-data/rechunker.git
17-
cachetools
1817
cbgen
1918
cyvcf2; platform_system != "Windows"
2019
yarl

setup.cfg

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,6 @@ vcf =
5959
yarl
6060
bgen =
6161
git+https://github.com/pangeo-data/rechunker.git
62-
cachetools
6362
cbgen
6463

6564
[coverage:report]
@@ -125,8 +124,6 @@ ignore_missing_imports = True
125124
ignore_missing_imports = True
126125
[mypy-rechunker.*]
127126
ignore_missing_imports = True
128-
[mypy-cachetools.*]
129-
ignore_missing_imports = True
130127
[mypy-bed_reader.*]
131128
ignore_missing_imports = True
132129
[mypy-sphinx.*]

sgkit/io/bgen/bgen_reader.py

Lines changed: 31 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@
22
import logging
33
import tempfile
44
from pathlib import Path
5-
from threading import RLock
65
from typing import (
76
Any,
87
Dict,
@@ -22,7 +21,6 @@
2221
import pandas as pd
2322
import xarray as xr
2423
import zarr
25-
from cachetools import LRUCache, cached
2624
from cbgen import bgen_file, bgen_metafile
2725
from rechunker import api as rechunker_api
2826
from xarray import Dataset
@@ -85,7 +83,8 @@ def __init__(
8583
self.partition_size = mf.partition_size
8684

8785
self.shape = (self.n_variants, self.n_samples, 3)
88-
self.dtype = dtype
86+
self.dtype = np.dtype(dtype)
87+
self.precision = 64 if self.dtype.itemsize >= 8 else 32
8988
self.ndim = 3
9089

9190
def __getitem__(self, idx: Any) -> np.ndarray:
@@ -135,7 +134,7 @@ def __getitem__(self, idx: Any) -> np.ndarray:
135134
with bgen_file(self.path) as bgen:
136135
res = None
137136
for i, vaddr in enumerate(all_vaddr):
138-
probs = bgen.read_probability(vaddr, precision=32)[idx[1]]
137+
probs = bgen.read_probability(vaddr, precision=self.precision)[idx[1]]
139138
assert len(probs.shape) == 2 and probs.shape[1] == 3
140139
if res is None:
141140
res = np.zeros((len(all_vaddr), len(probs), 3), dtype=self.dtype)
@@ -144,10 +143,6 @@ def __getitem__(self, idx: Any) -> np.ndarray:
144143
return np.squeeze(res, axis=squeeze_dims)
145144

146145

147-
cache = LRUCache(maxsize=3)
148-
lock = RLock()
149-
150-
151146
def _split_alleles(allele_ids: bytes) -> List[bytes]:
152147
alleles = allele_ids.split(b",")
153148
if len(alleles) != 2:
@@ -157,7 +152,6 @@ def _split_alleles(allele_ids: bytes) -> List[bytes]:
157152
return alleles
158153

159154

160-
@cached(cache, lock=lock) # type: ignore[misc]
161155
def _read_metafile_partition(path: Path, partition: int) -> pd.DataFrame:
162156
with bgen_metafile(path) as mf:
163157
part = mf.read_partition(partition)
@@ -243,17 +237,42 @@ def read_bgen(
243237
be read multiple times when False.
244238
contig_dtype
245239
Data type for contig names, by default "str".
246-
This may be an integer type, but this will fail if any of the contig names cannot be
247-
converted to integers.
240+
This may also be an integer type (e.g. "int"), but reading will fail if any of the contig names
241+
cannot be converted to integers.
248242
gp_dtype
249243
Data type for genotype probabilities, by default "float32".
250244
251245
Warnings
252246
--------
253247
Only bi-allelic, diploid BGEN files are currently supported.
248+
249+
Returns
250+
-------
251+
A dataset containing the following variables:
252+
253+
- :data:`sgkit.variables.variant_id` (variants)
254+
- :data:`sgkit.variables.variant_contig` (variants)
255+
- :data:`sgkit.variables.variant_position` (variants)
256+
- :data:`sgkit.variables.variant_allele` (variants)
257+
- :data:`sgkit.variables.sample_id` (samples)
258+
- :data:`sgkit.variables.call_dosage` (variants, samples)
259+
- :data:`sgkit.variables.call_dosage_mask` (variants, samples)
260+
- :data:`sgkit.variables.call_genotype_probability` (variants, samples, genotypes)
261+
- :data:`sgkit.variables.call_genotype_probability_mask` (variants, samples, genotypes)
262+
254263
"""
255264
if isinstance(chunks, tuple) and len(chunks) != 3:
256-
raise ValueError(f"Chunks must be tuple with 3 items, not {chunks}")
265+
raise ValueError(f"`chunks` must be tuple with 3 items, not {chunks}")
266+
if not np.issubdtype(gp_dtype, np.floating):
267+
raise ValueError(
268+
f"`gp_dtype` must be a floating point data type, not {gp_dtype}"
269+
)
270+
if not np.issubdtype(contig_dtype, np.integer) and np.dtype(
271+
contig_dtype
272+
).kind not in {"U", "S"}:
273+
raise ValueError(
274+
f"`contig_dtype` must be of string or int type, not {contig_dtype}"
275+
)
257276

258277
path = Path(path)
259278
sample_path = Path(sample_path) if sample_path else path.with_suffix(".sample")

sgkit/tests/io/bgen/test_bgen_reader.py

Lines changed: 30 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -116,6 +116,35 @@ def test_read_bgen__gp_dtype(shared_datadir, dtype):
116116
assert ds["call_dosage"].dtype == dtype
117117

118118

119+
@pytest.mark.parametrize("dtype", ["c8", "i8", "str"])
120+
def test_read_bgen__invalid_gp_dtype(shared_datadir, dtype):
121+
path = shared_datadir / "example.bgen"
122+
with pytest.raises(
123+
ValueError, match="`gp_dtype` must be a floating point data type"
124+
):
125+
read_bgen(path, gp_dtype=dtype)
126+
127+
128+
@pytest.mark.parametrize("dtype", ["U", "S", "u1", "u2", "i8", "int"])
129+
def test_read_bgen__contig_dtype(shared_datadir, dtype):
130+
path = shared_datadir / "example.bgen"
131+
ds = read_bgen(path, contig_dtype=dtype)
132+
dtype = np.dtype(dtype)
133+
if dtype.kind in {"U", "S"}:
134+
assert ds["variant_contig"].dtype == np.int64
135+
else:
136+
assert ds["variant_contig"].dtype == dtype
137+
138+
139+
@pytest.mark.parametrize("dtype", ["c8", "M", "f4"])
140+
def test_read_bgen__invalid_contig_dtype(shared_datadir, dtype):
141+
path = shared_datadir / "example.bgen"
142+
with pytest.raises(
143+
ValueError, match="`contig_dtype` must be of string or int type"
144+
):
145+
read_bgen(path, contig_dtype=dtype)
146+
147+
119148
@pytest.mark.parametrize("chunks", CHUNKS)
120149
def test_read_bgen__fancy_index(shared_datadir, chunks):
121150
path = shared_datadir / "example.bgen"
@@ -165,7 +194,7 @@ def test_split_alleles__raise_on_multiallelic():
165194

166195
def test_read_bgen__invalid_chunks(shared_datadir):
167196
path = shared_datadir / "example.bgen"
168-
with pytest.raises(ValueError, match="Chunks must be tuple with 3 items"):
197+
with pytest.raises(ValueError, match="`chunks` must be tuple with 3 items"):
169198
read_bgen(path, chunks=(100, -1)) # type: ignore[arg-type]
170199

171200

sgkit/variables.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -303,9 +303,13 @@ def _check_field(
303303
)
304304
"""The number of samples with heterozygous calls."""
305305
variant_contig, variant_contig_spec = SgkitVariables.register_variable(
306-
ArrayLikeSpec("variant_contig", kind="i", ndim=1)
306+
ArrayLikeSpec("variant_contig", kind={"i", "u"}, ndim=1)
307307
)
308-
"""The (index of the) contig for each variant."""
308+
"""
309+
Index corresponding to the contig name for each variant. In some less common scenarios,
310+
this may also be equivalent to the contig names if the data-generating process used
311+
contig names that were also integers.
312+
"""
309313
variant_hwe_p_value, variant_hwe_p_value_spec = SgkitVariables.register_variable(
310314
ArrayLikeSpec("variant_hwe_p_value", kind="f")
311315
)

0 commit comments

Comments
 (0)