Skip to content

Commit 9c24b3c

Browse files
tomwhitemergify[bot]
authored andcommitted
Fix vcfzarr coverage (generate files in test)
1 parent 143ba25 commit 9c24b3c

File tree

5 files changed

+87
-51
lines changed

5 files changed

+87
-51
lines changed

sgkit/io/utils.py

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -88,13 +88,18 @@ def zarrs_to_dataset(
8888

8989
# Set variable length strings to fixed length ones to avoid xarray/conventions.py:188 warning
9090
# (Also avoids this issue: https://github.com/pydata/xarray/issues/3476)
91-
max_variant_id_length = max(ds.attrs["max_variant_id_length"] for ds in datasets)
91+
92+
if "max_variant_id_length" in datasets[0].attrs:
93+
max_variant_id_length = max(
94+
ds.attrs["max_variant_id_length"] for ds in datasets
95+
)
96+
ds["variant_id"] = ds["variant_id"].astype(f"S{max_variant_id_length}") # type: ignore[no-untyped-call]
97+
del ds.attrs["max_variant_id_length"]
98+
9299
max_variant_allele_length = max(
93100
ds.attrs["max_variant_allele_length"] for ds in datasets
94101
)
95-
ds["variant_id"] = ds["variant_id"].astype(f"S{max_variant_id_length}") # type: ignore[no-untyped-call]
96102
ds["variant_allele"] = ds["variant_allele"].astype(f"S{max_variant_allele_length}") # type: ignore[no-untyped-call]
97-
del ds.attrs["max_variant_id_length"]
98103
del ds.attrs["max_variant_allele_length"]
99104

100105
return ds

sgkit/io/vcfzarr_reader.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -169,7 +169,7 @@ def _vcfzarr_to_dataset(
169169
if "variants/ID" in vcfzarr:
170170
variants_id = da.from_zarr(vcfzarr["variants/ID"]).astype(str)
171171
else:
172-
variants_id = None # pragma: no cover
172+
variants_id = None
173173

174174
ds = create_genotype_call_dataset(
175175
variant_contig_names=variant_contig_names,
-24.4 KB
Binary file not shown.

sgkit/tests/data/sample.vcf.zarr.zip

-20.2 KB
Binary file not shown.

sgkit/tests/test_vcfzarr_reader.py

Lines changed: 78 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -1,26 +1,44 @@
1+
import allel
12
import numpy as np
23
import pytest
34
import xarray as xr
5+
import zarr
46
from numpy.testing import assert_array_equal
57

68
from sgkit import read_vcfzarr
79
from sgkit.io.vcfzarr_reader import _ensure_2d, vcfzarr_to_zarr
810

911

12+
def create_vcfzarr(
13+
shared_datadir, tmpdir, *, fields=None, grouped_by_contig=False, consolidated=False
14+
):
15+
"""Create a vcfzarr file using scikit-allel"""
16+
vcf_path = shared_datadir / "sample.vcf"
17+
output_path = tmpdir / "sample.vcf.zarr"
18+
if grouped_by_contig:
19+
for contig in ["19", "20", "X"]:
20+
allel.vcf_to_zarr(
21+
str(vcf_path),
22+
str(output_path),
23+
fields=fields,
24+
group=contig,
25+
region=contig,
26+
)
27+
else:
28+
allel.vcf_to_zarr(str(vcf_path), str(output_path), fields=fields)
29+
if consolidated:
30+
zarr.consolidate_metadata(str(output_path))
31+
return output_path
32+
33+
1034
def test_ensure_2d():
1135
assert_array_equal(_ensure_2d(np.array([0, 2, 1])), np.array([[0], [2], [1]]))
1236
assert_array_equal(_ensure_2d(np.array([[0], [2], [1]])), np.array([[0], [2], [1]]))
1337

1438

15-
def test_read_vcfzarr(shared_datadir):
16-
# The file sample.vcf.zarr.zip was created by running the following
17-
# in a python session with the scikit-allel package installed.
18-
#
19-
# import allel
20-
# allel.vcf_to_zarr("sample.vcf", "sample.vcf.zarr.zip")
21-
22-
path = shared_datadir / "sample.vcf.zarr.zip"
23-
ds = read_vcfzarr(path)
39+
def test_read_vcfzarr(shared_datadir, tmpdir):
40+
vcfzarr_path = create_vcfzarr(shared_datadir, tmpdir) # type: ignore[no-untyped-call]
41+
ds = read_vcfzarr(vcfzarr_path)
2442

2543
assert ds.attrs["contigs"] == ["19", "20", "X"]
2644
assert_array_equal(ds["variant_contig"], [0, 0, 1, 1, 1, 1, 1, 1, 2])
@@ -73,11 +91,12 @@ def test_read_vcfzarr(shared_datadir):
7391

7492

7593
@pytest.mark.parametrize(
76-
"vcfzarr_filename, grouped_by_contig, consolidated",
94+
"grouped_by_contig, consolidated, has_variant_id",
7795
[
78-
("sample.vcf.zarr.zip", False, False),
79-
("sample-grouped.vcf.zarr.zip", True, False),
80-
("sample-grouped.vcf.zarr.zip", True, True),
96+
(False, False, False),
97+
(False, False, True),
98+
(True, False, True),
99+
(True, True, False),
81100
],
82101
)
83102
@pytest.mark.parametrize(
@@ -86,27 +105,35 @@ def test_read_vcfzarr(shared_datadir):
86105
)
87106
def test_vcfzarr_to_zarr(
88107
shared_datadir,
89-
tmp_path,
90-
vcfzarr_filename,
108+
tmpdir,
91109
grouped_by_contig,
92110
consolidated,
111+
has_variant_id,
93112
concat_algorithm,
94113
):
95-
# The file sample-grouped.vcf.zarr.zip was created by running the following
96-
# in a python session with the scikit-allel package installed.
97-
#
98-
# import allel
99-
# for contig in ["19", "20", "X"]:
100-
# allel.vcf_to_zarr("sample.vcf", "sample-grouped.vcf.zarr", group=contig, region=contig)
101-
# zarr.consolidate_metadata("sample-grouped.vcf.zarr")
102-
#
103-
# Then (in a shell):
104-
# (cd sample-grouped.vcf.zarr; zip -r ../sample-grouped.vcf.zarr.zip .)
105-
106-
path = shared_datadir / vcfzarr_filename
107-
output = tmp_path.joinpath("vcf.zarr").as_posix()
114+
if has_variant_id:
115+
fields = None
116+
else:
117+
fields = [
118+
"variants/CHROM",
119+
"variants/POS",
120+
"variants/REF",
121+
"variants/ALT",
122+
"calldata/GT",
123+
"samples",
124+
]
125+
126+
vcfzarr_path = create_vcfzarr( # type: ignore[no-untyped-call]
127+
shared_datadir,
128+
tmpdir,
129+
fields=fields,
130+
grouped_by_contig=grouped_by_contig,
131+
consolidated=consolidated,
132+
)
133+
134+
output = str(tmpdir / "vcf.zarr")
108135
vcfzarr_to_zarr(
109-
path,
136+
vcfzarr_path,
110137
output,
111138
grouped_by_contig=grouped_by_contig,
112139
concat_algorithm=concat_algorithm,
@@ -138,24 +165,28 @@ def test_vcfzarr_to_zarr(
138165
[b"AC", b"A", b"ATG", b"C"],
139166
],
140167
)
141-
assert_array_equal(
142-
ds["variant_id"],
143-
[
144-
b".",
145-
b".",
146-
b"rs6054257",
147-
b".",
148-
b"rs6040355",
149-
b".",
150-
b"microsat1",
151-
b".",
152-
b"rsTest",
153-
],
154-
)
155-
assert_array_equal(
156-
ds["variant_id_mask"],
157-
[True, True, False, True, False, True, False, True, False],
158-
)
168+
if has_variant_id:
169+
assert_array_equal(
170+
ds["variant_id"],
171+
[
172+
b".",
173+
b".",
174+
b"rs6054257",
175+
b".",
176+
b"rs6040355",
177+
b".",
178+
b"microsat1",
179+
b".",
180+
b"rsTest",
181+
],
182+
)
183+
assert_array_equal(
184+
ds["variant_id_mask"],
185+
[True, True, False, True, False, True, False, True, False],
186+
)
187+
else:
188+
assert "variant_id" not in ds
189+
assert "variant_id_mask" not in ds
159190

160191
assert_array_equal(ds["sample_id"], ["NA00001", "NA00002", "NA00003"])
161192

0 commit comments

Comments
 (0)