1
+ import allel
1
2
import numpy as np
2
3
import pytest
3
4
import xarray as xr
5
+ import zarr
4
6
from numpy .testing import assert_array_equal
5
7
6
8
from sgkit import read_vcfzarr
7
9
from sgkit .io .vcfzarr_reader import _ensure_2d , vcfzarr_to_zarr
8
10
9
11
12
+ def create_vcfzarr (
13
+ shared_datadir , tmpdir , * , fields = None , grouped_by_contig = False , consolidated = False
14
+ ):
15
+ """Create a vcfzarr file using scikit-allel"""
16
+ vcf_path = shared_datadir / "sample.vcf"
17
+ output_path = tmpdir / "sample.vcf.zarr"
18
+ if grouped_by_contig :
19
+ for contig in ["19" , "20" , "X" ]:
20
+ allel .vcf_to_zarr (
21
+ str (vcf_path ),
22
+ str (output_path ),
23
+ fields = fields ,
24
+ group = contig ,
25
+ region = contig ,
26
+ )
27
+ else :
28
+ allel .vcf_to_zarr (str (vcf_path ), str (output_path ), fields = fields )
29
+ if consolidated :
30
+ zarr .consolidate_metadata (str (output_path ))
31
+ return output_path
32
+
33
+
10
34
def test_ensure_2d ():
11
35
assert_array_equal (_ensure_2d (np .array ([0 , 2 , 1 ])), np .array ([[0 ], [2 ], [1 ]]))
12
36
assert_array_equal (_ensure_2d (np .array ([[0 ], [2 ], [1 ]])), np .array ([[0 ], [2 ], [1 ]]))
13
37
14
38
15
- def test_read_vcfzarr (shared_datadir ):
16
- # The file sample.vcf.zarr.zip was created by running the following
17
- # in a python session with the scikit-allel package installed.
18
- #
19
- # import allel
20
- # allel.vcf_to_zarr("sample.vcf", "sample.vcf.zarr.zip")
21
-
22
- path = shared_datadir / "sample.vcf.zarr.zip"
23
- ds = read_vcfzarr (path )
39
+ def test_read_vcfzarr (shared_datadir , tmpdir ):
40
+ vcfzarr_path = create_vcfzarr (shared_datadir , tmpdir ) # type: ignore[no-untyped-call]
41
+ ds = read_vcfzarr (vcfzarr_path )
24
42
25
43
assert ds .attrs ["contigs" ] == ["19" , "20" , "X" ]
26
44
assert_array_equal (ds ["variant_contig" ], [0 , 0 , 1 , 1 , 1 , 1 , 1 , 1 , 2 ])
@@ -73,11 +91,12 @@ def test_read_vcfzarr(shared_datadir):
73
91
74
92
75
93
@pytest .mark .parametrize (
76
- "vcfzarr_filename, grouped_by_contig, consolidated" ,
94
+ "grouped_by_contig, consolidated, has_variant_id " ,
77
95
[
78
- ("sample.vcf.zarr.zip" , False , False ),
79
- ("sample-grouped.vcf.zarr.zip" , True , False ),
80
- ("sample-grouped.vcf.zarr.zip" , True , True ),
96
+ (False , False , False ),
97
+ (False , False , True ),
98
+ (True , False , True ),
99
+ (True , True , False ),
81
100
],
82
101
)
83
102
@pytest .mark .parametrize (
@@ -86,27 +105,35 @@ def test_read_vcfzarr(shared_datadir):
86
105
)
87
106
def test_vcfzarr_to_zarr (
88
107
shared_datadir ,
89
- tmp_path ,
90
- vcfzarr_filename ,
108
+ tmpdir ,
91
109
grouped_by_contig ,
92
110
consolidated ,
111
+ has_variant_id ,
93
112
concat_algorithm ,
94
113
):
95
- # The file sample-grouped.vcf.zarr.zip was created by running the following
96
- # in a python session with the scikit-allel package installed.
97
- #
98
- # import allel
99
- # for contig in ["19", "20", "X"]:
100
- # allel.vcf_to_zarr("sample.vcf", "sample-grouped.vcf.zarr", group=contig, region=contig)
101
- # zarr.consolidate_metadata("sample-grouped.vcf.zarr")
102
- #
103
- # Then (in a shell):
104
- # (cd sample-grouped.vcf.zarr; zip -r ../sample-grouped.vcf.zarr.zip .)
105
-
106
- path = shared_datadir / vcfzarr_filename
107
- output = tmp_path .joinpath ("vcf.zarr" ).as_posix ()
114
+ if has_variant_id :
115
+ fields = None
116
+ else :
117
+ fields = [
118
+ "variants/CHROM" ,
119
+ "variants/POS" ,
120
+ "variants/REF" ,
121
+ "variants/ALT" ,
122
+ "calldata/GT" ,
123
+ "samples" ,
124
+ ]
125
+
126
+ vcfzarr_path = create_vcfzarr ( # type: ignore[no-untyped-call]
127
+ shared_datadir ,
128
+ tmpdir ,
129
+ fields = fields ,
130
+ grouped_by_contig = grouped_by_contig ,
131
+ consolidated = consolidated ,
132
+ )
133
+
134
+ output = str (tmpdir / "vcf.zarr" )
108
135
vcfzarr_to_zarr (
109
- path ,
136
+ vcfzarr_path ,
110
137
output ,
111
138
grouped_by_contig = grouped_by_contig ,
112
139
concat_algorithm = concat_algorithm ,
@@ -138,24 +165,28 @@ def test_vcfzarr_to_zarr(
138
165
[b"AC" , b"A" , b"ATG" , b"C" ],
139
166
],
140
167
)
141
- assert_array_equal (
142
- ds ["variant_id" ],
143
- [
144
- b"." ,
145
- b"." ,
146
- b"rs6054257" ,
147
- b"." ,
148
- b"rs6040355" ,
149
- b"." ,
150
- b"microsat1" ,
151
- b"." ,
152
- b"rsTest" ,
153
- ],
154
- )
155
- assert_array_equal (
156
- ds ["variant_id_mask" ],
157
- [True , True , False , True , False , True , False , True , False ],
158
- )
168
+ if has_variant_id :
169
+ assert_array_equal (
170
+ ds ["variant_id" ],
171
+ [
172
+ b"." ,
173
+ b"." ,
174
+ b"rs6054257" ,
175
+ b"." ,
176
+ b"rs6040355" ,
177
+ b"." ,
178
+ b"microsat1" ,
179
+ b"." ,
180
+ b"rsTest" ,
181
+ ],
182
+ )
183
+ assert_array_equal (
184
+ ds ["variant_id_mask" ],
185
+ [True , True , False , True , False , True , False , True , False ],
186
+ )
187
+ else :
188
+ assert "variant_id" not in ds
189
+ assert "variant_id_mask" not in ds
159
190
160
191
assert_array_equal (ds ["sample_id" ], ["NA00001" , "NA00002" , "NA00003" ])
161
192
0 commit comments