|
| 1 | +import itertools |
1 | 2 | import json |
2 | 3 | from io import BytesIO |
3 | 4 | from pathlib import Path |
|
7 | 8 | import pytest |
8 | 9 |
9 | 10 | from stac_geoparquet.arrow import ( |
| 11 | + DEFAULT_JSON_CHUNK_SIZE, |
10 | 12 | parse_stac_items_to_arrow, |
11 | 13 | parse_stac_ndjson_to_arrow, |
12 | 14 | stac_table_to_items, |
|
33 | 35 | "us-census", |
34 | 36 | ] |
35 | 37 |
| 38 | +CHUNK_SIZES = [2, DEFAULT_JSON_CHUNK_SIZE] |
36 | 39 |
37 | | -@pytest.mark.parametrize("collection_id", TEST_COLLECTIONS) |
38 | | -def test_round_trip_read_write(collection_id: str): |
| 40 | + |
| 41 | +@pytest.mark.parametrize( |
| 42 | + "collection_id,chunk_size", itertools.product(TEST_COLLECTIONS, CHUNK_SIZES) |
| 43 | +) |
| 44 | +def test_round_trip_read_write(collection_id: str, chunk_size: int): |
39 | 45 | with open(HERE / "data" / f"{collection_id}-pc.json") as f: |
40 | 46 | items = json.load(f) |
41 | 47 |
42 | | - table = pa.Table.from_batches(parse_stac_items_to_arrow(items)) |
| 48 | + table = parse_stac_items_to_arrow(items, chunk_size=chunk_size).read_all() |
43 | 49 | items_result = list(stac_table_to_items(table)) |
44 | 50 |
45 | 51 | for result, expected in zip(items_result, items): |
46 | 52 | assert_json_value_equal(result, expected, precision=0) |
47 | 53 |
48 | 54 |
49 | | -@pytest.mark.parametrize("collection_id", TEST_COLLECTIONS) |
50 | | -def test_round_trip_write_read_ndjson(collection_id: str, tmp_path: Path): |
| 55 | +@pytest.mark.parametrize( |
| 56 | + "collection_id,chunk_size", itertools.product(TEST_COLLECTIONS, CHUNK_SIZES) |
| 57 | +) |
| 58 | +def test_round_trip_write_read_ndjson( |
| 59 | + collection_id: str, chunk_size: int, tmp_path: Path |
| 60 | +): |
51 | 61 | # First load into a STAC-GeoParquet table |
52 | 62 | path = HERE / "data" / f"{collection_id}-pc.json" |
53 | | - table = pa.Table.from_batches(parse_stac_ndjson_to_arrow(path)) |
| 63 | + table = parse_stac_ndjson_to_arrow(path, chunk_size=chunk_size).read_all() |
54 | 64 |
55 | 65 | # Then write to disk |
56 | 66 | stac_table_to_ndjson(table, tmp_path / "tmp.ndjson") |
57 | 67 |
58 | | - # Then read back and assert tables match |
59 | | - table = pa.Table.from_batches(parse_stac_ndjson_to_arrow(tmp_path / "tmp.ndjson")) |
| 68 | + with open(path) as f: |
| 69 | + orig_json = json.load(f) |
| 70 | + |
| 71 | + rt_json = [] |
| 72 | + with open(tmp_path / "tmp.ndjson") as f: |
| 73 | + for line in f: |
| 74 | + rt_json.append(json.loads(line)) |
| 75 | + |
| 76 | + # Then read back and assert JSON data matches |
| 77 | + assert_json_value_equal(orig_json, rt_json, precision=0) |
60 | 78 |
61 | 79 |
62 | 80 | def test_table_contains_geoarrow_metadata(): |
63 | 81 | collection_id = "naip" |
64 | 82 | with open(HERE / "data" / f"{collection_id}-pc.json") as f: |
65 | 83 | items = json.load(f) |
66 | 84 |
67 | | - table = pa.Table.from_batches(parse_stac_items_to_arrow(items)) |
| 85 | + table = parse_stac_items_to_arrow(items).read_all() |
68 | 86 | field_meta = table.schema.field("geometry").metadata |
69 | 87 | assert field_meta[b"ARROW:extension:name"] == b"geoarrow.wkb" |
70 | 88 | assert json.loads(field_meta[b"ARROW:extension:metadata"])["crs"]["id"] == { |
@@ -107,7 +125,7 @@ def test_to_parquet_two_geometry_columns(): |
107 | 125 | with open(HERE / "data" / "3dep-lidar-copc-pc.json") as f: |
108 | 126 | items = json.load(f) |
109 | 127 |
110 | | - table = pa.Table.from_batches(parse_stac_items_to_arrow(items)) |
| 128 | + table = parse_stac_items_to_arrow(items).read_all() |
111 | 129 | with BytesIO() as bio: |
112 | 130 | to_parquet(table, bio) |
113 | 131 | bio.seek(0) |
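
A note on the API shape these tests now assume: judging by the `.read_all()` calls above, `parse_stac_items_to_arrow` and `parse_stac_ndjson_to_arrow` return a `pyarrow.RecordBatchReader` rather than an iterator of record batches, and `chunk_size` controls how many items land in each batch. Here is a minimal sketch of both consumption styles; only the function names, the `chunk_size` keyword, and `DEFAULT_JSON_CHUNK_SIZE` come from this diff, while the input path and the eager/streaming split are illustrative:

```python
import json

import pyarrow as pa

from stac_geoparquet.arrow import DEFAULT_JSON_CHUNK_SIZE, parse_stac_items_to_arrow

# Hypothetical input file: any list of STAC item dicts works here.
with open("naip-pc.json") as f:
    items = json.load(f)

# Eager: collect every chunk into a single Table, as the round-trip tests do.
table: pa.Table = parse_stac_items_to_arrow(items, chunk_size=2).read_all()

# Streaming: consume the RecordBatchReader batch by batch, without
# materializing the whole table in memory at once.
reader = parse_stac_items_to_arrow(items, chunk_size=DEFAULT_JSON_CHUNK_SIZE)
for batch in reader:
    print(batch.num_rows)  # each batch holds at most chunk_size items
```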