Skip to content

Commit 6c1d673

Browse files
stac-geoparquet metadata updates (#98)
This updates the stac-geoparquet metadata spec to add a version and json-schema. --------- Co-authored-by: Pete Gadomski <[email protected]>
1 parent 57229c0 commit 6c1d673

File tree

13 files changed

+2550
-2182
lines changed

13 files changed

+2550
-2182
lines changed

.github/workflows/continuous-integration.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,3 +32,5 @@ jobs:
3232
run: uv run pytest tests -v
3333
- name: Check docs
3434
run: uv run mkdocs build --strict
35+
- name: Check jsonschema
36+
run: check-jsonschema --schemafile spec/json-schema/metadata.json spec/example-metadata.json

.pre-commit-config.yaml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,3 +14,11 @@ repos:
1414
- id: trailing-whitespace
1515
- id: end-of-file-fixer
1616
exclude: tests/.*\.json
17+
- repo: https://github.com/astral-sh/ruff-pre-commit
18+
# Ruff version.
19+
rev: v0.11.8
20+
hooks:
21+
# Run the linter.
22+
- id: ruff
23+
# Run the formatter.
24+
- id: ruff-format

README.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,3 +40,9 @@ uv run pre-commit install
4040
uv run pytest
4141
scripts/lint
4242
```
43+
44+
Validate the example collection metadata against the jsonschema:
45+
46+
```shell
47+
check-jsonschema --schemafile spec/json-schema/metadata.json spec/example-metadata.json
48+
```

pyproject.toml

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -44,17 +44,19 @@ pc = ["adlfs", "azure-data-tables", "psycopg[binary,pool]", "pypgstac", "tqdm"]
4444

4545
[dependency-groups]
4646
dev = [
47+
"check-jsonschema",
48+
"jsonschema",
4749
"mypy",
4850
"numpy>=2",
49-
"ruff",
5051
"pre-commit",
52+
"pytest-recording>=0.13.2",
5153
"pytest",
5254
"requests",
55+
"ruff",
5356
"stac-geoparquet[pc]",
5457
"stac-geoparquet[pgstac]",
5558
"types-python-dateutil",
5659
"types-requests",
57-
"pytest-recording>=0.13.2",
5860
"vcrpy>=7.0.0",
5961
]
6062
docs = [

scripts/lint

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,5 @@
22

33
set -e
44

5-
uv run ruff check
6-
uv run ruff format --check
5+
uv run pre-commit run --all-files
76
uv run mypy stac_geoparquet

spec/example-metadata.json

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
{
2+
"version": "1.0.0",
3+
"collection": {
4+
"id": "simple-collection",
5+
"type": "Collection",
6+
"stac_extensions": [],
7+
"stac_version": "1.1.0",
8+
"description": "A simple collection demonstrating core catalog fields with links to a couple of items",
9+
"title": "Simple Example Collection",
10+
"keywords": [
11+
"simple",
12+
"example",
13+
"collection"
14+
],
15+
"providers": [],
16+
"extent": {
17+
"spatial": {
18+
"bbox": [
19+
[
20+
172.91173669923782,
21+
1.3438851951615003,
22+
172.95469614953714,
23+
1.3690476620161975
24+
]
25+
]
26+
},
27+
"temporal": {
28+
"interval": [
29+
[
30+
"2020-12-11T22:38:32.125Z",
31+
"2020-12-14T18:02:31.437Z"
32+
]
33+
]
34+
}
35+
},
36+
"license": "CC-BY-4.0",
37+
"summaries": {},
38+
"links": []
39+
}
40+
}

spec/json-schema/metadata.json

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
{
2+
"$schema": "http://json-schema.org/draft-07/schema#",
3+
"$id": "https://stac-utils.github.io/stac-geoparquet/json-schema/metadata.json",
4+
"title": "STAC GeoParquet Metadata",
5+
"description": "JSON Schema for STAC GeoParquet metadata stored in Parquet file metadata",
6+
"type": "object",
7+
"properties": {
8+
"version": {
9+
"type": "string",
10+
"const": "1.0.0",
11+
"description": "The stac-geoparquet metadata version."
12+
},
13+
"collection": {
14+
"type": "object",
15+
"description": "This object represents a Collection in a SpatioTemporal Asset Catalog. Note that this object is not validated against the STAC Collection schema. You'll need to validate it separately from stac-geoparquet."
16+
}
17+
},
18+
"required": ["version"]
19+
}

spec/stac-geoparquet-spec.md

Lines changed: 41 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -31,11 +31,11 @@ most of the fields should be the same in STAC and in GeoParquet.
3131
| _property columns_ | _varies_ | - | Each property should use the relevant Parquet type, and be pulled out of the properties object to be a top-level Parquet field |
3232

3333
- Must be valid GeoParquet, with proper metadata. Ideally the geometry types are defined and as narrow as possible.
34-
- Strongly recommend to only have one GeoParquet per STAC 'Collection'. Not doing this will lead to an expanded GeoParquet schema (the union of all the schemas of the collection) with lots of empty data
34+
- Strongly recommend storing items that are mostly homogeneous (i.e. have the same fields). Parquet is a columnar format; storing items with many different fields will lead to an expanded parquet Schema with lots of empty data. In practice, this means storing a single collection or only collections with very similar item properties in a single stac-geoparquet dataset.
3535
- Any field in 'properties' of the STAC item should be moved up to be a top-level field in the GeoParquet.
3636
- STAC GeoParquet does not support properties that are named such that they collide with a top-level key.
3737
- datetime columns should be stored as a [native timestamp][timestamp], not as a string
38-
- The Collection JSON should be included in the Parquet metadata. See [Collection JSON](#including-a-stac-collection-json-in-a-stac-geoparquet-collection) below.
38+
- The Collection JSON objects should be included in the Parquet metadata. See [Collection JSON](#stac-collection-object) below.
3939
- Any other properties that would be stored as GeoJSON in a STAC JSON Item (e.g. `proj:geometry`) should be stored as a binary column with WKB encoding. This simplifies the handling of collections with multiple geometry types.
4040

4141
### Link Struct
@@ -69,17 +69,48 @@ To take advantage of Parquet's columnar nature and compression, the assets shoul
6969

7070
See [Asset Object][asset] for more.
7171

72-
## Including a STAC Collection JSON in a STAC Geoparquet Collection
72+
### Parquet Metadata
73+
74+
stac-geoparquet uses Parquet [File Metadata](https://parquet.apache.org/docs/file-format/metadata/) to store metadata about the dataset.
75+
All stac-geoparquet metadata is stored under the key `stac-geoparquet` in the parquet file metadata.
76+
77+
See [`example-metadata.json`](https://github.com/stac-utils/stac-geoparquet/blob/main/spec/example-metadata.json) for an example.
78+
79+
A [jsonschema schema file][schema] is provided for tools to validate against.
80+
Note that the json-schema for stac-geoparquet does *not* validate the
81+
`collection` object against the STAC json-schema. You'll need to validate that
82+
separately.
83+
84+
85+
| Field Name | Type | Description |
86+
| -------------| -----------------------| ----------------------------------------------------------------------- |
87+
| `version` | string | The stac-geoparquet metadata version. Currently just "1.0.0" is allowed |
88+
| `collection` | STAC Collection object | STAC Collection metadata. |
89+
90+
Note that this metadata is distinct from the file metadata required by
91+
[geoparquet].
92+
93+
#### Geoparquet Version
94+
95+
The field `version` stores the version of the stac-geoparquet
96+
specification the data complies with. Readers can use this field to understand what
97+
features and fields are available.
98+
99+
Currently, the only allowed value is the string `"1.0.0"`.
100+
101+
Note: early versions of this specification didn't include a `version` field. Readers
102+
aiming for maximum compatibility may attempt to read files without this key present,
103+
despite it being required from 1.0.0 onwards.
104+
105+
#### STAC Collection Object
73106

74107
To make a stac-geoparquet file a fully self-contained representation, you can
75-
include the Collection JSON in the Parquet metadata. If present in the [Parquet
76-
file metadata][parquet-metadata], the key must be `stac:collection` and the
77-
value must be a JSON string with the Collection JSON.
108+
include the Collection JSON document in the Parquet metadata under the
109+
`collection` key. This should contain a STAC [Collection].
78110

79111
## Referencing a STAC Geoparquet Collections in a STAC Collection JSON
80112

81-
A common use case of stac-geoparquet is to create a mirror of a STAC collection. To refer to this mirror in the original collection, use an [Asset Object](https://github.com/radiantearth/stac-spec/blob/master/collection-spec/collection-spec.md#asset-object) at the collection level of the STAC JSON that includes the `application/vnd.apache.parquet` Media type and `collection-mirror` Role type to describe the function of the Geoparquet STAC Collection Asset.
82-
113+
A common use case of stac-geoparquet is to create a mirror of a STAC collection. To refer to this mirror in the original collection, use an [Asset Object](https://github.com/radiantearth/stac-spec/blob/master/collection-spec/collection-spec.md#asset-object) at the collection level of the STAC JSON that includes the `application/vnd.apache.parquet` Media type and `collection-mirror` Role type to describe the function of the Geoparquet STAC Collection Asset.
83114
For example:
84115

85116
| Field Name | Type | Value |
@@ -105,3 +136,5 @@ The principles here can likely be used to map into other geospatial data formats
105136
[common-media-types]: https://github.com/radiantearth/stac-spec/blob/master/best-practices.md#common-media-types-in-stac
106137
[timestamp]: https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#timestamp
107138
[parquet-metadata]: https://github.com/apache/parquet-format#metadata
139+
[Collection]: https://github.com/radiantearth/stac-spec/blob/master/collection-spec/collection-spec.md#
140+
[schema]: https://github.com/stac-utils/stac-geoparquet/blob/main/spec/json-schema/metadata.json

stac_geoparquet/arrow/_delta_lake.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
from stac_geoparquet.arrow._to_parquet import (
1313
DEFAULT_PARQUET_SCHEMA_VERSION,
1414
SUPPORTED_PARQUET_SCHEMA_VERSIONS,
15-
create_geoparquet_metadata,
15+
create_parquet_metadata,
1616
)
1717

1818
if TYPE_CHECKING:
@@ -51,7 +51,7 @@ def parse_stac_ndjson_to_delta_lake(
5151
input_path, chunk_size=chunk_size, schema=schema, limit=limit
5252
)
5353
schema = record_batch_reader.schema.with_metadata(
54-
create_geoparquet_metadata(
54+
create_parquet_metadata(
5555
record_batch_reader.schema, schema_version=schema_version
5656
)
5757
)

stac_geoparquet/arrow/_to_parquet.py

Lines changed: 43 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
import json
44
from collections.abc import Iterable
55
from pathlib import Path
6-
from typing import Any
6+
from typing import Any, Literal
77

88
import pyarrow as pa
99
import pyarrow.parquet as pq
@@ -18,6 +18,9 @@
1818
from stac_geoparquet.arrow._schema.models import InferredSchema
1919
from stac_geoparquet.arrow.types import ArrowStreamExportable
2020

21+
STAC_GEOPARQUET_VERSION: Literal["1.0.0"] = "1.0.0"
22+
STAC_GEOPARQUET_METADATA_KEY = b"stac-geoparquet"
23+
2124

2225
def parse_stac_ndjson_to_parquet(
2326
input_path: str | Path | Iterable[str | Path],
@@ -27,6 +30,7 @@ def parse_stac_ndjson_to_parquet(
2730
schema: pa.Schema | InferredSchema | None = None,
2831
limit: int | None = None,
2932
schema_version: SUPPORTED_PARQUET_SCHEMA_VERSIONS = DEFAULT_PARQUET_SCHEMA_VERSION,
33+
collection_metadata: dict[str, Any] | None = None,
3034
**kwargs: Any,
3135
) -> None:
3236
"""Convert one or more newline-delimited JSON STAC files to GeoParquet
@@ -45,6 +49,9 @@ def parse_stac_ndjson_to_parquet(
4549
limit: The maximum number of JSON records to convert.
4650
schema_version: GeoParquet specification version; if not provided will default
4751
to latest supported version.
52+
collection_metadata: A dictionary representing a Collection in a SpatioTemporal
53+
Asset Catalog. This will be stored under the key `stac-geoparquet` in the
54+
parquet file metadata, under the key `collection`.
4855
4956
All other keyword args are passed on to
5057
[`pyarrow.parquet.ParquetWriter`][pyarrow.parquet.ParquetWriter].
@@ -57,6 +64,7 @@ def parse_stac_ndjson_to_parquet(
5764
output_path=output_path,
5865
schema_version=schema_version,
5966
**kwargs,
67+
collection_metadata=collection_metadata,
6068
)
6169

6270

@@ -65,6 +73,7 @@ def to_parquet(
6573
output_path: str | Path,
6674
*,
6775
schema_version: SUPPORTED_PARQUET_SCHEMA_VERSIONS = DEFAULT_PARQUET_SCHEMA_VERSION,
76+
collection_metadata: dict[str, Any] | None = None,
6877
**kwargs: Any,
6978
) -> None:
7079
"""Write an Arrow table with STAC data to GeoParquet
@@ -82,6 +91,9 @@ def to_parquet(
8291
Keyword Args:
8392
schema_version: GeoParquet specification version; if not provided will default
8493
to latest supported version.
94+
collection_metadata: A dictionary representing a Collection in a SpatioTemporal
95+
Asset Catalog. This will be stored under the key `stac-geoparquet` in the
96+
parquet file metadata, under the key `collection`.
8597
8698
All other keyword args are passed on to
8799
[`pyarrow.parquet.ParquetWriter`][pyarrow.parquet.ParquetWriter].
@@ -90,17 +102,22 @@ def to_parquet(
90102
reader = pa.RecordBatchReader.from_stream(table)
91103

92104
schema = reader.schema.with_metadata(
93-
create_geoparquet_metadata(reader.schema, schema_version=schema_version)
105+
create_parquet_metadata(
106+
reader.schema,
107+
schema_version=schema_version,
108+
collection_metadata=collection_metadata,
109+
)
94110
)
95111
with pq.ParquetWriter(output_path, schema, **kwargs) as writer:
96112
for batch in reader:
97113
writer.write_batch(batch)
98114

99115

100-
def create_geoparquet_metadata(
116+
def create_parquet_metadata(
101117
schema: pa.Schema,
102118
*,
103119
schema_version: SUPPORTED_PARQUET_SCHEMA_VERSIONS,
120+
collection_metadata: dict[str, Any] | None = None,
104121
) -> dict[bytes, bytes]:
105122
# TODO: include bbox of geometries
106123
column_meta = {
@@ -141,7 +158,12 @@ def create_geoparquet_metadata(
141158
"crs": None,
142159
}
143160

144-
return {b"geo": json.dumps(geo_meta).encode("utf-8")}
161+
geoparquet_metadata = create_stac_geoparquet_metadata(collection_metadata)
162+
163+
return {
164+
b"geo": json.dumps(geo_meta).encode("utf-8"),
165+
STAC_GEOPARQUET_METADATA_KEY: json.dumps(geoparquet_metadata).encode("utf-8"),
166+
}
145167

146168

147169
def schema_version_has_bbox_mapping(
@@ -152,3 +174,20 @@ def schema_version_has_bbox_mapping(
152174
metadata.
153175
"""
154176
return int(schema_version.split(".")[1]) >= 1
177+
178+
179+
def create_stac_geoparquet_metadata(
180+
collection_metadata: dict[str, Any] | None = None,
181+
) -> dict[str, Any]:
182+
"""
183+
Create the stac-geoparquet metadata object for the Parquet file.
184+
185+
This will be stored under the key `stac-geoparquet` in the Parquet file metadata.
186+
It must be compatible with the metadata spec.
187+
"""
188+
result: dict[str, Any] = {
189+
"version": STAC_GEOPARQUET_VERSION,
190+
}
191+
if collection_metadata:
192+
result["collection"] = collection_metadata
193+
return result

0 commit comments

Comments
 (0)