Skip to content

Commit ca7dc05

Browse files
authored
Merge pull request #12 from smart-on-fhir/mikix/low-mem
fix: don't take up more memory than needed when calculating schema
2 parents c6a4698 + 503c739 commit ca7dc05

File tree

2 files changed

+18
-19
lines changed

2 files changed

+18
-19
lines changed

cumulus_fhir_support/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
"""FHIR support code for the Cumulus project"""
22

3-
__version__ = "1.3.0"
3+
__version__ = "1.3.1"
44

55
from .ml_json import list_multiline_json_in_dir, read_multiline_json, read_multiline_json_from_dir
66
from .schemas import pyarrow_schema_from_rows

cumulus_fhir_support/schemas.py

Lines changed: 17 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -41,15 +41,26 @@ def pyarrow_schema_from_rows(
4141
:param rows: optionally a set of JSON FHIR resources to ensure are covered by the schema
4242
:returns: a PyArrow schema that covers the unified shape of all provided rows
4343
"""
44-
rows = list(rows or [])
44+
rows = rows or []
4545

4646
# Examine batch to see the full shape of it, in order to detect any deeply nested fields
4747
# that we want to make sure to include in the final schema (normally, we go wide but only as
4848
# deep as we need to)
49-
batch_shape = _get_shape_of_dicts(None, rows)
50-
49+
# Note: be careful to only iterate through `rows` once, to allow passing in pure iterables.
50+
batch_shape = {}
51+
contained_types = set()
52+
for row in rows:
53+
# Build up a complete picture of the shape of all rows
54+
batch_shape = _get_shape_of_dicts(batch_shape, row)
55+
56+
# Also gather up which kind of contained resources exist.
57+
for contained_obj in row.get("contained", []):
58+
if contained_type := contained_obj.get("resourceType"):
59+
contained_types.add(contained_type)
60+
61+
# Now actually create the schema
5162
schema = _create_pyarrow_schema_for_resource(resource_type, batch_shape)
52-
schema = _include_contained_schemas(schema, rows, batch_shape)
63+
schema = _include_contained_schemas(schema, contained_types, batch_shape)
5364
return schema
5465

5566

@@ -86,7 +97,7 @@ def _get_shape_of_dicts(total_shape: Optional[dict], item: Any) -> dict:
8697

8798

8899
def _include_contained_schemas(
89-
schema: pyarrow.Schema, rows: list[dict], batch_shape: dict
100+
schema: pyarrow.Schema, contained_types: set[str], batch_shape: dict
90101
) -> pyarrow.Schema:
91102
"""
92103
This will include all contained resource schemas into one big contained schema.
@@ -97,25 +108,13 @@ def _include_contained_schemas(
97108
Also see https://github.com/smart-on-fhir/cumulus-etl/issues/250 for discussion
98109
of whether it is wise to just comingle the schemas like this.
99110
"""
100-
# Grab all contained resource types that we have in the source data,
101-
# which will inform the expected schema inside there.
102-
contained_types = sorted(
103-
filter(
104-
None,
105-
{
106-
contained_obj.get("resourceType")
107-
for row in rows
108-
for contained_obj in row.get("contained", [])
109-
},
110-
)
111-
)
112111
if not contained_types:
113112
return schema # no need to do anything
114113
contained_shape = batch_shape.get("contained")
115114

116115
# Allow any found fields in any of the contained types
117116
fields = {}
118-
for contained_type in contained_types:
117+
for contained_type in sorted(contained_types):
119118
subschema = _create_pyarrow_schema_for_resource(contained_type, contained_shape, wide=False)
120119
for name in subschema.names:
121120
fields[name] = subschema.field(name) # will overwrite previous field of same name

0 commit comments

Comments
 (0)