@@ -41,15 +41,26 @@ def pyarrow_schema_from_rows(
4141 :param rows: optionally a set of JSON FHIR resources to ensure are covered by the schema
4242 :returns: a PyArrow schema that covers the unified shape of all provided rows
4343 """
44- rows = list(rows or [])
44+ rows = rows or []
4545
4646 # Examine batch to see the full shape of it, in order to detect any deeply nested fields
4747 # that we want to make sure to include in the final schema (normally, we go wide but only as
4848 # deep as we need to)
49- batch_shape = _get_shape_of_dicts(None, rows)
50-
49+ # Note: be careful to only iterate through `rows` once, to allow passing in pure iterables.
50+ batch_shape = {}
51+ contained_types = set()
52+ for row in rows:
53+ # Build up a complete picture of the shape of all rows
54+ batch_shape = _get_shape_of_dicts(batch_shape, row)
55+
56+ # Also gather up which kind of contained resources exist.
57+ for contained_obj in row.get("contained", []):
58+ if contained_type := contained_obj.get("resourceType"):
59+ contained_types.add(contained_type)
60+
61+ # Now actually create the schema
5162 schema = _create_pyarrow_schema_for_resource(resource_type, batch_shape)
52- schema = _include_contained_schemas(schema, rows, batch_shape)
63+ schema = _include_contained_schemas(schema, contained_types, batch_shape)
5364 return schema
5465
5566
@@ -86,7 +97,7 @@ def _get_shape_of_dicts(total_shape: Optional[dict], item: Any) -> dict:
8697
8798
8899def _include_contained_schemas(
89- schema: pyarrow.Schema, rows: list[dict], batch_shape: dict
100+ schema: pyarrow.Schema, contained_types: set[str], batch_shape: dict
90101) -> pyarrow.Schema:
91102 """
92103 This will include all contained resource schemas into one big contained schema.
@@ -97,25 +108,13 @@ def _include_contained_schemas(
97108 Also see https://github.com/smart-on-fhir/cumulus-etl/issues/250 for discussion
98109 of whether it is wise to just comingle the schemas like this.
99110 """
100- # Grab all contained resource types that we have in the source data,
101- # which will inform the expected schema inside there.
102- contained_types = sorted(
103- filter(
104- None,
105- {
106- contained_obj.get("resourceType")
107- for row in rows
108- for contained_obj in row.get("contained", [])
109- },
110- )
111- )
112111 if not contained_types:
113112 return schema # no need to do anything
114113 contained_shape = batch_shape.get("contained")
115114
116115 # Allow any found fields in any of the contained types
117116 fields = {}
118- for contained_type in contained_types:
117+ for contained_type in sorted(contained_types):
119118 subschema = _create_pyarrow_schema_for_resource(contained_type, contained_shape, wide=False)
120119 for name in subschema.names:
121120 fields[name] = subschema.field(name) # will overwrite previous field of same name
0 commit comments