diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index a58cdc8c93ab7..933deb7a97451 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -133,6 +133,7 @@ Other Enhancements - :meth:`DataFrame.describe` now formats integer percentiles without decimal point (:issue:`26660`) - Added support for reading SPSS .sav files using :func:`read_spss` (:issue:`26537`) - Added new option ``plotting.backend`` to be able to select a plotting backend different than the existing ``matplotlib`` one. Use ``pandas.set_option('plotting.backend', '')`` where ``>> json_normalize(data, fill_value={'id': -1}) + id name name.family name.first name.given name.last + 0 1 NaN NaN Coleen NaN Volk + 1 -1 NaN Regner NaN Mose NaN + 2 2 Faye Raker NaN NaN NaN NaN + >>> data = [{'state': 'Florida', ... 'shortname': 'FL', ... 'info': { @@ -197,6 +209,9 @@ def _pull_field(js, spec): if isinstance(data, dict): data = [data] + if fill_value and not isinstance(fill_value, dict): + raise ValueError('Invalid fill_value, fill_value only accepts a dict') + if record_path is None: if any([isinstance(x, dict) for x in y.values()] for y in data): # naive normalization, this is idempotent for flat records @@ -207,7 +222,7 @@ def _pull_field(js, spec): # TODO: handle record value which are lists, at least error # reasonably data = nested_to_record(data, sep=sep) - return DataFrame(data) + return DataFrame(data, fill_value=fill_value) elif not isinstance(record_path, list): record_path = [record_path] @@ -265,7 +280,7 @@ def _recursive_extract(data, path, seen_meta, level=0): _recursive_extract(data, record_path, {}, level=0) - result = DataFrame(records) + result = DataFrame(records, fill_value=fill_value) if record_prefix is not None: result = result.rename( diff --git a/pandas/tests/io/json/test_normalize.py b/pandas/tests/io/json/test_normalize.py index a7407d843c6c9..c851d61df0e82 100644 --- a/pandas/tests/io/json/test_normalize.py +++ 
b/pandas/tests/io/json/test_normalize.py
@@ -39,6 +39,34 @@ def deep_nested():
     ]
 
 
+@pytest.fixture
+def deep_nested_missing():
+    """Deeply nested records where some cities lack 'name' or 'pop'."""
+    return [{'country': 'USA',
+             'states': [{'name': 'California',
+                         'cities': [{'name': 'San Francisco',
+                                     'pop': 12345},
+                                    {'name': 'Los Angeles',
+                                     'pop': 12346}]
+                         },
+                        {'name': 'Ohio',
+                         'cities': [{'name': 'Columbus',
+                                     'pop': 1234},
+                                    {'pop': 1236}]}
+                        ]
+             },
+            {'country': 'Germany',
+             'states': [{'name': 'Bayern',
+                         'cities': [{'name': 'Munich'}]
+                         },
+                        {'name': 'Nordrhein-Westfalen',
+                         'cities': [{'name': 'Duesseldorf', 'pop': 1238},
+                                    {'name': 'Koeln'}]}
+                        ]
+             }
+            ]
+
+
 @pytest.fixture
 def state_data():
     return [
@@ -294,6 +322,41 @@ def test_missing_field(self, author_missing_data):
         expected = DataFrame(ex_data)
         tm.assert_frame_equal(result, expected)
 
+    def test_fill_value(self, author_missing_data, deep_nested_missing):
+        # GH16918
+        # Flat normalization: only the column named in fill_value is filled,
+        # every other missing entry is still expected to be NaN.
+        result = json_normalize(
+            author_missing_data,
+            fill_value={'info.last_updated': '27/06/2019'})
+        ex_data = [
+            {'info': np.nan,
+             'author_name.first': np.nan,
+             'author_name.last_name': np.nan,
+             'info.created_at': np.nan,
+             'info.last_updated': '27/06/2019'},
+            {'info': None,
+             'author_name.first': 'Jane',
+             'author_name.last_name': 'Doe',
+             'info.created_at': '11/08/1993',
+             'info.last_updated': '26/05/2012'}
+        ]
+        expected = DataFrame(ex_data)
+        tm.assert_frame_equal(result, expected)
+
+        # record_path normalization: missing 'pop' / 'name' record fields
+        # are expected to be replaced by their per-column fill values.
+        result = json_normalize(deep_nested_missing, ['states', 'cities'],
+                                meta=['country', ['states', 'name']],
+                                fill_value={'pop': 0, 'name': 'N/A'})
+
+        ex_data = {'country': ['USA'] * 4 + ['Germany'] * 3,
+                   'states.name': ['California', 'California', 'Ohio', 'Ohio',
+                                   'Bayern', 'Nordrhein-Westfalen',
+                                   'Nordrhein-Westfalen'],
+                   'name': ['San Francisco', 'Los Angeles', 'Columbus',
+                            'N/A', 'Munich', 'Duesseldorf', 'Koeln'],
+                   'pop': [12345, 12346, 1234, 1236, 0, 1238, 0]}
+
+        expected = DataFrame(ex_data, columns=result.columns)
+        tm.assert_frame_equal(result, expected)
+
 
 class TestNestedToRecord: