diff --git a/doc/source/io.rst b/doc/source/io.rst index 9442f59425106..0fabfa7077a95 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -1230,6 +1230,37 @@ nanoseconds import os os.remove('test.json') +.. _io.json_normalize: + +Normalization +~~~~~~~~~~~~~ + +.. versionadded:: 0.13.0 + +Pandas provides a utility function to take a dict or list of dicts and *normalize* this semi-structured data +into a flat table. + +.. ipython:: python + + from pandas.io.json import json_normalize + data = [{'state': 'Florida', + 'shortname': 'FL', + 'info': { + 'governor': 'Rick Scott' + }, + 'counties': [{'name': 'Dade', 'population': 12345}, + {'name': 'Broward', 'population': 40000}, + {'name': 'Palm Beach', 'population': 60000}]}, + {'state': 'Ohio', + 'shortname': 'OH', + 'info': { + 'governor': 'John Kasich' + }, + 'counties': [{'name': 'Summit', 'population': 1234}, + {'name': 'Cuyahoga', 'population': 1337}]}] + + json_normalize(data, 'counties', ['state', 'shortname', ['info', 'governor']]) + HTML ---- @@ -1244,7 +1275,7 @@ Reading HTML Content .. _io.read_html: -.. versionadded:: 0.12 +.. versionadded:: 0.12.0 The top-level :func:`~pandas.io.html.read_html` function can accept an HTML string/file/url and will parse HTML tables into list of pandas DataFrames. @@ -1620,7 +1651,7 @@ advanced strategies .. note:: - The prior method of accessing Excel is now deprecated as of 0.12, + The prior method of accessing Excel is now deprecated as of 0.12.0, this will work but will be removed in a future version. .. code-block:: python @@ -2291,7 +2322,7 @@ The default is 50,000 rows returned in a chunk. .. note:: - .. versionadded:: 0.12 + .. versionadded:: 0.12.0 You can also use the iterator with ``read_hdf`` which will open, then automatically close the store when finished iterating. @@ -2580,7 +2611,7 @@ Pass ``min_itemsize`` on the first table creation to a-priori specifiy the minim ``min_itemsize`` can be an integer, or a dict mapping a column name to an integer. You can pass ``values`` as a key to allow all *indexables* or *data_columns* to have this min_itemsize. -Starting in 0.11, passing a ``min_itemsize`` dict will cause all passed columns to be created as *data_columns* automatically. +Starting in 0.11.0, passing a ``min_itemsize`` dict will cause all passed columns to be created as *data_columns* automatically. .. note:: @@ -2860,7 +2891,7 @@ Reading from STATA format .. _io.stata_reader: -.. versionadded:: 0.12 +.. versionadded:: 0.12.0 The top-level function ``read_stata`` will read a dta format file and return a DataFrame: diff --git a/doc/source/release.rst b/doc/source/release.rst index 78236bbf821dd..179e7ff091444 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -169,6 +169,8 @@ Improvements to existing features high-dimensional arrays). - :func:`~pandas.read_html` now supports the ``parse_dates``, ``tupleize_cols`` and ``thousands`` parameters (:issue:`4770`). + - :meth:`~pandas.io.json.json_normalize` is a new method to allow you to create a flat table + from semi-structured JSON data. 
:ref:`See the docs <io.json_normalize>` (:issue:`1067`)

 API Changes
 ~~~~~~~~~~~
diff --git a/doc/source/v0.13.0.txt b/doc/source/v0.13.0.txt
index fe6d796d95968..c6a4c280ca4bb 100644
--- a/doc/source/v0.13.0.txt
+++ b/doc/source/v0.13.0.txt
@@ -490,6 +490,8 @@ Enhancements
   - ``tz_localize`` can infer a fall daylight savings transition based on the structure
     of the unlocalized data (:issue:`4230`), see :ref:`here`
   - DatetimeIndex is now in the API documentation, see :ref:`here`
+  - :meth:`~pandas.io.json.json_normalize` is a new method to allow you to create a flat table
+    from semi-structured JSON data. :ref:`See the docs <io.json_normalize>` (:issue:`1067`)

 .. _whatsnew_0130.experimental:

diff --git a/pandas/io/json.py b/pandas/io/json.py
index e3c85fae045d0..497831f597681 100644
--- a/pandas/io/json.py
+++ b/pandas/io/json.py
@@ -1,6 +1,8 @@
 # pylint: disable-msg=E1101,W0613,W0603
-import os
+import os
+import copy
+from collections import defaultdict

 import numpy as np

 import pandas.json as _json
@@ -15,7 +17,6 @@
 dumps = _json.dumps

 ### interface to/from ###

-
 def to_json(path_or_buf, obj, orient=None, date_format='epoch',
             double_precision=10, force_ascii=True, date_unit='ms'):

@@ -71,7 +72,6 @@ def write(self):
             date_unit=self.date_unit,
             iso_dates=self.date_format == 'iso')

-
 class SeriesWriter(Writer):
     _default_orient = 'index'

@@ -537,3 +537,201 @@ def is_ok(col):
                 lambda col, c: self._try_convert_to_date(c),
                 lambda col, c: ((self.keep_default_dates and is_ok(col))
                                 or col in convert_dates))
+
+
+#----------------------------------------------------------------------
+# JSON normalization routines
+
+def nested_to_record(ds, prefix="", level=0):
+    """A simplified json_normalize.
+
+    Converts a nested dict into a flat dict ("record"); unlike
+    json_normalize, it does not attempt to extract a subset of the data.
+
+    Parameters
+    ----------
+    ds : dict or list of dicts
+    prefix : string, default ""
+        the current key prefix (used on recursive calls)
+    level : int, default 0
+        the current nesting depth (used on recursive calls)
+
+    Returns
+    -------
+    d : dict or list of dicts, matching `ds`
+
+    Examples
+    --------
+    >>> nested_to_record(dict(flat1=1, dict1=dict(c=1, d=2),
+    ...                       nested=dict(e=dict(c=1, d=2), d=2)))
+    {'dict1.c': 1,
+     'dict1.d': 2,
+     'flat1': 1,
+     'nested.d': 2,
+     'nested.e.c': 1,
+     'nested.e.d': 2}
+    """
+    singleton = False
+    if isinstance(ds, dict):
+        ds = [ds]
+        singleton = True
+
+    new_ds = []
+    for d in ds:
+
+        new_d = copy.deepcopy(d)
+        for k, v in d.items():
+            # each key gets renamed with prefix
+            if level == 0:
+                newkey = str(k)
+            else:
+                newkey = prefix + '.' + str(k)
+
+            # only dicts get recursively flattened;
+            # only below the top level do we rename the rest of the keys
+            if not isinstance(v, dict):
+                if level != 0:  # so we skip copying for top level, common case
+                    v = new_d.pop(k)
+                    new_d[newkey] = v
+                continue
+            else:
+                v = new_d.pop(k)
+                new_d.update(nested_to_record(v, newkey, level + 1))
+        new_ds.append(new_d)
+
+    if singleton:
+        return new_ds[0]
+    return new_ds
+
+
+def json_normalize(data, record_path=None, meta=None,
+                   meta_prefix=None,
+                   record_prefix=None):
+    """
+    "Normalize" semi-structured JSON data into a flat table
+
+    Parameters
+    ----------
+    data : dict or list of dicts
+        Unserialized JSON objects
+    record_path : string or list of strings, default None
+        Path in each object to list of records. If not passed, data will be
+        assumed to be an array of records
+    meta : list of paths (string or list of strings)
+        Fields to use as metadata for each record in resulting table
+    record_prefix : string, default None
+        If not None, prepended to the column names of the record data,
+        e.g. pass 'foo.bar.' to get columns like foo.bar.field when the
+        path to the records is ['foo', 'bar']
+    meta_prefix : string, default None
+        If not None, prepended to the column names of the meta fields
+
+    Examples
+    --------
+    data = [{'state': 'Florida',
+             'shortname': 'FL',
+             'info': {
+                  'governor': 'Rick Scott'
+             },
+             'counties': [{'name': 'Dade', 'population': 12345},
+                          {'name': 'Broward', 'population': 40000},
+                          {'name': 'Palm Beach', 'population': 60000}]},
+            {'state': 'Ohio',
+             'shortname': 'OH',
+             'info': {
+                  'governor': 'John Kasich'
+             },
+             'counties': [{'name': 'Summit', 'population': 1234},
+                          {'name': 'Cuyahoga', 'population': 1337}]}]
+
+    result = json_normalize(data, 'counties', ['state', 'shortname',
+                                               ['info', 'governor']])
+
+    (abbreviated output)
+
+    state      governor
+    Florida    Rick Scott
+
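+    If ``record_path`` is omitted, each object is instead flattened
+    naively with ``nested_to_record`` (a sketch of the expected
+    behaviour, not verbatim output):
+
+    result = json_normalize(data)
+
+    This yields one row per state; the nested dict becomes the dotted
+    column 'info.governor', and list-valued fields such as 'counties'
+    are carried along unmodified.
+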
+    Returns
+    -------
+    frame : DataFrame
+    """
+    def _pull_field(js, spec):
+        # walk `spec` (a key or a list of keys) down into the object
+        result = js
+        if isinstance(spec, list):
+            for field in spec:
+                result = result[field]
+        else:
+            result = result[spec]
+
+        return result
+
+    # A bit of a hackjob
+    if isinstance(data, dict):
+        data = [data]
+
+    if record_path is None:
+        if any(isinstance(x, dict) for x in compat.itervalues(data[0])):
+            # naive normalization, this is idempotent for flat records
+            # and potentially will inflate the data considerably for
+            # deeply nested structures:
+            #   {VeryLong: {b: 1, c: 2}} -> {VeryLong.b: 1, VeryLong.c: 2}
+            #
+            # TODO: handle record values which are lists, at least error
+            #       reasonably
+            data = nested_to_record(data)
+        return DataFrame(data)
+    elif not isinstance(record_path, list):
+        record_path = [record_path]
+
+    if meta is None:
+        meta = []
+    elif not isinstance(meta, list):
+        meta = [meta]
+
+    # normalize each meta entry to a list of keys (a path)
+    for i, x in enumerate(meta):
+        if not isinstance(x, list):
+            meta[i] = [x]
+
+    # Disastrously inefficient for now
+    records = []
+    lengths = []
+
+    meta_vals = defaultdict(list)
+    meta_keys = ['.'.join(val) for val in meta]
+
+    def _recursive_extract(data, path, seen_meta, level=0):
+        if len(path) > 1:
+            # still above the record level: remember the meta fields
+            # that live at this level, then descend along the path
+            for obj in data:
+                for val, key in zip(meta, meta_keys):
+                    if level + 1 == len(val):
+                        seen_meta[key] = _pull_field(obj, val[-1])
+
+                _recursive_extract(obj[path[0]], path[1:],
+                                   seen_meta, level=level + 1)
+        else:
+            for obj in data:
+                recs = _pull_field(obj, path[0])
+
+                # For repeating the metadata later
+                lengths.append(len(recs))
+
+                for val, key in zip(meta, meta_keys):
+                    if level + 1 > len(val):
+                        meta_val = seen_meta[key]
+                    else:
+                        meta_val = _pull_field(obj, val[level:])
+                    meta_vals[key].append(meta_val)
+
+                records.extend(recs)
+
+    _recursive_extract(data, record_path, {}, level=0)
+
+    result = DataFrame(records)
+
+    if record_prefix is not None:
+        result.rename(columns=lambda x: record_prefix + x, inplace=True)
+
+    # Data types, a problem
+    for k, v in compat.iteritems(meta_vals):
+        if meta_prefix is not None:
+            k = meta_prefix + k
+
+        if k in result:
+            raise ValueError('Conflicting metadata name %s, '
+                             'need distinguishing prefix ' % k)
+
+        result[k] = np.array(v).repeat(lengths)
+
+    return result
diff --git a/pandas/io/tests/test_json_norm.py b/pandas/io/tests/test_json_norm.py
new file mode 100644
index 0000000000000..e96a89e71f12d
--- /dev/null
+++ b/pandas/io/tests/test_json_norm.py
@@ -0,0 +1,208 @@
+import nose
+import unittest
+
+from pandas import DataFrame
+import numpy as np
+
+import pandas.util.testing as tm
+
+from pandas.io.json import json_normalize, nested_to_record
+
+
+def _assert_equal_data(left, right):
+    if not left.columns.equals(right.columns):
+        left = left.reindex(columns=right.columns)
+
+    tm.assert_frame_equal(left, right)
+
+
+class TestJSONNormalize(unittest.TestCase):
+
+    def setUp(self):
+        # fixture shared by the tests: two states, each with nested
+        # 'info' metadata and a list of county records
+        self.state_data = [
+            
{'counties': [{'name': 'Dade', 'population': 12345}, + {'name': 'Broward', 'population': 40000}, + {'name': 'Palm Beach', 'population': 60000}], + 'info': {'governor': 'Rick Scott'}, + 'shortname': 'FL', + 'state': 'Florida'}, + {'counties': [{'name': 'Summit', 'population': 1234}, + {'name': 'Cuyahoga', 'population': 1337}], + 'info': {'governor': 'John Kasich'}, + 'shortname': 'OH', + 'state': 'Ohio'}] + + def test_simple_records(self): + recs = [{'a': 1, 'b': 2, 'c': 3}, + {'a': 4, 'b': 5, 'c': 6}, + {'a': 7, 'b': 8, 'c': 9}, + {'a': 10, 'b': 11, 'c': 12}] + + result = json_normalize(recs) + expected = DataFrame(recs) + + tm.assert_frame_equal(result, expected) + + def test_simple_normalize(self): + result = json_normalize(self.state_data[0], 'counties') + expected = DataFrame(self.state_data[0]['counties']) + tm.assert_frame_equal(result, expected) + + result = json_normalize(self.state_data, 'counties') + + expected = [] + for rec in self.state_data: + expected.extend(rec['counties']) + expected = DataFrame(expected) + + tm.assert_frame_equal(result, expected) + + result = json_normalize(self.state_data, 'counties', meta='state') + expected['state'] = np.array(['Florida', 'Ohio']).repeat([3, 2]) + + tm.assert_frame_equal(result, expected) + + def test_more_deeply_nested(self): + data = [{'country': 'USA', + 'states': [{'name': 'California', + 'cities': [{'name': 'San Francisco', + 'pop': 12345}, + {'name': 'Los Angeles', + 'pop': 12346}] + }, + {'name': 'Ohio', + 'cities': [{'name': 'Columbus', + 'pop': 1234}, + {'name': 'Cleveland', + 'pop': 1236}]} + ] + }, + {'country': 'Germany', + 'states': [{'name': 'Bayern', + 'cities': [{'name': 'Munich', 'pop': 12347}] + }, + {'name': 'Nordrhein-Westfalen', + 'cities': [{'name': 'Duesseldorf', 'pop': 1238}, + {'name': 'Koeln', 'pop': 1239}]} + ] + } + ] + + result = json_normalize(data, ['states', 'cities'], + meta=['country', ['states', 'name']]) + # meta_prefix={'states': 'state_'}) + + ex_data = {'country': ['USA'] * 4 + ['Germany'] * 3, + 'states.name': ['California', 'California', 'Ohio', 'Ohio', + 'Bayern', 'Nordrhein-Westfalen', + 'Nordrhein-Westfalen'], + 'name': ['San Francisco', 'Los Angeles', 'Columbus', + 'Cleveland', 'Munich', 'Duesseldorf', 'Koeln'], + 'pop': [12345, 12346, 1234, 1236, 12347, 1238, 1239]} + + expected = DataFrame(ex_data, columns=result.columns) + tm.assert_frame_equal(result, expected) + + def test_shallow_nested(self): + data = [{'state': 'Florida', + 'shortname': 'FL', + 'info': { + 'governor': 'Rick Scott' + }, + 'counties': [{'name': 'Dade', 'population': 12345}, + {'name': 'Broward', 'population': 40000}, + {'name': 'Palm Beach', 'population': 60000}]}, + {'state': 'Ohio', + 'shortname': 'OH', + 'info': { + 'governor': 'John Kasich' + }, + 'counties': [{'name': 'Summit', 'population': 1234}, + {'name': 'Cuyahoga', 'population': 1337}]}] + + result = json_normalize(data, 'counties', + ['state', 'shortname', + ['info', 'governor']]) + ex_data = {'name': ['Dade', 'Broward', 'Palm Beach', 'Summit', + 'Cuyahoga'], + 'state': ['Florida'] * 3 + ['Ohio'] * 2, + 'shortname': ['FL', 'FL', 'FL', 'OH', 'OH'], + 'info.governor': ['Rick Scott'] * 3 + ['John Kasich'] * 2, + 'population': [12345, 40000, 60000, 1234, 1337]} + expected = DataFrame(ex_data, columns=result.columns) + tm.assert_frame_equal(result, expected) + + def test_meta_name_conflict(self): + data = [{'foo': 'hello', + 'bar': 'there', + 'data': [{'foo': 'something', 'bar': 'else'}, + {'foo': 'something2', 'bar': 'else2'}]}] + + 
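# 'foo' and 'bar' occur both inside the records and as requested meta
+        # fields, so json_normalize raises unless a distinguishing
+        # meta_prefix is supplied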
+        self.assertRaises(ValueError, json_normalize, data,
+                          'data', meta=['foo', 'bar'])
+
+        result = json_normalize(data, 'data', meta=['foo', 'bar'],
+                                meta_prefix='meta')
+
+        for val in ['metafoo', 'metabar', 'foo', 'bar']:
+            self.assertTrue(val in result)
+
+    def test_record_prefix(self):
+        result = json_normalize(self.state_data[0], 'counties')
+        expected = DataFrame(self.state_data[0]['counties'])
+        tm.assert_frame_equal(result, expected)
+
+        result = json_normalize(self.state_data, 'counties',
+                                meta='state',
+                                record_prefix='county_')
+
+        expected = []
+        for rec in self.state_data:
+            expected.extend(rec['counties'])
+        expected = DataFrame(expected)
+        expected = expected.rename(columns=lambda x: 'county_' + x)
+        expected['state'] = np.array(['Florida', 'Ohio']).repeat([3, 2])
+
+        tm.assert_frame_equal(result, expected)
+
+
+class TestNestedToRecord(unittest.TestCase):
+
+    def test_flat_stays_flat(self):
+        recs = [dict(flat1=1, flat2=2),
+                dict(flat1=3, flat2=4),
+                ]
+
+        result = nested_to_record(recs)
+        expected = recs
+        self.assertEqual(result, expected)
+
+    def test_one_level_deep_flattens(self):
+        data = dict(flat1=1,
+                    dict1=dict(c=1, d=2))
+
+        result = nested_to_record(data)
+        expected = {'dict1.c': 1,
+                    'dict1.d': 2,
+                    'flat1': 1}
+
+        self.assertEqual(result, expected)
+
+    def test_nested_flattens(self):
+        data = dict(flat1=1,
+                    dict1=dict(c=1, d=2),
+                    nested=dict(e=dict(c=1, d=2),
+                                d=2))
+
+        result = nested_to_record(data)
+        expected = {'dict1.c': 1,
+                    'dict1.d': 2,
+                    'flat1': 1,
+                    'nested.d': 2,
+                    'nested.e.c': 1,
+                    'nested.e.d': 2}
+
+        self.assertEqual(result, expected)
+
+
+if __name__ == '__main__':
+    nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb',
+                         '--pdb-failure', '-s'], exit=False)