
Commit a2c8f44

Merge pull request #4007 from pydata/PR_json-normalize-proto
Basic JSON normalization/flattening
2 parents bc6787a + e796c27

6 files changed: +461 -8 lines changed

doc/source/io.rst

+36 -5

@@ -1230,6 +1230,37 @@ nanoseconds
    import os
    os.remove('test.json')
 
+.. _io.json_normalize:
+
+Normalization
+~~~~~~~~~~~~~
+
+.. versionadded:: 0.13.0
+
+Pandas provides a utility function to take a dict or list of dicts and *normalize* this semi-structured data
+into a flat table.
+
+.. ipython:: python
+
+   from pandas.io.json import json_normalize
+   data = [{'state': 'Florida',
+            'shortname': 'FL',
+            'info': {
+                 'governor': 'Rick Scott'
+            },
+            'counties': [{'name': 'Dade', 'population': 12345},
+                         {'name': 'Broward', 'population': 40000},
+                         {'name': 'Palm Beach', 'population': 60000}]},
+           {'state': 'Ohio',
+            'shortname': 'OH',
+            'info': {
+                 'governor': 'John Kasich'
+            },
+            'counties': [{'name': 'Summit', 'population': 1234},
+                         {'name': 'Cuyahoga', 'population': 1337}]}]
+
+   json_normalize(data, 'counties', ['state', 'shortname', ['info', 'governor']])
+
 HTML
 ----
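
As an aside, and not part of the diff: the ipython block above shows only the call, not its result. The sketch below runs the same example, assuming a pandas build where ``json_normalize`` still lives in ``pandas.io.json`` as introduced by this commit; it produces one row per county, with the requested metadata (state, shortname, and the nested ['info', 'governor'] path) repeated next to each record's name and population.

    # Sketch only -- not part of the commit. The import path assumes the
    # 0.13-era layout added here (later pandas exposes pd.json_normalize).
    from pandas.io.json import json_normalize

    data = [{'state': 'Florida',
             'shortname': 'FL',
             'info': {'governor': 'Rick Scott'},
             'counties': [{'name': 'Dade', 'population': 12345},
                          {'name': 'Broward', 'population': 40000},
                          {'name': 'Palm Beach', 'population': 60000}]},
            {'state': 'Ohio',
             'shortname': 'OH',
             'info': {'governor': 'John Kasich'},
             'counties': [{'name': 'Summit', 'population': 1234},
                          {'name': 'Cuyahoga', 'population': 1337}]}]

    # One flattened row per county; the nested meta path becomes 'info.governor'
    flat = json_normalize(data, 'counties',
                          ['state', 'shortname', ['info', 'governor']])
    print(flat)  # columns: name, population, state, shortname, info.governor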

@@ -1244,7 +1275,7 @@ Reading HTML Content
 
 .. _io.read_html:
 
-.. versionadded:: 0.12
+.. versionadded:: 0.12.0
 
 The top-level :func:`~pandas.io.html.read_html` function can accept an HTML
 string/file/url and will parse HTML tables into list of pandas DataFrames.
 
@@ -1620,7 +1651,7 @@ advanced strategies
 
 .. note::
 
-   The prior method of accessing Excel is now deprecated as of 0.12,
+   The prior method of accessing Excel is now deprecated as of 0.12.0,
    this will work but will be removed in a future version.
 
 .. code-block:: python
 
@@ -2291,7 +2322,7 @@ The default is 50,000 rows returned in a chunk.
 
 .. note::
 
-   .. versionadded:: 0.12
+   .. versionadded:: 0.12.0
 
    You can also use the iterator with ``read_hdf`` which will open, then
    automatically close the store when finished iterating.
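
To make the iterator note above concrete, here is a rough sketch, not part of the diff: it assumes PyTables is installed, that the store is written in table format, and that ``read_hdf`` forwards ``chunksize`` to the underlying ``select`` (the file name 'store.h5' is arbitrary).

    # Sketch only -- not part of the commit; requires PyTables.
    import numpy as np
    import pandas as pd

    df = pd.DataFrame({'A': np.arange(10), 'B': np.arange(10) * 2.0})
    df.to_hdf('store.h5', 'df', format='table')  # table format supports iteration

    # read_hdf opens the store, yields chunks, and closes it when iteration ends
    for chunk in pd.read_hdf('store.h5', 'df', chunksize=4):
        print(len(chunk))
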
@@ -2580,7 +2611,7 @@ Pass ``min_itemsize`` on the first table creation to a-priori specifiy the minim
 ``min_itemsize`` can be an integer, or a dict mapping a column name to an integer. You can pass ``values`` as a key to
 allow all *indexables* or *data_columns* to have this min_itemsize.
 
-Starting in 0.11, passing a ``min_itemsize`` dict will cause all passed columns to be created as *data_columns* automatically.
+Starting in 0.11.0, passing a ``min_itemsize`` dict will cause all passed columns to be created as *data_columns* automatically.
 
 .. note::
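
A brief sketch of the ``min_itemsize`` behaviour described above, again not part of the diff; PyTables is required, and the frame, file name and column width are made up for illustration.

    # Sketch only -- not part of the commit; requires PyTables.
    import pandas as pd

    df = pd.DataFrame({'A': ['foo', 'bar'], 'B': [1, 2]})
    store = pd.HDFStore('store_min.h5')
    # Reserve 30 characters for strings in column 'A' on first creation;
    # passing a dict also makes 'A' a data_column automatically (0.11.0+).
    store.append('df', df, min_itemsize={'A': 30})
    store.close()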

@@ -2860,7 +2891,7 @@ Reading from STATA format
 
 .. _io.stata_reader:
 
-.. versionadded:: 0.12
+.. versionadded:: 0.12.0
 
 The top-level function ``read_stata`` will read a dta format file
 and return a DataFrame:

doc/source/release.rst

+2

@@ -169,6 +169,8 @@ Improvements to existing features
     high-dimensional arrays).
   - :func:`~pandas.read_html` now supports the ``parse_dates``,
     ``tupleize_cols`` and ``thousands`` parameters (:issue:`4770`).
+  - :meth:`~pandas.io.json.json_normalize` is a new method to allow you to create a flat table
+    from semi-structured JSON data. :ref:`See the docs<io.json_normalize>` (:issue:`1067`)
 
 API Changes
 ~~~~~~~~~~~

doc/source/v0.13.0.txt

+2

@@ -490,6 +490,8 @@ Enhancements
   - ``tz_localize`` can infer a fall daylight savings transition based on the structure
     of the unlocalized data (:issue:`4230`), see :ref:`here<timeseries.timezone>`
   - DatetimeIndex is now in the API documentation, see :ref:`here<api.datetimeindex>`
+  - :meth:`~pandas.io.json.json_normalize` is a new method to allow you to create a flat table
+    from semi-structured JSON data. :ref:`See the docs<io.json_normalize>` (:issue:`1067`)
 
 .. _whatsnew_0130.experimental:

pandas/io/json.py

+201 -3

@@ -1,6 +1,8 @@
 # pylint: disable-msg=E1101,W0613,W0603
-import os
 
+import os
+import copy
+from collections import defaultdict
 import numpy as np
 
 import pandas.json as _json
 
@@ -15,7 +17,6 @@
 dumps = _json.dumps
 ### interface to/from ###
 
-
 def to_json(path_or_buf, obj, orient=None, date_format='epoch',
             double_precision=10, force_ascii=True, date_unit='ms'):
 
@@ -71,7 +72,6 @@ def write(self):
                          date_unit=self.date_unit,
                          iso_dates=self.date_format == 'iso')
 
-
 class SeriesWriter(Writer):
     _default_orient = 'index'
 
@@ -537,3 +537,201 @@ def is_ok(col):
             lambda col, c: self._try_convert_to_date(c),
             lambda col, c: ((self.keep_default_dates and is_ok(col))
                             or col in convert_dates))
+
+
+#----------------------------------------------------------------------
+# JSON normalization routines
+
+def nested_to_record(ds, prefix="", level=0):
+    """a simplified json_normalize
+
+    converts a nested dict into a flat dict ("record"); unlike json_normalize,
+    it does not attempt to extract a subset of the data.
+
+    Parameters
+    ----------
+    ds : dict or list of dicts
+
+    Returns
+    -------
+    d - dict or list of dicts, matching `ds`
+
+    Example:
+    In [52]: nested_to_record(dict(flat1=1, dict1=dict(c=1, d=2), nested=dict(e=dict(c=1, d=2), d=2)))
+    Out[52]:
+    {'dict1.c': 1,
+     'dict1.d': 2,
+     'flat1': 1,
+     'nested.d': 2,
+     'nested.e.c': 1,
+     'nested.e.d': 2}
+    """
+    singleton = False
+    if isinstance(ds, dict):
+        ds = [ds]
+        singleton = True
+
+    new_ds = []
+    for d in ds:
+
+        new_d = copy.deepcopy(d)
+        for k, v in d.items():
+            # each key gets renamed with prefix
+            if level == 0:
+                newkey = str(k)
+            else:
+                newkey = prefix + '.' + str(k)
+
+            # only dicts get recurse-flattened
+            # only at level>1 do we rename the rest of the keys
+            if not isinstance(v, dict):
+                if level != 0:  # so we skip copying for top level, common case
+                    v = new_d.pop(k)
+                    new_d[newkey] = v
+                continue
+            else:
+                v = new_d.pop(k)
+                new_d.update(nested_to_record(v, newkey, level + 1))
+        new_ds.append(new_d)
+
+    if singleton:
+        return new_ds[0]
+    return new_ds
+
+
+def json_normalize(data, record_path=None, meta=None,
+                   meta_prefix=None,
+                   record_prefix=None):
+    """
+    "Normalize" semi-structured JSON data into a flat table
+
+    Parameters
+    ----------
+    data : dict or list of dicts
+        Unserialized JSON objects
+    record_path : string or list of strings, default None
+        Path in each object to list of records. If not passed, data will be
+        assumed to be an array of records
+    meta : list of paths (string or list of strings)
+        Fields to use as metadata for each record in resulting table
+    record_prefix : string, default None
+        If True, prefix records with dotted (?) path, e.g. foo.bar.field if
+        path to records is ['foo', 'bar']
+    meta_prefix : string, default None
+
+    Examples
+    --------
+    data = [{'state': 'Florida',
+             'shortname': 'FL',
+             'info': {
+                  'governor': 'Rick Scott'
+             },
+             'counties': [{'name': 'Dade', 'population': 12345},
+                          {'name': 'Broward', 'population': 40000},
+                          {'name': 'Palm Beach', 'population': 60000}]},
+            {'state': 'Ohio',
+             'shortname': 'OH',
+             'info': {
+                  'governor': 'John Kasich'
+             },
+             'counties': [{'name': 'Summit', 'population': 1234},
+                          {'name': 'Cuyahoga', 'population': 1337}]}]
+
+    result = json_normalize(data, 'counties', ['state', 'shortname',
+                                               ['info', 'governor']])
+
+       state      governor
+     Florida    Rick Scott
+
+
+    Returns
+    -------
+    frame : DataFrame
+    """
+    def _pull_field(js, spec):
+        result = js
+        if isinstance(spec, list):
+            for field in spec:
+                result = result[field]
+        else:
+            result = result[spec]
+
+        return result
+
+    # A bit of a hackjob
+    if isinstance(data, dict):
+        data = [data]
+
+    if record_path is None:
+        if any([isinstance(x, dict) for x in compat.itervalues(data[0])]):
+            # naive normalization, this is idempotent for flat records
+            # and potentially will inflate the data considerably for
+            # deeply nested structures:
+            #   {VeryLong: {b: 1, c: 2}} -> {VeryLong.b: 1, VeryLong.c: 2}
+            #
+            # TODO: handle record values which are lists, at least error reasonably
+            data = nested_to_record(data)
+
+        return DataFrame(data)
+    elif not isinstance(record_path, list):
+        record_path = [record_path]
+
+    if meta is None:
+        meta = []
+    elif not isinstance(meta, list):
+        meta = [meta]
+
+    for i, x in enumerate(meta):
+        if not isinstance(x, list):
+            meta[i] = [x]
+
+    # Disastrously inefficient for now
+    records = []
+    lengths = []
+
+    meta_vals = defaultdict(list)
+    meta_keys = ['.'.join(val) for val in meta]
+
+    def _recursive_extract(data, path, seen_meta, level=0):
+        if len(path) > 1:
+            for obj in data:
+                for val, key in zip(meta, meta_keys):
+                    if level + 1 == len(val):
+                        seen_meta[key] = _pull_field(obj, val[-1])
+
+                _recursive_extract(obj[path[0]], path[1:],
+                                   seen_meta, level=level + 1)
+        else:
+            for obj in data:
+                recs = _pull_field(obj, path[0])
+
+                # For repeating the metadata later
+                lengths.append(len(recs))
+
+                for val, key in zip(meta, meta_keys):
+                    if level + 1 > len(val):
+                        meta_val = seen_meta[key]
+                    else:
+                        meta_val = _pull_field(obj, val[level:])
+                    meta_vals[key].append(meta_val)
+
+                records.extend(recs)
+
+    _recursive_extract(data, record_path, {}, level=0)
+
+    result = DataFrame(records)
+
+    if record_prefix is not None:
+        result.rename(columns=lambda x: record_prefix + x, inplace=True)
+
+    # Data types, a problem
+    for k, v in compat.iteritems(meta_vals):
+        if meta_prefix is not None:
+            k = meta_prefix + k
+
+        if k in result:
+            raise ValueError('Conflicting metadata name %s, '
+                             'need distinguishing prefix ' % k)
+
+        result[k] = np.array(v).repeat(lengths)
+
+    return result
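
For orientation, and not part of the commit itself: a small sketch of how the two new helpers behave. The import location assumes the 0.13-era layout added above, and the sample document plus the 'county.'/'state.' prefixes are invented for illustration.

    # Sketch only -- not part of the commit.
    from pandas.io.json import json_normalize, nested_to_record

    # nested_to_record flattens a single dict, joining nested keys with dots
    rec = nested_to_record({'flat1': 1,
                            'dict1': {'c': 1, 'd': 2},
                            'nested': {'e': {'c': 1, 'd': 2}, 'd': 2}})
    # -> {'flat1': 1, 'dict1.c': 1, 'dict1.d': 2,
    #     'nested.d': 2, 'nested.e.c': 1, 'nested.e.d': 2}

    # json_normalize raises ValueError when a metadata name collides with a
    # record column; record_prefix/meta_prefix sidestep the collision.
    doc = {'name': 'Florida',
           'info': {'governor': 'Rick Scott'},
           'counties': [{'name': 'Dade', 'population': 12345},
                        {'name': 'Broward', 'population': 40000}]}
    frame = json_normalize(doc, 'counties', ['name', ['info', 'governor']],
                           record_prefix='county.', meta_prefix='state.')
    # columns: county.name, county.population, state.name, state.info.governor
    print(frame)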
