Skip to content

CLN: reorg pandas/io/json to sub-dirs #15322

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions doc/source/whatsnew/v0.20.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,9 @@ support for bz2 compression in the python 2 c-engine improved (:issue:`14874`).

.. _whatsnew_0200.enhancements.uint64_support:

UInt64 Support Improved
^^^^^^^^^^^^^^^^^^^^^^^

Pandas has significantly improved support for operations involving unsigned,
or purely non-negative, integers. Previously, handling these integers would
result in improper rounding or data-type casting, leading to incorrect results.
Expand Down
4 changes: 4 additions & 0 deletions pandas/io/json/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Re-export the JSON reader/writer helpers and json_normalize at package level.
from .json import to_json, read_json, loads, dumps # noqa
from .normalize import json_normalize # noqa

# Remove the submodule names `json` and `normalize` from this namespace so
# that only the functions imported above remain bound here.
del json, normalize # noqa
246 changes: 1 addition & 245 deletions pandas/io/json.py → pandas/io/json/json.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
# pylint: disable-msg=E1101,W0613,W0603

import os
import copy
from collections import defaultdict
import numpy as np

import pandas.json as _json
Expand All @@ -13,6 +11,7 @@
from pandas.io.common import get_filepath_or_buffer, _get_handle
from pandas.core.common import AbstractMethodError
from pandas.formats.printing import pprint_thing
from .normalize import _convert_to_line_delimits

# Module-level aliases for the loads/dumps functions of pandas.json
# (imported above as _json).
loads = _json.loads
dumps = _json.dumps
Expand Down Expand Up @@ -641,246 +640,3 @@ def is_ok(col):
lambda col, c: self._try_convert_to_date(c),
lambda col, c: ((self.keep_default_dates and is_ok(col)) or
col in convert_dates))

# ---------------------------------------------------------------------
# JSON normalization routines


def _convert_to_line_delimits(s):
"""Helper function that converts json lists to line delimited json."""

# Determine we have a JSON list to turn to lines otherwise just return the
# json object, only lists can
if not s[0] == '[' and s[-1] == ']':
return s
s = s[1:-1]

from pandas.lib import convert_json_to_lines
return convert_json_to_lines(s)


def nested_to_record(ds, prefix="", level=0):
    """a simplified json_normalize

    converts a nested dict into a flat dict ("record"), unlike json_normalize,
    it does not attempt to extract a subset of the data.

    Parameters
    ----------
    ds : dict or list of dicts
    prefix : string, optional, default: ""
        dotted path of the enclosing keys; prepended (with a '.') to every
        key when ``level`` is not 0
    level : int, optional, default: 0
        current nesting depth in the json structure; 0 means top level,
        where keys are not prefixed

    Returns
    -------
    d - dict or list of dicts, matching `ds`

    Notes
    -----
    The input is not modified; each dict is deep-copied before flattening.
    Top-level values that are not dicts are left in place under their
    original key.

    Examples
    --------

    IN[52]: nested_to_record(dict(flat1=1,dict1=dict(c=1,d=2),
                                  nested=dict(e=dict(c=1,d=2),d=2)))
    Out[52]:
    {'dict1.c': 1,
     'dict1.d': 2,
     'flat1': 1,
     'nested.d': 2,
     'nested.e.c': 1,
     'nested.e.d': 2}
    """
    singleton = False
    if isinstance(ds, dict):
        ds = [ds]
        singleton = True

    new_ds = []
    for d in ds:

        new_d = copy.deepcopy(d)
        for orig_key, v in d.items():
            # each key gets renamed with prefix; the untouched key is kept
            # around because ``new_d`` is still indexed by it
            k = orig_key
            if not isinstance(k, compat.string_types):
                k = str(k)
            if level == 0:
                newkey = k
            else:
                newkey = prefix + '.' + k

            # only dicts get recurse-flattened
            # only at level>1 do we rename the rest of the keys
            if not isinstance(v, dict):
                if level != 0:  # so we skip copying for top level, common case
                    # pop by the original key: popping the stringified key
                    # raised KeyError for non-string keys
                    v = new_d.pop(orig_key)
                    new_d[newkey] = v
                continue

            v = new_d.pop(orig_key)
            new_d.update(nested_to_record(v, newkey, level + 1))
        new_ds.append(new_d)

    if singleton:
        return new_ds[0]
    return new_ds


def json_normalize(data, record_path=None, meta=None,
                   meta_prefix=None,
                   record_prefix=None,
                   errors='raise'):

    """
    "Normalize" semi-structured JSON data into a flat table

    Parameters
    ----------
    data : dict or list of dicts
        Unserialized JSON objects
    record_path : string or list of strings, default None
        Path in each object to list of records. If not passed, data will be
        assumed to be an array of records
    meta : list of paths (string or list of strings), default None
        Fields to use as metadata for each record in resulting table.
        The passed list is not modified.
    meta_prefix : string, default None
        If not None, prepended to the name of every metadata column
    record_prefix : string, default None
        If not None, prepended to the name of every record column, e.g.
        pass 'foo.bar.' to get foo.bar.field when the path to records is
        ['foo', 'bar']
    errors : {'raise', 'ignore'}, default 'raise'

        * ignore : will ignore KeyError if keys listed in meta are not
          always present
        * raise : will raise KeyError if keys listed in meta are not
          always present

        .. versionadded:: 0.20.0

    Returns
    -------
    frame : DataFrame

    Raises
    ------
    KeyError
        If a key listed in meta is missing from an object and errors is
        not 'ignore'
    ValueError
        If a (prefixed) metadata name is already a column of the records

    Examples
    --------

    >>> data = [{'state': 'Florida',
    ...          'shortname': 'FL',
    ...          'info': {
    ...               'governor': 'Rick Scott'
    ...          },
    ...          'counties': [{'name': 'Dade', 'population': 12345},
    ...                      {'name': 'Broward', 'population': 40000},
    ...                      {'name': 'Palm Beach', 'population': 60000}]},
    ...         {'state': 'Ohio',
    ...          'shortname': 'OH',
    ...          'info': {
    ...               'governor': 'John Kasich'
    ...          },
    ...          'counties': [{'name': 'Summit', 'population': 1234},
    ...                       {'name': 'Cuyahoga', 'population': 1337}]}]
    >>> from pandas.io.json import json_normalize
    >>> result = json_normalize(data, 'counties', ['state', 'shortname',
    ...                                           ['info', 'governor']])
    >>> result
             name  population info.governor    state shortname
    0        Dade       12345    Rick Scott  Florida        FL
    1     Broward       40000    Rick Scott  Florida        FL
    2  Palm Beach       60000    Rick Scott  Florida        FL
    3      Summit        1234   John Kasich     Ohio        OH
    4    Cuyahoga        1337   John Kasich     Ohio        OH

    """
    def _pull_field(js, spec):
        # Walk ``js`` along ``spec``: a list is a path of successive keys,
        # anything else is used as a single key.
        result = js
        if isinstance(spec, list):
            for field in spec:
                result = result[field]
        else:
            result = result[spec]

        return result

    # A bit of a hackjob
    if isinstance(data, dict):
        data = [data]

    if record_path is None:
        # Look at every record, not just the first one, so the outcome does
        # not depend on record order (and an empty list does not raise).
        if any(isinstance(x, dict)
               for rec in data for x in compat.itervalues(rec)):
            # naive normalization, this is idempotent for flat records
            # and potentially will inflate the data considerably for
            # deeply nested structures:
            #  {VeryLong: { b: 1,c:2}} -> {VeryLong.b:1 ,VeryLong.c:2}
            #
            # TODO: handle record value which are lists, at least error
            #       reasonably
            data = nested_to_record(data)
        return DataFrame(data)
    elif not isinstance(record_path, list):
        record_path = [record_path]

    if meta is None:
        meta = []
    elif not isinstance(meta, list):
        meta = [meta]

    # Normalize every meta entry to a path (list of keys).  A new list is
    # built so that the list object passed by the caller is left untouched.
    meta = [m if isinstance(m, list) else [m] for m in meta]

    # Disastrously inefficient for now
    records = []
    lengths = []

    meta_vals = defaultdict(list)
    meta_keys = ['.'.join(val) for val in meta]

    def _recursive_extract(data, path, seen_meta, level=0):
        if len(path) > 1:
            # Not at the records yet: remember metadata that lives at this
            # depth, then descend one step along the record path.
            for obj in data:
                for val, key in zip(meta, meta_keys):
                    if level + 1 == len(val):
                        seen_meta[key] = _pull_field(obj, val[-1])

                _recursive_extract(obj[path[0]], path[1:],
                                   seen_meta, level=level + 1)
        else:
            for obj in data:
                recs = _pull_field(obj, path[0])

                # For repeating the metadata later
                lengths.append(len(recs))

                for val, key in zip(meta, meta_keys):
                    if level + 1 > len(val):
                        # metadata was collected at a shallower level
                        meta_val = seen_meta[key]
                    else:
                        try:
                            meta_val = _pull_field(obj, val[level:])
                        except KeyError as e:
                            if errors == 'ignore':
                                meta_val = np.nan
                            else:
                                # interpolate the key into the message;
                                # passing it as a second argument left the
                                # '%s' placeholder unformatted
                                raise KeyError("Try running with "
                                               "errors='ignore' as key "
                                               "%s is not always present"
                                               % e)
                    meta_vals[key].append(meta_val)

                records.extend(recs)

    _recursive_extract(data, record_path, {}, level=0)

    result = DataFrame(records)

    if record_prefix is not None:
        result.rename(columns=lambda x: record_prefix + x, inplace=True)

    # Data types, a problem
    for k, v in compat.iteritems(meta_vals):
        if meta_prefix is not None:
            k = meta_prefix + k

        if k in result:
            raise ValueError('Conflicting metadata name %s, '
                             'need distinguishing prefix ' % k)

        # one metadata value per parent object, repeated once per record
        result[k] = np.array(v).repeat(lengths)

    return result
Loading