diff --git a/doc/source/io.rst b/doc/source/io.rst
index e64cbc4bc8101..c182d456315ec 100644
--- a/doc/source/io.rst
+++ b/doc/source/io.rst
@@ -954,13 +954,21 @@ with optional parameters:
 
 - path_or_buf : the pathname or buffer to write the output
   This can be ``None`` in which case a JSON string is returned
-- orient : The format of the JSON string, default is ``index`` for ``Series``, ``columns`` for ``DataFrame``
+- orient :
 
-  * split : dict like {index -> [index], columns -> [columns], data -> [values]}
-  * records : list like [{column -> value}, ... , {column -> value}]
-  * index : dict like {index -> {column -> value}}
-  * columns : dict like {column -> {index -> value}}
-  * values : just the values array
+  Series :
+      default is 'index', allowed values are: {'split','records','index'}
+
+  DataFrame :
+      default is 'columns', allowed values are: {'split','records','index','columns','values'}
+
+  The format of the JSON string
+
+  * split : dict like {index -> [index], columns -> [columns], data -> [values]}
+  * records : list like [{column -> value}, ... , {column -> value}]
+  * index : dict like {index -> {column -> value}}
+  * columns : dict like {column -> {index -> value}}
+  * values : just the values array
 
 - date_format : type of date conversion (epoch = epoch milliseconds, iso = ISO8601), default is epoch
 - double_precision : The number of decimal places to use when encoding floating point values, default 10.
@@ -989,6 +997,8 @@ Writing to a file, with a date index and a date column
 
     dfj2 = dfj.copy()
     dfj2['date'] = Timestamp('20130101')
+    dfj2['ints'] = range(5)
+    dfj2['bools'] = True
    dfj2.index = date_range('20130101',periods=5)
    dfj2.to_json('test.json')
    open('test.json').read()
@@ -1005,31 +1015,86 @@ is ``None``. To explicity force ``Series`` parsing, pass ``typ=series``
   is expected. For instance, a local file could be
   file ://localhost/path/to/table.json
 - typ : type of object to recover (series or frame), default 'frame'
-- orient : The format of the JSON string, one of the following
+- orient :
+
+  Series :
+      default is 'index', allowed values are: {'split','records','index'}
+
+  DataFrame :
+      default is 'columns', allowed values are: {'split','records','index','columns','values'}
+
+  The format of the JSON string
 
-  * split : dict like {index -> [index], name -> name, data -> [values]}
-  * records : list like [value, ... , value]
-  * index : dict like {index -> value}
+  * split : dict like {index -> [index], columns -> [columns], data -> [values]}
+  * records : list like [{column -> value}, ... , {column -> value}]
+  * index : dict like {index -> {column -> value}}
+  * columns : dict like {column -> {index -> value}}
+  * values : just the values array
 
-- dtype : dtype of the resulting object
-- numpy : direct decoding to numpy arrays. default True but falls back to standard decoding if a problem occurs.
-- parse_dates : a list of columns to parse for dates; If True, then try to parse datelike columns, default is False
+- dtype : if True, infer dtypes; if a dict of column to dtype, then use those; if False, then don't infer dtypes at all; default is True; applies only to the data
+- convert_axes : boolean, try to convert the axes to the proper dtypes, default is True
+- convert_dates : a list of columns to parse for dates; if True, then try to parse datelike columns, default is True
 - keep_default_dates : boolean, default True. If parsing dates, then parse the default datelike columns
+- numpy : direct decoding to numpy arrays. default is False;
+  Note that the JSON ordering **MUST** be the same for each term if ``numpy=True``.
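+
+As a quick round-trip illustration (``dfjo`` and ``sjo`` are throwaway
+names used only for this example):
+
+.. ipython:: python
+
+   dfjo = DataFrame(dict(A=range(1, 4), B=range(4, 7)))
+   sjo = dfjo.to_json(orient='split')
+   sjo
+   pd.read_json(sjo, orient='split')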
 
 The parser will raise one of ``ValueError/TypeError/AssertionError`` if the JSON is not parsable.
 
+The default of ``convert_axes=True``, ``dtype=True``, and ``convert_dates=True`` will try to parse the axes and all of the data
+into appropriate types, including dates. If you need to override specific dtypes, pass a dict to ``dtype``. ``convert_axes`` should only
+be set to ``False`` if you need to preserve string-like numbers (e.g. '1', '2') in an axis.
+
+.. warning::
+
+   When reading JSON data, automatic coercion into dtypes has some quirks:
+
+     * an index can come back in a different order, that is, the returned order is not guaranteed to be the same as before serialization
+     * a column that was ``float`` data will be converted to ``integer`` if it can be done safely, e.g. a column of ``1.``
+     * bool columns will be converted to ``integer`` on reconstruction
+
+   Thus there are times when you may want to specify specific dtypes via the ``dtype`` keyword argument.
+
 Reading from a JSON string
 
 .. ipython:: python
 
    pd.read_json(json)
 
-Reading from a file, parsing dates
+Reading from a file
+
+.. ipython:: python
+
+   pd.read_json('test.json')
+
+Don't convert any data (but still convert axes and dates)
+
+.. ipython:: python
+
+   pd.read_json('test.json',dtype=object).dtypes
+
+Specify how I want to convert data
+
+.. ipython:: python
+
+   pd.read_json('test.json',dtype={'A' : 'float32', 'bools' : 'int8'}).dtypes
+
+I like my string indices
 
 .. ipython:: python
 
-   pd.read_json('test.json',parse_dates=True)
+   si = DataFrame(np.zeros((4, 4)),
+                  columns=range(4),
+                  index=[str(i) for i in range(4)])
+   si
+   si.index
+   si.columns
+   json = si.to_json()
+
+   sij = pd.read_json(json,convert_axes=False)
+   sij
+   sij.index
+   sij.columns
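+
+Datelike columns are parsed by default. As a minimal illustration (reusing
+the ``test.json`` file written above), turning off the default datelike
+detection leaves the ``date`` column as raw epoch integers:
+
+.. ipython:: python
+
+   pd.read_json('test.json',keep_default_dates=False).dtypes
+
..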
ipython:: python :suppress: diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 0d2612d7aed7a..55347aef078ef 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -507,8 +507,15 @@ def to_json(self, path_or_buf=None, orient=None, date_format='epoch', ---------- path_or_buf : the path or buffer to write the result string if this is None, return a StringIO of the converted string - orient : {'split', 'records', 'index', 'columns', 'values'}, - default is 'index' for Series, 'columns' for DataFrame + orient : + + Series : + default is 'index' + allowed values are: {'split','records','index'} + + DataFrame : + default is 'columns' + allowed values are: {'split','records','index','columns','values'} The format of the JSON string split : dict like @@ -517,6 +524,7 @@ def to_json(self, path_or_buf=None, orient=None, date_format='epoch', index : dict like {index -> {column -> value}} columns : dict like {column -> {index -> value}} values : just the values array + date_format : type of date conversion (epoch = epoch milliseconds, iso = ISO8601), default is epoch double_precision : The number of decimal places to use when encoding diff --git a/pandas/io/json.py b/pandas/io/json.py index 17b33931bee5a..fcecb31bb77a7 100644 --- a/pandas/io/json.py +++ b/pandas/io/json.py @@ -11,6 +11,7 @@ import numpy as np from pandas.tslib import iNaT +import pandas.lib as lib ### interface to/from ### @@ -86,6 +87,11 @@ def _format_dates(self): self.copy_if_needed() self.obj = self._format_to_date(self.obj) + def _format_bools(self): + if self._needs_to_bool(self.obj): + self.copy_if_needed() + self.obj = self._format_to_bool(self.obj) + class FrameWriter(Writer): _default_orient = 'columns' @@ -112,8 +118,8 @@ def _format_dates(self): for c in dtypes.index: self.obj[c] = self._format_to_date(self.obj[c]) -def read_json(path_or_buf=None, orient=None, typ='frame', dtype=None, numpy=True, - parse_dates=False, keep_default_dates=True): +def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True, + convert_axes=True, convert_dates=True, keep_default_dates=True, numpy=False): """ Convert JSON string to pandas object @@ -123,20 +129,33 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=None, numpy=True a URL. Valid URL schemes include http, ftp, s3, and file. For file URLs, a host is expected. For instance, a local file could be file ://localhost/path/to/table.json - orient : {'split', 'records', 'index'}, default 'index' + orient : + Series : + default is 'index' + allowed values are: {'split','records','index'} + + DataFrame : + default is 'columns' + allowed values are: {'split','records','index','columns','values'} + The format of the JSON string - split : dict like - {index -> [index], name -> name, data -> [values]} - records : list like [value, ... , value] - index : dict like {index -> value} + split : dict like {index -> [index], columns -> [columns], data -> [values]} + records : list like [{column -> value}, ... , {column -> value}] + index : dict like {index -> {column -> value}} + columns : dict like {column -> {index -> value}} + values : just the values array + typ : type of object to recover (series or frame), default 'frame' - dtype : dtype of the resulting object - numpy: direct decoding to numpy arrays. default True but falls back - to standard decoding if a problem occurs. 
-    parse_dates : a list of columns to parse for dates; If True, then try to parse datelike columns
-        default is False
+    dtype : if True, infer dtypes; if a dict of column to dtype, then use those;
+        if False, then don't infer dtypes at all; default is True;
+        applies only to the data
+    convert_axes : boolean, try to convert the axes to the proper dtypes, default is True
+    convert_dates : a list of columns to parse for dates; if True, then try to parse datelike columns,
+        default is True
     keep_default_dates : boolean, default True. If parsing dates, then parse the default datelike columns
+    numpy : direct decoding to numpy arrays. default is False. Note that the JSON ordering MUST be the same
+        for each term if numpy=True.
 
     Returns
     -------
@@ -157,16 +176,18 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=None, numpy=True
 
     obj = None
     if typ == 'frame':
-        obj = FrameParser(json, orient, dtype, numpy, parse_dates, keep_default_dates).parse()
+        obj = FrameParser(json, orient, dtype, convert_axes, convert_dates, keep_default_dates, numpy).parse()
 
     if typ == 'series' or obj is None:
-        obj = SeriesParser(json, orient, dtype, numpy, parse_dates, keep_default_dates).parse()
+        if not isinstance(dtype, bool):
+            dtype = dict(data=dtype)
+        obj = SeriesParser(json, orient, dtype, convert_axes, convert_dates, keep_default_dates, numpy).parse()
 
     return obj
 
 class Parser(object):
 
-    def __init__(self, json, orient, dtype, numpy, parse_dates=False, keep_default_dates=False):
+    def __init__(self, json, orient, dtype=True, convert_axes=True, convert_dates=True, keep_default_dates=False, numpy=False):
         self.json = json
 
         if orient is None:
@@ -175,27 +196,100 @@ def __init__(self, json, orient, dtype, numpy, parse_dates=False, keep_default_d
         self.orient = orient
         self.dtype = dtype
 
-        if dtype is not None and orient == "split":
+        if orient == "split":
             numpy = False
 
         self.numpy = numpy
-        self.parse_dates = parse_dates
+        self.convert_axes = convert_axes
+        self.convert_dates = convert_dates
         self.keep_default_dates = keep_default_dates
         self.obj = None
 
     def parse(self):
-        self._parse()
-        if self.obj is not None:
+
+        # decode, using the numpy fast path if requested
+        if self.numpy:
+            self._parse_numpy()
+        else:
+            self._parse_no_numpy()
+
+        if self.obj is None:
             return None
+
+        if self.convert_axes:
             self._convert_axes()
-            if self.parse_dates:
-                self._try_parse_dates()
+        self._try_convert_types()
+
         return self.obj
 
+    def _convert_axes(self):
+        """ try to convert axes """
+        for axis in self.obj._AXIS_NUMBERS.keys():
+            new_axis, result = self._try_convert_data(axis, self.obj._get_axis(axis), use_dtypes=False, convert_dates=True)
+            if result:
+                setattr(self.obj, axis, new_axis)
+
+    def _try_convert_types(self):
+        raise NotImplementedError
+
+    def _try_convert_data(self, name, data, use_dtypes=True, convert_dates=True):
+        """ try to parse an ndarray-like into a column by inferring dtype """
+
+        # don't coerce unless a conversion is explicitly requested
+        if use_dtypes:
+            if self.dtype is False:
+                return data, False
+            elif self.dtype is True:
+                pass
+
+            else:
+
+                # dtype to force
+                dtype = self.dtype.get(name) if isinstance(self.dtype, dict) else self.dtype
+                if dtype is not None:
+                    try:
+                        dtype = np.dtype(dtype)
+                        return data.astype(dtype), True
+                    except:
+                        return data, False
+
+        if convert_dates:
+            new_data, result = self._try_convert_to_date(data)
+            if result:
+                return new_data, True
+
+        result = False
+
+        if data.dtype == 'object':
+
+            # try float
+            try:
+                data = data.astype('float64')
+                result = True
+            except:
+                pass
 
-    def _try_parse_to_date(self, data):
+        # don't coerce 0-len data
+        if len(data) and (data.dtype == 'float' or data.dtype == 'object'):
+
+            # coerce ints if we can
+            try:
+                new_data = data.astype('int64')
+                if (new_data == data).all():
+                    data = new_data
+                    result = True
+            except:
+                pass
+
+        return data, result
+
+    def _try_convert_to_date(self, data):
         """ try to parse a ndarray like into a date column
             try to coerce object in epoch/iso formats and
-            integer/float in epcoh formats """
+            integer/float in epoch formats; return a boolean indicating
+            whether parsing was successful """
+
+        # no conversion on empty
+        if not len(data): return data, False
 
         new_data = data
         if new_data.dtype == 'object':
@@ -208,7 +302,7 @@ def _try_parse_to_date(self, data):
 
         # ignore numbers that are out of range
         if issubclass(new_data.dtype.type,np.number):
             if not ((new_data == iNaT) | (new_data > 31536000000000000L)).all():
-                return data
+                return data, False
 
         try:
             new_data = to_datetime(new_data)
@@ -218,122 +312,102 @@ def _try_parse_to_date(self, data):
         except:
 
             # return old, noting more we can do
-            new_data = data
+            return data, False
 
-        return new_data
+        return new_data, True
 
-    def _try_parse_dates(self):
+    def _try_convert_dates(self):
         raise NotImplementedError
 
 class SeriesParser(Parser):
     _default_orient = 'index'
 
-    def _parse(self):
-
+    def _parse_no_numpy(self):
+
         json = self.json
-        dtype = self.dtype
         orient = self.orient
-        numpy = self.numpy
-
-        if numpy:
-            try:
-                if orient == "split":
-                    decoded = loads(json, dtype=dtype, numpy=True)
-                    decoded = dict((str(k), v) for k, v in decoded.iteritems())
-                    self.obj = Series(**decoded)
-                elif orient == "columns" or orient == "index":
-                    self.obj = Series(*loads(json, dtype=dtype, numpy=True,
-                                             labelled=True))
-                else:
-                    self.obj = Series(loads(json, dtype=dtype, numpy=True))
-            except ValueError:
-                numpy = False
-
-        if not numpy:
-            if orient == "split":
-                decoded = dict((str(k), v)
-                               for k, v in loads(json).iteritems())
-                self.obj = Series(dtype=dtype, **decoded)
-            else:
-                self.obj = Series(loads(json), dtype=dtype)
+        if orient == "split":
+            decoded = dict((str(k), v)
+                           for k, v in loads(json).iteritems())
+            self.obj = Series(dtype=None, **decoded)
+        else:
+            self.obj = Series(loads(json), dtype=None)
 
-    def _convert_axes(self):
-        """ try to axes if they are datelike """
-        try:
-            self.obj.index = self._try_parse_to_date(self.obj.index)
-        except:
-            pass
+    def _parse_numpy(self):
 
-    def _try_parse_dates(self):
-        if self.obj is None: return
+        json = self.json
+        orient = self.orient
+        if orient == "split":
+            decoded = loads(json, dtype=None, numpy=True)
+            decoded = dict((str(k), v) for k, v in decoded.iteritems())
+            self.obj = Series(**decoded)
+        elif orient == "columns" or orient == "index":
+            self.obj = Series(*loads(json, dtype=None, numpy=True,
+                                     labelled=True))
+        else:
+            self.obj = Series(loads(json, dtype=None, numpy=True))
 
-        if self.parse_dates:
-            self.obj = self._try_parse_to_date(self.obj)
+    def _try_convert_types(self):
+        if self.obj is None: return
+        obj, result = self._try_convert_data('data', self.obj, convert_dates=self.convert_dates)
+        if result:
+            self.obj = obj
 
 class FrameParser(Parser):
     _default_orient = 'columns'
 
-    def _parse(self):
+    def _parse_numpy(self):
 
         json = self.json
-        dtype = self.dtype
         orient = self.orient
-        numpy = self.numpy
-
-        if numpy:
-            try:
-                if orient == "columns":
-                    args = loads(json, dtype=dtype, numpy=True, labelled=True)
-                    if args:
-                        args = (args[0].T, args[2], args[1])
-                    self.obj = DataFrame(*args)
-                elif orient == "split":
-                    decoded = loads(json, dtype=dtype, numpy=True)
-                    decoded = 
dict((str(k), v) for k, v in decoded.iteritems()) - self.obj = DataFrame(**decoded) - elif orient == "values": - self.obj = DataFrame(loads(json, dtype=dtype, numpy=True)) - else: - self.obj = DataFrame(*loads(json, dtype=dtype, numpy=True, - labelled=True)) - except ValueError: - numpy = False - - if not numpy: - if orient == "columns": - self.obj = DataFrame(loads(json), dtype=dtype) - elif orient == "split": - decoded = dict((str(k), v) - for k, v in loads(json).iteritems()) - self.obj = DataFrame(dtype=dtype, **decoded) - elif orient == "index": - self.obj = DataFrame(loads(json), dtype=dtype).T - else: - self.obj = DataFrame(loads(json), dtype=dtype) - def _convert_axes(self): - """ try to axes if they are datelike """ - if self.orient == 'columns': - axis = 'index' - elif self.orient == 'index': - axis = 'columns' + if orient == "columns": + args = loads(json, dtype=None, numpy=True, labelled=True) + if args: + args = (args[0].T, args[2], args[1]) + self.obj = DataFrame(*args) + elif orient == "split": + decoded = loads(json, dtype=None, numpy=True) + decoded = dict((str(k), v) for k, v in decoded.iteritems()) + self.obj = DataFrame(**decoded) + elif orient == "values": + self.obj = DataFrame(loads(json, dtype=None, numpy=True)) else: - return + self.obj = DataFrame(*loads(json, dtype=None, numpy=True, labelled=True)) - try: - a = getattr(self.obj,axis) - setattr(self.obj,axis,self._try_parse_to_date(a)) - except: - pass + def _parse_no_numpy(self): - def _try_parse_dates(self): + json = self.json + orient = self.orient + + if orient == "columns": + self.obj = DataFrame(loads(json), dtype=None) + elif orient == "split": + decoded = dict((str(k), v) + for k, v in loads(json).iteritems()) + self.obj = DataFrame(dtype=None, **decoded) + elif orient == "index": + self.obj = DataFrame(loads(json), dtype=None).T + else: + self.obj = DataFrame(loads(json), dtype=None) + + def _try_convert_types(self): + if self.obj is None: return + if self.convert_dates: + self._try_convert_dates() + for col in self.obj.columns: + new_data, result = self._try_convert_data(col, self.obj[col], convert_dates=False) + if result: + self.obj[col] = new_data + + def _try_convert_dates(self): if self.obj is None: return # our columns to parse - parse_dates = self.parse_dates - if parse_dates is True: - parse_dates = [] - parse_dates = set(parse_dates) + convert_dates = self.convert_dates + if convert_dates is True: + convert_dates = [] + convert_dates = set(convert_dates) def is_ok(col): """ return if this col is ok to try for a date parse """ @@ -348,6 +422,8 @@ def is_ok(col): return False - for col, c in self.obj.iteritems(): - if (self.keep_default_dates and is_ok(col)) or col in parse_dates: - self.obj[col] = self._try_parse_to_date(c) + for col in self.obj.columns: + if (self.keep_default_dates and is_ok(col)) or col in convert_dates: + new_data, result = self._try_convert_to_date(self.obj[col]) + if result: + self.obj[col] = new_data diff --git a/pandas/io/tests/test_json/test_pandas.py b/pandas/io/tests/test_json/test_pandas.py index 4b1294b786df7..bdd700bdbcec3 100644 --- a/pandas/io/tests/test_json/test_pandas.py +++ b/pandas/io/tests/test_json/test_pandas.py @@ -56,13 +56,22 @@ def setUp(self): def test_frame_from_json_to_json(self): - def _check_orient(df, orient, dtype=None, numpy=True): + def _check_orient(df, orient, dtype=None, numpy=False, convert_axes=True, check_dtype=True, raise_ok=None): df = df.sort() dfjson = df.to_json(orient=orient) - unser = read_json(dfjson, orient=orient, dtype=dtype, - 
numpy=numpy)
+
+            try:
+                unser = read_json(dfjson, orient=orient, dtype=dtype,
+                                  numpy=numpy, convert_axes=convert_axes)
+            except Exception, detail:
+                if raise_ok is not None:
+                    if type(detail) == raise_ok:
+                        return
+                raise
+
             unser = unser.sort()
-            if df.index.dtype.type == np.datetime64:
+
+            if not convert_axes and df.index.dtype.type == np.datetime64:
                 unser.index = DatetimeIndex(unser.index.values.astype('i8'))
             if orient == "records":
                 # index is not captured in this orientation
@@ -78,20 +87,40 @@ def _check_orient(df, orient, dtype=None, numpy=True):
                 unser = unser.sort()
                 assert_almost_equal(df.values, unser.values)
             else:
-                assert_frame_equal(df, unser)
-
-        def _check_all_orients(df, dtype=None):
-            _check_orient(df, "columns", dtype=dtype)
-            _check_orient(df, "records", dtype=dtype)
-            _check_orient(df, "split", dtype=dtype)
-            _check_orient(df, "index", dtype=dtype)
-            _check_orient(df, "values", dtype=dtype)
-
-            _check_orient(df, "columns", dtype=dtype, numpy=False)
-            _check_orient(df, "records", dtype=dtype, numpy=False)
-            _check_orient(df, "split", dtype=dtype, numpy=False)
-            _check_orient(df, "index", dtype=dtype, numpy=False)
-            _check_orient(df, "values", dtype=dtype, numpy=False)
+                if convert_axes:
+                    assert_frame_equal(df, unser, check_dtype=check_dtype)
+                else:
+                    assert_frame_equal(df, unser, check_less_precise=False, check_dtype=check_dtype)
+
+        def _check_all_orients(df, dtype=None, convert_axes=True, raise_ok=None):
+
+            # numpy=False
+            if convert_axes:
+                _check_orient(df, "columns", dtype=dtype)
+                _check_orient(df, "records", dtype=dtype)
+                _check_orient(df, "split", dtype=dtype)
+                _check_orient(df, "index", dtype=dtype)
+                _check_orient(df, "values", dtype=dtype)
+
+            _check_orient(df, "columns", dtype=dtype, convert_axes=False)
+            _check_orient(df, "records", dtype=dtype, convert_axes=False)
+            _check_orient(df, "split", dtype=dtype, convert_axes=False)
+            _check_orient(df, "index", dtype=dtype, convert_axes=False)
+            _check_orient(df, "values", dtype=dtype, convert_axes=False)
+
+            # numpy=True; raise_ok may be non-None, in which case errors of
+            # that type are tolerated
+            if convert_axes:
+                _check_orient(df, "columns", dtype=dtype, numpy=True, raise_ok=raise_ok)
+                _check_orient(df, "records", dtype=dtype, numpy=True, raise_ok=raise_ok)
+                _check_orient(df, "split", dtype=dtype, numpy=True, raise_ok=raise_ok)
+                _check_orient(df, "index", dtype=dtype, numpy=True, raise_ok=raise_ok)
+                _check_orient(df, "values", dtype=dtype, numpy=True, raise_ok=raise_ok)
+
+            _check_orient(df, "columns", dtype=dtype, numpy=True, convert_axes=False, raise_ok=raise_ok)
+            _check_orient(df, "records", dtype=dtype, numpy=True, convert_axes=False, raise_ok=raise_ok)
+            _check_orient(df, "split", dtype=dtype, numpy=True, convert_axes=False, raise_ok=raise_ok)
+            _check_orient(df, "index", dtype=dtype, numpy=True, convert_axes=False, raise_ok=raise_ok)
+            _check_orient(df, "values", dtype=dtype, numpy=True, convert_axes=False, raise_ok=raise_ok)
 
         # basic
         _check_all_orients(self.frame)
@@ -99,6 +128,7 @@ def _check_all_orients(df, dtype=None):
                           self.frame.to_json(orient="columns"))
 
         _check_all_orients(self.intframe, dtype=self.intframe.values.dtype)
+        _check_all_orients(self.intframe, dtype=False)
 
         # big one
         # index and columns are strings as all unserialised JSON object keys
@@ -106,13 +136,14 @@ def _check_all_orients(df, dtype=None):
         biggie = DataFrame(np.zeros((200, 4)),
                            columns=[str(i) for i in range(4)],
                            index=[str(i) for i in range(200)])
-        _check_all_orients(biggie)
+        _check_all_orients(biggie, dtype=False, convert_axes=False)
 
         # dtypes
_check_all_orients(DataFrame(biggie, dtype=np.float64), - dtype=np.float64) - _check_all_orients(DataFrame(biggie, dtype=np.int), dtype=np.int) - _check_all_orients(DataFrame(biggie, dtype='