From b58fc9db891729391b140e02bddd6537bc734cee Mon Sep 17 00:00:00 2001
From: Wes McKinney <wesmckinn@gmail.com>
Date: Sat, 11 May 2013 18:34:32 -0700
Subject: [PATCH 1/3] ENH: pull pandasjson back into pandas

---
 pandas/core/frame.py                     |  112 +-
 pandas/core/series.py                    |   84 +-
 pandas/io/tests/test_json/test_pandas.py |  240 +++++
 pandas/io/tests/test_json/test_ujson.py  | 1230 ++++++++++++++++++++++
 pandas/src/ujson/lib/ultrajson.h         |  298 ++++++
 pandas/src/ujson/python/py_defines.h     |   15 +
 pandas/src/ujson/python/version.h        |    1 +
 setup.py                                 |   21 +
 8 files changed, 1994 insertions(+), 7 deletions(-)
 create mode 100644 pandas/io/tests/test_json/test_pandas.py
 create mode 100644 pandas/io/tests/test_json/test_ujson.py
 create mode 100644 pandas/src/ujson/lib/ultrajson.h
 create mode 100644 pandas/src/ujson/python/py_defines.h
 create mode 100644 pandas/src/ujson/python/version.h

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index ad1429fcea1ca..ffc02b5407a33 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -1659,8 +1659,8 @@ def convert_objects(self, convert_dates=True, convert_numeric=False, copy=True):
         -------
         converted : DataFrame
         """
-        return self._constructor(self._data.convert(convert_dates=convert_dates,
-                                                    convert_numeric=convert_numeric,
+        return self._constructor(self._data.convert(convert_dates=convert_dates,
+                                                    convert_numeric=convert_numeric,
                                                     copy=copy))
 
     #----------------------------------------------------------------------
@@ -3321,7 +3321,7 @@ def fillna(self, value=None, method=None, axis=0, inplace=False,
             a reference to the filled object, which is self if inplace=True
         limit : int, default None
             Maximum size gap to forward or backward fill
-        downcast : dict, default is None, a dict of item->dtype of what to
+        downcast : dict, default is None, a dict of item->dtype of what to
             downcast if possible
 
         See also
@@ -3368,7 +3368,7 @@ def fillna(self, value=None, method=None, axis=0, inplace=False,
                     result[k].fillna(v, inplace=True)
             return result
         else:
-            new_data = self._data.fillna(value, inplace=inplace,
+            new_data = self._data.fillna(value, inplace=inplace,
                                          downcast=downcast)
 
             if inplace:
@@ -3756,8 +3756,8 @@ def combine(self, other, func, fill_value=None, overwrite=True):
             result[col] = arr
 
         # convert_objects just in case
-        return self._constructor(result,
-                                 index=new_index,
+        return self._constructor(result,
+                                 index=new_index,
                                  columns=new_columns).convert_objects(copy=False)
 
     def combine_first(self, other):
@@ -5278,6 +5278,106 @@ def mask(self, cond):
         """
         return self.where(~cond, NA)
 
+
+@classmethod
+def from_json(cls, json, orient="columns", dtype=None, numpy=True):
+    """
+    Convert JSON string to DataFrame
+
+    Parameters
+    ----------
+    json : The JSON string to parse.
+    orient : {'split', 'records', 'index', 'columns', 'values'},
+             default 'columns'
+        The format of the JSON string
+        split : dict like
+            {index -> [index], columns -> [columns], data -> [values]}
+        records : list like [{column -> value}, ... , {column -> value}]
+        index : dict like {index -> {column -> value}}
+        columns : dict like {column -> {index -> value}}
+        values : just the values array
+    dtype : dtype of the resulting DataFrame
+    numpy : direct decoding to numpy arrays. default True but falls back
+        to standard decoding if a problem occurs.
+
+    Returns
+    -------
+    result : DataFrame
+    """
+    from pandas.json import loads
+
+    df = None
+
+    if dtype is not None and orient == "split":
+        numpy = False
+
+    if numpy:
+        try:
+            if orient == "columns":
+                args = loads(json, dtype=dtype, numpy=True, labelled=True)
+                if args:
+                    args = (args[0].T, args[2], args[1])
+                df = DataFrame(*args)
+            elif orient == "split":
+                decoded = loads(json, dtype=dtype, numpy=True)
+                decoded = dict((str(k), v) for k, v in decoded.iteritems())
+                df = DataFrame(**decoded)
+            elif orient == "values":
+                df = DataFrame(loads(json, dtype=dtype, numpy=True))
+            else:
+                df = DataFrame(*loads(json, dtype=dtype, numpy=True,
+                                      labelled=True))
+        except ValueError:
+            numpy = False
+    if not numpy:
+        if orient == "columns":
+            df = DataFrame(loads(json), dtype=dtype)
+        elif orient == "split":
+            decoded = dict((str(k), v)
+                           for k, v in loads(json).iteritems())
+            df = DataFrame(dtype=dtype, **decoded)
+        elif orient == "index":
+            df = DataFrame(loads(json), dtype=dtype).T
+        else:
+            df = DataFrame(loads(json), dtype=dtype)
+
+    return df
+DataFrame.from_json = from_json
+
+
+def to_json(self, orient="columns", double_precision=10,
+            force_ascii=True):
+    """
+    Convert DataFrame to a JSON string.
+
+    Note NaNs and None will be converted to null and datetime objects
+    will be converted to UNIX timestamps.
+
+    Parameters
+    ----------
+    orient : {'split', 'records', 'index', 'columns', 'values'},
+             default 'columns'
+        The format of the JSON string
+        split : dict like
+            {index -> [index], columns -> [columns], data -> [values]}
+        records : list like [{column -> value}, ... , {column -> value}]
+        index : dict like {index -> {column -> value}}
+        columns : dict like {column -> {index -> value}}
+        values : just the values array
+    double_precision : The number of decimal places to use when encoding
+        floating point values, default 10.
+    force_ascii : force encoded string to be ASCII, default True.
+
+    Returns
+    -------
+    result : JSON compatible string
+    """
+    from pandas.json import dumps
+    return dumps(self, orient=orient, double_precision=double_precision,
+                 ensure_ascii=force_ascii)
+DataFrame.to_json = to_json
+
+
 _EMPTY_SERIES = Series([])
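To make the new frame API concrete, here is a minimal round-trip sketch with the patch above applied (a hypothetical session; `df`, `json_str` and the shown outputs are illustrative, and labels come back as strings because JSON object keys are always strings):

    import pandas as pd

    df = pd.DataFrame({'a': [1, 2], 'b': [3.5, 4.5]})

    # default orient='columns': '{"a":{"0":1,"1":2},"b":{"0":3.5,"1":4.5}}'
    json_str = df.to_json()

    # decode again -- index/column labels return as strings
    df2 = pd.DataFrame.from_json(json_str)

    # 'split' transports index, columns and values separately,
    # so the labels survive the trip exactly
    df3 = pd.DataFrame.from_json(df.to_json(orient='split'),
                                 orient='split')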
diff --git a/pandas/core/series.py b/pandas/core/series.py
index 3509e226d46fb..14a8839fe3256 100644
--- a/pandas/core/series.py
+++ b/pandas/core/series.py
@@ -1962,7 +1962,7 @@ def dot(self, other):
 
         Parameters
         ----------
-        other : Series or DataFrame
+        other : Series or DataFrame
 
         Returns
         -------
@@ -3241,6 +3241,88 @@ def str(self):
         from pandas.core.strings import StringMethods
         return StringMethods(self)
 
+
+@classmethod
+def from_json(cls, json, orient="index", dtype=None, numpy=True):
+    """
+    Convert JSON string to Series
+
+    Parameters
+    ----------
+    json : The JSON string to parse.
+    orient : {'split', 'records', 'index'}, default 'index'
+        The format of the JSON string
+        split : dict like
+            {index -> [index], name -> name, data -> [values]}
+        records : list like [value, ... , value]
+        index : dict like {index -> value}
+    dtype : dtype of the resulting Series
+    numpy : direct decoding to numpy arrays. default True but falls back
+        to standard decoding if a problem occurs.
+
+    Returns
+    -------
+    result : Series
+    """
+    from pandas.json import loads
+    s = None
+
+    if dtype is not None and orient == "split":
+        numpy = False
+
+    if numpy:
+        try:
+            if orient == "split":
+                decoded = loads(json, dtype=dtype, numpy=True)
+                decoded = dict((str(k), v) for k, v in decoded.iteritems())
+                s = Series(**decoded)
+            elif orient == "columns" or orient == "index":
+                s = Series(*loads(json, dtype=dtype, numpy=True,
+                                  labelled=True))
+            else:
+                s = Series(loads(json, dtype=dtype, numpy=True))
+        except ValueError:
+            numpy = False
+    if not numpy:
+        if orient == "split":
+            decoded = dict((str(k), v)
+                           for k, v in loads(json).iteritems())
+            s = Series(dtype=dtype, **decoded)
+        else:
+            s = Series(loads(json), dtype=dtype)
+
+    return s
+Series.from_json = from_json
+
+def to_json(self, orient="index", double_precision=10, force_ascii=True):
+    """
+    Convert Series to a JSON string
+
+    Note NaNs and None will be converted to null and datetime objects
+    will be converted to UNIX timestamps.
+
+    Parameters
+    ----------
+    orient : {'split', 'records', 'index'}, default 'index'
+        The format of the JSON string
+        split : dict like
+            {index -> [index], name -> name, data -> [values]}
+        records : list like [value, ... , value]
+        index : dict like {index -> value}
+    double_precision : The number of decimal places to use when encoding
+        floating point values, default 10.
+    force_ascii : force encoded string to be ASCII, default True.
+
+    Returns
+    -------
+    result : JSON compatible string
+    """
+    from pandas.json import dumps
+    return dumps(self, orient=orient, double_precision=double_precision,
+                 ensure_ascii=force_ascii)
+Series.to_json = to_json
+
+
 _INDEX_TYPES = ndarray, Index, list, tuple
 
 #------------------------------------------------------------------------------
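As with the frame methods, a short hypothetical session shows the intended Series round trip (note that orient='records' drops the index, so only the values survive; names here are illustrative):

    import pandas as pd

    s = pd.Series([10, 20, 30], index=['a', 'b', 'c'], name='s')

    # default orient='index': '{"a":10,"b":20,"c":30}'
    s2 = pd.Series.from_json(s.to_json())

    # 'records' keeps only the values: '[10,20,30]'
    vals = pd.Series.from_json(s.to_json(orient='records'),
                               orient='records')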
diff --git a/pandas/io/tests/test_json/test_pandas.py b/pandas/io/tests/test_json/test_pandas.py
new file mode 100644
index 0000000000000..506aa382487d6
--- /dev/null
+++ b/pandas/io/tests/test_json/test_pandas.py
@@ -0,0 +1,240 @@
+# pylint: disable-msg=W0612,E1101
+from copy import deepcopy
+from datetime import datetime, timedelta
+from StringIO import StringIO
+import cPickle as pickle
+import operator
+import os
+import unittest
+
+import numpy as np
+
+from pandas import Series, DataFrame, DatetimeIndex
+import pandas as pd
+
+from pandas.util.testing import (assert_almost_equal, assert_frame_equal,
+                                 assert_series_equal)
+import pandas.util.testing as tm
+
+_seriesd = tm.getSeriesData()
+_tsd = tm.getTimeSeriesData()
+
+_frame = DataFrame(_seriesd)
+_frame2 = DataFrame(_seriesd, columns=['D', 'C', 'B', 'A'])
+_intframe = DataFrame(dict((k, v.astype(int))
+                           for k, v in _seriesd.iteritems()))
+
+_tsframe = DataFrame(_tsd)
+
+_mixed_frame = _frame.copy()
+
+
+class TestPandasObjects(unittest.TestCase):
+
+    def setUp(self):
+        self.ts = tm.makeTimeSeries()
+        self.ts.name = 'ts'
+
+        self.series = tm.makeStringSeries()
+        self.series.name = 'series'
+
+        self.objSeries = tm.makeObjectSeries()
+        self.objSeries.name = 'objects'
+
+        self.empty_series = Series([], index=[])
+        self.empty_frame = DataFrame({})
+
+        self.frame = _frame.copy()
+        self.frame2 = _frame2.copy()
+        self.intframe = _intframe.copy()
+        self.tsframe = _tsframe.copy()
+        self.mixed_frame = _mixed_frame.copy()
+
+    def test_frame_from_json_to_json(self):
+
+        def _check_orient(df, orient, dtype=None, numpy=True):
+            df = df.sort()
+            dfjson = df.to_json(orient=orient)
+            unser = DataFrame.from_json(dfjson, orient=orient, dtype=dtype,
+                                        numpy=numpy)
+            unser = unser.sort()
+            if df.index.dtype.type == np.datetime64:
+                unser.index = DatetimeIndex(unser.index.values.astype('i8'))
+            if orient == "records":
+                # index is not captured in this orientation
+                assert_almost_equal(df.values, unser.values)
+                self.assert_(df.columns.equals(unser.columns))
+            elif orient == "values":
+                # index and cols are not captured in this orientation
+                assert_almost_equal(df.values, unser.values)
+            elif orient == "split":
+                # index and col labels might not be strings
+                unser.index = [str(i) for i in unser.index]
+                unser.columns = [str(i) for i in unser.columns]
+                unser = unser.sort()
+                assert_almost_equal(df.values, unser.values)
+            else:
+                assert_frame_equal(df, unser)
+
+        def _check_all_orients(df, dtype=None):
+            _check_orient(df, "columns", dtype=dtype)
+            _check_orient(df, "records", dtype=dtype)
+            _check_orient(df, "split", dtype=dtype)
+            _check_orient(df, "index", dtype=dtype)
+            _check_orient(df, "values", dtype=dtype)
+
+            _check_orient(df, "columns", dtype=dtype, numpy=False)
+            _check_orient(df, "records", dtype=dtype, numpy=False)
+            _check_orient(df, "split", dtype=dtype, numpy=False)
+            _check_orient(df, "index", dtype=dtype, numpy=False)
+            _check_orient(df, "values", dtype=dtype, numpy=False)
+
+        # basic
+        _check_all_orients(self.frame)
+        self.assertEqual(self.frame.to_json(),
+                         self.frame.to_json(orient="columns"))
+
+        _check_all_orients(self.intframe, dtype=self.intframe.values.dtype)
+
+        # big one
+        # index and columns are strings as all unserialised JSON object keys
+        # are assumed to be strings
+        biggie = DataFrame(np.zeros((200, 4)),
+                           columns=[str(i) for i in range(4)],
+                           index=[str(i) for i in range(200)])
+        _check_all_orients(biggie)
+
+        # dtypes
+        _check_all_orients(DataFrame(biggie, dtype=np.float64),
+                           dtype=np.float64)
+        _check_all_orients(DataFrame(biggie, dtype=np.int), dtype=np.int)

[... a span was lost to extraction here, beginning mid-line at a string dtype
check (_check_all_orients(DataFrame(biggie, dtype='<U...'), ...)) and running
through the rest of test_pandas.py and the diff header, module docstring and
imports of the new file pandas/io/tests/test_json/test_ujson.py ...]

+json_unicode = (json.dumps if sys.version_info[0] >= 3
+                else partial(json.dumps, encoding="utf-8"))
+
+class UltraJSONTests(TestCase):
+
+    def test_encodeDictWithUnicodeKeys(self):
+        input = { u"key1": u"value1", u"key1": u"value1", u"key1": u"value1", u"key1": u"value1", u"key1": u"value1", u"key1": u"value1" }
+        output = ujson.encode(input)
+
+        input = { u"بن": u"value1", u"بن": u"value1", u"بن": u"value1", u"بن": u"value1", u"بن": u"value1", u"بن": u"value1", u"بن": u"value1" }
+        output = ujson.encode(input)
+
+        pass
+
+    def test_encodeDoubleConversion(self):
+        input = math.pi
+        output = ujson.encode(input)
+        self.assertEquals(round(input, 5), round(json.loads(output), 5))
+        self.assertEquals(round(input, 5), round(ujson.decode(output), 5))
+
+    def test_encodeWithDecimal(self):
+        input = 1.0
+        output = ujson.encode(input)
+        self.assertEquals(output, "1.0")
+
+    def test_encodeDoubleNegConversion(self):
+        input = -math.pi
+        output = ujson.encode(input)
+        self.assertEquals(round(input, 5), round(json.loads(output), 5))
+        self.assertEquals(round(input, 5), round(ujson.decode(output), 5))
+
+    def test_encodeArrayOfNestedArrays(self):
+        input = [[[[]]]] * 20
+        output = ujson.encode(input)
+        self.assertEquals(input, json.loads(output))
+        #self.assertEquals(output, json.dumps(input))
+        self.assertEquals(input, ujson.decode(output))
+        input = np.array(input)
+        assert_array_equal(input, ujson.decode(output, numpy=True, dtype=input.dtype))
+
+    def test_encodeArrayOfDoubles(self):
+        input = [ 31337.31337, 31337.31337, 31337.31337, 31337.31337] * 10
+        output = ujson.encode(input)
+        self.assertEquals(input, json.loads(output))
+        #self.assertEquals(output, json.dumps(input))
+        self.assertEquals(input,
ujson.decode(output)) + assert_array_equal(np.array(input), ujson.decode(output, numpy=True)) + + def test_doublePrecisionTest(self): + input = 30.012345678901234 + output = ujson.encode(input, double_precision = 15) + self.assertEquals(input, json.loads(output)) + self.assertEquals(input, ujson.decode(output)) + + output = ujson.encode(input, double_precision = 9) + self.assertEquals(round(input, 9), json.loads(output)) + self.assertEquals(round(input, 9), ujson.decode(output)) + + output = ujson.encode(input, double_precision = 3) + self.assertEquals(round(input, 3), json.loads(output)) + self.assertEquals(round(input, 3), ujson.decode(output)) + + output = ujson.encode(input) + self.assertEquals(round(input, 5), json.loads(output)) + self.assertEquals(round(input, 5), ujson.decode(output)) + + def test_invalidDoublePrecision(self): + input = 30.12345678901234567890 + output = ujson.encode(input, double_precision = 20) + # should snap to the max, which is 15 + self.assertEquals(round(input, 15), json.loads(output)) + self.assertEquals(round(input, 15), ujson.decode(output)) + + output = ujson.encode(input, double_precision = -1) + # also should snap to the max, which is 15 + self.assertEquals(round(input, 15), json.loads(output)) + self.assertEquals(round(input, 15), ujson.decode(output)) + + # will throw typeError + self.assertRaises(TypeError, ujson.encode, input, double_precision = '9') + # will throw typeError + self.assertRaises(TypeError, ujson.encode, input, double_precision = None) + + + def test_encodeStringConversion(self): + input = "A string \\ / \b \f \n \r \t" + output = ujson.encode(input) + self.assertEquals(input, json.loads(output)) + self.assertEquals(output, '"A string \\\\ \\/ \\b \\f \\n \\r \\t"') + self.assertEquals(input, ujson.decode(output)) + pass + + def test_decodeUnicodeConversion(self): + pass + + def test_encodeUnicodeConversion1(self): + input = "Räksmörgås اسامة بن محمد بن عوض بن لادن" + enc = ujson.encode(input) + dec = ujson.decode(enc) + self.assertEquals(enc, json_unicode(input)) + self.assertEquals(dec, json.loads(enc)) + + def test_encodeControlEscaping(self): + input = "\x19" + enc = ujson.encode(input) + dec = ujson.decode(enc) + self.assertEquals(input, dec) + self.assertEquals(enc, json_unicode(input)) + + + def test_encodeUnicodeConversion2(self): + input = "\xe6\x97\xa5\xd1\x88" + enc = ujson.encode(input) + dec = ujson.decode(enc) + self.assertEquals(enc, json_unicode(input)) + self.assertEquals(dec, json.loads(enc)) + + def test_encodeUnicodeSurrogatePair(self): + _skip_if_python_ver(2, 5) + _skip_if_python_ver(2, 6) + input = "\xf0\x90\x8d\x86" + enc = ujson.encode(input) + dec = ujson.decode(enc) + + self.assertEquals(enc, json_unicode(input)) + self.assertEquals(dec, json.loads(enc)) + + def test_encodeUnicode4BytesUTF8(self): + _skip_if_python_ver(2, 5) + _skip_if_python_ver(2, 6) + input = "\xf0\x91\x80\xb0TRAILINGNORMAL" + enc = ujson.encode(input) + dec = ujson.decode(enc) + + self.assertEquals(enc, json_unicode(input)) + self.assertEquals(dec, json.loads(enc)) + + def test_encodeUnicode4BytesUTF8Highest(self): + _skip_if_python_ver(2, 5) + _skip_if_python_ver(2, 6) + input = "\xf3\xbf\xbf\xbfTRAILINGNORMAL" + enc = ujson.encode(input) + + dec = ujson.decode(enc) + + self.assertEquals(enc, json_unicode(input)) + self.assertEquals(dec, json.loads(enc)) + + + def test_encodeArrayInArray(self): + input = [[[[]]]] + output = ujson.encode(input) + + self.assertEquals(input, json.loads(output)) + self.assertEquals(output, 
json.dumps(input)) + self.assertEquals(input, ujson.decode(output)) + assert_array_equal(np.array(input), ujson.decode(output, numpy=True)) + pass + + def test_encodeIntConversion(self): + input = 31337 + output = ujson.encode(input) + self.assertEquals(input, json.loads(output)) + self.assertEquals(output, json.dumps(input)) + self.assertEquals(input, ujson.decode(output)) + pass + + def test_encodeIntNegConversion(self): + input = -31337 + output = ujson.encode(input) + self.assertEquals(input, json.loads(output)) + self.assertEquals(output, json.dumps(input)) + self.assertEquals(input, ujson.decode(output)) + pass + + + def test_encodeLongNegConversion(self): + input = -9223372036854775808 + output = ujson.encode(input) + + outputjson = json.loads(output) + outputujson = ujson.decode(output) + + self.assertEquals(input, json.loads(output)) + self.assertEquals(output, json.dumps(input)) + self.assertEquals(input, ujson.decode(output)) + pass + + def test_encodeListConversion(self): + input = [ 1, 2, 3, 4 ] + output = ujson.encode(input) + self.assertEquals(input, json.loads(output)) + self.assertEquals(input, ujson.decode(output)) + assert_array_equal(np.array(input), ujson.decode(output, numpy=True)) + pass + + def test_encodeDictConversion(self): + input = { "k1": 1, "k2": 2, "k3": 3, "k4": 4 } + output = ujson.encode(input) + self.assertEquals(input, json.loads(output)) + self.assertEquals(input, ujson.decode(output)) + self.assertEquals(input, ujson.decode(output)) + pass + + def test_encodeNoneConversion(self): + input = None + output = ujson.encode(input) + self.assertEquals(input, json.loads(output)) + self.assertEquals(output, json.dumps(input)) + self.assertEquals(input, ujson.decode(output)) + pass + + def test_encodeTrueConversion(self): + input = True + output = ujson.encode(input) + self.assertEquals(input, json.loads(output)) + self.assertEquals(output, json.dumps(input)) + self.assertEquals(input, ujson.decode(output)) + pass + + def test_encodeFalseConversion(self): + input = False + output = ujson.encode(input) + self.assertEquals(input, json.loads(output)) + self.assertEquals(output, json.dumps(input)) + self.assertEquals(input, ujson.decode(output)) + pass + + # def test_encodeDatetimeConversion(self): + # ts = time.time() + # input = datetime.datetime.fromtimestamp(ts) + # output = ujson.encode(input) + # expected = calendar.timegm(input.utctimetuple()) + # self.assertEquals(int(expected), json.loads(output)) + # self.assertEquals(int(expected), ujson.decode(output)) + # pass + + # def test_encodeDateConversion(self): + # ts = time.time() + # input = datetime.date.fromtimestamp(ts) + + # output = ujson.encode(input) + # tup = ( input.year, input.month, input.day, 0, 0, 0 ) + + # expected = calendar.timegm(tup) + # self.assertEquals(int(expected), json.loads(output)) + # self.assertEquals(int(expected), ujson.decode(output)) + + def test_datetime_nanosecond_unit(self): + from datetime import datetime + from pandas.lib import Timestamp + + val = datetime.now() + stamp = Timestamp(val) + + roundtrip = ujson.decode(ujson.encode(val)) + self.assert_(roundtrip == stamp.value) + + def test_encodeToUTF8(self): + _skip_if_python_ver(2, 5) + input = "\xe6\x97\xa5\xd1\x88" + enc = ujson.encode(input, ensure_ascii=False) + dec = ujson.decode(enc) + self.assertEquals(enc, json_unicode(input, ensure_ascii=False)) + self.assertEquals(dec, json.loads(enc)) + + def test_decodeFromUnicode(self): + input = u"{\"obj\": 31337}" + dec1 = ujson.decode(input) + dec2 = 
ujson.decode(str(input)) + self.assertEquals(dec1, dec2) + + def test_encodeRecursionMax(self): + # 8 is the max recursion depth + + class O2: + member = 0 + pass + + class O1: + member = 0 + pass + + input = O1() + input.member = O2() + input.member.member = input + + try: + output = ujson.encode(input) + assert False, "Expected overflow exception" + except(OverflowError): + pass + + def test_encodeDoubleNan(self): + input = np.nan + assert ujson.encode(input) == 'null', "Expected null" + + def test_encodeDoubleInf(self): + input = np.inf + assert ujson.encode(input) == 'null', "Expected null" + + def test_encodeDoubleNegInf(self): + input = -np.inf + assert ujson.encode(input) == 'null', "Expected null" + + + def test_decodeJibberish(self): + input = "fdsa sda v9sa fdsa" + try: + ujson.decode(input) + assert False, "Expected exception!" + except(ValueError): + return + assert False, "Wrong exception" + + def test_decodeBrokenArrayStart(self): + input = "[" + try: + ujson.decode(input) + assert False, "Expected exception!" + except(ValueError): + return + assert False, "Wrong exception" + + def test_decodeBrokenObjectStart(self): + input = "{" + try: + ujson.decode(input) + assert False, "Expected exception!" + except(ValueError): + return + assert False, "Wrong exception" + + def test_decodeBrokenArrayEnd(self): + input = "]" + try: + ujson.decode(input) + assert False, "Expected exception!" + except(ValueError): + return + assert False, "Wrong exception" + + def test_decodeBrokenObjectEnd(self): + input = "}" + try: + ujson.decode(input) + assert False, "Expected exception!" + except(ValueError): + return + assert False, "Wrong exception" + + def test_decodeStringUnterminated(self): + input = "\"TESTING" + try: + ujson.decode(input) + assert False, "Expected exception!" + except(ValueError): + return + assert False, "Wrong exception" + + def test_decodeStringUntermEscapeSequence(self): + input = "\"TESTING\\\"" + try: + ujson.decode(input) + assert False, "Expected exception!" + except(ValueError): + return + assert False, "Wrong exception" + + def test_decodeStringBadEscape(self): + input = "\"TESTING\\\"" + try: + ujson.decode(input) + assert False, "Expected exception!" + except(ValueError): + return + assert False, "Wrong exception" + + def test_decodeTrueBroken(self): + input = "tru" + try: + ujson.decode(input) + assert False, "Expected exception!" + except(ValueError): + return + assert False, "Wrong exception" + + def test_decodeFalseBroken(self): + input = "fa" + try: + ujson.decode(input) + assert False, "Expected exception!" + except(ValueError): + return + assert False, "Wrong exception" + + def test_decodeNullBroken(self): + input = "n" + try: + ujson.decode(input) + assert False, "Expected exception!" + except(ValueError): + return + assert False, "Wrong exception" + + + def test_decodeBrokenDictKeyTypeLeakTest(self): + input = '{{1337:""}}' + for x in xrange(1000): + try: + ujson.decode(input) + assert False, "Expected exception!" + except(ValueError),e: + continue + + assert False, "Wrong exception" + + def test_decodeBrokenDictLeakTest(self): + input = '{{"key":"}' + for x in xrange(1000): + try: + ujson.decode(input) + assert False, "Expected exception!" + except(ValueError): + continue + + assert False, "Wrong exception" + + def test_decodeBrokenListLeakTest(self): + input = '[[[true' + for x in xrange(1000): + try: + ujson.decode(input) + assert False, "Expected exception!" 
+ except(ValueError): + continue + + assert False, "Wrong exception" + + def test_decodeDictWithNoKey(self): + input = "{{{{31337}}}}" + try: + ujson.decode(input) + assert False, "Expected exception!" + except(ValueError): + return + + assert False, "Wrong exception" + + def test_decodeDictWithNoColonOrValue(self): + input = "{{{{\"key\"}}}}" + try: + ujson.decode(input) + assert False, "Expected exception!" + except(ValueError): + return + + assert False, "Wrong exception" + + def test_decodeDictWithNoValue(self): + input = "{{{{\"key\":}}}}" + try: + ujson.decode(input) + assert False, "Expected exception!" + except(ValueError): + return + + assert False, "Wrong exception" + + def test_decodeNumericIntPos(self): + input = "31337" + self.assertEquals (31337, ujson.decode(input)) + + def test_decodeNumericIntNeg(self): + input = "-31337" + self.assertEquals (-31337, ujson.decode(input)) + + def test_encodeUnicode4BytesUTF8Fail(self): + _skip_if_python_ver(3) + input = "\xfd\xbf\xbf\xbf\xbf\xbf" + try: + enc = ujson.encode(input) + assert False, "Expected exception" + except OverflowError: + pass + + def test_encodeNullCharacter(self): + input = "31337 \x00 1337" + output = ujson.encode(input) + self.assertEquals(input, json.loads(output)) + self.assertEquals(output, json.dumps(input)) + self.assertEquals(input, ujson.decode(output)) + + input = "\x00" + output = ujson.encode(input) + self.assertEquals(input, json.loads(output)) + self.assertEquals(output, json.dumps(input)) + self.assertEquals(input, ujson.decode(output)) + + self.assertEquals('" \\u0000\\r\\n "', ujson.dumps(u" \u0000\r\n ")) + pass + + def test_decodeNullCharacter(self): + input = "\"31337 \\u0000 31337\"" + self.assertEquals(ujson.decode(input), json.loads(input)) + + + def test_encodeListLongConversion(self): + input = [9223372036854775807, 9223372036854775807, 9223372036854775807, + 9223372036854775807, 9223372036854775807, 9223372036854775807 ] + output = ujson.encode(input) + self.assertEquals(input, json.loads(output)) + self.assertEquals(input, ujson.decode(output)) + assert_array_equal(np.array(input), ujson.decode(output, numpy=True, + dtype=np.int64)) + pass + + def test_encodeLongConversion(self): + input = 9223372036854775807 + output = ujson.encode(input) + self.assertEquals(input, json.loads(output)) + self.assertEquals(output, json.dumps(input)) + self.assertEquals(input, ujson.decode(output)) + pass + + def test_numericIntExp(self): + input = "1337E40" + output = ujson.decode(input) + self.assertEquals(output, json.loads(input)) + + def test_numericIntFrcExp(self): + input = "1.337E40" + output = ujson.decode(input) + self.assertAlmostEqual(output, json.loads(input)) + + def test_decodeNumericIntExpEPLUS(self): + input = "1337E+40" + output = ujson.decode(input) + self.assertAlmostEqual(output, json.loads(input)) + + def test_decodeNumericIntExpePLUS(self): + input = "1.337e+40" + output = ujson.decode(input) + self.assertAlmostEqual(output, json.loads(input)) + + def test_decodeNumericIntExpE(self): + input = "1337E40" + output = ujson.decode(input) + self.assertAlmostEqual(output, json.loads(input)) + + def test_decodeNumericIntExpe(self): + input = "1337e40" + output = ujson.decode(input) + self.assertAlmostEqual(output, json.loads(input)) + + def test_decodeNumericIntExpEMinus(self): + input = "1.337E-4" + output = ujson.decode(input) + self.assertAlmostEqual(output, json.loads(input)) + + def test_decodeNumericIntExpeMinus(self): + input = "1.337e-4" + output = ujson.decode(input) + 
self.assertAlmostEqual(output, json.loads(input)) + + def test_dumpToFile(self): + f = StringIO.StringIO() + ujson.dump([1, 2, 3], f) + self.assertEquals("[1,2,3]", f.getvalue()) + + def test_dumpToFileLikeObject(self): + class filelike: + def __init__(self): + self.bytes = '' + def write(self, bytes): + self.bytes += bytes + f = filelike() + ujson.dump([1, 2, 3], f) + self.assertEquals("[1,2,3]", f.bytes) + + def test_dumpFileArgsError(self): + try: + ujson.dump([], '') + except TypeError: + pass + else: + assert False, 'expected TypeError' + + def test_loadFile(self): + f = StringIO.StringIO("[1,2,3,4]") + self.assertEquals([1, 2, 3, 4], ujson.load(f)) + f = StringIO.StringIO("[1,2,3,4]") + assert_array_equal(np.array([1, 2, 3, 4]), ujson.load(f, numpy=True)) + + def test_loadFileLikeObject(self): + class filelike: + def read(self): + try: + self.end + except AttributeError: + self.end = True + return "[1,2,3,4]" + f = filelike() + self.assertEquals([1, 2, 3, 4], ujson.load(f)) + f = filelike() + assert_array_equal(np.array([1, 2, 3, 4]), ujson.load(f, numpy=True)) + + def test_loadFileArgsError(self): + try: + ujson.load("[]") + except TypeError: + pass + else: + assert False, "expected TypeError" + + def test_version(self): + assert re.match(r'^\d+\.\d+(\.\d+)?$', ujson.__version__), \ + "ujson.__version__ must be a string like '1.4.0'" + + def test_encodeNumericOverflow(self): + try: + ujson.encode(12839128391289382193812939) + except OverflowError: + pass + else: + assert False, "expected OverflowError" + + def test_encodeNumericOverflowNested(self): + for n in xrange(0, 100): + class Nested: + x = 12839128391289382193812939 + + nested = Nested() + + try: + ujson.encode(nested) + except OverflowError: + pass + else: + assert False, "expected OverflowError" + + def test_decodeNumberWith32bitSignBit(self): + #Test that numbers that fit within 32 bits but would have the + # sign bit set (2**31 <= x < 2**32) are decoded properly. 
+ boundary1 = 2**31 + boundary2 = 2**32 + docs = ( + '{"id": 3590016419}', + '{"id": %s}' % 2**31, + '{"id": %s}' % 2**32, + '{"id": %s}' % ((2**32)-1), + ) + results = (3590016419, 2**31, 2**32, 2**32-1) + for doc,result in zip(docs, results): + self.assertEqual(ujson.decode(doc)['id'], result) + + def test_encodeBigEscape(self): + for x in xrange(10): + if py3compat.PY3: + base = '\u00e5'.encode('utf-8') + else: + base = "\xc3\xa5" + input = base * 1024 * 1024 * 2 + output = ujson.encode(input) + + def test_decodeBigEscape(self): + for x in xrange(10): + if py3compat.PY3: + base = '\u00e5'.encode('utf-8') + else: + base = "\xc3\xa5" + quote = py3compat.str_to_bytes("\"") + input = quote + (base * 1024 * 1024 * 2) + quote + output = ujson.decode(input) + + def test_toDict(self): + d = {u"key": 31337} + + class DictTest: + def toDict(self): + return d + + o = DictTest() + output = ujson.encode(o) + dec = ujson.decode(output) + self.assertEquals(dec, d) + + +class NumpyJSONTests(TestCase): + + def testBool(self): + b = np.bool(True) + self.assertEqual(ujson.decode(ujson.encode(b)), b) + + def testBoolArray(self): + inpt = np.array([True, False, True, True, False, True, False , False], + dtype=np.bool) + outp = np.array(ujson.decode(ujson.encode(inpt)), dtype=np.bool) + assert_array_equal(inpt, outp) + + def testInt(self): + num = np.int(2562010) + self.assertEqual(np.int(ujson.decode(ujson.encode(num))), num) + + num = np.int8(127) + self.assertEqual(np.int8(ujson.decode(ujson.encode(num))), num) + + num = np.int16(2562010) + self.assertEqual(np.int16(ujson.decode(ujson.encode(num))), num) + + num = np.int32(2562010) + self.assertEqual(np.int32(ujson.decode(ujson.encode(num))), num) + + num = np.int64(2562010) + self.assertEqual(np.int64(ujson.decode(ujson.encode(num))), num) + + num = np.uint8(255) + self.assertEqual(np.uint8(ujson.decode(ujson.encode(num))), num) + + num = np.uint16(2562010) + self.assertEqual(np.uint16(ujson.decode(ujson.encode(num))), num) + + num = np.uint32(2562010) + self.assertEqual(np.uint32(ujson.decode(ujson.encode(num))), num) + + num = np.uint64(2562010) + self.assertEqual(np.uint64(ujson.decode(ujson.encode(num))), num) + + def testIntArray(self): + arr = np.arange(100, dtype=np.int) + dtypes = (np.int, np.int8, np.int16, np.int32, np.int64, + np.uint, np.uint8, np.uint16, np.uint32, np.uint64) + for dtype in dtypes: + inpt = arr.astype(dtype) + outp = np.array(ujson.decode(ujson.encode(inpt)), dtype=dtype) + assert_array_equal(inpt, outp) + + def testIntMax(self): + num = np.int(np.iinfo(np.int).max) + self.assertEqual(np.int(ujson.decode(ujson.encode(num))), num) + + num = np.int8(np.iinfo(np.int8).max) + self.assertEqual(np.int8(ujson.decode(ujson.encode(num))), num) + + num = np.int16(np.iinfo(np.int16).max) + self.assertEqual(np.int16(ujson.decode(ujson.encode(num))), num) + + num = np.int32(np.iinfo(np.int32).max) + self.assertEqual(np.int32(ujson.decode(ujson.encode(num))), num) + + num = np.uint8(np.iinfo(np.uint8).max) + self.assertEqual(np.uint8(ujson.decode(ujson.encode(num))), num) + + num = np.uint16(np.iinfo(np.uint16).max) + self.assertEqual(np.uint16(ujson.decode(ujson.encode(num))), num) + + num = np.uint32(np.iinfo(np.uint32).max) + self.assertEqual(np.uint32(ujson.decode(ujson.encode(num))), num) + + if platform.architecture()[0] != '32bit': + num = np.int64(np.iinfo(np.int64).max) + self.assertEqual(np.int64(ujson.decode(ujson.encode(num))), num) + + # uint64 max will always overflow as it's encoded to signed + num = 
np.uint64(np.iinfo(np.int64).max) + self.assertEqual(np.uint64(ujson.decode(ujson.encode(num))), num) + + def testFloat(self): + num = np.float(256.2013) + self.assertEqual(np.float(ujson.decode(ujson.encode(num))), num) + + num = np.float32(256.2013) + self.assertEqual(np.float32(ujson.decode(ujson.encode(num))), num) + + num = np.float64(256.2013) + self.assertEqual(np.float64(ujson.decode(ujson.encode(num))), num) + + def testFloatArray(self): + arr = np.arange(12.5, 185.72, 1.7322, dtype=np.float) + dtypes = (np.float, np.float32, np.float64) + + for dtype in dtypes: + inpt = arr.astype(dtype) + outp = np.array(ujson.decode(ujson.encode(inpt, double_precision=15)), dtype=dtype) + assert_array_almost_equal_nulp(inpt, outp) + + def testFloatMax(self): + num = np.float(np.finfo(np.float).max/10) + assert_approx_equal(np.float(ujson.decode(ujson.encode(num))), num, 15) + + num = np.float32(np.finfo(np.float32).max/10) + assert_approx_equal(np.float32(ujson.decode(ujson.encode(num))), num, 15) + + num = np.float64(np.finfo(np.float64).max/10) + assert_approx_equal(np.float64(ujson.decode(ujson.encode(num))), num, 15) + + def testArrays(self): + arr = np.arange(100); + + arr = arr.reshape((10, 10)) + assert_array_equal(np.array(ujson.decode(ujson.encode(arr))), arr) + assert_array_equal(ujson.decode(ujson.encode(arr), numpy=True), arr) + + arr = arr.reshape((5, 5, 4)) + assert_array_equal(np.array(ujson.decode(ujson.encode(arr))), arr) + assert_array_equal(ujson.decode(ujson.encode(arr), numpy=True), arr) + + arr = arr.reshape((100, 1)) + assert_array_equal(np.array(ujson.decode(ujson.encode(arr))), arr) + assert_array_equal(ujson.decode(ujson.encode(arr), numpy=True), arr) + + arr = np.arange(96); + arr = arr.reshape((2, 2, 2, 2, 3, 2)) + assert_array_equal(np.array(ujson.decode(ujson.encode(arr))), arr) + assert_array_equal(ujson.decode(ujson.encode(arr), numpy=True), arr) + + l = ['a', list(), dict(), dict(), list(), + 42, 97.8, ['a', 'b'], {'key': 'val'}] + arr = np.array(l) + assert_array_equal(np.array(ujson.decode(ujson.encode(arr))), arr) + + arr = np.arange(100.202, 200.202, 1, dtype=np.float32); + arr = arr.reshape((5, 5, 4)) + outp = np.array(ujson.decode(ujson.encode(arr)), dtype=np.float32) + assert_array_almost_equal_nulp(arr, outp) + outp = ujson.decode(ujson.encode(arr), numpy=True, dtype=np.float32) + assert_array_almost_equal_nulp(arr, outp) + + def testArrayNumpyExcept(self): + + input = ujson.dumps([42, {}, 'a']) + try: + ujson.decode(input, numpy=True) + assert False, "Expected exception!" + except(TypeError): + pass + except: + assert False, "Wrong exception" + + input = ujson.dumps(['a', 'b', [], 'c']) + try: + ujson.decode(input, numpy=True) + assert False, "Expected exception!" + except(ValueError): + pass + except: + assert False, "Wrong exception" + + input = ujson.dumps([['a'], 42]) + try: + ujson.decode(input, numpy=True) + assert False, "Expected exception!" + except(ValueError): + pass + except: + assert False, "Wrong exception" + + input = ujson.dumps([42, ['a'], 42]) + try: + ujson.decode(input, numpy=True) + assert False, "Expected exception!" + except(ValueError): + pass + except: + assert False, "Wrong exception" + + input = ujson.dumps([{}, []]) + try: + ujson.decode(input, numpy=True) + assert False, "Expected exception!" + except(ValueError): + pass + except: + assert False, "Wrong exception" + + input = ujson.dumps([42, None]) + try: + ujson.decode(input, numpy=True) + assert False, "Expected exception!" 
+ except(TypeError): + pass + except: + assert False, "Wrong exception" + + input = ujson.dumps([{'a': 'b'}]) + try: + ujson.decode(input, numpy=True, labelled=True) + assert False, "Expected exception!" + except(ValueError): + pass + except: + assert False, "Wrong exception" + + input = ujson.dumps({'a': {'b': {'c': 42}}}) + try: + ujson.decode(input, numpy=True, labelled=True) + assert False, "Expected exception!" + except(ValueError): + pass + except: + assert False, "Wrong exception" + + input = ujson.dumps([{'a': 42, 'b': 23}, {'c': 17}]) + try: + ujson.decode(input, numpy=True, labelled=True) + assert False, "Expected exception!" + except(ValueError): + pass + except: + assert False, "Wrong exception" + + def testArrayNumpyLabelled(self): + input = {'a': []} + output = ujson.loads(ujson.dumps(input), numpy=True, labelled=True) + self.assertTrue((np.empty((1, 0)) == output[0]).all()) + self.assertTrue((np.array(['a']) == output[1]).all()) + self.assertTrue(output[2] is None) + + input = [{'a': 42}] + output = ujson.loads(ujson.dumps(input), numpy=True, labelled=True) + self.assertTrue((np.array([42]) == output[0]).all()) + self.assertTrue(output[1] is None) + self.assertTrue((np.array([u'a']) == output[2]).all()) + + input = [{'a': 42, 'b':31}, {'a': 24, 'c': 99}, {'a': 2.4, 'b': 78}] + output = ujson.loads(ujson.dumps(input), numpy=True, labelled=True) + expectedvals = np.array([42, 31, 24, 99, 2.4, 78], dtype=int).reshape((3,2)) + self.assertTrue((expectedvals == output[0]).all()) + self.assertTrue(output[1] is None) + self.assertTrue((np.array([u'a', 'b']) == output[2]).all()) + + + input = {1: {'a': 42, 'b':31}, 2: {'a': 24, 'c': 99}, 3: {'a': 2.4, 'b': 78}} + output = ujson.loads(ujson.dumps(input), numpy=True, labelled=True) + expectedvals = np.array([42, 31, 24, 99, 2.4, 78], dtype=int).reshape((3,2)) + self.assertTrue((expectedvals == output[0]).all()) + self.assertTrue((np.array(['1','2','3']) == output[1]).all()) + self.assertTrue((np.array(['a', 'b']) == output[2]).all()) + +class PandasJSONTests(TestCase): + + def testDataFrame(self): + df = DataFrame([[1,2,3], [4,5,6]], index=['a', 'b'], columns=['x', 'y', 'z']) + + # column indexed + outp = DataFrame(ujson.decode(ujson.encode(df))) + self.assertTrue((df == outp).values.all()) + assert_array_equal(df.columns, outp.columns) + assert_array_equal(df.index, outp.index) + + dec = _clean_dict(ujson.decode(ujson.encode(df, orient="split"))) + outp = DataFrame(**dec) + self.assertTrue((df == outp).values.all()) + assert_array_equal(df.columns, outp.columns) + assert_array_equal(df.index, outp.index) + + outp = DataFrame(ujson.decode(ujson.encode(df, orient="records"))) + outp.index = df.index + self.assertTrue((df == outp).values.all()) + assert_array_equal(df.columns, outp.columns) + + outp = DataFrame(ujson.decode(ujson.encode(df, orient="values"))) + outp.index = df.index + self.assertTrue((df.values == outp.values).all()) + + outp = DataFrame(ujson.decode(ujson.encode(df, orient="index"))) + self.assertTrue((df.transpose() == outp).values.all()) + assert_array_equal(df.transpose().columns, outp.columns) + assert_array_equal(df.transpose().index, outp.index) + + + def testDataFrameNumpy(self): + df = DataFrame([[1,2,3], [4,5,6]], index=['a', 'b'], columns=['x', 'y', 'z']) + + # column indexed + outp = DataFrame(ujson.decode(ujson.encode(df), numpy=True)) + self.assertTrue((df == outp).values.all()) + assert_array_equal(df.columns, outp.columns) + assert_array_equal(df.index, outp.index) + + dec = 
_clean_dict(ujson.decode(ujson.encode(df, orient="split"), + numpy=True)) + outp = DataFrame(**dec) + self.assertTrue((df == outp).values.all()) + assert_array_equal(df.columns, outp.columns) + assert_array_equal(df.index, outp.index) + + outp = DataFrame(ujson.decode(ujson.encode(df, orient="index"), numpy=True)) + self.assertTrue((df.transpose() == outp).values.all()) + assert_array_equal(df.transpose().columns, outp.columns) + assert_array_equal(df.transpose().index, outp.index) + + def testDataFrameNested(self): + df = DataFrame([[1,2,3], [4,5,6]], index=['a', 'b'], columns=['x', 'y', 'z']) + + nested = {'df1': df, 'df2': df.copy()} + + exp = {'df1': ujson.decode(ujson.encode(df)), + 'df2': ujson.decode(ujson.encode(df))} + self.assertTrue(ujson.decode(ujson.encode(nested)) == exp) + + exp = {'df1': ujson.decode(ujson.encode(df, orient="index")), + 'df2': ujson.decode(ujson.encode(df, orient="index"))} + self.assertTrue(ujson.decode(ujson.encode(nested, orient="index")) == exp) + + exp = {'df1': ujson.decode(ujson.encode(df, orient="records")), + 'df2': ujson.decode(ujson.encode(df, orient="records"))} + self.assertTrue(ujson.decode(ujson.encode(nested, orient="records")) == exp) + + exp = {'df1': ujson.decode(ujson.encode(df, orient="values")), + 'df2': ujson.decode(ujson.encode(df, orient="values"))} + self.assertTrue(ujson.decode(ujson.encode(nested, orient="values")) == exp) + + exp = {'df1': ujson.decode(ujson.encode(df, orient="split")), + 'df2': ujson.decode(ujson.encode(df, orient="split"))} + self.assertTrue(ujson.decode(ujson.encode(nested, orient="split")) == exp) + + def testDataFrameNumpyLabelled(self): + df = DataFrame([[1,2,3], [4,5,6]], index=['a', 'b'], columns=['x', 'y', 'z']) + + # column indexed + outp = DataFrame(*ujson.decode(ujson.encode(df), numpy=True, labelled=True)) + self.assertTrue((df.T == outp).values.all()) + assert_array_equal(df.T.columns, outp.columns) + assert_array_equal(df.T.index, outp.index) + + outp = DataFrame(*ujson.decode(ujson.encode(df, orient="records"), numpy=True, labelled=True)) + outp.index = df.index + self.assertTrue((df == outp).values.all()) + assert_array_equal(df.columns, outp.columns) + + outp = DataFrame(*ujson.decode(ujson.encode(df, orient="index"), numpy=True, labelled=True)) + self.assertTrue((df == outp).values.all()) + assert_array_equal(df.columns, outp.columns) + assert_array_equal(df.index, outp.index) + + def testSeries(self): + s = Series([10, 20, 30, 40, 50, 60], name="series", index=[6,7,8,9,10,15]) + s.sort() + + # column indexed + outp = Series(ujson.decode(ujson.encode(s))) + outp.sort() + self.assertTrue((s == outp).values.all()) + + outp = Series(ujson.decode(ujson.encode(s), numpy=True)) + outp.sort() + self.assertTrue((s == outp).values.all()) + + dec = _clean_dict(ujson.decode(ujson.encode(s, orient="split"))) + outp = Series(**dec) + self.assertTrue((s == outp).values.all()) + self.assertTrue(s.name == outp.name) + + dec = _clean_dict(ujson.decode(ujson.encode(s, orient="split"), + numpy=True)) + outp = Series(**dec) + self.assertTrue((s == outp).values.all()) + self.assertTrue(s.name == outp.name) + + outp = Series(ujson.decode(ujson.encode(s, orient="records"), numpy=True)) + self.assertTrue((s == outp).values.all()) + + outp = Series(ujson.decode(ujson.encode(s, orient="records"))) + self.assertTrue((s == outp).values.all()) + + outp = Series(ujson.decode(ujson.encode(s, orient="values"), numpy=True)) + self.assertTrue((s == outp).values.all()) + + outp = Series(ujson.decode(ujson.encode(s, 
orient="values"))) + self.assertTrue((s == outp).values.all()) + + outp = Series(ujson.decode(ujson.encode(s, orient="index"))) + outp.sort() + self.assertTrue((s == outp).values.all()) + + outp = Series(ujson.decode(ujson.encode(s, orient="index"), numpy=True)) + outp.sort() + self.assertTrue((s == outp).values.all()) + + def testSeriesNested(self): + s = Series([10, 20, 30, 40, 50, 60], name="series", index=[6,7,8,9,10,15]) + s.sort() + + nested = {'s1': s, 's2': s.copy()} + + exp = {'s1': ujson.decode(ujson.encode(s)), + 's2': ujson.decode(ujson.encode(s))} + self.assertTrue(ujson.decode(ujson.encode(nested)) == exp) + + exp = {'s1': ujson.decode(ujson.encode(s, orient="split")), + 's2': ujson.decode(ujson.encode(s, orient="split"))} + self.assertTrue(ujson.decode(ujson.encode(nested, orient="split")) == exp) + + exp = {'s1': ujson.decode(ujson.encode(s, orient="records")), + 's2': ujson.decode(ujson.encode(s, orient="records"))} + self.assertTrue(ujson.decode(ujson.encode(nested, orient="records")) == exp) + + exp = {'s1': ujson.decode(ujson.encode(s, orient="values")), + 's2': ujson.decode(ujson.encode(s, orient="values"))} + self.assertTrue(ujson.decode(ujson.encode(nested, orient="values")) == exp) + + exp = {'s1': ujson.decode(ujson.encode(s, orient="index")), + 's2': ujson.decode(ujson.encode(s, orient="index"))} + self.assertTrue(ujson.decode(ujson.encode(nested, orient="index")) == exp) + + def testIndex(self): + i = Index([23, 45, 18, 98, 43, 11], name="index") + + # column indexed + outp = Index(ujson.decode(ujson.encode(i))) + self.assert_(i.equals(outp)) + + outp = Index(ujson.decode(ujson.encode(i), numpy=True)) + self.assert_(i.equals(outp)) + + dec = _clean_dict(ujson.decode(ujson.encode(i, orient="split"))) + outp = Index(**dec) + self.assert_(i.equals(outp)) + self.assertTrue(i.name == outp.name) + + dec = _clean_dict(ujson.decode(ujson.encode(i, orient="split"), + numpy=True)) + outp = Index(**dec) + self.assert_(i.equals(outp)) + self.assertTrue(i.name == outp.name) + + outp = Index(ujson.decode(ujson.encode(i, orient="values"))) + self.assert_(i.equals(outp)) + + outp = Index(ujson.decode(ujson.encode(i, orient="values"), numpy=True)) + self.assert_(i.equals(outp)) + + outp = Index(ujson.decode(ujson.encode(i, orient="records"))) + self.assert_(i.equals(outp)) + + outp = Index(ujson.decode(ujson.encode(i, orient="records"), numpy=True)) + self.assert_(i.equals(outp)) + + outp = Index(ujson.decode(ujson.encode(i, orient="index"))) + self.assert_(i.equals(outp)) + + outp = Index(ujson.decode(ujson.encode(i, orient="index"), numpy=True)) + self.assert_(i.equals(outp)) + + def test_datetimeindex(self): + from pandas.tseries.index import date_range, DatetimeIndex + + rng = date_range('1/1/2000', periods=20) + + encoded = ujson.encode(rng) + decoded = DatetimeIndex(np.array(ujson.decode(encoded))) + + self.assert_(rng.equals(decoded)) + + ts = Series(np.random.randn(len(rng)), index=rng) + decoded = Series(ujson.decode(ujson.encode(ts))) + idx_values = decoded.index.values.astype(np.int64) + decoded.index = DatetimeIndex(idx_values) + tm.assert_series_equal(np.round(ts, 5), decoded) + +""" +def test_decodeNumericIntFrcOverflow(self): +input = "X.Y" +raise NotImplementedError("Implement this test!") + + +def test_decodeStringUnicodeEscape(self): +input = "\u3131" +raise NotImplementedError("Implement this test!") + +def test_decodeStringUnicodeBrokenEscape(self): +input = "\u3131" +raise NotImplementedError("Implement this test!") + +def 
test_decodeStringUnicodeInvalidEscape(self): +input = "\u3131" +raise NotImplementedError("Implement this test!") + +def test_decodeStringUTF8(self): +input = "someutfcharacters" +raise NotImplementedError("Implement this test!") + + + +""" + +def _clean_dict(d): + return dict((str(k), v) for k, v in d.iteritems()) + +if __name__ == '__main__': + # unittest.main() + import nose + # nose.runmodule(argv=[__file__,'-vvs','-x', '--ipdb-failure'], + # exit=False) + nose.runmodule(argv=[__file__,'-vvs','-x','--pdb', '--pdb-failure'], + exit=False) diff --git a/pandas/src/ujson/lib/ultrajson.h b/pandas/src/ujson/lib/ultrajson.h new file mode 100644 index 0000000000000..eae665f00f03e --- /dev/null +++ b/pandas/src/ujson/lib/ultrajson.h @@ -0,0 +1,298 @@ +/* +Copyright (c) 2011, Jonas Tarnstrom and ESN Social Software AB +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: +1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. +3. All advertising materials mentioning features or use of this software + must display the following acknowledgement: + This product includes software developed by ESN Social Software AB (www.esn.me). +4. Neither the name of the ESN Social Software AB nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY ESN SOCIAL SOFTWARE AB ''AS IS'' AND ANY +EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL ESN SOCIAL SOFTWARE AB BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +Portions of code from: +MODP_ASCII - Ascii transformations (upper/lower, etc) +http://code.google.com/p/stringencoders/ +Copyright (c) 2007 Nick Galbreath -- nickg [at] modp [dot] com. All rights reserved. + +*/ + +/* +Ultra fast JSON encoder and decoder +Developed by Jonas Tarnstrom (jonas@esn.me). + +Encoder notes: +------------------ + +:: Cyclic references :: +Cyclic referenced objects are not detected. +Set JSONObjectEncoder.recursionMax to suitable value or make sure input object +tree doesn't have cyclic references. 
+
+*/
+
+#ifndef __ULTRAJSON_H__
+#define __ULTRAJSON_H__
+
+#include <stdio.h>
+#include <wchar.h>
+
+//#define JSON_DECODE_NUMERIC_AS_DOUBLE
+
+// Don't output any extra whitespaces when encoding
+#define JSON_NO_EXTRA_WHITESPACE
+
+// Max decimals to encode double floating point numbers with
+#ifndef JSON_DOUBLE_MAX_DECIMALS
+#define JSON_DOUBLE_MAX_DECIMALS 15
+#endif
+
+// Max recursion depth, default for encoder
+#ifndef JSON_MAX_RECURSION_DEPTH
+#define JSON_MAX_RECURSION_DEPTH 1024
+#endif
+
+/*
+Dictates and limits how much stack space for buffers UltraJSON will use before resorting to provided heap functions */
+#ifndef JSON_MAX_STACK_BUFFER_SIZE
+#define JSON_MAX_STACK_BUFFER_SIZE 131072
+#endif
+
+#ifdef _WIN32
+
+typedef __int64 JSINT64;
+typedef unsigned __int64 JSUINT64;
+
+typedef __int32 JSINT32;
+typedef unsigned __int32 JSUINT32;
+typedef unsigned __int8 JSUINT8;
+typedef unsigned __int16 JSUTF16;
+typedef unsigned __int32 JSUTF32;
+typedef __int64 JSLONG;
+
+#define EXPORTFUNCTION __declspec(dllexport)
+
+#define FASTCALL_MSVC __fastcall
+#define FASTCALL_ATTR
+#define INLINE_PREFIX __inline
+
+#else
+
+#include <sys/types.h>
+typedef int64_t JSINT64;
+typedef u_int64_t JSUINT64;
+
+typedef int32_t JSINT32;
+typedef u_int32_t JSUINT32;
+
+#define FASTCALL_MSVC
+#define FASTCALL_ATTR __attribute__((fastcall))
+#define INLINE_PREFIX inline
+
+typedef u_int8_t JSUINT8;
+typedef u_int16_t JSUTF16;
+typedef u_int32_t JSUTF32;
+
+typedef int64_t JSLONG;
+
+#define EXPORTFUNCTION
+#endif
+
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+#define __LITTLE_ENDIAN__
+#else
+
+#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+#define __BIG_ENDIAN__
+#endif
+
+#endif
+
+#if !defined(__LITTLE_ENDIAN__) && !defined(__BIG_ENDIAN__)
+#error "Endianness not supported"
+#endif
+
+enum JSTYPES
+{
+    JT_NULL,    // NULL
+    JT_TRUE,    // boolean true
+    JT_FALSE,   // boolean false
+    JT_INT,     // (JSINT32 (signed 32-bit))
+    JT_LONG,    // (JSINT64 (signed 64-bit))
+    JT_DOUBLE,  // (double)
+    JT_UTF8,    // (char 8-bit)
+    JT_ARRAY,   // Array structure
+    JT_OBJECT,  // Key/Value structure
+    JT_INVALID, // Internal, do not return nor expect
+};
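On the Python side of the binding these tags correspond to plain scalar and container types. A sketch of the correspondence, assuming the built pandas.json module (outputs shown in comments are illustrative):

    import pandas.json as ujson

    ujson.encode(None)        # 'null'     -> JT_NULL
    ujson.encode(True)        # 'true'     -> JT_TRUE
    ujson.encode(31337)       # JT_INT, fits in a signed 32-bit value
    ujson.encode(2 ** 40)     # JT_LONG, needs 64 bits
    ujson.encode(3.14)        # JT_DOUBLE
    ujson.encode(u'abc')      # '"abc"'    -> JT_UTF8
    ujson.encode([1, 2, 3])   # '[1,2,3]'  -> JT_ARRAY
    ujson.encode({'a': 1})    # '{"a":1}'  -> JT_OBJECT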
+
+typedef void * JSOBJ;
+typedef void * JSITER;
+
+typedef struct __JSONTypeContext
+{
+    int type;
+    void *encoder;
+    void *prv;
+} JSONTypeContext;
+
+/*
+Function pointer declarations, suitable for implementing UltraJSON */
+typedef void (*JSPFN_ITERBEGIN)(JSOBJ obj, JSONTypeContext *tc);
+typedef int (*JSPFN_ITERNEXT)(JSOBJ obj, JSONTypeContext *tc);
+typedef void (*JSPFN_ITEREND)(JSOBJ obj, JSONTypeContext *tc);
+typedef JSOBJ (*JSPFN_ITERGETVALUE)(JSOBJ obj, JSONTypeContext *tc);
+typedef char *(*JSPFN_ITERGETNAME)(JSOBJ obj, JSONTypeContext *tc, size_t *outLen);
+typedef void *(*JSPFN_MALLOC)(size_t size);
+typedef void (*JSPFN_FREE)(void *pptr);
+typedef void *(*JSPFN_REALLOC)(void *base, size_t size);
+
+typedef struct __JSONObjectEncoder
+{
+    void (*beginTypeContext)(JSOBJ obj, JSONTypeContext *tc);
+    void (*endTypeContext)(JSOBJ obj, JSONTypeContext *tc);
+    const char *(*getStringValue)(JSOBJ obj, JSONTypeContext *tc, size_t *_outLen);
+    JSINT64 (*getLongValue)(JSOBJ obj, JSONTypeContext *tc);
+    JSINT32 (*getIntValue)(JSOBJ obj, JSONTypeContext *tc);
+    double (*getDoubleValue)(JSOBJ obj, JSONTypeContext *tc);
+
+    /*
+    Begin iteration of an iterable object (JS_ARRAY or JS_OBJECT)
+    Implementor should set up iteration state in ti->prv
+    */
+    JSPFN_ITERBEGIN iterBegin;
+
+    /*
+    Retrieve next object in an iteration. Should return 0 to indicate iteration has reached end or 1 if there are more items.
+    Implementor is responsible for keeping state of the iteration. Use ti->prv fields for this
+    */
+    JSPFN_ITERNEXT iterNext;
+
+    /*
+    Ends the iteration of an iterable object.
+    Any iteration state stored in ti->prv can be freed here
+    */
+    JSPFN_ITEREND iterEnd;
+
+    /*
+    Returns a reference to the value object of an iterator
+    The implementor is responsible for the life-cycle of the returned value. Use iterNext/iterEnd and ti->prv to keep track of current object
+    */
+    JSPFN_ITERGETVALUE iterGetValue;
+
+    /*
+    Return name of iterator.
+    The implementor is responsible for the life-cycle of the returned string. Use iterNext/iterEnd and ti->prv to keep track of current object
+    */
+    JSPFN_ITERGETNAME iterGetName;
+
+    /*
+    Release a value as indicated by setting ti->release = 1 in the previous getValue call.
+    The ti->prv array should contain the necessary context to release the value
+    */
+    void (*releaseObject)(JSOBJ obj);
+
+    /* Library functions
+    Set to NULL to use STDLIB malloc,realloc,free */
+    JSPFN_MALLOC malloc;
+    JSPFN_REALLOC realloc;
+    JSPFN_FREE free;
+
+    /*
+    Configuration for max recursion, set to 0 to use default (see JSON_MAX_RECURSION_DEPTH) */
+    int recursionMax;
+
+    /*
+    Configuration for max decimals of double floating point numbers to encode (0-9) */
+    int doublePrecision;
+
+    /*
+    If true output will be ASCII with all characters above 127 encoded as \uXXXX. If false output will be UTF-8 or whatever charset strings are brought as */
+    int forceASCII;
+
+
+    /*
+    Set to an error message if an error occurred */
+    const char *errorMsg;
+    JSOBJ errorObj;
+
+    /* Buffer stuff */
+    char *start;
+    char *offset;
+    char *end;
+    int heap;
+    int level;
+
+} JSONObjectEncoder;
+
+
+/*
+Encode an object structure into JSON.
+
+Arguments:
+obj - An anonymous type representing the object
+enc - Function definitions for querying JSOBJ type
+buffer - Preallocated buffer to store result in. If NULL function allocates own buffer
+cbBuffer - Length of buffer (ignored if buffer is NULL)
+
+Returns:
+Encoded JSON object as a null terminated char string.
+
+NOTE:
+If the supplied buffer wasn't enough to hold the result the function will allocate a new buffer.
+Life cycle of the provided buffer must still be handled by caller.
+
+If the return value doesn't equal the specified buffer caller must release the memory using
+JSONObjectEncoder.free or free() as specified when calling this function.
+*/
+EXPORTFUNCTION char *JSON_EncodeObject(JSOBJ obj, JSONObjectEncoder *enc, char *buffer, size_t cbBuffer);
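The keyword arguments exercised throughout the tests above land in this struct's configuration fields. A hedged sketch of the mapping as seen from a Python call (not the binding's literal code; outputs are illustrative):

    import pandas.json as ujson

    # double_precision maps onto JSONObjectEncoder.doublePrecision
    ujson.encode(3.141592653589793, double_precision=3)   # '3.142'

    # ensure_ascii maps onto JSONObjectEncoder.forceASCII
    ujson.encode(u'Räksmörgås', ensure_ascii=False)       # UTF-8, unescaped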
+
+
+
+typedef struct __JSONObjectDecoder
+{
+    JSOBJ (*newString)(wchar_t *start, wchar_t *end);
+    int (*objectAddKey)(JSOBJ obj, JSOBJ name, JSOBJ value);
+    int (*arrayAddItem)(JSOBJ obj, JSOBJ value);
+    JSOBJ (*newTrue)(void);
+    JSOBJ (*newFalse)(void);
+    JSOBJ (*newNull)(void);
+    JSOBJ (*newObject)(void *decoder);
+    JSOBJ (*endObject)(JSOBJ obj);
+    JSOBJ (*newArray)(void *decoder);
+    JSOBJ (*endArray)(JSOBJ obj);
+    JSOBJ (*newInt)(JSINT32 value);
+    JSOBJ (*newLong)(JSINT64 value);
+    JSOBJ (*newDouble)(double value);
+    void (*releaseObject)(JSOBJ obj, void *decoder);
+    JSPFN_MALLOC malloc;
+    JSPFN_FREE free;
+    JSPFN_REALLOC realloc;
+
+    char *errorStr;
+    char *errorOffset;
+
+
+
+} JSONObjectDecoder;
+
+EXPORTFUNCTION JSOBJ JSON_DecodeObject(JSONObjectDecoder *dec, const char *buffer, size_t cbBuffer);
+
+#endif
diff --git a/pandas/src/ujson/python/py_defines.h b/pandas/src/ujson/python/py_defines.h
new file mode 100644
index 0000000000000..1544c2e3cf34d
--- /dev/null
+++ b/pandas/src/ujson/python/py_defines.h
@@ -0,0 +1,15 @@
+#include <Python.h>
+
+#if PY_MAJOR_VERSION >= 3
+
+#define PyInt_Check PyLong_Check
+#define PyInt_AS_LONG PyLong_AsLong
+#define PyInt_FromLong PyLong_FromLong
+
+#define PyString_Check PyBytes_Check
+#define PyString_GET_SIZE PyBytes_GET_SIZE
+#define PyString_AS_STRING PyBytes_AS_STRING
+
+#define PyString_FromString PyUnicode_FromString
+
+#endif
diff --git a/pandas/src/ujson/python/version.h b/pandas/src/ujson/python/version.h
new file mode 100644
index 0000000000000..9449441411192
--- /dev/null
+++ b/pandas/src/ujson/python/version.h
@@ -0,0 +1 @@
+#define UJSON_VERSION "1.18"
diff --git a/setup.py b/setup.py
index 3e56144e25378..c451efb17afc7 100755
--- a/setup.py
+++ b/setup.py
@@ -250,6 +250,11 @@ def initialize_options(self):
             for f in files:
                 if f in self._clean_exclude:
                     continue
+
+                # XXX
+                if 'ujson' in f:
+                    continue
+
                 if os.path.splitext(f)[-1] in ('.pyc', '.so', '.o',
                                                '.pyo',
                                                '.pyd', '.c', '.orig'):
@@ -472,6 +477,21 @@ def pxd(name):
         root, _ = os.path.splitext(ext.sources[0])
         ext.sources[0] = root + suffix
 
+ujson_ext = Extension('pandas.json',
+                      depends=['pandas/src/ujson/lib/ultrajson.h'],
+                      sources=['pandas/src/ujson/python/ujson.c',
+                               'pandas/src/ujson/python/objToJSON.c',
+                               'pandas/src/ujson/python/JSONtoObj.c',
+                               'pandas/src/ujson/lib/ultrajsonenc.c',
+                               'pandas/src/ujson/lib/ultrajsondec.c',
+                               'pandas/src/datetime/np_datetime.c',
+                               'pandas/src/datetime/np_datetime_strings.c'],
+                      include_dirs=['pandas/src/ujson/python',
+                                    'pandas/src/ujson/lib'] + common_include)
+
+
+extensions.append(ujson_ext)
+
 if _have_setuptools:
     setuptools_kwargs["test_suite"] = "nose.collector"
 
@@ -500,6 +520,7 @@ def pxd(name):
           'pandas.tseries',
           'pandas.tseries.tests',
           'pandas.io.tests',
+          'pandas.io.tests.test_json',
           'pandas.stats.tests',
       ],
       package_data={'pandas.io': ['tests/data/legacy_hdf/*.h5',
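With these setup.py changes in place, a quick smoke test of the freshly built extension might look like this (a hypothetical session, after running python setup.py build_ext --inplace):

    import pandas.json as ujson

    ujson.dumps({'a': [1, 2, 3]})    # '{"a":[1,2,3]}'
    ujson.loads('{"a":[1,2,3]}')     # {u'a': [1, 2, 3]}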

From 958096be17168b76060b5cf47d20d76d8727ae77 Mon Sep 17 00:00:00 2001
From: Wes McKinney
Date: Sat, 11 May 2013 18:39:55 -0700
Subject: [PATCH 2/3] DOC: add ultrajson license

---
 LICENSES/ULTRAJSON_LICENSE | 34 ++++++++++++++++++++++++++++++++++
 1 file changed, 34 insertions(+)
 create mode 100644 LICENSES/ULTRAJSON_LICENSE

diff --git a/LICENSES/ULTRAJSON_LICENSE b/LICENSES/ULTRAJSON_LICENSE
new file mode 100644
index 0000000000000..defca46e7f820
--- /dev/null
+++ b/LICENSES/ULTRAJSON_LICENSE
@@ -0,0 +1,34 @@
+Copyright (c) 2011-2013, ESN Social Software AB and Jonas Tarnstrom
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of the ESN Social Software AB nor the
+      names of its contributors may be used to endorse or promote products
+      derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL ESN SOCIAL SOFTWARE AB OR JONAS TARNSTROM BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+Portions of code from MODP_ASCII - Ascii transformations (upper/lower, etc)
+http://code.google.com/p/stringencoders/
+Copyright (c) 2007 Nick Galbreath -- nickg [at] modp [dot] com. All rights reserved.
+
+Numeric decoder derived from TCL library
+http://www.opensource.apple.com/source/tcl/tcl-14/tcl/license.terms
+ * Copyright (c) 1988-1993 The Regents of the University of California.
+ * Copyright (c) 1994 Sun Microsystems, Inc.
\ No newline at end of file

From e7f330df4713407670ff16de6f05ec58c3ca1465 Mon Sep 17 00:00:00 2001
From: Wes McKinney
Date: Sun, 12 May 2013 11:13:13 -0700
Subject: [PATCH 3/3] TST: json manip test script. and trigger travis

---
 scripts/json_manip.py | 421 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 421 insertions(+)
 create mode 100644 scripts/json_manip.py

diff --git a/scripts/json_manip.py b/scripts/json_manip.py
new file mode 100644
index 0000000000000..e76a99cca344a
--- /dev/null
+++ b/scripts/json_manip.py
@@ -0,0 +1,421 @@
+"""
+
+Tasks
+-------
+
+Search and transform jsonable structures, specifically to make it 'easy' to
+produce tabular/csv output for other consumers.
+
+Example
+~~~~~~~~~~~~~
+
+    *give me a list of all the fields called 'id' in this stupid, gnarly
+     thing*
+
+    >>> Q('id', gnarly_data)
+    ['id1', 'id2', 'id3']
+
+
+Observations:
+---------------------
+
+1)  'simple data structures' exist and are common.  They are tedious
+    to search.
+
+2)  The DOM is another nested/treeish structure, and jQuery selectors are
+    a good tool for that.
+
+3a) R, Numpy, Excel and other analysis tools want 'tabular' data.  These
+    analyses are valuable and worth doing.
+
+3b) Dot/Graphviz, NetworkX, and some other analyses *like* treeish/dicty
+    things, and those analyses are also worth doing!
+
+3c) Some analyses are best done using 'one-off' and custom code in C, Python,
+    or another 'real' programming language.
+
+4)  Arbitrary transforms are tedious and error prone.  SQL is one solution,
+    XSLT is another.
+
+5)  The XPATH/XML/XSLT family is... not universally loved :)  They are
+    very complete, and the completeness can make simple cases... gross.
+
+6)  For really complicated data structures, we can write one-off code.
+    Getting 80% of the way is mostly okay.  There will always have to be
+    programmers in the loop.
+
+7)  Re-inventing SQL is probably a failure mode.  So is reinventing XPATH,
+    XSLT and the like.  Be wary of mission creep!  Re-use when possible
+    (e.g., can we put the thing into a DOM and re-use those tools?).
+
+8)  If the interface is good, people can improve performance later.
+
+
+Simplifying
+---------------
+
+1) Assume 'jsonable' structures.
+
+2) Keys are strings or stringlike.  Python allows any hashable to be a key;
+   for now, we pretend that doesn't happen.
+
+3) Assume most dicts are 'well behaved': DAGs, no cycles!
+
+4) Assume that if people want really specialized transforms, they can write
+   them themselves.
+
+"""
+
+from collections import Counter, namedtuple
+import csv
+import itertools
+import operator
+import sys
+
+
+## note 'url' appears multiple places and not all extensions have same struct
+ex1 = {
+    'name': 'Gregg',
+    'extensions': [
+        {'id': 'hello',
+         'url': 'url1'},
+        {'id': 'gbye',
+         'url': 'url2',
+         'more': dict(url='url3')},
+    ]
+}
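+
+## e.g. (illustrative sketch; the first result is asserted in test_run()
+## below, the second follows from the 'parent child' descendant rule):
+##   Ql('url', ex1)      -> ['url1', 'url2', 'url3']
+##   Ql('more url', ex1) -> ['url3']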
+
+## much longer example
+ex2 = {u'metadata': {u'accessibilities': [{u'name': u'accessibility.tabfocus',
+                                           u'value': 7},
+    {u'name': u'accessibility.mouse_focuses_formcontrol', u'value': False},
+    {u'name': u'accessibility.browsewithcaret', u'value': False},
+    {u'name': u'accessibility.win32.force_disabled', u'value': False},
+    {u'name': u'accessibility.typeaheadfind.startlinksonly', u'value': False},
+    {u'name': u'accessibility.usebrailledisplay', u'value': u''},
+    {u'name': u'accessibility.typeaheadfind.timeout', u'value': 5000},
+    {u'name': u'accessibility.typeaheadfind.enabletimeout', u'value': True},
+    {u'name': u'accessibility.tabfocus_applies_to_xul', u'value': False},
+    {u'name': u'accessibility.typeaheadfind.flashBar', u'value': 1},
+    {u'name': u'accessibility.typeaheadfind.autostart', u'value': True},
+    {u'name': u'accessibility.blockautorefresh', u'value': False},
+    {u'name': u'accessibility.browsewithcaret_shortcut.enabled',
+     u'value': True},
+    {u'name': u'accessibility.typeaheadfind.enablesound', u'value': True},
+    {u'name': u'accessibility.typeaheadfind.prefillwithselection',
+     u'value': True},
+    {u'name': u'accessibility.typeaheadfind.soundURL', u'value': u'beep'},
+    {u'name': u'accessibility.typeaheadfind', u'value': False},
+    {u'name': u'accessibility.typeaheadfind.casesensitive', u'value': 0},
+    {u'name': u'accessibility.warn_on_browsewithcaret', u'value': True},
+    {u'name': u'accessibility.usetexttospeech', u'value': u''},
+    {u'name': u'accessibility.accesskeycausesactivation', u'value': True},
+    {u'name': u'accessibility.typeaheadfind.linksonly', u'value': False},
+    {u'name': u'isInstantiated', u'value': True}],
+    u'extensions': [{u'id': u'216ee7f7f4a5b8175374cd62150664efe2433a31',
+                     u'isEnabled': True},
+    {u'id': u'1aa53d3b720800c43c4ced5740a6e82bb0b3813e', u'isEnabled': False},
+    {u'id': u'01ecfac5a7bd8c9e27b7c5499e71c2d285084b37', u'isEnabled': True},
+    {u'id': u'1c01f5b22371b70b312ace94785f7b0b87c3dfb2', u'isEnabled': True},
+    {u'id': u'fb723781a2385055f7d024788b75e959ad8ea8c3', u'isEnabled': True}],
+    u'fxVersion': u'9.0',
+    u'location': u'zh-CN',
+    u'operatingSystem': u'WINNT Windows NT 5.1',
+    u'surveyAnswers': u'',
+    u'task_guid': u'd69fbd15-2517-45b5-8a17-bb7354122a75',
+    u'tpVersion': u'1.2',
+    u'updateChannel': u'beta'},
+    u'survey_data': {
+        u'extensions': [{u'appDisabled': False,
+                         u'id': u'testpilot@labs.mozilla.com',
+                         u'isCompatible': True,
+                         u'isEnabled': True,
+                         u'isPlatformCompatible': True,
+                         u'name': u'Test Pilot'},
+                        {u'appDisabled': True,
+                         u'id': u'dict@www.youdao.com',
+                         u'isCompatible': False,
+                         u'isEnabled': False,
+                         u'isPlatformCompatible': True,
+                         u'name': u'Youdao Word Capturer'},
+                        {u'appDisabled': False,
+                         u'id': u'jqs@sun.com',
+                         u'isCompatible': True,
+                         u'isEnabled': True,
+                         u'isPlatformCompatible': True,
+                         u'name': u'Java Quick Starter'},
+                        {u'appDisabled': False,
+                         u'id': u'{20a82645-c095-46ed-80e3-08825760534b}',
+                         u'isCompatible': True,
+                         u'isEnabled': True,
+                         u'isPlatformCompatible': True,
+                         u'name': u'Microsoft .NET Framework Assistant'},
+                        {u'appDisabled': False,
+                         u'id': u'{a0d7ccb3-214d-498b-b4aa-0e8fda9a7bf7}',
+                         u'isCompatible': True,
+                         u'isEnabled': True,
+                         u'isPlatformCompatible': True,
+                         u'name': u'WOT'}],
+        u'version_number': 1}}
+
+# class SurveyResult(object):
+
+#     def __init__(self, record):
+#         self.record = record
+#         self.metadata, self.survey_data = self._flatten_results()
+
+#     def _flatten_results(self):
+#         survey_data = self.record['survey_data']
+#         extensions = DataFrame(survey_data['extensions'])
+
+
+def denorm(queries, iterable_of_things, default=None):
+    """
+    'repeat', or 'stutter' to 'tableize' for downstream.
+    (I have no idea what a good word for this is!)
+
+    Think ``Kronecker`` products, or:
+
+    ``SELECT single,multiple FROM table;``
+
+    single   multiple
+    -------  ---------
+    id1      val1
+    id1      val2
+
+
+    Args:
+
+        queries:  iterable of ``Q`` queries.
+        iterable_of_things:  to be queried.
+
+    Returns:
+
+        list of 'stuttered' output, where if a query returns
+        a 'single', it gets repeated appropriately.
+    """
+
+    def _denorm(queries, thing):
+        fields = []
+        results = []
+        for q in queries:
+            r = Ql(q, thing)
+            if not r:
+                r = [default]
+            if type(r[0]) is type({}):
+                fields.append(sorted(r[0].keys()))  # dicty answers
+            else:
+                fields.append([q])  # stringy answer
+
+            results.append(r)
+
+        # one output row per element of the cross product of the results;
+        # this is what 'stutters' the singles against the multiples
+        prod = itertools.product(*results)
+        for p in prod:
+            U = dict()
+            for ii, val in enumerate(p):
+                if type(val) is type({}):
+                    U.update(val)
+                else:
+                    U[fields[ii][0]] = val
+
+            yield U
+
+    return list(flatten(*[_denorm(queries, thing)
+                          for thing in iterable_of_things]))
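+
+## e.g. (illustrative sketch; ex1 has one 'name' but two 'id's, so the
+## 'name' value is stuttered -- key order within each dict may vary):
+##   denorm(['name', 'id'], [ex1])
+##   -> [{'name': 'Gregg', 'id': 'hello'}, {'name': 'Gregg', 'id': 'gbye'}]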
+ """ + stack = list(stack) + while stack: + try: x = stack[0].next() + except StopIteration: + stack.pop(0) + continue + if hasattr(x,'next') and callable(getattr(x,'next')): + stack.insert(0, x) + + #if isinstance(x, (GeneratorType,listerator)): + else: yield x + + +def _Q(filter_, thing): + """ underlying machinery for Q function recursion """ + T = type(thing) + if T is type({}): + for k,v in thing.iteritems(): + #print k,v + if filter_ == k: + if type(v) is type([]): + yield iter(v) + else: + yield v + + if type(v) in (type({}),type([])): + yield Q(filter_,v) + + elif T is type([]): + for k in thing: + #print k + yield Q(filter_,k) + + else: + # no recursion. + pass + +def Q(filter_,thing): + """ + type(filter): + - list: a flattened list of all searches (one list) + - dict: dict with vals each of which is that search + + Notes: + + [1] 'parent thing', with space, will do a descendent + [2] this will come back 'flattened' jQuery style + [3] returns a generator. Use ``Ql`` if you want a list. + + """ + if type(filter_) is type([]): + return flatten(*[_Q(x,thing) for x in filter_]) + elif type(filter_) is type({}): + d = dict.fromkeys(filter_.keys()) + #print d + for k in d: + #print flatten(Q(k,thing)) + d[k] = Q(k,thing) + + return d + + else: + if " " in filter_: # i.e. "antecendent post" + parts = filter_.strip().split() + r = None + for p in parts: + r = Ql(p,thing) + thing = r + + return r + + else: # simple. + return flatten(_Q(filter_,thing)) + +def Ql(filter_,thing): + """ same as Q, but returns a list, not a generator """ + res = Q(filter_,thing) + + if type(filter_) is type({}): + for k in res: + res[k] = list(res[k]) + return res + + else: + return list(res) + + + +def countit(fields,iter_of_iter,default=None): + """ + note: robust to fields not being in i_of_i, using ``default`` + """ + C = Counter() # needs hashables + T = namedtuple("Thing",fields) + get = default_iget(*fields,default=default) + return Counter( + (T(*get(thing)) for thing in iter_of_iter) + ) + + +## right now this works for one row... +def printout(queries,things,default=None, f=sys.stdout, **kwargs): + """ will print header and objects + + **kwargs go to csv.DictWriter + + help(csv.DictWriter) for more. + """ + + results = denorm(queries,things,default=None) + fields = set(itertools.chain(*(x.keys() for x in results))) + + W = csv.DictWriter(f=f,fieldnames=fields,**kwargs) + #print "---prod---" + #print list(prod) + W.writeheader() + for r in results: + W.writerow(r) + + +def test_run(): + print "\n>>> print list(Q('url',ex1))" + print list(Q('url',ex1)) + assert list(Q('url',ex1)) == ['url1','url2','url3'] + assert Ql('url',ex1) == ['url1','url2','url3'] + + print "\n>>> print list(Q(['name','id'],ex1))" + print list(Q(['name','id'],ex1)) + assert Ql(['name','id'],ex1) == ['Gregg','hello','gbye'] + + + print "\n>>> print Ql('more url',ex1)" + print Ql('more url',ex1) + + + print "\n>>> list(Q('extensions',ex1))" + print list(Q('extensions',ex1)) + + print "\n>>> print Ql('extensions',ex1)" + print Ql('extensions',ex1) + + print "\n>>> printout(['name','extensions'],[ex1,], extrasaction='ignore')" + printout(['name','extensions'],[ex1,], extrasaction='ignore') + + print "\n\n" + + from pprint import pprint as pp + + print "-- note that the extension fields are also flattened! (and N/A) -- " + pp(denorm(['location','fxVersion','notthere','survey_data extensions'],[ex2,], default="N/A")[:2]) + + +if __name__ == "__main__": + pass