From 510d350f74bd518c70a005f83efe2d0ede4ea6e7 Mon Sep 17 00:00:00 2001 From: gliptak Date: Sat, 8 Jun 2013 17:54:50 -0300 Subject: [PATCH 01/71] correct FRED test (GDP changed ...) --- pandas/io/tests/test_fred.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/tests/test_fred.py b/pandas/io/tests/test_fred.py index 3e951e5443bc3..00a90ec3da402 100644 --- a/pandas/io/tests/test_fred.py +++ b/pandas/io/tests/test_fred.py @@ -29,7 +29,7 @@ def test_fred(self): try: self.assertEquals( web.DataReader("GDP", "fred", start, end)['GDP'].tail(1), - 16010.2) + 16004.5) self.assertRaises( Exception, From 877d8bb5300d9a995342b7affe721df4af15907b Mon Sep 17 00:00:00 2001 From: gliptak Date: Sat, 8 Jun 2013 18:39:51 -0300 Subject: [PATCH 02/71] remove unused import in test_yahoo --- pandas/io/tests/test_yahoo.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/pandas/io/tests/test_yahoo.py b/pandas/io/tests/test_yahoo.py index 1109d67278f73..b79fdad2bff9d 100644 --- a/pandas/io/tests/test_yahoo.py +++ b/pandas/io/tests/test_yahoo.py @@ -2,8 +2,6 @@ import nose from datetime import datetime -from pandas.util.py3compat import StringIO, BytesIO - import pandas as pd import pandas.io.data as web from pandas.util.testing import (network, assert_frame_equal, From 8196db95daa658737af929f68ec7cd45b826aa02 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=A1bor=20Lipt=C3=A1k?= Date: Sat, 8 Jun 2013 17:41:24 -0400 Subject: [PATCH 03/71] Use google finance as datasource (test only, still pointing to yahoo finance) --- pandas/io/data.py | 178 +++++++++++++++++++++++++++++++++ pandas/io/tests/test_google.py | 95 ++++++++++++++++++ 2 files changed, 273 insertions(+) create mode 100644 pandas/io/tests/test_google.py diff --git a/pandas/io/data.py b/pandas/io/data.py index 43178fdcfddf1..f2b539fc795a7 100644 --- a/pandas/io/data.py +++ b/pandas/io/data.py @@ -58,6 +58,10 @@ def DataReader(name, data_source=None, start=None, end=None, return get_data_yahoo(symbols=name, start=start, end=end, adjust_price=False, chunk=25, retry_count=retry_count, pause=pause) + elif(data_source == "google"): + return get_data_google(symbols=name, start=start, end=end, + adjust_price=False, chunk=25, + retry_count=retry_count, pause=pause) elif(data_source == "fred"): return get_data_fred(name=name, start=start, end=end) elif(data_source == "famafrench"): @@ -132,6 +136,56 @@ def get_quote_yahoo(symbols): return DataFrame(data, index=idx) +def get_quote_google(symbols): + """ + Get current yahoo quote + + Returns a DataFrame + """ + if isinstance(symbols, str): + sym_list = symbols + elif not isinstance(symbols, Series): + symbols = Series(symbols) + sym_list = str.join('+', symbols) + else: + sym_list = str.join('+', symbols) + + # for codes see: http://www.gummy-stuff.org/Yahoo-data.htm + codes = {'symbol': 's', 'last': 'l1', 'change_pct': 'p2', 'PE': 'r', + 'time': 't1', 'short_ratio': 's7'} + request = str.join('', codes.values()) # code request string + header = codes.keys() + + data = dict(zip(codes.keys(), [[] for i in range(len(codes))])) + + urlStr = 'http://finance.yahoo.com/d/quotes.csv?s=%s&f=%s' % ( + sym_list, request) + + try: + lines = urllib2.urlopen(urlStr).readlines() + except Exception, e: + s = "Failed to download:\n{0}".format(e) + print s + return None + + for line in lines: + fields = line.decode('utf-8').strip().split(',') + for i, field in enumerate(fields): + if field[-2:] == '%"': + data[header[i]].append(float(field.strip('"%'))) + elif field[0] == '"': + 
data[header[i]].append(field.strip('"')) + else: + try: + data[header[i]].append(float(field)) + except ValueError: + data[header[i]].append(np.nan) + + idx = data.pop('symbol') + + return DataFrame(data, index=idx) + + def _get_hist_yahoo(sym=None, start=None, end=None, retry_count=3, pause=0, **kwargs): """ @@ -178,6 +232,52 @@ def _get_hist_yahoo(sym=None, start=None, end=None, retry_count=3, "return a 200 for url %s" % (pause, url)) +def _get_hist_google(sym=None, start=None, end=None, retry_count=3, + pause=0, **kwargs): + """ + Get historical data for the given name from yahoo. + Date format is datetime + + Returns a DataFrame. + """ + if(sym is None): + warnings.warn("Need to provide a name.") + return None + + start, end = _sanitize_dates(start, end) + + yahoo_URL = 'http://ichart.yahoo.com/table.csv?' + + url = yahoo_URL + 's=%s' % sym + \ + '&a=%s' % (start.month - 1) + \ + '&b=%s' % start.day + \ + '&c=%s' % start.year + \ + '&d=%s' % (end.month - 1) + \ + '&e=%s' % end.day + \ + '&f=%s' % end.year + \ + '&g=d' + \ + '&ignore=.csv' + + for _ in range(retry_count): + resp = urllib2.urlopen(url) + if resp.code == 200: + lines = resp.read() + rs = read_csv(StringIO(bytes_to_str(lines)), index_col=0, + parse_dates=True)[::-1] + + # Yahoo! Finance sometimes does this awesome thing where they + # return 2 rows for the most recent business day + if len(rs) > 2 and rs.index[-1] == rs.index[-2]: # pragma: no cover + rs = rs[:-1] + + return rs + + time.sleep(pause) + + raise Exception("after %d tries, Yahoo did not " + "return a 200 for url %s" % (pause, url)) + + def _adjust_prices(hist_data, price_list=['Open', 'High', 'Low', 'Close']): """ Return modifed DataFrame or Panel with adjusted prices based on @@ -347,6 +447,84 @@ def dl_mult_symbols(symbols): return hist_data +def get_data_google(symbols=None, start=None, end=None, retry_count=3, pause=0, + adjust_price=False, ret_index=False, chunksize=25, + **kwargs): + """ + Returns DataFrame/Panel of historical stock prices from symbols, over date + range, start to end. To avoid being penalized by Yahoo! Finance servers, + pauses between downloading 'chunks' of symbols can be specified. + + Parameters + ---------- + symbols : string, array-like object (list, tuple, Series), or DataFrame + Single stock symbol (ticker), array-like object of symbols or + DataFrame with index containing stock symbols. + start : string, (defaults to '1/1/2010') + Starting date, timestamp. Parses many different kind of date + representations (e.g., 'JAN-01-2010', '1/1/10', 'Jan, 1, 1980') + end : string, (defaults to today) + Ending date, timestamp. Same format as starting date. + retry_count : int, default 3 + Number of times to retry query request. + pause : int, default 0 + Time, in seconds, to pause between consecutive queries of chunks. If + single value given for symbol, represents the pause between retries. + adjust_price : bool, default False + If True, adjusts all prices in hist_data ('Open', 'High', 'Low', 'Close') + based on 'Adj Close' price. Adds 'Adj_Ratio' column and drops + 'Adj Close'. + ret_index : bool, default False + If True, includes a simple return index 'Ret_Index' in hist_data. + chunksize : int, default 25 + Number of symbols to download consecutively before intiating pause. 
+ + Returns + ------- + hist_data : DataFrame (str) or Panel (array-like object, DataFrame) + """ + + def dl_mult_symbols(symbols): + stocks = {} + for sym_group in _in_chunks(symbols, chunksize): + for sym in sym_group: + try: + stocks[sym] = _get_hist_google(sym, start=start, + end=end, **kwargs) + except: + warnings.warn('Error with sym: ' + sym + '... skipping.') + + time.sleep(pause) + + return Panel(stocks).swapaxes('items', 'minor') + + if 'name' in kwargs: + warnings.warn("Arg 'name' is deprecated, please use 'symbols' instead.", + FutureWarning) + symbols = kwargs['name'] + + #If a single symbol, (e.g., 'GOOG') + if isinstance(symbols, (str, int)): + sym = symbols + hist_data = _get_hist_google(sym, start=start, end=end) + #Or multiple symbols, (e.g., ['GOOG', 'AAPL', 'MSFT']) + elif isinstance(symbols, DataFrame): + try: + hist_data = dl_mult_symbols(Series(symbols.index)) + except ValueError: + raise + else: #Guess a Series + try: + hist_data = dl_mult_symbols(symbols) + except TypeError: + hist_data = dl_mult_symbols(Series(symbols)) + + if(ret_index): + hist_data['Ret_Index'] = _calc_return_index(hist_data['Adj Close']) + if(adjust_price): + hist_data = _adjust_prices(hist_data) + + return hist_data def get_data_fred(name=None, start=dt.datetime(2010, 1, 1), end=dt.datetime.today()): diff --git a/pandas/io/tests/test_google.py b/pandas/io/tests/test_google.py new file mode 100644 index 0000000000000..9c3e81485f34d --- /dev/null +++ b/pandas/io/tests/test_google.py @@ -0,0 +1,95 @@ +import unittest +import nose +from datetime import datetime + +import pandas as pd +import pandas.io.data as web +from pandas.util.testing import (network, assert_frame_equal, + assert_series_equal, + assert_almost_equal) +from numpy.testing.decorators import slow + +import urllib2 + + +class TestGoogle(unittest.TestCase): + + @slow + @network + def test_google(self): + # asserts that google is minimally working and that it throws + # an excecption when DataReader can't get a 200 response from + # google + start = datetime(2010, 1, 1) + end = datetime(2013, 01, 27) + + try: + self.assertEquals( + web.DataReader("F", 'google', start, end)['Close'][-1], + 13.68) + + self.assertRaises( + Exception, + lambda: web.DataReader("NON EXISTENT TICKER", 'google', + start, end)) + except urllib2.URLError: + try: + urllib2.urlopen('http://www.google.com') + except urllib2.URLError: + raise nose.SkipTest + else: + raise + + + @slow + @network + def test_get_quote(self): + df = web.get_quote_google(pd.Series(['GOOG', 'AAPL', 'GOOG'])) + assert_series_equal(df.ix[0], df.ix[2]) + + + @slow + @network + def test_get_data(self): + import numpy as np + df = web.get_data_google('GOOG') + assert df.Volume.ix['OCT-08-2010'] == 2859200 + + sl = ['AAPL', 'AMZN', 'GOOG'] + pan = web.get_data_google(sl, '2012') + ts = pan.Close.GOOG.index[pan.Close.AAPL > pan.Close.GOOG] + assert ts[0].dayofyear == 96 + + pan = web.get_data_google(['GE', 'MSFT', 'INTC'], 'JAN-01-12', 'JAN-31-12') + expected = [19.02, 28.23, 25.39] + result = pan.Close.ix['01-18-12'][['GE', 'MSFT', 'INTC']].tolist() + assert result == expected + + # sanity checking + t= np.array(result) + assert np.issubdtype(t.dtype, np.floating) + assert t.shape == (3,) + + expected = [[ 18.99, 28.4 , 25.18], + [ 18.58, 28.31, 25.13], + [ 19.03, 28.16, 25.52], + [ 18.81, 28.82, 25.87]] + result = pan.Open.ix['Jan-15-12':'Jan-20-12'][['GE', 'MSFT', 'INTC']].values + assert (result == expected).all() + + #Check ret_index + pan = web.get_data_google(['GE', 'INTC', 'IBM'], 
'1977', '1987', + ret_index=True) + tstamp = pan.Ret_Index.INTC.first_valid_index() + result = pan.Ret_Index.ix[tstamp]['INTC'] + expected = 1.0 + assert result == expected + + # sanity checking + t= np.array(pan) + assert np.issubdtype(t.dtype, np.floating) + + +if __name__ == '__main__': + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], + exit=False) From ad89365c3870f192dde01568b584bec8b7ee1086 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=A1bor=20Lipt=C3=A1k?= Date: Sat, 8 Jun 2013 19:00:30 -0400 Subject: [PATCH 04/71] Remove unneeded import from test_google --- pandas/io/tests/test_google.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pandas/io/tests/test_google.py b/pandas/io/tests/test_google.py index 9c3e81485f34d..5b5fdd59e4b55 100644 --- a/pandas/io/tests/test_google.py +++ b/pandas/io/tests/test_google.py @@ -4,9 +4,7 @@ import pandas as pd import pandas.io.data as web -from pandas.util.testing import (network, assert_frame_equal, - assert_series_equal, - assert_almost_equal) +from pandas.util.testing import (network, assert_series_equal) from numpy.testing.decorators import slow import urllib2 From ee10caaaa30a81fc0e72ff53ca85f0937099b837 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=A1bor=20Lipt=C3=A1k?= Date: Sat, 8 Jun 2013 19:13:13 -0400 Subject: [PATCH 05/71] Implement _get_hist_google --- pandas/io/data.py | 39 +++++++--------------------------- pandas/io/tests/test_google.py | 11 ++-------- 2 files changed, 10 insertions(+), 40 deletions(-) diff --git a/pandas/io/data.py b/pandas/io/data.py index f2b539fc795a7..d178d0089e6d6 100644 --- a/pandas/io/data.py +++ b/pandas/io/data.py @@ -246,18 +246,12 @@ def _get_hist_google(sym=None, start=None, end=None, retry_count=3, start, end = _sanitize_dates(start, end) - yahoo_URL = 'http://ichart.yahoo.com/table.csv?' - - url = yahoo_URL + 's=%s' % sym + \ - '&a=%s' % (start.month - 1) + \ - '&b=%s' % start.day + \ - '&c=%s' % start.year + \ - '&d=%s' % (end.month - 1) + \ - '&e=%s' % end.day + \ - '&f=%s' % end.year + \ - '&g=d' + \ - '&ignore=.csv' + google_URL = 'http://www.google.com/finance/historical?' + # www.google.com/finance/historical?q=GOOG&startdate=Jun+9%2C+2011&enddate=Jun+8%2C+2013&output=csv + url = google_URL + urllib.urlencode({"q": sym, \ + "startdate": start.strftime('%b %d, %Y'), \ + "enddate": end.strftime('%b %d, %Y'), "output": "csv" }) for _ in range(retry_count): resp = urllib2.urlopen(url) if resp.code == 200: @@ -265,16 +259,11 @@ def _get_hist_google(sym=None, start=None, end=None, retry_count=3, rs = read_csv(StringIO(bytes_to_str(lines)), index_col=0, parse_dates=True)[::-1] - # Yahoo! Finance sometimes does this awesome thing where they - # return 2 rows for the most recent business day - if len(rs) > 2 and rs.index[-1] == rs.index[-2]: # pragma: no cover - rs = rs[:-1] - return rs time.sleep(pause) - raise Exception("after %d tries, Yahoo did not " + raise Exception("after %d tries, Google did not " "return a 200 for url %s" % (pause, url)) @@ -448,11 +437,10 @@ def dl_mult_symbols(symbols): return hist_data def get_data_google(symbols=None, start=None, end=None, retry_count=3, pause=0, - adjust_price=False, ret_index=False, chunksize=25, - **kwargs): + chunksize=25, **kwargs): """ Returns DataFrame/Panel of historical stock prices from symbols, over date - range, start to end. To avoid being penalized by Yahoo! Finance servers, + range, start to end. 
To avoid being penalized by Google Finance servers, pauses between downloading 'chunks' of symbols can be specified. Parameters @@ -470,12 +458,6 @@ def get_data_google(symbols=None, start=None, end=None, retry_count=3, pause=0, pause : int, default 0 Time, in seconds, to pause between consecutive queries of chunks. If single value given for symbol, represents the pause between retries. - adjust_price : bool, default False - If True, adjusts all prices in hist_data ('Open', 'High', 'Low', 'Close') - based on 'Adj Close' price. Adds 'Adj_Ratio' column and drops - 'Adj Close'. - ret_index : bool, default False - If True, includes a simple return index 'Ret_Index' in hist_data. chunksize : int, default 25 Number of symbols to download consecutively before intiating pause. @@ -519,11 +501,6 @@ def dl_mult_symbols(symbols): except TypeError: hist_data = dl_mult_symbols(Series(symbols)) - if(ret_index): - hist_data['Ret_Index'] = _calc_return_index(hist_data['Adj Close']) - if(adjust_price): - hist_data = _adjust_prices(hist_data) - return hist_data def get_data_fred(name=None, start=dt.datetime(2010, 1, 1), diff --git a/pandas/io/tests/test_google.py b/pandas/io/tests/test_google.py index 5b5fdd59e4b55..01868a70c3709 100644 --- a/pandas/io/tests/test_google.py +++ b/pandas/io/tests/test_google.py @@ -51,7 +51,8 @@ def test_get_quote(self): def test_get_data(self): import numpy as np df = web.get_data_google('GOOG') - assert df.Volume.ix['OCT-08-2010'] == 2859200 + print(df.Volume.ix['OCT-08-2010']) + assert df.Volume.ix['OCT-08-2010'] == 2863473 sl = ['AAPL', 'AMZN', 'GOOG'] pan = web.get_data_google(sl, '2012') @@ -75,14 +76,6 @@ def test_get_data(self): result = pan.Open.ix['Jan-15-12':'Jan-20-12'][['GE', 'MSFT', 'INTC']].values assert (result == expected).all() - #Check ret_index - pan = web.get_data_google(['GE', 'INTC', 'IBM'], '1977', '1987', - ret_index=True) - tstamp = pan.Ret_Index.INTC.first_valid_index() - result = pan.Ret_Index.ix[tstamp]['INTC'] - expected = 1.0 - assert result == expected - # sanity checking t= np.array(pan) assert np.issubdtype(t.dtype, np.floating) From f43d24540a09cc2855569c6e8811669759cc065a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=A1bor=20Lipt=C3=A1k?= Date: Sat, 8 Jun 2013 19:20:23 -0400 Subject: [PATCH 06/71] No current finance data from Google --- pandas/io/data.py | 49 +--------------------------------- pandas/io/tests/test_google.py | 5 ++-- 2 files changed, 3 insertions(+), 51 deletions(-) diff --git a/pandas/io/data.py b/pandas/io/data.py index d178d0089e6d6..13551272edae2 100644 --- a/pandas/io/data.py +++ b/pandas/io/data.py @@ -137,54 +137,7 @@ def get_quote_yahoo(symbols): def get_quote_google(symbols): - """ - Get current yahoo quote - - Returns a DataFrame - """ - if isinstance(symbols, str): - sym_list = symbols - elif not isinstance(symbols, Series): - symbols = Series(symbols) - sym_list = str.join('+', symbols) - else: - sym_list = str.join('+', symbols) - - # for codes see: http://www.gummy-stuff.org/Yahoo-data.htm - codes = {'symbol': 's', 'last': 'l1', 'change_pct': 'p2', 'PE': 'r', - 'time': 't1', 'short_ratio': 's7'} - request = str.join('', codes.values()) # code request string - header = codes.keys() - - data = dict(zip(codes.keys(), [[] for i in range(len(codes))])) - - urlStr = 'http://finance.yahoo.com/d/quotes.csv?s=%s&f=%s' % ( - sym_list, request) - - try: - lines = urllib2.urlopen(urlStr).readlines() - except Exception, e: - s = "Failed to download:\n{0}".format(e) - print s - return None - - for line in lines: - fields = 
line.decode('utf-8').strip().split(',') - for i, field in enumerate(fields): - if field[-2:] == '%"': - data[header[i]].append(float(field.strip('"%'))) - elif field[0] == '"': - data[header[i]].append(field.strip('"')) - else: - try: - data[header[i]].append(float(field)) - except ValueError: - data[header[i]].append(np.nan) - - idx = data.pop('symbol') - - return DataFrame(data, index=idx) - + raise NotImplementedError("Google Finance doesn't have this functionality") def _get_hist_yahoo(sym=None, start=None, end=None, retry_count=3, pause=0, **kwargs): diff --git a/pandas/io/tests/test_google.py b/pandas/io/tests/test_google.py index 01868a70c3709..9db7964c1acfe 100644 --- a/pandas/io/tests/test_google.py +++ b/pandas/io/tests/test_google.py @@ -42,9 +42,8 @@ def test_google(self): @slow @network def test_get_quote(self): - df = web.get_quote_google(pd.Series(['GOOG', 'AAPL', 'GOOG'])) - assert_series_equal(df.ix[0], df.ix[2]) - + self.assertRaises(NotImplementedError, + lambda: web.get_quote_google(pd.Series(['GOOG', 'AAPL', 'GOOG']))) @slow @network From 9403b11695f695991ddb061faf297f46032b4fb6 Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Sat, 8 Jun 2013 19:35:32 -0400 Subject: [PATCH 07/71] DOC: filter the correct warning for unique(key, column) DOC/CLN: clean up indentation and dispatch on numpy version --- doc/source/timeseries.rst | 100 ++++++++++++++++++++++---------------- doc/source/v0.10.1.txt | 2 +- 2 files changed, 58 insertions(+), 44 deletions(-) diff --git a/doc/source/timeseries.rst b/doc/source/timeseries.rst index 3f6a4b7c59067..7f572c8c8e191 100644 --- a/doc/source/timeseries.rst +++ b/doc/source/timeseries.rst @@ -930,89 +930,103 @@ They can be both positive and negative. .. ipython:: python - from datetime import datetime, timedelta - s = Series(date_range('2012-1-1', periods=3, freq='D')) - td = Series([ timedelta(days=i) for i in range(3) ]) - df = DataFrame(dict(A = s, B = td)) - df - df['C'] = df['A'] + df['B'] - df - df.dtypes - - s - s.max() - s - datetime(2011,1,1,3,5) - s + timedelta(minutes=5) + from datetime import datetime, timedelta + s = Series(date_range('2012-1-1', periods=3, freq='D')) + td = Series([ timedelta(days=i) for i in range(3) ]) + df = DataFrame(dict(A = s, B = td)) + df + df['C'] = df['A'] + df['B'] + df + df.dtypes + + s - s.max() + s - datetime(2011,1,1,3,5) + s + timedelta(minutes=5) Getting scalar results from a ``timedelta64[ns]`` series +.. ipython:: python + :suppress: + + from distutils.version import LooseVersion + .. ipython:: python y = s - s[0] y - y.apply(lambda x: x.item().total_seconds()) - y.apply(lambda x: x.item().days) - -.. note:: - These operations are different in numpy 1.6.2 and in numpy >= 1.7. The ``timedelta64[ns]`` scalar - type in 1.6.2 is much like a ``datetime.timedelta``, while in 1.7 it is a nanosecond based integer. - A future version of pandas will make this transparent. + if LooseVersion(np.__version__) <= '1.6.2': + y.apply(lambda x: x.item().total_seconds()) + y.apply(lambda x: x.item().days) + else: + y.apply(lambda x: x / np.timedelta64(1, 's')) + y.apply(lambda x: x / np.timedelta64(1, 'D')) + +.. note:: - These are the equivalent operation to above in numpy >= 1.7 + As you can see from the conditional statement above, these operations are + different in numpy 1.6.2 and in numpy >= 1.7. The ``timedelta64[ns]`` scalar + type in 1.6.2 is much like a ``datetime.timedelta``, while in 1.7 it is a + nanosecond based integer. A future version of pandas will make this + transparent. 
- ``y.apply(lambda x: x.item()/np.timedelta64(1,'s'))`` +.. note:: - ``y.apply(lambda x: x.item()/np.timedelta64(1,'D'))`` + In numpy >= 1.7 dividing a ``timedelta64`` array by another ``timedelta64`` + array will yield an array with dtype ``np.float64``. Series of timedeltas with ``NaT`` values are supported .. ipython:: python - y = s - s.shift() - y + y = s - s.shift() + y + The can be set to ``NaT`` using ``np.nan`` analagously to datetimes .. ipython:: python - y[1] = np.nan - y + y[1] = np.nan + y Operands can also appear in a reversed order (a singluar object operated with a Series) .. ipython:: python - s.max() - s - datetime(2011,1,1,3,5) - s - timedelta(minutes=5) + s + s.max() - s + datetime(2011,1,1,3,5) - s + timedelta(minutes=5) + s Some timedelta numeric like operations are supported. .. ipython:: python - td - timedelta(minutes=5,seconds=5,microseconds=5) + td - timedelta(minutes=5, seconds=5, microseconds=5) ``min, max`` and the corresponding ``idxmin, idxmax`` operations are support on frames .. ipython:: python - df = DataFrame(dict(A = s - Timestamp('20120101')-timedelta(minutes=5,seconds=5), - B = s - Series(date_range('2012-1-2', periods=3, freq='D')))) - df + A = s - Timestamp('20120101') - timedelta(minutes=5, seconds=5) + B = s - Series(date_range('2012-1-2', periods=3, freq='D')) + df = DataFrame(dict(A=A, B=B)) + df - df.min() - df.min(axis=1) + df.min() + df.min(axis=1) - df.idxmin() - df.idxmax() + df.idxmin() + df.idxmax() -``min, max`` operations are support on series, these return a single element ``timedelta64[ns]`` Series (this avoids -having to deal with numpy timedelta64 issues). ``idxmin, idxmax`` are supported as well. +``min, max`` operations are support on series, these return a single element +``timedelta64[ns]`` Series (this avoids having to deal with numpy timedelta64 +issues). ``idxmin, idxmax`` are supported as well. .. ipython:: python - df.min().max() - df.min(axis=1).min() + df.min().max() + df.min(axis=1).min() - df.min().idxmax() - df.min(axis=1).idxmin() + df.min().idxmax() + df.min(axis=1).idxmin() diff --git a/doc/source/v0.10.1.txt b/doc/source/v0.10.1.txt index 3c22e9552c3a2..dafa4300af0e3 100644 --- a/doc/source/v0.10.1.txt +++ b/doc/source/v0.10.1.txt @@ -69,7 +69,7 @@ Retrieving unique values in an indexable or data column. 
import warnings with warnings.catch_warnings(): - warnings.simplefilter('ignore', category=DeprecationWarning) + warnings.simplefilter('ignore', category=UserWarning) store.unique('df','index') store.unique('df','string') From 359c7369306499e3b371fe99f313d0f050c68bca Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Sat, 8 Jun 2013 19:36:28 -0400 Subject: [PATCH 08/71] DOC/BUG: fix overwriting of 'df' variable in doc build --- doc/source/visualization.rst | 45 +++++++++++++++++++++--------------- 1 file changed, 27 insertions(+), 18 deletions(-) diff --git a/doc/source/visualization.rst b/doc/source/visualization.rst index 63b5920bb0146..f0790396a5c39 100644 --- a/doc/source/visualization.rst +++ b/doc/source/visualization.rst @@ -5,14 +5,14 @@ :suppress: import numpy as np + from numpy.random import randn, rand, randint np.random.seed(123456) - from pandas import * + from pandas import DataFrame, Series, date_range, options import pandas.util.testing as tm - randn = np.random.randn np.set_printoptions(precision=4, suppress=True) import matplotlib.pyplot as plt plt.close('all') - options.display.mpl_style='default' + options.display.mpl_style = 'default' ************************ Plotting with matplotlib @@ -60,8 +60,7 @@ On DataFrame, ``plot`` is a convenience to plot all of the columns with labels: .. ipython:: python - df = DataFrame(randn(1000, 4), index=ts.index, - columns=['A', 'B', 'C', 'D']) + df = DataFrame(randn(1000, 4), index=ts.index, columns=list('ABCD')) df = df.cumsum() @savefig frame_plot_basic.png width=6in @@ -101,7 +100,7 @@ You can plot one column versus another using the `x` and `y` keywords in plt.figure() - df3 = DataFrame(np.random.randn(1000, 2), columns=['B', 'C']).cumsum() + df3 = DataFrame(randn(1000, 2), columns=['B', 'C']).cumsum() df3['A'] = Series(range(len(df))) @savefig df_plot_xy.png width=6in @@ -169,7 +168,7 @@ Here is the default behavior, notice how the x-axis tick labelling is performed: df.A.plot() -Using the ``x_compat`` parameter, you can suppress this bevahior: +Using the ``x_compat`` parameter, you can suppress this behavior: .. ipython:: python @@ -200,6 +199,15 @@ Targeting different subplots You can pass an ``ax`` argument to ``Series.plot`` to plot on a particular axis: +.. ipython:: python + :suppress: + + ts = Series(randn(1000), index=date_range('1/1/2000', periods=1000)) + ts = ts.cumsum() + + df = DataFrame(randn(1000, 4), index=ts.index, columns=list('ABCD')) + df = df.cumsum() + .. ipython:: python fig, axes = plt.subplots(nrows=2, ncols=2) @@ -210,6 +218,7 @@ You can pass an ``ax`` argument to ``Series.plot`` to plot on a particular axis: @savefig series_plot_multi.png width=6in df['D'].plot(ax=axes[1,1]); axes[1,1].set_title('D') + .. _visualization.other: Other plotting features @@ -239,7 +248,7 @@ bar plot: .. ipython:: python - df2 = DataFrame(np.random.rand(10, 4), columns=['a', 'b', 'c', 'd']) + df2 = DataFrame(rand(10, 4), columns=['a', 'b', 'c', 'd']) @savefig bar_plot_multi_ex.png width=5in df2.plot(kind='bar'); @@ -298,10 +307,10 @@ New since 0.10.0, the ``by`` keyword can be specified to plot grouped histograms .. ipython:: python - data = Series(np.random.randn(1000)) + data = Series(randn(1000)) @savefig grouped_hist.png width=6in - data.hist(by=np.random.randint(0, 4, 1000)) + data.hist(by=randint(0, 4, 1000)) .. _visualization.box: @@ -317,7 +326,7 @@ a uniform random variable on [0,1). .. 
ipython:: python - df = DataFrame(np.random.rand(10,5)) + df = DataFrame(rand(10,5)) plt.figure(); @savefig box_plot_ex.png width=6in @@ -328,7 +337,7 @@ groupings. For instance, .. ipython:: python - df = DataFrame(np.random.rand(10,2), columns=['Col1', 'Col2'] ) + df = DataFrame(rand(10,2), columns=['Col1', 'Col2'] ) df['X'] = Series(['A','A','A','A','A','B','B','B','B','B']) plt.figure(); @@ -341,7 +350,7 @@ columns: .. ipython:: python - df = DataFrame(np.random.rand(10,3), columns=['Col1', 'Col2', 'Col3']) + df = DataFrame(rand(10,3), columns=['Col1', 'Col2', 'Col3']) df['X'] = Series(['A','A','A','A','A','B','B','B','B','B']) df['Y'] = Series(['A','B','A','B','A','B','A','B','A','B']) @@ -361,7 +370,7 @@ Scatter plot matrix .. ipython:: python from pandas.tools.plotting import scatter_matrix - df = DataFrame(np.random.randn(1000, 4), columns=['a', 'b', 'c', 'd']) + df = DataFrame(randn(1000, 4), columns=['a', 'b', 'c', 'd']) @savefig scatter_matrix_kde.png width=6in scatter_matrix(df, alpha=0.2, figsize=(6, 6), diagonal='kde') @@ -378,7 +387,7 @@ setting `kind='kde'`: .. ipython:: python - ser = Series(np.random.randn(1000)) + ser = Series(randn(1000)) @savefig kde_plot.png width=6in ser.plot(kind='kde') @@ -444,7 +453,7 @@ implies that the underlying data are not random. plt.figure() - data = Series(0.1 * np.random.random(1000) + + data = Series(0.1 * rand(1000) + 0.9 * np.sin(np.linspace(-99 * np.pi, 99 * np.pi, num=1000))) @savefig lag_plot.png width=6in @@ -467,7 +476,7 @@ confidence band. plt.figure() - data = Series(0.7 * np.random.random(1000) + + data = Series(0.7 * rand(1000) + 0.3 * np.sin(np.linspace(-9 * np.pi, 9 * np.pi, num=1000))) @savefig autocorrelation_plot.png width=6in @@ -488,7 +497,7 @@ are what constitutes the bootstrap plot. from pandas.tools.plotting import bootstrap_plot - data = Series(np.random.random(1000)) + data = Series(rand(1000)) @savefig bootstrap_plot.png width=6in bootstrap_plot(data, size=50, samples=500, color='grey') From fdcaf139cebf717b53503a156533b2d3eea1340c Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Sat, 8 Jun 2013 19:48:18 -0400 Subject: [PATCH 09/71] DOC: give deprecation warning about future timeseries broadcasting behavior DOC/CLN: cleanup --- doc/source/dsintro.rst | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/doc/source/dsintro.rst b/doc/source/dsintro.rst index 7870bdbeb97d3..c1d034d0d8e58 100644 --- a/doc/source/dsintro.rst +++ b/doc/source/dsintro.rst @@ -482,19 +482,23 @@ column-wise: .. ipython:: python index = date_range('1/1/2000', periods=8) - df = DataFrame(randn(8, 3), index=index, - columns=['A', 'B', 'C']) + df = DataFrame(randn(8, 3), index=index, columns=list('ABC')) df type(df['A']) df - df['A'] -Technical purity aside, this case is so common in practice that supporting the -special case is preferable to the alternative of forcing the user to transpose -and do column-based alignment like so: +.. warning:: -.. ipython:: python + .. code-block:: python + + df - df['A'] + + is now deprecated and will be removed in a future release. The preferred way + to replicate this behavior is + + .. code-block:: python - (df.T - df['A']).T + df.sub(df['A'], axis=0) For explicit control over the matching and broadcasting behavior, see the section on :ref:`flexible binary operations `. 
From c0529576e1bcd4369954539fbafc82a5e8c42502 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=A1bor=20Lipt=C3=A1k?= Date: Sat, 8 Jun 2013 20:11:02 -0400 Subject: [PATCH 10/71] Corrected typo in data --- pandas/io/data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/data.py b/pandas/io/data.py index 13551272edae2..8bc3df561cadb 100644 --- a/pandas/io/data.py +++ b/pandas/io/data.py @@ -188,7 +188,7 @@ def _get_hist_yahoo(sym=None, start=None, end=None, retry_count=3, def _get_hist_google(sym=None, start=None, end=None, retry_count=3, pause=0, **kwargs): """ - Get historical data for the given name from yahoo. + Get historical data for the given name from google. Date format is datetime Returns a DataFrame. From 0aadb1195219269b38e551e9044a52c33898e437 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=A1bor=20Lipt=C3=A1k?= Date: Sat, 8 Jun 2013 20:16:28 -0400 Subject: [PATCH 11/71] Change google finance tests to @network only --- pandas/io/tests/test_google.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/pandas/io/tests/test_google.py b/pandas/io/tests/test_google.py index 9db7964c1acfe..7f4ca13c27e58 100644 --- a/pandas/io/tests/test_google.py +++ b/pandas/io/tests/test_google.py @@ -12,7 +12,6 @@ class TestGoogle(unittest.TestCase): - @slow @network def test_google(self): # asserts that google is minimally working and that it throws @@ -39,13 +38,11 @@ def test_google(self): raise - @slow @network def test_get_quote(self): self.assertRaises(NotImplementedError, lambda: web.get_quote_google(pd.Series(['GOOG', 'AAPL', 'GOOG']))) - @slow @network def test_get_data(self): import numpy as np From b8eee755c5c4dbe8300e46d7e8d68760c9572d17 Mon Sep 17 00:00:00 2001 From: gliptak Date: Sat, 8 Jun 2013 21:20:49 -0300 Subject: [PATCH 12/71] Tag yahoo data tests as @network only --- pandas/io/tests/test_yahoo.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/pandas/io/tests/test_yahoo.py b/pandas/io/tests/test_yahoo.py index b79fdad2bff9d..0e2c2022af422 100644 --- a/pandas/io/tests/test_yahoo.py +++ b/pandas/io/tests/test_yahoo.py @@ -14,7 +14,6 @@ class TestYahoo(unittest.TestCase): - @slow @network def test_yahoo(self): # asserts that yahoo is minimally working and that it throws @@ -41,14 +40,12 @@ def test_yahoo(self): raise - @slow @network def test_get_quote(self): df = web.get_quote_yahoo(pd.Series(['GOOG', 'AAPL', 'GOOG'])) assert_series_equal(df.ix[0], df.ix[2]) - @slow @network def test_get_components(self): @@ -69,7 +66,6 @@ def test_get_components(self): assert 'GOOG' in df.index assert 'AMZN' in df.index - @slow @network def test_get_data(self): import numpy as np From 9ed14cf62c6f8b592e5776d4c62bab53e3356ac8 Mon Sep 17 00:00:00 2001 From: jreback Date: Sun, 9 Jun 2013 19:12:43 -0400 Subject: [PATCH 13/71] CLN: conform read_clipboard / to_clipboard to new io standards removed to io.clipboard (from io.parsers) --- RELEASE.rst | 1 + pandas/core/generic.py | 4 ++-- pandas/io/api.py | 4 ++-- pandas/io/clipboard.py | 31 +++++++++++++++++++++++++++++ pandas/io/parsers.py | 44 +++++++++++++----------------------------- 5 files changed, 49 insertions(+), 35 deletions(-) create mode 100644 pandas/io/clipboard.py diff --git a/RELEASE.rst b/RELEASE.rst index 4d85834706e80..eca69d824d377 100644 --- a/RELEASE.rst +++ b/RELEASE.rst @@ -127,6 +127,7 @@ pandas 0.11.1 - added ``pandas.io.api`` for i/o imports - removed ``Excel`` support to ``pandas.io.excel`` - added top-level ``pd.read_sql`` and ``to_sql`` DataFrame methods + - removed ``clipboard`` support to 
``pandas.io.clipboard`` - the ``method`` and ``axis`` arguments of ``DataFrame.replace()`` are deprecated - Implement ``__nonzero__`` for ``NDFrame`` objects (GH3691_, GH3696_) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 7dd0315d7d90e..5533584745167 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -492,8 +492,8 @@ def to_hdf(self, path_or_buf, key, **kwargs): return pytables.to_hdf(path_or_buf, key, self, **kwargs) def to_clipboard(self): - from pandas.io import parsers - parsers.to_clipboard(self) + from pandas.io import clipboard + clipboard.to_clipboard(self) # install the indexerse for _name, _indexer in indexing.get_indexers_list(): diff --git a/pandas/io/api.py b/pandas/io/api.py index e4c0c8c0c77f0..f17351921f83f 100644 --- a/pandas/io/api.py +++ b/pandas/io/api.py @@ -2,8 +2,8 @@ Data IO api """ -from pandas.io.parsers import (read_csv, read_table, read_clipboard, - read_fwf, to_clipboard) +from pandas.io.parsers import read_csv, read_table, read_fwf +from pandas.io.clipboard import read_clipboard from pandas.io.excel import ExcelFile, ExcelWriter, read_excel from pandas.io.pytables import HDFStore, Term, get_store, read_hdf from pandas.io.html import read_html diff --git a/pandas/io/clipboard.py b/pandas/io/clipboard.py new file mode 100644 index 0000000000000..4aa8db414386b --- /dev/null +++ b/pandas/io/clipboard.py @@ -0,0 +1,31 @@ +""" io on the clipboard """ + +def read_clipboard(**kwargs): # pragma: no cover + """ + Read text from clipboard and pass to read_table. See read_table for the + full argument list + + Returns + ------- + parsed : DataFrame + """ + from pandas.util.clipboard import clipboard_get + text = clipboard_get() + return read_table(StringIO(text), **kwargs) + + +def to_clipboard(obj): # pragma: no cover + """ + Attempt to write text representation of object to the system clipboard + + Notes + ----- + Requirements for your platform + - Linux: xsel command line tool + - Windows: Python win32 extensions + - OS X: + """ + from pandas.util.clipboard import clipboard_set + clipboard_set(str(obj)) + + diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 54ba7536afaee..6e937ba696e39 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -5,6 +5,7 @@ import re from itertools import izip import csv +from warnings import warn import numpy as np @@ -427,35 +428,6 @@ def read_fwf(filepath_or_buffer, colspecs=None, widths=None, **kwds): return _read(filepath_or_buffer, kwds) -def read_clipboard(**kwargs): # pragma: no cover - """ - Read text from clipboard and pass to read_table. 
See read_table for the - full argument list - - Returns - ------- - parsed : DataFrame - """ - from pandas.util.clipboard import clipboard_get - text = clipboard_get() - return read_table(StringIO(text), **kwargs) - - -def to_clipboard(obj): # pragma: no cover - """ - Attempt to write text representation of object to the system clipboard - - Notes - ----- - Requirements for your platform - - Linux: xsel command line tool - - Windows: Python win32 extensions - - OS X: - """ - from pandas.util.clipboard import clipboard_set - clipboard_set(str(obj)) - - # common NA values # no longer excluding inf representations # '1.#INF','-1.#INF', '1.#INF000000', @@ -1940,15 +1912,25 @@ def _make_reader(self, f): self.data = FixedWidthReader(f, self.colspecs, self.delimiter) +##### deprecations in 0.11.1 ##### +##### remove in 0.12 ##### + +from pandas.io import clipboard +def read_clipboard(**kwargs): + warn("read_clipboard is now a top-level accessible via pandas.read_clipboard", FutureWarning) + clipboard.read_clipboard(**kwargs) + +def to_clipboard(obj): + warn("to_clipboard is now an object level method accessible via obj.to_clipboard()", FutureWarning) + clipboard.to_clipboard(obj) + from pandas.io import excel class ExcelWriter(excel.ExcelWriter): def __init__(self, path): - from warnings import warn warn("ExcelWriter can now be imported from: pandas.io.excel", FutureWarning) super(ExcelWriter, self).__init__(path) class ExcelFile(excel.ExcelFile): def __init__(self, path_or_buf, kind=None, **kwds): - from warnings import warn warn("ExcelFile can now be imported from: pandas.io.excel", FutureWarning) super(ExcelFile, self).__init__(path_or_buf, kind=kind, **kwds) From e8a53188c99bce3052c3b5ac5ca33882d0211b1b Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Sun, 9 Jun 2013 19:24:23 -0700 Subject: [PATCH 14/71] DOC: release note for #3814 --- RELEASE.rst | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/RELEASE.rst b/RELEASE.rst index eca69d824d377..307986ab81681 100644 --- a/RELEASE.rst +++ b/RELEASE.rst @@ -50,6 +50,7 @@ pandas 0.11.1 - Added keyword parameters for different types of scatter_matrix subplots - A ``filter`` method on grouped Series or DataFrames returns a subset of the original (GH3680_, GH919_) + - Access to historical Google Finance data in pandas.io.data (GH3814_) **Improvements to existing features** @@ -85,9 +86,9 @@ pandas 0.11.1 - When removing an object, ``remove(key)`` raises ``KeyError`` if the key is not a valid store object. - - raise a ``TypeError`` on passing ``where`` or ``columns`` + - raise a ``TypeError`` on passing ``where`` or ``columns`` to select with a Storer; these are invalid parameters at this time - - can now specify an ``encoding`` option to ``append/put`` + - can now specify an ``encoding`` option to ``append/put`` to enable alternate encodings (GH3750_) - The repr() for (Multi)Index now obeys display.max_seq_items rather then numpy threshold print options. (GH3426_, GH3466_) @@ -310,6 +311,7 @@ pandas 0.11.1 .. _GH3750: https://github.com/pydata/pandas/issues/3750 .. _GH3726: https://github.com/pydata/pandas/issues/3726 .. _GH3795: https://github.com/pydata/pandas/issues/3795 +.. 
_GH3814: https://github.com/pydata/pandas/issues/3814 pandas 0.11.0 ============= From 044daaeeda5b44bb6a036ce3ee49a51ce434c73a Mon Sep 17 00:00:00 2001 From: jreback Date: Mon, 10 Jun 2013 13:11:50 -0400 Subject: [PATCH 15/71] BUG: missing imports for clipboard --- pandas/io/clipboard.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/io/clipboard.py b/pandas/io/clipboard.py index 4aa8db414386b..a62b9268ef2e3 100644 --- a/pandas/io/clipboard.py +++ b/pandas/io/clipboard.py @@ -1,4 +1,5 @@ """ io on the clipboard """ +from StringIO import StringIO def read_clipboard(**kwargs): # pragma: no cover """ @@ -10,6 +11,7 @@ def read_clipboard(**kwargs): # pragma: no cover parsed : DataFrame """ from pandas.util.clipboard import clipboard_get + from pandas.io.parsers import table text = clipboard_get() return read_table(StringIO(text), **kwargs) From 58642a658175304a7fdc7b182e5bd8c6d27e2ed5 Mon Sep 17 00:00:00 2001 From: jreback Date: Mon, 10 Jun 2013 13:13:57 -0400 Subject: [PATCH 16/71] BUG: missing imports from clipboard 2 --- pandas/io/clipboard.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/clipboard.py b/pandas/io/clipboard.py index a62b9268ef2e3..c763c1e8faadb 100644 --- a/pandas/io/clipboard.py +++ b/pandas/io/clipboard.py @@ -11,7 +11,7 @@ def read_clipboard(**kwargs): # pragma: no cover parsed : DataFrame """ from pandas.util.clipboard import clipboard_get - from pandas.io.parsers import table + from pandas.io.parsers import read_table text = clipboard_get() return read_table(StringIO(text), **kwargs) From 6b5c2a0e6618bdd73213c0638d46ac4302817900 Mon Sep 17 00:00:00 2001 From: Andy Hayden Date: Mon, 10 Jun 2013 19:37:40 +0100 Subject: [PATCH 17/71] TST regression tests for GH3836 --- pandas/tests/test_indexing.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/pandas/tests/test_indexing.py b/pandas/tests/test_indexing.py index ad3d150c7e0ad..e7f824ace983c 100644 --- a/pandas/tests/test_indexing.py +++ b/pandas/tests/test_indexing.py @@ -974,6 +974,23 @@ def test_iloc_mask(self): (key,ans,r)) warnings.filterwarnings(action='always', category=UserWarning) + def test_ix_slicing_strings(self): + ##GH3836 + data = {'Classification': ['SA EQUITY CFD', 'bbb', 'SA EQUITY', 'SA SSF', 'aaa'], + 'Random': [1,2,3,4,5], + 'X': ['correct', 'wrong','correct', 'correct','wrong']} + df = DataFrame(data) + x = df[~df.Classification.isin(['SA EQUITY CFD', 'SA EQUITY', 'SA SSF'])] + df.ix[x.index,'X'] = df['Classification'] + + expected = DataFrame({'Classification': {0: 'SA EQUITY CFD', 1: 'bbb', + 2: 'SA EQUITY', 3: 'SA SSF', 4: 'aaa'}, + 'Random': {0: 1, 1: 2, 2: 3, 3: 4, 4: 5}, + 'X': {0: 'correct', 1: 'bbb', 2: 'correct', + 3: 'correct', 4: 'aaa'}}) # bug was 4: 'bbb' + + assert_frame_equal(df, expected) + def test_non_unique_loc(self): ## GH3659 ## non-unique indexer with loc slice From 4047e9f285019c4827321397b53230d5aae90e04 Mon Sep 17 00:00:00 2001 From: Wouter Overmeire Date: Mon, 10 Jun 2013 20:48:43 +0200 Subject: [PATCH 18/71] BUG: fix Series.interpolate() corner cases, close #3674 --- pandas/core/series.py | 15 ++++++++------- pandas/tests/test_series.py | 7 +++++++ 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/pandas/core/series.py b/pandas/core/series.py index 3a7a7d0f49b66..3439aeb79e174 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -3180,14 +3180,15 @@ def interpolate(self, method='linear'): invalid = isnull(values) valid = -invalid - firstIndex = valid.argmax() - valid = 
valid[firstIndex:] - invalid = invalid[firstIndex:] - inds = inds[firstIndex:] - result = values.copy() - result[firstIndex:][invalid] = np.interp(inds[invalid], inds[valid], - values[firstIndex:][valid]) + if valid.any(): + firstIndex = valid.argmax() + valid = valid[firstIndex:] + invalid = invalid[firstIndex:] + inds = inds[firstIndex:] + + result[firstIndex:][invalid] = np.interp( + inds[invalid], inds[valid], values[firstIndex:][valid]) return Series(result, index=self.index, name=self.name) diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py index e1589b9499757..58ca34b73b6a0 100644 --- a/pandas/tests/test_series.py +++ b/pandas/tests/test_series.py @@ -4063,6 +4063,13 @@ def test_interpolate(self): # try time interpolation on a non-TimeSeries self.assertRaises(Exception, self.series.interpolate, method='time') + def test_interpolate_corners(self): + s = Series([np.nan, np.nan]) + assert_series_equal(s.interpolate(), s) + + s = Series([]).interpolate() + assert_series_equal(s.interpolate(), s) + def test_interpolate_index_values(self): s = Series(np.nan, index=np.sort(np.random.rand(30))) s[::3] = np.random.randn(10) From 4c50b7189782ae985a553e9298ff3918a4f42695 Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Sun, 9 Jun 2013 20:58:26 -0400 Subject: [PATCH 19/71] CLN: remove relative imports --- pandas/__init__.py | 15 ++++----------- pandas/algos.pyx | 2 +- pandas/index.pyx | 8 +++----- 3 files changed, 8 insertions(+), 17 deletions(-) diff --git a/pandas/__init__.py b/pandas/__init__.py index da4c146da3cfd..62de9a10e729b 100644 --- a/pandas/__init__.py +++ b/pandas/__init__.py @@ -3,17 +3,10 @@ __docformat__ = 'restructuredtext' try: - from . import hashtable, tslib, lib -except Exception: # pragma: no cover - import sys - e = sys.exc_info()[1] # Py25 and Py3 current exception syntax conflict - print e - if 'No module named lib' in str(e): - raise ImportError('C extensions not built: if you installed already ' - 'verify that you are not importing from the source ' - 'directory') - else: - raise + from pandas import hashtable, tslib, lib +except ImportError as e: # pragma: no cover + module = str(e).lstrip('cannot import name ') # hack but overkill to use re + raise ImportError("C extensions: {0} not built".format(module)) from datetime import datetime import numpy as np diff --git a/pandas/algos.pyx b/pandas/algos.pyx index cac9c5ccc7a6d..836101ecafa2d 100644 --- a/pandas/algos.pyx +++ b/pandas/algos.pyx @@ -57,7 +57,7 @@ cdef extern from "src/headers/math.h": double fabs(double) int signbit(double) -from . import lib +from pandas import lib include "skiplist.pyx" diff --git a/pandas/index.pyx b/pandas/index.pyx index 7d33d6083d0eb..85a83b745510f 100644 --- a/pandas/index.pyx +++ b/pandas/index.pyx @@ -15,8 +15,8 @@ import numpy as np cimport tslib from hashtable cimport * -from . 
import algos, tslib, hashtable as _hash -from .tslib import Timestamp +from pandas import algos, tslib, hashtable as _hash +from pandas.tslib import Timestamp from datetime cimport (get_datetime64_value, _pydatetime_to_dts, pandas_datetimestruct) @@ -34,7 +34,7 @@ try: import pytz UTC = pytz.utc have_pytz = True -except: +except ImportError: have_pytz = False PyDateTime_IMPORT @@ -42,8 +42,6 @@ PyDateTime_IMPORT cdef extern from "Python.h": int PySlice_Check(object) -# int PyList_Check(object) -# int PyTuple_Check(object) cdef inline is_definitely_invalid_key(object val): if PyTuple_Check(val): From dc1938012b2b7f505f49f50ac317f35a2f54621c Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Sun, 9 Jun 2013 20:10:52 -0400 Subject: [PATCH 20/71] DOC: speedup io.rst doc build --- doc/source/io.rst | 113 ++++++++++++++++++++++++++++------------------ 1 file changed, 70 insertions(+), 43 deletions(-) diff --git a/doc/source/io.rst b/doc/source/io.rst index 9d923d2d0e0cf..ac5d49e036669 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -953,10 +953,15 @@ Reading HTML Content .. versionadded:: 0.11.1 -The toplevel :func:`~pandas.io.html.read_html` function can accept an HTML +The top-level :func:`~pandas.io.html.read_html` function can accept an HTML string/file/url and will parse HTML tables into list of pandas DataFrames. Let's look at a few examples. +.. note:: + + ``read_html`` returns a ``list`` of ``DataFrame`` objects, even if there is + only a single table contained in the HTML content + Read a URL with no options .. ipython:: python @@ -967,107 +972,129 @@ Read a URL with no options .. note:: - ``read_html`` returns a ``list`` of ``DataFrame`` objects, even if there is - only a single table contained in the HTML content + The data from the above URL changes every Monday so the resulting data above + and the data below may be slightly different. -Read a URL and match a table that contains specific text +Read in the content of the file from the above URL and pass it to ``read_html`` +as a string + +.. ipython:: python + :suppress: + + import os + file_path = os.path.abspath(os.path.join('source', '_static', 'banklist.html')) + +.. ipython:: python + + with open(file_path, 'r') as f: + dfs = read_html(f.read()) + dfs + +You can even pass in an instance of ``StringIO`` if you so desire .. ipython:: python + from cStringIO import StringIO + + with open(file_path, 'r') as f: + sio = StringIO(f.read()) + + dfs = read_html(sio) + dfs + +.. note:: + + The following examples are not run by the IPython evaluator due to the fact + that having so many network-accessing functions slows down the documentation + build. If you spot an error or an example that doesn't run, please do not + hesitate to report it over on `pandas GitHub issues page + `__. + + +Read a URL and match a table that contains specific text + +.. code-block:: python + match = 'Metcalf Bank' df_list = read_html(url, match=match) - len(dfs) - dfs[0] Specify a header row (by default ```` elements are used to form the column index); if specified, the header row is taken from the data minus the parsed header elements (```` elements). -.. ipython:: python +.. code-block:: python dfs = read_html(url, header=0) - len(dfs) - dfs[0] Specify an index column -.. ipython:: python +.. code-block:: python dfs = read_html(url, index_col=0) - len(dfs) - dfs[0] - dfs[0].index.name Specify a number of rows to skip -.. ipython:: python +.. 
code-block:: python dfs = read_html(url, skiprows=0) - len(dfs) - dfs[0] Specify a number of rows to skip using a list (``xrange`` (Python 2 only) works as well) -.. ipython:: python +.. code-block:: python dfs = read_html(url, skiprows=range(2)) - len(dfs) - dfs[0] Don't infer numeric and date types -.. ipython:: python +.. code-block:: python dfs = read_html(url, infer_types=False) - len(dfs) - dfs[0] Specify an HTML attribute -.. ipython:: python +.. code-block:: python dfs1 = read_html(url, attrs={'id': 'table'}) dfs2 = read_html(url, attrs={'class': 'sortable'}) - np.array_equal(dfs1[0], dfs2[0]) + print np.array_equal(dfs1[0], dfs2[0]) # Should be True Use some combination of the above -.. ipython:: python +.. code-block:: python dfs = read_html(url, match='Metcalf Bank', index_col=0) - len(dfs) - dfs[0] Read in pandas ``to_html`` output (with some loss of floating point precision) -.. ipython:: python +.. code-block:: python df = DataFrame(randn(2, 2)) s = df.to_html(float_format='{0:.40g}'.format) dfin = read_html(s, index_col=0) - df - dfin[0] - df.index - df.columns - dfin[0].index - dfin[0].columns - np.allclose(df, dfin[0]) -``lxml`` will raise an error on a failed parse if that is the only parser you -provide +The ``lxml`` backend will raise an error on a failed parse if that is the only +parser you provide (if you only have a single parser you can provide just a +string, but it is considered good practice to pass a list with one string if, +for example, the function expects a sequence of strings) -.. ipython:: python +.. code-block:: python + + dfs = read_html(url, 'Metcalf Bank', index_col=0, flavor=['lxml']) - dfs = read_html(url, match='Metcalf Bank', index_col=0, flavor=['lxml']) +or + +.. code-block:: python + + dfs = read_html(url, 'Metcalf Bank', index_col=0, flavor='lxml') However, if you have bs4 and html5lib installed and pass ``None`` or ``['lxml', 'bs4']`` then the parse will most likely succeed. Note that *as soon as a parse succeeds, the function will return*. -.. ipython:: python +.. code-block:: python - dfs = read_html(url, match='Metcalf Bank', index_col=0, flavor=['lxml', 'bs4']) + dfs = read_html(url, 'Metcalf Bank', index_col=0, flavor=['lxml', 'bs4']) Writing to HTML files @@ -1082,8 +1109,8 @@ in the method ``to_string`` described above. .. note:: Not all of the possible options for ``DataFrame.to_html`` are shown here for - brevity's sake. See :func:`~pandas.core.frame.DataFrame.to_html` for the full set of - options. + brevity's sake. See :func:`~pandas.core.frame.DataFrame.to_html` for the + full set of options. .. ipython:: python :suppress: From ab273f620e2bd83d8b3426f0fb8f0529f9adbcbd Mon Sep 17 00:00:00 2001 From: nipunreddevil Date: Tue, 11 Jun 2013 10:30:46 +0530 Subject: [PATCH 21/71] Added documentation for to_clipboard() --- doc/source/io.rst | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/doc/source/io.rst b/doc/source/io.rst index 9d923d2d0e0cf..8270bfaf23fa5 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -1231,6 +1231,37 @@ And then import the data directly to a DataFrame by calling: clipdf +.. note:: + + You may need to install xsel on Linux to be able to read from the clipboard. + +The ``to_clipboard`` method can be used to write the contents of a DataFrame to +the clipboard. Following which you can paste the clipboard contents into other +applications (CTRL-V on many operating systems). Here we illustrate writing a +DataFrame into clipboard and reading it back. + +.. 
ipython:: python + + df=pd.DataFrame(randn(5,3)) + +.. ipython:: python + + df + +.. ipython:: python + + df.to_clipboard() + +.. ipython:: python + + obj=pd.read_clipboard() + +.. ipython:: python + + obj + + + .. _io.excel: From ccac7718b139bb14035632d3170d242f32d8a4de Mon Sep 17 00:00:00 2001 From: nipunreddevil Date: Tue, 11 Jun 2013 10:47:46 +0530 Subject: [PATCH 22/71] Added a line about same retrieved content --- doc/source/io.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/doc/source/io.rst b/doc/source/io.rst index 8270bfaf23fa5..9d937bddf7cc2 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -1260,6 +1260,8 @@ DataFrame into clipboard and reading it back. obj +We can see that we got the same content back, which we had earlier written to the clipboard. + From 4d06037b0fe604121e49ca9511f0828afe4648bf Mon Sep 17 00:00:00 2001 From: jreback Date: Tue, 11 Jun 2013 07:49:02 -0400 Subject: [PATCH 23/71] CLN: add banklist.html to _static --- doc/source/_static/banklist.html | 4885 ++++++++++++++++++++++++++++++ 1 file changed, 4885 insertions(+) create mode 100644 doc/source/_static/banklist.html diff --git a/doc/source/_static/banklist.html b/doc/source/_static/banklist.html new file mode 100644 index 0000000000000..8ec1561f8c394 --- /dev/null +++ b/doc/source/_static/banklist.html @@ -0,0 +1,4885 @@ + + + + +FDIC: Failed Bank List + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +Skip Header +
[... remainder of doc/source/_static/banklist.html omitted: 4,885 added lines of static HTML for the FDIC "Failed Bank List" page ...]
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Bank NameCitySTCERTAcquiring InstitutionClosing DateUpdated Date
Banks of Wisconsin d/b/a Bank of KenoshaKenoshaWI35386North Shore Bank, FSBMay 31, 2013May 31, 2013
Central Arizona BankScottsdaleAZ34527Western State BankMay 14, 2013May 20, 2013
Sunrise BankValdostaGA58185Synovus BankMay 10, 2013May 21, 2013
Pisgah Community BankAshevilleNC58701Capital Bank, N.A.May 10, 2013May 14, 2013
Douglas County BankDouglasvilleGA21649Hamilton State BankApril 26, 2013May 16, 2013
Parkway BankLenoirNC57158CertusBank, National AssociationApril 26, 2013May 17, 2013
Chipola Community BankMariannaFL58034First Federal Bank of FloridaApril 19, 2013May 16, 2013
Heritage Bank of North FloridaOrange ParkFL26680FirstAtlantic BankApril 19, 2013May 16, 2013
First Federal BankLexingtonKY29594Your Community BankApril 19, 2013April 23, 2013
Gold Canyon BankGold CanyonAZ58066First Scottsdale Bank, National AssociationApril 5, 2013April 9, 2013
Frontier BankLaGrangeGA16431HeritageBank of the SouthMarch 8, 2013March 26, 2013
Covenant BankChicagoIL22476Liberty Bank and Trust CompanyFebruary 15, 2013March 4, 2013
1st Regents BankAndoverMN57157First Minnesota BankJanuary 18, 2013February 28, 2013
Westside Community BankUniversity PlaceWA33997Sunwest BankJanuary 11, 2013January 24, 2013
Community Bank of the OzarksSunrise BeachMO27331Bank of SullivanDecember 14, 2012January 24, 2013
Hometown Community BankBraseltonGA57928CertusBank, National AssociationNovember 16, 2012January 24, 2013
Citizens First National BankPrincetonIL3731Heartland Bank and Trust CompanyNovember 2, 2012January 24, 2013
Heritage Bank of FloridaLutzFL35009Centennial BankNovember 2, 2012January 24, 2013
NOVA BankBerwynPA27148No AcquirerOctober 26, 2012January 24, 2013
Excel BankSedaliaMO19189Simmons First National BankOctober 19, 2012January 24, 2013
First East Side Savings BankTamaracFL28144Stearns Bank N.A.October 19, 2012January 24, 2013
GulfSouth Private BankDestinFL58073SmartBankOctober 19, 2012January 24, 2013
First United BankCreteIL20685Old Plank Trail Community Bank, National AssociationSeptember 28, 2012November 15, 2012
Truman BankSt. LouisMO27316Simmons First National BankSeptember 14, 2012December 17, 2012
First Commercial BankBloomingtonMN35246Republic Bank & Trust CompanySeptember 7, 2012December 17, 2012
Waukegan Savings BankWaukeganIL28243First Midwest BankAugust 3, 2012October 11, 2012
Jasper Banking CompanyJasperGA16240Stearns Bank N.A.July 27, 2012December 17, 2012
Second Federal Savings and Loan Association of ChicagoChicagoIL27986Hinsdale Bank & Trust CompanyJuly 20, 2012January 14, 2013
Heartland BankLeawoodKS1361Metcalf BankJuly 20, 2012December 17, 2012
First Cherokee State BankWoodstockGA32711Community & Southern BankJuly 20, 2012October 31, 2012
Georgia Trust BankBufordGA57847Community & Southern BankJuly 20, 2012December 17, 2012
The Royal Palm Bank of FloridaNaplesFL57096First National Bank of the Gulf CoastJuly 20, 2012January 7, 2013
Glasgow Savings BankGlasgowMO1056Regional Missouri BankJuly 13, 2012October 11, 2012
Montgomery Bank & TrustAileyGA19498Ameris BankJuly 6, 2012October 31, 2012
The Farmers Bank of LynchburgLynchburgTN1690Clayton Bank and TrustJune 15, 2012October 31, 2012
Security Exchange BankMariettaGA35299Fidelity BankJune 15, 2012October 10, 2012
Putnam State BankPalatkaFL27405Harbor Community BankJune 15, 2012October 10, 2012
Waccamaw BankWhitevilleNC34515First Community BankJune 8, 2012November 8, 2012
Farmers' and Traders' State BankShabbonaIL9257First State BankJune 8, 2012October 10, 2012
Carolina Federal Savings BankCharlestonSC35372Bank of North CarolinaJune 8, 2012October 31, 2012
First Capital BankKingfisherOK416F & M BankJune 8, 2012October 10, 2012
Alabama Trust Bank, National AssociationSylacaugaAL35224Southern States BankMay 18, 2012May 20, 2013
Security Bank, National AssociationNorth LauderdaleFL23156Banesco USAMay 4, 2012October 31, 2012
Palm Desert National BankPalm DesertCA23632Pacific Premier BankApril 27, 2012May 17, 2013
Plantation Federal BankPawleys IslandSC32503First Federal BankApril 27, 2012May 17, 2013
Inter Savings Bank, fsb D/B/A InterBank, fsbMaple GroveMN31495Great Southern BankApril 27, 2012May 17, 2013
HarVest Bank of MarylandGaithersburgMD57766SonabankApril 27, 2012May 17, 2013
Bank of the Eastern ShoreCambridgeMD26759No AcquirerApril 27, 2012October 17, 2012
Fort Lee Federal Savings Bank, FSBFort LeeNJ35527Alma BankApril 20, 2012May 17, 2013
Fidelity BankDearbornMI33883The Huntington National BankMarch 30, 2012May 16, 2013
Premier BankWilmetteIL35419International Bank of ChicagoMarch 23, 2012October 17, 2012
Covenant Bank & TrustRock SpringGA58068Stearns Bank, N.A.March 23, 2012October 31, 2012
New City BankChicagoIL57597No AcquirerMarch 9, 2012October 29, 2012
Global Commerce BankDoravilleGA34046Metro City BankMarch 2, 2012October 31, 2012
Home Savings of AmericaLittle FallsMN29178No AcquirerFebruary 24, 2012December 17, 2012
Central Bank of GeorgiaEllavilleGA5687Ameris BankFebruary 24, 2012August 9, 2012
SCB BankShelbyvilleIN29761First Merchants Bank, National AssociationFebruary 10, 2012March 25, 2013
Charter National Bank and TrustHoffman EstatesIL23187Barrington Bank & Trust Company, National AssociationFebruary 10, 2012March 25, 2013
BankEastKnoxvilleTN19869U.S.Bank National AssociationJanuary 27, 2012March 8, 2013
Patriot Bank MinnesotaForest LakeMN34823First Resource BankJanuary 27, 2012September 12, 2012
Tennessee Commerce BankFranklinTN35296Republic Bank & Trust CompanyJanuary 27, 2012November 20, 2012
First Guaranty Bank and Trust Company of JacksonvilleJacksonvilleFL16579CenterState Bank of Florida, N.A.January 27, 2012September 12, 2012
American Eagle Savings BankBoothwynPA31581Capital Bank, N.A.January 20, 2012January 25, 2013
The First State BankStockbridgeGA19252Hamilton State BankJanuary 20, 2012January 25, 2013
Central Florida State BankBelleviewFL57186CenterState Bank of Florida, N.A.January 20, 2012January 25, 2013
Western National BankPhoenixAZ57917Washington FederalDecember 16, 2011August 13, 2012
Premier Community Bank of the Emerald CoastCrestviewFL58343Summit BankDecember 16, 2011September 12, 2012
Central Progressive BankLacombeLA19657First NBC BankNovember 18, 2011August 13, 2012
Polk County BankJohnstonIA14194Grinnell State BankNovember 18, 2011August 15, 2012
Community Bank of RockmartRockmartGA57860Century Bank of GeorgiaNovember 10, 2011August 13, 2012
SunFirst BankSaint GeorgeUT57087Cache Valley BankNovember 4, 2011November 16, 2012
Mid City Bank, Inc.OmahaNE19397Premier BankNovember 4, 2011August 15, 2012
All American BankDes PlainesIL57759International Bank of ChicagoOctober 28, 2011August 15, 2012
Community Banks of ColoradoGreenwood VillageCO21132Bank Midwest, N.A.October 21, 2011January 2, 2013
Community Capital BankJonesboroGA57036State Bank and Trust CompanyOctober 21, 2011November 8, 2012
Decatur First BankDecaturGA34392Fidelity BankOctober 21, 2011November 8, 2012
Old Harbor BankClearwaterFL575371st United BankOctober 21, 2011November 8, 2012
Country BankAledoIL35395Blackhawk Bank & TrustOctober 14, 2011August 15, 2012
First State BankCranfordNJ58046Northfield BankOctober 14, 2011November 8, 2012
Blue Ridge Savings Bank, Inc.AshevilleNC32347Bank of North CarolinaOctober 14, 2011November 8, 2012
Piedmont Community BankGrayGA57256State Bank and Trust CompanyOctober 14, 2011January 22, 2013
Sun Security BankEllingtonMO20115Great Southern BankOctober 7, 2011November 7, 2012
The RiverBankWyomingMN10216Central BankOctober 7, 2011November 7, 2012
First International BankPlanoTX33513American First National BankSeptember 30, 2011October 9, 2012
Citizens Bank of Northern CaliforniaNevada CityCA33983Tri Counties BankSeptember 23, 2011October 9, 2012
Bank of the CommonwealthNorfolkVA20408Southern Bank and Trust CompanySeptember 23, 2011October 9, 2012
The First National Bank of FloridaMiltonFL25155CharterBankSeptember 9, 2011September 6, 2012
CreekSide BankWoodstockGA58226Georgia Commerce BankSeptember 2, 2011September 6, 2012
Patriot Bank of GeorgiaCummingGA58273Georgia Commerce BankSeptember 2, 2011November 2, 2012
First Choice BankGenevaIL57212Inland Bank & TrustAugust 19, 2011August 15, 2012
First Southern National BankStatesboroGA57239Heritage Bank of the SouthAugust 19, 2011November 2, 2012
Lydian Private BankPalm BeachFL35356Sabadell United Bank, N.A.August 19, 2011November 2, 2012
Public Savings BankHuntingdon ValleyPA34130Capital Bank, N.A.August 18, 2011August 15, 2012
The First National Bank of OlatheOlatheKS4744Enterprise Bank & TrustAugust 12, 2011August 23, 2012
Bank of WhitmanColfaxWA22528Columbia State BankAugust 5, 2011August 16, 2012
Bank of ShorewoodShorewoodIL22637Heartland Bank and Trust CompanyAugust 5, 2011August 16, 2012
Integra Bank National AssociationEvansvilleIN4392Old National BankJuly 29, 2011August 16, 2012
BankMeridian, N.A.ColumbiaSC58222SCBT National AssociationJuly 29, 2011November 2, 2012
Virginia Business BankRichmondVA58283Xenith BankJuly 29, 2011October 9, 2012
Bank of ChoiceGreeleyCO2994Bank Midwest, N.A.July 22, 2011September 12, 2012
LandMark Bank of FloridaSarasotaFL35244American Momentum BankJuly 22, 2011November 2, 2012
Southshore Community BankApollo BeachFL58056American Momentum BankJuly 22, 2011November 2, 2012
Summit BankPrescottAZ57442The Foothills BankJuly 15, 2011August 16, 2012
First Peoples BankPort St. LucieFL34870Premier American Bank, N.A.July 15, 2011November 2, 2012
High Trust BankStockbridgeGA19554Ameris BankJuly 15, 2011November 2, 2012
One Georgia BankAtlantaGA58238Ameris BankJuly 15, 2011November 2, 2012
Signature BankWindsorCO57835Points West Community BankJuly 8, 2011October 26, 2012
Colorado Capital BankCastle RockCO34522First-Citizens Bank & Trust CompanyJuly 8, 2011January 15, 2013
First Chicago Bank & TrustChicagoIL27935Northbrook Bank & Trust CompanyJuly 8, 2011September 9, 2012
Mountain Heritage BankClaytonGA57593First American Bank and Trust CompanyJune 24, 2011November 2, 2012
First Commercial Bank of Tampa BayTampaFL27583Stonegate BankJune 17, 2011November 2, 2012
McIntosh State BankJacksonGA19237Hamilton State BankJune 17, 2011November 2, 2012
Atlantic Bank and TrustCharlestonSC58420First Citizens Bank and Trust Company, Inc.June 3, 2011October 31, 2012
First Heritage BankSnohomishWA23626Columbia State BankMay 27, 2011January 28, 2013
Summit BankBurlingtonWA513Columbia State BankMay 20, 2011January 22, 2013
First Georgia Banking CompanyFranklinGA57647CertusBank, National AssociationMay 20, 2011November 13, 2012
Atlantic Southern BankMaconGA57213CertusBank, National AssociationMay 20, 2011October 31, 2012
Coastal BankCocoa BeachFL34898Florida Community Bank, a division of Premier American Bank, N.A.May 6, 2011November 30, 2012
Community Central BankMount ClemensMI34234Talmer Bank & TrustApril 29, 2011August 16, 2012
The Park Avenue BankValdostaGA19797Bank of the OzarksApril 29, 2011November 30, 2012
First Choice Community BankDallasGA58539Bank of the OzarksApril 29, 2011January 22, 2013
Cortez Community BankBrooksvilleFL57625Florida Community Bank, a division of Premier American Bank, N.A.April 29, 2011November 30, 2012
First National Bank of Central FloridaWinter ParkFL26297Florida Community Bank, a division of Premier American Bank, N.A.April 29, 2011November 30, 2012
Heritage Banking GroupCarthageMS14273Trustmark National BankApril 15, 2011November 30, 2012
Rosemount National BankRosemountMN24099Central BankApril 15, 2011August 16, 2012
Superior BankBirminghamAL17750Superior Bank, National AssociationApril 15, 2011November 30, 2012
Nexity BankBirminghamAL19794AloStar Bank of CommerceApril 15, 2011September 4, 2012
New Horizons BankEast EllijayGA57705Citizens South BankApril 15, 2011August 16, 2012
Bartow County BankCartersvilleGA21495Hamilton State BankApril 15, 2011January 22, 2013
Nevada Commerce BankLas VegasNV35418City National BankApril 8, 2011September 9, 2012
Western Springs National Bank and TrustWestern SpringsIL10086Heartland Bank and Trust CompanyApril 8, 2011January 22, 2013
The Bank of CommerceWood DaleIL34292Advantage National Bank GroupMarch 25, 2011January 22, 2013
Legacy BankMilwaukeeWI34818Seaway Bank and Trust CompanyMarch 11, 2011September 12, 2012
First National Bank of DavisDavisOK4077The Pauls Valley National BankMarch 11, 2011August 20, 2012
Valley Community BankSt. CharlesIL34187First State BankFebruary 25, 2011September 12, 2012
San Luis Trust Bank, FSBSan Luis ObispoCA34783First California BankFebruary 18, 2011August 20, 2012
Charter Oak BankNapaCA57855Bank of MarinFebruary 18, 2011September 12, 2012
Citizens Bank of EffinghamSpringfieldGA34601Heritage Bank of the SouthFebruary 18, 2011November 2, 2012
Habersham BankClarkesvilleGA151SCBT National AssociationFebruary 18, 2011November 2, 2012
Canyon National BankPalm SpringsCA34692Pacific Premier BankFebruary 11, 2011September 12, 2012
Badger State BankCassvilleWI13272Royal BankFebruary 11, 2011September 12, 2012
Peoples State BankHamtramckMI14939First Michigan BankFebruary 11, 2011January 22, 2013
Sunshine State Community BankPort OrangeFL35478Premier American Bank, N.A.February 11, 2011November 2, 2012
Community First Bank ChicagoChicagoIL57948Northbrook Bank & Trust CompanyFebruary 4, 2011August 20, 2012
North Georgia BankWatkinsvilleGA35242BankSouthFebruary 4, 2011November 2, 2012
American Trust BankRoswellGA57432Renasant BankFebruary 4, 2011October 31, 2012
First Community BankTaosNM12261U.S. Bank, N.A.January 28, 2011September 12, 2012
FirsTier BankLouisvilleCO57646No AcquirerJanuary 28, 2011September 12, 2012
Evergreen State BankStoughtonWI5328McFarland State BankJanuary 28, 2011September 12, 2012
The First State BankCamargoOK2303Bank 7January 28, 2011September 12, 2012
United Western BankDenverCO31293First-Citizens Bank & Trust CompanyJanuary 21, 2011September 12, 2012
The Bank of AshevilleAshevilleNC34516First BankJanuary 21, 2011November 2, 2012
CommunitySouth Bank & TrustEasleySC57868CertusBank, National AssociationJanuary 21, 2011November 2, 2012
Enterprise Banking CompanyMcDonoughGA19758No AcquirerJanuary 21, 2011November 2, 2012
Oglethorpe BankBrunswickGA57440Bank of the OzarksJanuary 14, 2011November 2, 2012
Legacy BankScottsdaleAZ57820Enterprise Bank & TrustJanuary 7, 2011September 12, 2012
First Commercial Bank of FloridaOrlandoFL34965First Southern BankJanuary 7, 2011November 2, 2012
Community National BankLino LakesMN23306Farmers & Merchants Savings BankDecember 17, 2010August 20, 2012
First Southern BankBatesvilleAR58052Southern BankDecember 17, 2010August 20, 2012
United Americas Bank, N.A.AtlantaGA35065State Bank and Trust CompanyDecember 17, 2010November 2, 2012
Appalachian Community Bank, FSBMcCaysvilleGA58495Peoples Bank of East TennesseeDecember 17, 2010October 31, 2012
Chestatee State BankDawsonvilleGA34578Bank of the OzarksDecember 17, 2010November 2, 2012
The Bank of Miami,N.A.Coral GablesFL190401st United BankDecember 17, 2010November 2, 2012
Earthstar BankSouthamptonPA35561Polonia BankDecember 10, 2010August 20, 2012
Paramount BankFarmington HillsMI34673Level One BankDecember 10, 2010August 20, 2012
First Banking CenterBurlingtonWI5287First Michigan BankNovember 19, 2010August 20, 2012
Allegiance Bank of North AmericaBala CynwydPA35078VIST BankNovember 19, 2010August 20, 2012
Gulf State Community BankCarrabelleFL20340Centennial BankNovember 19, 2010November 2, 2012
Copper Star BankScottsdaleAZ35463Stearns Bank, N.A.November 12, 2010August 20, 2012
Darby Bank & Trust Co.VidaliaGA14580Ameris BankNovember 12, 2010January 15, 2013
Tifton Banking CompanyTiftonGA57831Ameris BankNovember 12, 2010November 2, 2012
First Vietnamese American Bank
In Vietnamese
WestminsterCA57885Grandpoint BankNovember 5, 2010September 12, 2012
Pierce Commercial BankTacomaWA34411Heritage BankNovember 5, 2010August 20, 2012
Western Commercial BankWoodland HillsCA58087First California BankNovember 5, 2010September 12, 2012
K BankRandallstownMD31263Manufacturers and Traders Trust Company (M&T Bank)November 5, 2010August 20, 2012
First Arizona Savings, A FSBScottsdaleAZ32582No AcquirerOctober 22, 2010August 20, 2012
Hillcrest BankOverland ParkKS22173Hillcrest Bank, N.A.October 22, 2010August 20, 2012
First Suburban National BankMaywoodIL16089Seaway Bank and Trust CompanyOctober 22, 2010August 20, 2012
The First National Bank of BarnesvilleBarnesvilleGA2119United BankOctober 22, 2010November 2, 2012
The Gordon BankGordonGA33904Morris BankOctober 22, 2010November 2, 2012
Progress Bank of FloridaTampaFL32251Bay Cities BankOctober 22, 2010November 2, 2012
First Bank of JacksonvilleJacksonvilleFL27573Ameris BankOctober 22, 2010November 2, 2012
Premier BankJefferson CityMO34016Providence BankOctober 15, 2010August 20, 2012
WestBridge Bank and Trust CompanyChesterfieldMO58205Midland States BankOctober 15, 2010August 20, 2012
Security Savings Bank, F.S.B.OlatheKS30898Simmons First National BankOctober 15, 2010August 20, 2012
Shoreline BankShorelineWA35250GBC International BankOctober 1, 2010August 20, 2012
Wakulla BankCrawfordvilleFL21777Centennial BankOctober 1, 2010November 2, 2012
North County BankArlingtonWA35053Whidbey Island BankSeptember 24, 2010August 20, 2012
Haven Trust Bank FloridaPonte Vedra BeachFL58308First Southern BankSeptember 24, 2010November 5, 2012
Maritime Savings BankWest AllisWI28612North Shore Bank, FSBSeptember 17, 2010August 20, 2012
Bramble Savings BankMilfordOH27808Foundation BankSeptember 17, 2010August 20, 2012
The Peoples BankWinderGA182Community & Southern BankSeptember 17, 2010November 5, 2012
First Commerce Community BankDouglasvilleGA57448Community & Southern BankSeptember 17, 2010January 15, 2013
Bank of EllijayEllijayGA58197Community & Southern BankSeptember 17, 2010January 15, 2013
ISN BankCherry HillNJ57107Customers BankSeptember 17, 2010August 22, 2012
Horizon BankBradentonFL35061Bank of the OzarksSeptember 10, 2010November 5, 2012
Sonoma Valley BankSonomaCA27259Westamerica BankAugust 20, 2010September 12, 2012
Los Padres BankSolvangCA32165Pacific Western BankAugust 20, 2010September 12, 2012
Butte Community BankChicoCA33219Rabobank, N.A.August 20, 2010September 12, 2012
Pacific State BankStocktonCA27090Rabobank, N.A.August 20, 2010September 12, 2012
ShoreBankChicagoIL15640Urban Partnership BankAugust 20, 2010May 16, 2013
Imperial Savings and Loan AssociationMartinsvilleVA31623River Community Bank, N.A.August 20, 2010August 24, 2012
Independent National BankOcalaFL27344CenterState Bank of Florida, N.A.August 20, 2010November 5, 2012
Community National Bank at BartowBartowFL25266CenterState Bank of Florida, N.A.August 20, 2010November 5, 2012
Palos Bank and Trust CompanyPalos HeightsIL17599First Midwest BankAugust 13, 2010August 22, 2012
Ravenswood BankChicagoIL34231Northbrook Bank & Trust CompanyAugust 6, 2010August 22, 2012
LibertyBankEugeneOR31964Home Federal BankJuly 30, 2010August 22, 2012
The Cowlitz BankLongviewWA22643Heritage BankJuly 30, 2010August 22, 2012
Coastal Community BankPanama City BeachFL9619Centennial BankJuly 30, 2010November 5, 2012
Bayside Savings BankPort Saint JoeFL57669Centennial BankJuly 30, 2010November 5, 2012
Northwest Bank & TrustAcworthGA57658State Bank and Trust CompanyJuly 30, 2010November 5, 2012
Home Valley BankCave JunctionOR23181South Valley Bank & TrustJuly 23, 2010September 12, 2012
SouthwestUSA BankLas VegasNV35434Plaza BankJuly 23, 2010August 22, 2012
Community Security BankNew PragueMN34486RoundbankJuly 23, 2010September 12, 2012
Thunder BankSylvan GroveKS10506The Bennington State BankJuly 23, 2010September 13, 2012
Williamsburg First National BankKingstreeSC17837First Citizens Bank and Trust Company, Inc.July 23, 2010November 5, 2012
Crescent Bank and Trust CompanyJasperGA27559Renasant BankJuly 23, 2010November 5, 2012
Sterling BankLantanaFL32536IBERIABANKJuly 23, 2010November 5, 2012
Mainstreet Savings Bank, FSBHastingsMI28136Commercial BankJuly 16, 2010September 13, 2012
Olde Cypress Community BankClewistonFL28864CenterState Bank of Florida, N.A.July 16, 2010November 5, 2012
Turnberry BankAventuraFL32280NAFH National BankJuly 16, 2010November 5, 2012
Metro Bank of Dade CountyMiamiFL25172NAFH National BankJuly 16, 2010November 5, 2012
First National Bank of the SouthSpartanburgSC35383NAFH National BankJuly 16, 2010November 5, 2012
Woodlands BankBlufftonSC32571Bank of the OzarksJuly 16, 2010November 5, 2012
Home National BankBlackwellOK11636RCB BankJuly 9, 2010December 10, 2012
USA BankPort ChesterNY58072New Century BankJuly 9, 2010September 14, 2012
Ideal Federal Savings BankBaltimoreMD32456No AcquirerJuly 9, 2010September 14, 2012
Bay National BankBaltimoreMD35462Bay Bank, FSBJuly 9, 2010January 15, 2013
High Desert State BankAlbuquerqueNM35279First American BankJune 25, 2010September 14, 2012
First National BankSavannahGA34152The Savannah Bank, N.A.June 25, 2010November 5, 2012
Peninsula BankEnglewoodFL26563Premier American Bank, N.A.June 25, 2010November 5, 2012
Nevada Security BankRenoNV57110Umpqua BankJune 18, 2010August 23, 2012
Washington First International BankSeattleWA32955East West BankJune 11, 2010September 14, 2012
TierOne BankLincolnNE29341Great Western BankJune 4, 2010September 14, 2012
Arcola Homestead Savings BankArcolaIL31813No AcquirerJune 4, 2010September 14, 2012
First National BankRosedaleMS15814The Jefferson BankJune 4, 2010November 5, 2012
Sun West BankLas VegasNV34785City National BankMay 28, 2010September 14, 2012
Granite Community Bank, NAGranite BayCA57315Tri Counties BankMay 28, 2010September 14, 2012
Bank of Florida - TampaTampaFL57814EverBankMay 28, 2010November 5, 2012
Bank of Florida - SouthwestNaplesFL35106EverBankMay 28, 2010November 5, 2012
Bank of Florida - SoutheastFort LauderdaleFL57360EverBankMay 28, 2010November 5, 2012
Pinehurst BankSaint PaulMN57735Coulee BankMay 21, 2010October 26, 2012
Midwest Bank and Trust CompanyElmwood ParkIL18117FirstMerit Bank, N.A.May 14, 2010August 23, 2012
Southwest Community BankSpringfieldMO34255Simmons First National BankMay 14, 2010August 23, 2012
New Liberty BankPlymouthMI35586Bank of Ann ArborMay 14, 2010August 23, 2012
Satilla Community BankSaint MarysGA35114Ameris BankMay 14, 2010November 5, 2012
1st Pacific Bank of CaliforniaSan DiegoCA35517City National BankMay 7, 2010December 13, 2012
Towne Bank of ArizonaMesaAZ57697Commerce Bank of ArizonaMay 7, 2010August 23, 2012
Access BankChamplinMN16476PrinsBankMay 7, 2010August 23, 2012
The Bank of BonifayBonifayFL14246First Federal Bank of FloridaMay 7, 2010November 5, 2012
Frontier BankEverettWA22710Union Bank, N.A.April 30, 2010January 15, 2013
BC National BanksButlerMO17792Community First BankApril 30, 2010August 23, 2012
Champion BankCreve CoeurMO58362BankLibertyApril 30, 2010August 23, 2012
CF BancorpPort HuronMI30005First Michigan BankApril 30, 2010January 15, 2013
Westernbank Puerto Rico
En Espanol
MayaguezPR31027Banco Popular de Puerto RicoApril 30, 2010November 5, 2012
R-G Premier Bank of Puerto Rico
En Espanol
Hato ReyPR32185Scotiabank de Puerto RicoApril 30, 2010November 5, 2012
Eurobank
En Espanol
San JuanPR27150Oriental Bank and TrustApril 30, 2010November 5, 2012
Wheatland BankNapervilleIL58429Wheaton Bank & TrustApril 23, 2010August 23, 2012
Peotone Bank and Trust CompanyPeotoneIL10888First Midwest BankApril 23, 2010August 23, 2012
Lincoln Park Savings BankChicagoIL30600Northbrook Bank & Trust CompanyApril 23, 2010August 23, 2012
New Century BankChicagoIL34821MB Financial Bank, N.A.April 23, 2010August 23, 2012
Citizens Bank and Trust Company of ChicagoChicagoIL34658Republic Bank of ChicagoApril 23, 2010August 23, 2012
Broadway BankChicagoIL22853MB Financial Bank, N.A.April 23, 2010August 23, 2012
Amcore Bank, National AssociationRockfordIL3735Harris N.A.April 23, 2010August 23, 2012
City BankLynnwoodWA21521Whidbey Island BankApril 16, 2010September 14, 2012
Tamalpais BankSan RafaelCA33493Union Bank, N.A.April 16, 2010August 23, 2012
Innovative BankOaklandCA23876Center BankApril 16, 2010August 23, 2012
Butler BankLowellMA26619People's United BankApril 16, 2010August 23, 2012
Riverside National Bank of FloridaFort PierceFL24067TD Bank, N.A.April 16, 2010November 5, 2012
AmericanFirst BankClermontFL57724TD Bank, N.A.April 16, 2010October 31, 2012
First Federal Bank of North FloridaPalatkaFL28886TD Bank, N.A.April 16, 2010January 15, 2013
Lakeside Community BankSterling HeightsMI34878No AcquirerApril 16, 2010August 23, 2012
Beach First National BankMyrtle BeachSC34242Bank of North CarolinaApril 9, 2010November 5, 2012
Desert Hills BankPhoenixAZ57060New York Community BankMarch 26, 2010August 23, 2012
Unity National BankCartersvilleGA34678Bank of the OzarksMarch 26, 2010September 14, 2012
Key West BankKey WestFL34684Centennial BankMarch 26, 2010August 23, 2012
McIntosh Commercial BankCarrolltonGA57399CharterBankMarch 26, 2010August 23, 2012
State Bank of AuroraAuroraMN8221Northern State BankMarch 19, 2010August 23, 2012
First Lowndes BankFort DepositAL24957First Citizens BankMarch 19, 2010August 23, 2012
Bank of HiawasseeHiawasseeGA10054Citizens South BankMarch 19, 2010August 23, 2012
Appalachian Community BankEllijayGA33989Community & Southern BankMarch 19, 2010October 31, 2012
Advanta Bank Corp.DraperUT33535No AcquirerMarch 19, 2010September 14, 2012
Century Security BankDuluthGA58104Bank of UpsonMarch 19, 2010August 23, 2012
American National BankParmaOH18806The National Bank and Trust CompanyMarch 19, 2010August 23, 2012
Statewide BankCovingtonLA29561Home BankMarch 12, 2010August 23, 2012
Old Southern BankOrlandoFL58182Centennial BankMarch 12, 2010August 23, 2012
The Park Avenue BankNew YorkNY27096Valley National BankMarch 12, 2010August 23, 2012
LibertyPointe BankNew YorkNY58071Valley National BankMarch 11, 2010August 23, 2012
Centennial BankOgdenUT34430No AcquirerMarch 5, 2010September 14, 2012
Waterfield BankGermantownMD34976No AcquirerMarch 5, 2010August 23, 2012
Bank of IllinoisNormalIL9268Heartland Bank and Trust CompanyMarch 5, 2010August 23, 2012
Sun American BankBoca RatonFL27126First-Citizens Bank & Trust CompanyMarch 5, 2010August 23, 2012
Rainier Pacific BankTacomaWA38129Umpqua BankFebruary 26, 2010August 23, 2012
Carson River Community BankCarson CityNV58352Heritage Bank of NevadaFebruary 26, 2010January 15, 2013
La Jolla Bank, FSBLa JollaCA32423OneWest Bank, FSBFebruary 19, 2010August 24, 2012
George Washington Savings BankOrland ParkIL29952FirstMerit Bank, N.A.February 19, 2010August 24, 2012
The La Coste National BankLa CosteTX3287Community National BankFebruary 19, 2010September 14, 2012
Marco Community BankMarco IslandFL57586Mutual of Omaha BankFebruary 19, 2010August 24, 2012
1st American State Bank of MinnesotaHancockMN15448Community Development Bank, FSBFebruary 5, 2010August 24, 2012
American Marine BankBainbridge IslandWA16730Columbia State BankJanuary 29, 2010August 24, 2012
First Regional BankLos AngelesCA23011First-Citizens Bank & Trust CompanyJanuary 29, 2010August 24, 2012
Community Bank and TrustCorneliaGA5702SCBT National AssociationJanuary 29, 2010January 15, 2013
Marshall Bank, N.A.HallockMN16133United Valley BankJanuary 29, 2010August 23, 2012
Florida Community BankImmokaleeFL5672Premier American Bank, N.A.January 29, 2010January 15, 2013
First National Bank of GeorgiaCarrolltonGA16480Community & Southern BankJanuary 29, 2010December 13, 2012
Columbia River BankThe DallesOR22469Columbia State BankJanuary 22, 2010September 14, 2012
Evergreen BankSeattleWA20501Umpqua BankJanuary 22, 2010January 15, 2013
Charter BankSanta FeNM32498Charter BankJanuary 22, 2010August 23, 2012
Bank of LeetonLeetonMO8265Sunflower Bank, N.A.January 22, 2010January 15, 2013
Premier American BankMiamiFL57147Premier American Bank, N.A.January 22, 2010December 13, 2012
Barnes Banking CompanyKaysvilleUT1252No AcquirerJanuary 15, 2010August 23, 2012
St. Stephen State BankSt. StephenMN17522First State Bank of St. JosephJanuary 15, 2010August 23, 2012
Town Community Bank & TrustAntiochIL34705First American BankJanuary 15, 2010August 23, 2012
Horizon BankBellinghamWA22977Washington Federal Savings and Loan AssociationJanuary 8, 2010August 23, 2012
First Federal Bank of California, F.S.B.Santa MonicaCA28536OneWest Bank, FSBDecember 18, 2009August 23, 2012
Imperial Capital BankLa JollaCA26348City National BankDecember 18, 2009September 5, 2012
Independent Bankers' BankSpringfieldIL26820The Independent BankersBank (TIB)December 18, 2009August 23, 2012
New South Federal Savings BankIrondaleAL32276Beal BankDecember 18, 2009August 23, 2012
Citizens State BankNew BaltimoreMI1006No AcquirerDecember 18, 2009November 5, 2012
Peoples First Community BankPanama CityFL32167Hancock BankDecember 18, 2009November 5, 2012
RockBridge Commercial BankAtlantaGA58315No AcquirerDecember 18, 2009November 5, 2012
SolutionsBankOverland ParkKS4731Arvest BankDecember 11, 2009August 23, 2012
Valley Capital Bank, N.A.MesaAZ58399Enterprise Bank & TrustDecember 11, 2009August 23, 2012
Republic Federal Bank, N.A.MiamiFL228461st United BankDecember 11, 2009November 5, 2012
Greater Atlantic BankRestonVA32583SonabankDecember 4, 2009November 5, 2012
Benchmark BankAuroraIL10440MB Financial Bank, N.A.December 4, 2009August 23, 2012
AmTrust BankClevelandOH29776New York Community BankDecember 4, 2009November 5, 2012
The Tattnall BankReidsvilleGA12080Heritage Bank of the SouthDecember 4, 2009November 5, 2012
First Security National BankNorcrossGA26290State Bank and Trust CompanyDecember 4, 2009November 5, 2012
The Buckhead Community BankAtlantaGA34663State Bank and Trust CompanyDecember 4, 2009November 5, 2012
Commerce Bank of Southwest FloridaFort MyersFL58016Central BankNovember 20, 2009November 5, 2012
Pacific Coast National BankSan ClementeCA57914Sunwest BankNovember 13, 2009August 22, 2012
Orion BankNaplesFL22427IBERIABANKNovember 13, 2009November 5, 2012
Century Bank, F.S.B.SarasotaFL32267IBERIABANKNovember 13, 2009August 22, 2012
United Commercial BankSan FranciscoCA32469East West BankNovember 6, 2009November 5, 2012
Gateway Bank of St. LouisSt. LouisMO19450Central Bank of Kansas CityNovember 6, 2009August 22, 2012
Prosperan BankOakdaleMN35074Alerus Financial, N.A.November 6, 2009August 22, 2012
Home Federal Savings BankDetroitMI30329Liberty Bank and Trust CompanyNovember 6, 2009August 22, 2012
United Security BankSpartaGA22286Ameris BankNovember 6, 2009January 15, 2013
North Houston BankHoustonTX18776U.S. Bank N.A.October 30, 2009August 22, 2012
Madisonville State BankMadisonvilleTX33782U.S. Bank N.A.October 30, 2009August 22, 2012
Citizens National BankTeagueTX25222U.S. Bank N.A.October 30, 2009August 22, 2012
Park National BankChicagoIL11677U.S. Bank N.A.October 30, 2009August 22, 2012
Pacific National BankSan FranciscoCA30006U.S. Bank N.A.October 30, 2009August 22, 2012
California National BankLos AngelesCA34659U.S. Bank N.A.October 30, 2009September 5, 2012
San Diego National BankSan DiegoCA23594U.S. Bank N.A.October 30, 2009August 22, 2012
Community Bank of LemontLemontIL35291U.S. Bank N.A.October 30, 2009January 15, 2013
Bank USA, N.A.PhoenixAZ32218U.S. Bank N.A.October 30, 2009August 22, 2012
First DuPage BankWestmontIL35038First Midwest BankOctober 23, 2009August 22, 2012
Riverview Community BankOtsegoMN57525Central BankOctober 23, 2009August 22, 2012
Bank of ElmwoodRacineWI18321Tri City National BankOctober 23, 2009August 22, 2012
Flagship National BankBradentonFL35044First Federal Bank of FloridaOctober 23, 2009August 22, 2012
Hillcrest Bank FloridaNaplesFL58336Stonegate BankOctober 23, 2009August 22, 2012
American United BankLawrencevilleGA57794Ameris BankOctober 23, 2009September 5, 2012
Partners BankNaplesFL57959Stonegate BankOctober 23, 2009January 15, 2013
San Joaquin BankBakersfieldCA23266Citizens Business BankOctober 16, 2009August 22, 2012
Southern Colorado National BankPuebloCO57263Legacy BankOctober 2, 2009September 5, 2012
Jennings State BankSpring GroveMN11416Central BankOctober 2, 2009August 21, 2012
Warren BankWarrenMI34824The Huntington National BankOctober 2, 2009August 21, 2012
Georgian BankAtlantaGA57151First Citizens Bank and Trust Company, Inc.September 25, 2009August 21, 2012
Irwin Union Bank, F.S.B.LouisvilleKY57068First Financial Bank, N.A.September 18, 2009September 5, 2012
Irwin Union Bank and Trust CompanyColumbusIN10100First Financial Bank, N.A.September 18, 2009August 21, 2012
Venture BankLaceyWA22868First-Citizens Bank & Trust CompanySeptember 11, 2009August 21, 2012
Brickwell Community BankWoodburyMN57736CorTrust Bank N.A.September 11, 2009January 15, 2013
Corus Bank, N.A.ChicagoIL13693MB Financial Bank, N.A.September 11, 2009August 21, 2012
First State BankFlagstaffAZ34875Sunwest BankSeptember 4, 2009January 15, 2013
Platinum Community BankRolling MeadowsIL35030No AcquirerSeptember 4, 2009August 21, 2012
Vantus BankSioux CityIN27732Great Southern BankSeptember 4, 2009August 21, 2012
InBankOak ForestIL20203MB Financial Bank, N.A.September 4, 2009August 21, 2012
First Bank of Kansas CityKansas CityMO25231Great American BankSeptember 4, 2009August 21, 2012
Affinity BankVenturaCA27197Pacific Western BankAugust 28, 2009August 21, 2012
Mainstreet BankForest LakeMN1909Central BankAugust 28, 2009August 21, 2012
Bradford BankBaltimoreMD28312Manufacturers and Traders Trust Company (M&T Bank)August 28, 2009January 15, 2013
Guaranty BankAustinTX32618BBVA CompassAugust 21, 2009August 21, 2012
CapitalSouth BankBirminghamAL22130IBERIABANKAugust 21, 2009January 15, 2013
First Coweta BankNewnanGA57702United BankAugust 21, 2009January 15, 2013
ebankAtlantaGA34682Stearns Bank, N.A.August 21, 2009August 21, 2012
Community Bank of NevadaLas VegasNV34043No AcquirerAugust 14, 2009August 21, 2012
Community Bank of ArizonaPhoenixAZ57645MidFirst BankAugust 14, 2009August 21, 2012
Union Bank, National AssociationGilbertAZ34485MidFirst BankAugust 14, 2009August 21, 2012
Colonial BankMontgomeryAL9609Branch Banking & Trust Company, (BB&T)August 14, 2009September 5, 2012
Dwelling House Savings and Loan AssociationPittsburghPA31559PNC Bank, N.A.August 14, 2009January 15, 2013
Community First BankPrinevilleOR23268Home Federal BankAugust 7, 2009January 15, 2013
Community National Bank of Sarasota CountyVeniceFL27183Stearns Bank, N.A.August 7, 2009August 20, 2012
First State BankSarasotaFL27364Stearns Bank, N.A.August 7, 2009August 20, 2012
Mutual BankHarveyIL18659United Central BankJuly 31, 2009August 20, 2012
First BankAmericanoElizabethNJ34270Crown BankJuly 31, 2009August 20, 2012
Peoples Community BankWest ChesterOH32288First Financial Bank, N.A.July 31, 2009August 20, 2012
Integrity BankJupiterFL57604Stonegate BankJuly 31, 2009August 20, 2012
First State Bank of AltusAltusOK9873Herring BankJuly 31, 2009August 20, 2012
Security Bank of Jones CountyGrayGA8486State Bank and Trust CompanyJuly 24, 2009August 20, 2012
Security Bank of Houston CountyPerryGA27048State Bank and Trust CompanyJuly 24, 2009August 20, 2012
Security Bank of Bibb CountyMaconGA27367State Bank and Trust CompanyJuly 24, 2009August 20, 2012
Security Bank of North MetroWoodstockGA57105State Bank and Trust CompanyJuly 24, 2009August 20, 2012
Security Bank of North FultonAlpharettaGA57430State Bank and Trust CompanyJuly 24, 2009August 20, 2012
Security Bank of Gwinnett CountySuwaneeGA57346State Bank and Trust CompanyJuly 24, 2009August 20, 2012
Waterford Village BankWilliamsvilleNY58065Evans Bank, N.A.July 24, 2009August 20, 2012
Temecula Valley BankTemeculaCA34341First-Citizens Bank & Trust CompanyJuly 17, 2009August 20, 2012
Vineyard BankRancho CucamongaCA23556California Bank & TrustJuly 17, 2009August 20, 2012
BankFirstSioux FallsSD34103Alerus Financial, N.A.July 17, 2009August 20, 2012
First Piedmont BankWinderGA34594First American Bank and Trust CompanyJuly 17, 2009January 15, 2013
Bank of WyomingThermopolisWY22754Central Bank & TrustJuly 10, 2009August 20, 2012
Founders BankWorthIL18390The PrivateBank and Trust CompanyJuly 2, 2009August 20, 2012
Millennium State Bank of TexasDallasTX57667State Bank of TexasJuly 2, 2009October 26, 2012
First National Bank of DanvilleDanvilleIL3644First Financial Bank, N.A.July 2, 2009August 20, 2012
Elizabeth State BankElizabethIL9262Galena State Bank and Trust CompanyJuly 2, 2009August 20, 2012
Rock River BankOregonIL15302The Harvard State BankJuly 2, 2009August 20, 2012
First State Bank of WinchesterWinchesterIL11710The First National Bank of BeardstownJuly 2, 2009August 20, 2012
John Warner BankClintonIL12093State Bank of LincolnJuly 2, 2009August 20, 2012
Mirae BankLos AngelesCA57332Wilshire State BankJune 26, 2009August 20, 2012
MetroPacific BankIrvineCA57893Sunwest BankJune 26, 2009August 20, 2012
Horizon BankPine CityMN9744Stearns Bank, N.A.June 26, 2009August 20, 2012
Neighborhood Community BankNewnanGA35285CharterBankJune 26, 2009August 20, 2012
Community Bank of West GeorgiaVilla RicaGA57436No AcquirerJune 26, 2009August 17, 2012
First National Bank of AnthonyAnthonyKS4614Bank of KansasJune 19, 2009August 17, 2012
Cooperative BankWilmingtonNC27837First BankJune 19, 2009August 17, 2012
Southern Community BankFayettevilleGA35251United Community BankJune 19, 2009August 17, 2012
Bank of LincolnwoodLincolnwoodIL17309Republic Bank of ChicagoJune 5, 2009August 17, 2012
Citizens National BankMacombIL5757Morton Community BankMay 22, 2009September 4, 2012
Strategic Capital BankChampaignIL35175Midland States BankMay 22, 2009September 4, 2012
BankUnited, FSBCoral GablesFL32247BankUnitedMay 21, 2009August 17, 2012
Westsound BankBremertonWA34843Kitsap BankMay 8, 2009September 4, 2012
America West BankLaytonUT35461Cache Valley BankMay 1, 2009August 17, 2012
Citizens Community BankRidgewoodNJ57563North Jersey Community BankMay 1, 2009September 4, 2012
Silverton Bank, NAAtlantaGA26535No AcquirerMay 1, 2009August 17, 2012
First Bank of IdahoKetchumID34396U.S. Bank, N.A.April 24, 2009August 17, 2012
First Bank of Beverly HillsCalabasasCA32069No AcquirerApril 24, 2009September 4, 2012
Michigan Heritage BankFarmington HillsMI34369Level One BankApril 24, 2009August 17, 2012
American Southern BankKennesawGA57943Bank of North GeorgiaApril 24, 2009August 17, 2012
Great Basin Bank of NevadaElkoNV33824Nevada State BankApril 17, 2009September 4, 2012
American Sterling BankSugar CreekMO8266Metcalf BankApril 17, 2009August 31, 2012
New Frontier BankGreeleyCO34881No AcquirerApril 10, 2009September 4, 2012
Cape Fear BankWilmingtonNC34639First Federal Savings and Loan AssociationApril 10, 2009August 17, 2012
Omni National BankAtlantaGA22238No AcquirerMarch 27, 2009August 17, 2012
TeamBank, NAPaolaKS4754Great Southern BankMarch 20, 2009August 17, 2012
Colorado National BankColorado SpringsCO18896Herring BankMarch 20, 2009August 17, 2012
FirstCity BankStockbridgeGA18243No AcquirerMarch 20, 2009August 17, 2012
Freedom Bank of GeorgiaCommerceGA57558Northeast Georgia BankMarch 6, 2009August 17, 2012
Security Savings BankHendersonNV34820Bank of NevadaFebruary 27, 2009September 7, 2012
Heritage Community BankGlenwoodIL20078MB Financial Bank, N.A.February 27, 2009August 17, 2012
Silver Falls BankSilvertonOR35399Citizens BankFebruary 20, 2009August 17, 2012
Pinnacle Bank of OregonBeavertonOR57342Washington Trust Bank of SpokaneFebruary 13, 2009August 17, 2012
Corn Belt Bank & Trust Co.PittsfieldIL16500The Carlinville National BankFebruary 13, 2009August 17, 2012
Riverside Bank of the Gulf CoastCape CoralFL34563TIB BankFebruary 13, 2009August 17, 2012
Sherman County BankLoup CityNE5431Heritage BankFebruary 13, 2009August 17, 2012
County BankMercedCA22574Westamerica BankFebruary 6, 2009September 4, 2012
Alliance BankCulver CityCA23124California Bank & TrustFebruary 6, 2009August 16, 2012
FirstBank Financial ServicesMcDonoughGA57017Regions BankFebruary 6, 2009August 16, 2012
Ocala National BankOcalaFL26538CenterState Bank of Florida, N.A.January 30, 2009September 4, 2012
Suburban FSBCroftonMD30763Bank of EssexJanuary 30, 2009August 16, 2012
MagnetBankSalt Lake CityUT58001No AcquirerJanuary 30, 2009August 16, 2012
1st Centennial BankRedlandsCA33025First California BankJanuary 23, 2009August 16, 2012
Bank of Clark CountyVancouverWA34959Umpqua BankJanuary 16, 2009August 16, 2012
National Bank of CommerceBerkeleyIL19733Republic Bank of ChicagoJanuary 16, 2009August 16, 2012
Sanderson State Bank
En Espanol
SandersonTX11568The Pecos County State BankDecember 12, 2008September 4, 2012
Haven Trust BankDuluthGA35379Branch Banking & Trust Company, (BB&T)December 12, 2008August 16, 2012
First Georgia Community BankJacksonGA34301United BankDecember 5, 2008August 16, 2012
PFF Bank & TrustPomonaCA28344U.S. Bank, N.A.November 21, 2008January 4, 2013
Downey Savings & LoanNewport BeachCA30968U.S. Bank, N.A.November 21, 2008January 4, 2013
Community BankLoganvilleGA16490Bank of EssexNovember 21, 2008September 4, 2012
Security Pacific BankLos AngelesCA23595Pacific Western BankNovember 7, 2008August 28, 2012
Franklin Bank, SSBHoustonTX26870Prosperity BankNovember 7, 2008August 16, 2012
Freedom BankBradentonFL57930Fifth Third BankOctober 31, 2008August 16, 2012
Alpha Bank & TrustAlpharettaGA58241Stearns Bank, N.A.October 24, 2008August 16, 2012
Meridian BankEldredIL13789National BankOctober 10, 2008May 31, 2012
Main Street BankNorthvilleMI57654Monroe Bank & TrustOctober 10, 2008August 16, 2012
Washington Mutual Bank
(Including its subsidiary Washington Mutual Bank FSB)
HendersonNV32633JP Morgan Chase BankSeptember 25, 2008August 16, 2012
AmeribankNorthforkWV6782The Citizens Savings Bank

Pioneer Community Bank, Inc.
September 19, 2008August 16, 2012
Silver State Bank
En Espanol
HendersonNV34194Nevada State BankSeptember 5, 2008August 16, 2012
Integrity BankAlpharettaGA35469Regions BankAugust 29, 2008August 16, 2012
Columbian Bank & TrustTopekaKS22728Citizens Bank & TrustAugust 22, 2008August 16, 2012
First Priority BankBradentonFL57523SunTrust BankAugust 1, 2008August 16, 2012
First Heritage Bank, NANewport BeachCA57961Mutual of Omaha BankJuly 25, 2008August 28, 2012
First National Bank of NevadaRenoNV27011Mutual of Omaha BankJuly 25, 2008August 28, 2012
IndyMac BankPasadenaCA29730OneWest Bank, FSBJuly 11, 2008August 28, 2012
First Integrity Bank, NAStaplesMN12736First International Bank and TrustMay 30, 2008August 28, 2012
ANB Financial, NABentonvilleAR33901Pulaski Bank and Trust CompanyMay 9, 2008August 28, 2012
Hume BankHumeMO1971Security BankMarch 7, 2008August 28, 2012
Douglass National BankKansas CityMO24660Liberty Bank and Trust CompanyJanuary 25, 2008October 26, 2012
Miami Valley BankLakeviewOH16848The Citizens Banking CompanyOctober 4, 2007August 28, 2012
NetBankAlpharettaGA32575ING DIRECTSeptember 28, 2007August 28, 2012
Metropolitan Savings BankPittsburghPA35353Allegheny Valley Bank of PittsburghFebruary 2, 2007October 27, 2010
Bank of EphraimEphraimUT1249Far West BankJune 25, 2004April 9, 2008
Reliance BankWhite PlainsNY26778Union State BankMarch 19, 2004April 9, 2008
Guaranty National Bank of TallahasseeTallahasseeFL26838Hancock Bank of FloridaMarch 12, 2004June 5, 2012
Dollar Savings BankNewarkNJ31330No AcquirerFebruary 14, 2004April 9, 2008
Pulaski Savings BankPhiladelphiaPA27203Earthstar BankNovember 14, 2003July 22, 2005
First National Bank of BlanchardvilleBlanchardvilleWI11639The Park BankMay 9, 2003June 5, 2012
Southern Pacific BankTorranceCA27094Beal BankFebruary 7, 2003October 20, 2008
Farmers Bank of CheneyvilleCheneyvilleLA16445Sabine State Bank & TrustDecember 17, 2002October 20, 2004
Bank of AlamoAlamoTN9961No AcquirerNovember 8, 2002March 18, 2005
AmTrade International Bank
En Espanol
AtlantaGA33784No AcquirerSeptember 30, 2002September 11, 2006
Universal Federal Savings BankChicagoIL29355Chicago Community BankJune 27, 2002April 9, 2008
Connecticut Bank of CommerceStamfordCT19183Hudson United BankJune 26, 2002February 14, 2012
New Century BankShelby TownshipMI34979No AcquirerMarch 28, 2002March 18, 2005
Net 1st National BankBoca RatonFL26652Bank Leumi USAMarch 1, 2002April 9, 2008
NextBank, NAPhoenixAZ22314No AcquirerFebruary 7, 2002August 27, 2010
Oakwood Deposit Bank Co.OakwoodOH8966The State Bank & Trust CompanyFebruary 1, 2002October 25, 2012
Bank of Sierra BlancaSierra BlancaTX22002The Security State Bank of PecosJanuary 18, 2002November 6, 2003
Hamilton Bank, NA
En Espanol
MiamiFL24382Israel Discount Bank of New YorkJanuary 11, 2002June 5, 2012
Sinclair National BankGravetteAR34248Delta Trust & BankSeptember 7, 2001February 10, 2004
Superior Bank, FSBHinsdaleIL32646Superior Federal, FSBJuly 27, 2001June 5, 2012
Malta National BankMaltaOH6629North Valley BankMay 3, 2001November 18, 2002
First Alliance Bank & Trust Co.ManchesterNH34264Southern New Hampshire Bank & TrustFebruary 2, 2001February 18, 2003
National State Bank of MetropolisMetropolisIL3815Banterra Bank of MarionDecember 14, 2000March 17, 2005
Bank of HonoluluHonoluluHI21029Bank of the OrientOctober 13, 2000March 17, 2005
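The listing above appears to be the FDIC failed-bank table that one of the patches in this series adds as an HTML test fixture. Purely as an illustration of how such a fixture is typically consumed -- assuming the consumer is pandas' HTML table reader, which this excerpt does not show -- a minimal sketch:

    # Sketch only, not code from the patch series. 'banklist.html' is a
    # hypothetical filename standing in for the fixture listed above.
    # pandas.read_html needs an HTML parser installed (lxml, or
    # beautifulsoup4 + html5lib).
    import pandas as pd

    tables = pd.read_html('banklist.html')   # one DataFrame per <table> in the file
    banks = tables[0]
    print(banks.shape)                        # (n_rows, 7): Bank Name, City, ST, CERT, ...
    print(banks['ST'].value_counts().head())  # failures per state, using the ST column above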
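The two patches that follow rework how read_csv matches numeric na_values: the first stringifies float sentinels so that 999.0 also matches the strings '999' and '999.0', and the second adds a parallel set of float na_fvalues that is compared against the parsed float value itself. A short sketch of the user-visible behavior they are driving at (the file contents and the -999 sentinel are invented for illustration, mirroring the scenario the patches' tests exercise):

    # Sketch only; not code from the patches themselves.
    from io import StringIO
    import pandas as pd

    # '-999.000' cannot be caught by exact string matching against '-999' or
    # '-999.0'; matching on the parsed float (what na_fvalues enables) can.
    data = "A B C\n-999.000 1.5 2.0\n3.0 -999 4.5\n"
    df = pd.read_csv(StringIO(data), sep=' ', na_values=[-999])
    print(df.isnull().sum().sum())  # 2 missing cells once float-value matching is in place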
From 03d5325fe1a2fddad00b5d2074be4b7f930e967d Mon Sep 17 00:00:00 2001
From: jreback
Date: Mon, 10 Jun 2013 16:35:50 -0400
Subject: [PATCH 24/71] BUG: GH3611 fix again, float na_values were not stringified correctly now, 999.0 (a float) will have: ['999','999.0'] added for matching

---
 pandas/io/parsers.py            | 8 +++++++-
 pandas/io/tests/test_parsers.py | 9 +++++++++
 2 files changed, 16 insertions(+), 1 deletion(-)

diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
index 6e937ba696e39..cc8d7f64af6c7 100644
--- a/pandas/io/parsers.py
+++ b/pandas/io/parsers.py
@@ -1777,7 +1777,13 @@ def _stringify_na_values(na_values):
         result.append(str(x))
         result.append(x)
         try:
-            result.append(float(x))
+            v = float(x)
+
+            # we are like 999 here
+            if v == int(v):
+                v = int(v)
+                result.append("%s.0" % v)
+                result.append(str(v))
         except:
             pass
         try:
diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py
index cae4c0902a97c..0efb4c09ee950 100644
--- a/pandas/io/tests/test_parsers.py
+++ b/pandas/io/tests/test_parsers.py
@@ -540,6 +540,15 @@ def test_non_string_na_values(self):
             tm.assert_frame_equal(result1,result2)
             tm.assert_frame_equal(result2,result3)

+            result4 = read_csv(path, sep= ' ', header=0, na_values=['-999.0'])
+            result5 = read_csv(path, sep= ' ', header=0, na_values=['-999'])
+            result6 = read_csv(path, sep= ' ', header=0, na_values=[-999.0])
+            result7 = read_csv(path, sep= ' ', header=0, na_values=[-999])
+            tm.assert_frame_equal(result4,result3)
+            tm.assert_frame_equal(result5,result3)
+            tm.assert_frame_equal(result6,result3)
+            tm.assert_frame_equal(result7,result3)
+
     def test_custom_na_values(self):
         data = """A,B,C
 ignore,this,row

From c840591754e8db4889f0b19d609c0cc76a091064 Mon Sep 17 00:00:00 2001
From: jreback
Date: Tue, 11 Jun 2013 09:37:55 -0400
Subject: [PATCH 25/71] BUG: add na_fvalues to parsers to enable matching of float values w/o relying on string matching for an exact match

---
 pandas/io/parsers.py            | 100 ++++++++++++++++++++------------
 pandas/io/tests/test_parsers.py | 21 +++++++
 pandas/parser.pyx               | 51 ++++++++++------
 3 files changed, 118 insertions(+), 54 deletions(-)

diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
index cc8d7f64af6c7..e4fb478a2a288 100644
--- a/pandas/io/parsers.py
+++ b/pandas/io/parsers.py
@@ -297,6 +297,7 @@ def parser_f(filepath_or_buffer,
              skipfooter=None,
              skip_footer=0,
              na_values=None,
+             na_fvalues=None,
              true_values=None,
              false_values=None,
              delimiter=None,
@@ -359,6 +360,7 @@ def parser_f(filepath_or_buffer,
                 prefix=prefix,
                 skiprows=skiprows,
                 na_values=na_values,
+                na_fvalues=na_fvalues,
                 true_values=true_values,
                 false_values=false_values,
                 keep_default_na=keep_default_na,
@@ -554,7 +556,7 @@ def _clean_options(self, options, engine):
             converters = {}

         # Converting values to NA
-        na_values = _clean_na_values(na_values, keep_default_na)
+        na_values, na_fvalues = _clean_na_values(na_values, keep_default_na)

         if com.is_integer(skiprows):
             skiprows = range(skiprows)
@@ -565,6 +567,7 @@ def _clean_options(self, options, engine):
         result['names'] = names
         result['converters'] = converters
         result['na_values'] = na_values
+        result['na_fvalues'] = na_fvalues
         result['skiprows'] = skiprows

         return result, engine
@@ -644,6 +647,7 @@ def __init__(self, kwds):
         self.keep_date_col = kwds.pop('keep_date_col', False)

         self.na_values = kwds.get('na_values')
+        self.na_fvalues = kwds.get('na_fvalues')
         self.true_values = kwds.get('true_values')
         self.false_values = kwds.get('false_values')
         self.tupleize_cols = kwds.get('tupleize_cols',True)
@@ -837,31 +841,34 @@ def _agg_index(self, index, try_parse_dates=True):
                 arr = self._date_conv(arr)

             col_na_values = self.na_values
+            col_na_fvalues = self.na_fvalues

             if isinstance(self.na_values, dict):
                 col_name = self.index_names[i]
                 if col_name is not None:
-                    col_na_values = _get_na_values(col_name,
-                                                   self.na_values)
-
-            arr, _ = self._convert_types(arr, col_na_values)
+                    col_na_values, col_na_fvalues = _get_na_values(col_name,
+                                                                   self.na_values,
+                                                                   self.na_fvalues)
+
+            arr, _ = self._convert_types(arr, col_na_values | col_na_fvalues)
             arrays.append(arr)

         index = MultiIndex.from_arrays(arrays, names=self.index_names)

         return index

-    def _convert_to_ndarrays(self, dct, na_values, verbose=False,
+    def _convert_to_ndarrays(self, dct, na_values, na_fvalues, verbose=False,
                              converters=None):
         result = {}
         for c, values in dct.iteritems():
             conv_f = None if converters is None else converters.get(c, None)
-            col_na_values = _get_na_values(c, na_values)
+            col_na_values, col_na_fvalues = _get_na_values(c, na_values, na_fvalues)
             coerce_type = True
             if conv_f is not None:
                 values = lib.map_infer(values, conv_f)
                 coerce_type = False
-            cvals, na_count = self._convert_types(values, col_na_values,
+            cvals, na_count = self._convert_types(values,
+                                                  set(col_na_values) | col_na_fvalues,
                                                   coerce_type)
             result[c] = cvals
             if verbose and na_count:
@@ -1370,7 +1377,7 @@ def _convert_data(self, data):
                 col = self.orig_names[col]
             clean_conv[col] = f

-        return self._convert_to_ndarrays(data, self.na_values, self.verbose,
+        return self._convert_to_ndarrays(data, self.na_values, self.na_fvalues, self.verbose,
                                          clean_conv)

     def _infer_columns(self):
@@ -1754,43 +1761,26 @@ def _try_convert_dates(parser, colspec, data_dict, columns):

 def _clean_na_values(na_values, keep_default_na=True):
+
     if na_values is None and keep_default_na:
         na_values = _NA_VALUES
+        na_fvalues = set()
     elif isinstance(na_values, dict):
         if keep_default_na:
             for k, v in na_values.iteritems():
                 v = set(list(v)) | _NA_VALUES
                 na_values[k] = v
+        na_fvalues = dict([ (k, _floatify_na_values(v)) for k, v in na_values.items() ])
     else:
         if not com.is_list_like(na_values):
             na_values = [na_values]
-        na_values = set(_stringify_na_values(na_values))
+        na_values = _stringify_na_values(na_values)
         if keep_default_na:
             na_values = na_values | _NA_VALUES
-    return na_values
+        na_fvalues = _floatify_na_values(na_values)

-def _stringify_na_values(na_values):
-    """ return a stringified and numeric for these values """
-    result = []
-    for x in na_values:
-        result.append(str(x))
-        result.append(x)
-        try:
-            v = float(x)
-
-            # we are like 999 here
-            if v == int(v):
-                v = int(v)
-                result.append("%s.0" % v)
-                result.append(str(v))
-        except:
-            pass
-        try:
-            result.append(int(x))
-        except:
-            pass
-    return result
+    return na_values, na_fvalues

 def _clean_index_names(columns, index_col):
     if not _is_index_col(index_col):
@@ -1838,14 +1828,52 @@ def _get_empty_meta(columns, index_col, index_names):

     return index, columns, {}

-def _get_na_values(col, na_values):
+def _floatify_na_values(na_values):
+    # create float versions of the na_values
+    result = set()
+    for v in na_values:
+        try:
+            v = float(v)
+            if not np.isnan(v):
+                result.add(v)
+        except:
+            pass
+    return result
+
+def _stringify_na_values(na_values):
+    """ return a stringified and numeric for these values """
+    result = []
+    for x in na_values:
+        result.append(str(x))
+        result.append(x)
+        try:
+            v = float(x)
+
+            # we are like 999 here
+            if v == int(v):
+                v = int(v)
+                result.append("%s.0" % v)
+                result.append(str(v))
+
+ result.append(v) + except: + pass + try: + result.append(int(x)) + except: + pass + return set(result) + +def _get_na_values(col, na_values, na_fvalues): if isinstance(na_values, dict): if col in na_values: - return set(_stringify_na_values(list(na_values[col]))) + values = na_values[col] + fvalues = na_fvalues[col] + return na_values[col], na_fvalues[col] else: - return _NA_VALUES + return _NA_VALUES, set() else: - return na_values + return na_values, na_fvalues def _get_col_names(colspec, columns): diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py index 0efb4c09ee950..cc2dddd829302 100644 --- a/pandas/io/tests/test_parsers.py +++ b/pandas/io/tests/test_parsers.py @@ -549,6 +549,27 @@ def test_non_string_na_values(self): tm.assert_frame_equal(result6,result3) tm.assert_frame_equal(result7,result3) + good_compare = result3 + + # with an odd float format, so we can't match the string 999.0 exactly, + # but need float matching + df.to_csv(path, sep=' ', index=False, float_format = '%.3f') + result1 = read_csv(path, sep= ' ', header=0, na_values=['-999.0','-999']) + result2 = read_csv(path, sep= ' ', header=0, na_values=[-999,-999.0]) + result3 = read_csv(path, sep= ' ', header=0, na_values=[-999.0,-999]) + tm.assert_frame_equal(result1,good_compare) + tm.assert_frame_equal(result2,good_compare) + tm.assert_frame_equal(result3,good_compare) + + result4 = read_csv(path, sep= ' ', header=0, na_values=['-999.0']) + result5 = read_csv(path, sep= ' ', header=0, na_values=['-999']) + result6 = read_csv(path, sep= ' ', header=0, na_values=[-999.0]) + result7 = read_csv(path, sep= ' ', header=0, na_values=[-999]) + tm.assert_frame_equal(result4,good_compare) + tm.assert_frame_equal(result5,good_compare) + tm.assert_frame_equal(result6,good_compare) + tm.assert_frame_equal(result7,good_compare) + def test_custom_na_values(self): data = """A,B,C ignore,this,row diff --git a/pandas/parser.pyx b/pandas/parser.pyx index 004c23d09ccdf..eaa588ef4d150 100644 --- a/pandas/parser.pyx +++ b/pandas/parser.pyx @@ -231,7 +231,7 @@ cdef class TextReader: cdef: parser_t *parser - object file_handle + object file_handle, na_fvalues bint factorize, na_filter, verbose, has_usecols, has_mi_columns int parser_start list clocks @@ -294,6 +294,7 @@ cdef class TextReader: na_filter=True, na_values=None, + na_fvalues=None, true_values=None, false_values=None, @@ -391,6 +392,9 @@ cdef class TextReader: self.delim_whitespace = delim_whitespace self.na_values = na_values + if na_fvalues is None: + na_fvalues = set() + self.na_fvalues = na_fvalues self.true_values = _maybe_encode(true_values) self.false_values = _maybe_encode(false_values) @@ -834,7 +838,7 @@ cdef class TextReader: Py_ssize_t i, nused, ncols kh_str_t *na_hashset = NULL int start, end - object name + object name, na_flist bint na_filter = 0 start = self.parser_start @@ -863,8 +867,9 @@ cdef class TextReader: conv = self._get_converter(i, name) # XXX + na_flist = set() if self.na_filter: - na_list = self._get_na_list(i, name) + na_list, na_flist = self._get_na_list(i, name) if na_list is None: na_filter = 0 else: @@ -880,7 +885,7 @@ cdef class TextReader: # Should return as the desired dtype (inferred or specified) col_res, na_count = self._convert_tokens(i, start, end, name, - na_filter, na_hashset) + na_filter, na_hashset, na_flist) if na_filter: self._free_na_set(na_hashset) @@ -906,7 +911,8 @@ cdef class TextReader: cdef inline _convert_tokens(self, Py_ssize_t i, int start, int end, object name, bint na_filter, - kh_str_t 
*na_hashset): + kh_str_t *na_hashset, + object na_flist): cdef: object col_dtype = None @@ -930,7 +936,7 @@ cdef class TextReader: col_dtype = np.dtype(col_dtype).str return self._convert_with_dtype(col_dtype, i, start, end, - na_filter, 1, na_hashset) + na_filter, 1, na_hashset, na_flist) if i in self.noconvert: return self._string_convert(i, start, end, na_filter, na_hashset) @@ -939,10 +945,10 @@ cdef class TextReader: for dt in dtype_cast_order: try: col_res, na_count = self._convert_with_dtype( - dt, i, start, end, na_filter, 0, na_hashset) + dt, i, start, end, na_filter, 0, na_hashset, na_flist) except OverflowError: col_res, na_count = self._convert_with_dtype( - '|O8', i, start, end, na_filter, 0, na_hashset) + '|O8', i, start, end, na_filter, 0, na_hashset, na_flist) if col_res is not None: break @@ -953,7 +959,8 @@ cdef class TextReader: int start, int end, bint na_filter, bint user_dtype, - kh_str_t *na_hashset): + kh_str_t *na_hashset, + object na_flist): cdef kh_str_t *true_set, *false_set if dtype[1] == 'i' or dtype[1] == 'u': @@ -969,7 +976,7 @@ cdef class TextReader: elif dtype[1] == 'f': result, na_count = _try_double(self.parser, i, start, end, - na_filter, na_hashset) + na_filter, na_hashset, na_flist) if dtype[1:] != 'f8': result = result.astype(dtype) @@ -1060,7 +1067,7 @@ cdef class TextReader: cdef _get_na_list(self, i, name): if self.na_values is None: - return None + return None, set() if isinstance(self.na_values, dict): values = None @@ -1068,18 +1075,23 @@ cdef class TextReader: values = self.na_values[name] if values is not None and not isinstance(values, list): values = list(values) + fvalues = self.na_fvalues[name] + if fvalues is not None and not isinstance(fvalues, set): + fvalues = set(fvalues) else: if i in self.na_values: - return self.na_values[i] + return self.na_values[i], self.na_fvalues[i] else: - return _NA_VALUES + return _NA_VALUES, set() - return _ensure_encoded(values) + return _ensure_encoded(values), fvalues else: if not isinstance(self.na_values, list): self.na_values = list(self.na_values) + if not isinstance(self.na_fvalues, set): + self.na_fvalues = set(self.na_fvalues) - return _ensure_encoded(self.na_values) + return _ensure_encoded(self.na_values), self.na_fvalues cdef _free_na_set(self, kh_str_t *table): kh_destroy_str(table) @@ -1163,8 +1175,6 @@ def _maybe_upcast(arr): # ---------------------------------------------------------------------- # Type conversions / inference support code - - cdef _string_box_factorize(parser_t *parser, int col, int line_start, int line_end, bint na_filter, kh_str_t *na_hashset): @@ -1357,7 +1367,7 @@ cdef char* cinf = b'inf' cdef char* cneginf = b'-inf' cdef _try_double(parser_t *parser, int col, int line_start, int line_end, - bint na_filter, kh_str_t *na_hashset): + bint na_filter, kh_str_t *na_hashset, object na_flist): cdef: int error, na_count = 0 size_t i, lines @@ -1367,6 +1377,7 @@ cdef _try_double(parser_t *parser, int col, int line_start, int line_end, double NA = na_values[np.float64] ndarray result khiter_t k + bint use_na_flist = len(na_flist) > 0 lines = line_end - line_start result = np.empty(lines, dtype=np.float64) @@ -1391,6 +1402,10 @@ cdef _try_double(parser_t *parser, int col, int line_start, int line_end, data[0] = NEGINF else: return None, None + if use_na_flist: + if data[0] in na_flist: + na_count += 1 + data[0] = NA data += 1 else: for i in range(lines): From e31f8397519281ebc356e03c78394cca9e7406bb Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Sat, 11 May 2013 18:34:32 
-0700 Subject: [PATCH 26/71] ENH: pull pandasjson back into pandas --- pandas/core/frame.py | 100 ++ pandas/core/series.py | 82 ++ pandas/io/tests/test_json/test_pandas.py | 240 +++++ pandas/io/tests/test_json/test_ujson.py | 1230 ++++++++++++++++++++++ pandas/src/ujson/lib/ultrajson.h | 298 ++++++ pandas/src/ujson/python/py_defines.h | 15 + pandas/src/ujson/python/version.h | 1 + setup.py | 21 + 8 files changed, 1987 insertions(+) create mode 100644 pandas/io/tests/test_json/test_pandas.py create mode 100644 pandas/io/tests/test_json/test_ujson.py create mode 100644 pandas/src/ujson/lib/ultrajson.h create mode 100644 pandas/src/ujson/python/py_defines.h create mode 100644 pandas/src/ujson/python/version.h diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 9c0a2843370f4..2925bb3e3b73a 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5593,6 +5593,106 @@ def mask(self, cond): """ return self.where(~cond, NA) + +@classmethod +def from_json(cls, json, orient="columns", dtype=None, numpy=True): + """ + Convert JSON string to DataFrame + + Parameters + ---------- + json : The JSON string to parse. + orient : {'split', 'records', 'index', 'columns', 'values'}, + default 'columns' + The format of the JSON string + split : dict like + {index -> [index], columns -> [columns], data -> [values]} + records : list like [{column -> value}, ... , {column -> value}] + index : dict like {index -> {column -> value}} + columns : dict like {column -> {index -> value}} + values : just the values array + dtype : dtype of the resulting DataFrame + nupmpy: direct decoding to numpy arrays. default True but falls back + to standard decoding if a problem occurs. + + Returns + ------- + result : DataFrame + """ + from pandas.json import loads + + df = None + + if dtype is not None and orient == "split": + numpy = False + + if numpy: + try: + if orient == "columns": + args = loads(json, dtype=dtype, numpy=True, labelled=True) + if args: + args = (args[0].T, args[2], args[1]) + df = DataFrame(*args) + elif orient == "split": + decoded = loads(json, dtype=dtype, numpy=True) + decoded = dict((str(k), v) for k, v in decoded.iteritems()) + df = DataFrame(**decoded) + elif orient == "values": + df = DataFrame(loads(json, dtype=dtype, numpy=True)) + else: + df = DataFrame(*loads(json, dtype=dtype, numpy=True, + labelled=True)) + except ValueError: + numpy = False + if not numpy: + if orient == "columns": + df = DataFrame(loads(json), dtype=dtype) + elif orient == "split": + decoded = dict((str(k), v) + for k, v in loads(json).iteritems()) + df = DataFrame(dtype=dtype, **decoded) + elif orient == "index": + df = DataFrame(loads(json), dtype=dtype).T + else: + df = DataFrame(loads(json), dtype=dtype) + + return df +DataFrame.from_json = from_json + + +def to_json(self, orient="columns", double_precision=10, + force_ascii=True): + """ + Convert DataFrame to a JSON string. + + Note NaN's and None will be converted to null and datetime objects + will be converted to UNIX timestamps. + + Parameters + ---------- + orient : {'split', 'records', 'index', 'columns', 'values'}, + default 'columns' + The format of the JSON string + split : dict like + {index -> [index], columns -> [columns], data -> [values]} + records : list like [{column -> value}, ... 
, {column -> value}] + index : dict like {index -> {column -> value}} + columns : dict like {column -> {index -> value}} + values : just the values array + double_precision : The number of decimal places to use when encoding + floating point values, default 10. + force_ascii : force encoded string to be ASCII, default True. + + Returns + ------- + result : JSON compatible string + """ + from pandas.json import dumps + return dumps(self, orient=orient, double_precision=double_precision, + ensure_ascii=force_ascii) +DataFrame.to_json = to_json + + _EMPTY_SERIES = Series([]) diff --git a/pandas/core/series.py b/pandas/core/series.py index 3a7a7d0f49b66..9147e64f5b11a 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -3298,6 +3298,88 @@ def str(self): from pandas.core.strings import StringMethods return StringMethods(self) + +@classmethod +def from_json(cls, json, orient="index", dtype=None, numpy=True): + """ + Convert JSON string to Series + + Parameters + ---------- + json : The JSON string to parse. + orient : {'split', 'records', 'index'}, default 'index' + The format of the JSON string + split : dict like + {index -> [index], name -> name, data -> [values]} + records : list like [value, ... , value] + index : dict like {index -> value} + dtype : dtype of the resulting Series + nupmpy: direct decoding to numpy arrays. default True but falls back + to standard decoding if a problem occurs. + + Returns + ------- + result : Series + """ + from pandas.json import loads + s = None + + if dtype is not None and orient == "split": + numpy = False + + if numpy: + try: + if orient == "split": + decoded = loads(json, dtype=dtype, numpy=True) + decoded = dict((str(k), v) for k, v in decoded.iteritems()) + s = Series(**decoded) + elif orient == "columns" or orient == "index": + s = Series(*loads(json, dtype=dtype, numpy=True, + labelled=True)) + else: + s = Series(loads(json, dtype=dtype, numpy=True)) + except ValueError: + numpy = False + if not numpy: + if orient == "split": + decoded = dict((str(k), v) + for k, v in loads(json).iteritems()) + s = Series(dtype=dtype, **decoded) + else: + s = Series(loads(json), dtype=dtype) + + return s +Series.from_json = from_json + +def to_json(self, orient="index", double_precision=10, force_ascii=True): + """ + Convert Series to a JSON string + + Note NaN's and None will be converted to null and datetime objects + will be converted to UNIX timestamps. + + Parameters + ---------- + orient : {'split', 'records', 'index'}, default 'index' + The format of the JSON string + split : dict like + {index -> [index], name -> name, data -> [values]} + records : list like [value, ... , value] + index : dict like {index -> value} + double_precision : The number of decimal places to use when encoding + floating point values, default 10. + force_ascii : force encoded string to be ASCII, default True. 
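As a usage sketch (assuming a build with this patch applied; the exact key ordering in the encoded strings may vary), the new hooks round-trip like this:

    from pandas import DataFrame, Series

    df = DataFrame([[1, 2], [3, 4]], index=['a', 'b'], columns=['x', 'y'])
    df.to_json()                  # '{"x":{"a":1,"b":3},"y":{"a":2,"b":4}}'
    df.to_json(orient="split")    # columns, index and data in a single dict
    df.to_json(orient="records")  # '[{"x":1,"y":2},{"x":3,"y":4}]'

    # 'records' and 'values' drop the index; the other orients preserve it
    DataFrame.from_json(df.to_json(orient="split"), orient="split")

    s = Series([10, 20], index=['a', 'b'])
    Series.from_json(s.to_json())  # default orient="index" round-trips the values
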
+ + Returns + ------- + result : JSON compatible string + """ + from pandas.json import dumps + return dumps(self, orient=orient, double_precision=double_precision, + ensure_ascii=force_ascii) +Series.to_json = to_json + + _INDEX_TYPES = ndarray, Index, list, tuple #------------------------------------------------------------------------------ diff --git a/pandas/io/tests/test_json/test_pandas.py b/pandas/io/tests/test_json/test_pandas.py new file mode 100644 index 0000000000000..506aa382487d6 --- /dev/null +++ b/pandas/io/tests/test_json/test_pandas.py @@ -0,0 +1,240 @@ +# pylint: disable-msg=W0612,E1101 +from copy import deepcopy +from datetime import datetime, timedelta +from StringIO import StringIO +import cPickle as pickle +import operator +import os +import unittest + +import numpy as np + +from pandas import Series, DataFrame, DatetimeIndex +import pandas as pd + +from pandas.util.testing import (assert_almost_equal, assert_frame_equal, + assert_series_equal) +import pandas.util.testing as tm + +_seriesd = tm.getSeriesData() +_tsd = tm.getTimeSeriesData() + +_frame = DataFrame(_seriesd) +_frame2 = DataFrame(_seriesd, columns=['D', 'C', 'B', 'A']) +_intframe = DataFrame(dict((k, v.astype(int)) + for k, v in _seriesd.iteritems())) + +_tsframe = DataFrame(_tsd) + +_mixed_frame = _frame.copy() + + +class TestPandasObjects(unittest.TestCase): + + def setUp(self): + self.ts = tm.makeTimeSeries() + self.ts.name = 'ts' + + self.series = tm.makeStringSeries() + self.series.name = 'series' + + self.objSeries = tm.makeObjectSeries() + self.objSeries.name = 'objects' + + self.empty_series = Series([], index=[]) + self.empty_frame = DataFrame({}) + + self.frame = _frame.copy() + self.frame2 = _frame2.copy() + self.intframe = _intframe.copy() + self.tsframe = _tsframe.copy() + self.mixed_frame = _mixed_frame.copy() + + def test_frame_from_json_to_json(self): + + def _check_orient(df, orient, dtype=None, numpy=True): + df = df.sort() + dfjson = df.to_json(orient=orient) + unser = DataFrame.from_json(dfjson, orient=orient, dtype=dtype, + numpy=numpy) + unser = unser.sort() + if df.index.dtype.type == np.datetime64: + unser.index = DatetimeIndex(unser.index.values.astype('i8')) + if orient == "records": + # index is not captured in this orientation + assert_almost_equal(df.values, unser.values) + self.assert_(df.columns.equals(unser.columns)) + elif orient == "values": + # index and cols are not captured in this orientation + assert_almost_equal(df.values, unser.values) + elif orient == "split": + # index and col labels might not be strings + unser.index = [str(i) for i in unser.index] + unser.columns = [str(i) for i in unser.columns] + unser = unser.sort() + assert_almost_equal(df.values, unser.values) + else: + assert_frame_equal(df, unser) + + def _check_all_orients(df, dtype=None): + _check_orient(df, "columns", dtype=dtype) + _check_orient(df, "records", dtype=dtype) + _check_orient(df, "split", dtype=dtype) + _check_orient(df, "index", dtype=dtype) + _check_orient(df, "values", dtype=dtype) + + _check_orient(df, "columns", dtype=dtype, numpy=False) + _check_orient(df, "records", dtype=dtype, numpy=False) + _check_orient(df, "split", dtype=dtype, numpy=False) + _check_orient(df, "index", dtype=dtype, numpy=False) + _check_orient(df, "values", dtype=dtype, numpy=False) + + # basic + _check_all_orients(self.frame) + self.assertEqual(self.frame.to_json(), + self.frame.to_json(orient="columns")) + + _check_all_orients(self.intframe, dtype=self.intframe.values.dtype) + + # big one + # index and 
columns are strings as all unserialised JSON object keys + # are assumed to be strings + biggie = DataFrame(np.zeros((200, 4)), + columns=[str(i) for i in range(4)], + index=[str(i) for i in range(200)]) + _check_all_orients(biggie) + + # dtypes + _check_all_orients(DataFrame(biggie, dtype=np.float64), + dtype=np.float64) + _check_all_orients(DataFrame(biggie, dtype=np.int), dtype=np.int) + _check_all_orients(DataFrame(biggie, dtype='= 3 + else partial(json.dumps, encoding="utf-8")) + +class UltraJSONTests(TestCase): + def test_encodeDictWithUnicodeKeys(self): + input = { u"key1": u"value1", u"key1": u"value1", u"key1": u"value1", u"key1": u"value1", u"key1": u"value1", u"key1": u"value1" } + output = ujson.encode(input) + + input = { u"بن": u"value1", u"بن": u"value1", u"بن": u"value1", u"بن": u"value1", u"بن": u"value1", u"بن": u"value1", u"بن": u"value1" } + output = ujson.encode(input) + + pass + + def test_encodeDoubleConversion(self): + input = math.pi + output = ujson.encode(input) + self.assertEquals(round(input, 5), round(json.loads(output), 5)) + self.assertEquals(round(input, 5), round(ujson.decode(output), 5)) + + def test_encodeWithDecimal(self): + input = 1.0 + output = ujson.encode(input) + self.assertEquals(output, "1.0") + + def test_encodeDoubleNegConversion(self): + input = -math.pi + output = ujson.encode(input) + self.assertEquals(round(input, 5), round(json.loads(output), 5)) + self.assertEquals(round(input, 5), round(ujson.decode(output), 5)) + + def test_encodeArrayOfNestedArrays(self): + input = [[[[]]]] * 20 + output = ujson.encode(input) + self.assertEquals(input, json.loads(output)) + #self.assertEquals(output, json.dumps(input)) + self.assertEquals(input, ujson.decode(output)) + input = np.array(input) + assert_array_equal(input, ujson.decode(output, numpy=True, dtype=input.dtype)) + + def test_encodeArrayOfDoubles(self): + input = [ 31337.31337, 31337.31337, 31337.31337, 31337.31337] * 10 + output = ujson.encode(input) + self.assertEquals(input, json.loads(output)) + #self.assertEquals(output, json.dumps(input)) + self.assertEquals(input, ujson.decode(output)) + assert_array_equal(np.array(input), ujson.decode(output, numpy=True)) + + def test_doublePrecisionTest(self): + input = 30.012345678901234 + output = ujson.encode(input, double_precision = 15) + self.assertEquals(input, json.loads(output)) + self.assertEquals(input, ujson.decode(output)) + + output = ujson.encode(input, double_precision = 9) + self.assertEquals(round(input, 9), json.loads(output)) + self.assertEquals(round(input, 9), ujson.decode(output)) + + output = ujson.encode(input, double_precision = 3) + self.assertEquals(round(input, 3), json.loads(output)) + self.assertEquals(round(input, 3), ujson.decode(output)) + + output = ujson.encode(input) + self.assertEquals(round(input, 5), json.loads(output)) + self.assertEquals(round(input, 5), ujson.decode(output)) + + def test_invalidDoublePrecision(self): + input = 30.12345678901234567890 + output = ujson.encode(input, double_precision = 20) + # should snap to the max, which is 15 + self.assertEquals(round(input, 15), json.loads(output)) + self.assertEquals(round(input, 15), ujson.decode(output)) + + output = ujson.encode(input, double_precision = -1) + # also should snap to the max, which is 15 + self.assertEquals(round(input, 15), json.loads(output)) + self.assertEquals(round(input, 15), ujson.decode(output)) + + # will throw typeError + self.assertRaises(TypeError, ujson.encode, input, double_precision = '9') + # will throw typeError + 
self.assertRaises(TypeError, ujson.encode, input, double_precision = None) + + + def test_encodeStringConversion(self): + input = "A string \\ / \b \f \n \r \t" + output = ujson.encode(input) + self.assertEquals(input, json.loads(output)) + self.assertEquals(output, '"A string \\\\ \\/ \\b \\f \\n \\r \\t"') + self.assertEquals(input, ujson.decode(output)) + pass + + def test_decodeUnicodeConversion(self): + pass + + def test_encodeUnicodeConversion1(self): + input = "Räksmörgås اسامة بن محمد بن عوض بن لادن" + enc = ujson.encode(input) + dec = ujson.decode(enc) + self.assertEquals(enc, json_unicode(input)) + self.assertEquals(dec, json.loads(enc)) + + def test_encodeControlEscaping(self): + input = "\x19" + enc = ujson.encode(input) + dec = ujson.decode(enc) + self.assertEquals(input, dec) + self.assertEquals(enc, json_unicode(input)) + + + def test_encodeUnicodeConversion2(self): + input = "\xe6\x97\xa5\xd1\x88" + enc = ujson.encode(input) + dec = ujson.decode(enc) + self.assertEquals(enc, json_unicode(input)) + self.assertEquals(dec, json.loads(enc)) + + def test_encodeUnicodeSurrogatePair(self): + _skip_if_python_ver(2, 5) + _skip_if_python_ver(2, 6) + input = "\xf0\x90\x8d\x86" + enc = ujson.encode(input) + dec = ujson.decode(enc) + + self.assertEquals(enc, json_unicode(input)) + self.assertEquals(dec, json.loads(enc)) + + def test_encodeUnicode4BytesUTF8(self): + _skip_if_python_ver(2, 5) + _skip_if_python_ver(2, 6) + input = "\xf0\x91\x80\xb0TRAILINGNORMAL" + enc = ujson.encode(input) + dec = ujson.decode(enc) + + self.assertEquals(enc, json_unicode(input)) + self.assertEquals(dec, json.loads(enc)) + + def test_encodeUnicode4BytesUTF8Highest(self): + _skip_if_python_ver(2, 5) + _skip_if_python_ver(2, 6) + input = "\xf3\xbf\xbf\xbfTRAILINGNORMAL" + enc = ujson.encode(input) + + dec = ujson.decode(enc) + + self.assertEquals(enc, json_unicode(input)) + self.assertEquals(dec, json.loads(enc)) + + + def test_encodeArrayInArray(self): + input = [[[[]]]] + output = ujson.encode(input) + + self.assertEquals(input, json.loads(output)) + self.assertEquals(output, json.dumps(input)) + self.assertEquals(input, ujson.decode(output)) + assert_array_equal(np.array(input), ujson.decode(output, numpy=True)) + pass + + def test_encodeIntConversion(self): + input = 31337 + output = ujson.encode(input) + self.assertEquals(input, json.loads(output)) + self.assertEquals(output, json.dumps(input)) + self.assertEquals(input, ujson.decode(output)) + pass + + def test_encodeIntNegConversion(self): + input = -31337 + output = ujson.encode(input) + self.assertEquals(input, json.loads(output)) + self.assertEquals(output, json.dumps(input)) + self.assertEquals(input, ujson.decode(output)) + pass + + + def test_encodeLongNegConversion(self): + input = -9223372036854775808 + output = ujson.encode(input) + + outputjson = json.loads(output) + outputujson = ujson.decode(output) + + self.assertEquals(input, json.loads(output)) + self.assertEquals(output, json.dumps(input)) + self.assertEquals(input, ujson.decode(output)) + pass + + def test_encodeListConversion(self): + input = [ 1, 2, 3, 4 ] + output = ujson.encode(input) + self.assertEquals(input, json.loads(output)) + self.assertEquals(input, ujson.decode(output)) + assert_array_equal(np.array(input), ujson.decode(output, numpy=True)) + pass + + def test_encodeDictConversion(self): + input = { "k1": 1, "k2": 2, "k3": 3, "k4": 4 } + output = ujson.encode(input) + self.assertEquals(input, json.loads(output)) + self.assertEquals(input, ujson.decode(output)) + 
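The double_precision contract exercised by the tests above, in sketch form (using the bundled encoder through pandas.json; the literal output strings are indicative only):

    import math
    from pandas import json as ujson

    ujson.encode(math.pi, double_precision=3)   # e.g. '3.142'
    ujson.encode(math.pi)                       # default keeps 10 decimal places
    ujson.encode(math.pi, double_precision=20)  # out-of-range settings clamp to 15
    # non-integer settings such as '9' or None raise TypeError
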
self.assertEquals(input, ujson.decode(output)) + pass + + def test_encodeNoneConversion(self): + input = None + output = ujson.encode(input) + self.assertEquals(input, json.loads(output)) + self.assertEquals(output, json.dumps(input)) + self.assertEquals(input, ujson.decode(output)) + pass + + def test_encodeTrueConversion(self): + input = True + output = ujson.encode(input) + self.assertEquals(input, json.loads(output)) + self.assertEquals(output, json.dumps(input)) + self.assertEquals(input, ujson.decode(output)) + pass + + def test_encodeFalseConversion(self): + input = False + output = ujson.encode(input) + self.assertEquals(input, json.loads(output)) + self.assertEquals(output, json.dumps(input)) + self.assertEquals(input, ujson.decode(output)) + pass + + # def test_encodeDatetimeConversion(self): + # ts = time.time() + # input = datetime.datetime.fromtimestamp(ts) + # output = ujson.encode(input) + # expected = calendar.timegm(input.utctimetuple()) + # self.assertEquals(int(expected), json.loads(output)) + # self.assertEquals(int(expected), ujson.decode(output)) + # pass + + # def test_encodeDateConversion(self): + # ts = time.time() + # input = datetime.date.fromtimestamp(ts) + + # output = ujson.encode(input) + # tup = ( input.year, input.month, input.day, 0, 0, 0 ) + + # expected = calendar.timegm(tup) + # self.assertEquals(int(expected), json.loads(output)) + # self.assertEquals(int(expected), ujson.decode(output)) + + def test_datetime_nanosecond_unit(self): + from datetime import datetime + from pandas.lib import Timestamp + + val = datetime.now() + stamp = Timestamp(val) + + roundtrip = ujson.decode(ujson.encode(val)) + self.assert_(roundtrip == stamp.value) + + def test_encodeToUTF8(self): + _skip_if_python_ver(2, 5) + input = "\xe6\x97\xa5\xd1\x88" + enc = ujson.encode(input, ensure_ascii=False) + dec = ujson.decode(enc) + self.assertEquals(enc, json_unicode(input, ensure_ascii=False)) + self.assertEquals(dec, json.loads(enc)) + + def test_decodeFromUnicode(self): + input = u"{\"obj\": 31337}" + dec1 = ujson.decode(input) + dec2 = ujson.decode(str(input)) + self.assertEquals(dec1, dec2) + + def test_encodeRecursionMax(self): + # 8 is the max recursion depth + + class O2: + member = 0 + pass + + class O1: + member = 0 + pass + + input = O1() + input.member = O2() + input.member.member = input + + try: + output = ujson.encode(input) + assert False, "Expected overflow exception" + except(OverflowError): + pass + + def test_encodeDoubleNan(self): + input = np.nan + assert ujson.encode(input) == 'null', "Expected null" + + def test_encodeDoubleInf(self): + input = np.inf + assert ujson.encode(input) == 'null', "Expected null" + + def test_encodeDoubleNegInf(self): + input = -np.inf + assert ujson.encode(input) == 'null', "Expected null" + + + def test_decodeJibberish(self): + input = "fdsa sda v9sa fdsa" + try: + ujson.decode(input) + assert False, "Expected exception!" + except(ValueError): + return + assert False, "Wrong exception" + + def test_decodeBrokenArrayStart(self): + input = "[" + try: + ujson.decode(input) + assert False, "Expected exception!" + except(ValueError): + return + assert False, "Wrong exception" + + def test_decodeBrokenObjectStart(self): + input = "{" + try: + ujson.decode(input) + assert False, "Expected exception!" + except(ValueError): + return + assert False, "Wrong exception" + + def test_decodeBrokenArrayEnd(self): + input = "]" + try: + ujson.decode(input) + assert False, "Expected exception!" 
+ except(ValueError): + return + assert False, "Wrong exception" + + def test_decodeBrokenObjectEnd(self): + input = "}" + try: + ujson.decode(input) + assert False, "Expected exception!" + except(ValueError): + return + assert False, "Wrong exception" + + def test_decodeStringUnterminated(self): + input = "\"TESTING" + try: + ujson.decode(input) + assert False, "Expected exception!" + except(ValueError): + return + assert False, "Wrong exception" + + def test_decodeStringUntermEscapeSequence(self): + input = "\"TESTING\\\"" + try: + ujson.decode(input) + assert False, "Expected exception!" + except(ValueError): + return + assert False, "Wrong exception" + + def test_decodeStringBadEscape(self): + input = "\"TESTING\\\"" + try: + ujson.decode(input) + assert False, "Expected exception!" + except(ValueError): + return + assert False, "Wrong exception" + + def test_decodeTrueBroken(self): + input = "tru" + try: + ujson.decode(input) + assert False, "Expected exception!" + except(ValueError): + return + assert False, "Wrong exception" + + def test_decodeFalseBroken(self): + input = "fa" + try: + ujson.decode(input) + assert False, "Expected exception!" + except(ValueError): + return + assert False, "Wrong exception" + + def test_decodeNullBroken(self): + input = "n" + try: + ujson.decode(input) + assert False, "Expected exception!" + except(ValueError): + return + assert False, "Wrong exception" + + + def test_decodeBrokenDictKeyTypeLeakTest(self): + input = '{{1337:""}}' + for x in xrange(1000): + try: + ujson.decode(input) + assert False, "Expected exception!" + except(ValueError),e: + continue + + assert False, "Wrong exception" + + def test_decodeBrokenDictLeakTest(self): + input = '{{"key":"}' + for x in xrange(1000): + try: + ujson.decode(input) + assert False, "Expected exception!" + except(ValueError): + continue + + assert False, "Wrong exception" + + def test_decodeBrokenListLeakTest(self): + input = '[[[true' + for x in xrange(1000): + try: + ujson.decode(input) + assert False, "Expected exception!" + except(ValueError): + continue + + assert False, "Wrong exception" + + def test_decodeDictWithNoKey(self): + input = "{{{{31337}}}}" + try: + ujson.decode(input) + assert False, "Expected exception!" + except(ValueError): + return + + assert False, "Wrong exception" + + def test_decodeDictWithNoColonOrValue(self): + input = "{{{{\"key\"}}}}" + try: + ujson.decode(input) + assert False, "Expected exception!" + except(ValueError): + return + + assert False, "Wrong exception" + + def test_decodeDictWithNoValue(self): + input = "{{{{\"key\":}}}}" + try: + ujson.decode(input) + assert False, "Expected exception!" 
+ except(ValueError): + return + + assert False, "Wrong exception" + + def test_decodeNumericIntPos(self): + input = "31337" + self.assertEquals (31337, ujson.decode(input)) + + def test_decodeNumericIntNeg(self): + input = "-31337" + self.assertEquals (-31337, ujson.decode(input)) + + def test_encodeUnicode4BytesUTF8Fail(self): + _skip_if_python_ver(3) + input = "\xfd\xbf\xbf\xbf\xbf\xbf" + try: + enc = ujson.encode(input) + assert False, "Expected exception" + except OverflowError: + pass + + def test_encodeNullCharacter(self): + input = "31337 \x00 1337" + output = ujson.encode(input) + self.assertEquals(input, json.loads(output)) + self.assertEquals(output, json.dumps(input)) + self.assertEquals(input, ujson.decode(output)) + + input = "\x00" + output = ujson.encode(input) + self.assertEquals(input, json.loads(output)) + self.assertEquals(output, json.dumps(input)) + self.assertEquals(input, ujson.decode(output)) + + self.assertEquals('" \\u0000\\r\\n "', ujson.dumps(u" \u0000\r\n ")) + pass + + def test_decodeNullCharacter(self): + input = "\"31337 \\u0000 31337\"" + self.assertEquals(ujson.decode(input), json.loads(input)) + + + def test_encodeListLongConversion(self): + input = [9223372036854775807, 9223372036854775807, 9223372036854775807, + 9223372036854775807, 9223372036854775807, 9223372036854775807 ] + output = ujson.encode(input) + self.assertEquals(input, json.loads(output)) + self.assertEquals(input, ujson.decode(output)) + assert_array_equal(np.array(input), ujson.decode(output, numpy=True, + dtype=np.int64)) + pass + + def test_encodeLongConversion(self): + input = 9223372036854775807 + output = ujson.encode(input) + self.assertEquals(input, json.loads(output)) + self.assertEquals(output, json.dumps(input)) + self.assertEquals(input, ujson.decode(output)) + pass + + def test_numericIntExp(self): + input = "1337E40" + output = ujson.decode(input) + self.assertEquals(output, json.loads(input)) + + def test_numericIntFrcExp(self): + input = "1.337E40" + output = ujson.decode(input) + self.assertAlmostEqual(output, json.loads(input)) + + def test_decodeNumericIntExpEPLUS(self): + input = "1337E+40" + output = ujson.decode(input) + self.assertAlmostEqual(output, json.loads(input)) + + def test_decodeNumericIntExpePLUS(self): + input = "1.337e+40" + output = ujson.decode(input) + self.assertAlmostEqual(output, json.loads(input)) + + def test_decodeNumericIntExpE(self): + input = "1337E40" + output = ujson.decode(input) + self.assertAlmostEqual(output, json.loads(input)) + + def test_decodeNumericIntExpe(self): + input = "1337e40" + output = ujson.decode(input) + self.assertAlmostEqual(output, json.loads(input)) + + def test_decodeNumericIntExpEMinus(self): + input = "1.337E-4" + output = ujson.decode(input) + self.assertAlmostEqual(output, json.loads(input)) + + def test_decodeNumericIntExpeMinus(self): + input = "1.337e-4" + output = ujson.decode(input) + self.assertAlmostEqual(output, json.loads(input)) + + def test_dumpToFile(self): + f = StringIO.StringIO() + ujson.dump([1, 2, 3], f) + self.assertEquals("[1,2,3]", f.getvalue()) + + def test_dumpToFileLikeObject(self): + class filelike: + def __init__(self): + self.bytes = '' + def write(self, bytes): + self.bytes += bytes + f = filelike() + ujson.dump([1, 2, 3], f) + self.assertEquals("[1,2,3]", f.bytes) + + def test_dumpFileArgsError(self): + try: + ujson.dump([], '') + except TypeError: + pass + else: + assert False, 'expected TypeError' + + def test_loadFile(self): + f = StringIO.StringIO("[1,2,3,4]") + 
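A short sketch of the file-based entry points and the direct-to-numpy decode path exercised by the surrounding tests (Python 2 StringIO, as used in the test module; assumes the bundled encoder is importable as pandas.json):

    import StringIO
    from pandas import json as ujson

    buf = StringIO.StringIO()
    ujson.dump([1, 2, 3], buf)      # buf.getvalue() == '[1,2,3]'

    arr = ujson.load(StringIO.StringIO("[1,2,3,4]"), numpy=True)
    # arr compares equal to numpy.array([1, 2, 3, 4])
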
self.assertEquals([1, 2, 3, 4], ujson.load(f)) + f = StringIO.StringIO("[1,2,3,4]") + assert_array_equal(np.array([1, 2, 3, 4]), ujson.load(f, numpy=True)) + + def test_loadFileLikeObject(self): + class filelike: + def read(self): + try: + self.end + except AttributeError: + self.end = True + return "[1,2,3,4]" + f = filelike() + self.assertEquals([1, 2, 3, 4], ujson.load(f)) + f = filelike() + assert_array_equal(np.array([1, 2, 3, 4]), ujson.load(f, numpy=True)) + + def test_loadFileArgsError(self): + try: + ujson.load("[]") + except TypeError: + pass + else: + assert False, "expected TypeError" + + def test_version(self): + assert re.match(r'^\d+\.\d+(\.\d+)?$', ujson.__version__), \ + "ujson.__version__ must be a string like '1.4.0'" + + def test_encodeNumericOverflow(self): + try: + ujson.encode(12839128391289382193812939) + except OverflowError: + pass + else: + assert False, "expected OverflowError" + + def test_encodeNumericOverflowNested(self): + for n in xrange(0, 100): + class Nested: + x = 12839128391289382193812939 + + nested = Nested() + + try: + ujson.encode(nested) + except OverflowError: + pass + else: + assert False, "expected OverflowError" + + def test_decodeNumberWith32bitSignBit(self): + #Test that numbers that fit within 32 bits but would have the + # sign bit set (2**31 <= x < 2**32) are decoded properly. + boundary1 = 2**31 + boundary2 = 2**32 + docs = ( + '{"id": 3590016419}', + '{"id": %s}' % 2**31, + '{"id": %s}' % 2**32, + '{"id": %s}' % ((2**32)-1), + ) + results = (3590016419, 2**31, 2**32, 2**32-1) + for doc,result in zip(docs, results): + self.assertEqual(ujson.decode(doc)['id'], result) + + def test_encodeBigEscape(self): + for x in xrange(10): + if py3compat.PY3: + base = '\u00e5'.encode('utf-8') + else: + base = "\xc3\xa5" + input = base * 1024 * 1024 * 2 + output = ujson.encode(input) + + def test_decodeBigEscape(self): + for x in xrange(10): + if py3compat.PY3: + base = '\u00e5'.encode('utf-8') + else: + base = "\xc3\xa5" + quote = py3compat.str_to_bytes("\"") + input = quote + (base * 1024 * 1024 * 2) + quote + output = ujson.decode(input) + + def test_toDict(self): + d = {u"key": 31337} + + class DictTest: + def toDict(self): + return d + + o = DictTest() + output = ujson.encode(o) + dec = ujson.decode(output) + self.assertEquals(dec, d) + + +class NumpyJSONTests(TestCase): + + def testBool(self): + b = np.bool(True) + self.assertEqual(ujson.decode(ujson.encode(b)), b) + + def testBoolArray(self): + inpt = np.array([True, False, True, True, False, True, False , False], + dtype=np.bool) + outp = np.array(ujson.decode(ujson.encode(inpt)), dtype=np.bool) + assert_array_equal(inpt, outp) + + def testInt(self): + num = np.int(2562010) + self.assertEqual(np.int(ujson.decode(ujson.encode(num))), num) + + num = np.int8(127) + self.assertEqual(np.int8(ujson.decode(ujson.encode(num))), num) + + num = np.int16(2562010) + self.assertEqual(np.int16(ujson.decode(ujson.encode(num))), num) + + num = np.int32(2562010) + self.assertEqual(np.int32(ujson.decode(ujson.encode(num))), num) + + num = np.int64(2562010) + self.assertEqual(np.int64(ujson.decode(ujson.encode(num))), num) + + num = np.uint8(255) + self.assertEqual(np.uint8(ujson.decode(ujson.encode(num))), num) + + num = np.uint16(2562010) + self.assertEqual(np.uint16(ujson.decode(ujson.encode(num))), num) + + num = np.uint32(2562010) + self.assertEqual(np.uint32(ujson.decode(ujson.encode(num))), num) + + num = np.uint64(2562010) + self.assertEqual(np.uint64(ujson.decode(ujson.encode(num))), num) + + def 
testIntArray(self): + arr = np.arange(100, dtype=np.int) + dtypes = (np.int, np.int8, np.int16, np.int32, np.int64, + np.uint, np.uint8, np.uint16, np.uint32, np.uint64) + for dtype in dtypes: + inpt = arr.astype(dtype) + outp = np.array(ujson.decode(ujson.encode(inpt)), dtype=dtype) + assert_array_equal(inpt, outp) + + def testIntMax(self): + num = np.int(np.iinfo(np.int).max) + self.assertEqual(np.int(ujson.decode(ujson.encode(num))), num) + + num = np.int8(np.iinfo(np.int8).max) + self.assertEqual(np.int8(ujson.decode(ujson.encode(num))), num) + + num = np.int16(np.iinfo(np.int16).max) + self.assertEqual(np.int16(ujson.decode(ujson.encode(num))), num) + + num = np.int32(np.iinfo(np.int32).max) + self.assertEqual(np.int32(ujson.decode(ujson.encode(num))), num) + + num = np.uint8(np.iinfo(np.uint8).max) + self.assertEqual(np.uint8(ujson.decode(ujson.encode(num))), num) + + num = np.uint16(np.iinfo(np.uint16).max) + self.assertEqual(np.uint16(ujson.decode(ujson.encode(num))), num) + + num = np.uint32(np.iinfo(np.uint32).max) + self.assertEqual(np.uint32(ujson.decode(ujson.encode(num))), num) + + if platform.architecture()[0] != '32bit': + num = np.int64(np.iinfo(np.int64).max) + self.assertEqual(np.int64(ujson.decode(ujson.encode(num))), num) + + # uint64 max will always overflow as it's encoded to signed + num = np.uint64(np.iinfo(np.int64).max) + self.assertEqual(np.uint64(ujson.decode(ujson.encode(num))), num) + + def testFloat(self): + num = np.float(256.2013) + self.assertEqual(np.float(ujson.decode(ujson.encode(num))), num) + + num = np.float32(256.2013) + self.assertEqual(np.float32(ujson.decode(ujson.encode(num))), num) + + num = np.float64(256.2013) + self.assertEqual(np.float64(ujson.decode(ujson.encode(num))), num) + + def testFloatArray(self): + arr = np.arange(12.5, 185.72, 1.7322, dtype=np.float) + dtypes = (np.float, np.float32, np.float64) + + for dtype in dtypes: + inpt = arr.astype(dtype) + outp = np.array(ujson.decode(ujson.encode(inpt, double_precision=15)), dtype=dtype) + assert_array_almost_equal_nulp(inpt, outp) + + def testFloatMax(self): + num = np.float(np.finfo(np.float).max/10) + assert_approx_equal(np.float(ujson.decode(ujson.encode(num))), num, 15) + + num = np.float32(np.finfo(np.float32).max/10) + assert_approx_equal(np.float32(ujson.decode(ujson.encode(num))), num, 15) + + num = np.float64(np.finfo(np.float64).max/10) + assert_approx_equal(np.float64(ujson.decode(ujson.encode(num))), num, 15) + + def testArrays(self): + arr = np.arange(100); + + arr = arr.reshape((10, 10)) + assert_array_equal(np.array(ujson.decode(ujson.encode(arr))), arr) + assert_array_equal(ujson.decode(ujson.encode(arr), numpy=True), arr) + + arr = arr.reshape((5, 5, 4)) + assert_array_equal(np.array(ujson.decode(ujson.encode(arr))), arr) + assert_array_equal(ujson.decode(ujson.encode(arr), numpy=True), arr) + + arr = arr.reshape((100, 1)) + assert_array_equal(np.array(ujson.decode(ujson.encode(arr))), arr) + assert_array_equal(ujson.decode(ujson.encode(arr), numpy=True), arr) + + arr = np.arange(96); + arr = arr.reshape((2, 2, 2, 2, 3, 2)) + assert_array_equal(np.array(ujson.decode(ujson.encode(arr))), arr) + assert_array_equal(ujson.decode(ujson.encode(arr), numpy=True), arr) + + l = ['a', list(), dict(), dict(), list(), + 42, 97.8, ['a', 'b'], {'key': 'val'}] + arr = np.array(l) + assert_array_equal(np.array(ujson.decode(ujson.encode(arr))), arr) + + arr = np.arange(100.202, 200.202, 1, dtype=np.float32); + arr = arr.reshape((5, 5, 4)) + outp = 
np.array(ujson.decode(ujson.encode(arr)), dtype=np.float32) + assert_array_almost_equal_nulp(arr, outp) + outp = ujson.decode(ujson.encode(arr), numpy=True, dtype=np.float32) + assert_array_almost_equal_nulp(arr, outp) + + def testArrayNumpyExcept(self): + + input = ujson.dumps([42, {}, 'a']) + try: + ujson.decode(input, numpy=True) + assert False, "Expected exception!" + except(TypeError): + pass + except: + assert False, "Wrong exception" + + input = ujson.dumps(['a', 'b', [], 'c']) + try: + ujson.decode(input, numpy=True) + assert False, "Expected exception!" + except(ValueError): + pass + except: + assert False, "Wrong exception" + + input = ujson.dumps([['a'], 42]) + try: + ujson.decode(input, numpy=True) + assert False, "Expected exception!" + except(ValueError): + pass + except: + assert False, "Wrong exception" + + input = ujson.dumps([42, ['a'], 42]) + try: + ujson.decode(input, numpy=True) + assert False, "Expected exception!" + except(ValueError): + pass + except: + assert False, "Wrong exception" + + input = ujson.dumps([{}, []]) + try: + ujson.decode(input, numpy=True) + assert False, "Expected exception!" + except(ValueError): + pass + except: + assert False, "Wrong exception" + + input = ujson.dumps([42, None]) + try: + ujson.decode(input, numpy=True) + assert False, "Expected exception!" + except(TypeError): + pass + except: + assert False, "Wrong exception" + + input = ujson.dumps([{'a': 'b'}]) + try: + ujson.decode(input, numpy=True, labelled=True) + assert False, "Expected exception!" + except(ValueError): + pass + except: + assert False, "Wrong exception" + + input = ujson.dumps({'a': {'b': {'c': 42}}}) + try: + ujson.decode(input, numpy=True, labelled=True) + assert False, "Expected exception!" + except(ValueError): + pass + except: + assert False, "Wrong exception" + + input = ujson.dumps([{'a': 42, 'b': 23}, {'c': 17}]) + try: + ujson.decode(input, numpy=True, labelled=True) + assert False, "Expected exception!" 
+ except(ValueError): + pass + except: + assert False, "Wrong exception" + + def testArrayNumpyLabelled(self): + input = {'a': []} + output = ujson.loads(ujson.dumps(input), numpy=True, labelled=True) + self.assertTrue((np.empty((1, 0)) == output[0]).all()) + self.assertTrue((np.array(['a']) == output[1]).all()) + self.assertTrue(output[2] is None) + + input = [{'a': 42}] + output = ujson.loads(ujson.dumps(input), numpy=True, labelled=True) + self.assertTrue((np.array([42]) == output[0]).all()) + self.assertTrue(output[1] is None) + self.assertTrue((np.array([u'a']) == output[2]).all()) + + input = [{'a': 42, 'b':31}, {'a': 24, 'c': 99}, {'a': 2.4, 'b': 78}] + output = ujson.loads(ujson.dumps(input), numpy=True, labelled=True) + expectedvals = np.array([42, 31, 24, 99, 2.4, 78], dtype=int).reshape((3,2)) + self.assertTrue((expectedvals == output[0]).all()) + self.assertTrue(output[1] is None) + self.assertTrue((np.array([u'a', 'b']) == output[2]).all()) + + + input = {1: {'a': 42, 'b':31}, 2: {'a': 24, 'c': 99}, 3: {'a': 2.4, 'b': 78}} + output = ujson.loads(ujson.dumps(input), numpy=True, labelled=True) + expectedvals = np.array([42, 31, 24, 99, 2.4, 78], dtype=int).reshape((3,2)) + self.assertTrue((expectedvals == output[0]).all()) + self.assertTrue((np.array(['1','2','3']) == output[1]).all()) + self.assertTrue((np.array(['a', 'b']) == output[2]).all()) + +class PandasJSONTests(TestCase): + + def testDataFrame(self): + df = DataFrame([[1,2,3], [4,5,6]], index=['a', 'b'], columns=['x', 'y', 'z']) + + # column indexed + outp = DataFrame(ujson.decode(ujson.encode(df))) + self.assertTrue((df == outp).values.all()) + assert_array_equal(df.columns, outp.columns) + assert_array_equal(df.index, outp.index) + + dec = _clean_dict(ujson.decode(ujson.encode(df, orient="split"))) + outp = DataFrame(**dec) + self.assertTrue((df == outp).values.all()) + assert_array_equal(df.columns, outp.columns) + assert_array_equal(df.index, outp.index) + + outp = DataFrame(ujson.decode(ujson.encode(df, orient="records"))) + outp.index = df.index + self.assertTrue((df == outp).values.all()) + assert_array_equal(df.columns, outp.columns) + + outp = DataFrame(ujson.decode(ujson.encode(df, orient="values"))) + outp.index = df.index + self.assertTrue((df.values == outp.values).all()) + + outp = DataFrame(ujson.decode(ujson.encode(df, orient="index"))) + self.assertTrue((df.transpose() == outp).values.all()) + assert_array_equal(df.transpose().columns, outp.columns) + assert_array_equal(df.transpose().index, outp.index) + + + def testDataFrameNumpy(self): + df = DataFrame([[1,2,3], [4,5,6]], index=['a', 'b'], columns=['x', 'y', 'z']) + + # column indexed + outp = DataFrame(ujson.decode(ujson.encode(df), numpy=True)) + self.assertTrue((df == outp).values.all()) + assert_array_equal(df.columns, outp.columns) + assert_array_equal(df.index, outp.index) + + dec = _clean_dict(ujson.decode(ujson.encode(df, orient="split"), + numpy=True)) + outp = DataFrame(**dec) + self.assertTrue((df == outp).values.all()) + assert_array_equal(df.columns, outp.columns) + assert_array_equal(df.index, outp.index) + + outp = DataFrame(ujson.decode(ujson.encode(df, orient="index"), numpy=True)) + self.assertTrue((df.transpose() == outp).values.all()) + assert_array_equal(df.transpose().columns, outp.columns) + assert_array_equal(df.transpose().index, outp.index) + + def testDataFrameNested(self): + df = DataFrame([[1,2,3], [4,5,6]], index=['a', 'b'], columns=['x', 'y', 'z']) + + nested = {'df1': df, 'df2': df.copy()} + + exp = {'df1': 
ujson.decode(ujson.encode(df)), + 'df2': ujson.decode(ujson.encode(df))} + self.assertTrue(ujson.decode(ujson.encode(nested)) == exp) + + exp = {'df1': ujson.decode(ujson.encode(df, orient="index")), + 'df2': ujson.decode(ujson.encode(df, orient="index"))} + self.assertTrue(ujson.decode(ujson.encode(nested, orient="index")) == exp) + + exp = {'df1': ujson.decode(ujson.encode(df, orient="records")), + 'df2': ujson.decode(ujson.encode(df, orient="records"))} + self.assertTrue(ujson.decode(ujson.encode(nested, orient="records")) == exp) + + exp = {'df1': ujson.decode(ujson.encode(df, orient="values")), + 'df2': ujson.decode(ujson.encode(df, orient="values"))} + self.assertTrue(ujson.decode(ujson.encode(nested, orient="values")) == exp) + + exp = {'df1': ujson.decode(ujson.encode(df, orient="split")), + 'df2': ujson.decode(ujson.encode(df, orient="split"))} + self.assertTrue(ujson.decode(ujson.encode(nested, orient="split")) == exp) + + def testDataFrameNumpyLabelled(self): + df = DataFrame([[1,2,3], [4,5,6]], index=['a', 'b'], columns=['x', 'y', 'z']) + + # column indexed + outp = DataFrame(*ujson.decode(ujson.encode(df), numpy=True, labelled=True)) + self.assertTrue((df.T == outp).values.all()) + assert_array_equal(df.T.columns, outp.columns) + assert_array_equal(df.T.index, outp.index) + + outp = DataFrame(*ujson.decode(ujson.encode(df, orient="records"), numpy=True, labelled=True)) + outp.index = df.index + self.assertTrue((df == outp).values.all()) + assert_array_equal(df.columns, outp.columns) + + outp = DataFrame(*ujson.decode(ujson.encode(df, orient="index"), numpy=True, labelled=True)) + self.assertTrue((df == outp).values.all()) + assert_array_equal(df.columns, outp.columns) + assert_array_equal(df.index, outp.index) + + def testSeries(self): + s = Series([10, 20, 30, 40, 50, 60], name="series", index=[6,7,8,9,10,15]) + s.sort() + + # column indexed + outp = Series(ujson.decode(ujson.encode(s))) + outp.sort() + self.assertTrue((s == outp).values.all()) + + outp = Series(ujson.decode(ujson.encode(s), numpy=True)) + outp.sort() + self.assertTrue((s == outp).values.all()) + + dec = _clean_dict(ujson.decode(ujson.encode(s, orient="split"))) + outp = Series(**dec) + self.assertTrue((s == outp).values.all()) + self.assertTrue(s.name == outp.name) + + dec = _clean_dict(ujson.decode(ujson.encode(s, orient="split"), + numpy=True)) + outp = Series(**dec) + self.assertTrue((s == outp).values.all()) + self.assertTrue(s.name == outp.name) + + outp = Series(ujson.decode(ujson.encode(s, orient="records"), numpy=True)) + self.assertTrue((s == outp).values.all()) + + outp = Series(ujson.decode(ujson.encode(s, orient="records"))) + self.assertTrue((s == outp).values.all()) + + outp = Series(ujson.decode(ujson.encode(s, orient="values"), numpy=True)) + self.assertTrue((s == outp).values.all()) + + outp = Series(ujson.decode(ujson.encode(s, orient="values"))) + self.assertTrue((s == outp).values.all()) + + outp = Series(ujson.decode(ujson.encode(s, orient="index"))) + outp.sort() + self.assertTrue((s == outp).values.all()) + + outp = Series(ujson.decode(ujson.encode(s, orient="index"), numpy=True)) + outp.sort() + self.assertTrue((s == outp).values.all()) + + def testSeriesNested(self): + s = Series([10, 20, 30, 40, 50, 60], name="series", index=[6,7,8,9,10,15]) + s.sort() + + nested = {'s1': s, 's2': s.copy()} + + exp = {'s1': ujson.decode(ujson.encode(s)), + 's2': ujson.decode(ujson.encode(s))} + self.assertTrue(ujson.decode(ujson.encode(nested)) == exp) + + exp = {'s1': ujson.decode(ujson.encode(s, 
orient="split")), + 's2': ujson.decode(ujson.encode(s, orient="split"))} + self.assertTrue(ujson.decode(ujson.encode(nested, orient="split")) == exp) + + exp = {'s1': ujson.decode(ujson.encode(s, orient="records")), + 's2': ujson.decode(ujson.encode(s, orient="records"))} + self.assertTrue(ujson.decode(ujson.encode(nested, orient="records")) == exp) + + exp = {'s1': ujson.decode(ujson.encode(s, orient="values")), + 's2': ujson.decode(ujson.encode(s, orient="values"))} + self.assertTrue(ujson.decode(ujson.encode(nested, orient="values")) == exp) + + exp = {'s1': ujson.decode(ujson.encode(s, orient="index")), + 's2': ujson.decode(ujson.encode(s, orient="index"))} + self.assertTrue(ujson.decode(ujson.encode(nested, orient="index")) == exp) + + def testIndex(self): + i = Index([23, 45, 18, 98, 43, 11], name="index") + + # column indexed + outp = Index(ujson.decode(ujson.encode(i))) + self.assert_(i.equals(outp)) + + outp = Index(ujson.decode(ujson.encode(i), numpy=True)) + self.assert_(i.equals(outp)) + + dec = _clean_dict(ujson.decode(ujson.encode(i, orient="split"))) + outp = Index(**dec) + self.assert_(i.equals(outp)) + self.assertTrue(i.name == outp.name) + + dec = _clean_dict(ujson.decode(ujson.encode(i, orient="split"), + numpy=True)) + outp = Index(**dec) + self.assert_(i.equals(outp)) + self.assertTrue(i.name == outp.name) + + outp = Index(ujson.decode(ujson.encode(i, orient="values"))) + self.assert_(i.equals(outp)) + + outp = Index(ujson.decode(ujson.encode(i, orient="values"), numpy=True)) + self.assert_(i.equals(outp)) + + outp = Index(ujson.decode(ujson.encode(i, orient="records"))) + self.assert_(i.equals(outp)) + + outp = Index(ujson.decode(ujson.encode(i, orient="records"), numpy=True)) + self.assert_(i.equals(outp)) + + outp = Index(ujson.decode(ujson.encode(i, orient="index"))) + self.assert_(i.equals(outp)) + + outp = Index(ujson.decode(ujson.encode(i, orient="index"), numpy=True)) + self.assert_(i.equals(outp)) + + def test_datetimeindex(self): + from pandas.tseries.index import date_range, DatetimeIndex + + rng = date_range('1/1/2000', periods=20) + + encoded = ujson.encode(rng) + decoded = DatetimeIndex(np.array(ujson.decode(encoded))) + + self.assert_(rng.equals(decoded)) + + ts = Series(np.random.randn(len(rng)), index=rng) + decoded = Series(ujson.decode(ujson.encode(ts))) + idx_values = decoded.index.values.astype(np.int64) + decoded.index = DatetimeIndex(idx_values) + tm.assert_series_equal(np.round(ts, 5), decoded) + +""" +def test_decodeNumericIntFrcOverflow(self): +input = "X.Y" +raise NotImplementedError("Implement this test!") + + +def test_decodeStringUnicodeEscape(self): +input = "\u3131" +raise NotImplementedError("Implement this test!") + +def test_decodeStringUnicodeBrokenEscape(self): +input = "\u3131" +raise NotImplementedError("Implement this test!") + +def test_decodeStringUnicodeInvalidEscape(self): +input = "\u3131" +raise NotImplementedError("Implement this test!") + +def test_decodeStringUTF8(self): +input = "someutfcharacters" +raise NotImplementedError("Implement this test!") + + + +""" + +def _clean_dict(d): + return dict((str(k), v) for k, v in d.iteritems()) + +if __name__ == '__main__': + # unittest.main() + import nose + # nose.runmodule(argv=[__file__,'-vvs','-x', '--ipdb-failure'], + # exit=False) + nose.runmodule(argv=[__file__,'-vvs','-x','--pdb', '--pdb-failure'], + exit=False) diff --git a/pandas/src/ujson/lib/ultrajson.h b/pandas/src/ujson/lib/ultrajson.h new file mode 100644 index 0000000000000..eae665f00f03e --- /dev/null +++ 
b/pandas/src/ujson/lib/ultrajson.h @@ -0,0 +1,298 @@ +/* +Copyright (c) 2011, Jonas Tarnstrom and ESN Social Software AB +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: +1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. +3. All advertising materials mentioning features or use of this software + must display the following acknowledgement: + This product includes software developed by ESN Social Software AB (www.esn.me). +4. Neither the name of the ESN Social Software AB nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY ESN SOCIAL SOFTWARE AB ''AS IS'' AND ANY +EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL ESN SOCIAL SOFTWARE AB BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +Portions of code from: +MODP_ASCII - Ascii transformations (upper/lower, etc) +http://code.google.com/p/stringencoders/ +Copyright (c) 2007 Nick Galbreath -- nickg [at] modp [dot] com. All rights reserved. + +*/ + +/* +Ultra fast JSON encoder and decoder +Developed by Jonas Tarnstrom (jonas@esn.me). + +Encoder notes: +------------------ + +:: Cyclic references :: +Cyclic referenced objects are not detected. +Set JSONObjectEncoder.recursionMax to suitable value or make sure input object +tree doesn't have cyclic references. 
+ +*/ + +#ifndef __ULTRAJSON_H__ +#define __ULTRAJSON_H__ + +#include +#include + +//#define JSON_DECODE_NUMERIC_AS_DOUBLE + +// Don't output any extra whitespaces when encoding +#define JSON_NO_EXTRA_WHITESPACE + +// Max decimals to encode double floating point numbers with +#ifndef JSON_DOUBLE_MAX_DECIMALS +#define JSON_DOUBLE_MAX_DECIMALS 15 +#endif + +// Max recursion depth, default for encoder +#ifndef JSON_MAX_RECURSION_DEPTH +#define JSON_MAX_RECURSION_DEPTH 1024 +#endif + +/* +Dictates and limits how much stack space for buffers UltraJSON will use before resorting to provided heap functions */ +#ifndef JSON_MAX_STACK_BUFFER_SIZE +#define JSON_MAX_STACK_BUFFER_SIZE 131072 +#endif + +#ifdef _WIN32 + +typedef __int64 JSINT64; +typedef unsigned __int64 JSUINT64; + +typedef __int32 JSINT32; +typedef unsigned __int32 JSUINT32; +typedef unsigned __int8 JSUINT8; +typedef unsigned __int16 JSUTF16; +typedef unsigned __int32 JSUTF32; +typedef __int64 JSLONG; + +#define EXPORTFUNCTION __declspec(dllexport) + +#define FASTCALL_MSVC __fastcall +#define FASTCALL_ATTR +#define INLINE_PREFIX __inline + +#else + +#include +typedef int64_t JSINT64; +typedef u_int64_t JSUINT64; + +typedef int32_t JSINT32; +typedef u_int32_t JSUINT32; + +#define FASTCALL_MSVC +#define FASTCALL_ATTR __attribute__((fastcall)) +#define INLINE_PREFIX inline + +typedef u_int8_t JSUINT8; +typedef u_int16_t JSUTF16; +typedef u_int32_t JSUTF32; + +typedef int64_t JSLONG; + +#define EXPORTFUNCTION +#endif + +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ +#define __LITTLE_ENDIAN__ +#else + +#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ +#define __BIG_ENDIAN__ +#endif + +#endif + +#if !defined(__LITTLE_ENDIAN__) && !defined(__BIG_ENDIAN__) +#error "Endianess not supported" +#endif + +enum JSTYPES +{ + JT_NULL, // NULL + JT_TRUE, //boolean true + JT_FALSE, //boolean false + JT_INT, //(JSINT32 (signed 32-bit)) + JT_LONG, //(JSINT64 (signed 64-bit)) + JT_DOUBLE, //(double) + JT_UTF8, //(char 8-bit) + JT_ARRAY, // Array structure + JT_OBJECT, // Key/Value structure + JT_INVALID, // Internal, do not return nor expect +}; + +typedef void * JSOBJ; +typedef void * JSITER; + +typedef struct __JSONTypeContext +{ + int type; + void *encoder; + void *prv; +} JSONTypeContext; + +/* +Function pointer declarations, suitable for implementing UltraJSON */ +typedef void (*JSPFN_ITERBEGIN)(JSOBJ obj, JSONTypeContext *tc); +typedef int (*JSPFN_ITERNEXT)(JSOBJ obj, JSONTypeContext *tc); +typedef void (*JSPFN_ITEREND)(JSOBJ obj, JSONTypeContext *tc); +typedef JSOBJ (*JSPFN_ITERGETVALUE)(JSOBJ obj, JSONTypeContext *tc); +typedef char *(*JSPFN_ITERGETNAME)(JSOBJ obj, JSONTypeContext *tc, size_t *outLen); +typedef void *(*JSPFN_MALLOC)(size_t size); +typedef void (*JSPFN_FREE)(void *pptr); +typedef void *(*JSPFN_REALLOC)(void *base, size_t size); + +typedef struct __JSONObjectEncoder +{ + void (*beginTypeContext)(JSOBJ obj, JSONTypeContext *tc); + void (*endTypeContext)(JSOBJ obj, JSONTypeContext *tc); + const char *(*getStringValue)(JSOBJ obj, JSONTypeContext *tc, size_t *_outLen); + JSINT64 (*getLongValue)(JSOBJ obj, JSONTypeContext *tc); + JSINT32 (*getIntValue)(JSOBJ obj, JSONTypeContext *tc); + double (*getDoubleValue)(JSOBJ obj, JSONTypeContext *tc); + + /* + Begin iteration of an iteratable object (JS_ARRAY or JS_OBJECT) + Implementor should setup iteration state in ti->prv + */ + JSPFN_ITERBEGIN iterBegin; + + /* + Retrieve next object in an iteration. Should return 0 to indicate iteration has reached end or 1 if there are more items. 
+ Implementor is responsible for keeping state of the iteration. Use ti->prv fields for this + */ + JSPFN_ITERNEXT iterNext; + + /* + Ends the iteration of an iteratable object. + Any iteration state stored in ti->prv can be freed here + */ + JSPFN_ITEREND iterEnd; + + /* + Returns a reference to the value object of an iterator + The is responsible for the life-cycle of the returned string. Use iterNext/iterEnd and ti->prv to keep track of current object + */ + JSPFN_ITERGETVALUE iterGetValue; + + /* + Return name of iterator. + The is responsible for the life-cycle of the returned string. Use iterNext/iterEnd and ti->prv to keep track of current object + */ + JSPFN_ITERGETNAME iterGetName; + + /* + Release a value as indicated by setting ti->release = 1 in the previous getValue call. + The ti->prv array should contain the necessary context to release the value + */ + void (*releaseObject)(JSOBJ obj); + + /* Library functions + Set to NULL to use STDLIB malloc,realloc,free */ + JSPFN_MALLOC malloc; + JSPFN_REALLOC realloc; + JSPFN_FREE free; + + /* + Configuration for max recursion, set to 0 to use default (see JSON_MAX_RECURSION_DEPTH)*/ + int recursionMax; + + /* + Configuration for max decimals of double floating poiunt numbers to encode (0-9) */ + int doublePrecision; + + /* + If true output will be ASCII with all characters above 127 encoded as \uXXXX. If false output will be UTF-8 or what ever charset strings are brought as */ + int forceASCII; + + + /* + Set to an error message if error occured */ + const char *errorMsg; + JSOBJ errorObj; + + /* Buffer stuff */ + char *start; + char *offset; + char *end; + int heap; + int level; + +} JSONObjectEncoder; + + +/* +Encode an object structure into JSON. + +Arguments: +obj - An anonymous type representing the object +enc - Function definitions for querying JSOBJ type +buffer - Preallocated buffer to store result in. If NULL function allocates own buffer +cbBuffer - Length of buffer (ignored if buffer is NULL) + +Returns: +Encoded JSON object as a null terminated char string. + +NOTE: +If the supplied buffer wasn't enough to hold the result the function will allocate a new buffer. +Life cycle of the provided buffer must still be handled by caller. + +If the return value doesn't equal the specified buffer caller must release the memory using +JSONObjectEncoder.free or free() as specified when calling this function. 
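For illustration, a minimal caller-side sketch of this contract (hedged: example_encode and my_root_obj are placeholder names, not symbols defined by this header; the encoder struct is assumed to have already been filled in with the callback set described above):

void example_encode(JSOBJ my_root_obj, JSONObjectEncoder *enc)
{
    char stackBuffer[65536];
    char *ret;

    // 0 selects the JSON_MAX_RECURSION_DEPTH default; cyclic input is NOT detected
    enc->recursionMax = 0;
    // number of decimals used when encoding doubles (see doublePrecision above)
    enc->doublePrecision = 10;

    ret = JSON_EncodeObject(my_root_obj, enc, stackBuffer, sizeof(stackBuffer));

    if (ret == NULL)
    {
        // encoding failed; enc->errorMsg / enc->errorObj describe the problem
        return;
    }

    // ... use the NUL-terminated JSON text in ret ...

    if (ret != stackBuffer)
    {
        // the result outgrew the supplied buffer, so the encoder allocated its own;
        // release it with the configured free function (JSON_EncodeObject falls back
        // to the stdlib allocators when malloc/realloc/free were left NULL)
        enc->free(ret);
    }
}

The decoder entry point declared further below follows the same convention: JSON_DecodeObject returns NULL on failure and reports the problem through JSONObjectDecoder.errorStr and errorOffset.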
+*/ +EXPORTFUNCTION char *JSON_EncodeObject(JSOBJ obj, JSONObjectEncoder *enc, char *buffer, size_t cbBuffer); + + + +typedef struct __JSONObjectDecoder +{ + JSOBJ (*newString)(wchar_t *start, wchar_t *end); + int (*objectAddKey)(JSOBJ obj, JSOBJ name, JSOBJ value); + int (*arrayAddItem)(JSOBJ obj, JSOBJ value); + JSOBJ (*newTrue)(void); + JSOBJ (*newFalse)(void); + JSOBJ (*newNull)(void); + JSOBJ (*newObject)(void *decoder); + JSOBJ (*endObject)(JSOBJ obj); + JSOBJ (*newArray)(void *decoder); + JSOBJ (*endArray)(JSOBJ obj); + JSOBJ (*newInt)(JSINT32 value); + JSOBJ (*newLong)(JSINT64 value); + JSOBJ (*newDouble)(double value); + void (*releaseObject)(JSOBJ obj, void *decoder); + JSPFN_MALLOC malloc; + JSPFN_FREE free; + JSPFN_REALLOC realloc; + + char *errorStr; + char *errorOffset; + + + +} JSONObjectDecoder; + +EXPORTFUNCTION JSOBJ JSON_DecodeObject(JSONObjectDecoder *dec, const char *buffer, size_t cbBuffer); + +#endif diff --git a/pandas/src/ujson/python/py_defines.h b/pandas/src/ujson/python/py_defines.h new file mode 100644 index 0000000000000..1544c2e3cf34d --- /dev/null +++ b/pandas/src/ujson/python/py_defines.h @@ -0,0 +1,15 @@ +#include + +#if PY_MAJOR_VERSION >= 3 + +#define PyInt_Check PyLong_Check +#define PyInt_AS_LONG PyLong_AsLong +#define PyInt_FromLong PyLong_FromLong + +#define PyString_Check PyBytes_Check +#define PyString_GET_SIZE PyBytes_GET_SIZE +#define PyString_AS_STRING PyBytes_AS_STRING + +#define PyString_FromString PyUnicode_FromString + +#endif diff --git a/pandas/src/ujson/python/version.h b/pandas/src/ujson/python/version.h new file mode 100644 index 0000000000000..9449441411192 --- /dev/null +++ b/pandas/src/ujson/python/version.h @@ -0,0 +1 @@ +#define UJSON_VERSION "1.18" diff --git a/setup.py b/setup.py index 030584ba509d3..1cc666c87404b 100755 --- a/setup.py +++ b/setup.py @@ -250,6 +250,11 @@ def initialize_options(self): for f in files: if f in self._clean_exclude: continue + + # XXX + if 'ujson' in f: + continue + if os.path.splitext(f)[-1] in ('.pyc', '.so', '.o', '.pyo', '.pyd', '.c', '.orig'): @@ -457,6 +462,21 @@ def pxd(name): root, _ = os.path.splitext(ext.sources[0]) ext.sources[0] = root + suffix +ujson_ext = Extension('pandas.json', + depends=['pandas/src/ujson/lib/ultrajson.h'], + sources=['pandas/src/ujson/python/ujson.c', + 'pandas/src/ujson/python/objToJSON.c', + 'pandas/src/ujson/python/JSONtoObj.c', + 'pandas/src/ujson/lib/ultrajsonenc.c', + 'pandas/src/ujson/lib/ultrajsondec.c', + 'pandas/src/datetime/np_datetime.c', + 'pandas/src/datetime/np_datetime_strings.c'], + include_dirs=['pandas/src/ujson/python', + 'pandas/src/ujson/lib'] + common_include) + + +extensions.append(ujson_ext) + if _have_setuptools: setuptools_kwargs["test_suite"] = "nose.collector" @@ -485,6 +505,7 @@ def pxd(name): 'pandas.tseries', 'pandas.tseries.tests', 'pandas.io.tests', + 'pandas.io.tests.test_json', 'pandas.stats.tests', ], package_data={'pandas.io': ['tests/data/legacy_hdf/*.h5', From 8327c5b21586bde16393aed895be3f5630c1233b Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Sat, 11 May 2013 18:39:55 -0700 Subject: [PATCH 27/71] DOC: add ultrajson license --- LICENSES/ULTRAJSON_LICENSE | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) create mode 100644 LICENSES/ULTRAJSON_LICENSE diff --git a/LICENSES/ULTRAJSON_LICENSE b/LICENSES/ULTRAJSON_LICENSE new file mode 100644 index 0000000000000..defca46e7f820 --- /dev/null +++ b/LICENSES/ULTRAJSON_LICENSE @@ -0,0 +1,34 @@ +Copyright (c) 2011-2013, ESN Social Software AB and Jonas 
Tarnstrom +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the ESN Social Software AB nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL ESN SOCIAL SOFTWARE AB OR JONAS TARNSTROM BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +Portions of code from MODP_ASCII - Ascii transformations (upper/lower, etc) +http://code.google.com/p/stringencoders/ +Copyright (c) 2007 Nick Galbreath -- nickg [at] modp [dot] com. All rights reserved. + +Numeric decoder derived from from TCL library +http://www.opensource.apple.com/source/tcl/tcl-14/tcl/license.terms + * Copyright (c) 1988-1993 The Regents of the University of California. + * Copyright (c) 1994 Sun Microsystems, Inc. \ No newline at end of file From ade5d0ffc7e752522051332f8d23aa5ba0cae55b Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Sun, 12 May 2013 11:13:13 -0700 Subject: [PATCH 28/71] TST: json manip test script. and trigger travis --- scripts/json_manip.py | 421 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 421 insertions(+) create mode 100644 scripts/json_manip.py diff --git a/scripts/json_manip.py b/scripts/json_manip.py new file mode 100644 index 0000000000000..e76a99cca344a --- /dev/null +++ b/scripts/json_manip.py @@ -0,0 +1,421 @@ +""" + +Tasks +------- + +Search and transform jsonable structures, specifically to make it 'easy' to make tabular/csv output for other consumers. + +Example +~~~~~~~~~~~~~ + + *give me a list of all the fields called 'id' in this stupid, gnarly + thing* + + >>> Q('id',gnarly_data) + ['id1','id2','id3'] + + +Observations: +--------------------- + +1) 'simple data structures' exist and are common. They are tedious + to search. + +2) The DOM is another nested / treeish structure, and jQuery selector is + a good tool for that. + +3a) R, Numpy, Excel and other analysis tools want 'tabular' data. These + analyses are valuable and worth doing. + +3b) Dot/Graphviz, NetworkX, and some other analyses *like* treeish/dicty + things, and those analyses are also worth doing! + +3c) Some analyses are best done using 'one-off' and custom code in C, Python, + or another 'real' programming language. + +4) Arbitrary transforms are tedious and error prone. SQL is one solution, + XSLT is another, + +5) the XPATH/XML/XSLT family is.... 
not universally loved :) They are + very complete, and the completeness can make simple cases... gross. + +6) For really complicated data structures, we can write one-off code. Getting + 80% of the way is mostly okay. There will always have to be programmers + in the loop. + +7) Re-inventing SQL is probably a failure mode. So is reinventing XPATH, XSLT + and the like. Be wary of mission creep! Re-use when possible (e.g., can + we put the thing into a DOM using + +8) If the interface is good, people can improve performance later. + + +Simplifying +--------------- + + +1) Assuming 'jsonable' structures + +2) keys are strings or stringlike. Python allows any hashable to be a key. + for now, we pretend that doesn't happen. + +3) assumes most dicts are 'well behaved'. DAG, no cycles! + +4) assume that if people want really specialized transforms, they can do it + themselves. + +""" + +from collections import Counter, namedtuple +import csv +import itertools +from itertools import product +from operator import attrgetter as aget, itemgetter as iget +import operator +import sys + + + +## note 'url' appears multiple places and not all extensions have same struct +ex1 = { + 'name': 'Gregg', + 'extensions': [ + {'id':'hello', + 'url':'url1'}, + {'id':'gbye', + 'url':'url2', + 'more': dict(url='url3')}, + ] +} + +## much longer example +ex2 = {u'metadata': {u'accessibilities': [{u'name': u'accessibility.tabfocus', + u'value': 7}, + {u'name': u'accessibility.mouse_focuses_formcontrol', u'value': False}, + {u'name': u'accessibility.browsewithcaret', u'value': False}, + {u'name': u'accessibility.win32.force_disabled', u'value': False}, + {u'name': u'accessibility.typeaheadfind.startlinksonly', u'value': False}, + {u'name': u'accessibility.usebrailledisplay', u'value': u''}, + {u'name': u'accessibility.typeaheadfind.timeout', u'value': 5000}, + {u'name': u'accessibility.typeaheadfind.enabletimeout', u'value': True}, + {u'name': u'accessibility.tabfocus_applies_to_xul', u'value': False}, + {u'name': u'accessibility.typeaheadfind.flashBar', u'value': 1}, + {u'name': u'accessibility.typeaheadfind.autostart', u'value': True}, + {u'name': u'accessibility.blockautorefresh', u'value': False}, + {u'name': u'accessibility.browsewithcaret_shortcut.enabled', + u'value': True}, + {u'name': u'accessibility.typeaheadfind.enablesound', u'value': True}, + {u'name': u'accessibility.typeaheadfind.prefillwithselection', + u'value': True}, + {u'name': u'accessibility.typeaheadfind.soundURL', u'value': u'beep'}, + {u'name': u'accessibility.typeaheadfind', u'value': False}, + {u'name': u'accessibility.typeaheadfind.casesensitive', u'value': 0}, + {u'name': u'accessibility.warn_on_browsewithcaret', u'value': True}, + {u'name': u'accessibility.usetexttospeech', u'value': u''}, + {u'name': u'accessibility.accesskeycausesactivation', u'value': True}, + {u'name': u'accessibility.typeaheadfind.linksonly', u'value': False}, + {u'name': u'isInstantiated', u'value': True}], + u'extensions': [{u'id': u'216ee7f7f4a5b8175374cd62150664efe2433a31', + u'isEnabled': True}, + {u'id': u'1aa53d3b720800c43c4ced5740a6e82bb0b3813e', u'isEnabled': False}, + {u'id': u'01ecfac5a7bd8c9e27b7c5499e71c2d285084b37', u'isEnabled': True}, + {u'id': u'1c01f5b22371b70b312ace94785f7b0b87c3dfb2', u'isEnabled': True}, + {u'id': u'fb723781a2385055f7d024788b75e959ad8ea8c3', u'isEnabled': True}], + u'fxVersion': u'9.0', + u'location': u'zh-CN', + u'operatingSystem': u'WINNT Windows NT 5.1', + u'surveyAnswers': u'', + u'task_guid': 
u'd69fbd15-2517-45b5-8a17-bb7354122a75', + u'tpVersion': u'1.2', + u'updateChannel': u'beta'}, + u'survey_data': { + u'extensions': [{u'appDisabled': False, + u'id': u'testpilot?labs.mozilla.com', + u'isCompatible': True, + u'isEnabled': True, + u'isPlatformCompatible': True, + u'name': u'Test Pilot'}, + {u'appDisabled': True, + u'id': u'dict?www.youdao.com', + u'isCompatible': False, + u'isEnabled': False, + u'isPlatformCompatible': True, + u'name': u'Youdao Word Capturer'}, + {u'appDisabled': False, + u'id': u'jqs?sun.com', + u'isCompatible': True, + u'isEnabled': True, + u'isPlatformCompatible': True, + u'name': u'Java Quick Starter'}, + {u'appDisabled': False, + u'id': u'?20a82645-c095-46ed-80e3-08825760534b?', + u'isCompatible': True, + u'isEnabled': True, + u'isPlatformCompatible': True, + u'name': u'Microsoft .NET Framework Assistant'}, + {u'appDisabled': False, + u'id': u'?a0d7ccb3-214d-498b-b4aa-0e8fda9a7bf7?', + u'isCompatible': True, + u'isEnabled': True, + u'isPlatformCompatible': True, + u'name': u'WOT'}], + u'version_number': 1}} + +# class SurveyResult(object): + +# def __init__(self, record): +# self.record = record +# self.metadata, self.survey_data = self._flatten_results() + +# def _flatten_results(self): +# survey_data = self.record['survey_data'] +# extensions = DataFrame(survey_data['extensions']) + +def denorm(queries,iterable_of_things,default=None): + """ + 'repeat', or 'stutter' to 'tableize' for downstream. + (I have no idea what a good word for this is!) + + Think ``kronecker`` products, or: + + ``SELECT single,multiple FROM table;`` + + single multiple + ------- --------- + id1 val1 + id1 val2 + + + Args: + + queries: iterable of ``Q`` queries. + iterable_of_things: to be queried. + + Returns: + + list of 'stuttered' output, where if a query returns + a 'single', it gets repeated appropriately. + + + """ + + def _denorm(queries,thing): + fields = [] + results = [] + for q in queries: + #print q + r = Ql(q,thing) + #print "-- result: ", r + if not r: + r = [default] + if type(r[0]) is type({}): + fields.append(sorted(r[0].keys())) # dicty answers + else: + fields.append([q]) # stringy answer + + results.append(r) + + #print results + #print fields + flist = list(flatten(*map(iter,fields))) + + prod = itertools.product(*results) + for p in prod: + U = dict() + for (ii,thing) in enumerate(p): + #print ii,thing + if type(thing) is type({}): + U.update(thing) + else: + U[fields[ii][0]] = thing + + yield U + + return list(flatten(*[_denorm(queries,thing) for thing in iterable_of_things])) + + +def default_iget(fields,default=None,): + """ itemgetter with 'default' handling, that *always* returns lists + + API CHANGES from ``operator.itemgetter`` + + Note: Sorry to break the iget api... (fields vs *fields) + Note: *always* returns a list... unlike itemgetter, + which can return tuples or 'singles' + """ + myiget = operator.itemgetter(*fields) + L = len(fields) + def f(thing): + try: + ans = list(myiget(thing)) + if L < 2: + ans = [ans,] + return ans + except KeyError: + # slower! + return [thing.get(x,default) for x in fields] + + f.__doc__ = "itemgetter with default %r for fields %r" %(default,fields) + f.__name__ = "default_itemgetter" + return f + + +def flatten(*stack): + """ + helper function for flattening iterables of generators in a + sensible way. 
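For example (illustrative only; flatten expects Python 2 style iterators, i.e. objects with a .next() method, and descends into any iterator it yields rather than returning it):

    >>> list(flatten(iter([1, iter([2, 3])]), iter([4])))
    [1, 2, 3, 4]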
+ """ + stack = list(stack) + while stack: + try: x = stack[0].next() + except StopIteration: + stack.pop(0) + continue + if hasattr(x,'next') and callable(getattr(x,'next')): + stack.insert(0, x) + + #if isinstance(x, (GeneratorType,listerator)): + else: yield x + + +def _Q(filter_, thing): + """ underlying machinery for Q function recursion """ + T = type(thing) + if T is type({}): + for k,v in thing.iteritems(): + #print k,v + if filter_ == k: + if type(v) is type([]): + yield iter(v) + else: + yield v + + if type(v) in (type({}),type([])): + yield Q(filter_,v) + + elif T is type([]): + for k in thing: + #print k + yield Q(filter_,k) + + else: + # no recursion. + pass + +def Q(filter_,thing): + """ + type(filter): + - list: a flattened list of all searches (one list) + - dict: dict with vals each of which is that search + + Notes: + + [1] 'parent thing', with space, will do a descendent + [2] this will come back 'flattened' jQuery style + [3] returns a generator. Use ``Ql`` if you want a list. + + """ + if type(filter_) is type([]): + return flatten(*[_Q(x,thing) for x in filter_]) + elif type(filter_) is type({}): + d = dict.fromkeys(filter_.keys()) + #print d + for k in d: + #print flatten(Q(k,thing)) + d[k] = Q(k,thing) + + return d + + else: + if " " in filter_: # i.e. "antecendent post" + parts = filter_.strip().split() + r = None + for p in parts: + r = Ql(p,thing) + thing = r + + return r + + else: # simple. + return flatten(_Q(filter_,thing)) + +def Ql(filter_,thing): + """ same as Q, but returns a list, not a generator """ + res = Q(filter_,thing) + + if type(filter_) is type({}): + for k in res: + res[k] = list(res[k]) + return res + + else: + return list(res) + + + +def countit(fields,iter_of_iter,default=None): + """ + note: robust to fields not being in i_of_i, using ``default`` + """ + C = Counter() # needs hashables + T = namedtuple("Thing",fields) + get = default_iget(*fields,default=default) + return Counter( + (T(*get(thing)) for thing in iter_of_iter) + ) + + +## right now this works for one row... +def printout(queries,things,default=None, f=sys.stdout, **kwargs): + """ will print header and objects + + **kwargs go to csv.DictWriter + + help(csv.DictWriter) for more. + """ + + results = denorm(queries,things,default=None) + fields = set(itertools.chain(*(x.keys() for x in results))) + + W = csv.DictWriter(f=f,fieldnames=fields,**kwargs) + #print "---prod---" + #print list(prod) + W.writeheader() + for r in results: + W.writerow(r) + + +def test_run(): + print "\n>>> print list(Q('url',ex1))" + print list(Q('url',ex1)) + assert list(Q('url',ex1)) == ['url1','url2','url3'] + assert Ql('url',ex1) == ['url1','url2','url3'] + + print "\n>>> print list(Q(['name','id'],ex1))" + print list(Q(['name','id'],ex1)) + assert Ql(['name','id'],ex1) == ['Gregg','hello','gbye'] + + + print "\n>>> print Ql('more url',ex1)" + print Ql('more url',ex1) + + + print "\n>>> list(Q('extensions',ex1))" + print list(Q('extensions',ex1)) + + print "\n>>> print Ql('extensions',ex1)" + print Ql('extensions',ex1) + + print "\n>>> printout(['name','extensions'],[ex1,], extrasaction='ignore')" + printout(['name','extensions'],[ex1,], extrasaction='ignore') + + print "\n\n" + + from pprint import pprint as pp + + print "-- note that the extension fields are also flattened! 
(and N/A) -- " + pp(denorm(['location','fxVersion','notthere','survey_data extensions'],[ex2,], default="N/A")[:2]) + + +if __name__ == "__main__": + pass From 9633880214ca6f6372ced250146368628014e3d0 Mon Sep 17 00:00:00 2001 From: jreback Date: Fri, 7 Jun 2013 17:37:49 -0400 Subject: [PATCH 29/71] BLD: fix setup.py to work on current pandas --- pandas/io/tests/test_json/__init__.py | 0 pandas/src/ujson/lib/ultrajsondec.c | 845 ++++++++++++ pandas/src/ujson/lib/ultrajsonenc.c | 891 +++++++++++++ pandas/src/ujson/python/JSONtoObj.c | 674 ++++++++++ pandas/src/ujson/python/objToJSON.c | 1701 +++++++++++++++++++++++++ pandas/src/ujson/python/ujson.c | 73 ++ setup.py | 11 +- 7 files changed, 4193 insertions(+), 2 deletions(-) create mode 100644 pandas/io/tests/test_json/__init__.py create mode 100644 pandas/src/ujson/lib/ultrajsondec.c create mode 100644 pandas/src/ujson/lib/ultrajsonenc.c create mode 100644 pandas/src/ujson/python/JSONtoObj.c create mode 100644 pandas/src/ujson/python/objToJSON.c create mode 100644 pandas/src/ujson/python/ujson.c diff --git a/pandas/io/tests/test_json/__init__.py b/pandas/io/tests/test_json/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/pandas/src/ujson/lib/ultrajsondec.c b/pandas/src/ujson/lib/ultrajsondec.c new file mode 100644 index 0000000000000..eda30f3fea839 --- /dev/null +++ b/pandas/src/ujson/lib/ultrajsondec.c @@ -0,0 +1,845 @@ +/* +Copyright (c) 2011, Jonas Tarnstrom and ESN Social Software AB +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: +1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. +3. All advertising materials mentioning features or use of this software + must display the following acknowledgement: + This product includes software developed by ESN Social Software AB (www.esn.me). +4. Neither the name of the ESN Social Software AB nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY ESN SOCIAL SOFTWARE AB ''AS IS'' AND ANY +EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL ESN SOCIAL SOFTWARE AB BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +Portions of code from: +MODP_ASCII - Ascii transformations (upper/lower, etc) +http://code.google.com/p/stringencoders/ +Copyright (c) 2007 Nick Galbreath -- nickg [at] modp [dot] com. All rights reserved. 
+ +*/ + +#include "ultrajson.h" +#include +#include +#include +#include +#include + +struct DecoderState +{ + char *start; + char *end; + wchar_t *escStart; + wchar_t *escEnd; + int escHeap; + int lastType; + JSONObjectDecoder *dec; +}; + +JSOBJ FASTCALL_MSVC decode_any( struct DecoderState *ds) FASTCALL_ATTR; +typedef JSOBJ (*PFN_DECODER)( struct DecoderState *ds); +#define RETURN_JSOBJ_NULLCHECK(_expr) return(_expr); + +double createDouble(double intNeg, double intValue, double frcValue, int frcDecimalCount) +{ + static const double g_pow10[] = {1, 10, 100, 1000, 10000, 100000, 1000000, 10000000, 100000000, 1000000000, 10000000000, 100000000000, 1000000000000, 10000000000000, 100000000000000, 1000000000000000}; + + return (intValue + (frcValue / g_pow10[frcDecimalCount])) * intNeg; +} + +static JSOBJ SetError( struct DecoderState *ds, int offset, const char *message) +{ + ds->dec->errorOffset = ds->start + offset; + ds->dec->errorStr = (char *) message; + return NULL; +} + + +FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_numeric ( struct DecoderState *ds) +{ +#ifdef JSON_DECODE_NUMERIC_AS_DOUBLE + double intNeg = 1; + double intValue; +#else + int intNeg = 1; + JSLONG intValue; +#endif + + double expNeg; + int chr; + int decimalCount = 0; + double frcValue = 0.0; + double expValue; + char *offset = ds->start; + + if (*(offset) == '-') + { + offset ++; + intNeg = -1; + } + + // Scan integer part + intValue = 0; + + while (1) + { + chr = (int) (unsigned char) *(offset); + + switch (chr) + { + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + //FIXME: Check for arithemtic overflow here + //PERF: Don't do 64-bit arithmetic here unless we know we have to +#ifdef JSON_DECODE_NUMERIC_AS_DOUBLE + intValue = intValue * 10.0 + (double) (chr - 48); +#else + intValue = intValue * 10LL + (JSLONG) (chr - 48); +#endif + offset ++; + break; + + case '.': + offset ++; + goto DECODE_FRACTION; + break; + + case 'e': + case 'E': + offset ++; + goto DECODE_EXPONENT; + break; + + default: + goto BREAK_INT_LOOP; + break; + } + } + +BREAK_INT_LOOP: + + ds->lastType = JT_INT; + ds->start = offset; + + //If input string is LONGLONG_MIN here the value is already negative so we should not flip it + +#ifdef JSON_DECODE_NUMERIC_AS_DOUBLE +#else + if (intValue < 0) + { + intNeg = 1; + } +#endif + + //dbg1 = (intValue * intNeg); + //dbg2 = (JSLONG) dbg1; + +#ifdef JSON_DECODE_NUMERIC_AS_DOUBLE + if (intValue > (double) INT_MAX || intValue < (double) INT_MIN) +#else + if ( (intValue >> 31)) +#endif + { + RETURN_JSOBJ_NULLCHECK(ds->dec->newLong( (JSINT64) (intValue * (JSINT64) intNeg))); + } + else + { + RETURN_JSOBJ_NULLCHECK(ds->dec->newInt( (JSINT32) (intValue * intNeg))); + } + + + +DECODE_FRACTION: + + // Scan fraction part + frcValue = 0.0; + while (1) + { + chr = (int) (unsigned char) *(offset); + + switch (chr) + { + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + if (decimalCount < JSON_DOUBLE_MAX_DECIMALS) + { + frcValue = frcValue * 10.0 + (double) (chr - 48); + decimalCount ++; + } + offset ++; + break; + + case 'e': + case 'E': + offset ++; + goto DECODE_EXPONENT; + break; + + default: + goto BREAK_FRC_LOOP; + } + } + +BREAK_FRC_LOOP: + + if (intValue < 0) + { + intNeg = 1; + } + + //FIXME: Check for arithemtic overflow here + ds->lastType = JT_DOUBLE; + ds->start = offset; + RETURN_JSOBJ_NULLCHECK(ds->dec->newDouble (createDouble( (double) intNeg, (double) intValue, 
frcValue, decimalCount))); + +DECODE_EXPONENT: + expNeg = 1.0; + + if (*(offset) == '-') + { + expNeg = -1.0; + offset ++; + } + else + if (*(offset) == '+') + { + expNeg = +1.0; + offset ++; + } + + expValue = 0.0; + + while (1) + { + chr = (int) (unsigned char) *(offset); + + switch (chr) + { + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + expValue = expValue * 10.0 + (double) (chr - 48); + offset ++; + break; + + default: + goto BREAK_EXP_LOOP; + + } + } + +BREAK_EXP_LOOP: + +#ifdef JSON_DECODE_NUMERIC_AS_DOUBLE +#else + if (intValue < 0) + { + intNeg = 1; + } +#endif + + //FIXME: Check for arithemtic overflow here + ds->lastType = JT_DOUBLE; + ds->start = offset; + RETURN_JSOBJ_NULLCHECK(ds->dec->newDouble (createDouble( (double) intNeg, (double) intValue , frcValue, decimalCount) * pow(10.0, expValue * expNeg))); +} + +FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_true ( struct DecoderState *ds) +{ + char *offset = ds->start; + offset ++; + + if (*(offset++) != 'r') + goto SETERROR; + if (*(offset++) != 'u') + goto SETERROR; + if (*(offset++) != 'e') + goto SETERROR; + + ds->lastType = JT_TRUE; + ds->start = offset; + RETURN_JSOBJ_NULLCHECK(ds->dec->newTrue()); + +SETERROR: + return SetError(ds, -1, "Unexpected character found when decoding 'true'"); +} + +FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_false ( struct DecoderState *ds) +{ + char *offset = ds->start; + offset ++; + + if (*(offset++) != 'a') + goto SETERROR; + if (*(offset++) != 'l') + goto SETERROR; + if (*(offset++) != 's') + goto SETERROR; + if (*(offset++) != 'e') + goto SETERROR; + + ds->lastType = JT_FALSE; + ds->start = offset; + RETURN_JSOBJ_NULLCHECK(ds->dec->newFalse()); + +SETERROR: + return SetError(ds, -1, "Unexpected character found when decoding 'false'"); + +} + + +FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_null ( struct DecoderState *ds) +{ + char *offset = ds->start; + offset ++; + + if (*(offset++) != 'u') + goto SETERROR; + if (*(offset++) != 'l') + goto SETERROR; + if (*(offset++) != 'l') + goto SETERROR; + + ds->lastType = JT_NULL; + ds->start = offset; + RETURN_JSOBJ_NULLCHECK(ds->dec->newNull()); + +SETERROR: + return SetError(ds, -1, "Unexpected character found when decoding 'null'"); +} + +FASTCALL_ATTR void FASTCALL_MSVC SkipWhitespace(struct DecoderState *ds) +{ + char *offset = ds->start; + + while (1) + { + switch (*offset) + { + case ' ': + case '\t': + case '\r': + case '\n': + offset ++; + break; + + default: + ds->start = offset; + return; + } + } +} + + +enum DECODESTRINGSTATE +{ + DS_ISNULL = 0x32, + DS_ISQUOTE, + DS_ISESCAPE, + DS_UTFLENERROR, + +}; + +static const JSUINT8 g_decoderLookup[256] = +{ +/* 0x00 */ DS_ISNULL, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, +/* 0x10 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, +/* 0x20 */ 1, 1, DS_ISQUOTE, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, +/* 0x30 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, +/* 0x40 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, +/* 0x50 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, DS_ISESCAPE, 1, 1, 1, +/* 0x60 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, +/* 0x70 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, +/* 0x80 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, +/* 0x90 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, +/* 0xa0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, +/* 0xb0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, +/* 0xc0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, +/* 0xd0 */ 2, 2, 2, 2, 2, 2, 2, 2, 
2, 2, 2, 2, 2, 2, 2, 2, +/* 0xe0 */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, +/* 0xf0 */ 4, 4, 4, 4, 4, 4, 4, 4, DS_UTFLENERROR, DS_UTFLENERROR, DS_UTFLENERROR, DS_UTFLENERROR, DS_UTFLENERROR, DS_UTFLENERROR, DS_UTFLENERROR, DS_UTFLENERROR, +}; + + +FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_string ( struct DecoderState *ds) +{ + JSUTF16 sur[2] = { 0 }; + int iSur = 0; + int index; + wchar_t *escOffset; + size_t escLen = (ds->escEnd - ds->escStart); + JSUINT8 *inputOffset; + JSUINT8 oct; + JSUTF32 ucs; + ds->lastType = JT_INVALID; + ds->start ++; + + if ( (ds->end - ds->start) > escLen) + { + size_t newSize = (ds->end - ds->start); + + if (ds->escHeap) + { + ds->escStart = (wchar_t *) ds->dec->realloc (ds->escStart, newSize * sizeof(wchar_t)); + if (!ds->escStart) + { + return SetError(ds, -1, "Could not reserve memory block"); + } + } + else + { + wchar_t *oldStart = ds->escStart; + ds->escHeap = 1; + ds->escStart = (wchar_t *) ds->dec->malloc (newSize * sizeof(wchar_t)); + if (!ds->escStart) + { + return SetError(ds, -1, "Could not reserve memory block"); + } + memcpy (ds->escStart, oldStart, escLen * sizeof(wchar_t)); + } + + ds->escEnd = ds->escStart + newSize; + } + + escOffset = ds->escStart; + inputOffset = ds->start; + + while(1) + { + switch (g_decoderLookup[(JSUINT8)(*inputOffset)]) + { + case DS_ISNULL: + return SetError(ds, -1, "Unmatched ''\"' when when decoding 'string'"); + + case DS_ISQUOTE: + ds->lastType = JT_UTF8; + inputOffset ++; + ds->start += ( (char *) inputOffset - (ds->start)); + RETURN_JSOBJ_NULLCHECK(ds->dec->newString(ds->escStart, escOffset)); + + case DS_UTFLENERROR: + return SetError (ds, -1, "Invalid UTF-8 sequence length when decoding 'string'"); + + case DS_ISESCAPE: + inputOffset ++; + switch (*inputOffset) + { + case '\\': *(escOffset++) = L'\\'; inputOffset++; continue; + case '\"': *(escOffset++) = L'\"'; inputOffset++; continue; + case '/': *(escOffset++) = L'/'; inputOffset++; continue; + case 'b': *(escOffset++) = L'\b'; inputOffset++; continue; + case 'f': *(escOffset++) = L'\f'; inputOffset++; continue; + case 'n': *(escOffset++) = L'\n'; inputOffset++; continue; + case 'r': *(escOffset++) = L'\r'; inputOffset++; continue; + case 't': *(escOffset++) = L'\t'; inputOffset++; continue; + + case 'u': + { + int index; + inputOffset ++; + + for (index = 0; index < 4; index ++) + { + switch (*inputOffset) + { + case '\0': return SetError (ds, -1, "Unterminated unicode escape sequence when decoding 'string'"); + default: return SetError (ds, -1, "Unexpected character in unicode escape sequence when decoding 'string'"); + + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + sur[iSur] = (sur[iSur] << 4) + (JSUTF16) (*inputOffset - '0'); + break; + + case 'a': + case 'b': + case 'c': + case 'd': + case 'e': + case 'f': + sur[iSur] = (sur[iSur] << 4) + 10 + (JSUTF16) (*inputOffset - 'a'); + break; + + case 'A': + case 'B': + case 'C': + case 'D': + case 'E': + case 'F': + sur[iSur] = (sur[iSur] << 4) + 10 + (JSUTF16) (*inputOffset - 'A'); + break; + } + + inputOffset ++; + } + + + if (iSur == 0) + { + if((sur[iSur] & 0xfc00) == 0xd800) + { + // First of a surrogate pair, continue parsing + iSur ++; + break; + } + (*escOffset++) = (wchar_t) sur[iSur]; + iSur = 0; + } + else + { + // Decode pair + if ((sur[1] & 0xfc00) != 0xdc00) + { + return SetError (ds, -1, "Unpaired high surrogate when decoding 'string'"); + } + +#if WCHAR_MAX == 0xffff + (*escOffset++) = (wchar_t) sur[0]; + 
(*escOffset++) = (wchar_t) sur[1]; +#else + (*escOffset++) = (wchar_t) 0x10000 + (((sur[0] - 0xd800) << 10) | (sur[1] - 0xdc00)); +#endif + iSur = 0; + } + break; + } + + case '\0': return SetError(ds, -1, "Unterminated escape sequence when decoding 'string'"); + default: return SetError(ds, -1, "Unrecognized escape sequence when decoding 'string'"); + } + break; + + case 1: + *(escOffset++) = (wchar_t) (*inputOffset++); + break; + + case 2: + { + ucs = (*inputOffset++) & 0x1f; + ucs <<= 6; + if (((*inputOffset) & 0x80) != 0x80) + { + return SetError(ds, -1, "Invalid octet in UTF-8 sequence when decoding 'string'"); + } + ucs |= (*inputOffset++) & 0x3f; + if (ucs < 0x80) return SetError (ds, -1, "Overlong 2 byte UTF-8 sequence detected when decoding 'string'"); + *(escOffset++) = (wchar_t) ucs; + break; + } + + case 3: + { + JSUTF32 ucs = 0; + ucs |= (*inputOffset++) & 0x0f; + + for (index = 0; index < 2; index ++) + { + ucs <<= 6; + oct = (*inputOffset++); + + if ((oct & 0x80) != 0x80) + { + return SetError(ds, -1, "Invalid octet in UTF-8 sequence when decoding 'string'"); + } + + ucs |= oct & 0x3f; + } + + if (ucs < 0x800) return SetError (ds, -1, "Overlong 3 byte UTF-8 sequence detected when encoding string"); + *(escOffset++) = (wchar_t) ucs; + break; + } + + case 4: + { + JSUTF32 ucs = 0; + ucs |= (*inputOffset++) & 0x07; + + for (index = 0; index < 3; index ++) + { + ucs <<= 6; + oct = (*inputOffset++); + + if ((oct & 0x80) != 0x80) + { + return SetError(ds, -1, "Invalid octet in UTF-8 sequence when decoding 'string'"); + } + + ucs |= oct & 0x3f; + } + + if (ucs < 0x10000) return SetError (ds, -1, "Overlong 4 byte UTF-8 sequence detected when decoding 'string'"); + + #if WCHAR_MAX == 0xffff + if (ucs >= 0x10000) + { + ucs -= 0x10000; + *(escOffset++) = (ucs >> 10) + 0xd800; + *(escOffset++) = (ucs & 0x3ff) + 0xdc00; + } + else + { + *(escOffset++) = (wchar_t) ucs; + } + #else + *(escOffset++) = (wchar_t) ucs; + #endif + break; + } + } + } +} + +FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_array( struct DecoderState *ds) +{ + JSOBJ itemValue; + JSOBJ newObj = ds->dec->newArray(ds->dec); + + ds->lastType = JT_INVALID; + ds->start ++; + + while (1)//(*ds->start) != '\0') + { + SkipWhitespace(ds); + + if ((*ds->start) == ']') + { + ds->start++; + return ds->dec->endArray(newObj); + } + + itemValue = decode_any(ds); + + if (itemValue == NULL) + { + ds->dec->releaseObject(newObj, ds->dec); + return NULL; + } + + if (!ds->dec->arrayAddItem (newObj, itemValue)) + { + ds->dec->releaseObject(newObj, ds->dec); + return NULL; + } + + SkipWhitespace(ds); + + switch (*(ds->start++)) + { + case ']': + return ds->dec->endArray(newObj); + + case ',': + break; + + default: + ds->dec->releaseObject(newObj, ds->dec); + return SetError(ds, -1, "Unexpected character in found when decoding array value"); + } + } + + ds->dec->releaseObject(newObj, ds->dec); + return SetError(ds, -1, "Unmatched ']' when decoding 'array'"); +} + + + +FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_object( struct DecoderState *ds) +{ + JSOBJ itemName; + JSOBJ itemValue; + JSOBJ newObj = ds->dec->newObject(ds->dec); + + ds->start ++; + + while (1) + { + SkipWhitespace(ds); + + if ((*ds->start) == '}') + { + ds->start ++; + return ds->dec->endObject(newObj); + } + + ds->lastType = JT_INVALID; + itemName = decode_any(ds); + + if (itemName == NULL) + { + ds->dec->releaseObject(newObj, ds->dec); + return NULL; + } + + if (ds->lastType != JT_UTF8) + { + ds->dec->releaseObject(newObj, ds->dec); + ds->dec->releaseObject(itemName, ds->dec); + 
return SetError(ds, -1, "Key name of object must be 'string' when decoding 'object'"); + } + + SkipWhitespace(ds); + + if (*(ds->start++) != ':') + { + ds->dec->releaseObject(newObj, ds->dec); + ds->dec->releaseObject(itemName, ds->dec); + return SetError(ds, -1, "No ':' found when decoding object value"); + } + + SkipWhitespace(ds); + + itemValue = decode_any(ds); + + if (itemValue == NULL) + { + ds->dec->releaseObject(newObj, ds->dec); + ds->dec->releaseObject(itemName, ds->dec); + return NULL; + } + + if (!ds->dec->objectAddKey (newObj, itemName, itemValue)) + { + ds->dec->releaseObject(newObj, ds->dec); + ds->dec->releaseObject(itemName, ds->dec); + ds->dec->releaseObject(itemValue, ds->dec); + return NULL; + } + + SkipWhitespace(ds); + + switch (*(ds->start++)) + { + case '}': + return ds->dec->endObject(newObj); + + case ',': + break; + + default: + ds->dec->releaseObject(newObj, ds->dec); + return SetError(ds, -1, "Unexpected character in found when decoding object value"); + } + } + + ds->dec->releaseObject(newObj, ds->dec); + return SetError(ds, -1, "Unmatched '}' when decoding object"); +} + +FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_any(struct DecoderState *ds) +{ + while (1) + { + switch (*ds->start) + { + case '\"': + return decode_string (ds); + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + case '-': + return decode_numeric (ds); + + case '[': return decode_array (ds); + case '{': return decode_object (ds); + case 't': return decode_true (ds); + case 'f': return decode_false (ds); + case 'n': return decode_null (ds); + + case ' ': + case '\t': + case '\r': + case '\n': + // White space + ds->start ++; + break; + + default: + return SetError(ds, -1, "Expected object or value"); + } + } +} + + +JSOBJ JSON_DecodeObject(JSONObjectDecoder *dec, const char *buffer, size_t cbBuffer) +{ + + /* + FIXME: Base the size of escBuffer of that of cbBuffer so that the unicode escaping doesn't run into the wall each time */ + struct DecoderState ds; + wchar_t escBuffer[(JSON_MAX_STACK_BUFFER_SIZE / sizeof(wchar_t))]; + JSOBJ ret; + + ds.start = (char *) buffer; + ds.end = ds.start + cbBuffer; + + ds.escStart = escBuffer; + ds.escEnd = ds.escStart + (JSON_MAX_STACK_BUFFER_SIZE / sizeof(wchar_t)); + ds.escHeap = 0; + ds.dec = dec; + ds.dec->errorStr = NULL; + ds.dec->errorOffset = NULL; + + ds.dec = dec; + + ret = decode_any (&ds); + + if (ds.escHeap) + { + dec->free(ds.escStart); + } + return ret; +} diff --git a/pandas/src/ujson/lib/ultrajsonenc.c b/pandas/src/ujson/lib/ultrajsonenc.c new file mode 100644 index 0000000000000..22871513870b7 --- /dev/null +++ b/pandas/src/ujson/lib/ultrajsonenc.c @@ -0,0 +1,891 @@ +/* +Copyright (c) 2011, Jonas Tarnstrom and ESN Social Software AB +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: +1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. +3. All advertising materials mentioning features or use of this software + must display the following acknowledgement: + This product includes software developed by ESN Social Software AB (www.esn.me). +4. 
Neither the name of the ESN Social Software AB nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY ESN SOCIAL SOFTWARE AB ''AS IS'' AND ANY +EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL ESN SOCIAL SOFTWARE AB BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +Portions of code from: +MODP_ASCII - Ascii transformations (upper/lower, etc) +http://code.google.com/p/stringencoders/ +Copyright (c) 2007 Nick Galbreath -- nickg [at] modp [dot] com. All rights reserved. + +*/ + +#include "ultrajson.h" +#include +#include +#include +#include +#include + +#include + +#ifndef TRUE +#define TRUE 1 +#endif +#ifndef FALSE +#define FALSE 0 +#endif + +static const double g_pow10[] = {1, 10, 100, 1000, 10000, 100000, 1000000, 10000000, 100000000, 1000000000, 10000000000, 100000000000, 1000000000000, 10000000000000, 100000000000000, 1000000000000000}; +static const char g_hexChars[] = "0123456789abcdef"; +static const char g_escapeChars[] = "0123456789\\b\\t\\n\\f\\r\\\"\\\\\\/"; + + +/* +FIXME: While this is fine dandy and working it's a magic value mess which probably only the author understands. +Needs a cleanup and more documentation */ + +/* +Table for pure ascii output escaping all characters above 127 to \uXXXX */ +static const JSUINT8 g_asciiOutputTable[256] = +{ +/* 0x00 */ 0, 30, 30, 30, 30, 30, 30, 30, 10, 12, 14, 30, 16, 18, 30, 30, +/* 0x10 */ 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, +/* 0x20 */ 1, 1, 20, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 24, +/* 0x30 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, +/* 0x40 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, +/* 0x50 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 22, 1, 1, 1, +/* 0x60 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, +/* 0x70 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, +/* 0x80 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, +/* 0x90 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, +/* 0xa0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, +/* 0xb0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, +/* 0xc0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, +/* 0xd0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, +/* 0xe0 */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, +/* 0xf0 */ 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 1, 1 +}; + + +static void SetError (JSOBJ obj, JSONObjectEncoder *enc, const char *message) +{ + enc->errorMsg = message; + enc->errorObj = obj; +} + +/* +FIXME: Keep track of how big these get across several encoder calls and try to make an estimate +That way we won't run our head into the wall each call */ +void Buffer_Realloc (JSONObjectEncoder *enc, size_t cbNeeded) +{ + size_t curSize = enc->end - enc->start; + size_t newSize = curSize * 2; + size_t offset = enc->offset - enc->start; + + while (newSize < curSize + cbNeeded) + { + newSize *= 2; + } + + if (enc->heap) + { + enc->start = (char *) 
enc->realloc (enc->start, newSize); + if (!enc->start) + { + SetError (NULL, enc, "Could not reserve memory block"); + return; + } + } + else + { + char *oldStart = enc->start; + enc->heap = 1; + enc->start = (char *) enc->malloc (newSize); + if (!enc->start) + { + SetError (NULL, enc, "Could not reserve memory block"); + return; + } + memcpy (enc->start, oldStart, offset); + } + enc->offset = enc->start + offset; + enc->end = enc->start + newSize; +} + +FASTCALL_ATTR INLINE_PREFIX void FASTCALL_MSVC Buffer_AppendShortHexUnchecked (char *outputOffset, unsigned short value) +{ + *(outputOffset++) = g_hexChars[(value & 0xf000) >> 12]; + *(outputOffset++) = g_hexChars[(value & 0x0f00) >> 8]; + *(outputOffset++) = g_hexChars[(value & 0x00f0) >> 4]; + *(outputOffset++) = g_hexChars[(value & 0x000f) >> 0]; +} + +int Buffer_EscapeStringUnvalidated (JSOBJ obj, JSONObjectEncoder *enc, const char *io, const char *end) +{ + char *of = (char *) enc->offset; + + while (1) + { + switch (*io) + { + case 0x00: + if (io < end) + { + *(of++) = '\\'; + *(of++) = 'u'; + *(of++) = '0'; + *(of++) = '0'; + *(of++) = '0'; + *(of++) = '0'; + break; + } + else + { + enc->offset += (of - enc->offset); + return TRUE; + } + + case '\"': (*of++) = '\\'; (*of++) = '\"'; break; + case '\\': (*of++) = '\\'; (*of++) = '\\'; break; + case '/': (*of++) = '\\'; (*of++) = '/'; break; + case '\b': (*of++) = '\\'; (*of++) = 'b'; break; + case '\f': (*of++) = '\\'; (*of++) = 'f'; break; + case '\n': (*of++) = '\\'; (*of++) = 'n'; break; + case '\r': (*of++) = '\\'; (*of++) = 'r'; break; + case '\t': (*of++) = '\\'; (*of++) = 't'; break; + + case 0x01: + case 0x02: + case 0x03: + case 0x04: + case 0x05: + case 0x06: + case 0x07: + case 0x0b: + case 0x0e: + case 0x0f: + case 0x10: + case 0x11: + case 0x12: + case 0x13: + case 0x14: + case 0x15: + case 0x16: + case 0x17: + case 0x18: + case 0x19: + case 0x1a: + case 0x1b: + case 0x1c: + case 0x1d: + case 0x1e: + case 0x1f: + *(of++) = '\\'; + *(of++) = 'u'; + *(of++) = '0'; + *(of++) = '0'; + *(of++) = g_hexChars[ (unsigned char) (((*io) & 0xf0) >> 4)]; + *(of++) = g_hexChars[ (unsigned char) ((*io) & 0x0f)]; + break; + + default: (*of++) = (*io); break; + } + + io++; + } + + return FALSE; +} + + +/* +FIXME: +This code only works with Little and Big Endian + +FIXME: The JSON spec says escape "/" but non of the others do and we don't +want to be left alone doing it so we don't :) + +*/ +int Buffer_EscapeStringValidated (JSOBJ obj, JSONObjectEncoder *enc, const char *io, const char *end) +{ + JSUTF32 ucs; + char *of = (char *) enc->offset; + + while (1) + { + + //JSUINT8 chr = (unsigned char) *io; + JSUINT8 utflen = g_asciiOutputTable[(unsigned char) *io]; + + switch (utflen) + { + case 0: + { + if (io < end) + { + *(of++) = '\\'; + *(of++) = 'u'; + *(of++) = '0'; + *(of++) = '0'; + *(of++) = '0'; + *(of++) = '0'; + io ++; + continue; + } + else + { + enc->offset += (of - enc->offset); + return TRUE; + } + } + + case 1: + { + *(of++)= (*io++); + continue; + } + + case 2: + { + JSUTF32 in; + JSUTF16 in16; + + if (end - io < 1) + { + enc->offset += (of - enc->offset); + SetError (obj, enc, "Unterminated UTF-8 sequence when encoding string"); + return FALSE; + } + + memcpy(&in16, io, sizeof(JSUTF16)); + in = (JSUTF32) in16; + +#ifdef __LITTLE_ENDIAN__ + ucs = ((in & 0x1f) << 6) | ((in >> 8) & 0x3f); +#else + ucs = ((in & 0x1f00) >> 2) | (in & 0x3f); +#endif + + if (ucs < 0x80) + { + enc->offset += (of - enc->offset); + SetError (obj, enc, "Overlong 2 byte UTF-8 sequence detected when 
encoding string"); + return FALSE; + } + + io += 2; + break; + } + + case 3: + { + JSUTF32 in; + JSUTF16 in16; + JSUINT8 in8; + + if (end - io < 2) + { + enc->offset += (of - enc->offset); + SetError (obj, enc, "Unterminated UTF-8 sequence when encoding string"); + return FALSE; + } + + memcpy(&in16, io, sizeof(JSUTF16)); + memcpy(&in8, io + 2, sizeof(JSUINT8)); +#ifdef __LITTLE_ENDIAN__ + in = (JSUTF32) in16; + in |= in8 << 16; + ucs = ((in & 0x0f) << 12) | ((in & 0x3f00) >> 2) | ((in & 0x3f0000) >> 16); +#else + in = in16 << 8; + in |= in8; + ucs = ((in & 0x0f0000) >> 4) | ((in & 0x3f00) >> 2) | (in & 0x3f); +#endif + + + if (ucs < 0x800) + { + enc->offset += (of - enc->offset); + SetError (obj, enc, "Overlong 3 byte UTF-8 sequence detected when encoding string"); + return FALSE; + } + + io += 3; + break; + } + case 4: + { + JSUTF32 in; + + if (end - io < 3) + { + enc->offset += (of - enc->offset); + SetError (obj, enc, "Unterminated UTF-8 sequence when encoding string"); + return FALSE; + } + + memcpy(&in, io, sizeof(JSUTF32)); +#ifdef __LITTLE_ENDIAN__ + ucs = ((in & 0x07) << 18) | ((in & 0x3f00) << 4) | ((in & 0x3f0000) >> 10) | ((in & 0x3f000000) >> 24); +#else + ucs = ((in & 0x07000000) >> 6) | ((in & 0x3f0000) >> 4) | ((in & 0x3f00) >> 2) | (in & 0x3f); +#endif + if (ucs < 0x10000) + { + enc->offset += (of - enc->offset); + SetError (obj, enc, "Overlong 4 byte UTF-8 sequence detected when encoding string"); + return FALSE; + } + + io += 4; + break; + } + + + case 5: + case 6: + enc->offset += (of - enc->offset); + SetError (obj, enc, "Unsupported UTF-8 sequence length when encoding string"); + return FALSE; + + case 30: + // \uXXXX encode + *(of++) = '\\'; + *(of++) = 'u'; + *(of++) = '0'; + *(of++) = '0'; + *(of++) = g_hexChars[ (unsigned char) (((*io) & 0xf0) >> 4)]; + *(of++) = g_hexChars[ (unsigned char) ((*io) & 0x0f)]; + io ++; + continue; + + case 10: + case 12: + case 14: + case 16: + case 18: + case 20: + case 22: + case 24: + *(of++) = *( (char *) (g_escapeChars + utflen + 0)); + *(of++) = *( (char *) (g_escapeChars + utflen + 1)); + io ++; + continue; + } + + /* + If the character is a UTF8 sequence of length > 1 we end up here */ + if (ucs >= 0x10000) + { + ucs -= 0x10000; + *(of++) = '\\'; + *(of++) = 'u'; + Buffer_AppendShortHexUnchecked(of, (ucs >> 10) + 0xd800); + of += 4; + + *(of++) = '\\'; + *(of++) = 'u'; + Buffer_AppendShortHexUnchecked(of, (ucs & 0x3ff) + 0xdc00); + of += 4; + } + else + { + *(of++) = '\\'; + *(of++) = 'u'; + Buffer_AppendShortHexUnchecked(of, ucs); + of += 4; + } + } + + return FALSE; +} + +#define Buffer_Reserve(__enc, __len) \ + if ((__enc)->end - (__enc)->offset < (__len)) \ + { \ + Buffer_Realloc((__enc), (__len));\ + } \ + + +#define Buffer_AppendCharUnchecked(__enc, __chr) \ + *((__enc)->offset++) = __chr; \ + +FASTCALL_ATTR INLINE_PREFIX void FASTCALL_MSVC strreverse(char* begin, char* end) +{ + char aux; + while (end > begin) + aux = *end, *end-- = *begin, *begin++ = aux; +} + +void Buffer_AppendIntUnchecked(JSONObjectEncoder *enc, JSINT32 value) +{ + char* wstr; + JSUINT32 uvalue = (value < 0) ? -value : value; + + wstr = enc->offset; + // Conversion. Number is reversed. + + do *wstr++ = (char)(48 + (uvalue % 10)); while(uvalue /= 10); + if (value < 0) *wstr++ = '-'; + + // Reverse string + strreverse(enc->offset,wstr - 1); + enc->offset += (wstr - (enc->offset)); +} + +void Buffer_AppendLongUnchecked(JSONObjectEncoder *enc, JSINT64 value) +{ + char* wstr; + JSUINT64 uvalue = (value < 0) ? 
-value : value; + + wstr = enc->offset; + // Conversion. Number is reversed. + + do *wstr++ = (char)(48 + (uvalue % 10ULL)); while(uvalue /= 10ULL); + if (value < 0) *wstr++ = '-'; + + // Reverse string + strreverse(enc->offset,wstr - 1); + enc->offset += (wstr - (enc->offset)); +} + +int Buffer_AppendDoubleUnchecked(JSOBJ obj, JSONObjectEncoder *enc, double value) +{ + /* if input is larger than thres_max, revert to exponential */ + const double thres_max = (double) 1e16 - 1; + int count; + double diff = 0.0; + char* str = enc->offset; + char* wstr = str; + unsigned long long whole; + double tmp; + unsigned long long frac; + int neg; + double pow10; + + if (value == HUGE_VAL || value == -HUGE_VAL) + { + SetError (obj, enc, "Invalid Inf value when encoding double"); + return FALSE; + } + if (! (value == value)) + { + SetError (obj, enc, "Invalid Nan value when encoding double"); + return FALSE; + } + + + /* we'll work in positive values and deal with the + negative sign issue later */ + neg = 0; + if (value < 0) + { + neg = 1; + value = -value; + } + + pow10 = g_pow10[enc->doublePrecision]; + + whole = (unsigned long long) value; + tmp = (value - whole) * pow10; + frac = (unsigned long long)(tmp); + diff = tmp - frac; + + if (diff > 0.5) + { + ++frac; + /* handle rollover, e.g. case 0.99 with prec 1 is 1.0 */ + if (frac >= pow10) + { + frac = 0; + ++whole; + } + } + else + if (diff == 0.5 && ((frac == 0) || (frac & 1))) + { + /* if halfway, round up if odd, OR + if last digit is 0. That last part is strange */ + ++frac; + } + + /* for very large numbers switch back to native sprintf for exponentials. + anyone want to write code to replace this? */ + /* + normal printf behavior is to print EVERY whole number digit + which can be 100s of characters overflowing your buffers == bad + */ + if (value > thres_max) + { + enc->offset += sprintf(str, "%.15e", neg ? -value : value); + return TRUE; + } + + if (enc->doublePrecision == 0) + { + diff = value - whole; + + if (diff > 0.5) + { + /* greater than 0.5, round up, e.g. 1.6 -> 2 */ + ++whole; + } + else + if (diff == 0.5 && (whole & 1)) + { + /* exactly 0.5 and ODD, then round up */ + /* 1.5 -> 2, but 2.5 -> 2 */ + ++whole; + } + + //vvvvvvvvvvvvvvvvvvv Diff from modp_dto2 + } + else + if (frac) + { + count = enc->doublePrecision; + // now do fractional part, as an unsigned number + // we know it is not 0 but we can have leading zeros, these + // should be removed + while (!(frac % 10)) + { + --count; + frac /= 10; + } + //^^^^^^^^^^^^^^^^^^^ Diff from modp_dto2 + + // now do fractional part, as an unsigned number + do + { + --count; + *wstr++ = (char)(48 + (frac % 10)); + } while (frac /= 10); + // add extra 0s + while (count-- > 0) + { + *wstr++ = '0'; + } + // add decimal + *wstr++ = '.'; + } + else + { + *wstr++ = '0'; + *wstr++ = '.'; + } + + // do whole part + // Take care of sign + // Conversion. Number is reversed. 
+ do *wstr++ = (char)(48 + (whole % 10)); while (whole /= 10); + + if (neg) + { + *wstr++ = '-'; + } + strreverse(str, wstr-1); + enc->offset += (wstr - (enc->offset)); + + return TRUE; +} + + + + + + +/* +FIXME: +Handle integration functions returning NULL here */ + +/* +FIXME: +Perhaps implement recursion detection */ + +void encode(JSOBJ obj, JSONObjectEncoder *enc, const char *name, size_t cbName) +{ + const char *value; + char *objName; + int count; + JSOBJ iterObj; + size_t szlen; + JSONTypeContext tc; + tc.encoder = enc; + + if (enc->level > enc->recursionMax) + { + SetError (obj, enc, "Maximum recursion level reached"); + return; + } + + /* + This reservation must hold + + length of _name as encoded worst case + + maxLength of double to string OR maxLength of JSLONG to string + + Since input is assumed to be UTF-8 the worst character length is: + + 4 bytes (of UTF-8) => "\uXXXX\uXXXX" (12 bytes) + */ + + Buffer_Reserve(enc, 256 + (((cbName / 4) + 1) * 12)); + if (enc->errorMsg) + { + return; + } + + if (name) + { + Buffer_AppendCharUnchecked(enc, '\"'); + + if (enc->forceASCII) + { + if (!Buffer_EscapeStringValidated(obj, enc, name, name + cbName)) + { + return; + } + } + else + { + if (!Buffer_EscapeStringUnvalidated(obj, enc, name, name + cbName)) + { + return; + } + } + + + Buffer_AppendCharUnchecked(enc, '\"'); + + Buffer_AppendCharUnchecked (enc, ':'); +#ifndef JSON_NO_EXTRA_WHITESPACE + Buffer_AppendCharUnchecked (enc, ' '); +#endif + } + + enc->beginTypeContext(obj, &tc); + + switch (tc.type) + { + case JT_INVALID: + return; + + case JT_ARRAY: + { + count = 0; + enc->iterBegin(obj, &tc); + + Buffer_AppendCharUnchecked (enc, '['); + + while (enc->iterNext(obj, &tc)) + { + if (count > 0) + { + Buffer_AppendCharUnchecked (enc, ','); +#ifndef JSON_NO_EXTRA_WHITESPACE + Buffer_AppendCharUnchecked (buffer, ' '); +#endif + } + + iterObj = enc->iterGetValue(obj, &tc); + + enc->level ++; + encode (iterObj, enc, NULL, 0); + count ++; + } + + enc->iterEnd(obj, &tc); + Buffer_AppendCharUnchecked (enc, ']'); + break; + } + + case JT_OBJECT: + { + count = 0; + enc->iterBegin(obj, &tc); + + Buffer_AppendCharUnchecked (enc, '{'); + + while (enc->iterNext(obj, &tc)) + { + if (count > 0) + { + Buffer_AppendCharUnchecked (enc, ','); +#ifndef JSON_NO_EXTRA_WHITESPACE + Buffer_AppendCharUnchecked (enc, ' '); +#endif + } + + iterObj = enc->iterGetValue(obj, &tc); + objName = enc->iterGetName(obj, &tc, &szlen); + + enc->level ++; + encode (iterObj, enc, objName, szlen); + count ++; + } + + enc->iterEnd(obj, &tc); + Buffer_AppendCharUnchecked (enc, '}'); + break; + } + + case JT_LONG: + { + Buffer_AppendLongUnchecked (enc, enc->getLongValue(obj, &tc)); + break; + } + + case JT_INT: + { + Buffer_AppendIntUnchecked (enc, enc->getIntValue(obj, &tc)); + break; + } + + case JT_TRUE: + { + Buffer_AppendCharUnchecked (enc, 't'); + Buffer_AppendCharUnchecked (enc, 'r'); + Buffer_AppendCharUnchecked (enc, 'u'); + Buffer_AppendCharUnchecked (enc, 'e'); + break; + } + + case JT_FALSE: + { + Buffer_AppendCharUnchecked (enc, 'f'); + Buffer_AppendCharUnchecked (enc, 'a'); + Buffer_AppendCharUnchecked (enc, 'l'); + Buffer_AppendCharUnchecked (enc, 's'); + Buffer_AppendCharUnchecked (enc, 'e'); + break; + } + + + case JT_NULL: + { + Buffer_AppendCharUnchecked (enc, 'n'); + Buffer_AppendCharUnchecked (enc, 'u'); + Buffer_AppendCharUnchecked (enc, 'l'); + Buffer_AppendCharUnchecked (enc, 'l'); + break; + } + + case JT_DOUBLE: + { + if (!Buffer_AppendDoubleUnchecked (obj, enc, enc->getDoubleValue(obj, &tc))) + { + 
enc->endTypeContext(obj, &tc); + enc->level --; + return; + } + break; + } + + case JT_UTF8: + { + value = enc->getStringValue(obj, &tc, &szlen); + Buffer_Reserve(enc, ((szlen / 4) + 1) * 12); + if (enc->errorMsg) + { + enc->endTypeContext(obj, &tc); + return; + } + Buffer_AppendCharUnchecked (enc, '\"'); + + + if (enc->forceASCII) + { + if (!Buffer_EscapeStringValidated(obj, enc, value, value + szlen)) + { + enc->endTypeContext(obj, &tc); + enc->level --; + return; + } + } + else + { + if (!Buffer_EscapeStringUnvalidated(obj, enc, value, value + szlen)) + { + enc->endTypeContext(obj, &tc); + enc->level --; + return; + } + } + + Buffer_AppendCharUnchecked (enc, '\"'); + break; + } + } + + enc->endTypeContext(obj, &tc); + enc->level --; + +} + +char *JSON_EncodeObject(JSOBJ obj, JSONObjectEncoder *enc, char *_buffer, size_t _cbBuffer) +{ + enc->malloc = enc->malloc ? enc->malloc : malloc; + enc->free = enc->free ? enc->free : free; + enc->realloc = enc->realloc ? enc->realloc : realloc; + enc->errorMsg = NULL; + enc->errorObj = NULL; + enc->level = 0; + + if (enc->recursionMax < 1) + { + enc->recursionMax = JSON_MAX_RECURSION_DEPTH; + } + + if (enc->doublePrecision < 0 || + enc->doublePrecision > JSON_DOUBLE_MAX_DECIMALS) + { + enc->doublePrecision = JSON_DOUBLE_MAX_DECIMALS; + } + + if (_buffer == NULL) + { + _cbBuffer = 32768; + enc->start = (char *) enc->malloc (_cbBuffer); + if (!enc->start) + { + SetError(obj, enc, "Could not reserve memory block"); + return NULL; + } + enc->heap = 1; + } + else + { + enc->start = _buffer; + enc->heap = 0; + } + + enc->end = enc->start + _cbBuffer; + enc->offset = enc->start; + + + encode (obj, enc, NULL, 0); + + Buffer_Reserve(enc, 1); + if (enc->errorMsg) + { + return NULL; + } + Buffer_AppendCharUnchecked(enc, '\0'); + + return enc->start; +} diff --git a/pandas/src/ujson/python/JSONtoObj.c b/pandas/src/ujson/python/JSONtoObj.c new file mode 100644 index 0000000000000..1db7586ad17f7 --- /dev/null +++ b/pandas/src/ujson/python/JSONtoObj.c @@ -0,0 +1,674 @@ +#include "py_defines.h" +#define PY_ARRAY_UNIQUE_SYMBOL UJSON_NUMPY +#define NO_IMPORT_ARRAY +#include +#include + + +typedef struct __PyObjectDecoder +{ + JSONObjectDecoder dec; + + void* npyarr; // Numpy context buffer + npy_intp curdim; // Current array dimension + + PyArray_Descr* dtype; +} PyObjectDecoder; + +typedef struct __NpyArrContext +{ + PyObject* ret; + PyObject* labels[2]; + PyArray_Dims shape; + + PyObjectDecoder* dec; + + npy_intp i; + npy_intp elsize; + npy_intp elcount; +} NpyArrContext; + +//#define PRINTMARK() fprintf(stderr, "%s: MARK(%d)\n", __FILE__, __LINE__) +#define PRINTMARK() + +// Numpy handling based on numpy internal code, specifically the function +// PyArray_FromIter. + +// numpy related functions are inter-dependent so declare them all here, +// to ensure the compiler catches any errors + +// standard numpy array handling +JSOBJ Object_npyNewArray(void* decoder); +JSOBJ Object_npyEndArray(JSOBJ obj); +int Object_npyArrayAddItem(JSOBJ obj, JSOBJ value); + +// for more complex dtypes (object and string) fill a standard Python list +// and convert to a numpy array when done. 
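The split between the two strategies above is visible from Python. A rough sketch, assuming the module is importable as pandas.json and using the decode entry point and the numpy/labelled/dtype keywords defined further on in this patch (the forward declarations for both paths follow below):

    from pandas.json import loads

    # fixed-size dtypes are written straight into a growing ndarray
    arr = loads('[[1, 2], [3, 4]]', numpy=True)      # 2x2 integer array, reshaped once decoding ends

    # labelled decoding returns the decoded key arrays alongside the values
    values, outer_keys, inner_keys = loads('{"a":{"x":1},"b":{"x":2}}',
                                           numpy=True, labelled=True)

    # object and string dtypes fall back to a plain list, converted with PyArray_FROM_O at the end
    strs = loads('["foo", "bar"]', numpy=True)
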
+JSOBJ Object_npyNewArrayList(void* decoder); +JSOBJ Object_npyEndArrayList(JSOBJ obj); +int Object_npyArrayListAddItem(JSOBJ obj, JSOBJ value); + +// labelled support, encode keys and values of JS object into separate numpy +// arrays +JSOBJ Object_npyNewObject(void* decoder); +JSOBJ Object_npyEndObject(JSOBJ obj); +int Object_npyObjectAddKey(JSOBJ obj, JSOBJ name, JSOBJ value); + + +// free the numpy context buffer +void Npy_releaseContext(NpyArrContext* npyarr) +{ + PRINTMARK(); + if (npyarr) + { + if (npyarr->shape.ptr) + { + PyObject_Free(npyarr->shape.ptr); + } + if (npyarr->dec) + { + // Don't set to null, used to make sure we don't Py_DECREF npyarr + // in releaseObject + // npyarr->dec->npyarr = NULL; + npyarr->dec->curdim = 0; + } + Py_XDECREF(npyarr->labels[0]); + Py_XDECREF(npyarr->labels[1]); + Py_XDECREF(npyarr->ret); + PyObject_Free(npyarr); + } +} + +JSOBJ Object_npyNewArray(void* _decoder) +{ + NpyArrContext* npyarr; + PyObjectDecoder* decoder = (PyObjectDecoder*) _decoder; + PRINTMARK(); + if (decoder->curdim <= 0) + { + // start of array - initialise the context buffer + npyarr = decoder->npyarr = PyObject_Malloc(sizeof(NpyArrContext)); + + if (!npyarr) + { + PyErr_NoMemory(); + return NULL; + } + + npyarr->dec = decoder; + npyarr->labels[0] = npyarr->labels[1] = NULL; + + npyarr->shape.ptr = PyObject_Malloc(sizeof(npy_intp)*NPY_MAXDIMS); + npyarr->shape.len = 1; + npyarr->ret = NULL; + + npyarr->elsize = 0; + npyarr->elcount = 4; + npyarr->i = 0; + } + else + { + // starting a new dimension continue the current array (and reshape after) + npyarr = (NpyArrContext*) decoder->npyarr; + if (decoder->curdim >= npyarr->shape.len) + { + npyarr->shape.len++; + } + } + + npyarr->shape.ptr[decoder->curdim] = 0; + decoder->curdim++; + return npyarr; +} + +PyObject* Npy_returnLabelled(NpyArrContext* npyarr) +{ + PyObject* ret = npyarr->ret; + npy_intp i; + + if (npyarr->labels[0] || npyarr->labels[1]) + { + // finished decoding, build tuple with values and labels + ret = PyTuple_New(npyarr->shape.len+1); + for (i = 0; i < npyarr->shape.len; i++) + { + if (npyarr->labels[i]) + { + PyTuple_SET_ITEM(ret, i+1, npyarr->labels[i]); + npyarr->labels[i] = NULL; + } + else + { + Py_INCREF(Py_None); + PyTuple_SET_ITEM(ret, i+1, Py_None); + } + } + PyTuple_SET_ITEM(ret, 0, npyarr->ret); + } + + return ret; +} + +JSOBJ Object_npyEndArray(JSOBJ obj) +{ + PyObject *ret; + char* new_data; + NpyArrContext* npyarr = (NpyArrContext*) obj; + int emptyType = NPY_DEFAULT_TYPE; + npy_intp i; + PRINTMARK(); + if (!npyarr) + { + return NULL; + } + + ret = npyarr->ret; + i = npyarr->i; + + npyarr->dec->curdim--; + + if (i == 0 || !npyarr->ret) { + // empty array would not have been initialised so do it now. 
+ if (npyarr->dec->dtype) + { + emptyType = npyarr->dec->dtype->type_num; + } + npyarr->ret = ret = PyArray_EMPTY(npyarr->shape.len, npyarr->shape.ptr, emptyType, 0); + } + else if (npyarr->dec->curdim <= 0) + { + // realloc to final size + new_data = PyDataMem_RENEW(PyArray_DATA(ret), i * npyarr->elsize); + if (new_data == NULL) { + PyErr_NoMemory(); + Npy_releaseContext(npyarr); + return NULL; + } + ((PyArrayObject*) ret)->data = (void*) new_data; + // PyArray_BYTES(ret) = new_data; + } + + if (npyarr->dec->curdim <= 0) + { + // finished decoding array, reshape if necessary + if (npyarr->shape.len > 1) + { + npyarr->ret = PyArray_Newshape((PyArrayObject*) ret, &npyarr->shape, NPY_ANYORDER); + Py_DECREF(ret); + } + + ret = Npy_returnLabelled(npyarr); + + npyarr->ret = NULL; + Npy_releaseContext(npyarr); + } + + return ret; +} + +int Object_npyArrayAddItem(JSOBJ obj, JSOBJ value) +{ + PyObject* type; + PyArray_Descr* dtype; + npy_intp i; + char *new_data, *item; + NpyArrContext* npyarr = (NpyArrContext*) obj; + PRINTMARK(); + if (!npyarr) + { + return 0; + } + + i = npyarr->i; + + npyarr->shape.ptr[npyarr->dec->curdim-1]++; + + if (PyArray_Check((PyObject*)value)) + { + // multidimensional array, keep decoding values. + return 1; + } + + if (!npyarr->ret) + { + // Array not initialised yet. + // We do it here so we can 'sniff' the data type if none was provided + if (!npyarr->dec->dtype) + { + type = PyObject_Type(value); + if(!PyArray_DescrConverter(type, &dtype)) + { + Py_DECREF(type); + goto fail; + } + Py_INCREF(dtype); + Py_DECREF(type); + } + else + { + dtype = PyArray_DescrNew(npyarr->dec->dtype); + } + + // If it's an object or string then fill a Python list and subsequently + // convert. Otherwise we would need to somehow mess about with + // reference counts when renewing memory. + npyarr->elsize = dtype->elsize; + if (PyDataType_REFCHK(dtype) || npyarr->elsize == 0) + { + Py_XDECREF(dtype); + + if (npyarr->dec->curdim > 1) + { + PyErr_SetString(PyExc_ValueError, "Cannot decode multidimensional arrays with variable length elements to numpy"); + goto fail; + } + npyarr->elcount = 0; + npyarr->ret = PyList_New(0); + if (!npyarr->ret) + { + goto fail; + } + ((JSONObjectDecoder*)npyarr->dec)->newArray = Object_npyNewArrayList; + ((JSONObjectDecoder*)npyarr->dec)->arrayAddItem = Object_npyArrayListAddItem; + ((JSONObjectDecoder*)npyarr->dec)->endArray = Object_npyEndArrayList; + return Object_npyArrayListAddItem(obj, value); + } + + npyarr->ret = PyArray_NewFromDescr(&PyArray_Type, dtype, 1, + &npyarr->elcount, NULL,NULL, 0, NULL); + + if (!npyarr->ret) + { + goto fail; + } + } + + if (i >= npyarr->elcount) { + // Grow PyArray_DATA(ret): + // this is similar for the strategy for PyListObject, but we use + // 50% overallocation => 0, 4, 8, 14, 23, 36, 56, 86 ... + if (npyarr->elsize == 0) + { + PyErr_SetString(PyExc_ValueError, "Cannot decode multidimensional arrays with variable length elements to numpy"); + goto fail; + } + + npyarr->elcount = (i >> 1) + (i < 4 ? 
4 : 2) + i; + if (npyarr->elcount <= NPY_MAX_INTP/npyarr->elsize) { + new_data = PyDataMem_RENEW(PyArray_DATA(npyarr->ret), npyarr->elcount * npyarr->elsize); + } + else { + PyErr_NoMemory(); + goto fail; + } + ((PyArrayObject*) npyarr->ret)->data = (void*) new_data; + + // PyArray_BYTES(npyarr->ret) = new_data; + } + + PyArray_DIMS(npyarr->ret)[0] = i + 1; + + if ((item = PyArray_GETPTR1(npyarr->ret, i)) == NULL + || PyArray_SETITEM(npyarr->ret, item, value) == -1) { + goto fail; + } + + Py_DECREF( (PyObject *) value); + npyarr->i++; + return 1; + +fail: + + Npy_releaseContext(npyarr); + return 0; +} + +JSOBJ Object_npyNewArrayList(void* _decoder) +{ + PyObjectDecoder* decoder = (PyObjectDecoder*) _decoder; + PRINTMARK(); + PyErr_SetString(PyExc_ValueError, "nesting not supported for object or variable length dtypes"); + Npy_releaseContext(decoder->npyarr); + return NULL; +} + +JSOBJ Object_npyEndArrayList(JSOBJ obj) +{ + PyObject *list, *ret; + NpyArrContext* npyarr = (NpyArrContext*) obj; + PRINTMARK(); + if (!npyarr) + { + return NULL; + } + + // convert decoded list to numpy array + list = (PyObject *) npyarr->ret; + npyarr->ret = PyArray_FROM_O(list); + + ret = Npy_returnLabelled(npyarr); + npyarr->ret = list; + + ((JSONObjectDecoder*)npyarr->dec)->newArray = Object_npyNewArray; + ((JSONObjectDecoder*)npyarr->dec)->arrayAddItem = Object_npyArrayAddItem; + ((JSONObjectDecoder*)npyarr->dec)->endArray = Object_npyEndArray; + Npy_releaseContext(npyarr); + return ret; +} + +int Object_npyArrayListAddItem(JSOBJ obj, JSOBJ value) +{ + NpyArrContext* npyarr = (NpyArrContext*) obj; + PRINTMARK(); + if (!npyarr) + { + return 0; + } + PyList_Append((PyObject*) npyarr->ret, value); + Py_DECREF( (PyObject *) value); + npyarr->elcount++; + return 1; +} + + +JSOBJ Object_npyNewObject(void* _decoder) +{ + PyObjectDecoder* decoder = (PyObjectDecoder*) _decoder; + PRINTMARK(); + if (decoder->curdim > 1) + { + PyErr_SetString(PyExc_ValueError, "labels only supported up to 2 dimensions"); + return NULL; + } + + return ((JSONObjectDecoder*)decoder)->newArray(decoder); +} + +JSOBJ Object_npyEndObject(JSOBJ obj) +{ + PyObject *list; + npy_intp labelidx; + NpyArrContext* npyarr = (NpyArrContext*) obj; + PRINTMARK(); + if (!npyarr) + { + return NULL; + } + + labelidx = npyarr->dec->curdim-1; + + list = npyarr->labels[labelidx]; + if (list) + { + npyarr->labels[labelidx] = PyArray_FROM_O(list); + Py_DECREF(list); + } + + return (PyObject*) ((JSONObjectDecoder*)npyarr->dec)->endArray(obj); +} + +int Object_npyObjectAddKey(JSOBJ obj, JSOBJ name, JSOBJ value) +{ + PyObject *label; + npy_intp labelidx; + // add key to label array, value to values array + NpyArrContext* npyarr = (NpyArrContext*) obj; + PRINTMARK(); + if (!npyarr) + { + return 0; + } + + label = (PyObject*) name; + labelidx = npyarr->dec->curdim-1; + + if (!npyarr->labels[labelidx]) + { + npyarr->labels[labelidx] = PyList_New(0); + } + + // only fill label array once, assumes all column labels are the same + // for 2-dimensional arrays. 
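As an aside on the elcount growth rule in Object_npyArrayAddItem above: the expression reproduces the 50% over-allocation sequence quoted in its comment. A quick check in plain Python (illustration only):

    n, sizes = 0, []
    for _ in range(7):
        n = (n >> 1) + (4 if n < 4 else 2) + n
        sizes.append(n)
    print(sizes)   # [4, 8, 14, 23, 36, 56, 86] -- the 0, 4, 8, 14, 23, 36, 56, 86 ... sequence above
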
+ if (PyList_GET_SIZE(npyarr->labels[labelidx]) <= npyarr->elcount) + { + PyList_Append(npyarr->labels[labelidx], label); + } + + if(((JSONObjectDecoder*)npyarr->dec)->arrayAddItem(obj, value)) + { + Py_DECREF(label); + return 1; + } + return 0; +} + +int Object_objectAddKey(JSOBJ obj, JSOBJ name, JSOBJ value) +{ + PyDict_SetItem (obj, name, value); + Py_DECREF( (PyObject *) name); + Py_DECREF( (PyObject *) value); + return 1; +} + +int Object_arrayAddItem(JSOBJ obj, JSOBJ value) +{ + PyList_Append(obj, value); + Py_DECREF( (PyObject *) value); + return 1; +} + +JSOBJ Object_newString(wchar_t *start, wchar_t *end) +{ + return PyUnicode_FromWideChar (start, (end - start)); +} + +JSOBJ Object_newTrue(void) +{ + Py_RETURN_TRUE; +} + +JSOBJ Object_newFalse(void) +{ + Py_RETURN_FALSE; +} + +JSOBJ Object_newNull(void) +{ + Py_RETURN_NONE; +} + +JSOBJ Object_newObject(void* decoder) +{ + return PyDict_New(); +} + +JSOBJ Object_endObject(JSOBJ obj) +{ + return obj; +} + +JSOBJ Object_newArray(void* decoder) +{ + return PyList_New(0); +} + +JSOBJ Object_endArray(JSOBJ obj) +{ + return obj; +} + +JSOBJ Object_newInteger(JSINT32 value) +{ + return PyInt_FromLong( (long) value); +} + +JSOBJ Object_newLong(JSINT64 value) +{ + return PyLong_FromLongLong (value); +} + +JSOBJ Object_newDouble(double value) +{ + return PyFloat_FromDouble(value); +} + +static void Object_releaseObject(JSOBJ obj, void* _decoder) +{ + PyObjectDecoder* decoder = (PyObjectDecoder*) _decoder; + if (obj != decoder->npyarr) + { + Py_XDECREF( ((PyObject *)obj)); + } +} + + +PyObject* JSONToObj(PyObject* self, PyObject *args, PyObject *kwargs) +{ + PyObject *ret; + PyObject *sarg; + JSONObjectDecoder *decoder; + PyObjectDecoder pyDecoder; + PyArray_Descr *dtype = NULL; + static char *kwlist[] = { "obj", "numpy", "labelled", "dtype", NULL}; + int numpy = 0, labelled = 0, decref = 0; + // PRINTMARK(); + + JSONObjectDecoder dec = { + Object_newString, + Object_objectAddKey, + Object_arrayAddItem, + Object_newTrue, + Object_newFalse, + Object_newNull, + Object_newObject, + Object_endObject, + Object_newArray, + Object_endArray, + Object_newInteger, + Object_newLong, + Object_newDouble, + Object_releaseObject, + PyObject_Malloc, + PyObject_Free, + PyObject_Realloc, + }; + pyDecoder.dec = dec; + pyDecoder.curdim = 0; + pyDecoder.npyarr = NULL; + + decoder = (JSONObjectDecoder*) &pyDecoder; + + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|iiO&", kwlist, &sarg, &numpy, &labelled, PyArray_DescrConverter2, &dtype)) + { + return NULL; + } + + if (PyUnicode_Check(sarg)) + { + sarg = PyUnicode_AsUTF8String(sarg); + if (sarg == NULL) + { + //Exception raised above us by codec according to docs + return NULL; + } + decref = 1; + } + else + if (!PyString_Check(sarg)) + { + PyErr_Format(PyExc_TypeError, "Expected String or Unicode"); + return NULL; + } + + if (numpy) + { + pyDecoder.dtype = dtype; + decoder->newArray = Object_npyNewArray; + decoder->endArray = Object_npyEndArray; + decoder->arrayAddItem = Object_npyArrayAddItem; + + if (labelled) + { + decoder->newObject = Object_npyNewObject; + decoder->endObject = Object_npyEndObject; + decoder->objectAddKey = Object_npyObjectAddKey; + } + } + + decoder->errorStr = NULL; + decoder->errorOffset = NULL; + + PRINTMARK(); + ret = JSON_DecodeObject(decoder, PyString_AS_STRING(sarg), PyString_GET_SIZE(sarg)); + PRINTMARK(); + + if (decref) + { + Py_DECREF(sarg); + } + + if (PyErr_Occurred()) + { + return NULL; + } + + if (decoder->errorStr) + { + /*FIXME: It's possible to give a much nicer error 
message here with actual failing element in input etc*/ + PyErr_Format (PyExc_ValueError, "%s", decoder->errorStr); + Py_XDECREF( (PyObject *) ret); + Npy_releaseContext(pyDecoder.npyarr); + + return NULL; + } + + return ret; +} + +PyObject* JSONFileToObj(PyObject* self, PyObject *args, PyObject *kwargs) +{ + PyObject *file; + PyObject *read; + PyObject *string; + PyObject *result; + PyObject *argtuple; + + if (!PyArg_ParseTuple (args, "O", &file)) { + return NULL; + } + + if (!PyObject_HasAttrString (file, "read")) + { + PyErr_Format (PyExc_TypeError, "expected file"); + return NULL; + } + + read = PyObject_GetAttrString (file, "read"); + + if (!PyCallable_Check (read)) { + Py_XDECREF(read); + PyErr_Format (PyExc_TypeError, "expected file"); + return NULL; + } + + string = PyObject_CallObject (read, NULL); + Py_XDECREF(read); + + if (string == NULL) + { + return NULL; + } + + argtuple = PyTuple_Pack(1, string); + + result = JSONToObj (self, argtuple, kwargs); + Py_XDECREF(string); + Py_DECREF(argtuple); + + if (result == NULL) { + return NULL; + } + + return result; +} + diff --git a/pandas/src/ujson/python/objToJSON.c b/pandas/src/ujson/python/objToJSON.c new file mode 100644 index 0000000000000..ce8bdf3721f5e --- /dev/null +++ b/pandas/src/ujson/python/objToJSON.c @@ -0,0 +1,1701 @@ +#define PY_ARRAY_UNIQUE_SYMBOL UJSON_NUMPY + +#include "py_defines.h" +#include +#include +#include +#include +#include +#include + +#define NPY_JSON_BUFSIZE 32768 + +static PyObject* cls_dataframe; +static PyObject* cls_series; +static PyObject* cls_index; + +typedef void *(*PFN_PyTypeToJSON)(JSOBJ obj, JSONTypeContext *ti, void *outValue, size_t *_outLen); + + +#if (PY_VERSION_HEX < 0x02050000) +typedef ssize_t Py_ssize_t; +#endif + +typedef struct __NpyArrContext +{ + PyObject *array; + char* dataptr; + int was_datetime64; + int curdim; // current dimension in array's order + int stridedim; // dimension we are striding over + int inc; // stride dimension increment (+/- 1) + npy_intp dim; + npy_intp stride; + npy_intp ndim; + npy_intp index[NPY_MAXDIMS]; + PyArray_GetItemFunc* getitem; + + char** rowLabels; + char** columnLabels; +} NpyArrContext; + +typedef struct __TypeContext +{ + JSPFN_ITERBEGIN iterBegin; + JSPFN_ITEREND iterEnd; + JSPFN_ITERNEXT iterNext; + JSPFN_ITERGETNAME iterGetName; + JSPFN_ITERGETVALUE iterGetValue; + PFN_PyTypeToJSON PyTypeToJSON; + PyObject *newObj; + PyObject *dictObj; + Py_ssize_t index; + Py_ssize_t size; + PyObject *itemValue; + PyObject *itemName; + PyObject *attrList; + char *citemName; + + JSINT64 longValue; + + NpyArrContext *npyarr; + int transpose; + char** rowLabels; + char** columnLabels; + npy_intp rowLabelsLen; + npy_intp columnLabelsLen; + +} TypeContext; + +typedef struct __PyObjectEncoder +{ + JSONObjectEncoder enc; + + // pass through the NpyArrContext when encoding multi-dimensional arrays + NpyArrContext* npyCtxtPassthru; + + // output format style for pandas data types + int outputFormat; + int originalOutputFormat; +} PyObjectEncoder; + +#define GET_TC(__ptrtc) ((TypeContext *)((__ptrtc)->prv)) + +struct PyDictIterState +{ + PyObject *keys; + size_t i; + size_t sz; +}; + +enum PANDAS_FORMAT +{ + SPLIT, + RECORDS, + INDEX, + COLUMNS, + VALUES +}; + +//#define PRINTMARK() fprintf(stderr, "%s: MARK(%d)\n", __FILE__, __LINE__) +#define PRINTMARK() + +void initObjToJSON(void) +{ + PyObject *mod_frame; + PyDateTime_IMPORT; + + mod_frame = PyImport_ImportModule("pandas.core.frame"); + if (mod_frame) + { + cls_dataframe = PyObject_GetAttrString(mod_frame, 
"DataFrame"); + cls_index = PyObject_GetAttrString(mod_frame, "Index"); + cls_series = PyObject_GetAttrString(mod_frame, "Series"); + Py_DECREF(mod_frame); + } + + /* Initialise numpy API */ + import_array(); +} + +static void *PyIntToINT32(JSOBJ _obj, JSONTypeContext *tc, void *outValue, size_t *_outLen) +{ + PyObject *obj = (PyObject *) _obj; + *((JSINT32 *) outValue) = PyInt_AS_LONG (obj); + return NULL; +} + +static void *PyIntToINT64(JSOBJ _obj, JSONTypeContext *tc, void *outValue, size_t *_outLen) +{ + PyObject *obj = (PyObject *) _obj; + *((JSINT64 *) outValue) = PyInt_AS_LONG (obj); + return NULL; +} + +static void *PyLongToINT64(JSOBJ _obj, JSONTypeContext *tc, void *outValue, size_t *_outLen) +{ + *((JSINT64 *) outValue) = GET_TC(tc)->longValue; + return NULL; +} + +static void *NpyFloatToDOUBLE(JSOBJ _obj, JSONTypeContext *tc, void *outValue, size_t *_outLen) +{ + PyObject *obj = (PyObject *) _obj; + PyArray_CastScalarToCtype(obj, outValue, PyArray_DescrFromType(NPY_DOUBLE)); + return NULL; +} + +static void *PyFloatToDOUBLE(JSOBJ _obj, JSONTypeContext *tc, void *outValue, size_t *_outLen) +{ + PyObject *obj = (PyObject *) _obj; + *((double *) outValue) = PyFloat_AS_DOUBLE (obj); + return NULL; +} + +static void *PyStringToUTF8(JSOBJ _obj, JSONTypeContext *tc, void *outValue, size_t *_outLen) +{ + PyObject *obj = (PyObject *) _obj; + *_outLen = PyString_GET_SIZE(obj); + return PyString_AS_STRING(obj); +} + +static void *PyUnicodeToUTF8(JSOBJ _obj, JSONTypeContext *tc, void *outValue, size_t *_outLen) +{ + PyObject *obj = (PyObject *) _obj; + PyObject *newObj = PyUnicode_AsUTF8String (obj); + + GET_TC(tc)->newObj = newObj; + + *_outLen = PyString_GET_SIZE(newObj); + return PyString_AS_STRING(newObj); +} + +static void *NpyDateTimeToINT64(JSOBJ _obj, JSONTypeContext *tc, void *outValue, size_t *_outLen) +{ + PyObject *obj = (PyObject *) _obj; + PyArray_CastScalarToCtype(obj, outValue, PyArray_DescrFromType(NPY_DATETIME)); + return NULL; +} + +static void *PyDateTimeToINT64(JSOBJ _obj, JSONTypeContext *tc, void *outValue, size_t *_outLen) +{ + pandas_datetimestruct dts; + PyObject *obj = (PyObject *) _obj; + + dts.year = PyDateTime_GET_YEAR(obj); + dts.month = PyDateTime_GET_MONTH(obj); + dts.day = PyDateTime_GET_DAY(obj); + dts.hour = PyDateTime_DATE_GET_HOUR(obj); + dts.min = PyDateTime_DATE_GET_MINUTE(obj); + dts.sec = PyDateTime_DATE_GET_SECOND(obj); + dts.us = PyDateTime_DATE_GET_MICROSECOND(obj); + dts.ps = dts.as = 0; + *((JSINT64*)outValue) = (JSINT64) pandas_datetimestruct_to_datetime(PANDAS_FR_ns, &dts); + return NULL; +} + +static void *PyDateToINT64(JSOBJ _obj, JSONTypeContext *tc, void *outValue, size_t *_outLen) +{ + pandas_datetimestruct dts; + PyObject *obj = (PyObject *) _obj; + + dts.year = PyDateTime_GET_YEAR(obj); + dts.month = PyDateTime_GET_MONTH(obj); + dts.day = PyDateTime_GET_DAY(obj); + dts.hour = dts.min = dts.sec = dts.ps = dts.as = 0; + *((JSINT64*)outValue) = (JSINT64) pandas_datetimestruct_to_datetime(PANDAS_FR_ns, &dts); + return NULL; +} + +//============================================================================= +// Numpy array iteration functions +//============================================================================= +int NpyArr_iterNextNone(JSOBJ _obj, JSONTypeContext *tc) +{ + return 0; +} + +void NpyArr_iterBegin(JSOBJ _obj, JSONTypeContext *tc) +{ + PyArrayObject *obj; + PyArray_Descr *dtype; + NpyArrContext *npyarr; + + if (GET_TC(tc)->newObj) + { + obj = (PyArrayObject *) GET_TC(tc)->newObj; + } + else + { + obj = 
(PyArrayObject *) _obj; + } + + if (PyArray_SIZE(obj) > 0) + { + PRINTMARK(); + npyarr = PyObject_Malloc(sizeof(NpyArrContext)); + GET_TC(tc)->npyarr = npyarr; + + if (!npyarr) + { + PyErr_NoMemory(); + GET_TC(tc)->iterNext = NpyArr_iterNextNone; + return; + } + + // uber hack to support datetime64[ns] arrays + if (PyArray_DESCR(obj)->type_num == NPY_DATETIME) { + npyarr->was_datetime64 = 1; + dtype = PyArray_DescrFromType(NPY_INT64); + obj = (PyArrayObject *) PyArray_CastToType(obj, dtype, 0); + } else { + npyarr->was_datetime64 = 0; + } + + npyarr->array = (PyObject*) obj; + npyarr->getitem = (PyArray_GetItemFunc*) PyArray_DESCR(obj)->f->getitem; + npyarr->dataptr = PyArray_DATA(obj); + npyarr->ndim = PyArray_NDIM(obj) - 1; + npyarr->curdim = 0; + + if (GET_TC(tc)->transpose) + { + npyarr->dim = PyArray_DIM(obj, npyarr->ndim); + npyarr->stride = PyArray_STRIDE(obj, npyarr->ndim); + npyarr->stridedim = npyarr->ndim; + npyarr->index[npyarr->ndim] = 0; + npyarr->inc = -1; + } + else + { + npyarr->dim = PyArray_DIM(obj, 0); + npyarr->stride = PyArray_STRIDE(obj, 0); + npyarr->stridedim = 0; + npyarr->index[0] = 0; + npyarr->inc = 1; + } + + npyarr->columnLabels = GET_TC(tc)->columnLabels; + npyarr->rowLabels = GET_TC(tc)->rowLabels; + } + else + { + GET_TC(tc)->iterNext = NpyArr_iterNextNone; + } + PRINTMARK(); +} + +void NpyArr_iterEnd(JSOBJ obj, JSONTypeContext *tc) +{ + NpyArrContext *npyarr = GET_TC(tc)->npyarr; + + if (npyarr) + { + if (npyarr->was_datetime64) { + Py_XDECREF(npyarr->array); + } + + if (GET_TC(tc)->itemValue != npyarr->array) + { + Py_XDECREF(GET_TC(tc)->itemValue); + } + GET_TC(tc)->itemValue = NULL; + + PyObject_Free(npyarr); + } + PRINTMARK(); +} + +void NpyArrPassThru_iterBegin(JSOBJ obj, JSONTypeContext *tc) +{ + PRINTMARK(); +} + +void NpyArrPassThru_iterEnd(JSOBJ obj, JSONTypeContext *tc) +{ + NpyArrContext* npyarr; + PRINTMARK(); + // finished this dimension, reset the data pointer + npyarr = GET_TC(tc)->npyarr; + npyarr->curdim--; + npyarr->dataptr -= npyarr->stride * npyarr->index[npyarr->stridedim]; + npyarr->stridedim -= npyarr->inc; + npyarr->dim = PyArray_DIM(npyarr->array, npyarr->stridedim); + npyarr->stride = PyArray_STRIDE(npyarr->array, npyarr->stridedim); + npyarr->dataptr += npyarr->stride; + + if (GET_TC(tc)->itemValue != npyarr->array) + { + Py_XDECREF(GET_TC(tc)->itemValue); + GET_TC(tc)->itemValue = NULL; + } +} + +int NpyArr_iterNextItem(JSOBJ _obj, JSONTypeContext *tc) +{ + NpyArrContext* npyarr; + PRINTMARK(); + npyarr = GET_TC(tc)->npyarr; + + if (GET_TC(tc)->itemValue != npyarr->array) + { + Py_XDECREF(GET_TC(tc)->itemValue); + GET_TC(tc)->itemValue = NULL; + } + + if (npyarr->index[npyarr->stridedim] >= npyarr->dim) + { + return 0; + } + + GET_TC(tc)->itemValue = npyarr->getitem(npyarr->dataptr, npyarr->array); + + npyarr->dataptr += npyarr->stride; + npyarr->index[npyarr->stridedim]++; + return 1; +} + +int NpyArr_iterNext(JSOBJ _obj, JSONTypeContext *tc) +{ + NpyArrContext* npyarr; + PRINTMARK(); + npyarr = GET_TC(tc)->npyarr; + + if (npyarr->curdim >= npyarr->ndim || npyarr->index[npyarr->stridedim] >= npyarr->dim) + { + // innermost dimension, start retrieving item values + GET_TC(tc)->iterNext = NpyArr_iterNextItem; + return NpyArr_iterNextItem(_obj, tc); + } + + // dig a dimension deeper + npyarr->index[npyarr->stridedim]++; + + npyarr->curdim++; + npyarr->stridedim += npyarr->inc; + npyarr->dim = PyArray_DIM(npyarr->array, npyarr->stridedim); + npyarr->stride = PyArray_STRIDE(npyarr->array, npyarr->stridedim); + 
npyarr->index[npyarr->stridedim] = 0; + + ((PyObjectEncoder*) tc->encoder)->npyCtxtPassthru = npyarr; + GET_TC(tc)->itemValue = npyarr->array; + return 1; +} + +JSOBJ NpyArr_iterGetValue(JSOBJ obj, JSONTypeContext *tc) +{ + PRINTMARK(); + return GET_TC(tc)->itemValue; +} + +char *NpyArr_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) +{ + NpyArrContext* npyarr; + npy_intp idx; + PRINTMARK(); + npyarr = GET_TC(tc)->npyarr; + if (GET_TC(tc)->iterNext == NpyArr_iterNextItem) + { + idx = npyarr->index[npyarr->stridedim] - 1; + *outLen = strlen(npyarr->columnLabels[idx]); + return npyarr->columnLabels[idx]; + } + else + { + idx = npyarr->index[npyarr->stridedim - npyarr->inc] - 1; + *outLen = strlen(npyarr->rowLabels[idx]); + return npyarr->rowLabels[idx]; + } +} + +//============================================================================= +// Tuple iteration functions +// itemValue is borrowed reference, no ref counting +//============================================================================= +void Tuple_iterBegin(JSOBJ obj, JSONTypeContext *tc) +{ + GET_TC(tc)->index = 0; + GET_TC(tc)->size = PyTuple_GET_SIZE( (PyObject *) obj); + GET_TC(tc)->itemValue = NULL; +} + +int Tuple_iterNext(JSOBJ obj, JSONTypeContext *tc) +{ + PyObject *item; + + if (GET_TC(tc)->index >= GET_TC(tc)->size) + { + return 0; + } + + item = PyTuple_GET_ITEM (obj, GET_TC(tc)->index); + + GET_TC(tc)->itemValue = item; + GET_TC(tc)->index ++; + return 1; +} + +void Tuple_iterEnd(JSOBJ obj, JSONTypeContext *tc) +{ +} + +JSOBJ Tuple_iterGetValue(JSOBJ obj, JSONTypeContext *tc) +{ + return GET_TC(tc)->itemValue; +} + +char *Tuple_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) +{ + return NULL; +} + +//============================================================================= +// Dir iteration functions +// itemName ref is borrowed from PyObject_Dir (attrList). No refcount +// itemValue ref is from PyObject_GetAttr. 
Ref counted +//============================================================================= +void Dir_iterBegin(JSOBJ obj, JSONTypeContext *tc) +{ + GET_TC(tc)->attrList = PyObject_Dir(obj); + GET_TC(tc)->index = 0; + GET_TC(tc)->size = PyList_GET_SIZE(GET_TC(tc)->attrList); + PRINTMARK(); +} + +void Dir_iterEnd(JSOBJ obj, JSONTypeContext *tc) +{ + if (GET_TC(tc)->itemValue) + { + Py_DECREF(GET_TC(tc)->itemValue); + GET_TC(tc)->itemValue = NULL; + } + + if (GET_TC(tc)->itemName) + { + Py_DECREF(GET_TC(tc)->itemName); + GET_TC(tc)->itemName = NULL; + } + + Py_DECREF( (PyObject *) GET_TC(tc)->attrList); + PRINTMARK(); +} + +int Dir_iterNext(JSOBJ _obj, JSONTypeContext *tc) +{ + PyObject *obj = (PyObject *) _obj; + PyObject *itemValue = GET_TC(tc)->itemValue; + PyObject *itemName = GET_TC(tc)->itemName; + PyObject* attr; + PyObject* attrName; + char* attrStr; + + + if (itemValue) + { + Py_DECREF(GET_TC(tc)->itemValue); + GET_TC(tc)->itemValue = itemValue = NULL; + } + + if (itemName) + { + Py_DECREF(GET_TC(tc)->itemName); + GET_TC(tc)->itemName = itemName = NULL; + } + + for (; GET_TC(tc)->index < GET_TC(tc)->size; GET_TC(tc)->index ++) + { + attrName = PyList_GET_ITEM(GET_TC(tc)->attrList, GET_TC(tc)->index); +#if PY_MAJOR_VERSION >= 3 + attr = PyUnicode_AsUTF8String(attrName); +#else + attr = attrName; + Py_INCREF(attr); +#endif + attrStr = PyString_AS_STRING(attr); + + if (attrStr[0] == '_') + { + PRINTMARK(); + Py_DECREF(attr); + continue; + } + + itemValue = PyObject_GetAttr(obj, attrName); + if (itemValue == NULL) + { + PyErr_Clear(); + Py_DECREF(attr); + PRINTMARK(); + continue; + } + + if (PyCallable_Check(itemValue)) + { + Py_DECREF(itemValue); + Py_DECREF(attr); + PRINTMARK(); + continue; + } + + PRINTMARK(); + itemName = attr; + break; + } + + if (itemName == NULL) + { + GET_TC(tc)->index = GET_TC(tc)->size; + GET_TC(tc)->itemValue = NULL; + return 0; + } + + GET_TC(tc)->itemName = itemName; + GET_TC(tc)->itemValue = itemValue; + GET_TC(tc)->index ++; + + PRINTMARK(); + return 1; +} + + + +JSOBJ Dir_iterGetValue(JSOBJ obj, JSONTypeContext *tc) +{ + PRINTMARK(); + return GET_TC(tc)->itemValue; +} + +char *Dir_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) +{ + PRINTMARK(); + *outLen = PyString_GET_SIZE(GET_TC(tc)->itemName); + return PyString_AS_STRING(GET_TC(tc)->itemName); +} + + + + +//============================================================================= +// List iteration functions +// itemValue is borrowed from object (which is list). 
No refcounting +//============================================================================= +void List_iterBegin(JSOBJ obj, JSONTypeContext *tc) +{ + GET_TC(tc)->index = 0; + GET_TC(tc)->size = PyList_GET_SIZE( (PyObject *) obj); +} + +int List_iterNext(JSOBJ obj, JSONTypeContext *tc) +{ + if (GET_TC(tc)->index >= GET_TC(tc)->size) + { + PRINTMARK(); + return 0; + } + + GET_TC(tc)->itemValue = PyList_GET_ITEM (obj, GET_TC(tc)->index); + GET_TC(tc)->index ++; + return 1; +} + +void List_iterEnd(JSOBJ obj, JSONTypeContext *tc) +{ +} + +JSOBJ List_iterGetValue(JSOBJ obj, JSONTypeContext *tc) +{ + return GET_TC(tc)->itemValue; +} + +char *List_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) +{ + return NULL; +} + +//============================================================================= +// pandas Index iteration functions +//============================================================================= +void Index_iterBegin(JSOBJ obj, JSONTypeContext *tc) +{ + GET_TC(tc)->index = 0; + GET_TC(tc)->citemName = PyObject_Malloc(20 * sizeof(char)); + if (!GET_TC(tc)->citemName) + { + PyErr_NoMemory(); + } + PRINTMARK(); +} + +int Index_iterNext(JSOBJ obj, JSONTypeContext *tc) +{ + Py_ssize_t index; + if (!GET_TC(tc)->citemName) + { + return 0; + } + + index = GET_TC(tc)->index; + Py_XDECREF(GET_TC(tc)->itemValue); + if (index == 0) + { + memcpy(GET_TC(tc)->citemName, "name", sizeof(char)*5); + GET_TC(tc)->itemValue = PyObject_GetAttrString(obj, "name"); + } + else + if (index == 1) + { + memcpy(GET_TC(tc)->citemName, "data", sizeof(char)*5); + GET_TC(tc)->itemValue = PyObject_GetAttrString(obj, "values"); + } + else + { + PRINTMARK(); + return 0; + } + + GET_TC(tc)->index++; + PRINTMARK(); + return 1; +} + +void Index_iterEnd(JSOBJ obj, JSONTypeContext *tc) +{ + if (GET_TC(tc)->citemName) + { + PyObject_Free(GET_TC(tc)->citemName); + } + PRINTMARK(); +} + +JSOBJ Index_iterGetValue(JSOBJ obj, JSONTypeContext *tc) +{ + return GET_TC(tc)->itemValue; +} + +char *Index_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) +{ + *outLen = strlen(GET_TC(tc)->citemName); + return GET_TC(tc)->citemName; +} + +//============================================================================= +// pandas Series iteration functions +//============================================================================= +void Series_iterBegin(JSOBJ obj, JSONTypeContext *tc) +{ + PyObjectEncoder* enc = (PyObjectEncoder*) tc->encoder; + GET_TC(tc)->index = 0; + GET_TC(tc)->citemName = PyObject_Malloc(20 * sizeof(char)); + enc->outputFormat = VALUES; // for contained series + if (!GET_TC(tc)->citemName) + { + PyErr_NoMemory(); + } + PRINTMARK(); +} + +int Series_iterNext(JSOBJ obj, JSONTypeContext *tc) +{ + Py_ssize_t index; + if (!GET_TC(tc)->citemName) + { + return 0; + } + + index = GET_TC(tc)->index; + Py_XDECREF(GET_TC(tc)->itemValue); + if (index == 0) + { + memcpy(GET_TC(tc)->citemName, "name", sizeof(char)*5); + GET_TC(tc)->itemValue = PyObject_GetAttrString(obj, "name"); + } + else + if (index == 1) + { + memcpy(GET_TC(tc)->citemName, "index", sizeof(char)*6); + GET_TC(tc)->itemValue = PyObject_GetAttrString(obj, "index"); + } + else + if (index == 2) + { + memcpy(GET_TC(tc)->citemName, "data", sizeof(char)*5); + GET_TC(tc)->itemValue = PyObject_GetAttrString(obj, "values"); + } + else + { + PRINTMARK(); + return 0; + } + + GET_TC(tc)->index++; + PRINTMARK(); + return 1; +} + +void Series_iterEnd(JSOBJ obj, JSONTypeContext *tc) +{ + PyObjectEncoder* enc = (PyObjectEncoder*) tc->encoder; + 
enc->outputFormat = enc->originalOutputFormat; + if (GET_TC(tc)->citemName) + { + PyObject_Free(GET_TC(tc)->citemName); + } + PRINTMARK(); +} + +JSOBJ Series_iterGetValue(JSOBJ obj, JSONTypeContext *tc) +{ + return GET_TC(tc)->itemValue; +} + +char *Series_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) +{ + *outLen = strlen(GET_TC(tc)->citemName); + return GET_TC(tc)->citemName; +} + +//============================================================================= +// pandas DataFrame iteration functions +//============================================================================= +void DataFrame_iterBegin(JSOBJ obj, JSONTypeContext *tc) +{ + PyObjectEncoder* enc = (PyObjectEncoder*) tc->encoder; + GET_TC(tc)->index = 0; + GET_TC(tc)->citemName = PyObject_Malloc(20 * sizeof(char)); + enc->outputFormat = VALUES; // for contained series & index + if (!GET_TC(tc)->citemName) + { + PyErr_NoMemory(); + } + PRINTMARK(); +} + +int DataFrame_iterNext(JSOBJ obj, JSONTypeContext *tc) +{ + Py_ssize_t index; + if (!GET_TC(tc)->citemName) + { + return 0; + } + + index = GET_TC(tc)->index; + Py_XDECREF(GET_TC(tc)->itemValue); + if (index == 0) + { + memcpy(GET_TC(tc)->citemName, "columns", sizeof(char)*8); + GET_TC(tc)->itemValue = PyObject_GetAttrString(obj, "columns"); + } + else + if (index == 1) + { + memcpy(GET_TC(tc)->citemName, "index", sizeof(char)*6); + GET_TC(tc)->itemValue = PyObject_GetAttrString(obj, "index"); + } + else + if (index == 2) + { + memcpy(GET_TC(tc)->citemName, "data", sizeof(char)*5); + GET_TC(tc)->itemValue = PyObject_GetAttrString(obj, "values"); + } + else + { + PRINTMARK(); + return 0; + } + + GET_TC(tc)->index++; + PRINTMARK(); + return 1; +} + +void DataFrame_iterEnd(JSOBJ obj, JSONTypeContext *tc) +{ + PyObjectEncoder* enc = (PyObjectEncoder*) tc->encoder; + enc->outputFormat = enc->originalOutputFormat; + if (GET_TC(tc)->citemName) + { + PyObject_Free(GET_TC(tc)->citemName); + } + PRINTMARK(); +} + +JSOBJ DataFrame_iterGetValue(JSOBJ obj, JSONTypeContext *tc) +{ + return GET_TC(tc)->itemValue; +} + +char *DataFrame_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) +{ + *outLen = strlen(GET_TC(tc)->citemName); + return GET_TC(tc)->citemName; +} + +//============================================================================= +// Dict iteration functions +// itemName might converted to string (Python_Str). Do refCounting +// itemValue is borrowed from object (which is dict). 
No refCounting +//============================================================================= +void Dict_iterBegin(JSOBJ obj, JSONTypeContext *tc) +{ + GET_TC(tc)->index = 0; + PRINTMARK(); +} + +int Dict_iterNext(JSOBJ obj, JSONTypeContext *tc) +{ +#if PY_MAJOR_VERSION >= 3 + PyObject* itemNameTmp; +#endif + + if (GET_TC(tc)->itemName) + { + Py_DECREF(GET_TC(tc)->itemName); + GET_TC(tc)->itemName = NULL; + } + + + if (!PyDict_Next ( (PyObject *)GET_TC(tc)->dictObj, &GET_TC(tc)->index, &GET_TC(tc)->itemName, &GET_TC(tc)->itemValue)) + { + PRINTMARK(); + return 0; + } + + if (PyUnicode_Check(GET_TC(tc)->itemName)) + { + GET_TC(tc)->itemName = PyUnicode_AsUTF8String (GET_TC(tc)->itemName); + } + else + if (!PyString_Check(GET_TC(tc)->itemName)) + { + GET_TC(tc)->itemName = PyObject_Str(GET_TC(tc)->itemName); +#if PY_MAJOR_VERSION >= 3 + itemNameTmp = GET_TC(tc)->itemName; + GET_TC(tc)->itemName = PyUnicode_AsUTF8String (GET_TC(tc)->itemName); + Py_DECREF(itemNameTmp); +#endif + } + else + { + Py_INCREF(GET_TC(tc)->itemName); + } + PRINTMARK(); + return 1; +} + +void Dict_iterEnd(JSOBJ obj, JSONTypeContext *tc) +{ + if (GET_TC(tc)->itemName) + { + Py_DECREF(GET_TC(tc)->itemName); + GET_TC(tc)->itemName = NULL; + } + Py_DECREF(GET_TC(tc)->dictObj); + PRINTMARK(); +} + +JSOBJ Dict_iterGetValue(JSOBJ obj, JSONTypeContext *tc) +{ + return GET_TC(tc)->itemValue; +} + +char *Dict_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) +{ + *outLen = PyString_GET_SIZE(GET_TC(tc)->itemName); + return PyString_AS_STRING(GET_TC(tc)->itemName); +} + +void NpyArr_freeLabels(char** labels, npy_intp len) +{ + npy_intp i; + + if (labels) + { + for (i = 0; i < len; i++) + { + PyObject_Free(labels[i]); + } + PyObject_Free(labels); + } +} + +char** NpyArr_encodeLabels(PyArrayObject* labels, JSONObjectEncoder* enc, npy_intp num) +{ + // NOTE this function steals a reference to labels. 
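NpyArr_encodeLabels, whose body follows, renders each index or column value with JSON_EncodeObject and strips the surrounding quotes so the results can serve as JSON object keys for the labelled orients. A rough sketch of the visible effect, assuming the extension is importable as pandas.json (exact number formatting and key order are whatever the encoder above produces):

    from pandas import DataFrame
    from pandas.json import dumps

    df = DataFrame([[1, 2], [3, 4]], index=['r0', 'r1'], columns=['a', 'b'])

    # orient='index': rowLabels are encoded from df.index, columnLabels from df.columns
    dumps(df, orient='index')    # roughly '{"r0":{"a":1,"b":2},"r1":{"a":3,"b":4}}'

    # the default 'columns' orient swaps the two label arrays and sets transpose
    dumps(df)                    # roughly '{"a":{"r0":1,"r1":3},"b":{"r0":2,"r1":4}}'
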
+ PyArray_Descr *dtype = NULL; + PyArrayObject* labelsTmp = NULL; + PyObject* item = NULL; + npy_intp i, stride, len; + // npy_intp bufsize = 32768; + char** ret; + char *dataptr, *cLabel, *origend, *origst, *origoffset; + char labelBuffer[NPY_JSON_BUFSIZE]; + PyArray_GetItemFunc* getitem; + PRINTMARK(); + + if (PyArray_SIZE(labels) < num) + { + PyErr_SetString(PyExc_ValueError, "Label array sizes do not match corresponding data shape"); + Py_DECREF(labels); + return 0; + } + + ret = PyObject_Malloc(sizeof(char*)*num); + if (!ret) + { + PyErr_NoMemory(); + Py_DECREF(labels); + return 0; + } + + for (i = 0; i < num; i++) + { + ret[i] = NULL; + } + + origst = enc->start; + origend = enc->end; + origoffset = enc->offset; + + if (PyArray_DESCR(labels)->type_num == NPY_DATETIME) { + dtype = PyArray_DescrFromType(NPY_INT64); + labelsTmp = labels; + labels = (PyArrayObject *) PyArray_CastToType(labels, dtype, 0); + Py_DECREF(labelsTmp); + } + + stride = PyArray_STRIDE(labels, 0); + dataptr = PyArray_DATA(labels); + getitem = PyArray_DESCR(labels)->f->getitem; + + for (i = 0; i < num; i++) + { + item = getitem(dataptr, labels); + if (!item) + { + NpyArr_freeLabels(ret, num); + ret = 0; + break; + } + + cLabel = JSON_EncodeObject(item, enc, labelBuffer, NPY_JSON_BUFSIZE); + Py_DECREF(item); + + if (PyErr_Occurred() || enc->errorMsg) + { + NpyArr_freeLabels(ret, num); + ret = 0; + break; + } + + // trim off any quotes surrounding the result + if (*cLabel == '\"') + { + cLabel++; + enc->offset -= 2; + *(enc->offset) = '\0'; + } + + len = enc->offset - cLabel + 1; + ret[i] = PyObject_Malloc(sizeof(char)*len); + + if (!ret[i]) + { + PyErr_NoMemory(); + ret = 0; + break; + } + + memcpy(ret[i], cLabel, sizeof(char)*len); + dataptr += stride; + } + + enc->start = origst; + enc->end = origend; + enc->offset = origoffset; + + Py_DECREF(labels); + return ret; +} + +void Object_beginTypeContext (JSOBJ _obj, JSONTypeContext *tc) +{ + PyObject *obj, *exc, *toDictFunc; + TypeContext *pc; + PyObjectEncoder *enc; + double val; + PRINTMARK(); + if (!_obj) { + tc->type = JT_INVALID; + return; + } + + obj = (PyObject*) _obj; + enc = (PyObjectEncoder*) tc->encoder; + + tc->prv = PyObject_Malloc(sizeof(TypeContext)); + pc = (TypeContext *) tc->prv; + if (!pc) + { + tc->type = JT_INVALID; + PyErr_NoMemory(); + return; + } + pc->newObj = NULL; + pc->dictObj = NULL; + pc->itemValue = NULL; + pc->itemName = NULL; + pc->attrList = NULL; + pc->citemName = NULL; + pc->npyarr = NULL; + pc->rowLabels = NULL; + pc->columnLabels = NULL; + pc->index = 0; + pc->size = 0; + pc->longValue = 0; + pc->transpose = 0; + pc->rowLabelsLen = 0; + pc->columnLabelsLen = 0; + + if (PyIter_Check(obj) || PyArray_Check(obj)) + { + goto ISITERABLE; + } + + if (PyBool_Check(obj)) + { + PRINTMARK(); + tc->type = (obj == Py_True) ? 
JT_TRUE : JT_FALSE; + return; + } + else + if (PyLong_Check(obj)) + { + PRINTMARK(); + pc->PyTypeToJSON = PyLongToINT64; + tc->type = JT_LONG; + GET_TC(tc)->longValue = PyLong_AsLongLong(obj); + + exc = PyErr_Occurred(); + + if (exc && PyErr_ExceptionMatches(PyExc_OverflowError)) + { + PRINTMARK(); + goto INVALID; + } + + return; + } + else + if (PyInt_Check(obj)) + { + PRINTMARK(); +#ifdef _LP64 + pc->PyTypeToJSON = PyIntToINT64; tc->type = JT_LONG; +#else + pc->PyTypeToJSON = PyIntToINT32; tc->type = JT_INT; +#endif + return; + } + else + if (PyArray_IsScalar(obj, Integer)) + { + PRINTMARK(); + pc->PyTypeToJSON = PyLongToINT64; + tc->type = JT_LONG; + PyArray_CastScalarToCtype(obj, &(GET_TC(tc)->longValue), PyArray_DescrFromType(NPY_INT64)); + + exc = PyErr_Occurred(); + + if (exc && PyErr_ExceptionMatches(PyExc_OverflowError)) + { + PRINTMARK(); + goto INVALID; + } + + return; + } + else + if (PyString_Check(obj)) + { + PRINTMARK(); + pc->PyTypeToJSON = PyStringToUTF8; tc->type = JT_UTF8; + return; + } + else + if (PyUnicode_Check(obj)) + { + PRINTMARK(); + pc->PyTypeToJSON = PyUnicodeToUTF8; tc->type = JT_UTF8; + return; + } + else + if (PyFloat_Check(obj)) + { + PRINTMARK(); + val = PyFloat_AS_DOUBLE (obj); + if (npy_isnan(val) || npy_isinf(val)) + { + tc->type = JT_NULL; + } + else + { + pc->PyTypeToJSON = PyFloatToDOUBLE; tc->type = JT_DOUBLE; + } + return; + } + else + if (PyArray_IsScalar(obj, Float)) + { + PRINTMARK(); + pc->PyTypeToJSON = NpyFloatToDOUBLE; tc->type = JT_DOUBLE; + return; + } + else + if (PyArray_IsScalar(obj, Datetime)) + { + PRINTMARK(); + pc->PyTypeToJSON = NpyDateTimeToINT64; tc->type = JT_LONG; + return; + } + else + if (PyDateTime_Check(obj)) + { + PRINTMARK(); + pc->PyTypeToJSON = PyDateTimeToINT64; tc->type = JT_LONG; + return; + } + else + if (PyDate_Check(obj)) + { + PRINTMARK(); + pc->PyTypeToJSON = PyDateToINT64; tc->type = JT_LONG; + return; + } + else + if (obj == Py_None) + { + PRINTMARK(); + tc->type = JT_NULL; + return; + } + + +ISITERABLE: + + if (PyDict_Check(obj)) + { + PRINTMARK(); + tc->type = JT_OBJECT; + pc->iterBegin = Dict_iterBegin; + pc->iterEnd = Dict_iterEnd; + pc->iterNext = Dict_iterNext; + pc->iterGetValue = Dict_iterGetValue; + pc->iterGetName = Dict_iterGetName; + pc->dictObj = obj; + Py_INCREF(obj); + + return; + } + else + if (PyList_Check(obj)) + { + PRINTMARK(); + tc->type = JT_ARRAY; + pc->iterBegin = List_iterBegin; + pc->iterEnd = List_iterEnd; + pc->iterNext = List_iterNext; + pc->iterGetValue = List_iterGetValue; + pc->iterGetName = List_iterGetName; + return; + } + else + if (PyTuple_Check(obj)) + { + PRINTMARK(); + tc->type = JT_ARRAY; + pc->iterBegin = Tuple_iterBegin; + pc->iterEnd = Tuple_iterEnd; + pc->iterNext = Tuple_iterNext; + pc->iterGetValue = Tuple_iterGetValue; + pc->iterGetName = Tuple_iterGetName; + return; + } + else + if (PyObject_TypeCheck(obj, (PyTypeObject*) cls_index)) + { + if (enc->outputFormat == SPLIT) + { + PRINTMARK(); + tc->type = JT_OBJECT; + pc->iterBegin = Index_iterBegin; + pc->iterEnd = Index_iterEnd; + pc->iterNext = Index_iterNext; + pc->iterGetValue = Index_iterGetValue; + pc->iterGetName = Index_iterGetName; + return; + } + + PRINTMARK(); + tc->type = JT_ARRAY; + pc->newObj = PyObject_GetAttrString(obj, "values"); + pc->iterBegin = NpyArr_iterBegin; + pc->iterEnd = NpyArr_iterEnd; + pc->iterNext = NpyArr_iterNext; + pc->iterGetValue = NpyArr_iterGetValue; + pc->iterGetName = NpyArr_iterGetName; + return; + } + else + if (PyObject_TypeCheck(obj, (PyTypeObject*) cls_series)) + { + if 
(enc->outputFormat == SPLIT) + { + PRINTMARK(); + tc->type = JT_OBJECT; + pc->iterBegin = Series_iterBegin; + pc->iterEnd = Series_iterEnd; + pc->iterNext = Series_iterNext; + pc->iterGetValue = Series_iterGetValue; + pc->iterGetName = Series_iterGetName; + return; + } + + if (enc->outputFormat == INDEX || enc->outputFormat == COLUMNS) + { + PRINTMARK(); + tc->type = JT_OBJECT; + pc->columnLabelsLen = PyArray_SIZE(obj); + pc->columnLabels = NpyArr_encodeLabels((PyArrayObject*) PyObject_GetAttrString(obj, "index"), (JSONObjectEncoder*) enc, pc->columnLabelsLen); + if (!pc->columnLabels) + { + goto INVALID; + } + } + else + { + PRINTMARK(); + tc->type = JT_ARRAY; + } + pc->newObj = PyObject_GetAttrString(obj, "values"); + pc->iterBegin = NpyArr_iterBegin; + pc->iterEnd = NpyArr_iterEnd; + pc->iterNext = NpyArr_iterNext; + pc->iterGetValue = NpyArr_iterGetValue; + pc->iterGetName = NpyArr_iterGetName; + return; + } + else + if (PyArray_Check(obj)) + { + if (enc->npyCtxtPassthru) + { + PRINTMARK(); + pc->npyarr = enc->npyCtxtPassthru; + tc->type = (pc->npyarr->columnLabels ? JT_OBJECT : JT_ARRAY); + pc->iterBegin = NpyArrPassThru_iterBegin; + pc->iterEnd = NpyArrPassThru_iterEnd; + pc->iterNext = NpyArr_iterNext; + pc->iterGetValue = NpyArr_iterGetValue; + pc->iterGetName = NpyArr_iterGetName; + enc->npyCtxtPassthru = NULL; + return; + } + + PRINTMARK(); + tc->type = JT_ARRAY; + pc->iterBegin = NpyArr_iterBegin; + pc->iterEnd = NpyArr_iterEnd; + pc->iterNext = NpyArr_iterNext; + pc->iterGetValue = NpyArr_iterGetValue; + pc->iterGetName = NpyArr_iterGetName; + return; + } + else + if (PyObject_TypeCheck(obj, (PyTypeObject*) cls_dataframe)) + { + if (enc->outputFormat == SPLIT) + { + PRINTMARK(); + tc->type = JT_OBJECT; + pc->iterBegin = DataFrame_iterBegin; + pc->iterEnd = DataFrame_iterEnd; + pc->iterNext = DataFrame_iterNext; + pc->iterGetValue = DataFrame_iterGetValue; + pc->iterGetName = DataFrame_iterGetName; + return; + } + + PRINTMARK(); + pc->newObj = PyObject_GetAttrString(obj, "values"); + pc->iterBegin = NpyArr_iterBegin; + pc->iterEnd = NpyArr_iterEnd; + pc->iterNext = NpyArr_iterNext; + pc->iterGetValue = NpyArr_iterGetValue; + pc->iterGetName = NpyArr_iterGetName; + if (enc->outputFormat == VALUES) + { + PRINTMARK(); + tc->type = JT_ARRAY; + } + else + if (enc->outputFormat == RECORDS) + { + PRINTMARK(); + tc->type = JT_ARRAY; + pc->columnLabelsLen = PyArray_DIM(pc->newObj, 1); + pc->columnLabels = NpyArr_encodeLabels((PyArrayObject*) PyObject_GetAttrString(obj, "columns"), (JSONObjectEncoder*) enc, pc->columnLabelsLen); + if (!pc->columnLabels) + { + goto INVALID; + } + } + else + if (enc->outputFormat == INDEX) + { + PRINTMARK(); + tc->type = JT_OBJECT; + pc->rowLabelsLen = PyArray_DIM(pc->newObj, 0); + pc->rowLabels = NpyArr_encodeLabels((PyArrayObject*) PyObject_GetAttrString(obj, "index"), (JSONObjectEncoder*) enc, pc->rowLabelsLen); + if (!pc->rowLabels) + { + goto INVALID; + } + pc->columnLabelsLen = PyArray_DIM(pc->newObj, 1); + pc->columnLabels = NpyArr_encodeLabels((PyArrayObject*) PyObject_GetAttrString(obj, "columns"), (JSONObjectEncoder*) enc, pc->columnLabelsLen); + if (!pc->columnLabels) + { + NpyArr_freeLabels(pc->rowLabels, pc->rowLabelsLen); + pc->rowLabels = NULL; + goto INVALID; + } + } + else + { + PRINTMARK(); + tc->type = JT_OBJECT; + pc->rowLabelsLen = PyArray_DIM(pc->newObj, 1); + pc->rowLabels = NpyArr_encodeLabels((PyArrayObject*) PyObject_GetAttrString(obj, "columns"), (JSONObjectEncoder*) enc, pc->rowLabelsLen); + if (!pc->rowLabels) + { + goto 
INVALID; + } + pc->columnLabelsLen = PyArray_DIM(pc->newObj, 0); + pc->columnLabels = NpyArr_encodeLabels((PyArrayObject*) PyObject_GetAttrString(obj, "index"), (JSONObjectEncoder*) enc, pc->columnLabelsLen); + if (!pc->columnLabels) + { + NpyArr_freeLabels(pc->rowLabels, pc->rowLabelsLen); + pc->rowLabels = NULL; + goto INVALID; + } + pc->transpose = 1; + } + return; + } + + + toDictFunc = PyObject_GetAttrString(obj, "toDict"); + + if (toDictFunc) + { + PyObject* tuple = PyTuple_New(0); + PyObject* toDictResult = PyObject_Call(toDictFunc, tuple, NULL); + Py_DECREF(tuple); + Py_DECREF(toDictFunc); + + if (toDictResult == NULL) + { + PyErr_Clear(); + tc->type = JT_NULL; + return; + } + + if (!PyDict_Check(toDictResult)) + { + Py_DECREF(toDictResult); + tc->type = JT_NULL; + return; + } + + PRINTMARK(); + tc->type = JT_OBJECT; + pc->iterBegin = Dict_iterBegin; + pc->iterEnd = Dict_iterEnd; + pc->iterNext = Dict_iterNext; + pc->iterGetValue = Dict_iterGetValue; + pc->iterGetName = Dict_iterGetName; + pc->dictObj = toDictResult; + return; + } + + PyErr_Clear(); + + tc->type = JT_OBJECT; + pc->iterBegin = Dir_iterBegin; + pc->iterEnd = Dir_iterEnd; + pc->iterNext = Dir_iterNext; + pc->iterGetValue = Dir_iterGetValue; + pc->iterGetName = Dir_iterGetName; + + return; + +INVALID: + tc->type = JT_INVALID; + PyObject_Free(tc->prv); + tc->prv = NULL; + return; +} + + +void Object_endTypeContext(JSOBJ obj, JSONTypeContext *tc) +{ + Py_XDECREF(GET_TC(tc)->newObj); + NpyArr_freeLabels(GET_TC(tc)->rowLabels, GET_TC(tc)->rowLabelsLen); + NpyArr_freeLabels(GET_TC(tc)->columnLabels, GET_TC(tc)->columnLabelsLen); + + PyObject_Free(tc->prv); + tc->prv = NULL; +} + +const char *Object_getStringValue(JSOBJ obj, JSONTypeContext *tc, size_t *_outLen) +{ + return GET_TC(tc)->PyTypeToJSON (obj, tc, NULL, _outLen); +} + +JSINT64 Object_getLongValue(JSOBJ obj, JSONTypeContext *tc) +{ + JSINT64 ret; + GET_TC(tc)->PyTypeToJSON (obj, tc, &ret, NULL); + + return ret; +} + +JSINT32 Object_getIntValue(JSOBJ obj, JSONTypeContext *tc) +{ + JSINT32 ret; + GET_TC(tc)->PyTypeToJSON (obj, tc, &ret, NULL); + return ret; +} + + +double Object_getDoubleValue(JSOBJ obj, JSONTypeContext *tc) +{ + double ret; + GET_TC(tc)->PyTypeToJSON (obj, tc, &ret, NULL); + return ret; +} + +static void Object_releaseObject(JSOBJ _obj) +{ + Py_DECREF( (PyObject *) _obj); +} + + + +void Object_iterBegin(JSOBJ obj, JSONTypeContext *tc) +{ + GET_TC(tc)->iterBegin(obj, tc); +} + +int Object_iterNext(JSOBJ obj, JSONTypeContext *tc) +{ + return GET_TC(tc)->iterNext(obj, tc); +} + +void Object_iterEnd(JSOBJ obj, JSONTypeContext *tc) +{ + GET_TC(tc)->iterEnd(obj, tc); +} + +JSOBJ Object_iterGetValue(JSOBJ obj, JSONTypeContext *tc) +{ + return GET_TC(tc)->iterGetValue(obj, tc); +} + +char *Object_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) +{ + return GET_TC(tc)->iterGetName(obj, tc, outLen); +} + + +PyObject* objToJSON(PyObject* self, PyObject *args, PyObject *kwargs) +{ + static char *kwlist[] = { "obj", "ensure_ascii", "double_precision", "orient", NULL}; + + char buffer[65536]; + char *ret; + PyObject *newobj; + PyObject *oinput = NULL; + PyObject *oensureAscii = NULL; + char *sOrient = NULL; + int idoublePrecision = 5; // default double precision setting + + PyObjectEncoder pyEncoder = + { + { + Object_beginTypeContext, //void (*beginTypeContext)(JSOBJ obj, JSONTypeContext *tc); + Object_endTypeContext, //void (*endTypeContext)(JSOBJ obj, JSONTypeContext *tc); + Object_getStringValue, //const char *(*getStringValue)(JSOBJ obj, 
JSONTypeContext *tc, size_t *_outLen); + Object_getLongValue, //JSLONG (*getLongValue)(JSOBJ obj, JSONTypeContext *tc); + Object_getIntValue, //JSLONG (*getLongValue)(JSOBJ obj, JSONTypeContext *tc); + Object_getDoubleValue, //double (*getDoubleValue)(JSOBJ obj, JSONTypeContext *tc); + Object_iterBegin, //JSPFN_ITERBEGIN iterBegin; + Object_iterNext, //JSPFN_ITERNEXT iterNext; + Object_iterEnd, //JSPFN_ITEREND iterEnd; + Object_iterGetValue, //JSPFN_ITERGETVALUE iterGetValue; + Object_iterGetName, //JSPFN_ITERGETNAME iterGetName; + Object_releaseObject, //void (*releaseValue)(JSONTypeContext *ti); + PyObject_Malloc, //JSPFN_MALLOC malloc; + PyObject_Realloc, //JSPFN_REALLOC realloc; + PyObject_Free, //JSPFN_FREE free; + -1, //recursionMax + idoublePrecision, + 1, //forceAscii + } + }; + JSONObjectEncoder* encoder = (JSONObjectEncoder*) &pyEncoder; + + pyEncoder.npyCtxtPassthru = NULL; + pyEncoder.outputFormat = COLUMNS; + + PRINTMARK(); + + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|Ois", kwlist, &oinput, &oensureAscii, &idoublePrecision, &sOrient)) + { + return NULL; + } + + if (sOrient != NULL) + { + if (strcmp(sOrient, "records") == 0) + { + pyEncoder.outputFormat = RECORDS; + } + else + if (strcmp(sOrient, "index") == 0) + { + pyEncoder.outputFormat = INDEX; + } + else + if (strcmp(sOrient, "split") == 0) + { + pyEncoder.outputFormat = SPLIT; + } + else + if (strcmp(sOrient, "values") == 0) + { + pyEncoder.outputFormat = VALUES; + } + else + if (strcmp(sOrient, "columns") != 0) + { + PyErr_Format (PyExc_ValueError, "Invalid value '%s' for option 'orient'", sOrient); + return NULL; + } + } + + pyEncoder.originalOutputFormat = pyEncoder.outputFormat; + + if (oensureAscii != NULL && !PyObject_IsTrue(oensureAscii)) + { + encoder->forceASCII = 0; + } + + encoder->doublePrecision = idoublePrecision; + + PRINTMARK(); + ret = JSON_EncodeObject (oinput, encoder, buffer, sizeof (buffer)); + PRINTMARK(); + + if (PyErr_Occurred()) + { + return NULL; + } + + if (encoder->errorMsg) + { + if (ret != buffer) + { + encoder->free (ret); + } + + PyErr_Format (PyExc_OverflowError, "%s", encoder->errorMsg); + return NULL; + } + + newobj = PyString_FromString (ret); + + if (ret != buffer) + { + encoder->free (ret); + } + + PRINTMARK(); + + return newobj; +} + +PyObject* objToJSONFile(PyObject* self, PyObject *args, PyObject *kwargs) +{ + PyObject *data; + PyObject *file; + PyObject *string; + PyObject *write; + PyObject *argtuple; + + PRINTMARK(); + + if (!PyArg_ParseTuple (args, "OO", &data, &file)) { + return NULL; + } + + if (!PyObject_HasAttrString (file, "write")) + { + PyErr_Format (PyExc_TypeError, "expected file"); + return NULL; + } + + write = PyObject_GetAttrString (file, "write"); + + if (!PyCallable_Check (write)) { + Py_XDECREF(write); + PyErr_Format (PyExc_TypeError, "expected file"); + return NULL; + } + + argtuple = PyTuple_Pack(1, data); + + string = objToJSON (self, argtuple, kwargs); + + if (string == NULL) + { + Py_XDECREF(write); + Py_XDECREF(argtuple); + return NULL; + } + + Py_XDECREF(argtuple); + + argtuple = PyTuple_Pack (1, string); + if (argtuple == NULL) + { + Py_XDECREF(write); + return NULL; + } + if (PyObject_CallObject (write, argtuple) == NULL) + { + Py_XDECREF(write); + Py_XDECREF(argtuple); + return NULL; + } + + Py_XDECREF(write); + Py_DECREF(argtuple); + Py_XDECREF(string); + + PRINTMARK(); + + Py_RETURN_NONE; + + +} + diff --git a/pandas/src/ujson/python/ujson.c b/pandas/src/ujson/python/ujson.c new file mode 100644 index 0000000000000..e04309e620a1d --- 
/dev/null +++ b/pandas/src/ujson/python/ujson.c @@ -0,0 +1,73 @@ +#include "py_defines.h" +#include "version.h" + +/* objToJSON */ +PyObject* objToJSON(PyObject* self, PyObject *args, PyObject *kwargs); +void initObjToJSON(void); + +/* JSONToObj */ +PyObject* JSONToObj(PyObject* self, PyObject *args, PyObject *kwargs); + +/* objToJSONFile */ +PyObject* objToJSONFile(PyObject* self, PyObject *args, PyObject *kwargs); + +/* JSONFileToObj */ +PyObject* JSONFileToObj(PyObject* self, PyObject *args, PyObject *kwargs); + + +static PyMethodDef ujsonMethods[] = { + {"encode", (PyCFunction) objToJSON, METH_VARARGS | METH_KEYWORDS, "Converts arbitrary object recursivly into JSON. Use ensure_ascii=false to output UTF-8. Pass in double_precision to alter the maximum digit precision with doubles"}, + {"decode", (PyCFunction) JSONToObj, METH_VARARGS | METH_KEYWORDS, "Converts JSON as string to dict object structure"}, + {"dumps", (PyCFunction) objToJSON, METH_VARARGS | METH_KEYWORDS, "Converts arbitrary object recursivly into JSON. Use ensure_ascii=false to output UTF-8"}, + {"loads", (PyCFunction) JSONToObj, METH_VARARGS | METH_KEYWORDS, "Converts JSON as string to dict object structure"}, + {"dump", (PyCFunction) objToJSONFile, METH_VARARGS | METH_KEYWORDS, "Converts arbitrary object recursively into JSON file. Use ensure_ascii=false to output UTF-8"}, + {"load", (PyCFunction) JSONFileToObj, METH_VARARGS | METH_KEYWORDS, "Converts JSON as file to dict object structure"}, + {NULL, NULL, 0, NULL} /* Sentinel */ +}; + +#if PY_MAJOR_VERSION >= 3 + +static struct PyModuleDef moduledef = { + PyModuleDef_HEAD_INIT, + "_pandasujson", + 0, /* m_doc */ + -1, /* m_size */ + ujsonMethods, /* m_methods */ + NULL, /* m_reload */ + NULL, /* m_traverse */ + NULL, /* m_clear */ + NULL /* m_free */ +}; + +#define PYMODINITFUNC PyObject *PyInit_json(void) +#define PYMODULE_CREATE() PyModule_Create(&moduledef) +#define MODINITERROR return NULL + +#else + +#define PYMODINITFUNC PyMODINIT_FUNC initjson(void) +#define PYMODULE_CREATE() Py_InitModule("json", ujsonMethods) +#define MODINITERROR return + +#endif + +PYMODINITFUNC +{ + PyObject *module; + PyObject *version_string; + + initObjToJSON(); + module = PYMODULE_CREATE(); + + if (module == NULL) + { + MODINITERROR; + } + + version_string = PyString_FromString (UJSON_VERSION); + PyModule_AddObject (module, "__version__", version_string); + +#if PY_MAJOR_VERSION >= 3 + return module; +#endif +} diff --git a/setup.py b/setup.py index 1cc666c87404b..ff40738ddfb78 100755 --- a/setup.py +++ b/setup.py @@ -244,7 +244,13 @@ def initialize_options(self): 'np_datetime_strings.c', 'period.c', 'tokenizer.c', - 'io.c'] + 'io.c', + 'ujson.c', + 'objToJSON.c', + 'JSONtoObj.c', + 'ultrajsonenc.c', + 'ultrajsondec.c', + ] for root, dirs, files in list(os.walk('pandas')): for f in files: @@ -472,7 +478,8 @@ def pxd(name): 'pandas/src/datetime/np_datetime.c', 'pandas/src/datetime/np_datetime_strings.c'], include_dirs=['pandas/src/ujson/python', - 'pandas/src/ujson/lib'] + common_include) + 'pandas/src/ujson/lib', + 'pandas/src/datetime'] + common_include) extensions.append(ujson_ext) From 7dd12cce711ffc478b69ebee2e8fa013d34ba746 Mon Sep 17 00:00:00 2001 From: jreback Date: Fri, 7 Jun 2013 18:42:56 -0400 Subject: [PATCH 30/71] CLN: revised json support to use the to_json/read_json in pandas.io.json DOC: docs in io.rst/whatsnew/release notes/api TST: cleaned up cruft in test_series/test_frame --- doc/source/api.rst | 11 ++ doc/source/io.rst | 27 +++- doc/source/v0.11.1.txt | 6 + 
pandas/core/frame.py | 100 --------------- pandas/core/generic.py | 32 +++++ pandas/core/series.py | 82 ------------ pandas/io/api.py | 1 + pandas/io/json.py | 152 +++++++++++++++++++++++ pandas/io/tests/test_json/test_pandas.py | 42 +++++-- pandas/tests/test_frame.py | 140 --------------------- pandas/tests/test_series.py | 56 --------- 11 files changed, 257 insertions(+), 392 deletions(-) create mode 100644 pandas/io/json.py diff --git a/doc/source/api.rst b/doc/source/api.rst index e263554460380..bb6f0ac073e21 100644 --- a/doc/source/api.rst +++ b/doc/source/api.rst @@ -45,6 +45,16 @@ Excel read_excel ExcelFile.parse +JSON +~~~~ + +.. currentmodule:: pandas.io.json + +.. autosummary:: + :toctree: generated/ + + read_json + HTML ~~~~ @@ -597,6 +607,7 @@ Serialization / IO / Conversion DataFrame.to_hdf DataFrame.to_dict DataFrame.to_excel + DataFrame.to_json DataFrame.to_html DataFrame.to_stata DataFrame.to_records diff --git a/doc/source/io.rst b/doc/source/io.rst index ac5d49e036669..625ff39cd7eba 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -35,6 +35,7 @@ object. * ``read_excel`` * ``read_hdf`` * ``read_sql`` + * ``read_json`` * ``read_html`` * ``read_stata`` * ``read_clipboard`` @@ -45,6 +46,7 @@ The corresponding ``writer`` functions are object methods that are accessed like * ``to_excel`` * ``to_hdf`` * ``to_sql`` + * ``to_json`` * ``to_html`` * ``to_stata`` * ``to_clipboard`` @@ -937,6 +939,30 @@ The Series object also has a ``to_string`` method, but with only the ``buf``, which, if set to ``True``, will additionally output the length of the Series. + +JSON +---- + +Read and write ``JSON`` format files. + +.. _io.json: + +Writing JSON +~~~~~~~~~~~~ + +.. ipython:: python + + df = DataFrame(randn(10, 2), columns=list('AB')) + s = df.to_json() + s + +Reading JSON +~~~~~~~~~~~~ + +.. ipython:: python + + pd.read_json(s) + HTML ---- @@ -2193,7 +2219,6 @@ into a .dta file. The format version of this file is always the latest one, 115. .. ipython:: python - from pandas.io.stata import StataWriter df = DataFrame(randn(10, 2), columns=list('AB')) df.to_stata('stata.dta') diff --git a/doc/source/v0.11.1.txt b/doc/source/v0.11.1.txt index 70d840f8c477a..5045f73375a97 100644 --- a/doc/source/v0.11.1.txt +++ b/doc/source/v0.11.1.txt @@ -16,6 +16,7 @@ API changes * ``read_excel`` * ``read_hdf`` * ``read_sql`` + * ``read_json`` * ``read_html`` * ``read_stata`` * ``read_clipboard`` @@ -26,6 +27,7 @@ API changes * ``to_excel`` * ``to_hdf`` * ``to_sql`` + * ``to_json`` * ``to_html`` * ``to_stata`` * ``to_clipboard`` @@ -175,6 +177,10 @@ Enhancements accessable via ``read_stata`` top-level function for reading, and ``to_stata`` DataFrame method for writing, :ref:`See the docs` + - Added module for reading and writing json format files: ``pandas.io.json`` + accessable via ``read_json`` top-level function for reading, + and ``to_json`` DataFrame method for writing, :ref:`See the docs` + - ``DataFrame.replace()`` now allows regular expressions on contained ``Series`` with object dtype. See the examples section in the regular docs :ref:`Replacing via String Expression ` diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 2925bb3e3b73a..9c0a2843370f4 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5593,106 +5593,6 @@ def mask(self, cond): """ return self.where(~cond, NA) - -@classmethod -def from_json(cls, json, orient="columns", dtype=None, numpy=True): - """ - Convert JSON string to DataFrame - - Parameters - ---------- - json : The JSON string to parse. 
- orient : {'split', 'records', 'index', 'columns', 'values'}, - default 'columns' - The format of the JSON string - split : dict like - {index -> [index], columns -> [columns], data -> [values]} - records : list like [{column -> value}, ... , {column -> value}] - index : dict like {index -> {column -> value}} - columns : dict like {column -> {index -> value}} - values : just the values array - dtype : dtype of the resulting DataFrame - nupmpy: direct decoding to numpy arrays. default True but falls back - to standard decoding if a problem occurs. - - Returns - ------- - result : DataFrame - """ - from pandas.json import loads - - df = None - - if dtype is not None and orient == "split": - numpy = False - - if numpy: - try: - if orient == "columns": - args = loads(json, dtype=dtype, numpy=True, labelled=True) - if args: - args = (args[0].T, args[2], args[1]) - df = DataFrame(*args) - elif orient == "split": - decoded = loads(json, dtype=dtype, numpy=True) - decoded = dict((str(k), v) for k, v in decoded.iteritems()) - df = DataFrame(**decoded) - elif orient == "values": - df = DataFrame(loads(json, dtype=dtype, numpy=True)) - else: - df = DataFrame(*loads(json, dtype=dtype, numpy=True, - labelled=True)) - except ValueError: - numpy = False - if not numpy: - if orient == "columns": - df = DataFrame(loads(json), dtype=dtype) - elif orient == "split": - decoded = dict((str(k), v) - for k, v in loads(json).iteritems()) - df = DataFrame(dtype=dtype, **decoded) - elif orient == "index": - df = DataFrame(loads(json), dtype=dtype).T - else: - df = DataFrame(loads(json), dtype=dtype) - - return df -DataFrame.from_json = from_json - - -def to_json(self, orient="columns", double_precision=10, - force_ascii=True): - """ - Convert DataFrame to a JSON string. - - Note NaN's and None will be converted to null and datetime objects - will be converted to UNIX timestamps. - - Parameters - ---------- - orient : {'split', 'records', 'index', 'columns', 'values'}, - default 'columns' - The format of the JSON string - split : dict like - {index -> [index], columns -> [columns], data -> [values]} - records : list like [{column -> value}, ... , {column -> value}] - index : dict like {index -> {column -> value}} - columns : dict like {column -> {index -> value}} - values : just the values array - double_precision : The number of decimal places to use when encoding - floating point values, default 10. - force_ascii : force encoded string to be ASCII, default True. - - Returns - ------- - result : JSON compatible string - """ - from pandas.json import dumps - return dumps(self, orient=orient, double_precision=double_precision, - ensure_ascii=force_ascii) -DataFrame.to_json = to_json - - _EMPTY_SERIES = Series([]) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 5533584745167..7a947f9b4f96b 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -495,6 +495,38 @@ def to_clipboard(self): from pandas.io import clipboard clipboard.to_clipboard(self) + def to_json(self, orient=None, double_precision=10, + force_ascii=True): + """ + Convert the object to a JSON string. + + Note NaN's and None will be converted to null and datetime objects + will be converted to UNIX timestamps. + + Parameters + ---------- + orient : {'split', 'records', 'index', 'columns', 'values'}, + default is 'index' for Series, 'columns' for DataFrame + + The format of the JSON string + split : dict like + {index -> [index], columns -> [columns], data -> [values]} + records : list like [{column -> value}, ... 
, {column -> value}] + index : dict like {index -> {column -> value}} + columns : dict like {column -> {index -> value}} + values : just the values array + double_precision : The number of decimal places to use when encoding + floating point values, default 10. + force_ascii : force encoded string to be ASCII, default True. + + Returns + ------- + result : JSON compatible string + """ + from pandas.io import json + return json.to_json(self, orient=orient, double_precision=double_precision, + force_ascii=force_ascii) + # install the indexerse for _name, _indexer in indexing.get_indexers_list(): PandasObject._create_indexer(_name,_indexer) diff --git a/pandas/core/series.py b/pandas/core/series.py index 9147e64f5b11a..3a7a7d0f49b66 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -3298,88 +3298,6 @@ def str(self): from pandas.core.strings import StringMethods return StringMethods(self) - -@classmethod -def from_json(cls, json, orient="index", dtype=None, numpy=True): - """ - Convert JSON string to Series - - Parameters - ---------- - json : The JSON string to parse. - orient : {'split', 'records', 'index'}, default 'index' - The format of the JSON string - split : dict like - {index -> [index], name -> name, data -> [values]} - records : list like [value, ... , value] - index : dict like {index -> value} - dtype : dtype of the resulting Series - nupmpy: direct decoding to numpy arrays. default True but falls back - to standard decoding if a problem occurs. - - Returns - ------- - result : Series - """ - from pandas.json import loads - s = None - - if dtype is not None and orient == "split": - numpy = False - - if numpy: - try: - if orient == "split": - decoded = loads(json, dtype=dtype, numpy=True) - decoded = dict((str(k), v) for k, v in decoded.iteritems()) - s = Series(**decoded) - elif orient == "columns" or orient == "index": - s = Series(*loads(json, dtype=dtype, numpy=True, - labelled=True)) - else: - s = Series(loads(json, dtype=dtype, numpy=True)) - except ValueError: - numpy = False - if not numpy: - if orient == "split": - decoded = dict((str(k), v) - for k, v in loads(json).iteritems()) - s = Series(dtype=dtype, **decoded) - else: - s = Series(loads(json), dtype=dtype) - - return s -Series.from_json = from_json - -def to_json(self, orient="index", double_precision=10, force_ascii=True): - """ - Convert Series to a JSON string - - Note NaN's and None will be converted to null and datetime objects - will be converted to UNIX timestamps. - - Parameters - ---------- - orient : {'split', 'records', 'index'}, default 'index' - The format of the JSON string - split : dict like - {index -> [index], name -> name, data -> [values]} - records : list like [value, ... , value] - index : dict like {index -> value} - double_precision : The number of decimal places to use when encoding - floating point values, default 10. - force_ascii : force encoded string to be ASCII, default True. 
- - Returns - ------- - result : JSON compatible string - """ - from pandas.json import dumps - return dumps(self, orient=orient, double_precision=double_precision, - ensure_ascii=force_ascii) -Series.to_json = to_json - - _INDEX_TYPES = ndarray, Index, list, tuple #------------------------------------------------------------------------------ diff --git a/pandas/io/api.py b/pandas/io/api.py index f17351921f83f..48566399f9bfe 100644 --- a/pandas/io/api.py +++ b/pandas/io/api.py @@ -6,6 +6,7 @@ from pandas.io.clipboard import read_clipboard from pandas.io.excel import ExcelFile, ExcelWriter, read_excel from pandas.io.pytables import HDFStore, Term, get_store, read_hdf +from pandas.io.json import read_json from pandas.io.html import read_html from pandas.io.sql import read_sql from pandas.io.stata import read_stata diff --git a/pandas/io/json.py b/pandas/io/json.py new file mode 100644 index 0000000000000..7c8f6f40bfd4e --- /dev/null +++ b/pandas/io/json.py @@ -0,0 +1,152 @@ + +# pylint: disable-msg=E1101,W0613,W0603 +from pandas import Series, DataFrame + +import pandas.json as _json +loads = _json.loads +dumps = _json.dumps + +### interface to/from ### + +def to_json(obj, orient=None, double_precision=10, + force_ascii=True): + """ + Convert the object to a JSON string. + + Note NaN's and None will be converted to null and datetime objects + will be converted to UNIX timestamps. + + Parameters + ---------- + orient : {'split', 'records', 'index', 'columns', 'values'}, + default is 'index' for Series, 'columns' for DataFrame + + The format of the JSON string + split : dict like + {index -> [index], columns -> [columns], data -> [values]} + records : list like [{column -> value}, ... , {column -> value}] + index : dict like {index -> {column -> value}} + columns : dict like {column -> {index -> value}} + values : just the values array + double_precision : The number of decimal places to use when encoding + floating point values, default 10. + force_ascii : force encoded string to be ASCII, default True. + + Returns + ------- + result : JSON compatible string + """ + if orient is None: + if isinstance(obj, Series): + orient = 'index' + elif isinstance(obj, DataFrame): + orient = 'columns' + + return dumps(obj, orient=orient, double_precision=double_precision, + ensure_ascii=force_ascii) + +def read_json(json, typ='frame', orient=None, dtype=None, numpy=True): + """ + Convert JSON string to pandas object + + Parameters + ---------- + json : The JSON string to parse. + typ : type of object to recover (series or frame), default 'frame' + orient : {'split', 'records', 'index'}, default 'index' + The format of the JSON string + split : dict like + {index -> [index], name -> name, data -> [values]} + records : list like [value, ... , value] + index : dict like {index -> value} + dtype : dtype of the resulting Series + nupmpy: direct decoding to numpy arrays. default True but falls back + to standard decoding if a problem occurs. 
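The new module keeps the per-type defaults (``orient='index'`` for a Series, ``'columns'`` for a DataFrame). A minimal round-trip sketch against the interface as it stands in this patch (later patches in the series add a ``path_or_buf`` argument and date handling); note that unserialised JSON object keys come back as strings, which is why several of the tests below compare on stringified labels:

    import pandas as pd
    from pandas import DataFrame, Series

    df = DataFrame({'A': [1, 2], 'B': [3, 4]})
    dfjson = df.to_json(orient='split')          # {"columns":[...],"index":[...],"data":[...]}
    df2 = pd.read_json(dfjson, orient='split')   # back to a DataFrame

    ser = Series([1.5, 2.5], index=['a', 'b'])
    serjson = ser.to_json()                      # Series default orient is 'index'
    ser2 = pd.read_json(serjson, typ='series')   # force Series parsing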
+ + Returns + ------- + result : Series or DataFrame + """ + + obj = None + if typ == 'frame': + if orient is None: + orient = 'columns' + obj = load_frame(json, orient, dtype, numpy) + + if typ == 'series' or obj is None: + if orient == 'columns': + orient = 'index' + obj = load_series(json, orient, dtype, numpy) + + return obj + +def load_series(json, orient, dtype, numpy): + s = None + + if dtype is not None and orient == "split": + numpy = False + + if numpy: + try: + if orient == "split": + decoded = loads(json, dtype=dtype, numpy=True) + decoded = dict((str(k), v) for k, v in decoded.iteritems()) + s = Series(**decoded) + elif orient == "columns" or orient == "index": + s = Series(*loads(json, dtype=dtype, numpy=True, + labelled=True)) + else: + s = Series(loads(json, dtype=dtype, numpy=True)) + except ValueError: + numpy = False + + if not numpy: + if orient == "split": + decoded = dict((str(k), v) + for k, v in loads(json).iteritems()) + s = Series(dtype=dtype, **decoded) + else: + s = Series(loads(json), dtype=dtype) + + return s + + +def load_frame(json, orient, dtype, numpy): + """ try to recover a frame, return None if we didn't get anything """ + + if dtype is not None and orient == "split": + numpy = False + + if numpy: + try: + if orient == "columns": + args = loads(json, dtype=dtype, numpy=True, labelled=True) + if args: + args = (args[0].T, args[2], args[1]) + df = DataFrame(*args) + elif orient == "split": + decoded = loads(json, dtype=dtype, numpy=True) + decoded = dict((str(k), v) for k, v in decoded.iteritems()) + df = DataFrame(**decoded) + elif orient == "values": + df = DataFrame(loads(json, dtype=dtype, numpy=True)) + else: + df = DataFrame(*loads(json, dtype=dtype, numpy=True, + labelled=True)) + except ValueError: + numpy = False + + if not numpy: + if orient == "columns": + df = DataFrame(loads(json), dtype=dtype) + elif orient == "split": + decoded = dict((str(k), v) + for k, v in loads(json).iteritems()) + df = DataFrame(dtype=dtype, **decoded) + elif orient == "index": + df = DataFrame(loads(json), dtype=dtype).T + else: + df = DataFrame(loads(json), dtype=dtype) + + return df diff --git a/pandas/io/tests/test_json/test_pandas.py b/pandas/io/tests/test_json/test_pandas.py index 506aa382487d6..f4cb7ed03c026 100644 --- a/pandas/io/tests/test_json/test_pandas.py +++ b/pandas/io/tests/test_json/test_pandas.py @@ -1,3 +1,4 @@ + # pylint: disable-msg=W0612,E1101 from copy import deepcopy from datetime import datetime, timedelta @@ -11,6 +12,7 @@ from pandas import Series, DataFrame, DatetimeIndex import pandas as pd +read_json = pd.read_json from pandas.util.testing import (assert_almost_equal, assert_frame_equal, assert_series_equal) @@ -55,8 +57,8 @@ def test_frame_from_json_to_json(self): def _check_orient(df, orient, dtype=None, numpy=True): df = df.sort() dfjson = df.to_json(orient=orient) - unser = DataFrame.from_json(dfjson, orient=orient, dtype=dtype, - numpy=numpy) + unser = read_json(dfjson, orient=orient, dtype=dtype, + numpy=numpy) unser = unser.sort() if df.index.dtype.type == np.datetime64: unser.index = DatetimeIndex(unser.index.values.astype('i8')) @@ -136,50 +138,50 @@ def _check_all_orients(df, dtype=None): _check_orient(df.transpose().transpose(), "index") def test_frame_from_json_bad_data(self): - self.assertRaises(ValueError, DataFrame.from_json, '{"key":b:a:d}') + self.assertRaises(ValueError, read_json, '{"key":b:a:d}') # too few indices json = ('{"columns":["A","B"],' '"index":["2","3"],' '"data":[[1.0,"1"],[2.0,"2"],[null,"3"]]}"') - 
self.assertRaises(ValueError, DataFrame.from_json, json, + self.assertRaises(ValueError, read_json, json, orient="split") # too many columns json = ('{"columns":["A","B","C"],' '"index":["1","2","3"],' '"data":[[1.0,"1"],[2.0,"2"],[null,"3"]]}"') - self.assertRaises(AssertionError, DataFrame.from_json, json, + self.assertRaises(AssertionError, read_json, json, orient="split") # bad key json = ('{"badkey":["A","B"],' '"index":["2","3"],' '"data":[[1.0,"1"],[2.0,"2"],[null,"3"]]}"') - self.assertRaises(TypeError, DataFrame.from_json, json, + self.assertRaises(TypeError, read_json, json, orient="split") def test_frame_from_json_nones(self): df = DataFrame([[1, 2], [4, 5, 6]]) - unser = DataFrame.from_json(df.to_json()) + unser = read_json(df.to_json()) self.assert_(np.isnan(unser['2'][0])) df = DataFrame([['1', '2'], ['4', '5', '6']]) - unser = DataFrame.from_json(df.to_json()) + unser = read_json(df.to_json()) self.assert_(unser['2'][0] is None) - unser = DataFrame.from_json(df.to_json(), numpy=False) + unser = read_json(df.to_json(), numpy=False) self.assert_(unser['2'][0] is None) # infinities get mapped to nulls which get mapped to NaNs during # deserialisation df = DataFrame([[1, 2], [4, 5, 6]]) df[2][0] = np.inf - unser = DataFrame.from_json(df.to_json()) + unser = read_json(df.to_json()) self.assert_(np.isnan(unser['2'][0])) df[2][0] = np.NINF - unser = DataFrame.from_json(df.to_json()) + unser = read_json(df.to_json()) self.assert_(np.isnan(unser['2'][0])) def test_frame_to_json_except(self): @@ -190,8 +192,8 @@ def test_series_from_json_to_json(self): def _check_orient(series, orient, dtype=None, numpy=True): series = series.sort_index() - unser = Series.from_json(series.to_json(orient=orient), - orient=orient, numpy=numpy, dtype=dtype) + unser = read_json(series.to_json(orient=orient), typ='series', + orient=orient, numpy=numpy, dtype=dtype) unser = unser.sort_index() if series.index.dtype.type == np.datetime64: unser.index = DatetimeIndex(unser.index.values.astype('i8')) @@ -238,3 +240,17 @@ def _check_all_orients(series, dtype=None): def test_series_to_json_except(self): s = Series([1, 2, 3]) self.assertRaises(ValueError, s.to_json, orient="garbage") + + def test_typ(self): + + s = Series(range(6), index=['a','b','c','d','e','f']) + result = read_json(s.to_json(),typ=None) + assert_series_equal(result,s) + + def test_reconstruction_index(self): + + df = DataFrame([[1, 2, 3], [4, 5, 6]]) + result = read_json(df.to_json()) + + # the index is serialized as strings....correct? 
+ #assert_frame_equal(result,df) diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index d674a2f44ebe1..2c6d3b221c6ff 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -3338,146 +3338,6 @@ def test_to_dict(self): for k2, v2 in v.iteritems(): self.assertEqual(v2, recons_data[k][k2]) - def test_from_json_to_json(self): - raise nose.SkipTest - - def _check_orient(df, orient, dtype=None, numpy=True): - df = df.sort() - dfjson = df.to_json(orient=orient) - unser = DataFrame.from_json(dfjson, orient=orient, dtype=dtype, - numpy=numpy) - unser = unser.sort() - if df.index.dtype.type == np.datetime64: - unser.index = DatetimeIndex(unser.index.values.astype('i8')) - if orient == "records": - # index is not captured in this orientation - assert_almost_equal(df.values, unser.values) - self.assert_(df.columns.equals(unser.columns)) - elif orient == "values": - # index and cols are not captured in this orientation - assert_almost_equal(df.values, unser.values) - elif orient == "split": - # index and col labels might not be strings - unser.index = [str(i) for i in unser.index] - unser.columns = [str(i) for i in unser.columns] - unser = unser.sort() - assert_almost_equal(df.values, unser.values) - else: - assert_frame_equal(df, unser) - - def _check_all_orients(df, dtype=None): - _check_orient(df, "columns", dtype=dtype) - _check_orient(df, "records", dtype=dtype) - _check_orient(df, "split", dtype=dtype) - _check_orient(df, "index", dtype=dtype) - _check_orient(df, "values", dtype=dtype) - - _check_orient(df, "columns", dtype=dtype, numpy=False) - _check_orient(df, "records", dtype=dtype, numpy=False) - _check_orient(df, "split", dtype=dtype, numpy=False) - _check_orient(df, "index", dtype=dtype, numpy=False) - _check_orient(df, "values", dtype=dtype, numpy=False) - - # basic - _check_all_orients(self.frame) - self.assertEqual(self.frame.to_json(), - self.frame.to_json(orient="columns")) - - _check_all_orients(self.intframe, dtype=self.intframe.values.dtype) - - # big one - # index and columns are strings as all unserialised JSON object keys - # are assumed to be strings - biggie = DataFrame(np.zeros((200, 4)), - columns=[str(i) for i in range(4)], - index=[str(i) for i in range(200)]) - _check_all_orients(biggie) - - # dtypes - _check_all_orients(DataFrame(biggie, dtype=np.float64), - dtype=np.float64) - _check_all_orients(DataFrame(biggie, dtype=np.int64), dtype=np.int64) - _check_all_orients(DataFrame(biggie, dtype=' Date: Fri, 7 Jun 2013 21:55:35 -0400 Subject: [PATCH 31/71] DOC: io.rst doc updates --- doc/source/io.rst | 34 ++++++++++++++++++++++++++++++++++ pandas/io/json.py | 3 ++- 2 files changed, 36 insertions(+), 1 deletion(-) diff --git a/doc/source/io.rst b/doc/source/io.rst index 625ff39cd7eba..c98b49be9827f 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -949,6 +949,22 @@ Read and write ``JSON`` format files. Writing JSON ~~~~~~~~~~~~ + +A ``Series`` or ``DataFrame`` can be converted to a valid JSON string. Use ``to_json`` +with optional parameters: + +- orient : The format of the JSON string, default is ``index`` for ``Series``, ``columns`` for ``DataFrame`` + + * split : dict like {index -> [index], columns -> [columns], data -> [values]} + * records : list like [{column -> value}, ... 
, {column -> value}] + * index : dict like {index -> {column -> value}} + * columns : dict like {column -> {index -> value}} + * values : just the values array + +- double_precision : The number of decimal places to use when encoding floating point values, default 10. +- force_ascii : force encoded string to be ASCII, default True. + +Note NaN's and None will be converted to null and datetime objects will be converted to UNIX timestamps. .. ipython:: python @@ -959,6 +975,24 @@ Writing JSON Reading JSON ~~~~~~~~~~~~ +Reading a JSON string to pandas object can take a number of parameters. +The parser will try to parse a ``DataFrame`` if ``typ`` is not supplied or +is ``None``. To explicity force ``Series`` parsing, pass ``typ=series`` + +- json : The JSON string to parse. +- typ : type of object to recover (series or frame), default 'frame' +- orient : The format of the JSON string, one of the following + + * split : dict like {index -> [index], name -> name, data -> [values]} + * records : list like [value, ... , value] + * index : dict like {index -> value} + +- dtype : dtype of the resulting Series +- numpy : direct decoding to numpy arrays. default True but falls back to standard decoding if a problem occurs. + +The parser will raise one of ``ValueError/TypeError/AssertionError`` if the JSON is +not parsable. + .. ipython:: python pd.read_json(s) diff --git a/pandas/io/json.py b/pandas/io/json.py index 7c8f6f40bfd4e..76d8ae05b07c0 100644 --- a/pandas/io/json.py +++ b/pandas/io/json.py @@ -36,6 +36,7 @@ def to_json(obj, orient=None, double_precision=10, ------- result : JSON compatible string """ + if orient is None: if isinstance(obj, Series): orient = 'index' @@ -60,7 +61,7 @@ def read_json(json, typ='frame', orient=None, dtype=None, numpy=True): records : list like [value, ... , value] index : dict like {index -> value} dtype : dtype of the resulting Series - nupmpy: direct decoding to numpy arrays. default True but falls back + numpy: direct decoding to numpy arrays. default True but falls back to standard decoding if a problem occurs. Returns From 64220419afd6b711eecf24c9acf300b5d1dd5110 Mon Sep 17 00:00:00 2001 From: jreback Date: Sat, 8 Jun 2013 09:02:12 -0400 Subject: [PATCH 32/71] API: to_json now writes to a file by default (if None is provided it will return a StringIO object) read_json will read from a string-like or filebuf or url (consistent with other parsers) --- doc/source/io.rst | 23 ++++++-- pandas/core/generic.py | 11 +++- pandas/io/common.py | 1 + pandas/io/excel.py | 2 +- pandas/io/json.py | 41 +++++++++++--- pandas/io/parsers.py | 1 - pandas/io/tests/test_json/test_pandas.py | 70 +++++++++++++++--------- pandas/io/tests/test_json/test_ujson.py | 30 +++++----- 8 files changed, 122 insertions(+), 57 deletions(-) mode change 100644 => 100755 pandas/io/tests/test_json/test_pandas.py diff --git a/doc/source/io.rst b/doc/source/io.rst index c98b49be9827f..f1480b6546e04 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -939,7 +939,6 @@ The Series object also has a ``to_string`` method, but with only the ``buf``, which, if set to ``True``, will additionally output the length of the Series. - JSON ---- @@ -953,6 +952,8 @@ Writing JSON A ``Series`` or ``DataFrame`` can be converted to a valid JSON string. 
Use ``to_json`` with optional parameters: +- path_or_buf : the pathname or buffer to write the output + This can be ``None`` in which case a ``StringIO`` converted string is returned - orient : The format of the JSON string, default is ``index`` for ``Series``, ``columns`` for ``DataFrame`` * split : dict like {index -> [index], columns -> [columns], data -> [values]} @@ -969,8 +970,8 @@ Note NaN's and None will be converted to null and datetime objects will be conve .. ipython:: python df = DataFrame(randn(10, 2), columns=list('AB')) - s = df.to_json() - s + json = df.to_json(None) + json.getvalue() Reading JSON ~~~~~~~~~~~~ @@ -979,7 +980,11 @@ Reading a JSON string to pandas object can take a number of parameters. The parser will try to parse a ``DataFrame`` if ``typ`` is not supplied or is ``None``. To explicity force ``Series`` parsing, pass ``typ=series`` -- json : The JSON string to parse. +- filepath_or_buffer : a **VALID** JSON string or file handle / StringIO. The string could be + a URL. Valid URL schemes include http, ftp, s3, and file. For file URLs, a host + is expected. For instance, a local file could be + file ://localhost/path/to/table.json +- json : a VALID JSON string, optional, used if filepath_or_buffer is not provided - typ : type of object to recover (series or frame), default 'frame' - orient : The format of the JSON string, one of the following @@ -993,9 +998,17 @@ is ``None``. To explicity force ``Series`` parsing, pass ``typ=series`` The parser will raise one of ``ValueError/TypeError/AssertionError`` if the JSON is not parsable. +Reading from a JSON string + +.. ipython:: python + + pd.read_json(json='{"0":{"0":1,"1":3},"1":{"0":2,"1":4}}') + +Reading from a StringIO + .. ipython:: python - pd.read_json(s) + pd.read_json(json) HTML ---- diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 7a947f9b4f96b..ac9663a34e748 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -495,7 +495,7 @@ def to_clipboard(self): from pandas.io import clipboard clipboard.to_clipboard(self) - def to_json(self, orient=None, double_precision=10, + def to_json(self, path_or_buf, orient=None, double_precision=10, force_ascii=True): """ Convert the object to a JSON string. 
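With this change ``to_json`` takes the output path or buffer first and hands back a ``StringIO`` when given ``None``, while ``read_json`` accepts an open buffer, a local path, or a URL. A short sketch of that flow as described in the docs above (the file name is illustrative):

    import pandas as pd
    from pandas import DataFrame

    df = DataFrame({'A': [1, 2], 'B': [3, 4]})

    df.to_json('frame.json')                 # write the encoded string straight to a path
    buf = df.to_json(None)                   # None -> a StringIO holding the string
    from_buf = pd.read_json(buf)             # anything with a .read() is accepted
    from_path = pd.read_json('frame.json')   # ...as is a local path (or, per the docs, a URL)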
@@ -505,6 +505,8 @@ def to_json(self, orient=None, double_precision=10, Parameters ---------- + path_or_buf : the path or buffer to write the result string + if this is None, return a StringIO of the converted string orient : {'split', 'records', 'index', 'columns', 'values'}, default is 'index' for Series, 'columns' for DataFrame @@ -521,10 +523,13 @@ def to_json(self, orient=None, double_precision=10, Returns ------- - result : JSON compatible string + result : a JSON compatible string written to the path_or_buf; + if the path_or_buf is none, return a StringIO of the result + """ + from pandas.io import json - return json.to_json(self, orient=orient, double_precision=double_precision, + return json.to_json(path_or_buf, self, orient=orient, double_precision=double_precision, force_ascii=force_ascii) # install the indexerse diff --git a/pandas/io/common.py b/pandas/io/common.py index 46b47c06f7f5d..353930482c8b8 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -2,6 +2,7 @@ import urlparse from pandas.util import py3compat +from StringIO import StringIO _VALID_URLS = set(urlparse.uses_relative + urlparse.uses_netloc + urlparse.uses_params) diff --git a/pandas/io/excel.py b/pandas/io/excel.py index 5b7d13acd99ec..95702847d9c7f 100644 --- a/pandas/io/excel.py +++ b/pandas/io/excel.py @@ -11,7 +11,7 @@ from pandas.io.parsers import TextParser from pandas.tseries.period import Period -import json +from pandas import json def read_excel(path_or_buf, sheetname, kind=None, **kwds): """Read an Excel table into a pandas DataFrame diff --git a/pandas/io/json.py b/pandas/io/json.py index 76d8ae05b07c0..48412f21fbbdd 100644 --- a/pandas/io/json.py +++ b/pandas/io/json.py @@ -1,6 +1,8 @@ # pylint: disable-msg=E1101,W0613,W0603 from pandas import Series, DataFrame +from pandas.io.common import get_filepath_or_buffer +from StringIO import StringIO import pandas.json as _json loads = _json.loads @@ -8,16 +10,18 @@ ### interface to/from ### -def to_json(obj, orient=None, double_precision=10, +def to_json(path_or_buf, obj, orient=None, double_precision=10, force_ascii=True): """ - Convert the object to a JSON string. + Convert the object to a JSON string Note NaN's and None will be converted to null and datetime objects will be converted to UNIX timestamps. 
Parameters ---------- + path_or_buf : the pathname or buffer to write the output + if this is None, return a StringIO of the converted string orient : {'split', 'records', 'index', 'columns', 'values'}, default is 'index' for Series, 'columns' for DataFrame @@ -34,7 +38,9 @@ def to_json(obj, orient=None, double_precision=10, Returns ------- - result : JSON compatible string + result : a JSON compatible string written to the path_or_buf; + if the path_or_buf is none, return a StringIO of the result + """ if orient is None: @@ -43,16 +49,27 @@ def to_json(obj, orient=None, double_precision=10, elif isinstance(obj, DataFrame): orient = 'columns' - return dumps(obj, orient=orient, double_precision=double_precision, - ensure_ascii=force_ascii) + s = dumps(obj, orient=orient, double_precision=double_precision, + ensure_ascii=force_ascii) + if isinstance(path_or_buf, basestring): + with open(path_or_buf,'w') as fh: + fh.write(s) + elif path_or_buf is None: + return StringIO(s) + else: + path_or_buf.write(s) -def read_json(json, typ='frame', orient=None, dtype=None, numpy=True): +def read_json(filepath_or_buffer=None, json=None, typ='frame', orient=None, dtype=None, numpy=True): """ Convert JSON string to pandas object Parameters ---------- - json : The JSON string to parse. + filepath_or_buffer : a VALID JSON StringIO or file handle / StringIO. The string could be + a URL. Valid URL schemes include http, ftp, s3, and file. For file URLs, a host + is expected. For instance, a local file could be + file ://localhost/path/to/table.json + json : a VALID JSON string, optional, used if filepath_or_buffer is not provided typ : type of object to recover (series or frame), default 'frame' orient : {'split', 'records', 'index'}, default 'index' The format of the JSON string @@ -69,6 +86,16 @@ def read_json(json, typ='frame', orient=None, dtype=None, numpy=True): result : Series or DataFrame """ + if json is None: + filepath_or_buffer,_ = get_filepath_or_buffer(filepath_or_buffer) + if isinstance(filepath_or_buffer, basestring): + with open(filepath_or_buffer,'r') as fh: + json = fh.read() + elif hasattr(filepath_or_buffer, 'read'): + json = filepath_or_buffer.read() + else: + json = filepath_or_buffer + obj = None if typ == 'frame': if orient is None: diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 6e937ba696e39..faf439d87a5f2 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -23,7 +23,6 @@ import pandas.tslib as tslib import pandas.parser as _parser from pandas.tseries.period import Period -import json class DateConversionError(Exception): diff --git a/pandas/io/tests/test_json/test_pandas.py b/pandas/io/tests/test_json/test_pandas.py old mode 100644 new mode 100755 index f4cb7ed03c026..e9bb358763fd8 --- a/pandas/io/tests/test_json/test_pandas.py +++ b/pandas/io/tests/test_json/test_pandas.py @@ -15,8 +15,9 @@ read_json = pd.read_json from pandas.util.testing import (assert_almost_equal, assert_frame_equal, - assert_series_equal) + assert_series_equal, network) import pandas.util.testing as tm +from numpy.testing.decorators import slow _seriesd = tm.getSeriesData() _tsd = tm.getTimeSeriesData() @@ -56,7 +57,7 @@ def test_frame_from_json_to_json(self): def _check_orient(df, orient, dtype=None, numpy=True): df = df.sort() - dfjson = df.to_json(orient=orient) + dfjson = df.to_json(None, orient=orient) unser = read_json(dfjson, orient=orient, dtype=dtype, numpy=numpy) unser = unser.sort() @@ -93,8 +94,8 @@ def _check_all_orients(df, dtype=None): # basic 
_check_all_orients(self.frame) - self.assertEqual(self.frame.to_json(), - self.frame.to_json(orient="columns")) + self.assertEqual(self.frame.to_json(None).read(), + self.frame.to_json(None,orient="columns").read()) _check_all_orients(self.intframe, dtype=self.intframe.values.dtype) @@ -138,61 +139,61 @@ def _check_all_orients(df, dtype=None): _check_orient(df.transpose().transpose(), "index") def test_frame_from_json_bad_data(self): - self.assertRaises(ValueError, read_json, '{"key":b:a:d}') + self.assertRaises(ValueError, read_json, StringIO('{"key":b:a:d}')) # too few indices - json = ('{"columns":["A","B"],' - '"index":["2","3"],' - '"data":[[1.0,"1"],[2.0,"2"],[null,"3"]]}"') + json = StringIO('{"columns":["A","B"],' + '"index":["2","3"],' + '"data":[[1.0,"1"],[2.0,"2"],[null,"3"]]}"') self.assertRaises(ValueError, read_json, json, orient="split") # too many columns - json = ('{"columns":["A","B","C"],' - '"index":["1","2","3"],' - '"data":[[1.0,"1"],[2.0,"2"],[null,"3"]]}"') + json = StringIO('{"columns":["A","B","C"],' + '"index":["1","2","3"],' + '"data":[[1.0,"1"],[2.0,"2"],[null,"3"]]}"') self.assertRaises(AssertionError, read_json, json, orient="split") # bad key - json = ('{"badkey":["A","B"],' - '"index":["2","3"],' - '"data":[[1.0,"1"],[2.0,"2"],[null,"3"]]}"') + json = StringIO('{"badkey":["A","B"],' + '"index":["2","3"],' + '"data":[[1.0,"1"],[2.0,"2"],[null,"3"]]}"') self.assertRaises(TypeError, read_json, json, orient="split") def test_frame_from_json_nones(self): df = DataFrame([[1, 2], [4, 5, 6]]) - unser = read_json(df.to_json()) + unser = read_json(df.to_json(None)) self.assert_(np.isnan(unser['2'][0])) df = DataFrame([['1', '2'], ['4', '5', '6']]) - unser = read_json(df.to_json()) + unser = read_json(df.to_json(None)) self.assert_(unser['2'][0] is None) - unser = read_json(df.to_json(), numpy=False) + unser = read_json(df.to_json(None), numpy=False) self.assert_(unser['2'][0] is None) # infinities get mapped to nulls which get mapped to NaNs during # deserialisation df = DataFrame([[1, 2], [4, 5, 6]]) df[2][0] = np.inf - unser = read_json(df.to_json()) + unser = read_json(df.to_json(None)) self.assert_(np.isnan(unser['2'][0])) df[2][0] = np.NINF - unser = read_json(df.to_json()) + unser = read_json(df.to_json(None)) self.assert_(np.isnan(unser['2'][0])) def test_frame_to_json_except(self): df = DataFrame([1, 2, 3]) - self.assertRaises(ValueError, df.to_json, orient="garbage") + self.assertRaises(ValueError, df.to_json, None, orient="garbage") def test_series_from_json_to_json(self): def _check_orient(series, orient, dtype=None, numpy=True): series = series.sort_index() - unser = read_json(series.to_json(orient=orient), typ='series', + unser = read_json(series.to_json(None,orient=orient), typ='series', orient=orient, numpy=numpy, dtype=dtype) unser = unser.sort_index() if series.index.dtype.type == np.datetime64: @@ -222,8 +223,8 @@ def _check_all_orients(series, dtype=None): # basic _check_all_orients(self.series) - self.assertEqual(self.series.to_json(), - self.series.to_json(orient="index")) + self.assertEqual(self.series.to_json(None).read(), + self.series.to_json(None,orient="index").read()) objSeries = Series([str(d) for d in self.objSeries], index=self.objSeries.index, @@ -239,18 +240,35 @@ def _check_all_orients(series, dtype=None): def test_series_to_json_except(self): s = Series([1, 2, 3]) - self.assertRaises(ValueError, s.to_json, orient="garbage") + self.assertRaises(ValueError, s.to_json, None, orient="garbage") def test_typ(self): s = Series(range(6), 
index=['a','b','c','d','e','f']) - result = read_json(s.to_json(),typ=None) + result = read_json(s.to_json(None),typ=None) assert_series_equal(result,s) def test_reconstruction_index(self): df = DataFrame([[1, 2, 3], [4, 5, 6]]) - result = read_json(df.to_json()) + result = read_json(df.to_json(None)) # the index is serialized as strings....correct? #assert_frame_equal(result,df) + + @network + @slow + def test_url(self): + import urllib2 + try: + # HTTP(S) + url = 'https://api.github.com/repos/pydata/pandas/issues?per_page=5' + result = read_json(url) + #print result + + url = 'http://search.twitter.com/search.json?q=pandas%20python' + result = read_json(url) + #print result + + except urllib2.URLError: + raise nose.SkipTest diff --git a/pandas/io/tests/test_json/test_ujson.py b/pandas/io/tests/test_json/test_ujson.py index 833abcb32fa98..2e775b4a541ea 100644 --- a/pandas/io/tests/test_json/test_ujson.py +++ b/pandas/io/tests/test_json/test_ujson.py @@ -955,20 +955,22 @@ def testArrayNumpyLabelled(self): self.assertTrue(output[1] is None) self.assertTrue((np.array([u'a']) == output[2]).all()) - input = [{'a': 42, 'b':31}, {'a': 24, 'c': 99}, {'a': 2.4, 'b': 78}] - output = ujson.loads(ujson.dumps(input), numpy=True, labelled=True) - expectedvals = np.array([42, 31, 24, 99, 2.4, 78], dtype=int).reshape((3,2)) - self.assertTrue((expectedvals == output[0]).all()) - self.assertTrue(output[1] is None) - self.assertTrue((np.array([u'a', 'b']) == output[2]).all()) - - - input = {1: {'a': 42, 'b':31}, 2: {'a': 24, 'c': 99}, 3: {'a': 2.4, 'b': 78}} - output = ujson.loads(ujson.dumps(input), numpy=True, labelled=True) - expectedvals = np.array([42, 31, 24, 99, 2.4, 78], dtype=int).reshape((3,2)) - self.assertTrue((expectedvals == output[0]).all()) - self.assertTrue((np.array(['1','2','3']) == output[1]).all()) - self.assertTrue((np.array(['a', 'b']) == output[2]).all()) + # py3 is non-determinstic on the ordering...... 
+ if not py3compat.PY3: + input = [{'a': 42, 'b':31}, {'a': 24, 'c': 99}, {'a': 2.4, 'b': 78}] + output = ujson.loads(ujson.dumps(input), numpy=True, labelled=True) + expectedvals = np.array([42, 31, 24, 99, 2.4, 78], dtype=int).reshape((3,2)) + self.assertTrue((expectedvals == output[0]).all()) + self.assertTrue(output[1] is None) + self.assertTrue((np.array([u'a', 'b']) == output[2]).all()) + + + input = {1: {'a': 42, 'b':31}, 2: {'a': 24, 'c': 99}, 3: {'a': 2.4, 'b': 78}} + output = ujson.loads(ujson.dumps(input), numpy=True, labelled=True) + expectedvals = np.array([42, 31, 24, 99, 2.4, 78], dtype=int).reshape((3,2)) + self.assertTrue((expectedvals == output[0]).all()) + self.assertTrue((np.array(['1','2','3']) == output[1]).all()) + self.assertTrue((np.array(['a', 'b']) == output[2]).all()) class PandasJSONTests(TestCase): From 8e673cf0766b697952522aa593e53bb80b1fbce2 Mon Sep 17 00:00:00 2001 From: jreback Date: Sun, 9 Jun 2013 01:19:52 -0400 Subject: [PATCH 33/71] ENH: removed json argument, now path_or_buf can be a path,buffer,url,or JSON string added keywords parse_dates,keep_default_dates to allow for date parsing in columns of a Frame (default is False, not to parse dates) --- doc/source/io.rst | 35 ++- pandas/core/generic.py | 4 +- pandas/io/json.py | 295 ++++++++++++++--------- pandas/io/tests/test_json/test_pandas.py | 71 ++++-- 4 files changed, 263 insertions(+), 142 deletions(-) diff --git a/doc/source/io.rst b/doc/source/io.rst index f1480b6546e04..ee234bc352090 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -953,7 +953,7 @@ A ``Series`` or ``DataFrame`` can be converted to a valid JSON string. Use ``to_ with optional parameters: - path_or_buf : the pathname or buffer to write the output - This can be ``None`` in which case a ``StringIO`` converted string is returned + This can be ``None`` in which case a JSON string is returned - orient : The format of the JSON string, default is ``index`` for ``Series``, ``columns`` for ``DataFrame`` * split : dict like {index -> [index], columns -> [columns], data -> [values]} @@ -969,9 +969,19 @@ Note NaN's and None will be converted to null and datetime objects will be conve .. ipython:: python - df = DataFrame(randn(10, 2), columns=list('AB')) - json = df.to_json(None) - json.getvalue() + dfj = DataFrame(randn(5, 2), columns=list('AB')) + json = dfj.to_json() + json + +Writing to a file, with a date index and a date column + +.. ipython:: python + + dfj2 = dfj.copy() + dfj2['date'] = Timestamp('20130101') + dfj2.index = date_range('20130101',periods=5) + dfj2.to_json('test.json') + open('test.json').read() Reading JSON ~~~~~~~~~~~~ @@ -984,7 +994,6 @@ is ``None``. To explicity force ``Series`` parsing, pass ``typ=series`` a URL. Valid URL schemes include http, ftp, s3, and file. For file URLs, a host is expected. For instance, a local file could be file ://localhost/path/to/table.json -- json : a VALID JSON string, optional, used if filepath_or_buffer is not provided - typ : type of object to recover (series or frame), default 'frame' - orient : The format of the JSON string, one of the following @@ -992,8 +1001,10 @@ is ``None``. To explicity force ``Series`` parsing, pass ``typ=series`` * records : list like [value, ... , value] * index : dict like {index -> value} -- dtype : dtype of the resulting Series +- dtype : dtype of the resulting object - numpy : direct decoding to numpy arrays. default True but falls back to standard decoding if a problem occurs. 
+- parse_dates : a list of columns to parse for dates; If True, then try to parse datelike columns, default is True +- keep_default_dates : boolean, default True. If parsing dates, then parse the default datelike columns The parser will raise one of ``ValueError/TypeError/AssertionError`` if the JSON is not parsable. @@ -1002,13 +1013,19 @@ Reading from a JSON string .. ipython:: python - pd.read_json(json='{"0":{"0":1,"1":3},"1":{"0":2,"1":4}}') + pd.read_json(json) + +Reading from a file, parsing dates + +.. ipython:: python -Reading from a StringIO + pd.read_json('test.json',parse_dates=True) .. ipython:: python + :suppress: - pd.read_json(json) + import os + os.remove('test.json') HTML ---- diff --git a/pandas/core/generic.py b/pandas/core/generic.py index ac9663a34e748..7e6ac4d5bbf68 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -495,7 +495,7 @@ def to_clipboard(self): from pandas.io import clipboard clipboard.to_clipboard(self) - def to_json(self, path_or_buf, orient=None, double_precision=10, + def to_json(self, path_or_buf=None, orient=None, double_precision=10, force_ascii=True): """ Convert the object to a JSON string. @@ -529,7 +529,7 @@ def to_json(self, path_or_buf, orient=None, double_precision=10, """ from pandas.io import json - return json.to_json(path_or_buf, self, orient=orient, double_precision=double_precision, + return json.to_json(path_or_buf=path_or_buf, obj=self, orient=orient, double_precision=double_precision, force_ascii=force_ascii) # install the indexerse diff --git a/pandas/io/json.py b/pandas/io/json.py index 48412f21fbbdd..446cadf473325 100644 --- a/pandas/io/json.py +++ b/pandas/io/json.py @@ -1,47 +1,19 @@ # pylint: disable-msg=E1101,W0613,W0603 -from pandas import Series, DataFrame -from pandas.io.common import get_filepath_or_buffer from StringIO import StringIO +import os +from pandas import Series, DataFrame, to_datetime +from pandas.io.common import get_filepath_or_buffer import pandas.json as _json loads = _json.loads dumps = _json.dumps -### interface to/from ### +import numpy as np -def to_json(path_or_buf, obj, orient=None, double_precision=10, - force_ascii=True): - """ - Convert the object to a JSON string - - Note NaN's and None will be converted to null and datetime objects - will be converted to UNIX timestamps. - - Parameters - ---------- - path_or_buf : the pathname or buffer to write the output - if this is None, return a StringIO of the converted string - orient : {'split', 'records', 'index', 'columns', 'values'}, - default is 'index' for Series, 'columns' for DataFrame - - The format of the JSON string - split : dict like - {index -> [index], columns -> [columns], data -> [values]} - records : list like [{column -> value}, ... , {column -> value}] - index : dict like {index -> {column -> value}} - columns : dict like {column -> {index -> value}} - values : just the values array - double_precision : The number of decimal places to use when encoding - floating point values, default 10. - force_ascii : force encoded string to be ASCII, default True. 
- - Returns - ------- - result : a JSON compatible string written to the path_or_buf; - if the path_or_buf is none, return a StringIO of the result +### interface to/from ### - """ +def to_json(path_or_buf, obj, orient=None, double_precision=10, force_ascii=True): if orient is None: if isinstance(obj, Series): @@ -55,126 +27,229 @@ def to_json(path_or_buf, obj, orient=None, double_precision=10, with open(path_or_buf,'w') as fh: fh.write(s) elif path_or_buf is None: - return StringIO(s) + return s else: path_or_buf.write(s) -def read_json(filepath_or_buffer=None, json=None, typ='frame', orient=None, dtype=None, numpy=True): +def read_json(path_or_buf=None, orient=None, typ='frame', dtype=None, numpy=True, + parse_dates=False, keep_default_dates=True): """ Convert JSON string to pandas object Parameters ---------- - filepath_or_buffer : a VALID JSON StringIO or file handle / StringIO. The string could be + filepath_or_buffer : a VALID JSON string or file handle / StringIO. The string could be a URL. Valid URL schemes include http, ftp, s3, and file. For file URLs, a host is expected. For instance, a local file could be file ://localhost/path/to/table.json - json : a VALID JSON string, optional, used if filepath_or_buffer is not provided - typ : type of object to recover (series or frame), default 'frame' orient : {'split', 'records', 'index'}, default 'index' The format of the JSON string split : dict like {index -> [index], name -> name, data -> [values]} records : list like [value, ... , value] index : dict like {index -> value} - dtype : dtype of the resulting Series + typ : type of object to recover (series or frame), default 'frame' + dtype : dtype of the resulting object numpy: direct decoding to numpy arrays. default True but falls back to standard decoding if a problem occurs. + parse_dates : a list of columns to parse for dates; If True, then try to parse datelike columns + default is False + keep_default_dates : boolean, default True. 
If parsing dates, + then parse the default datelike columns Returns ------- result : Series or DataFrame """ - if json is None: - filepath_or_buffer,_ = get_filepath_or_buffer(filepath_or_buffer) - if isinstance(filepath_or_buffer, basestring): - with open(filepath_or_buffer,'r') as fh: - json = fh.read() - elif hasattr(filepath_or_buffer, 'read'): - json = filepath_or_buffer.read() + filepath_or_buffer,_ = get_filepath_or_buffer(path_or_buf) + if isinstance(filepath_or_buffer, basestring): + if os.path.exists(filepath_or_buffer): + with open(filepath_or_buffer,'r') as fh: + json = fh.read() else: - json = filepath_or_buffer + json = filepath_or_buffer + elif hasattr(filepath_or_buffer, 'read'): + json = filepath_or_buffer.read() + else: + json = filepath_or_buffer obj = None if typ == 'frame': - if orient is None: - orient = 'columns' - obj = load_frame(json, orient, dtype, numpy) + obj = FrameParser(json, orient, dtype, numpy, parse_dates, keep_default_dates).parse() if typ == 'series' or obj is None: - if orient == 'columns': - orient = 'index' - obj = load_series(json, orient, dtype, numpy) + obj = SeriesParser(json, orient, dtype, numpy).parse() return obj -def load_series(json, orient, dtype, numpy): - s = None +class Parser(object): + _min_date = 31536000000000000L + + def __init__(self, json, orient, dtype, numpy, parse_dates=False, keep_default_dates=False): + self.json = json - if dtype is not None and orient == "split": - numpy = False + if orient is None: + orient = self._default_orient + + self.orient = orient + self.dtype = dtype - if numpy: - try: - if orient == "split": - decoded = loads(json, dtype=dtype, numpy=True) - decoded = dict((str(k), v) for k, v in decoded.iteritems()) - s = Series(**decoded) - elif orient == "columns" or orient == "index": - s = Series(*loads(json, dtype=dtype, numpy=True, - labelled=True)) - else: - s = Series(loads(json, dtype=dtype, numpy=True)) - except ValueError: + if dtype is not None and orient == "split": numpy = False - if not numpy: - if orient == "split": - decoded = dict((str(k), v) - for k, v in loads(json).iteritems()) - s = Series(dtype=dtype, **decoded) - else: - s = Series(loads(json), dtype=dtype) - - return s - - -def load_frame(json, orient, dtype, numpy): - """ try to recover a frame, return None if we didn't get anything """ + self.numpy = numpy + self.parse_dates = parse_dates + self.keep_default_dates = keep_default_dates + self.obj = None + + def parse(self): + self._parse() + if self.obj is not None: + self.convert_axes() + if self.parse_dates: + self.try_parse_dates() + return self.obj + + def try_parse_dates(self): + raise NotImplementedError + +class SeriesParser(Parser): + _default_orient = 'index' + + def _parse(self): + + json = self.json + dtype = self.dtype + orient = self.orient + numpy = self.numpy + + if numpy: + try: + if orient == "split": + decoded = loads(json, dtype=dtype, numpy=True) + decoded = dict((str(k), v) for k, v in decoded.iteritems()) + self.obj = Series(**decoded) + elif orient == "columns" or orient == "index": + self.obj = Series(*loads(json, dtype=dtype, numpy=True, + labelled=True)) + else: + self.obj = Series(loads(json, dtype=dtype, numpy=True)) + except ValueError: + numpy = False + + if not numpy: + if orient == "split": + decoded = dict((str(k), v) + for k, v in loads(json).iteritems()) + self.obj = Series(dtype=dtype, **decoded) + else: + self.obj = Series(loads(json), dtype=dtype) - if dtype is not None and orient == "split": - numpy = False + def convert_axes(self): + """ try to 
axes if they are datelike """ + if self.obj is None: return - if numpy: try: + self.obj.index = to_datetime(self.obj.index.astype('int64')) + except: + pass + +class FrameParser(Parser): + _default_orient = 'columns' + + def _parse(self): + + json = self.json + dtype = self.dtype + orient = self.orient + numpy = self.numpy + + if numpy: + try: + if orient == "columns": + args = loads(json, dtype=dtype, numpy=True, labelled=True) + if args: + args = (args[0].T, args[2], args[1]) + self.obj = DataFrame(*args) + elif orient == "split": + decoded = loads(json, dtype=dtype, numpy=True) + decoded = dict((str(k), v) for k, v in decoded.iteritems()) + self.obj = DataFrame(**decoded) + elif orient == "values": + self.obj = DataFrame(loads(json, dtype=dtype, numpy=True)) + else: + self.obj = DataFrame(*loads(json, dtype=dtype, numpy=True, + labelled=True)) + except ValueError: + numpy = False + + if not numpy: if orient == "columns": - args = loads(json, dtype=dtype, numpy=True, labelled=True) - if args: - args = (args[0].T, args[2], args[1]) - df = DataFrame(*args) + self.obj = DataFrame(loads(json), dtype=dtype) elif orient == "split": - decoded = loads(json, dtype=dtype, numpy=True) - decoded = dict((str(k), v) for k, v in decoded.iteritems()) - df = DataFrame(**decoded) - elif orient == "values": - df = DataFrame(loads(json, dtype=dtype, numpy=True)) + decoded = dict((str(k), v) + for k, v in loads(json).iteritems()) + self.obj = DataFrame(dtype=dtype, **decoded) + elif orient == "index": + self.obj = DataFrame(loads(json), dtype=dtype).T else: - df = DataFrame(*loads(json, dtype=dtype, numpy=True, - labelled=True)) - except ValueError: - numpy = False + self.obj = DataFrame(loads(json), dtype=dtype) - if not numpy: - if orient == "columns": - df = DataFrame(loads(json), dtype=dtype) - elif orient == "split": - decoded = dict((str(k), v) - for k, v in loads(json).iteritems()) - df = DataFrame(dtype=dtype, **decoded) - elif orient == "index": - df = DataFrame(loads(json), dtype=dtype).T + def convert_axes(self): + """ try to axes if they are datelike """ + if self.obj is None: return + + if self.orient == 'columns': + axis = 'index' + elif self.orient == 'index': + axis = 'columns' else: - df = DataFrame(loads(json), dtype=dtype) + return + + try: + a = getattr(self.obj,axis).astype('int64') + if (a>self._min_date).all(): + setattr(self.obj,axis,to_datetime(a)) + except: + pass + + def try_parse_dates(self): + """ + try to parse out dates + these are only in in64 columns + """ - return df + if self.obj is None: return + + # our columns to parse + parse_dates = self.parse_dates + if parse_dates is True: + parse_dates = [] + parse_dates = set(parse_dates) + + def is_ok(col, c): + """ return if this col is ok to try for a date parse """ + if not isinstance(col, basestring): return False + + if issubclass(c.dtype.type,np.number) and (c Date: Sun, 9 Jun 2013 10:34:40 -0400 Subject: [PATCH 34/71] ENH: added date_format parm to to_josn to allow epoch or iso formats (which both can be can be parsed with parse_dates=True in read_json) --- doc/source/io.rst | 14 +- pandas/core/generic.py | 10 +- pandas/io/json.py | 192 +++++++++++++++++------ pandas/io/tests/test_json/test_pandas.py | 33 +++- 4 files changed, 194 insertions(+), 55 deletions(-) mode change 100755 => 100644 pandas/io/tests/test_json/test_pandas.py diff --git a/doc/source/io.rst b/doc/source/io.rst index ee234bc352090..e64cbc4bc8101 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -962,10 +962,11 @@ with optional parameters: * 
columns : dict like {column -> {index -> value}} * values : just the values array +- date_format : type of date conversion (epoch = epoch milliseconds, iso = ISO8601), default is epoch - double_precision : The number of decimal places to use when encoding floating point values, default 10. - force_ascii : force encoded string to be ASCII, default True. -Note NaN's and None will be converted to null and datetime objects will be converted to UNIX timestamps. +Note NaN's and None will be converted to null and datetime objects will be converted based on the date_format parameter .. ipython:: python @@ -973,6 +974,15 @@ Note NaN's and None will be converted to null and datetime objects will be conve json = dfj.to_json() json +Writing in iso date format + +.. ipython:: python + + dfd = DataFrame(randn(5, 2), columns=list('AB')) + dfd['date'] = Timestamp('20130101') + json = dfd.to_json(date_format='iso') + json + Writing to a file, with a date index and a date column .. ipython:: python @@ -1003,7 +1013,7 @@ is ``None``. To explicity force ``Series`` parsing, pass ``typ=series`` - dtype : dtype of the resulting object - numpy : direct decoding to numpy arrays. default True but falls back to standard decoding if a problem occurs. -- parse_dates : a list of columns to parse for dates; If True, then try to parse datelike columns, default is True +- parse_dates : a list of columns to parse for dates; If True, then try to parse datelike columns, default is False - keep_default_dates : boolean, default True. If parsing dates, then parse the default datelike columns The parser will raise one of ``ValueError/TypeError/AssertionError`` if the JSON is diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 7e6ac4d5bbf68..0d2612d7aed7a 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -495,8 +495,8 @@ def to_clipboard(self): from pandas.io import clipboard clipboard.to_clipboard(self) - def to_json(self, path_or_buf=None, orient=None, double_precision=10, - force_ascii=True): + def to_json(self, path_or_buf=None, orient=None, date_format='epoch', + double_precision=10, force_ascii=True): """ Convert the object to a JSON string. @@ -517,6 +517,8 @@ def to_json(self, path_or_buf=None, orient=None, double_precision=10, index : dict like {index -> {column -> value}} columns : dict like {column -> {index -> value}} values : just the values array + date_format : type of date conversion (epoch = epoch milliseconds, iso = ISO8601), + default is epoch double_precision : The number of decimal places to use when encoding floating point values, default 10. force_ascii : force encoded string to be ASCII, default True. 
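Taken together with the previous patch, the intended round trip is: encode dates either as epoch milliseconds (the default) or as ISO8601 via ``date_format='iso'``, then let ``read_json`` convert datelike axes and, with ``parse_dates=True``, datelike columns on the way back in. A small sketch mirroring the doc examples above:

    import pandas as pd
    from pandas import DataFrame, Timestamp, date_range

    dfd = DataFrame({'A': [1.0, 2.0]}, index=date_range('20130101', periods=2))
    dfd['date'] = Timestamp('20130101')

    epoch_json = dfd.to_json()                  # default: dates become epoch milliseconds
    iso_json = dfd.to_json(date_format='iso')   # ISO8601 strings instead

    # parse_dates=True asks the parser to pick up the default datelike columns
    back = pd.read_json(iso_json, parse_dates=True)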
@@ -529,8 +531,8 @@ def to_json(self, path_or_buf=None, orient=None, double_precision=10, """ from pandas.io import json - return json.to_json(path_or_buf=path_or_buf, obj=self, orient=orient, double_precision=double_precision, - force_ascii=force_ascii) + return json.to_json(path_or_buf=path_or_buf, obj=self, orient=orient, date_format=date_format, + double_precision=double_precision, force_ascii=force_ascii) # install the indexerse for _name, _indexer in indexing.get_indexers_list(): diff --git a/pandas/io/json.py b/pandas/io/json.py index 446cadf473325..17b33931bee5a 100644 --- a/pandas/io/json.py +++ b/pandas/io/json.py @@ -10,26 +10,107 @@ dumps = _json.dumps import numpy as np +from pandas.tslib import iNaT ### interface to/from ### -def to_json(path_or_buf, obj, orient=None, double_precision=10, force_ascii=True): +def to_json(path_or_buf, obj, orient=None, date_format='epoch', double_precision=10, force_ascii=True): + if isinstance(obj, Series): + s = SeriesWriter(obj, orient=orient, date_format=date_format, double_precision=double_precision, + ensure_ascii=force_ascii).write() + elif isinstance(obj, DataFrame): + s = FrameWriter(obj, orient=orient, date_format=date_format, double_precision=double_precision, + ensure_ascii=force_ascii).write() + else: + raise NotImplementedError + + if isinstance(path_or_buf, basestring): + with open(path_or_buf,'w') as fh: + fh.write(s) + elif path_or_buf is None: + return s + else: + path_or_buf.write(s) + +class Writer(object): + + def __init__(self, obj, orient, date_format, double_precision, ensure_ascii): + self.obj = obj + if orient is None: - if isinstance(obj, Series): - orient = 'index' - elif isinstance(obj, DataFrame): - orient = 'columns' - - s = dumps(obj, orient=orient, double_precision=double_precision, - ensure_ascii=force_ascii) - if isinstance(path_or_buf, basestring): - with open(path_or_buf,'w') as fh: - fh.write(s) - elif path_or_buf is None: - return s + orient = self._default_orient + + self.orient = orient + self.date_format = date_format + self.double_precision = double_precision + self.ensure_ascii = ensure_ascii + + self.is_copy = False + self._format_axes() + self._format_dates() + + def _format_dates(self): + raise NotImplementedError + + def _format_axes(self): + raise NotImplementedError + + def _needs_to_date(self, data): + return self.date_format == 'iso' and data.dtype == 'datetime64[ns]' + + def _format_to_date(self, data): + if self._needs_to_date(data): + return data.apply(lambda x: x.isoformat()) + return data + + def copy_if_needed(self): + """ copy myself if necessary """ + if not self.is_copy: + self.obj = self.obj.copy() + self.is_copy = True + + def write(self): + return dumps(self.obj, orient=self.orient, double_precision=self.double_precision, ensure_ascii=self.ensure_ascii) + +class SeriesWriter(Writer): + _default_orient = 'index' + + def _format_axes(self): + if self._needs_to_date(self.obj.index): + self.copy_if_needed() + self.obj.index = self._format_to_date(self.obj.index.to_series()) + + def _format_dates(self): + if self._needs_to_date(self.obj): + self.copy_if_needed() + self.obj = self._format_to_date(self.obj) + +class FrameWriter(Writer): + _default_orient = 'columns' + + def _format_axes(self): + """ try to axes if they are datelike """ + if self.orient == 'columns': + axis = 'index' + elif self.orient == 'index': + axis = 'columns' else: - path_or_buf.write(s) + return + + a = getattr(self.obj,axis) + if self._needs_to_date(a): + self.copy_if_needed() + 
setattr(self.obj,axis,self._format_to_date(a.to_series())) + + def _format_dates(self): + if self.date_format == 'iso': + dtypes = self.obj.dtypes + dtypes = dtypes[dtypes == 'datetime64[ns]'] + if len(dtypes): + self.copy_if_needed() + for c in dtypes.index: + self.obj[c] = self._format_to_date(self.obj[c]) def read_json(path_or_buf=None, orient=None, typ='frame', dtype=None, numpy=True, parse_dates=False, keep_default_dates=True): @@ -79,12 +160,11 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=None, numpy=True obj = FrameParser(json, orient, dtype, numpy, parse_dates, keep_default_dates).parse() if typ == 'series' or obj is None: - obj = SeriesParser(json, orient, dtype, numpy).parse() + obj = SeriesParser(json, orient, dtype, numpy, parse_dates, keep_default_dates).parse() return obj class Parser(object): - _min_date = 31536000000000000L def __init__(self, json, orient, dtype, numpy, parse_dates=False, keep_default_dates=False): self.json = json @@ -106,12 +186,43 @@ def __init__(self, json, orient, dtype, numpy, parse_dates=False, keep_default_d def parse(self): self._parse() if self.obj is not None: - self.convert_axes() + self._convert_axes() if self.parse_dates: - self.try_parse_dates() + self._try_parse_dates() return self.obj - def try_parse_dates(self): + + def _try_parse_to_date(self, data): + """ try to parse a ndarray like into a date column + try to coerce object in epoch/iso formats and + integer/float in epcoh formats """ + + new_data = data + if new_data.dtype == 'object': + try: + new_data = data.astype('int64') + except: + pass + + + # ignore numbers that are out of range + if issubclass(new_data.dtype.type,np.number): + if not ((new_data == iNaT) | (new_data > 31536000000000000L)).all(): + return data + + try: + new_data = to_datetime(new_data) + except: + try: + new_data = to_datetime(new_data.astype('int64')) + except: + + # return old, noting more we can do + new_data = data + + return new_data + + def _try_parse_dates(self): raise NotImplementedError class SeriesParser(Parser): @@ -146,15 +257,19 @@ def _parse(self): else: self.obj = Series(loads(json), dtype=dtype) - def convert_axes(self): + def _convert_axes(self): """ try to axes if they are datelike """ - if self.obj is None: return - try: - self.obj.index = to_datetime(self.obj.index.astype('int64')) + self.obj.index = self._try_parse_to_date(self.obj.index) except: pass + def _try_parse_dates(self): + if self.obj is None: return + + if self.parse_dates: + self.obj = self._try_parse_to_date(self.obj) + class FrameParser(Parser): _default_orient = 'columns' @@ -196,10 +311,8 @@ def _parse(self): else: self.obj = DataFrame(loads(json), dtype=dtype) - def convert_axes(self): + def _convert_axes(self): """ try to axes if they are datelike """ - if self.obj is None: return - if self.orient == 'columns': axis = 'index' elif self.orient == 'index': @@ -208,18 +321,12 @@ def convert_axes(self): return try: - a = getattr(self.obj,axis).astype('int64') - if (a>self._min_date).all(): - setattr(self.obj,axis,to_datetime(a)) + a = getattr(self.obj,axis) + setattr(self.obj,axis,self._try_parse_to_date(a)) except: pass - def try_parse_dates(self): - """ - try to parse out dates - these are only in in64 columns - """ - + def _try_parse_dates(self): if self.obj is None: return # our columns to parse @@ -228,13 +335,10 @@ def try_parse_dates(self): parse_dates = [] parse_dates = set(parse_dates) - def is_ok(col, c): + def is_ok(col): """ return if this col is ok to try for a date parse """ if not 
isinstance(col, basestring): return False - if issubclass(c.dtype.type,np.number) and (c Date: Tue, 11 Jun 2013 10:01:26 -0400 Subject: [PATCH 35/71] BUG: patch in weird nested decoding issue, courtesy of @Komnomnomnom --- pandas/io/tests/test_json/test_pandas.py | 23 +++++++++++++++++++++++ pandas/src/ujson/python/JSONtoObj.c | 10 ++++++---- 2 files changed, 29 insertions(+), 4 deletions(-) diff --git a/pandas/io/tests/test_json/test_pandas.py b/pandas/io/tests/test_json/test_pandas.py index 7a639457e51e9..b64bfaacd38f2 100644 --- a/pandas/io/tests/test_json/test_pandas.py +++ b/pandas/io/tests/test_json/test_pandas.py @@ -314,6 +314,29 @@ def test_date_format(self): result = read_json(json,typ='series',parse_dates=True) assert_series_equal(result,ts) + def test_weird_nested_json(self): + + # this used to core dump the parser + s = r'''{ + "status": "success", + "data": { + "posts": [ + { + "id": 1, + "title": "A blog post", + "body": "Some useful content" + }, + { + "id": 2, + "title": "Another blog post", + "body": "More content" + } + ] + } +}''' + + read_json(s) + @network @slow def test_url(self): diff --git a/pandas/src/ujson/python/JSONtoObj.c b/pandas/src/ujson/python/JSONtoObj.c index 1db7586ad17f7..bc42269d9698b 100644 --- a/pandas/src/ujson/python/JSONtoObj.c +++ b/pandas/src/ujson/python/JSONtoObj.c @@ -10,6 +10,7 @@ typedef struct __PyObjectDecoder JSONObjectDecoder dec; void* npyarr; // Numpy context buffer + void* npyarr_addr; // Ref to npyarr ptr to track DECREF calls npy_intp curdim; // Current array dimension PyArray_Descr* dtype; @@ -67,9 +68,7 @@ void Npy_releaseContext(NpyArrContext* npyarr) } if (npyarr->dec) { - // Don't set to null, used to make sure we don't Py_DECREF npyarr - // in releaseObject - // npyarr->dec->npyarr = NULL; + npyarr->dec->npyarr = NULL; npyarr->dec->curdim = 0; } Py_XDECREF(npyarr->labels[0]); @@ -88,6 +87,7 @@ JSOBJ Object_npyNewArray(void* _decoder) { // start of array - initialise the context buffer npyarr = decoder->npyarr = PyObject_Malloc(sizeof(NpyArrContext)); + decoder->npyarr_addr = npyarr; if (!npyarr) { @@ -515,7 +515,7 @@ JSOBJ Object_newDouble(double value) static void Object_releaseObject(JSOBJ obj, void* _decoder) { PyObjectDecoder* decoder = (PyObjectDecoder*) _decoder; - if (obj != decoder->npyarr) + if (obj != decoder->npyarr_addr) { Py_XDECREF( ((PyObject *)obj)); } @@ -555,11 +555,13 @@ PyObject* JSONToObj(PyObject* self, PyObject *args, PyObject *kwargs) pyDecoder.dec = dec; pyDecoder.curdim = 0; pyDecoder.npyarr = NULL; + pyDecoder.npyarr_addr = NULL; decoder = (JSONObjectDecoder*) &pyDecoder; if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|iiO&", kwlist, &sarg, &numpy, &labelled, PyArray_DescrConverter2, &dtype)) { + Npy_releaseContext(pyDecoder.npyarr); return NULL; } From a7d069dc2dbf451e5286cfca3497fa03c77dc900 Mon Sep 17 00:00:00 2001 From: Andy Hayden Date: Tue, 11 Jun 2013 11:25:32 +0100 Subject: [PATCH 36/71] ENH use pyperclip for read and to_clipboard --- pandas/util/clipboard.py | 275 ++++++++++++++++++++++----------------- 1 file changed, 158 insertions(+), 117 deletions(-) diff --git a/pandas/util/clipboard.py b/pandas/util/clipboard.py index bc58af8c0ea3c..9f3ee0638352f 100644 --- a/pandas/util/clipboard.py +++ b/pandas/util/clipboard.py @@ -1,119 +1,160 @@ -""" -Taken from the IPython project http://ipython.org - -Used under the terms of the BSD license -""" - -import subprocess -import sys - - -def clipboard_get(): - """ Get text from the clipboard. 
- """ - if sys.platform == 'win32': - try: - return win32_clipboard_get() - except Exception: - pass - elif sys.platform == 'darwin': - try: - return osx_clipboard_get() - except Exception: - pass - return tkinter_clipboard_get() - - -def clipboard_set(text): - """ Get text from the clipboard. - """ - if sys.platform == 'win32': - try: - return win32_clipboard_set(text) - except Exception: - raise - elif sys.platform == 'darwin': - try: - return osx_clipboard_set(text) - except Exception: - pass - xsel_clipboard_set(text) - - -def win32_clipboard_get(): - """ Get the current clipboard's text on Windows. - - Requires Mark Hammond's pywin32 extensions. - """ - try: - import win32clipboard - except ImportError: - message = ("Getting text from the clipboard requires the pywin32 " - "extensions: http://sourceforge.net/projects/pywin32/") - raise Exception(message) - win32clipboard.OpenClipboard() - text = win32clipboard.GetClipboardData(win32clipboard.CF_TEXT) - # FIXME: convert \r\n to \n? - win32clipboard.CloseClipboard() - return text - - -def osx_clipboard_get(): - """ Get the clipboard's text on OS X. - """ - p = subprocess.Popen(['pbpaste', '-Prefer', 'ascii'], - stdout=subprocess.PIPE) - text, stderr = p.communicate() - # Text comes in with old Mac \r line endings. Change them to \n. - text = text.replace('\r', '\n') - return text - - -def tkinter_clipboard_get(): - """ Get the clipboard's text using Tkinter. - - This is the default on systems that are not Windows or OS X. It may - interfere with other UI toolkits and should be replaced with an - implementation that uses that toolkit. - """ +# Pyperclip v1.3 +# A cross-platform clipboard module for Python. (only handles plain text for now) +# By Al Sweigart al@coffeeghost.net + +# Usage: +# import pyperclip +# pyperclip.copy('The text to be copied to the clipboard.') +# spam = pyperclip.paste() + +# On Mac, this module makes use of the pbcopy and pbpaste commands, which should come with the os. +# On Linux, this module makes use of the xclip command, which should come with the os. Otherwise run "sudo apt-get install xclip" + + +# Copyright (c) 2010, Albert Sweigart +# All rights reserved. +# +# BSD-style license: +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of the pyperclip nor the +# names of its contributors may be used to endorse or promote products +# derived from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY Albert Sweigart "AS IS" AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL Albert Sweigart BE LIABLE FOR ANY +# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +# Change Log: +# 1.2 Use the platform module to help determine OS. +# 1.3 Changed ctypes.windll.user32.OpenClipboard(None) to ctypes.windll.user32.OpenClipboard(0), after some people ran into some TypeError + +import platform, os + +def winGetClipboard(): + ctypes.windll.user32.OpenClipboard(0) + pcontents = ctypes.windll.user32.GetClipboardData(1) # 1 is CF_TEXT + data = ctypes.c_char_p(pcontents).value + #ctypes.windll.kernel32.GlobalUnlock(pcontents) + ctypes.windll.user32.CloseClipboard() + return data + +def winSetClipboard(text): + GMEM_DDESHARE = 0x2000 + ctypes.windll.user32.OpenClipboard(0) + ctypes.windll.user32.EmptyClipboard() try: - import Tkinter - except ImportError: - message = ("Getting text from the clipboard on this platform " - "requires Tkinter.") - raise Exception(message) - root = Tkinter.Tk() - root.withdraw() - text = root.clipboard_get() - root.destroy() - return text - - -def win32_clipboard_set(text): - # idiosyncratic win32 import issues - import pywintypes as _ - import win32clipboard - win32clipboard.OpenClipboard() + # works on Python 2 (bytes() only takes one argument) + hCd = ctypes.windll.kernel32.GlobalAlloc(GMEM_DDESHARE, len(bytes(text))+1) + except TypeError: + # works on Python 3 (bytes() requires an encoding) + hCd = ctypes.windll.kernel32.GlobalAlloc(GMEM_DDESHARE, len(bytes(text, 'ascii'))+1) + pchData = ctypes.windll.kernel32.GlobalLock(hCd) try: - win32clipboard.EmptyClipboard() - win32clipboard.SetClipboardText(_fix_line_endings(text)) - finally: - win32clipboard.CloseClipboard() - - -def _fix_line_endings(text): - return '\r\n'.join(text.splitlines()) - - -def osx_clipboard_set(text): - """ Get the clipboard's text on OS X. 
- """ - p = subprocess.Popen(['pbcopy', '-Prefer', 'ascii'], - stdin=subprocess.PIPE) - p.communicate(input=text) - - -def xsel_clipboard_set(text): - from subprocess import Popen, PIPE - p = Popen(['xsel', '-bi'], stdin=PIPE) - p.communicate(input=text) + # works on Python 2 (bytes() only takes one argument) + ctypes.cdll.msvcrt.strcpy(ctypes.c_char_p(pchData), bytes(text)) + except TypeError: + # works on Python 3 (bytes() requires an encoding) + ctypes.cdll.msvcrt.strcpy(ctypes.c_char_p(pchData), bytes(text, 'ascii')) + ctypes.windll.kernel32.GlobalUnlock(hCd) + ctypes.windll.user32.SetClipboardData(1,hCd) + ctypes.windll.user32.CloseClipboard() + +def macSetClipboard(text): + outf = os.popen('pbcopy', 'w') + outf.write(text) + outf.close() + +def macGetClipboard(): + outf = os.popen('pbpaste', 'r') + content = outf.read() + outf.close() + return content + +def gtkGetClipboard(): + return gtk.Clipboard().wait_for_text() + +def gtkSetClipboard(text): + cb = gtk.Clipboard() + cb.set_text(text) + cb.store() + +def qtGetClipboard(): + return str(cb.text()) + +def qtSetClipboard(text): + cb.setText(text) + +def xclipSetClipboard(text): + outf = os.popen('xclip -selection c', 'w') + outf.write(text) + outf.close() + +def xclipGetClipboard(): + outf = os.popen('xclip -selection c -o', 'r') + content = outf.read() + outf.close() + return content + +def xselSetClipboard(text): + outf = os.popen('xsel -i', 'w') + outf.write(text) + outf.close() + +def xselGetClipboard(): + outf = os.popen('xsel -o', 'r') + content = outf.read() + outf.close() + return content + + +if os.name == 'nt' or platform.system() == 'Windows': + import ctypes + getcb = winGetClipboard + setcb = winSetClipboard +elif os.name == 'mac' or platform.system() == 'Darwin': + getcb = macGetClipboard + setcb = macSetClipboard +elif os.name == 'posix' or platform.system() == 'Linux': + xclipExists = os.system('which xclip') == 0 + if xclipExists: + getcb = xclipGetClipboard + setcb = xclipSetClipboard + else: + xselExists = os.system('which xsel') == 0 + if xselExists: + getcb = xselGetClipboard + setcb = xselSetClipboard + try: + import gtk + getcb = gtkGetClipboard + setcb = gtkSetClipboard + except: + try: + import PyQt4.QtCore + import PyQt4.QtGui + app = QApplication([]) + cb = PyQt4.QtGui.QApplication.clipboard() + getcb = qtGetClipboard + setcb = qtSetClipboard + except: + raise Exception('Pyperclip requires the gtk or PyQt4 module installed, or the xclip command.') +copy = setcb +paste = getcb + +## pandas aliases +clipboard_get = paste +clipboard_set = copy \ No newline at end of file From e958833d2cc60363491b3f593bf0c41da530e163 Mon Sep 17 00:00:00 2001 From: jreback Date: Tue, 11 Jun 2013 14:11:36 -0400 Subject: [PATCH 37/71] DOC: cooking.rst update --- doc/source/cookbook.rst | 3 +++ 1 file changed, 3 insertions(+) diff --git a/doc/source/cookbook.rst b/doc/source/cookbook.rst index 71f47b385e236..da8d90c8367ce 100644 --- a/doc/source/cookbook.rst +++ b/doc/source/cookbook.rst @@ -239,6 +239,9 @@ The :ref:`Plotting ` docs. `Creating a multi-line plot `__ +`Plotting a heatmap +`__ + Data In/Out ----------- From c3d20cba83c78e89b55e803c441ec63677f7e38b Mon Sep 17 00:00:00 2001 From: Jeffrey Tratner Date: Tue, 11 Jun 2013 19:45:34 -0400 Subject: [PATCH 38/71] =?UTF-8?q?TST:=20Fix=20missing=20import=20in=20io/t?= =?UTF-8?q?ests/test=5Fjson=20=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Nose import is missing. 
If you get to the error at the last line (under urllib), python complains because nose is never imported. --- pandas/io/tests/test_json/test_pandas.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/io/tests/test_json/test_pandas.py b/pandas/io/tests/test_json/test_pandas.py index b64bfaacd38f2..4b1294b786df7 100644 --- a/pandas/io/tests/test_json/test_pandas.py +++ b/pandas/io/tests/test_json/test_pandas.py @@ -8,6 +8,7 @@ import os import unittest +import nose import numpy as np from pandas import Series, DataFrame, DatetimeIndex, Timestamp From 5e2d66da4ad7cc87c1852a5c94007e8af756b28d Mon Sep 17 00:00:00 2001 From: Andy Hayden Date: Wed, 12 Jun 2013 03:21:03 +0100 Subject: [PATCH 39/71] TST slicing regression test --- pandas/tests/test_indexing.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/pandas/tests/test_indexing.py b/pandas/tests/test_indexing.py index e7f824ace983c..295eaede443b1 100644 --- a/pandas/tests/test_indexing.py +++ b/pandas/tests/test_indexing.py @@ -724,6 +724,15 @@ def test_ix_general(self): df.sortlevel(inplace=True) df.ix[(4.0,2012)] + def test_ix_weird_slicing(self): + ## http://stackoverflow.com/q/17056560/1240268 + df = DataFrame({'one' : [1, 2, 3, np.nan, np.nan], 'two' : [1, 2, 3, 4, 5]}) + df.ix[df['one']>1, 'two'] = -df['two'] + + expected = DataFrame({'one': {0: 1.0, 1: 2.0, 2: 3.0, 3: nan, 4: nan}, + 'two': {0: 1, 1: -2, 2: -3, 3: 4, 4: 5}}) + assert_frame_equal(df, expected) + def test_xs_multiindex(self): # GH2903 From f3b70e05841aab3d3f76b9a263ee1f6dcfb1d8ae Mon Sep 17 00:00:00 2001 From: jreback Date: Wed, 12 Jun 2013 10:20:37 -0400 Subject: [PATCH 40/71] CLN: avoid Unboundlocal error in tools/merge/_get_concatenaged_data (GH3833) --- pandas/tools/merge.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/tools/merge.py b/pandas/tools/merge.py index 9cdddc47acac1..75e35b403dd78 100644 --- a/pandas/tools/merge.py +++ b/pandas/tools/merge.py @@ -984,11 +984,11 @@ def _prepare_blocks(self): return blockmaps, reindexed_data def _get_concatenated_data(self): - try: - # need to conform to same other (joined) axes for block join - blockmaps, rdata = self._prepare_blocks() - kinds = _get_all_block_kinds(blockmaps) + # need to conform to same other (joined) axes for block join + blockmaps, rdata = self._prepare_blocks() + kinds = _get_all_block_kinds(blockmaps) + try: new_blocks = [] for kind in kinds: klass_blocks = [mapping.get(kind) for mapping in blockmaps] From 9854fdf29f6b8377402864ef8026aaf0b012acc3 Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Wed, 12 Jun 2013 14:30:34 -0400 Subject: [PATCH 41/71] BLD: remove after_script since it does not exist anymore --- .travis.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index b48f6d834b62d..8e2bb49d9df93 100644 --- a/.travis.yml +++ b/.travis.yml @@ -55,4 +55,3 @@ script: after_script: - ci/print_versions.py - - ci/after_script.sh From 94f1d2296aafae52a211388c27d2ec313b7323d9 Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Mon, 10 Jun 2013 14:23:27 -0400 Subject: [PATCH 42/71] ENH/CLN: add figsize kwargs to hist method --- RELEASE.rst | 3 ++ doc/source/v0.11.1.txt | 3 ++ pandas/tests/test_graphics.py | 58 +++++++++++++++++++++-------------- pandas/tools/plotting.py | 42 +++++++++++++++---------- 4 files changed, 67 insertions(+), 39 deletions(-) diff --git a/RELEASE.rst b/RELEASE.rst index 307986ab81681..8256b13b4e553 100644 --- a/RELEASE.rst +++ b/RELEASE.rst @@ -79,6 +79,7 @@ pandas 0.11.1 spurious plots from showing 
up. - Added Faq section on repr display options, to help users customize their setup. - ``where`` operations that result in block splitting are much faster (GH3733_) + - Series and DataFrame hist methods now take a ``figsize`` argument (GH3834_) **API Changes** @@ -312,6 +313,8 @@ pandas 0.11.1 .. _GH3726: https://github.com/pydata/pandas/issues/3726 .. _GH3795: https://github.com/pydata/pandas/issues/3795 .. _GH3814: https://github.com/pydata/pandas/issues/3814 +.. _GH3834: https://github.com/pydata/pandas/issues/3834 + pandas 0.11.0 ============= diff --git a/doc/source/v0.11.1.txt b/doc/source/v0.11.1.txt index 5045f73375a97..34ba9f0859641 100644 --- a/doc/source/v0.11.1.txt +++ b/doc/source/v0.11.1.txt @@ -288,6 +288,8 @@ Enhancements dff.groupby('B').filter(lambda x: len(x) > 2, dropna=False) + - Series and DataFrame hist methods now take a ``figsize`` argument (GH3834_) + Bug Fixes ~~~~~~~~~ @@ -396,3 +398,4 @@ on GitHub for a complete list. .. _GH3741: https://github.com/pydata/pandas/issues/3741 .. _GH3726: https://github.com/pydata/pandas/issues/3726 .. _GH3425: https://github.com/pydata/pandas/issues/3425 +.. _GH3834: https://github.com/pydata/pandas/issues/3834 diff --git a/pandas/tests/test_graphics.py b/pandas/tests/test_graphics.py index 5a1411ccf577e..0755caf45d336 100644 --- a/pandas/tests/test_graphics.py +++ b/pandas/tests/test_graphics.py @@ -8,7 +8,7 @@ from pandas import Series, DataFrame, MultiIndex, PeriodIndex, date_range import pandas.util.testing as tm from pandas.util.testing import ensure_clean -from pandas.core.config import set_option,get_option,config_prefix +from pandas.core.config import set_option import numpy as np @@ -28,11 +28,6 @@ class TestSeriesPlots(unittest.TestCase): @classmethod def setUpClass(cls): - import sys - - # if 'IPython' in sys.modules: - # raise nose.SkipTest - try: import matplotlib as mpl mpl.use('Agg', warn=False) @@ -150,9 +145,16 @@ def test_irregular_datetime(self): def test_hist(self): _check_plot_works(self.ts.hist) _check_plot_works(self.ts.hist, grid=False) - + _check_plot_works(self.ts.hist, figsize=(8, 10)) _check_plot_works(self.ts.hist, by=self.ts.index.month) + def test_plot_fails_when_ax_differs_from_figure(self): + from pylab import figure + fig1 = figure() + fig2 = figure() + ax1 = fig1.add_subplot(111) + self.assertRaises(AssertionError, self.ts.hist, ax=ax1, figure=fig2) + @slow def test_kde(self): _skip_if_no_scipy() @@ -258,7 +260,8 @@ def test_plot(self): (u'\u03b4', 6), (u'\u03b4', 7)], names=['i0', 'i1']) columns = MultiIndex.from_tuples([('bar', u'\u0394'), - ('bar', u'\u0395')], names=['c0', 'c1']) + ('bar', u'\u0395')], names=['c0', + 'c1']) df = DataFrame(np.random.randint(0, 10, (8, 2)), columns=columns, index=index) @@ -269,9 +272,9 @@ def test_nonnumeric_exclude(self): import matplotlib.pyplot as plt plt.close('all') - df = DataFrame({'A': ["x", "y", "z"], 'B': [1,2,3]}) + df = DataFrame({'A': ["x", "y", "z"], 'B': [1, 2, 3]}) ax = df.plot() - self.assert_(len(ax.get_lines()) == 1) #B was plotted + self.assert_(len(ax.get_lines()) == 1) # B was plotted @slow def test_label(self): @@ -434,21 +437,24 @@ def test_bar_center(self): ax = df.plot(kind='bar', grid=True) self.assertEqual(ax.xaxis.get_ticklocs()[0], ax.patches[0].get_x() + ax.patches[0].get_width()) + @slow def test_bar_log(self): # GH3254, GH3298 matplotlib/matplotlib#1882, #1892 # regressions in 
1.2.1 - df = DataFrame({'A': [3] * 5, 'B': range(1,6)}, index=range(5)) - ax = df.plot(kind='bar', grid=True,log=True) - self.assertEqual(ax.yaxis.get_ticklocs()[0],1.0) + df = DataFrame({'A': [3] * 5, 'B': range(1, 6)}, index=range(5)) + ax = df.plot(kind='bar', grid=True, log=True) + self.assertEqual(ax.yaxis.get_ticklocs()[0], 1.0) - p1 = Series([200,500]).plot(log=True,kind='bar') - p2 = DataFrame([Series([200,300]),Series([300,500])]).plot(log=True,kind='bar',subplots=True) + p1 = Series([200, 500]).plot(log=True, kind='bar') + p2 = DataFrame([Series([200, 300]), + Series([300, 500])]).plot(log=True, kind='bar', + subplots=True) - (p1.yaxis.get_ticklocs() == np.array([ 0.625, 1.625])) - (p2[0].yaxis.get_ticklocs() == np.array([ 1., 10., 100., 1000.])).all() - (p2[1].yaxis.get_ticklocs() == np.array([ 1., 10., 100., 1000.])).all() + (p1.yaxis.get_ticklocs() == np.array([0.625, 1.625])) + (p2[0].yaxis.get_ticklocs() == np.array([1., 10., 100., 1000.])).all() + (p2[1].yaxis.get_ticklocs() == np.array([1., 10., 100., 1000.])).all() @slow def test_boxplot(self): @@ -508,6 +514,9 @@ def test_hist(self): # make sure sharex, sharey is handled _check_plot_works(df.hist, sharex=True, sharey=True) + # handle figsize arg + _check_plot_works(df.hist, figsize=(8, 10)) + # make sure xlabelsize and xrot are handled ser = df[0] xf, yf = 20, 20 @@ -727,6 +736,7 @@ def test_invalid_kind(self): df = DataFrame(np.random.randn(10, 2)) self.assertRaises(ValueError, df.plot, kind='aasdf') + class TestDataFrameGroupByPlots(unittest.TestCase): @classmethod @@ -786,10 +796,10 @@ def test_time_series_plot_color_with_empty_kwargs(self): plt.close('all') for i in range(3): - ax = Series(np.arange(12) + 1, index=date_range( - '1/1/2000', periods=12)).plot() + ax = Series(np.arange(12) + 1, index=date_range('1/1/2000', + periods=12)).plot() - line_colors = [ l.get_color() for l in ax.get_lines() ] + line_colors = [l.get_color() for l in ax.get_lines()] self.assert_(line_colors == ['b', 'g', 'r']) @slow @@ -829,7 +839,6 @@ def test_grouped_hist(self): self.assertRaises(AttributeError, plotting.grouped_hist, df.A, by=df.C, foo='bar') - def test_option_mpl_style(self): # just a sanity check try: @@ -845,6 +854,7 @@ def test_option_mpl_style(self): except ValueError: pass + def _check_plot_works(f, *args, **kwargs): import matplotlib.pyplot as plt @@ -852,7 +862,7 @@ def _check_plot_works(f, *args, **kwargs): plt.clf() ax = fig.add_subplot(211) ret = f(*args, **kwargs) - assert(ret is not None) # do something more intelligent + assert ret is not None # do something more intelligent ax = fig.add_subplot(212) try: @@ -865,10 +875,12 @@ def _check_plot_works(f, *args, **kwargs): with ensure_clean() as path: plt.savefig(path) + def curpath(): pth, _ = os.path.split(os.path.abspath(__file__)) return pth + if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], exit=False) diff --git a/pandas/tools/plotting.py b/pandas/tools/plotting.py index e25e83a40b267..83ad58c1eb41c 100644 --- a/pandas/tools/plotting.py +++ b/pandas/tools/plotting.py @@ -658,9 +658,9 @@ def r(h): return ax -def grouped_hist(data, column=None, by=None, ax=None, bins=50, - figsize=None, layout=None, sharex=False, sharey=False, - rot=90, grid=True, **kwargs): +def grouped_hist(data, column=None, by=None, ax=None, bins=50, figsize=None, + layout=None, sharex=False, sharey=False, rot=90, grid=True, + **kwargs): """ Grouped histogram @@ -1839,10 +1839,9 @@ def plot_group(group, ax): return fig -def hist_frame( - data, 
column=None, by=None, grid=True, xlabelsize=None, xrot=None, - ylabelsize=None, yrot=None, ax=None, - sharex=False, sharey=False, **kwds): +def hist_frame(data, column=None, by=None, grid=True, xlabelsize=None, + xrot=None, ylabelsize=None, yrot=None, ax=None, sharex=False, + sharey=False, figsize=None, **kwds): """ Draw Histogram the DataFrame's series using matplotlib / pylab. @@ -1866,17 +1865,20 @@ def hist_frame( ax : matplotlib axes object, default None sharex : bool, if True, the X axis will be shared amongst all subplots. sharey : bool, if True, the Y axis will be shared amongst all subplots. + figsize : tuple + The size of the figure to create in inches by default kwds : other plotting keyword arguments To be passed to hist function """ if column is not None: if not isinstance(column, (list, np.ndarray)): column = [column] - data = data.ix[:, column] + data = data[column] if by is not None: - axes = grouped_hist(data, by=by, ax=ax, grid=grid, **kwds) + axes = grouped_hist(data, by=by, ax=ax, grid=grid, figsize=figsize, + **kwds) for ax in axes.ravel(): if xlabelsize is not None: @@ -1898,11 +1900,11 @@ def hist_frame( rows += 1 else: cols += 1 - _, axes = _subplots(nrows=rows, ncols=cols, ax=ax, squeeze=False, - sharex=sharex, sharey=sharey) + fig, axes = _subplots(nrows=rows, ncols=cols, ax=ax, squeeze=False, + sharex=sharex, sharey=sharey, figsize=figsize) for i, col in enumerate(com._try_sort(data.columns)): - ax = axes[i / cols][i % cols] + ax = axes[i / cols, i % cols] ax.xaxis.set_visible(True) ax.yaxis.set_visible(True) ax.hist(data[col].dropna().values, **kwds) @@ -1922,13 +1924,13 @@ def hist_frame( ax = axes[j / cols, j % cols] ax.set_visible(False) - ax.get_figure().subplots_adjust(wspace=0.3, hspace=0.3) + fig.subplots_adjust(wspace=0.3, hspace=0.3) return axes def hist_series(self, by=None, ax=None, grid=True, xlabelsize=None, - xrot=None, ylabelsize=None, yrot=None, **kwds): + xrot=None, ylabelsize=None, yrot=None, figsize=None, **kwds): """ Draw histogram of the input series using matplotlib @@ -1948,6 +1950,8 @@ def hist_series(self, by=None, ax=None, grid=True, xlabelsize=None, If specified changes the y-axis label size yrot : float, default None rotation of y axis labels + figsize : tuple, default None + figure size in inches by default kwds : keywords To be passed to the actual plotting function @@ -1958,16 +1962,22 @@ def hist_series(self, by=None, ax=None, grid=True, xlabelsize=None, """ import matplotlib.pyplot as plt + fig = kwds.setdefault('figure', plt.figure(figsize=figsize)) + if by is None: if ax is None: - ax = plt.gca() + ax = fig.add_subplot(111) + else: + if ax.get_figure() != fig: + raise AssertionError('passed axis not bound to passed figure') values = self.dropna().values ax.hist(values, **kwds) ax.grid(grid) axes = np.array([ax]) else: - axes = grouped_hist(self, by=by, ax=ax, grid=grid, **kwds) + axes = grouped_hist(self, by=by, ax=ax, grid=grid, figsize=figsize, + **kwds) for ax in axes.ravel(): if xlabelsize is not None: From 06a154fe82da77e6c7e77270215f82850bace504 Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Thu, 13 Jun 2013 01:44:24 -0400 Subject: [PATCH 43/71] BUG: allow itertuples to work with duplicate columns --- pandas/core/frame.py | 4 +++- pandas/tests/test_frame.py | 4 ++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 9c0a2843370f4..b6e29204fc0d8 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -818,7 +818,9 @@ def itertuples(self, index=True): 
arrays = [] if index: arrays.append(self.index) - arrays.extend(self[k] for k in self.columns) + + # use integer indexing because of possible duplicate column names + arrays.extend(self.iloc[:, k] for k in xrange(len(self.columns))) return izip(*arrays) iterkv = iteritems diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index 2c6d3b221c6ff..9b2f078e3b95a 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -3951,6 +3951,10 @@ def test_itertuples(self): for tup in df.itertuples(index=False): self.assert_(isinstance(tup[1], np.integer)) + df = DataFrame(data={"a": [1, 2, 3], "b": [4, 5, 6]}) + dfaa = df[['a', 'a']] + self.assertEqual(list(dfaa.itertuples()), [(0, 1, 1), (1, 2, 2), (2, 3, 3)]) + def test_len(self): self.assertEqual(len(self.frame), len(self.frame.index)) From e7addae6f7c166dac55aece63a5c55c5dfdba6df Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Thu, 13 Jun 2013 01:45:51 -0400 Subject: [PATCH 44/71] DOC: add release notes and bug fix notes --- RELEASE.rst | 3 +++ doc/source/v0.11.1.txt | 3 +++ 2 files changed, 6 insertions(+) diff --git a/RELEASE.rst b/RELEASE.rst index 8256b13b4e553..0fcd9bd3731fe 100644 --- a/RELEASE.rst +++ b/RELEASE.rst @@ -223,6 +223,8 @@ pandas 0.11.1 - ``read_html`` now correctly skips tests (GH3741_) - Fix incorrect arguments passed to concat that are not list-like (e.g. concat(df1,df2)) (GH3481_) - Correctly parse when passed the ``dtype=str`` (or other variable-len string dtypes) in ``read_csv`` (GH3795_) + - ``DataFrame.itertuples()`` now works with frames with duplicate column + names (GH3873_) .. _GH3164: https://github.com/pydata/pandas/issues/3164 .. _GH2786: https://github.com/pydata/pandas/issues/2786 @@ -314,6 +316,7 @@ pandas 0.11.1 .. _GH3795: https://github.com/pydata/pandas/issues/3795 .. _GH3814: https://github.com/pydata/pandas/issues/3814 .. _GH3834: https://github.com/pydata/pandas/issues/3834 +.. _GH3873: https://github.com/pydata/pandas/issues/3873 pandas 0.11.0 diff --git a/doc/source/v0.11.1.txt b/doc/source/v0.11.1.txt index 34ba9f0859641..564939c596ced 100644 --- a/doc/source/v0.11.1.txt +++ b/doc/source/v0.11.1.txt @@ -349,6 +349,8 @@ Bug Fixes - ``DataFrame.from_records`` did not accept empty recarrays (GH3682_) - ``read_html`` now correctly skips tests (GH3741_) + - ``DataFrame.itertuples()`` now works with frames with duplicate column + names (GH3873_) See the `full release notes `__ or issue tracker @@ -399,3 +401,4 @@ on GitHub for a complete list. .. _GH3726: https://github.com/pydata/pandas/issues/3726 .. _GH3425: https://github.com/pydata/pandas/issues/3425 .. _GH3834: https://github.com/pydata/pandas/issues/3834 +.. 
_GH3873: https://github.com/pydata/pandas/issues/3873 From 0b16a3af3f70f5bca5e761a2257e9b3fd836ed63 Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Thu, 13 Jun 2013 00:34:18 -0400 Subject: [PATCH 45/71] ENH: do not convert mixed-integer type indexes to datetimeindex --- pandas/tseries/index.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/tseries/index.py b/pandas/tseries/index.py index a918e9eb18e8b..51e657d1723b2 100644 --- a/pandas/tseries/index.py +++ b/pandas/tseries/index.py @@ -910,7 +910,8 @@ def join(self, other, how='left', level=None, return_indexers=False): """ See Index.join """ - if not isinstance(other, DatetimeIndex) and len(other) > 0: + if (not isinstance(other, DatetimeIndex) and len(other) > 0 and + other.inferred_type != 'mixed-integer'): try: other = DatetimeIndex(other) except TypeError: From bd811dc682fb2a4e5c75681a2288ff1b156ca969 Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Thu, 13 Jun 2013 00:45:36 -0400 Subject: [PATCH 46/71] DOC/TST: add test and release notes --- RELEASE.rst | 3 +++ doc/source/v0.11.1.txt | 3 +++ pandas/tseries/tests/test_timeseries.py | 9 ++++++++- 3 files changed, 14 insertions(+), 1 deletion(-) diff --git a/RELEASE.rst b/RELEASE.rst index 0fcd9bd3731fe..161047c478d88 100644 --- a/RELEASE.rst +++ b/RELEASE.rst @@ -80,6 +80,8 @@ pandas 0.11.1 - Added Faq section on repr display options, to help users customize their setup. - ``where`` operations that result in block splitting are much faster (GH3733_) - Series and DataFrame hist methods now take a ``figsize`` argument (GH3834_) + - DatetimeIndexes no longer try to convert mixed-integer indexes during join + operations (GH3877_) **API Changes** @@ -317,6 +319,7 @@ pandas 0.11.1 .. _GH3814: https://github.com/pydata/pandas/issues/3814 .. _GH3834: https://github.com/pydata/pandas/issues/3834 .. _GH3873: https://github.com/pydata/pandas/issues/3873 +.. _GH3877: https://github.com/pydata/pandas/issues/3877 pandas 0.11.0 diff --git a/doc/source/v0.11.1.txt b/doc/source/v0.11.1.txt index 564939c596ced..1a43e9e6a49e0 100644 --- a/doc/source/v0.11.1.txt +++ b/doc/source/v0.11.1.txt @@ -289,6 +289,8 @@ Enhancements dff.groupby('B').filter(lambda x: len(x) > 2, dropna=False) - Series and DataFrame hist methods now take a ``figsize`` argument (GH3834_) + - DatetimeIndexes no longer try to convert mixed-integer indexes during join + operations (GH3877_) Bug Fixes @@ -402,3 +404,4 @@ on GitHub for a complete list. .. _GH3425: https://github.com/pydata/pandas/issues/3425 .. _GH3834: https://github.com/pydata/pandas/issues/3834 .. _GH3873: https://github.com/pydata/pandas/issues/3873 +.. 
_GH3877: https://github.com/pydata/pandas/issues/3877 diff --git a/pandas/tseries/tests/test_timeseries.py b/pandas/tseries/tests/test_timeseries.py index beee5caa871c5..f5415a195db77 100644 --- a/pandas/tseries/tests/test_timeseries.py +++ b/pandas/tseries/tests/test_timeseries.py @@ -18,7 +18,6 @@ import pandas.core.datetools as datetools import pandas.tseries.offsets as offsets import pandas.tseries.frequencies as fmod -from pandas.tseries.index import TimeSeriesError import pandas as pd from pandas.util.testing import assert_series_equal, assert_almost_equal @@ -1853,6 +1852,14 @@ def test_date(self): expected = [t.date() for t in rng] self.assert_((result == expected).all()) + def test_does_not_convert_mixed_integer(self): + df = tm.makeCustomDataframe(10, 10, data_gen_f=lambda *args, **kwargs: + randn(), r_idx_type='i', c_idx_type='dt') + cols = df.columns.join(df.index, how='outer') + joined = cols.join(df.columns) + self.assertEqual(cols.dtype, np.dtype('O')) + self.assertEqual(cols.dtype, joined.dtype) + self.assert_(np.array_equal(cols.values, joined.values)) class TestLegacySupport(unittest.TestCase): _multiprocess_can_split_ = True From 4b13264cdce01028cea94b2719802740ed19d894 Mon Sep 17 00:00:00 2001 From: nipunreddevil Date: Thu, 13 Jun 2013 18:39:31 +0530 Subject: [PATCH 47/71] Modified to reflect usage of pypaperclip and put all code in one block --- doc/source/io.rst | 25 ++++++------------------- 1 file changed, 6 insertions(+), 19 deletions(-) diff --git a/doc/source/io.rst b/doc/source/io.rst index 9d937bddf7cc2..39211c5c0db49 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -1,4 +1,4 @@ -.. _io: +False.. _io: .. currentmodule:: pandas @@ -1231,10 +1231,6 @@ And then import the data directly to a DataFrame by calling: clipdf -.. note:: - - You may need to install xsel on Linux to be able to read from the clipboard. - The ``to_clipboard`` method can be used to write the contents of a DataFrame to the clipboard. Following which you can paste the clipboard contents into other applications (CTRL-V on many operating systems). Here we illustrate writing a @@ -1243,24 +1239,15 @@ DataFrame into clipboard and reading it back. .. ipython:: python df=pd.DataFrame(randn(5,3)) - -.. ipython:: python - df - -.. ipython:: python - - df.to_clipboard() - -.. ipython:: python - - obj=pd.read_clipboard() + df.to_clipboard() + pd.read_clipboard() -.. ipython:: python +We can see that we got the same content back, which we had earlier written to the clipboard. - obj +.. note:: -We can see that we got the same content back, which we had earlier written to the clipboard. + You may need to install xlip or xsel (with gtk or PyQt4 modules) on Linux to use these methods. From be1931c73087ea1a16bc2f8146874c43a201b042 Mon Sep 17 00:00:00 2001 From: nipunreddevil Date: Thu, 13 Jun 2013 19:17:50 +0530 Subject: [PATCH 48/71] fixed typo --- doc/source/io.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/io.rst b/doc/source/io.rst index 39211c5c0db49..d01b671bbae67 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -1,4 +1,4 @@ -False.. _io: +.. _io: .. currentmodule:: pandas @@ -1247,7 +1247,7 @@ We can see that we got the same content back, which we had earlier written to th .. note:: - You may need to install xlip or xsel (with gtk or PyQt4 modules) on Linux to use these methods. + You may need to install xclip or xsel (with gtk or PyQt4 modules) on Linux to use these methods. 
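Taken together, the clipboard patches above amount to the following round trip (a sketch assuming a desktop session with one of the supported backends available: pywin32/ctypes on Windows, pbcopy/pbpaste on OS X, or xclip, xsel, gtk or PyQt4 on Linux):

    import pandas as pd
    from numpy.random import randn

    df = pd.DataFrame(randn(5, 3))
    df.to_clipboard()                # write a text representation of the frame to the system clipboard
    roundtrip = pd.read_clipboard()  # parse the clipboard contents back into a DataFrame
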
From 98ecede16ef227538aba542a4ec10b0ddabdfc09 Mon Sep 17 00:00:00 2001 From: Andy Hayden Date: Thu, 13 Jun 2013 14:26:58 +0100 Subject: [PATCH 49/71] FIX hash of NDFrame raises TypeError --- RELEASE.rst | 1 + pandas/core/generic.py | 4 ++++ pandas/core/series.py | 3 ++- pandas/tests/test_frame.py | 5 +++++ pandas/tests/test_panel.py | 6 ++++++ pandas/tests/test_panel4d.py | 5 +++++ pandas/tests/test_series.py | 6 ++++++ 7 files changed, 29 insertions(+), 1 deletion(-) diff --git a/RELEASE.rst b/RELEASE.rst index 307986ab81681..9febddb852eb1 100644 --- a/RELEASE.rst +++ b/RELEASE.rst @@ -220,6 +220,7 @@ pandas 0.11.1 - Groupby transform with item-by-item not upcasting correctly (GH3740_) - Incorrectly read a HDFStore multi-index Frame witha column specification (GH3748_) - ``read_html`` now correctly skips tests (GH3741_) + - DataFrames/Panel raise Type error when trying to hash (GH3882_) - Fix incorrect arguments passed to concat that are not list-like (e.g. concat(df1,df2)) (GH3481_) - Correctly parse when passed the ``dtype=str`` (or other variable-len string dtypes) in ``read_csv`` (GH3795_) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 0d2612d7aed7a..6f1475e3fa8df 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -594,6 +594,10 @@ def axes(self): def __repr__(self): return 'NDFrame' + def __hash__(self): + raise TypeError('{0!r} objects are mutable, thus they cannot be' + ' hashed'.format(self.__class__.__name__)) + @property def values(self): return self._data.as_matrix() diff --git a/pandas/core/series.py b/pandas/core/series.py index 3a7a7d0f49b66..44398caa2547b 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -528,7 +528,8 @@ def _can_hold_na(self): return not is_integer_dtype(self.dtype) def __hash__(self): - raise TypeError('unhashable type') + raise TypeError('{0!r} objects are mutable, thus they cannot be' + ' hashed'.format(self.__class__.__name__)) _index = None index = lib.SeriesIndex() diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index 2c6d3b221c6ff..1e8fa91548145 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -3109,6 +3109,11 @@ def test_constructor_for_list_with_dtypes(self): expected.sort() assert_series_equal(result, expected) + def test_not_hashable(self): + df = pd.DataFrame([1]) + self.assertRaises(TypeError, hash, df) + self.assertRaises(TypeError, hash, self.empty) + def test_timedeltas(self): df = DataFrame(dict(A = Series(date_range('2012-1-1', periods=3, freq='D')), diff --git a/pandas/tests/test_panel.py b/pandas/tests/test_panel.py index 58b7ac272401f..380604b0de32e 100644 --- a/pandas/tests/test_panel.py +++ b/pandas/tests/test_panel.py @@ -46,6 +46,12 @@ def test_cumsum(self): cumsum = self.panel.cumsum() assert_frame_equal(cumsum['ItemA'], self.panel['ItemA'].cumsum()) + def not_hashable(self): + c_empty = Panel() + c = Panel(pd.Panel([[[1]]])) + self.assertRaises(TypeError, hash, c_empty) + self.assertRaises(TypeError, hash, c) + class SafeForLongAndSparse(object): _multiprocess_can_split_ = True diff --git a/pandas/tests/test_panel4d.py b/pandas/tests/test_panel4d.py index a2e08bc744ab0..9c3a66c32c501 100644 --- a/pandas/tests/test_panel4d.py +++ b/pandas/tests/test_panel4d.py @@ -785,6 +785,11 @@ def test_reindex(self): major=self.panel4d.major_axis, copy=False) self.assert_(result is self.panel4d) + def test_not_hashable(self): + p4D_empty = Panel4D() + self.assertRaises(TypeError, hash, p4D_empty) + self.assertRaises(TypeError, hash, 
self.panel4d) + def test_reindex_like(self): # reindex_like smaller = self.panel4d.reindex(labels=self.panel4d.labels[:-1], diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py index 88990bdde98b8..d04da38f0e526 100644 --- a/pandas/tests/test_series.py +++ b/pandas/tests/test_series.py @@ -579,6 +579,12 @@ def test_setindex(self): def test_array_finalize(self): pass + def test_not_hashable(self): + s_empty = Series() + s = Series([1]) + self.assertRaises(TypeError, hash, s_empty) + self.assertRaises(TypeError, hash, s) + def test_fromValue(self): nans = Series(np.NaN, index=self.ts.index) From 8f8b1775ee53ac39a7a57674ae16873fe3176a97 Mon Sep 17 00:00:00 2001 From: jreback Date: Wed, 12 Jun 2013 15:02:42 -0400 Subject: [PATCH 50/71] BUG: not processing TypeError on reading some json (so was failing rather than trying not-numpy for dtypes) --- pandas/io/json.py | 4 ++-- pandas/io/tests/test_json/test_pandas.py | 12 ++++++++++++ 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/pandas/io/json.py b/pandas/io/json.py index 17b33931bee5a..d1c81d625d98d 100644 --- a/pandas/io/json.py +++ b/pandas/io/json.py @@ -246,7 +246,7 @@ def _parse(self): labelled=True)) else: self.obj = Series(loads(json, dtype=dtype, numpy=True)) - except ValueError: + except (ValueError,TypeError): numpy = False if not numpy: @@ -296,7 +296,7 @@ def _parse(self): else: self.obj = DataFrame(*loads(json, dtype=dtype, numpy=True, labelled=True)) - except ValueError: + except (ValueError,TypeError): numpy = False if not numpy: diff --git a/pandas/io/tests/test_json/test_pandas.py b/pandas/io/tests/test_json/test_pandas.py index 4b1294b786df7..cb6e4711f9c42 100644 --- a/pandas/io/tests/test_json/test_pandas.py +++ b/pandas/io/tests/test_json/test_pandas.py @@ -338,6 +338,18 @@ def test_weird_nested_json(self): read_json(s) + @network + @slow + def test_round_trip_exception_(self): + # GH 3867 + + df = pd.read_csv('https://raw.github.com/hayd/lahman2012/master/csvs/Teams.csv') + s = df.to_json() + result = pd.read_json(s) + result.index = result.index.astype(int) + result = result.reindex(columns=df.columns,index=df.index) + assert_frame_equal(result,df) + @network @slow def test_url(self): From 186a4f80d30e45501557a4a8081e910e787e7dc3 Mon Sep 17 00:00:00 2001 From: jreback Date: Wed, 12 Jun 2013 22:48:32 -0400 Subject: [PATCH 51/71] ENH: added convert_axes argument to control whether to coerce axes ENH: changed dtype argument to accept a dict for a per-column dtype conversion, or turn off conversion (default is True) ENH: changed parse_dates to convert_dates, now defaulting to True BUG: not processing correctly some parsable JSON --- doc/source/io.rst | 56 ++++++- pandas/io/json.py | 203 +++++++++++++++-------- pandas/io/tests/test_json/test_pandas.py | 133 ++++++++++----- 3 files changed, 272 insertions(+), 120 deletions(-) diff --git a/doc/source/io.rst b/doc/source/io.rst index e64cbc4bc8101..aec963ca81cf0 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -989,6 +989,8 @@ Writing to a file, with a date index and a date column dfj2 = dfj.copy() dfj2['date'] = Timestamp('20130101') + dfj2['ints'] = range(5) + dfj2['bools'] = True dfj2.index = date_range('20130101',periods=5) dfj2.to_json('test.json') open('test.json').read() @@ -1011,25 +1013,69 @@ is ``None``. To explicity force ``Series`` parsing, pass ``typ=series`` * records : list like [value, ... , value] * index : dict like {index -> value} -- dtype : dtype of the resulting object -- numpy : direct decoding to numpy arrays. 
default True but falls back to standard decoding if a problem occurs. -- parse_dates : a list of columns to parse for dates; If True, then try to parse datelike columns, default is False +- dtype : if True, infer dtypes, if a dict of column to dtype, then use those, if False, then don't infer dtypes at all, default is True, apply only to the data +- convert_axes : boolean, try to convert the axes to the proper dtypes, default is True +- convert_dates : a list of columns to parse for dates; If True, then try to parse datelike columns, default is True - keep_default_dates : boolean, default True. If parsing dates, then parse the default datelike columns +- numpy: direct decoding to numpy arrays. default True but falls back to standard decoding if a problem occurs. The parser will raise one of ``ValueError/TypeError/AssertionError`` if the JSON is not parsable. +The default of ``convert_axes=True``, ``dtype=True``, and ``convert_dates=True`` will try to parse the axes, and all of the data +into appropriate types, including dates. If you need to override specific dtypes, pass a dict to ``dtype``. ``convert_axes`` should only +be set to ``False`` if you need to preserve string-like numbers (e.g. '1', '2') in an axes. + +.. warning:: + + When reading JSON data, automatic coercing into dtypes has some quirks: + + * an index can be in a different order, that is the returned order is not guaranteed to be the same as before serialization + * a column that was ``float`` data can safely be converted to ``integer``, e.g. a column of ``1.`` + * bool columns will be converted to ``integer`` on reconstruction + + Thus there are times where you may want to specify specific dtypes via the ``dtype`` keyword argument. + Reading from a JSON string .. ipython:: python pd.read_json(json) -Reading from a file, parsing dates +Reading from a file + +.. ipython:: python + + pd.read_json('test.json') + +Don't convert any data (but still convert axes and dates) + +.. ipython:: python + + pd.read_json('test.json',dtype=object).dtypes + +Specify how I want to convert data + +.. ipython:: python + + pd.read_json('test.json',dtype={'A' : 'float32', 'bools' : 'int8'}).dtypes + +I like my string indicies .. ipython:: python - pd.read_json('test.json',parse_dates=True) + si = DataFrame(np.zeros((4, 4)), + columns=range(4), + index=[str(i) for i in range(4)]) + si + si.index + si.columns + json = si.to_json() + + sij = pd.read_json(json,convert_axes=False) + sij + sij.index + sij.columns .. 
ipython:: python :suppress: diff --git a/pandas/io/json.py b/pandas/io/json.py index d1c81d625d98d..537d06f094cd4 100644 --- a/pandas/io/json.py +++ b/pandas/io/json.py @@ -11,6 +11,7 @@ import numpy as np from pandas.tslib import iNaT +import pandas.lib as lib ### interface to/from ### @@ -86,6 +87,11 @@ def _format_dates(self): self.copy_if_needed() self.obj = self._format_to_date(self.obj) + def _format_bools(self): + if self._needs_to_bool(self.obj): + self.copy_if_needed() + self.obj = self._format_to_bool(self.obj) + class FrameWriter(Writer): _default_orient = 'columns' @@ -112,8 +118,8 @@ def _format_dates(self): for c in dtypes.index: self.obj[c] = self._format_to_date(self.obj[c]) -def read_json(path_or_buf=None, orient=None, typ='frame', dtype=None, numpy=True, - parse_dates=False, keep_default_dates=True): +def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True, + convert_axes=True, convert_dates=True, keep_default_dates=True, numpy=True): """ Convert JSON string to pandas object @@ -130,13 +136,16 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=None, numpy=True records : list like [value, ... , value] index : dict like {index -> value} typ : type of object to recover (series or frame), default 'frame' - dtype : dtype of the resulting object - numpy: direct decoding to numpy arrays. default True but falls back - to standard decoding if a problem occurs. - parse_dates : a list of columns to parse for dates; If True, then try to parse datelike columns - default is False + dtype : if True, infer dtypes, if a dict of column to dtype, then use those, + if False, then don't infer dtypes at all, default is True, + apply only to the data + convert_axes : boolean, try to convert the axes to the proper dtypes, default is True + convert_dates : a list of columns to parse for dates; If True, then try to parse datelike columns + default is True keep_default_dates : boolean, default True. If parsing dates, then parse the default datelike columns + numpy: direct decoding to numpy arrays. default True but falls back + to standard decoding if a problem occurs. 
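    A minimal usage sketch (illustrative; coercion follows the rules documented
    in io.rst above, so dtypes after a round trip can differ from the original,
    e.g. bool columns coming back as integers):

        df = DataFrame([[1., 2.], [3., 4.]], columns=['a', 'b'])
        read_json(df.to_json())                                    # infer axes, dtypes and dates
        read_json(df.to_json(), dtype={'a': 'float32'})            # force a per-column dtype
        read_json(df.to_json(), convert_axes=False, dtype=False)   # leave axes and data untouched
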
Returns ------- @@ -157,16 +166,18 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=None, numpy=True obj = None if typ == 'frame': - obj = FrameParser(json, orient, dtype, numpy, parse_dates, keep_default_dates).parse() + obj = FrameParser(json, orient, dtype, convert_axes, convert_dates, keep_default_dates, numpy).parse() if typ == 'series' or obj is None: - obj = SeriesParser(json, orient, dtype, numpy, parse_dates, keep_default_dates).parse() + if not isinstance(dtype,bool): + dtype = dict(data = dtype) + obj = SeriesParser(json, orient, dtype, convert_axes, convert_dates, keep_default_dates, numpy).parse() return obj class Parser(object): - def __init__(self, json, orient, dtype, numpy, parse_dates=False, keep_default_dates=False): + def __init__(self, json, orient, dtype=True, convert_axes=True, convert_dates=True, keep_default_dates=False, numpy=True): self.json = json if orient is None: @@ -175,27 +186,92 @@ def __init__(self, json, orient, dtype, numpy, parse_dates=False, keep_default_d self.orient = orient self.dtype = dtype - if dtype is not None and orient == "split": + if orient == "split": numpy = False self.numpy = numpy - self.parse_dates = parse_dates + self.convert_axes = convert_axes + self.convert_dates = convert_dates self.keep_default_dates = keep_default_dates self.obj = None def parse(self): self._parse() - if self.obj is not None: + if self.obj is None: return None + if self.convert_axes: self._convert_axes() - if self.parse_dates: - self._try_parse_dates() + self._try_convert_types() return self.obj + def _convert_axes(self): + """ try to convert axes """ + for axis in self.obj._AXIS_NUMBERS.keys(): + new_axis, result = self._try_convert_data(axis, self.obj._get_axis(axis), use_dtypes=False, convert_dates=True) + if result: + setattr(self.obj,axis,new_axis) - def _try_parse_to_date(self, data): + def _try_convert_types(self): + raise NotImplementedError + + def _try_convert_data(self, name, data, use_dtypes=True, convert_dates=True): + """ try to parse a ndarray like into a column by inferring dtype """ + + # don't try to coerce, unless a force conversion + if use_dtypes: + if self.dtype is False: + return data, False + elif self.dtype is True: + pass + + else: + + # dtype to force + dtype = self.dtype.get(name) if isinstance(self.dtype,dict) else self.dtype + if dtype is not None: + try: + dtype = np.dtype(dtype) + return data.astype(dtype), True + except: + return data, False + + if convert_dates: + new_data, result = self._try_convert_to_date(data) + if result: + return new_data, True + + result = False + + if data.dtype == 'object': + + # try float + try: + data = data.astype('float64') + result = True + except: + pass + + # do't coerce 0-len data + if len(data) and (data.dtype == 'float' or data.dtype == 'object'): + + # coerce ints if we can + try: + new_data = data.astype('int64') + if (new_data == data).all(): + data = new_data + result = True + except: + pass + + return data, result + + def _try_convert_to_date(self, data): """ try to parse a ndarray like into a date column try to coerce object in epoch/iso formats and - integer/float in epcoh formats """ + integer/float in epcoh formats, return a boolean if parsing + was successful """ + + # no conversion on empty + if not len(data): return data, False new_data = data if new_data.dtype == 'object': @@ -208,7 +284,7 @@ def _try_parse_to_date(self, data): # ignore numbers that are out of range if issubclass(new_data.dtype.type,np.number): if not ((new_data == iNaT) | (new_data > 
31536000000000000L)).all(): - return data + return data, False try: new_data = to_datetime(new_data) @@ -218,11 +294,11 @@ def _try_parse_to_date(self, data): except: # return old, noting more we can do - new_data = data + return data, False - return new_data + return new_data, True - def _try_parse_dates(self): + def _try_convert_dates(self): raise NotImplementedError class SeriesParser(Parser): @@ -231,21 +307,20 @@ class SeriesParser(Parser): def _parse(self): json = self.json - dtype = self.dtype orient = self.orient numpy = self.numpy if numpy: try: if orient == "split": - decoded = loads(json, dtype=dtype, numpy=True) + decoded = loads(json, dtype=None, numpy=True) decoded = dict((str(k), v) for k, v in decoded.iteritems()) self.obj = Series(**decoded) elif orient == "columns" or orient == "index": - self.obj = Series(*loads(json, dtype=dtype, numpy=True, + self.obj = Series(*loads(json, dtype=None, numpy=True, labelled=True)) else: - self.obj = Series(loads(json, dtype=dtype, numpy=True)) + self.obj = Series(loads(json, dtype=None, numpy=True)) except (ValueError,TypeError): numpy = False @@ -253,22 +328,15 @@ def _parse(self): if orient == "split": decoded = dict((str(k), v) for k, v in loads(json).iteritems()) - self.obj = Series(dtype=dtype, **decoded) + self.obj = Series(dtype=None, **decoded) else: - self.obj = Series(loads(json), dtype=dtype) - - def _convert_axes(self): - """ try to axes if they are datelike """ - try: - self.obj.index = self._try_parse_to_date(self.obj.index) - except: - pass + self.obj = Series(loads(json), dtype=None) - def _try_parse_dates(self): + def _try_convert_types(self): if self.obj is None: return - - if self.parse_dates: - self.obj = self._try_parse_to_date(self.obj) + obj, result = self._try_convert_data('data', self.obj, convert_dates=self.convert_dates) + if result: + self.obj = obj class FrameParser(Parser): _default_orient = 'columns' @@ -276,64 +344,57 @@ class FrameParser(Parser): def _parse(self): json = self.json - dtype = self.dtype orient = self.orient numpy = self.numpy if numpy: try: if orient == "columns": - args = loads(json, dtype=dtype, numpy=True, labelled=True) + args = loads(json, dtype=None, numpy=True, labelled=True) if args: args = (args[0].T, args[2], args[1]) self.obj = DataFrame(*args) elif orient == "split": - decoded = loads(json, dtype=dtype, numpy=True) + decoded = loads(json, dtype=None, numpy=True) decoded = dict((str(k), v) for k, v in decoded.iteritems()) self.obj = DataFrame(**decoded) elif orient == "values": - self.obj = DataFrame(loads(json, dtype=dtype, numpy=True)) + self.obj = DataFrame(loads(json, dtype=None, numpy=True)) else: - self.obj = DataFrame(*loads(json, dtype=dtype, numpy=True, + self.obj = DataFrame(*loads(json, dtype=None, numpy=True, labelled=True)) except (ValueError,TypeError): numpy = False if not numpy: if orient == "columns": - self.obj = DataFrame(loads(json), dtype=dtype) + self.obj = DataFrame(loads(json), dtype=None) elif orient == "split": decoded = dict((str(k), v) for k, v in loads(json).iteritems()) - self.obj = DataFrame(dtype=dtype, **decoded) + self.obj = DataFrame(dtype=None, **decoded) elif orient == "index": - self.obj = DataFrame(loads(json), dtype=dtype).T + self.obj = DataFrame(loads(json), dtype=None).T else: - self.obj = DataFrame(loads(json), dtype=dtype) - - def _convert_axes(self): - """ try to axes if they are datelike """ - if self.orient == 'columns': - axis = 'index' - elif self.orient == 'index': - axis = 'columns' - else: - return - - try: - a = 
getattr(self.obj,axis) - setattr(self.obj,axis,self._try_parse_to_date(a)) - except: - pass + self.obj = DataFrame(loads(json), dtype=None) - def _try_parse_dates(self): + def _try_convert_types(self): + if self.obj is None: return + if self.convert_dates: + self._try_convert_dates() + for col in self.obj.columns: + new_data, result = self._try_convert_data(col, self.obj[col], convert_dates=False) + if result: + self.obj[col] = new_data + + def _try_convert_dates(self): if self.obj is None: return # our columns to parse - parse_dates = self.parse_dates - if parse_dates is True: - parse_dates = [] - parse_dates = set(parse_dates) + convert_dates = self.convert_dates + if convert_dates is True: + convert_dates = [] + convert_dates = set(convert_dates) def is_ok(col): """ return if this col is ok to try for a date parse """ @@ -348,6 +409,8 @@ def is_ok(col): return False - for col, c in self.obj.iteritems(): - if (self.keep_default_dates and is_ok(col)) or col in parse_dates: - self.obj[col] = self._try_parse_to_date(c) + for col in self.obj.columns: + if (self.keep_default_dates and is_ok(col)) or col in convert_dates: + new_data, result = self._try_convert_to_date(self.obj[col]) + if result: + self.obj[col] = new_data diff --git a/pandas/io/tests/test_json/test_pandas.py b/pandas/io/tests/test_json/test_pandas.py index cb6e4711f9c42..bcbd4d4b91e70 100644 --- a/pandas/io/tests/test_json/test_pandas.py +++ b/pandas/io/tests/test_json/test_pandas.py @@ -56,13 +56,14 @@ def setUp(self): def test_frame_from_json_to_json(self): - def _check_orient(df, orient, dtype=None, numpy=True): + def _check_orient(df, orient, dtype=None, numpy=True, convert_axes=True, check_dtype=True): df = df.sort() dfjson = df.to_json(orient=orient) unser = read_json(dfjson, orient=orient, dtype=dtype, - numpy=numpy) + numpy=numpy, convert_axes=convert_axes) unser = unser.sort() - if df.index.dtype.type == np.datetime64: + + if not convert_axes and df.index.dtype.type == np.datetime64: unser.index = DatetimeIndex(unser.index.values.astype('i8')) if orient == "records": # index is not captured in this orientation @@ -78,20 +79,37 @@ def _check_orient(df, orient, dtype=None, numpy=True): unser = unser.sort() assert_almost_equal(df.values, unser.values) else: - assert_frame_equal(df, unser) - - def _check_all_orients(df, dtype=None): - _check_orient(df, "columns", dtype=dtype) - _check_orient(df, "records", dtype=dtype) - _check_orient(df, "split", dtype=dtype) - _check_orient(df, "index", dtype=dtype) - _check_orient(df, "values", dtype=dtype) - - _check_orient(df, "columns", dtype=dtype, numpy=False) - _check_orient(df, "records", dtype=dtype, numpy=False) - _check_orient(df, "split", dtype=dtype, numpy=False) - _check_orient(df, "index", dtype=dtype, numpy=False) - _check_orient(df, "values", dtype=dtype, numpy=False) + if convert_axes: + assert_frame_equal(df, unser, check_dtype=check_dtype) + else: + assert_frame_equal(df, unser, check_less_precise=False, check_dtype=check_dtype) + + def _check_all_orients(df, dtype=None, convert_axes=True): + if convert_axes: + _check_orient(df, "columns", dtype=dtype) + _check_orient(df, "records", dtype=dtype) + _check_orient(df, "split", dtype=dtype) + _check_orient(df, "index", dtype=dtype) + _check_orient(df, "values", dtype=dtype) + + _check_orient(df, "columns", dtype=dtype, convert_axes=False) + _check_orient(df, "records", dtype=dtype, convert_axes=False) + _check_orient(df, "split", dtype=dtype, convert_axes=False) + _check_orient(df, "index", dtype=dtype, 
convert_axes=False) + _check_orient(df, "values", dtype=dtype ,convert_axes=False) + + if convert_axes: + _check_orient(df, "columns", dtype=dtype, numpy=False) + _check_orient(df, "records", dtype=dtype, numpy=False) + _check_orient(df, "split", dtype=dtype, numpy=False) + _check_orient(df, "index", dtype=dtype, numpy=False) + _check_orient(df, "values", dtype=dtype, numpy=False) + + _check_orient(df, "columns", dtype=dtype, numpy=False, convert_axes=False) + _check_orient(df, "records", dtype=dtype, numpy=False, convert_axes=False) + _check_orient(df, "split", dtype=dtype, numpy=False, convert_axes=False) + _check_orient(df, "index", dtype=dtype, numpy=False, convert_axes=False) + _check_orient(df, "values", dtype=dtype, numpy=False, convert_axes=False) # basic _check_all_orients(self.frame) @@ -99,6 +117,7 @@ def _check_all_orients(df, dtype=None): self.frame.to_json(orient="columns")) _check_all_orients(self.intframe, dtype=self.intframe.values.dtype) + _check_all_orients(self.intframe, dtype=False) # big one # index and columns are strings as all unserialised JSON object keys @@ -106,13 +125,13 @@ def _check_all_orients(df, dtype=None): biggie = DataFrame(np.zeros((200, 4)), columns=[str(i) for i in range(4)], index=[str(i) for i in range(200)]) - _check_all_orients(biggie) + _check_all_orients(biggie,dtype=False,convert_axes=False) # dtypes _check_all_orients(DataFrame(biggie, dtype=np.float64), - dtype=np.float64) - _check_all_orients(DataFrame(biggie, dtype=np.int), dtype=np.int) - _check_all_orients(DataFrame(biggie, dtype=' Date: Thu, 13 Jun 2013 09:40:56 -0400 Subject: [PATCH 52/71] TST: tests for numpy=True/False differeing in parsing --- pandas/io/tests/test_json/test_pandas.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/pandas/io/tests/test_json/test_pandas.py b/pandas/io/tests/test_json/test_pandas.py index bcbd4d4b91e70..23ac4c4df15e3 100644 --- a/pandas/io/tests/test_json/test_pandas.py +++ b/pandas/io/tests/test_json/test_pandas.py @@ -383,6 +383,16 @@ def test_doc_example(self): result = read_json(json,dtype={'ints' : np.int64, 'bools' : np.bool_}) assert_frame_equal(result,result) + def test_misc_example(self): + #import pdb; pdb.set_trace() + result = read_json('[{"a": 1, "b": 2}, {"b":2, "a" :1}]',numpy=True) + expected = DataFrame([[1,2],[1,2]],columns=['a','b']) + #assert_frame_equal(result,expected) + + result = read_json('[{"a": 1, "b": 2}, {"b":2, "a" :1}]',numpy=False) + expected = DataFrame([[1,2],[1,2]],columns=['a','b']) + assert_frame_equal(result,expected) + @network @slow def test_round_trip_exception_(self): From 6701ed3f3a17bf9598c406aa2d6176802365a856 Mon Sep 17 00:00:00 2001 From: Andy Hayden Date: Thu, 13 Jun 2013 16:54:02 +0100 Subject: [PATCH 53/71] FIX PandasObjects unhashable --- RELEASE.rst | 2 +- pandas/core/generic.py | 9 +++++---- pandas/core/series.py | 4 ---- 3 files changed, 6 insertions(+), 9 deletions(-) diff --git a/RELEASE.rst b/RELEASE.rst index 9febddb852eb1..072f40d927108 100644 --- a/RELEASE.rst +++ b/RELEASE.rst @@ -220,7 +220,7 @@ pandas 0.11.1 - Groupby transform with item-by-item not upcasting correctly (GH3740_) - Incorrectly read a HDFStore multi-index Frame witha column specification (GH3748_) - ``read_html`` now correctly skips tests (GH3741_) - - DataFrames/Panel raise Type error when trying to hash (GH3882_) + - PandasObjects raise TypeError when trying to hash (GH3882_) - Fix incorrect arguments passed to concat that are not list-like (e.g. 
concat(df1,df2)) (GH3481_) - Correctly parse when passed the ``dtype=str`` (or other variable-len string dtypes) in ``read_csv`` (GH3795_) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 6f1475e3fa8df..3a3ce49d50c5a 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -31,6 +31,11 @@ def save(self, path): def load(cls, path): return com.load(path) + def __hash__(self): + raise TypeError('{0!r} objects are mutable, thus they cannot be' + ' hashed'.format(self.__class__.__name__)) + + #---------------------------------------------------------------------- # Axis name business @@ -594,10 +599,6 @@ def axes(self): def __repr__(self): return 'NDFrame' - def __hash__(self): - raise TypeError('{0!r} objects are mutable, thus they cannot be' - ' hashed'.format(self.__class__.__name__)) - @property def values(self): return self._data.as_matrix() diff --git a/pandas/core/series.py b/pandas/core/series.py index 44398caa2547b..2621c64afc205 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -527,10 +527,6 @@ def _constructor(self): def _can_hold_na(self): return not is_integer_dtype(self.dtype) - def __hash__(self): - raise TypeError('{0!r} objects are mutable, thus they cannot be' - ' hashed'.format(self.__class__.__name__)) - _index = None index = lib.SeriesIndex() From 740b10fe1d5de2bf027a65c668cbb692d7237867 Mon Sep 17 00:00:00 2001 From: jreback Date: Thu, 13 Jun 2013 11:30:05 -0400 Subject: [PATCH 54/71] PERF: changed default to numpy=False to have correct parsing using unordered JSON eliminated fallback parsing with numpy=True; This will raise ValueError if it fails to parse (a known case are strings in the frame data) --- doc/source/io.rst | 41 +++++-- pandas/core/generic.py | 12 +- pandas/io/json.py | 145 ++++++++++++----------- pandas/io/tests/test_json/test_pandas.py | 61 ++++++---- 4 files changed, 156 insertions(+), 103 deletions(-) diff --git a/doc/source/io.rst b/doc/source/io.rst index aec963ca81cf0..c182d456315ec 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -954,13 +954,21 @@ with optional parameters: - path_or_buf : the pathname or buffer to write the output This can be ``None`` in which case a JSON string is returned -- orient : The format of the JSON string, default is ``index`` for ``Series``, ``columns`` for ``DataFrame`` +- orient : - * split : dict like {index -> [index], columns -> [columns], data -> [values]} - * records : list like [{column -> value}, ... , {column -> value}] - * index : dict like {index -> {column -> value}} - * columns : dict like {column -> {index -> value}} - * values : just the values array + Series : + default is 'index', allowed values are: {'split','records','index'} + + DataFrame : + default is 'columns', allowed values are: {'split','records','index','columns','values'} + + The format of the JSON string + + * split : dict like {index -> [index], columns -> [columns], data -> [values]} + * records : list like [{column -> value}, ... , {column -> value}] + * index : dict like {index -> {column -> value}} + * columns : dict like {column -> {index -> value}} + * values : just the values array - date_format : type of date conversion (epoch = epoch milliseconds, iso = ISO8601), default is epoch - double_precision : The number of decimal places to use when encoding floating point values, default 10. @@ -1007,17 +1015,28 @@ is ``None``. To explicity force ``Series`` parsing, pass ``typ=series`` is expected. 
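As a rough illustration of the ``orient`` layouts described in this section
(a hypothetical 2x2 frame with index ``['a', 'b']`` and columns ``['x', 'y']``;
shapes are schematic rather than exact library output)::

    df.to_json(orient='split')    # {"index":["a","b"],"columns":["x","y"],"data":[[1,2],[3,4]]}
    df.to_json(orient='records')  # [{"x":1,"y":2},{"x":3,"y":4}]
    df.to_json(orient='index')    # {"a":{"x":1,"y":2},"b":{"x":3,"y":4}}
    df.to_json(orient='columns')  # {"x":{"a":1,"b":3},"y":{"a":2,"b":4}}
    df.to_json(orient='values')   # [[1,2],[3,4]]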
For instance, a local file could be file ://localhost/path/to/table.json - typ : type of object to recover (series or frame), default 'frame' -- orient : The format of the JSON string, one of the following +- orient : + + Series : + default is 'index', allowed values are: {'split','records','index'} + + DataFrame : + default is 'columns', allowed values are: {'split','records','index','columns','values'} + + The format of the JSON string - * split : dict like {index -> [index], name -> name, data -> [values]} - * records : list like [value, ... , value] - * index : dict like {index -> value} + * split : dict like {index -> [index], columns -> [columns], data -> [values]} + * records : list like [{column -> value}, ... , {column -> value}] + * index : dict like {index -> {column -> value}} + * columns : dict like {column -> {index -> value}} + * values : just the values array - dtype : if True, infer dtypes, if a dict of column to dtype, then use those, if False, then don't infer dtypes at all, default is True, apply only to the data - convert_axes : boolean, try to convert the axes to the proper dtypes, default is True - convert_dates : a list of columns to parse for dates; If True, then try to parse datelike columns, default is True - keep_default_dates : boolean, default True. If parsing dates, then parse the default datelike columns -- numpy: direct decoding to numpy arrays. default True but falls back to standard decoding if a problem occurs. +- numpy: direct decoding to numpy arrays. default is False; + Note that the JSON ordering **MUST** be the same for each term if ``numpy=True`` The parser will raise one of ``ValueError/TypeError/AssertionError`` if the JSON is not parsable. diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 0d2612d7aed7a..55347aef078ef 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -507,8 +507,15 @@ def to_json(self, path_or_buf=None, orient=None, date_format='epoch', ---------- path_or_buf : the path or buffer to write the result string if this is None, return a StringIO of the converted string - orient : {'split', 'records', 'index', 'columns', 'values'}, - default is 'index' for Series, 'columns' for DataFrame + orient : + + Series : + default is 'index' + allowed values are: {'split','records','index'} + + DataFrame : + default is 'columns' + allowed values are: {'split','records','index','columns','values'} The format of the JSON string split : dict like @@ -517,6 +524,7 @@ def to_json(self, path_or_buf=None, orient=None, date_format='epoch', index : dict like {index -> {column -> value}} columns : dict like {column -> {index -> value}} values : just the values array + date_format : type of date conversion (epoch = epoch milliseconds, iso = ISO8601), default is epoch double_precision : The number of decimal places to use when encoding diff --git a/pandas/io/json.py b/pandas/io/json.py index 537d06f094cd4..fcecb31bb77a7 100644 --- a/pandas/io/json.py +++ b/pandas/io/json.py @@ -119,7 +119,7 @@ def _format_dates(self): self.obj[c] = self._format_to_date(self.obj[c]) def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True, - convert_axes=True, convert_dates=True, keep_default_dates=True, numpy=True): + convert_axes=True, convert_dates=True, keep_default_dates=True, numpy=False): """ Convert JSON string to pandas object @@ -129,12 +129,22 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True, a URL. Valid URL schemes include http, ftp, s3, and file. For file URLs, a host is expected. 
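# A sketch of the ordering caveat for numpy=True noted above (illustrative only,
# not part of the patch): with direct numpy decoding each record is expected to
# list its keys in the same order, whereas the default numpy=False path copes
# with re-ordered keys.
from pandas.io.json import read_json

read_json('[{"a": 1, "b": 2}, {"a": 1, "b": 2}]', orient='records', numpy=True)   # consistent key order
read_json('[{"a": 1, "b": 2}, {"b": 2, "a": 1}]', orient='records')               # re-ordered keys, numpy=False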
For instance, a local file could be file ://localhost/path/to/table.json - orient : {'split', 'records', 'index'}, default 'index' + orient : + Series : + default is 'index' + allowed values are: {'split','records','index'} + + DataFrame : + default is 'columns' + allowed values are: {'split','records','index','columns','values'} + The format of the JSON string - split : dict like - {index -> [index], name -> name, data -> [values]} - records : list like [value, ... , value] - index : dict like {index -> value} + split : dict like {index -> [index], columns -> [columns], data -> [values]} + records : list like [{column -> value}, ... , {column -> value}] + index : dict like {index -> {column -> value}} + columns : dict like {column -> {index -> value}} + values : just the values array + typ : type of object to recover (series or frame), default 'frame' dtype : if True, infer dtypes, if a dict of column to dtype, then use those, if False, then don't infer dtypes at all, default is True, @@ -144,8 +154,8 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True, default is True keep_default_dates : boolean, default True. If parsing dates, then parse the default datelike columns - numpy: direct decoding to numpy arrays. default True but falls back - to standard decoding if a problem occurs. + numpy: direct decoding to numpy arrays. default is False.Note that the JSON ordering MUST be the same + for each term if numpy=True. Returns ------- @@ -177,7 +187,7 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True, class Parser(object): - def __init__(self, json, orient, dtype=True, convert_axes=True, convert_dates=True, keep_default_dates=False, numpy=True): + def __init__(self, json, orient, dtype=True, convert_axes=True, convert_dates=True, keep_default_dates=False, numpy=False): self.json = json if orient is None: @@ -196,7 +206,15 @@ def __init__(self, json, orient, dtype=True, convert_axes=True, convert_dates=Tr self.obj = None def parse(self): - self._parse() + + # try numpy + numpy = self.numpy + if numpy: + self._parse_numpy() + + else: + self._parse_no_numpy() + if self.obj is None: return None if self.convert_axes: self._convert_axes() @@ -304,33 +322,30 @@ def _try_convert_dates(self): class SeriesParser(Parser): _default_orient = 'index' - def _parse(self): + def _parse_no_numpy(self): + + json = self.json + orient = self.orient + if orient == "split": + decoded = dict((str(k), v) + for k, v in loads(json).iteritems()) + self.obj = Series(dtype=None, **decoded) + else: + self.obj = Series(loads(json), dtype=None) + + def _parse_numpy(self): json = self.json orient = self.orient - numpy = self.numpy - - if numpy: - try: - if orient == "split": - decoded = loads(json, dtype=None, numpy=True) - decoded = dict((str(k), v) for k, v in decoded.iteritems()) - self.obj = Series(**decoded) - elif orient == "columns" or orient == "index": - self.obj = Series(*loads(json, dtype=None, numpy=True, - labelled=True)) - else: - self.obj = Series(loads(json, dtype=None, numpy=True)) - except (ValueError,TypeError): - numpy = False - - if not numpy: - if orient == "split": - decoded = dict((str(k), v) - for k, v in loads(json).iteritems()) - self.obj = Series(dtype=None, **decoded) - else: - self.obj = Series(loads(json), dtype=None) + if orient == "split": + decoded = loads(json, dtype=None, numpy=True) + decoded = dict((str(k), v) for k, v in decoded.iteritems()) + self.obj = Series(**decoded) + elif orient == "columns" or orient == "index": + self.obj = 
Series(*loads(json, dtype=None, numpy=True, + labelled=True)) + else: + self.obj = Series(loads(json, dtype=None, numpy=True)) def _try_convert_types(self): if self.obj is None: return @@ -341,42 +356,40 @@ def _try_convert_types(self): class FrameParser(Parser): _default_orient = 'columns' - def _parse(self): + def _parse_numpy(self): json = self.json orient = self.orient - numpy = self.numpy - if numpy: - try: - if orient == "columns": - args = loads(json, dtype=None, numpy=True, labelled=True) - if args: - args = (args[0].T, args[2], args[1]) - self.obj = DataFrame(*args) - elif orient == "split": - decoded = loads(json, dtype=None, numpy=True) - decoded = dict((str(k), v) for k, v in decoded.iteritems()) - self.obj = DataFrame(**decoded) - elif orient == "values": - self.obj = DataFrame(loads(json, dtype=None, numpy=True)) - else: - self.obj = DataFrame(*loads(json, dtype=None, numpy=True, - labelled=True)) - except (ValueError,TypeError): - numpy = False - - if not numpy: - if orient == "columns": - self.obj = DataFrame(loads(json), dtype=None) - elif orient == "split": - decoded = dict((str(k), v) - for k, v in loads(json).iteritems()) - self.obj = DataFrame(dtype=None, **decoded) - elif orient == "index": - self.obj = DataFrame(loads(json), dtype=None).T - else: - self.obj = DataFrame(loads(json), dtype=None) + if orient == "columns": + args = loads(json, dtype=None, numpy=True, labelled=True) + if args: + args = (args[0].T, args[2], args[1]) + self.obj = DataFrame(*args) + elif orient == "split": + decoded = loads(json, dtype=None, numpy=True) + decoded = dict((str(k), v) for k, v in decoded.iteritems()) + self.obj = DataFrame(**decoded) + elif orient == "values": + self.obj = DataFrame(loads(json, dtype=None, numpy=True)) + else: + self.obj = DataFrame(*loads(json, dtype=None, numpy=True, labelled=True)) + + def _parse_no_numpy(self): + + json = self.json + orient = self.orient + + if orient == "columns": + self.obj = DataFrame(loads(json), dtype=None) + elif orient == "split": + decoded = dict((str(k), v) + for k, v in loads(json).iteritems()) + self.obj = DataFrame(dtype=None, **decoded) + elif orient == "index": + self.obj = DataFrame(loads(json), dtype=None).T + else: + self.obj = DataFrame(loads(json), dtype=None) def _try_convert_types(self): if self.obj is None: return diff --git a/pandas/io/tests/test_json/test_pandas.py b/pandas/io/tests/test_json/test_pandas.py index 23ac4c4df15e3..bdd700bdbcec3 100644 --- a/pandas/io/tests/test_json/test_pandas.py +++ b/pandas/io/tests/test_json/test_pandas.py @@ -56,11 +56,19 @@ def setUp(self): def test_frame_from_json_to_json(self): - def _check_orient(df, orient, dtype=None, numpy=True, convert_axes=True, check_dtype=True): + def _check_orient(df, orient, dtype=None, numpy=False, convert_axes=True, check_dtype=True, raise_ok=None): df = df.sort() dfjson = df.to_json(orient=orient) - unser = read_json(dfjson, orient=orient, dtype=dtype, - numpy=numpy, convert_axes=convert_axes) + + try: + unser = read_json(dfjson, orient=orient, dtype=dtype, + numpy=numpy, convert_axes=convert_axes) + except (Exception), detail: + if raise_ok is not None: + if type(detail) == raise_ok: + return + raise + unser = unser.sort() if not convert_axes and df.index.dtype.type == np.datetime64: @@ -84,7 +92,9 @@ def _check_orient(df, orient, dtype=None, numpy=True, convert_axes=True, check_d else: assert_frame_equal(df, unser, check_less_precise=False, check_dtype=check_dtype) - def _check_all_orients(df, dtype=None, convert_axes=True): + def 
_check_all_orients(df, dtype=None, convert_axes=True, raise_ok=None): + + # numpy=False if convert_axes: _check_orient(df, "columns", dtype=dtype) _check_orient(df, "records", dtype=dtype) @@ -98,18 +108,19 @@ def _check_all_orients(df, dtype=None, convert_axes=True): _check_orient(df, "index", dtype=dtype, convert_axes=False) _check_orient(df, "values", dtype=dtype ,convert_axes=False) + # numpy=True and raise_ok might be not None, so ignore the error if convert_axes: - _check_orient(df, "columns", dtype=dtype, numpy=False) - _check_orient(df, "records", dtype=dtype, numpy=False) - _check_orient(df, "split", dtype=dtype, numpy=False) - _check_orient(df, "index", dtype=dtype, numpy=False) - _check_orient(df, "values", dtype=dtype, numpy=False) - - _check_orient(df, "columns", dtype=dtype, numpy=False, convert_axes=False) - _check_orient(df, "records", dtype=dtype, numpy=False, convert_axes=False) - _check_orient(df, "split", dtype=dtype, numpy=False, convert_axes=False) - _check_orient(df, "index", dtype=dtype, numpy=False, convert_axes=False) - _check_orient(df, "values", dtype=dtype, numpy=False, convert_axes=False) + _check_orient(df, "columns", dtype=dtype, numpy=True, raise_ok=raise_ok) + _check_orient(df, "records", dtype=dtype, numpy=True, raise_ok=raise_ok) + _check_orient(df, "split", dtype=dtype, numpy=True, raise_ok=raise_ok) + _check_orient(df, "index", dtype=dtype, numpy=True, raise_ok=raise_ok) + _check_orient(df, "values", dtype=dtype, numpy=True, raise_ok=raise_ok) + + _check_orient(df, "columns", dtype=dtype, numpy=True, convert_axes=False, raise_ok=raise_ok) + _check_orient(df, "records", dtype=dtype, numpy=True, convert_axes=False, raise_ok=raise_ok) + _check_orient(df, "split", dtype=dtype, numpy=True, convert_axes=False, raise_ok=raise_ok) + _check_orient(df, "index", dtype=dtype, numpy=True, convert_axes=False, raise_ok=raise_ok) + _check_orient(df, "values", dtype=dtype, numpy=True, convert_axes=False, raise_ok=raise_ok) # basic _check_all_orients(self.frame) @@ -131,7 +142,8 @@ def _check_all_orients(df, dtype=None, convert_axes=True): _check_all_orients(DataFrame(biggie, dtype=np.float64), dtype=np.float64, convert_axes=False) _check_all_orients(DataFrame(biggie, dtype=np.int), dtype=np.int, convert_axes=False) - _check_all_orients(DataFrame(biggie, dtype=' Date: Sat, 8 Jun 2013 23:36:16 -0400 Subject: [PATCH 55/71] ENH: Add unit keyword to Timestamp and to_datetime to enable passing of integers or floats that are in an epoch unit of s, ms, us, ns (e.g. unix timestamps or epoch s, with fracional seconds allowed) (GH 3540) --- RELEASE.rst | 4 ++ pandas/src/inference.pyx | 4 +- pandas/src/offsets.pyx | 2 +- pandas/tseries/tests/test_timeseries.py | 43 +++++++++++++++ pandas/tseries/tools.py | 6 ++- pandas/tslib.pxd | 2 +- pandas/tslib.pyx | 70 ++++++++++++++++++------- 7 files changed, 105 insertions(+), 26 deletions(-) diff --git a/RELEASE.rst b/RELEASE.rst index 161047c478d88..0d94337ffea78 100644 --- a/RELEASE.rst +++ b/RELEASE.rst @@ -82,6 +82,9 @@ pandas 0.11.1 - Series and DataFrame hist methods now take a ``figsize`` argument (GH3834_) - DatetimeIndexes no longer try to convert mixed-integer indexes during join operations (GH3877_) + - Add ``unit`` keyword to ``Timestamp`` and ``to_datetime`` to enable passing of + integers or floats that are in an epoch unit of ``s, ms, us, ns`` + (e.g. unix timestamps or epoch ``s``, with fracional seconds allowed) (GH3540_) **API Changes** @@ -264,6 +267,7 @@ pandas 0.11.1 .. 
_GH3499: https://github.com/pydata/pandas/issues/3499 .. _GH3495: https://github.com/pydata/pandas/issues/3495 .. _GH3492: https://github.com/pydata/pandas/issues/3492 +.. _GH3540: https://github.com/pydata/pandas/issues/3540 .. _GH3552: https://github.com/pydata/pandas/issues/3552 .. _GH3562: https://github.com/pydata/pandas/issues/3562 .. _GH3586: https://github.com/pydata/pandas/issues/3586 diff --git a/pandas/src/inference.pyx b/pandas/src/inference.pyx index 5343819b9fbfe..270fb01a42033 100644 --- a/pandas/src/inference.pyx +++ b/pandas/src/inference.pyx @@ -471,7 +471,7 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0, seen_float = 1 elif util.is_datetime64_object(val): if convert_datetime: - idatetimes[i] = convert_to_tsobject(val, None).value + idatetimes[i] = convert_to_tsobject(val, None, None).value seen_datetime = 1 else: seen_object = 1 @@ -493,7 +493,7 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0, elif PyDateTime_Check(val) or util.is_datetime64_object(val): if convert_datetime: seen_datetime = 1 - idatetimes[i] = convert_to_tsobject(val, None).value + idatetimes[i] = convert_to_tsobject(val, None, None).value else: seen_object = 1 break diff --git a/pandas/src/offsets.pyx b/pandas/src/offsets.pyx index 5868ca5210e33..1823edeb0a4d9 100644 --- a/pandas/src/offsets.pyx +++ b/pandas/src/offsets.pyx @@ -76,7 +76,7 @@ cdef class _Offset: cpdef anchor(self, object start=None): if start is not None: self.start = start - self.ts = convert_to_tsobject(self.start) + self.ts = convert_to_tsobject(self.start, None, None) self._setup() cdef _setup(self): diff --git a/pandas/tseries/tests/test_timeseries.py b/pandas/tseries/tests/test_timeseries.py index f5415a195db77..6efddb281d894 100644 --- a/pandas/tseries/tests/test_timeseries.py +++ b/pandas/tseries/tests/test_timeseries.py @@ -593,6 +593,14 @@ def test_frame_add_datetime64_col_other_units(self): self.assert_((tmp['dates'].values == ex_vals).all()) + def test_to_datetime_unit(self): + + epoch = 1370745748 + s = Series([ epoch + t for t in range(20) ]) + result = to_datetime(s,unit='s') + expected = Series([ Timestamp('2013-06-09 02:42:28') + timedelta(seconds=t) for t in range(20) ]) + assert_series_equal(result,expected) + def test_series_ctor_datetime64(self): rng = date_range('1/1/2000 00:00:00', '1/1/2000 1:59:50', freq='10s') @@ -2691,6 +2699,41 @@ def test_basics_nanos(self): self.assert_(stamp.microsecond == 0) self.assert_(stamp.nanosecond == 500) + def test_unit(self): + def check(val,unit=None,s=1,us=0): + stamp = Timestamp(val, unit=unit) + self.assert_(stamp.year == 2000) + self.assert_(stamp.month == 1) + self.assert_(stamp.day == 1) + self.assert_(stamp.hour == 1) + self.assert_(stamp.minute == 1) + self.assert_(stamp.second == s) + self.assert_(stamp.microsecond == us) + self.assert_(stamp.nanosecond == 0) + + val = Timestamp('20000101 01:01:01').value + + check(val) + check(val/1000L,unit='us') + check(val/1000000L,unit='ms') + check(val/1000000000L,unit='s') + + # get chopped + check((val+500000)/1000000000L,unit='s') + check((val+500000000)/1000000000L,unit='s') + check((val+500000)/1000000L,unit='ms') + + # ok + check((val+500000)/1000L,unit='us',us=500) + check((val+500000000)/1000000L,unit='ms',us=500000) + + # floats + check(val/1000.0 + 5,unit='us',us=5) + check(val/1000.0 + 5000,unit='us',us=5000) + check(val/1000000.0 + 
0.5,unit='ms',us=500) + check(val/1000000.0 + 0.005,unit='ms',us=5) + check(val/1000000000.0 + 0.5,unit='s',us=500000) + def test_comparison(self): # 5-18-2012 00:00:00.000 stamp = 1337299200000000000L diff --git a/pandas/tseries/tools.py b/pandas/tseries/tools.py index 62ee19da6b845..46bcee6f907cf 100644 --- a/pandas/tseries/tools.py +++ b/pandas/tseries/tools.py @@ -50,7 +50,7 @@ def _maybe_get_tz(tz): def to_datetime(arg, errors='ignore', dayfirst=False, utc=None, box=True, - format=None, coerce=False): + format=None, coerce=False, unit=None): """ Convert argument to datetime @@ -69,6 +69,8 @@ def to_datetime(arg, errors='ignore', dayfirst=False, utc=None, box=True, format : string, default None strftime to parse time, eg "%d/%m/%Y" coerce : force errors to NaT (False by default) + unit : unit of the arg (s,ms,us,ns) denote the unit in epoch + (e.g. a unix timestamp) Returns ------- @@ -86,7 +88,7 @@ def _convert_f(arg): else: result = tslib.array_to_datetime(arg, raise_=errors == 'raise', utc=utc, dayfirst=dayfirst, - coerce=coerce) + coerce=coerce, unit=unit) if com.is_datetime64_dtype(result) and box: result = DatetimeIndex(result, tz='utc' if utc else None) return result diff --git a/pandas/tslib.pxd b/pandas/tslib.pxd index 3e7a6ef615e00..a70f9883c5bb1 100644 --- a/pandas/tslib.pxd +++ b/pandas/tslib.pxd @@ -1,3 +1,3 @@ from numpy cimport ndarray, int64_t -cdef convert_to_tsobject(object, object) +cdef convert_to_tsobject(object, object, object) diff --git a/pandas/tslib.pyx b/pandas/tslib.pyx index abec45b52a363..94279e61e440e 100644 --- a/pandas/tslib.pyx +++ b/pandas/tslib.pyx @@ -131,21 +131,17 @@ class Timestamp(_Timestamp): note: by definition there cannot be any tz info on the ordinal itself """ return cls(datetime.fromordinal(ordinal),offset=offset,tz=tz) - def __new__(cls, object ts_input, object offset=None, tz=None): + def __new__(cls, object ts_input, object offset=None, tz=None, unit=None): cdef _TSObject ts cdef _Timestamp ts_base - if PyFloat_Check(ts_input): - # to do, do we want to support this, ie with fractional seconds? 
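# Usage sketch for the new ``unit`` keyword (illustrative; the epoch value and the
# expected timestamp are taken from the tests added in this patch):
from pandas import Series, Timestamp, to_datetime

Timestamp(1370745748, unit='s')                 # -> Timestamp('2013-06-09 02:42:28')
to_datetime(Series([1370745748.5]), unit='s')   # float epochs keep fractional seconds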
- raise TypeError("Cannot convert a float to datetime") - if util.is_string_object(ts_input): try: ts_input = parse_date(ts_input) except Exception: pass - ts = convert_to_tsobject(ts_input, tz) + ts = convert_to_tsobject(ts_input, tz, unit) if ts.value == NPY_NAT: return NaT @@ -311,7 +307,7 @@ class Timestamp(_Timestamp): if self.nanosecond != 0 and warn: print 'Warning: discarding nonzero nanoseconds' - ts = convert_to_tsobject(self, self.tzinfo) + ts = convert_to_tsobject(self, self.tzinfo, None) return datetime(ts.dts.year, ts.dts.month, ts.dts.day, ts.dts.hour, ts.dts.min, ts.dts.sec, @@ -530,7 +526,7 @@ cdef class _Timestamp(datetime): cdef: pandas_datetimestruct dts _TSObject ts - ts = convert_to_tsobject(self, self.tzinfo) + ts = convert_to_tsobject(self, self.tzinfo, None) dts = ts.dts return datetime(dts.year, dts.month, dts.day, dts.hour, dts.min, dts.sec, @@ -623,12 +619,13 @@ cpdef _get_utcoffset(tzinfo, obj): return tzinfo.utcoffset(obj) # helper to extract datetime and int64 from several different possibilities -cdef convert_to_tsobject(object ts, object tz): +cdef convert_to_tsobject(object ts, object tz, object unit): """ Extract datetime and int64 from any of: - - np.int64 + - np.int64 (with unit providing a possible modifier) - np.datetime64 - - python int or long object + - a float (with unit providing a possible modifier) + - python int or long object (with unit providing a possible modifier) - iso8601 string object - python datetime object - another timestamp object @@ -647,6 +644,11 @@ cdef convert_to_tsobject(object ts, object tz): obj.value = _get_datetime64_nanos(ts) pandas_datetime_to_datetimestruct(obj.value, PANDAS_FR_ns, &obj.dts) elif is_integer_object(ts): + ts = ts * cast_from_unit(unit,None) + obj.value = ts + pandas_datetime_to_datetimestruct(ts, PANDAS_FR_ns, &obj.dts) + elif util.is_float_object(ts): + ts = cast_from_unit(unit,ts) obj.value = ts pandas_datetime_to_datetimestruct(ts, PANDAS_FR_ns, &obj.dts) elif util.is_string_object(ts): @@ -699,7 +701,7 @@ cdef convert_to_tsobject(object ts, object tz): elif PyDate_Check(ts): # Keep the converter same as PyDateTime's ts = datetime.combine(ts, datetime_time()) - return convert_to_tsobject(ts, tz) + return convert_to_tsobject(ts, tz, None) else: raise ValueError("Could not construct Timestamp from argument %s" % type(ts)) @@ -804,7 +806,7 @@ def datetime_to_datetime64(ndarray[object] values): else: inferred_tz = _get_zone(val.tzinfo) - _ts = convert_to_tsobject(val, None) + _ts = convert_to_tsobject(val, None, None) iresult[i] = _ts.value _check_dts_bounds(iresult[i], &_ts.dts) else: @@ -819,7 +821,7 @@ def datetime_to_datetime64(ndarray[object] values): def array_to_datetime(ndarray[object] values, raise_=False, dayfirst=False, - format=None, utc=None, coerce=False): + format=None, utc=None, coerce=False, unit=None): cdef: Py_ssize_t i, n = len(values) object val @@ -828,6 +830,7 @@ def array_to_datetime(ndarray[object] values, raise_=False, dayfirst=False, pandas_datetimestruct dts bint utc_convert = bool(utc) _TSObject _ts + int64_t m = cast_from_unit(unit,None) from dateutil.parser import parse @@ -841,7 +844,7 @@ def array_to_datetime(ndarray[object] values, raise_=False, dayfirst=False, elif PyDateTime_Check(val): if val.tzinfo is not None: if utc_convert: - _ts = convert_to_tsobject(val, None) + _ts = convert_to_tsobject(val, None, unit) iresult[i] = _ts.value _check_dts_bounds(iresult[i], &_ts.dts) else: @@ -861,7 +864,9 @@ def array_to_datetime(ndarray[object] values, raise_=False, 
dayfirst=False, # if we are coercing, dont' allow integers elif util.is_integer_object(val) and not coerce: - iresult[i] = val + iresult[i] = val*m + elif util.is_float_object(val) and not coerce: + iresult[i] = cast_from_unit(unit,val) else: try: if len(val) == 0: @@ -1246,6 +1251,31 @@ cdef inline _get_datetime64_nanos(object val): else: return ival +cdef inline int64_t cast_from_unit(object unit, object ts): + """ return a casting of the unit represented to nanoseconds + round the fractional part of a float to our precision, p """ + p = 0 + if unit == 's': + m = 1000000000L + p = 6 + elif unit == 'ms': + m = 1000000L + p = 3 + elif unit == 'us': + m = 1000L + p = 0 + else: + m = 1L + + # just give me the unit back + if ts is None: + return m + + # cast the unit, multiply base/frace separately + # to avoid precision issues from float -> int + base = ts + frac = ts-base + return (base*m) + (round(frac,p)*m) def cast_to_nanoseconds(ndarray arr): cdef: @@ -1286,7 +1316,7 @@ def pydt_to_i8(object pydt): cdef: _TSObject ts - ts = convert_to_tsobject(pydt, None) + ts = convert_to_tsobject(pydt, None, None) return ts.value @@ -1784,7 +1814,7 @@ def get_date_field(ndarray[int64_t] dtindex, object field): for i in range(count): if dtindex[i] == NPY_NAT: out[i] = -1; continue - ts = convert_to_tsobject(dtindex[i], None) + ts = convert_to_tsobject(dtindex[i], None, None) out[i] = ts_dayofweek(ts) return out @@ -1793,7 +1823,7 @@ def get_date_field(ndarray[int64_t] dtindex, object field): if dtindex[i] == NPY_NAT: out[i] = -1; continue pandas_datetime_to_datetimestruct(dtindex[i], PANDAS_FR_ns, &dts) - ts = convert_to_tsobject(dtindex[i], None) + ts = convert_to_tsobject(dtindex[i], None, None) isleap = is_leapyear(dts.year) isleap_prev = is_leapyear(dts.year - 1) mo_off = _month_offset[isleap, dts.month - 1] @@ -1831,7 +1861,7 @@ def get_date_field(ndarray[int64_t] dtindex, object field): cdef inline int m8_weekday(int64_t val): - ts = convert_to_tsobject(val, None) + ts = convert_to_tsobject(val, None, None) return ts_dayofweek(ts) cdef int64_t DAY_NS = 86400000000000LL From 7e4ccbe9a40ff88d38dffc2d77c17f680267e8d9 Mon Sep 17 00:00:00 2001 From: jreback Date: Sun, 9 Jun 2013 17:41:17 -0400 Subject: [PATCH 56/71] TST: disallow slicing a timeseries with floats TST: manage truediv in py3 for unit comparisons --- pandas/tseries/index.py | 3 +++ pandas/tseries/tests/test_timeseries.py | 15 +++++++++++---- pandas/tseries/tools.py | 4 ++-- pandas/tslib.pyx | 2 +- 4 files changed, 17 insertions(+), 7 deletions(-) diff --git a/pandas/tseries/index.py b/pandas/tseries/index.py index 51e657d1723b2..1cb986ee6cd7c 100644 --- a/pandas/tseries/index.py +++ b/pandas/tseries/index.py @@ -1204,6 +1204,9 @@ def slice_indexer(self, start=None, end=None, step=None): if isinstance(start, time) or isinstance(end, time): raise KeyError('Cannot mix time and non-time slice keys') + if isinstance(start, float) or isinstance(end, float): + raise TypeError('Cannot index datetime64 with float keys') + return Index.slice_indexer(self, start, end, step) def slice_locs(self, start=None, end=None): diff --git a/pandas/tseries/tests/test_timeseries.py b/pandas/tseries/tests/test_timeseries.py index 6efddb281d894..88dee987f4ba2 100644 --- a/pandas/tseries/tests/test_timeseries.py +++ b/pandas/tseries/tests/test_timeseries.py @@ -2718,10 +2718,17 @@ def check(val,unit=None,s=1,us=0): check(val/1000000L,unit='ms') check(val/1000000000L,unit='s') - # get chopped - check((val+500000)/1000000000L,unit='s') - 
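# Worked example (illustrative) for the cast_from_unit helper added in this
# series: with unit='s' it uses m = 10**9 and p = 6, so ts = 1370745748.5 splits
# into base = 1370745748 and frac = 0.5, giving
# 1370745748 * 10**9 + round(0.5, 6) * 10**9 = 1370745748500000000 ns.
# Fractional seconds therefore survive to microsecond precision, which is what
# the surrounding checks assert for the true-division cases.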
check((val+500000000)/1000000000L,unit='s') - check((val+500000)/1000000L,unit='ms') + # using truediv, so these are like floats + if py3compat.PY3: + check((val+500000)/1000000000L,unit='s',us=500) + check((val+500000000)/1000000000L,unit='s',us=500000) + check((val+500000)/1000000L,unit='ms',us=500) + + # get chopped in py2 + else: + check((val+500000)/1000000000L,unit='s') + check((val+500000000)/1000000000L,unit='s') + check((val+500000)/1000000L,unit='ms') # ok check((val+500000)/1000L,unit='us',us=500) diff --git a/pandas/tseries/tools.py b/pandas/tseries/tools.py index 46bcee6f907cf..90bc0beb8eb84 100644 --- a/pandas/tseries/tools.py +++ b/pandas/tseries/tools.py @@ -50,7 +50,7 @@ def _maybe_get_tz(tz): def to_datetime(arg, errors='ignore', dayfirst=False, utc=None, box=True, - format=None, coerce=False, unit=None): + format=None, coerce=False, unit='ns'): """ Convert argument to datetime @@ -70,7 +70,7 @@ def to_datetime(arg, errors='ignore', dayfirst=False, utc=None, box=True, strftime to parse time, eg "%d/%m/%Y" coerce : force errors to NaT (False by default) unit : unit of the arg (s,ms,us,ns) denote the unit in epoch - (e.g. a unix timestamp) + (e.g. a unix timestamp), which is an integer/float number Returns ------- diff --git a/pandas/tslib.pyx b/pandas/tslib.pyx index 94279e61e440e..c2a3f429e60f7 100644 --- a/pandas/tslib.pyx +++ b/pandas/tslib.pyx @@ -1254,7 +1254,6 @@ cdef inline _get_datetime64_nanos(object val): cdef inline int64_t cast_from_unit(object unit, object ts): """ return a casting of the unit represented to nanoseconds round the fractional part of a float to our precision, p """ - p = 0 if unit == 's': m = 1000000000L p = 6 @@ -1266,6 +1265,7 @@ cdef inline int64_t cast_from_unit(object unit, object ts): p = 0 else: m = 1L + p = 0 # just give me the unit back if ts is None: From fbcd5abcadb37c612804aefc5255e2e99b009444 Mon Sep 17 00:00:00 2001 From: jreback Date: Wed, 12 Jun 2013 09:04:55 -0400 Subject: [PATCH 57/71] BUG: make sure that nan/none like values to Timestamp are returned as NaT --- pandas/tseries/tests/test_timeseries.py | 35 ++++++++++++++++++++++++- pandas/tslib.pyx | 32 +++++++++++++++------- 2 files changed, 57 insertions(+), 10 deletions(-) diff --git a/pandas/tseries/tests/test_timeseries.py b/pandas/tseries/tests/test_timeseries.py index 88dee987f4ba2..ac02dee335afc 100644 --- a/pandas/tseries/tests/test_timeseries.py +++ b/pandas/tseries/tests/test_timeseries.py @@ -38,6 +38,7 @@ import pandas.util.py3compat as py3compat from pandas.core.datetools import BDay import pandas.core.common as com +from pandas import concat from numpy.testing.decorators import slow @@ -171,7 +172,6 @@ def test_indexing_over_size_cutoff(self): def test_indexing_unordered(self): # GH 2437 - from pandas import concat rng = date_range(start='2011-01-01', end='2011-01-15') ts = Series(randn(len(rng)), index=rng) ts2 = concat([ts[0:4],ts[-4:],ts[4:-4]]) @@ -601,6 +601,26 @@ def test_to_datetime_unit(self): expected = Series([ Timestamp('2013-06-09 02:42:28') + timedelta(seconds=t) for t in range(20) ]) assert_series_equal(result,expected) + s = Series([ epoch + t for t in range(20) ]).astype(float) + result = to_datetime(s,unit='s') + expected = Series([ Timestamp('2013-06-09 02:42:28') + timedelta(seconds=t) for t in range(20) ]) + assert_series_equal(result,expected) + + s = Series([ epoch + t for t in range(20) ] + [iNaT]) + result = to_datetime(s,unit='s') + expected = Series([ Timestamp('2013-06-09 02:42:28') + timedelta(seconds=t) for t in range(20) ] + [NaT]) + 
assert_series_equal(result,expected) + + s = Series([ epoch + t for t in range(20) ] + [iNaT]).astype(float) + result = to_datetime(s,unit='s') + expected = Series([ Timestamp('2013-06-09 02:42:28') + timedelta(seconds=t) for t in range(20) ] + [NaT]) + assert_series_equal(result,expected) + + s = concat([Series([ epoch + t for t in range(20) ]).astype(float),Series([np.nan])],ignore_index=True) + result = to_datetime(s,unit='s') + expected = Series([ Timestamp('2013-06-09 02:42:28') + timedelta(seconds=t) for t in range(20) ] + [NaT]) + assert_series_equal(result,expected) + def test_series_ctor_datetime64(self): rng = date_range('1/1/2000 00:00:00', '1/1/2000 1:59:50', freq='10s') @@ -2741,6 +2761,19 @@ def check(val,unit=None,s=1,us=0): check(val/1000000.0 + 0.005,unit='ms',us=5) check(val/1000000000.0 + 0.5,unit='s',us=500000) + # nan + result = Timestamp(np.nan) + self.assert_(result is NaT) + + result = Timestamp(None) + self.assert_(result is NaT) + + result = Timestamp(iNaT) + self.assert_(result is NaT) + + result = Timestamp(NaT) + self.assert_(result is NaT) + def test_comparison(self): # 5-18-2012 00:00:00.000 stamp = 1337299200000000000L diff --git a/pandas/tslib.pyx b/pandas/tslib.pyx index c2a3f429e60f7..ec11de7392680 100644 --- a/pandas/tslib.pyx +++ b/pandas/tslib.pyx @@ -640,17 +640,25 @@ cdef convert_to_tsobject(object ts, object tz, object unit): obj = _TSObject() - if is_datetime64_object(ts): + if ts is None or ts is NaT: + obj.value = NPY_NAT + elif is_datetime64_object(ts): obj.value = _get_datetime64_nanos(ts) pandas_datetime_to_datetimestruct(obj.value, PANDAS_FR_ns, &obj.dts) elif is_integer_object(ts): - ts = ts * cast_from_unit(unit,None) - obj.value = ts - pandas_datetime_to_datetimestruct(ts, PANDAS_FR_ns, &obj.dts) + if ts == NPY_NAT: + obj.value = NPY_NAT + else: + ts = ts * cast_from_unit(unit,None) + obj.value = ts + pandas_datetime_to_datetimestruct(ts, PANDAS_FR_ns, &obj.dts) elif util.is_float_object(ts): - ts = cast_from_unit(unit,ts) - obj.value = ts - pandas_datetime_to_datetimestruct(ts, PANDAS_FR_ns, &obj.dts) + if ts != ts or ts == NPY_NAT: + obj.value = NPY_NAT + else: + ts = cast_from_unit(unit,ts) + obj.value = ts + pandas_datetime_to_datetimestruct(ts, PANDAS_FR_ns, &obj.dts) elif util.is_string_object(ts): if ts in _nat_strings: obj.value = NPY_NAT @@ -864,9 +872,15 @@ def array_to_datetime(ndarray[object] values, raise_=False, dayfirst=False, # if we are coercing, dont' allow integers elif util.is_integer_object(val) and not coerce: - iresult[i] = val*m + if val == iNaT: + iresult[i] = iNaT + else: + iresult[i] = val*m elif util.is_float_object(val) and not coerce: - iresult[i] = cast_from_unit(unit,val) + if val != val or val == iNaT: + iresult[i] = iNaT + else: + iresult[i] = cast_from_unit(unit,val) else: try: if len(val) == 0: From a469d335ff5d3600009a72b9fb0778a4ec2121bb Mon Sep 17 00:00:00 2001 From: Andy Hayden Date: Tue, 11 Jun 2013 15:36:52 +0100 Subject: [PATCH 58/71] DOC updated to reflect using pyperclip for clipboard --- LICENSES/OTHER | 30 ++++++++++++++++++++++++++++++ RELEASE.rst | 2 ++ pandas/core/generic.py | 10 ++++++++++ pandas/io/clipboard.py | 4 ++-- 4 files changed, 44 insertions(+), 2 deletions(-) diff --git a/LICENSES/OTHER b/LICENSES/OTHER index a1b367fe6061c..f0550b4ee208a 100644 --- a/LICENSES/OTHER +++ b/LICENSES/OTHER @@ -48,3 +48,33 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the License for the specific language governing permissions and limitations under the License. + +Pyperclip v1.3 license +---------------------- + +Copyright (c) 2010, Albert Sweigart +All rights reserved. + +BSD-style license: + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the pyperclip nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY Albert Sweigart "AS IS" AND ANY +EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL Albert Sweigart BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. \ No newline at end of file diff --git a/RELEASE.rst b/RELEASE.rst index 307986ab81681..a03451542796a 100644 --- a/RELEASE.rst +++ b/RELEASE.rst @@ -73,6 +73,8 @@ pandas 0.11.1 - ``melt`` now accepts the optional parameters ``var_name`` and ``value_name`` to specify custom column names of the returned DataFrame (GH3649_), thanks @hoechenberger + - clipboard functions use pyperclip (no dependencies on Windows, alternative + dependencies offered for Linux) (GH3837_). - Plotting functions now raise a ``TypeError`` before trying to plot anything if the associated objects have have a dtype of ``object`` (GH1818_, GH3572_). 
This happens before any drawing takes place which elimnates any diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 5533584745167..1ea9c48f45269 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -492,6 +492,16 @@ def to_hdf(self, path_or_buf, key, **kwargs): return pytables.to_hdf(path_or_buf, key, self, **kwargs) def to_clipboard(self): + """ + Attempt to write text representation of object to the system clipboard + + Notes + ----- + Requirements for your platform + - Linux: xclip, or xsel (with gtk or PyQt4 modules) + - Windows: + - OS X: + """ from pandas.io import clipboard clipboard.to_clipboard(self) diff --git a/pandas/io/clipboard.py b/pandas/io/clipboard.py index c763c1e8faadb..4e3f7203a279e 100644 --- a/pandas/io/clipboard.py +++ b/pandas/io/clipboard.py @@ -23,8 +23,8 @@ def to_clipboard(obj): # pragma: no cover Notes ----- Requirements for your platform - - Linux: xsel command line tool - - Windows: Python win32 extensions + - Linux: xclip, or xsel (with gtk or PyQt4 modules) + - Windows: - OS X: """ from pandas.util.clipboard import clipboard_set From 3ca3222e95560b5f17c7ca2e3e1f8f9fbf63f899 Mon Sep 17 00:00:00 2001 From: jreback Date: Thu, 13 Jun 2013 11:19:32 -0400 Subject: [PATCH 59/71] BUG: (GH3880) index names are now propogated with loc/ix --- RELEASE.rst | 6 ++++-- doc/source/v0.11.1.txt | 9 +++++++-- pandas/core/frame.py | 4 ++-- pandas/core/index.py | 10 ++++++++-- pandas/core/internals.py | 4 ++-- pandas/tests/test_frame.py | 1 + pandas/tests/test_indexing.py | 13 +++++++++++++ 7 files changed, 37 insertions(+), 10 deletions(-) diff --git a/RELEASE.rst b/RELEASE.rst index 03cfc4f6bcafc..839c472da1610 100644 --- a/RELEASE.rst +++ b/RELEASE.rst @@ -177,6 +177,8 @@ pandas 0.11.1 - Non-unique indexing with a slice via ``loc`` and friends fixed (GH3659_) - Allow insert/delete to non-unique columns (GH3679_) - Extend ``reindex`` to correctly deal with non-unique indices (GH3679_) + - ``DataFrame.itertuples()`` now works with frames with duplicate column + names (GH3873_) - Fixed bug in groupby with empty series referencing a variable before assignment. (GH3510_) - Fixed bug in mixed-frame assignment with aligned series (GH3492_) - Fixed bug in selecting month/quarter/year from a series would not select the time element @@ -228,8 +230,7 @@ pandas 0.11.1 - PandasObjects raise TypeError when trying to hash (GH3882_) - Fix incorrect arguments passed to concat that are not list-like (e.g. concat(df1,df2)) (GH3481_) - Correctly parse when passed the ``dtype=str`` (or other variable-len string dtypes) in ``read_csv`` (GH3795_) - - ``DataFrame.itertuples()`` now works with frames with duplicate column - names (GH3873_) + - Fix index name not propogating when using ``loc/ix`` (GH3880_) .. _GH3164: https://github.com/pydata/pandas/issues/3164 .. _GH2786: https://github.com/pydata/pandas/issues/2786 @@ -323,6 +324,7 @@ pandas 0.11.1 .. _GH3834: https://github.com/pydata/pandas/issues/3834 .. _GH3873: https://github.com/pydata/pandas/issues/3873 .. _GH3877: https://github.com/pydata/pandas/issues/3877 +.. 
_GH3880: https://github.com/pydata/pandas/issues/3880 pandas 0.11.0 diff --git a/doc/source/v0.11.1.txt b/doc/source/v0.11.1.txt index 1a43e9e6a49e0..dfc36258a680f 100644 --- a/doc/source/v0.11.1.txt +++ b/doc/source/v0.11.1.txt @@ -348,11 +348,14 @@ Bug Fixes - Duplicate indexes with and empty DataFrame.from_records will return a correct frame (GH3562_) - Concat to produce a non-unique columns when duplicates are across dtypes is fixed (GH3602_) - Allow insert/delete to non-unique columns (GH3679_) + - Non-unique indexing with a slice via ``loc`` and friends fixed (GH3659_) + - Allow insert/delete to non-unique columns (GH3679_) + - Extend ``reindex`` to correctly deal with non-unique indices (GH3679_) + - ``DataFrame.itertuples()`` now works with frames with duplicate column + names (GH3873_) - ``DataFrame.from_records`` did not accept empty recarrays (GH3682_) - ``read_html`` now correctly skips tests (GH3741_) - - ``DataFrame.itertuples()`` now works with frames with duplicate column - names (GH3873_) See the `full release notes `__ or issue tracker @@ -405,3 +408,5 @@ on GitHub for a complete list. .. _GH3834: https://github.com/pydata/pandas/issues/3834 .. _GH3873: https://github.com/pydata/pandas/issues/3873 .. _GH3877: https://github.com/pydata/pandas/issues/3877 +.. _GH3659: https://github.com/pydata/pandas/issues/3659 +.. _GH3679: https://github.com/pydata/pandas/issues/3679 diff --git a/pandas/core/frame.py b/pandas/core/frame.py index b6e29204fc0d8..f9f8a424f8d96 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2712,14 +2712,14 @@ def _reindex_multi(self, new_index, new_columns, copy, fill_value): def _reindex_index(self, new_index, method, copy, level, fill_value=NA, limit=None): new_index, indexer = self.index.reindex(new_index, method, level, - limit=limit) + limit=limit, copy_if_needed=True) return self._reindex_with_indexers(new_index, indexer, None, None, copy, fill_value) def _reindex_columns(self, new_columns, copy, level, fill_value=NA, limit=None): new_columns, indexer = self.columns.reindex(new_columns, level=level, - limit=limit) + limit=limit, copy_if_needed=True) return self._reindex_with_indexers(None, None, new_columns, indexer, copy, fill_value) diff --git a/pandas/core/index.py b/pandas/core/index.py index 51ebd58c33343..a5880b9f18670 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -920,7 +920,7 @@ def _get_method(self, method): } return aliases.get(method, method) - def reindex(self, target, method=None, level=None, limit=None): + def reindex(self, target, method=None, level=None, limit=None, copy_if_needed=False): """ For Index, simply returns the new index and the results of get_indexer. 
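# Rough illustration (mirrors the new test added below) of what the
# copy_if_needed flag is protecting: label-based selection should preserve the
# frame's index name rather than losing it to an aliased target index.
from pandas import DataFrame

df = DataFrame([[1, 1], [1, 1]])
df.index.name = 'index_name'
df.loc[[0, 1]].index.name   # expected: 'index_name'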
Provided here to enable an interface that is amenable for @@ -939,6 +939,12 @@ def reindex(self, target, method=None, level=None, limit=None): else: if self.equals(target): indexer = None + + # to avoid aliasing an existing index + if copy_if_needed and target.name != self.name and self.name is not None: + if target.name is None: + target = self.copy() + else: if self.is_unique: indexer = self.get_indexer(target, method=method, @@ -2196,7 +2202,7 @@ def get_indexer(self, target, method=None, limit=None): return com._ensure_platform_int(indexer) - def reindex(self, target, method=None, level=None, limit=None): + def reindex(self, target, method=None, level=None, limit=None, copy_if_needed=False): """ Performs any necessary conversion on the input index and calls get_indexer. This method is here so MultiIndex and an Index of diff --git a/pandas/core/internals.py b/pandas/core/internals.py index af1543dad0314..49d92afc46848 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -1948,7 +1948,7 @@ def reindex_axis(self, new_axis, method=None, axis=0, copy=True): 'axis == 0') return self.reindex_items(new_axis) - new_axis, indexer = cur_axis.reindex(new_axis, method) + new_axis, indexer = cur_axis.reindex(new_axis, method, copy_if_needed=True) return self.reindex_indexer(new_axis, indexer, axis=axis) def reindex_indexer(self, new_axis, indexer, axis=1, fill_value=np.nan): @@ -2014,7 +2014,7 @@ def reindex_items(self, new_items, copy=True, fill_value=np.nan): return data.reindex_items(new_items) # TODO: this part could be faster (!) - new_items, indexer = self.items.reindex(new_items) + new_items, indexer = self.items.reindex(new_items, copy_if_needed=True) new_axes = [new_items] + self.axes[1:] # could have so me pathological (MultiIndex) issues here diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index 2b2d59306da6e..5b4d582e5e42e 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -7207,6 +7207,7 @@ def test_reindex_name_remains(self): s = Series(random.rand(10)) df = DataFrame(s, index=np.arange(len(s))) i = Series(np.arange(10), name='iname') + df = df.reindex(i) self.assert_(df.index.name == 'iname') diff --git a/pandas/tests/test_indexing.py b/pandas/tests/test_indexing.py index 295eaede443b1..0719d9c9a87db 100644 --- a/pandas/tests/test_indexing.py +++ b/pandas/tests/test_indexing.py @@ -1024,6 +1024,19 @@ def test_non_unique_loc(self): expected = DataFrame({'A' : [2,4,5], 'B' : [4,6,7]}, index = [1,1,2]) assert_frame_equal(result,expected) + def test_loc_name(self): + # GH 3880 + df = DataFrame([[1, 1], [1, 1]]) + df.index.name = 'index_name' + result = df.iloc[[0, 1]].index.name + self.assert_(result == 'index_name') + + result = df.ix[[0, 1]].index.name + self.assert_(result == 'index_name') + + result = df.loc[[0, 1]].index.name + self.assert_(result == 'index_name') + if __name__ == '__main__': import nose nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], From 77b42053600ee13dc6b58e1144fe7a045fcd49bd Mon Sep 17 00:00:00 2001 From: jreback Date: Thu, 13 Jun 2013 16:36:41 -0400 Subject: [PATCH 60/71] BLD: install older versions of numexpr/pytables on fulldeps/2 build --- ci/install.sh | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/ci/install.sh b/ci/install.sh index c9b76b88721e9..294db286a1001 100755 --- a/ci/install.sh +++ b/ci/install.sh @@ -67,14 +67,17 @@ if ( ! 
$VENV_FILE_AVAILABLE ); then if [ x"$FULL_DEPS" == x"true" ]; then echo "Installing FULL_DEPS" pip install $PIP_ARGS cython - pip install $PIP_ARGS numexpr if [ ${TRAVIS_PYTHON_VERSION:0:1} == "2" ]; then pip install $PIP_ARGS xlwt pip install $PIP_ARGS bottleneck + pip install $PIP_ARGS numexpr==2.0.1 + pip install $PIP_ARGS tables==2.3.1 + else + pip install $PIP_ARGS numexpr + pip install $PIP_ARGS tables fi - pip install $PIP_ARGS tables pip install $PIP_ARGS matplotlib pip install $PIP_ARGS openpyxl pip install $PIP_ARGS xlrd>=0.9.0 From 3349ea78cfdd7f1506791ae37fbdc8369a0e3364 Mon Sep 17 00:00:00 2001 From: jreback Date: Thu, 13 Jun 2013 21:05:13 -0400 Subject: [PATCH 61/71] DOC: minor io.rst edits --- doc/source/io.rst | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/doc/source/io.rst b/doc/source/io.rst index 7f43546f77cd4..905f7f24ac427 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -952,9 +952,9 @@ Writing JSON A ``Series`` or ``DataFrame`` can be converted to a valid JSON string. Use ``to_json`` with optional parameters: -- path_or_buf : the pathname or buffer to write the output +- ``path_or_buf`` : the pathname or buffer to write the output This can be ``None`` in which case a JSON string is returned -- orient : +- ``orient`` : Series : default is 'index', allowed values are: {'split','records','index'} @@ -970,9 +970,9 @@ with optional parameters: * columns : dict like {column -> {index -> value}} * values : just the values array -- date_format : type of date conversion (epoch = epoch milliseconds, iso = ISO8601), default is epoch -- double_precision : The number of decimal places to use when encoding floating point values, default 10. -- force_ascii : force encoded string to be ASCII, default True. +- ``date_format`` : type of date conversion (epoch = epoch milliseconds, iso = ISO8601), default is epoch +- ``double_precision`` : The number of decimal places to use when encoding floating point values, default 10. +- ``force_ascii`` : force encoded string to be ASCII, default True. Note NaN's and None will be converted to null and datetime objects will be converted based on the date_format parameter @@ -1010,12 +1010,12 @@ Reading a JSON string to pandas object can take a number of parameters. The parser will try to parse a ``DataFrame`` if ``typ`` is not supplied or is ``None``. To explicity force ``Series`` parsing, pass ``typ=series`` -- filepath_or_buffer : a **VALID** JSON string or file handle / StringIO. The string could be +- ``filepath_or_buffer`` : a **VALID** JSON string or file handle / StringIO. The string could be a URL. Valid URL schemes include http, ftp, s3, and file. For file URLs, a host is expected. For instance, a local file could be file ://localhost/path/to/table.json -- typ : type of object to recover (series or frame), default 'frame' -- orient : +- ``typ`` : type of object to recover (series or frame), default 'frame' +- ``orient`` : Series : default is 'index', allowed values are: {'split','records','index'} @@ -1031,11 +1031,11 @@ is ``None``. 
To explicity force ``Series`` parsing, pass ``typ=series`` * columns : dict like {column -> {index -> value}} * values : just the values array -- dtype : if True, infer dtypes, if a dict of column to dtype, then use those, if False, then don't infer dtypes at all, default is True, apply only to the data -- convert_axes : boolean, try to convert the axes to the proper dtypes, default is True -- convert_dates : a list of columns to parse for dates; If True, then try to parse datelike columns, default is True -- keep_default_dates : boolean, default True. If parsing dates, then parse the default datelike columns -- numpy: direct decoding to numpy arrays. default is False; +- ``dtype`` : if True, infer dtypes, if a dict of column to dtype, then use those, if False, then don't infer dtypes at all, default is True, apply only to the data +- ``convert_axes`` : boolean, try to convert the axes to the proper dtypes, default is True +- ``convert_dates`` : a list of columns to parse for dates; If True, then try to parse datelike columns, default is True +- ``keep_default_dates`` : boolean, default True. If parsing dates, then parse the default datelike columns +- ``numpy`` : direct decoding to numpy arrays. default is False; Note that the JSON ordering **MUST** be the same for each term if ``numpy=True`` The parser will raise one of ``ValueError/TypeError/AssertionError`` if the JSON is @@ -1049,8 +1049,8 @@ be set to ``False`` if you need to preserve string-like numbers (e.g. '1', '2') When reading JSON data, automatic coercing into dtypes has some quirks: - * an index can be in a different order, that is the returned order is not guaranteed to be the same as before serialization - * a column that was ``float`` data can safely be converted to ``integer``, e.g. a column of ``1.`` + * an index can be reconstructed in a different order from serialization, that is, the returned order is not guaranteed to be the same as before serialization + * a column that was ``float`` data will be converted to ``integer`` if it can be done safely, e.g. a column of ``1.`` * bool columns will be converted to ``integer`` on reconstruction Thus there are times where you may want to specify specific dtypes via the ``dtype`` keyword argument. 
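A short round-trip sketch tying the ``to_json``/``read_json`` options above together; the frame contents, column names, and the ``pandas.io.json`` import path are illustrative assumptions rather than part of the documentation patch:

    import pandas as pd
    from pandas.io.json import read_json

    df = pd.DataFrame({'A': [1.5, 2.0], 'B': ['x', 'y']})
    # write with an explicit orient, ISO dates and 2 decimal places
    json = df.to_json(orient='records', date_format='iso', double_precision=2)
    # read back, forcing the dtype of column 'A' instead of relying on inference
    result = read_json(json, orient='records', dtype={'A': 'float64'})

    # explicitly force Series parsing with typ='series'
    s = pd.Series([1, 2, 3])
    s2 = read_json(s.to_json(), typ='series')
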
From d52e6999c54c984c962b68a1c36c81f5f5a785d5 Mon Sep 17 00:00:00 2001 From: jreback Date: Thu, 13 Jun 2013 18:44:49 -0400 Subject: [PATCH 62/71] TST: json tests to int64 to avoid dtype issues --- pandas/io/json.py | 19 +++++++++++++++++++ pandas/io/tests/test_json/test_pandas.py | 7 +++++-- 2 files changed, 24 insertions(+), 2 deletions(-) diff --git a/pandas/io/json.py b/pandas/io/json.py index fcecb31bb77a7..ce95c3394ce2c 100644 --- a/pandas/io/json.py +++ b/pandas/io/json.py @@ -268,6 +268,15 @@ def _try_convert_data(self, name, data, use_dtypes=True, convert_dates=True): except: pass + if data.dtype == 'float': + + # coerce floats to 64 + try: + data = data.astype('float64') + result = True + except: + pass + # do't coerce 0-len data if len(data) and (data.dtype == 'float' or data.dtype == 'object'): @@ -280,6 +289,16 @@ def _try_convert_data(self, name, data, use_dtypes=True, convert_dates=True): except: pass + # coerce ints to 64 + if data.dtype == 'int': + + # coerce floats to 64 + try: + data = data.astype('int64') + result = True + except: + pass + return data, result def _try_convert_to_date(self, data): diff --git a/pandas/io/tests/test_json/test_pandas.py b/pandas/io/tests/test_json/test_pandas.py index bdd700bdbcec3..fe717f56e6bea 100644 --- a/pandas/io/tests/test_json/test_pandas.py +++ b/pandas/io/tests/test_json/test_pandas.py @@ -26,7 +26,7 @@ _frame = DataFrame(_seriesd) _frame2 = DataFrame(_seriesd, columns=['D', 'C', 'B', 'A']) -_intframe = DataFrame(dict((k, v.astype(int)) +_intframe = DataFrame(dict((k, v.astype(np.int64)) for k, v in _seriesd.iteritems())) _tsframe = DataFrame(_tsd) @@ -71,6 +71,9 @@ def _check_orient(df, orient, dtype=None, numpy=False, convert_axes=True, check_ unser = unser.sort() + if dtype is False: + check_dtype=False + if not convert_axes and df.index.dtype.type == np.datetime64: unser.index = DatetimeIndex(unser.index.values.astype('i8')) if orient == "records": @@ -288,7 +291,7 @@ def test_series_to_json_except(self): def test_typ(self): - s = Series(range(6), index=['a','b','c','d','e','f']) + s = Series(range(6), index=['a','b','c','d','e','f'], dtype='int64') result = read_json(s.to_json(),typ=None) assert_series_equal(result,s) From 37fce3c6e61e596f3ee49e98d75c9d1051fa0dc7 Mon Sep 17 00:00:00 2001 From: jreback Date: Thu, 13 Jun 2013 19:06:39 -0400 Subject: [PATCH 63/71] TST: skip tests if xlrd has lower than needed version --- pandas/io/tests/test_excel.py | 6 +----- pandas/tests/test_panel.py | 6 +++++- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/pandas/io/tests/test_excel.py b/pandas/io/tests/test_excel.py index 39e1042d125a2..baa4f6b64ec0e 100644 --- a/pandas/io/tests/test_excel.py +++ b/pandas/io/tests/test_excel.py @@ -808,11 +808,7 @@ def test_to_excel_styleconverter(self): # self.assertTrue(ws.cell(maddr).merged) # os.remove(filename) def test_excel_010_hemstring(self): - try: - import xlwt - import openpyxl - except ImportError: - raise nose.SkipTest + _skip_if_no_excelsuite() from pandas.util.testing import makeCustomDataframe as mkdf # ensure limited functionality in 0.10 diff --git a/pandas/tests/test_panel.py b/pandas/tests/test_panel.py index 380604b0de32e..4e57977a787f2 100644 --- a/pandas/tests/test_panel.py +++ b/pandas/tests/test_panel.py @@ -1381,7 +1381,11 @@ def test_to_excel(self): path = '__tmp__.' 
+ ext with ensure_clean(path) as path: self.panel.to_excel(path) - reader = ExcelFile(path) + try: + reader = ExcelFile(path) + except ImportError: + raise nose.SkipTest + for item, df in self.panel.iterkv(): recdf = reader.parse(str(item), index_col=0) assert_frame_equal(df, recdf) From 95dfba48e5490524b17ca1613237225d7a07a289 Mon Sep 17 00:00:00 2001 From: jreback Date: Thu, 13 Jun 2013 19:12:30 -0400 Subject: [PATCH 64/71] TST: skip pickle tests on not-little endianess --- pandas/io/tests/test_pickle.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pandas/io/tests/test_pickle.py b/pandas/io/tests/test_pickle.py index d49597860cd16..f58ef3c6919ee 100644 --- a/pandas/io/tests/test_pickle.py +++ b/pandas/io/tests/test_pickle.py @@ -8,6 +8,7 @@ import unittest import nose import os +import sys import numpy as np import pandas.util.testing as tm @@ -16,6 +17,9 @@ from pandas.sparse.tests import test_sparse from pandas.util import py3compat +if sys.byteorder != 'little': + raise nose.SkipTest('system byteorder is not little!') + class TestPickle(unittest.TestCase): _multiprocess_can_split_ = True From 33922f80165bb960f09f4c39b484371877d9358e Mon Sep 17 00:00:00 2001 From: jreback Date: Thu, 13 Jun 2013 19:16:29 -0400 Subject: [PATCH 65/71] TST: skip test_encoding on non-little endian in test_pytables --- pandas/io/tests/test_pytables.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py index 8b3d4a475d952..3266a906dcfae 100644 --- a/pandas/io/tests/test_pytables.py +++ b/pandas/io/tests/test_pytables.py @@ -477,6 +477,9 @@ def test_append(self): def test_encoding(self): + if sys.byteorder != 'little': + raise nose.SkipTest('system byteorder is not little, skipping test_encoding!') + with ensure_clean(self.path) as store: df = DataFrame(dict(A='foo',B='bar'),index=range(5)) df.loc[2,'A'] = np.nan From f6949a87d588e6c2568d147247c85c70778bdcef Mon Sep 17 00:00:00 2001 From: jreback Date: Thu, 13 Jun 2013 19:23:03 -0400 Subject: [PATCH 66/71] TST: skip some stata tests on non-little endian --- pandas/io/tests/test_stata.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/pandas/io/tests/test_stata.py b/pandas/io/tests/test_stata.py index d512b0267ed13..54875e62de218 100644 --- a/pandas/io/tests/test_stata.py +++ b/pandas/io/tests/test_stata.py @@ -3,7 +3,7 @@ from datetime import datetime import os import unittest - +import sys import warnings import nose @@ -15,6 +15,10 @@ import pandas.util.testing as tm from pandas.util.testing import ensure_clean +def _skip_if_not_little(name): + if sys.byteorder != 'little': + raise nose.SkipTest('system byteorder is not little, skipping %s' % name) + class StataTests(unittest.TestCase): def setUp(self): @@ -128,6 +132,7 @@ def test_read_dta4(self): tm.assert_frame_equal(parsed, expected) def test_write_dta5(self): + _skip_if_not_little('write_dta5') original = DataFrame([(np.nan, np.nan, np.nan, np.nan, np.nan)], columns=['float_miss', 'double_miss', 'byte_miss', 'int_miss', 'long_miss']) original.index.name = 'index' @@ -138,6 +143,7 @@ def test_write_dta5(self): tm.assert_frame_equal(written_and_read_again.set_index('index'), original) def test_write_dta6(self): + _skip_if_not_little('write_dta6') original = self.read_csv(self.csv3) original.index.name = 'index' From 265fabf98eba3fc1d73278d3a5680f52ed8633b6 Mon Sep 17 00:00:00 2001 From: jreback Date: Fri, 14 Jun 2013 08:41:18 -0400 Subject: [PATCH 67/71] TST: convered skips on knownfailures to 
knownfailures (test_pickle/test_stata) --- pandas/io/tests/test_pickle.py | 8 ++++---- pandas/io/tests/test_stata.py | 10 ++++------ pandas/util/misc.py | 7 +++++++ 3 files changed, 15 insertions(+), 10 deletions(-) diff --git a/pandas/io/tests/test_pickle.py b/pandas/io/tests/test_pickle.py index f58ef3c6919ee..a7f0e3d3e37b1 100644 --- a/pandas/io/tests/test_pickle.py +++ b/pandas/io/tests/test_pickle.py @@ -8,7 +8,6 @@ import unittest import nose import os -import sys import numpy as np import pandas.util.testing as tm @@ -16,9 +15,8 @@ from pandas import Index from pandas.sparse.tests import test_sparse from pandas.util import py3compat - -if sys.byteorder != 'little': - raise nose.SkipTest('system byteorder is not little!') +from pandas.util.decorators import knownfailureif +from pandas.util.misc import is_little_endian class TestPickle(unittest.TestCase): _multiprocess_can_split_ = True @@ -60,6 +58,7 @@ def compare(self, vf): comparator = getattr(tm,"assert_%s_equal" % typ) comparator(result,expected) + @knownfailureif(not is_little_endian(), "known failure of test_read_pickles_0_10_1 on non-little endian") def test_read_pickles_0_10_1(self): pth = tm.get_data_path('legacy_pickle/0.10.1') @@ -67,6 +66,7 @@ def test_read_pickles_0_10_1(self): vf = os.path.join(pth,f) self.compare(vf) + @knownfailureif(not is_little_endian(), "known failure of test_read_pickles_0_11_0 on non-little endian") def test_read_pickles_0_11_0(self): pth = tm.get_data_path('legacy_pickle/0.11.0') diff --git a/pandas/io/tests/test_stata.py b/pandas/io/tests/test_stata.py index 54875e62de218..794d303a68d79 100644 --- a/pandas/io/tests/test_stata.py +++ b/pandas/io/tests/test_stata.py @@ -14,10 +14,8 @@ from pandas.io.stata import read_stata, StataReader, StataWriter import pandas.util.testing as tm from pandas.util.testing import ensure_clean - -def _skip_if_not_little(name): - if sys.byteorder != 'little': - raise nose.SkipTest('system byteorder is not little, skipping %s' % name) +from pandas.util.decorators import knownfailureif +from pandas.util.misc import is_little_endian class StataTests(unittest.TestCase): @@ -131,8 +129,8 @@ def test_read_dta4(self): tm.assert_frame_equal(parsed, expected) + @knownfailureif(not is_little_endian(), "known failure of test_write_dta5 on non-little endian") def test_write_dta5(self): - _skip_if_not_little('write_dta5') original = DataFrame([(np.nan, np.nan, np.nan, np.nan, np.nan)], columns=['float_miss', 'double_miss', 'byte_miss', 'int_miss', 'long_miss']) original.index.name = 'index' @@ -142,8 +140,8 @@ def test_write_dta5(self): written_and_read_again = self.read_dta(path) tm.assert_frame_equal(written_and_read_again.set_index('index'), original) + @knownfailureif(not is_little_endian(), "known failure of test_write_dta6 on non-little endian") def test_write_dta6(self): - _skip_if_not_little('write_dta6') original = self.read_csv(self.csv3) original.index.name = 'index' diff --git a/pandas/util/misc.py b/pandas/util/misc.py index 8372ba56d00cd..15492cde5a9f7 100644 --- a/pandas/util/misc.py +++ b/pandas/util/misc.py @@ -1,3 +1,10 @@ +""" various miscellaneous utilities """ + +def is_little_endian(): + """ am I little endian """ + import sys + return sys.byteorder == 'little' + def exclusive(*args): count = sum([arg is not None for arg in args]) return count == 1 From 4be46c324ab4df8eab714a3637d1500326bc9050 Mon Sep 17 00:00:00 2001 From: jreback Date: Fri, 14 Jun 2013 14:40:34 -0400 Subject: [PATCH 68/71] TST: convert knowntestfailures to skip tests --- 
pandas/io/tests/test_pickle.py | 7 ++++--- pandas/io/tests/test_stata.py | 9 ++++++--- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/pandas/io/tests/test_pickle.py b/pandas/io/tests/test_pickle.py index a7f0e3d3e37b1..5c79c57c1e020 100644 --- a/pandas/io/tests/test_pickle.py +++ b/pandas/io/tests/test_pickle.py @@ -15,7 +15,6 @@ from pandas import Index from pandas.sparse.tests import test_sparse from pandas.util import py3compat -from pandas.util.decorators import knownfailureif from pandas.util.misc import is_little_endian class TestPickle(unittest.TestCase): @@ -58,16 +57,18 @@ def compare(self, vf): comparator = getattr(tm,"assert_%s_equal" % typ) comparator(result,expected) - @knownfailureif(not is_little_endian(), "known failure of test_read_pickles_0_10_1 on non-little endian") def test_read_pickles_0_10_1(self): + if not is_little_endian(): + raise nose.SkipTest("known failure of test_read_pickles_0_10_1 on non-little endian") pth = tm.get_data_path('legacy_pickle/0.10.1') for f in os.listdir(pth): vf = os.path.join(pth,f) self.compare(vf) - @knownfailureif(not is_little_endian(), "known failure of test_read_pickles_0_11_0 on non-little endian") def test_read_pickles_0_11_0(self): + if not is_little_endian(): + raise nose.SkipTest("known failure of test_read_pickles_0_11_0 on non-little endian") pth = tm.get_data_path('legacy_pickle/0.11.0') for f in os.listdir(pth): diff --git a/pandas/io/tests/test_stata.py b/pandas/io/tests/test_stata.py index 794d303a68d79..4584976c41383 100644 --- a/pandas/io/tests/test_stata.py +++ b/pandas/io/tests/test_stata.py @@ -14,7 +14,6 @@ from pandas.io.stata import read_stata, StataReader, StataWriter import pandas.util.testing as tm from pandas.util.testing import ensure_clean -from pandas.util.decorators import knownfailureif from pandas.util.misc import is_little_endian class StataTests(unittest.TestCase): @@ -129,8 +128,10 @@ def test_read_dta4(self): tm.assert_frame_equal(parsed, expected) - @knownfailureif(not is_little_endian(), "known failure of test_write_dta5 on non-little endian") def test_write_dta5(self): + if not is_little_endian(): + raise nose.SkipTest("known failure of test_write_dta5 on non-little endian") + original = DataFrame([(np.nan, np.nan, np.nan, np.nan, np.nan)], columns=['float_miss', 'double_miss', 'byte_miss', 'int_miss', 'long_miss']) original.index.name = 'index' @@ -140,8 +141,10 @@ def test_write_dta5(self): written_and_read_again = self.read_dta(path) tm.assert_frame_equal(written_and_read_again.set_index('index'), original) - @knownfailureif(not is_little_endian(), "known failure of test_write_dta6 on non-little endian") def test_write_dta6(self): + if not is_little_endian(): + raise nose.SkipTest("known failure of test_write_dta6 on non-little endian") + original = self.read_csv(self.csv3) original.index.name = 'index' From 2cfabee9c872199aa5ec43da0b65aefaddeeb794 Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Thu, 13 Jun 2013 20:09:36 -0400 Subject: [PATCH 69/71] BUG: fix unicode -> str cast in tslib This should use format since calling str on a unicode string is a *bad* idea because it may or may not repr correctly. 
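A minimal illustration of the failure mode described above, assuming Python 2 string semantics (the sample value is made up):

    value = u'caf\xe9'
    try:
        msg = 'Cannot compare Timestamp with %s' % str(value)   # old code path
    except UnicodeEncodeError:
        # str() implicitly encodes the unicode value as ASCII and fails;
        # formatting the class name, as the patch below does, never touches the value
        msg = 'Cannot compare Timestamp with {0!r}'.format(value.__class__.__name__)
    print msg   # -> Cannot compare Timestamp with 'unicode'
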
--- pandas/tslib.pyx | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/tslib.pyx b/pandas/tslib.pyx index ec11de7392680..9b611032455ae 100644 --- a/pandas/tslib.pyx +++ b/pandas/tslib.pyx @@ -465,7 +465,8 @@ cdef class _Timestamp(datetime): elif op == 3: return True else: - raise TypeError('Cannot compare Timestamp with %s' % str(other)) + raise TypeError('Cannot compare Timestamp with ' + '{0!r}'.format(other.__class__.__name__)) self._assert_tzawareness_compat(other) From dcb901b76f750246c248d4209b106df65b731a6c Mon Sep 17 00:00:00 2001 From: Jeffrey Tratner Date: Fri, 14 Jun 2013 22:32:13 -0400 Subject: [PATCH 70/71] TST: Change network decorator to auto-check for network errors --- pandas/util/testing.py | 38 ++++++++++++++++++++++++++++++++++---- 1 file changed, 34 insertions(+), 4 deletions(-) diff --git a/pandas/util/testing.py b/pandas/util/testing.py index dd86862a2d551..3ef0d48e623e1 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -3,6 +3,7 @@ # pylint: disable-msg=W0402 from datetime import datetime +from functools import wraps import random import string import sys @@ -11,6 +12,8 @@ from contextlib import contextmanager # contextlib is available since 2.5 from distutils.version import LooseVersion +import urllib2 +import nose from numpy.random import randn import numpy as np @@ -35,7 +38,7 @@ N = 30 K = 4 - +_FORCE_NETWORK_ERROR = False def rands(n): choices = string.ascii_letters + string.digits @@ -662,10 +665,23 @@ def skip_if_no_package(*args, **kwargs): # Additional tags decorators for nose # +def _is_network_error(error): + """Checks if class is a subclass of httplib2's ServerNotFoundError, + urllib2's URLError, or IOErrors generally.""" + error_classes = [urllib2.URLError, IOError] + try: + # dynamic import because I don't think httplib2 is a dependency + import httplib2 + error_classes.append(httplib2.ServerNotFoundError) + except ImportError: + pass + + return isinstance(error, tuple(error_classes)) + def network(t): """ - Label a test as requiring network connection. + Label a test as requiring network connection and skip test if it encounters a ``URLError``. In some cases it is not possible to assume network presence (e.g. Debian build hosts). @@ -691,11 +707,25 @@ def test_network(self): print 'Fetch the stars from http://' And use ``nosetests -a '!network'`` to exclude running tests requiring - network connectivity. + network connectivity. You can set ``pandas.util.testing._FORCE_NETWORK_ERROR`` to ``True`` to + force these tests to fail on ``URLError`` """ t.network = True - return t + @wraps(t) + def network_wrapper(*args, **kwargs): + try: + return t(*args, **kwargs) + except Exception as e: + if _is_network_error(e) and not _FORCE_NETWORK_ERROR: + # Check if we can connect to Google, if not then okay + # to continue skipping. + try: + urllib2.urlopen("http://www.google.com") + except urllib2.URLError: + raise nose.SkipTest("Skipping because no network connectivity.") + raise + return network_wrapper class SimpleMock(object): From b6aee489543963530b6fd5f75ef6594e5a169476 Mon Sep 17 00:00:00 2001 From: Jeffrey Tratner Date: Fri, 14 Jun 2013 22:34:17 -0400 Subject: [PATCH 71/71] TST: Remove explicit connectivity checks in test cases. Instead, network decorator in pandas.util.testing checks for that instead. You have to opt into failing on tests by setting `pandas.util.testing._FORCE_NETWORK_ERROR` to `True`. 
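A hypothetical sketch of the opt-in described above; the test name and URL are made up, and it assumes the decorator behaviour introduced in the previous patch:

    import urllib2
    import pandas.util.testing as tm

    tm._FORCE_NETWORK_ERROR = True   # network errors now propagate instead of skipping

    @tm.network
    def test_remote_fetch():
        # with the flag at its default of False, a URLError here would be turned
        # into a SkipTest when the machine has no connectivity at all;
        # with the flag set, the error propagates and the test fails
        urllib2.urlopen('http://www.example.com').read()
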
--- pandas/io/tests/test_fred.py | 22 +++++++--------------- pandas/io/tests/test_ga.py | 16 ---------------- pandas/io/tests/test_google.py | 31 ++++++++++--------------------- pandas/io/tests/test_yahoo.py | 28 +++++++--------------------- 4 files changed, 24 insertions(+), 73 deletions(-) diff --git a/pandas/io/tests/test_fred.py b/pandas/io/tests/test_fred.py index 00a90ec3da402..c1b59f782bd09 100644 --- a/pandas/io/tests/test_fred.py +++ b/pandas/io/tests/test_fred.py @@ -26,22 +26,14 @@ def test_fred(self): start = datetime(2010, 1, 1) end = datetime(2013, 01, 27) - try: - self.assertEquals( - web.DataReader("GDP", "fred", start, end)['GDP'].tail(1), - 16004.5) + self.assertEquals( + web.DataReader("GDP", "fred", start, end)['GDP'].tail(1), + 16004.5) - self.assertRaises( - Exception, - lambda: web.DataReader("NON EXISTENT SERIES", 'fred', - start, end)) - except urllib2.URLError: - try: - urllib2.urlopen('http://google.com') - except urllib2.URLError: - raise nose.SkipTest - else: - raise + self.assertRaises( + Exception, + lambda: web.DataReader("NON EXISTENT SERIES", 'fred', + start, end)) @slow @network diff --git a/pandas/io/tests/test_ga.py b/pandas/io/tests/test_ga.py index 5fa2120090025..25335b054de8a 100644 --- a/pandas/io/tests/test_ga.py +++ b/pandas/io/tests/test_ga.py @@ -76,13 +76,6 @@ def test_getdata(self): except AuthenticationConfigError: raise nose.SkipTest - except httplib2.ServerNotFoundError: - try: - h = httplib2.Http() - response, content = h.request("http://www.google.com") - raise - except httplib2.ServerNotFoundError: - raise nose.SkipTest @slow @network @@ -186,16 +179,7 @@ def test_segment(self): except AuthenticationConfigError: raise nose.SkipTest - except httplib2.ServerNotFoundError: - try: - h = httplib2.Http() - response, content = h.request("http://www.google.com") - raise - except httplib2.ServerNotFoundError: - raise nose.SkipTest - if __name__ == '__main__': - import nose nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], exit=False) diff --git a/pandas/io/tests/test_google.py b/pandas/io/tests/test_google.py index 7f4ca13c27e58..b961f29c224d3 100644 --- a/pandas/io/tests/test_google.py +++ b/pandas/io/tests/test_google.py @@ -4,10 +4,7 @@ import pandas as pd import pandas.io.data as web -from pandas.util.testing import (network, assert_series_equal) -from numpy.testing.decorators import slow - -import urllib2 +from pandas.util.testing import network class TestGoogle(unittest.TestCase): @@ -15,27 +12,19 @@ class TestGoogle(unittest.TestCase): @network def test_google(self): # asserts that google is minimally working and that it throws - # an excecption when DataReader can't get a 200 response from + # an exception when DataReader can't get a 200 response from # google start = datetime(2010, 1, 1) end = datetime(2013, 01, 27) - try: - self.assertEquals( - web.DataReader("F", 'google', start, end)['Close'][-1], - 13.68) + self.assertEquals( + web.DataReader("F", 'google', start, end)['Close'][-1], + 13.68) - self.assertRaises( - Exception, - lambda: web.DataReader("NON EXISTENT TICKER", 'google', - start, end)) - except urllib2.URLError: - try: - urllib2.urlopen('http://www.google.com') - except urllib2.URLError: - raise nose.SkipTest - else: - raise + self.assertRaises( + Exception, + lambda: web.DataReader("NON EXISTENT TICKER", 'google', + start, end)) @network @@ -74,7 +63,7 @@ def test_get_data(self): # sanity checking t= np.array(pan) - assert np.issubdtype(t.dtype, np.floating) + assert np.issubdtype(t.dtype, 
np.floating) if __name__ == '__main__': diff --git a/pandas/io/tests/test_yahoo.py b/pandas/io/tests/test_yahoo.py index 0e2c2022af422..3cf800c887e93 100644 --- a/pandas/io/tests/test_yahoo.py +++ b/pandas/io/tests/test_yahoo.py @@ -4,12 +4,7 @@ import pandas as pd import pandas.io.data as web -from pandas.util.testing import (network, assert_frame_equal, - assert_series_equal, - assert_almost_equal) -from numpy.testing.decorators import slow - -import urllib2 +from pandas.util.testing import network, assert_series_equal class TestYahoo(unittest.TestCase): @@ -22,23 +17,14 @@ def test_yahoo(self): start = datetime(2010, 1, 1) end = datetime(2013, 01, 27) - try: - self.assertEquals( - web.DataReader("F", 'yahoo', start, end)['Close'][-1], - 13.68) + self.assertEquals( + web.DataReader("F", 'yahoo', start, end)['Close'][-1], + 13.68) - self.assertRaises( - Exception, - lambda: web.DataReader("NON EXISTENT TICKER", 'yahoo', + self.assertRaises( + Exception, + lambda: web.DataReader("NON EXISTENT TICKER", 'yahoo', start, end)) - except urllib2.URLError: - try: - urllib2.urlopen('http://www.google.com') - except urllib2.URLError: - raise nose.SkipTest - else: - raise - @network def test_get_quote(self):