From 590874d84eafad8fb7d12ff1236ecc7c21fac3dc Mon Sep 17 00:00:00 2001 From: gfyoung Date: Sun, 29 May 2016 15:47:15 +0200 Subject: [PATCH] BUG: Parse trailing NaN values for the Python parser --- doc/source/whatsnew/v0.18.2.txt | 1 + pandas/io/parsers.py | 8 +++++--- pandas/io/tests/parser/c_parser_only.py | 9 --------- pandas/io/tests/parser/na_values.py | 9 +++++++++ pandas/src/inference.pyx | 21 +++++++++++++++++++-- pandas/tests/test_infer_and_convert.py | 17 +++++++++++++++++ 6 files changed, 51 insertions(+), 14 deletions(-) diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt index 33a48671a9b65..7736a26bb6947 100644 --- a/doc/source/whatsnew/v0.18.2.txt +++ b/doc/source/whatsnew/v0.18.2.txt @@ -349,6 +349,7 @@ Bug Fixes - Bug in ``pd.read_csv()`` with ``engine='python'`` in which infinities of mixed-case forms were not being interpreted properly (:issue:`13274`) +- Bug in ``pd.read_csv()`` with ``engine='python'`` in which trailing ``NaN`` values were not being parsed (:issue:`13320`) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 394fe1a98880a..1f0155c4cc7a0 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -2226,14 +2226,16 @@ def _get_index_name(self, columns): return index_name, orig_names, columns def _rows_to_cols(self, content): - zipped_content = list(lib.to_object_array(content).T) - col_len = self.num_original_columns - zip_len = len(zipped_content) if self._implicit_index: col_len += len(self.index_col) + # see gh-13320 + zipped_content = list(lib.to_object_array( + content, min_width=col_len).T) + zip_len = len(zipped_content) + if self.skip_footer < 0: raise ValueError('skip footer cannot be negative') diff --git a/pandas/io/tests/parser/c_parser_only.py b/pandas/io/tests/parser/c_parser_only.py index 00c4e0a1c022b..7fca37cef473e 100644 --- a/pandas/io/tests/parser/c_parser_only.py +++ b/pandas/io/tests/parser/c_parser_only.py @@ -360,15 +360,6 @@ def 
test_raise_on_passed_int_dtype_with_nas(self): sep=",", skipinitialspace=True, dtype={'DOY': np.int64}) - def test_na_trailing_columns(self): - data = """Date,Currenncy,Symbol,Type,Units,UnitPrice,Cost,Tax -2012-03-14,USD,AAPL,BUY,1000 -2012-05-12,USD,SBUX,SELL,500""" - - result = self.read_csv(StringIO(data)) - self.assertEqual(result['Date'][1], '2012-05-12') - self.assertTrue(result['UnitPrice'].isnull().all()) - def test_parse_ragged_csv(self): data = """1,2,3 1,2,3,4 diff --git a/pandas/io/tests/parser/na_values.py b/pandas/io/tests/parser/na_values.py index d826ae536c6cc..2a8c934abce61 100644 --- a/pandas/io/tests/parser/na_values.py +++ b/pandas/io/tests/parser/na_values.py @@ -241,3 +241,12 @@ def test_na_values_na_filter_override(self): columns=['A', 'B']) out = self.read_csv(StringIO(data), na_values=['B'], na_filter=False) tm.assert_frame_equal(out, expected) + + def test_na_trailing_columns(self): + data = """Date,Currenncy,Symbol,Type,Units,UnitPrice,Cost,Tax +2012-03-14,USD,AAPL,BUY,1000 +2012-05-12,USD,SBUX,SELL,500""" + + result = self.read_csv(StringIO(data)) + self.assertEqual(result['Date'][1], '2012-05-12') + self.assertTrue(result['UnitPrice'].isnull().all()) diff --git a/pandas/src/inference.pyx b/pandas/src/inference.pyx index d4e149eb09b65..5f7c5478b5d87 100644 --- a/pandas/src/inference.pyx +++ b/pandas/src/inference.pyx @@ -1132,7 +1132,24 @@ def map_infer(ndarray arr, object f, bint convert=1): return result -def to_object_array(list rows): +def to_object_array(list rows, int min_width=0): + """ + Convert a list of lists into an object array. + + Parameters + ---------- + rows : 2-d array (N, K) + A list of lists to be converted into an array + min_width : int + The minimum width of the object array. If a list + in `rows` contains fewer than `min_width` elements, + the remaining elements in the corresponding row + will all be `NaN`. 
+ + Returns + ------- + obj_array : numpy array of the object dtype + """ cdef: Py_ssize_t i, j, n, k, tmp ndarray[object, ndim=2] result @@ -1140,7 +1157,7 @@ def to_object_array(list rows): n = len(rows) - k = 0 + k = min_width for i from 0 <= i < n: tmp = len(rows[i]) if tmp > k: diff --git a/pandas/tests/test_infer_and_convert.py b/pandas/tests/test_infer_and_convert.py index 7558934c32bc8..68eac12e5ec4c 100644 --- a/pandas/tests/test_infer_and_convert.py +++ b/pandas/tests/test_infer_and_convert.py @@ -201,6 +201,23 @@ def test_to_object_array_tuples(self): except ImportError: pass + def test_to_object_array_width(self): + # see gh-13320 + rows = [[1, 2, 3], [4, 5, 6]] + + expected = np.array(rows, dtype=object) + out = lib.to_object_array(rows) + tm.assert_numpy_array_equal(out, expected) + + expected = np.array(rows, dtype=object) + out = lib.to_object_array(rows, min_width=1) + tm.assert_numpy_array_equal(out, expected) + + expected = np.array([[1, 2, 3, None, None], + [4, 5, 6, None, None]], dtype=object) + out = lib.to_object_array(rows, min_width=5) + tm.assert_numpy_array_equal(out, expected) + def test_object(self): # GH 7431