Skip to content

BUG: Parse trailing NaN values for the Python parser #13320

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.18.2.txt
Original file line number Diff line number Diff line change
Expand Up @@ -349,6 +349,7 @@ Bug Fixes


- Bug in ``pd.read_csv()`` with ``engine='python'`` in which infinities of mixed-case forms were not being interpreted properly (:issue:`13274`)
- Bug in ``pd.read_csv()`` with ``engine='python'`` in which trailing ``NaN`` values were not being parsed (:issue:`13320`)



Expand Down
8 changes: 5 additions & 3 deletions pandas/io/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -2226,14 +2226,16 @@ def _get_index_name(self, columns):
return index_name, orig_names, columns

def _rows_to_cols(self, content):
zipped_content = list(lib.to_object_array(content).T)

col_len = self.num_original_columns
zip_len = len(zipped_content)

if self._implicit_index:
col_len += len(self.index_col)

# see gh-13320
zipped_content = list(lib.to_object_array(
content, min_width=col_len).T)
zip_len = len(zipped_content)

if self.skip_footer < 0:
raise ValueError('skip footer cannot be negative')

Expand Down
9 changes: 0 additions & 9 deletions pandas/io/tests/parser/c_parser_only.py
Original file line number Diff line number Diff line change
Expand Up @@ -360,15 +360,6 @@ def test_raise_on_passed_int_dtype_with_nas(self):
sep=",", skipinitialspace=True,
dtype={'DOY': np.int64})

def test_na_trailing_columns(self):
    # The header names eight columns but each data row supplies only
    # five fields, so the trailing columns have no values at all.
    data = """Date,Currenncy,Symbol,Type,Units,UnitPrice,Cost,Tax
2012-03-14,USD,AAPL,BUY,1000
2012-05-12,USD,SBUX,SELL,500"""

    parsed = self.read_csv(StringIO(data))

    # Columns that were present keep their values ...
    second_date = parsed['Date'][1]
    self.assertEqual(second_date, '2012-05-12')

    # ... while a column missing from every row is entirely null.
    unit_price_is_null = parsed['UnitPrice'].isnull()
    self.assertTrue(unit_price_is_null.all())

def test_parse_ragged_csv(self):
data = """1,2,3
1,2,3,4
Expand Down
9 changes: 9 additions & 0 deletions pandas/io/tests/parser/na_values.py
Original file line number Diff line number Diff line change
Expand Up @@ -241,3 +241,12 @@ def test_na_values_na_filter_override(self):
columns=['A', 'B'])
out = self.read_csv(StringIO(data), na_values=['B'], na_filter=False)
tm.assert_frame_equal(out, expected)

def test_na_trailing_columns(self):
    # The header names eight columns but each data row supplies only
    # five fields, so the trailing columns have no values at all.
    data = """Date,Currenncy,Symbol,Type,Units,UnitPrice,Cost,Tax
2012-03-14,USD,AAPL,BUY,1000
2012-05-12,USD,SBUX,SELL,500"""

    parsed = self.read_csv(StringIO(data))

    # Columns that were present keep their values ...
    second_date = parsed['Date'][1]
    self.assertEqual(second_date, '2012-05-12')

    # ... while a column missing from every row is entirely null.
    unit_price_is_null = parsed['UnitPrice'].isnull()
    self.assertTrue(unit_price_is_null.all())
21 changes: 19 additions & 2 deletions pandas/src/inference.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -1132,15 +1132,32 @@ def map_infer(ndarray arr, object f, bint convert=1):
return result


def to_object_array(list rows):
def to_object_array(list rows, int min_width=0):
"""
Convert a list of lists into an object array.

Parameters
----------
rows : 2-d array (N, K)
A list of lists to be converted into an array
min_width : int
The minimum width of the object array. If a list
in `rows` contains fewer than `min_width` elements,
the remaining elements in the corresponding row
will all be `NaN`.

Returns
-------
obj_array : numpy array of the object dtype
"""
cdef:
Py_ssize_t i, j, n, k, tmp
ndarray[object, ndim=2] result
list row

n = len(rows)

k = 0
k = min_width
for i from 0 <= i < n:
tmp = len(rows[i])
if tmp > k:
Expand Down
17 changes: 17 additions & 0 deletions pandas/tests/test_infer_and_convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -201,6 +201,23 @@ def test_to_object_array_tuples(self):
except ImportError:
pass

def test_to_object_array_width(self):
    # see gh-13320
    rows = [[1, 2, 3], [4, 5, 6]]

    # Without min_width the result is just the rows as an object array.
    unpadded = np.array(rows, dtype=object)
    tm.assert_numpy_array_equal(lib.to_object_array(rows), unpadded)

    # A min_width smaller than the row length leaves the result as is.
    unpadded = np.array(rows, dtype=object)
    narrow = lib.to_object_array(rows, min_width=1)
    tm.assert_numpy_array_equal(narrow, unpadded)

    # A min_width larger than the row length pads every row on the right.
    padded = np.array([[1, 2, 3, None, None],
                       [4, 5, 6, None, None]], dtype=object)
    wide = lib.to_object_array(rows, min_width=5)
    tm.assert_numpy_array_equal(wide, padded)

def test_object(self):

# GH 7431
Expand Down