Skip to content

Commit 2c6f4c6

Browse files
committed
Merge pull request #3615 from jreback/read_csv_na_values
BUG: (GH3611) Fix read_csv to correctly encode identical na_values
2 parents 0a34464 + 4bf6727 commit 2c6f4c6

File tree

3 files changed

+18
-2
lines changed

3 files changed

+18
-2
lines changed

RELEASE.rst

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -118,6 +118,8 @@ pandas 0.11.1
118118
- Fix modulo and integer division on Series, DataFrames to act similarly to ``float`` dtypes to return
119119
``np.nan`` or ``np.inf`` as appropriate (GH3590_)
120120
- Fix incorrect dtype on groupby with ``as_index=False`` (GH3610_)
121+
- Fix ``read_csv`` to correctly encode identical na_values, e.g. ``na_values=[-999.0,-999]``
122+
was failing (GH3611_)
121123

122124
.. _GH3164: https://github.com/pydata/pandas/issues/3164
123125
.. _GH2786: https://github.com/pydata/pandas/issues/2786
@@ -166,6 +168,7 @@ pandas 0.11.1
166168
.. _GH3610: https://github.com/pydata/pandas/issues/3610
167169
.. _GH3596: https://github.com/pydata/pandas/issues/3596
168170
.. _GH3435: https://github.com/pydata/pandas/issues/3435
171+
.. _GH3611: https://github.com/pydata/pandas/issues/3611
169172

170173

171174
pandas 0.11.0

pandas/io/parsers.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1715,12 +1715,14 @@ def _clean_na_values(na_values, keep_default_na=True):
17151715
else:
17161716
if not com.is_list_like(na_values):
17171717
na_values = [na_values]
1718-
na_values = set(list(na_values))
1718+
na_values = set(_stringify_na_values(na_values))
17191719
if keep_default_na:
17201720
na_values = na_values | _NA_VALUES
17211721

17221722
return na_values
17231723

1724+
def _stringify_na_values(na_values):
1725+
return [ str(x) for x in na_values ]
17241726

17251727
def _clean_index_names(columns, index_col):
17261728
if not _is_index_col(index_col):
@@ -1771,7 +1773,7 @@ def _get_empty_meta(columns, index_col, index_names):
17711773
def _get_na_values(col, na_values):
17721774
if isinstance(na_values, dict):
17731775
if col in na_values:
1774-
return set(list(na_values[col]))
1776+
return set(_stringify_na_values(list(na_values[col])))
17751777
else:
17761778
return _NA_VALUES
17771779
else:

pandas/io/tests/test_parsers.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -498,6 +498,17 @@ def test_quoting(self):
498498
df = self.read_table(StringIO(good_line_small), sep='\t')
499499
self.assert_(len(df) == 3)
500500

501+
def test_non_string_na_values(self):
    # GH3611: na_values given as numbers rather than strings were an issue.
    # The same file is read with the sentinels spelled as strings, as
    # int-then-float and as float-then-int; all three reads must agree.
    with ensure_clean('__non_string_na_values__.csv') as path:
        frame = DataFrame({'A': [-999, 2, 3], 'B': [1.2, -999, 4.5]})
        frame.to_csv(path, sep=' ', index=False)

        sentinel_variants = (
            ['-999.0', '-999'],
            [-999, -999.0],
            [-999.0, -999],
        )
        from_strings, int_first, float_first = [
            read_csv(path, sep=' ', header=0, na_values=sentinels)
            for sentinels in sentinel_variants
        ]

        tm.assert_frame_equal(from_strings, int_first)
        tm.assert_frame_equal(int_first, float_first)
511+
501512
def test_custom_na_values(self):
502513
data = """A,B,C
503514
ignore,this,row

0 commit comments

Comments
 (0)