Skip to content

Commit ab7e1e8

Browse files
committed
add docs; test for conv cast
1 parent f78275f commit ab7e1e8

File tree

4 files changed

+78
-42
lines changed

4 files changed

+78
-42
lines changed

doc/source/io.rst

+5-4
Original file line numberDiff line numberDiff line change
@@ -157,6 +157,9 @@ dtype : Type name or dict of column -> type, default ``None``
157157
Data type for data or columns. E.g. ``{'a': np.float64, 'b': np.int32}``
158158
(unsupported with ``engine='python'``). Use `str` or `object` to preserve and
159159
not interpret dtype.
160+
161+
.. versionadded:: 0.20.0 support for the Python parser.
162+
160163
engine : {``'c'``, ``'python'``}
161164
Parser engine to use. The C engine is faster while the python engine is
162165
currently more feature-complete.
@@ -473,10 +476,8 @@ However, if you wanted for all the data to be coerced, no matter the type, then
473476
using the ``converters`` argument of :func:`~pandas.read_csv` would certainly be
474477
worth trying.
475478

476-
.. note::
477-
The ``dtype`` option is currently only supported by the C engine.
478-
Specifying ``dtype`` with ``engine`` other than 'c' raises a
479-
``ValueError``.
479+
.. versionadded:: 0.20.0 support for the Python parser.
480+
The ``dtype`` option is supported by the 'python' engine.
480481

481482
.. note::
482483
In some cases, reading in abnormal data with columns containing mixed dtypes

doc/source/whatsnew/v0.20.0.txt

+8
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,15 @@ Other enhancements
3131
^^^^^^^^^^^^^^^^^^
3232

3333

34+
- The ``dtype`` keyword argument in the :func:`read_csv` function for specifying the types of parsed columns
35+
is now supported with the ``'python'`` engine. See the :ref:`io docs <io.dtypes>` for more information.
3436

37+
.. ipython:: python
38+
39+
from io import StringIO
40+
data = "a,b\n1,2\n3,4"
41+
pd.read_csv(StringIO(data), engine='python').dtypes
42+
pd.read_csv(StringIO(data), engine='python', dtype={'a':'float64', 'b':'object'}).dtypes
3543

3644
.. _whatsnew_0200.api_breaking:
3745

pandas/io/parsers.py

+55-38
Original file line numberDiff line numberDiff line change
@@ -115,8 +115,11 @@
115115
dtype : Type name or dict of column -> type, default None
116116
Data type for data or columns. E.g. {'a': np.float64, 'b': np.int32}
117117
Use `str` or `object` to preserve and not interpret dtype.
118-
If converters are specified, they will be applied AFTER
119-
dtype conversion.
118+
If converters are specified, they will be applied INSTEAD
119+
of dtype conversion.
120+
121+
.. versionadded:: 0.20.0 support for the Python parser.
122+
120123
%s
121124
converters : dict, default None
122125
Dict of functions for converting values in certain columns. Keys can either
@@ -1292,20 +1295,6 @@ def _agg_index(self, index, try_parse_dates=True):
12921295

12931296
return index
12941297

1295-
def _apply_converter(self, values, conv_f, na_values, col_na_values,
1296-
col_na_fvalues):
1297-
""" apply converter function to values, respecting NAs """
1298-
try:
1299-
values = lib.map_infer(values, conv_f)
1300-
except ValueError:
1301-
mask = lib.ismember(values, na_values).view(np.uint8)
1302-
values = lib.map_infer_mask(values, conv_f, mask)
1303-
1304-
cvals, na_count = self._infer_types(
1305-
values, set(col_na_values) | col_na_fvalues,
1306-
try_num_bool=False)
1307-
return cvals, na_count
1308-
13091298
def _convert_to_ndarrays(self, dct, na_values, na_fvalues, verbose=False,
13101299
converters=None, dtypes=None):
13111300
result = {}
@@ -1323,45 +1312,58 @@ def _convert_to_ndarrays(self, dct, na_values, na_fvalues, verbose=False,
13231312
else:
13241313
col_na_values, col_na_fvalues = set(), set()
13251314

1326-
if conv_f is not None and cast_type is None:
1327-
# if type is not specified, apply the conversion first, without
1328-
# inference
1329-
cvals, na_count = self._apply_converter(
1330-
values, conv_f, na_values,
1331-
col_na_values, col_na_fvalues)
1315+
if conv_f is not None:
1316+
# conv_f applied to data before inference
1317+
# dtype isn't used if a converted specified
1318+
try:
1319+
values = lib.map_infer(values, conv_f)
1320+
except ValueError:
1321+
mask = lib.ismember(values, na_values).view(np.uint8)
1322+
values = lib.map_infer_mask(values, conv_f, mask)
1323+
1324+
cvals, na_count = self._infer_types(
1325+
values, set(col_na_values) | col_na_fvalues,
1326+
try_num_bool=False)
13321327
else:
1333-
try_num_bool = True
1334-
if cast_type and is_object_dtype(cast_type):
1335-
# skip inference if specified dtype is object
1336-
try_num_bool = False
1328+
# skip inference if specified dtype is object
1329+
try_num_bool = not (cast_type and is_object_dtype(cast_type))
13371330

13381331
# general type inference and conversion
13391332
cvals, na_count = self._infer_types(
13401333
values, set(col_na_values) | col_na_fvalues,
13411334
try_num_bool)
13421335

1336+
# type specified in dtype param
1337+
if cast_type and not is_dtype_equal(cvals, cast_type):
1338+
cvals = self._cast_types(cvals, cast_type, c)
1339+
13431340
if issubclass(cvals.dtype.type, np.integer) and self.compact_ints:
13441341
cvals = lib.downcast_int64(
13451342
cvals, _parser.na_values,
13461343
self.use_unsigned)
13471344

1348-
if cast_type and not is_dtype_equal(cvals, cast_type):
1349-
# type specificed in dtype param
1350-
1351-
cvals = self._cast_types(cvals, cast_type, c)
1352-
# for consistency with c-parser, if a converter and dtype are
1353-
# specified, apply the converter last
1354-
if conv_f is not None:
1355-
values, na_count = self._apply_converter(
1356-
values, conv_f, na_values,
1357-
col_na_values, col_na_fvalues)
1358-
13591345
result[c] = cvals
13601346
if verbose and na_count:
13611347
print('Filled %d NA values in column %s' % (na_count, str(c)))
13621348
return result
13631349

13641350
def _infer_types(self, values, na_values, try_num_bool=True):
1351+
"""
1352+
Infer types of values, possibly casting
1353+
1354+
Parameters
1355+
----------
1356+
values : ndarray
1357+
na_values : set
1358+
try_num_bool : bool, default True
1359+
try to cast values to numeric (first preference) or boolean
1360+
1361+
Returns
1362+
-------
1363+
converted : ndarray
1364+
na_count : int
1365+
"""
1366+
13651367
na_count = 0
13661368
if issubclass(values.dtype.type, (np.number, np.bool_)):
13671369
mask = lib.ismember(values, na_values)
@@ -1393,7 +1395,22 @@ def _infer_types(self, values, na_values, try_num_bool=True):
13931395
return result, na_count
13941396

13951397
def _cast_types(self, values, cast_type, column):
1396-
""" cast column to type specified in dtypes= param """
1398+
"""
1399+
Cast values to specified type
1400+
1401+
Parameters
1402+
----------
1403+
values : ndarray
1404+
cast_type : string or np.dtype
1405+
dtype to cast values to
1406+
column : string
1407+
column name - used only for error reporting
1408+
1409+
Returns
1410+
-------
1411+
converted : ndarray
1412+
"""
1413+
13971414
if is_categorical_dtype(cast_type):
13981415
# XXX this is for consistency with
13991416
# c-parser which parses all categories

pandas/io/tests/parser/dtypes.py

+10
Original file line numberDiff line numberDiff line change
@@ -214,3 +214,13 @@ def test_raise_on_passed_int_dtype_with_nas(self):
214214
self.assertRaises(ValueError, self.read_csv, StringIO(data),
215215
sep=",", skipinitialspace=True,
216216
dtype={'DOY': np.int64})
217+
218+
def test_dtype_with_converter(self):
219+
data = """a,b
220+
1.1,2.2
221+
1.2,2.3"""
222+
result = self.read_csv(StringIO(data), dtype={'a': 'i8'},
223+
converters={'a': lambda x: str(x)})
224+
# dtype spec ignored if a converter is specified
225+
expected = DataFrame({'a': ['1.1', '1.2'], 'b': [2.2, 2.3]})
226+
tm.assert_frame_equal(result, expected)

0 commit comments

Comments
 (0)