API: add dtype= option to python parser #14295

Merged · 24 commits · Nov 26, 2016

10 changes: 6 additions & 4 deletions doc/source/io.rst
@@ -157,6 +157,9 @@ dtype : Type name or dict of column -> type, default ``None``
Data type for data or columns. E.g. ``{'a': np.float64, 'b': np.int32}``
(unsupported with ``engine='python'``). Use `str` or `object` to preserve and
not interpret dtype.

.. versionadded:: 0.20.0 support for the Python parser.

engine : {``'c'``, ``'python'``}
Parser engine to use. The C engine is faster while the python engine is
currently more feature-complete.
@@ -473,10 +476,9 @@ However, if you wanted all the data to be coerced, no matter the type, then
using the ``converters`` argument of :func:`~pandas.read_csv` would certainly be
worth trying.
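A minimal sketch of that ``converters`` approach (the data is illustrative); routing every value through ``str`` preserves mixed values as strings instead of a lossy numeric coercion:

    from io import StringIO
    import pandas as pd

    data = "col_1\n1\n2\n'A'\n4.22"

    # Every value in col_1 goes through str, so ints, floats, and
    # quoted text all come back as strings.
    df = pd.read_csv(StringIO(data), converters={'col_1': str})
    df['col_1'].apply(type).value_counts()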

.. note::
The ``dtype`` option is currently only supported by the C engine.
Specifying ``dtype`` with ``engine`` other than 'c' raises a
``ValueError``.
.. versionadded:: 0.20.0 support for the Python parser.

The ``dtype`` option is supported by the 'python' engine.
Contributor: I think you need a blank line here (to avoid warnings)

.. note::
In some cases, reading in abnormal data with columns containing mixed dtypes
9 changes: 9 additions & 0 deletions doc/source/whatsnew/v0.20.0.txt
@@ -22,8 +22,17 @@ New features
~~~~~~~~~~~~


``read_csv`` supports ``dtype`` keyword for python engine
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

The ``dtype`` keyword argument in the :func:`read_csv` function for specifying the types of parsed columns
is now supported with the ``'python'`` engine (:issue:`14295`). See the :ref:`io docs <io.dtypes>` for more information.

.. ipython:: python

data = "a,b\n1,2\n3,4"
pd.read_csv(StringIO(data), engine='python').dtypes
pd.read_csv(StringIO(data), engine='python', dtype={'a':'float64', 'b':'object'}).dtypes

.. _whatsnew_0200.enhancements.other:

132 changes: 108 additions & 24 deletions pandas/io/parsers.py
@@ -17,11 +17,15 @@
zip, string_types, map, u)
from pandas.types.common import (is_integer, _ensure_object,
is_list_like, is_integer_dtype,
is_float,
is_scalar)
is_float, is_dtype_equal,
is_object_dtype,
is_scalar, is_categorical_dtype)
from pandas.types.missing import isnull
from pandas.types.cast import _astype_nansafe
from pandas.core.index import Index, MultiIndex, RangeIndex
from pandas.core.series import Series
from pandas.core.frame import DataFrame
from pandas.core.categorical import Categorical
from pandas.core.common import AbstractMethodError
from pandas.core.config import get_option
from pandas.io.date_converters import generic_parser
@@ -111,8 +115,9 @@
are duplicate names in the columns.
dtype : Type name or dict of column -> type, default None
Data type for data or columns. E.g. {'a': np.float64, 'b': np.int32}
(Unsupported with engine='python'). Use `str` or `object` to preserve and
not interpret dtype.
Use `str` or `object` to preserve and not interpret dtype.
If converters are specified, they will be applied INSTEAD
of dtype conversion.
%s
converters : dict, default None
Dict of functions for converting values in certain columns. Keys can either
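A sketch of the precedence rule stated in the docstring above (data and column names are illustrative): when a converter and a dtype both target the same column, the parser is expected to emit a ``ParserWarning`` and apply only the converter:

    import warnings
    from io import StringIO
    import pandas as pd

    data = "a,b\n1,2\n3,4"

    with warnings.catch_warnings(record=True) as w:
        warnings.simplefilter("always")
        df = pd.read_csv(StringIO(data), engine='python',
                         dtype={'a': 'float64'}, converters={'a': str})

    df['a'].dtype  # object: the str converter was applied, not the dtype
    # w should now hold a ParserWarning describing the conflict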
Expand Down Expand Up @@ -421,6 +426,7 @@ def _read(filepath_or_buffer, kwds):
'true_values': None,
'false_values': None,
'converters': None,
'dtype': None,
'skipfooter': 0,

'keep_default_na': True,
@@ -461,7 +467,6 @@ def _read(filepath_or_buffer, kwds):
'buffer_lines': None,
'error_bad_lines': True,
'warn_bad_lines': True,
'dtype': None,
'float_precision': None
}

@@ -476,7 +481,6 @@
'buffer_lines',
'error_bad_lines',
'warn_bad_lines',
'dtype',
'float_precision',
])
_deprecated_args = set([
@@ -834,9 +838,6 @@ def _clean_options(self, options, engine):
" ignored as it is not supported by the 'python'"
" engine.").format(reason=fallback_reason,
option=arg)
if arg == 'dtype':
msg += " (Note the 'converters' option provides"\
" similar functionality.)"
raise ValueError(msg)
del result[arg]

@@ -1285,36 +1286,59 @@ def _agg_index(self, index, try_parse_dates=True):
col_na_values, col_na_fvalues = _get_na_values(
col_name, self.na_values, self.na_fvalues)

arr, _ = self._convert_types(arr, col_na_values | col_na_fvalues)
arr, _ = self._infer_types(arr, col_na_values | col_na_fvalues)
arrays.append(arr)

index = MultiIndex.from_arrays(arrays, names=self.index_names)

return index

def _convert_to_ndarrays(self, dct, na_values, na_fvalues, verbose=False,
converters=None):
converters=None, dtypes=None):
result = {}
for c, values in compat.iteritems(dct):
conv_f = None if converters is None else converters.get(c, None)
if isinstance(dtypes, dict):
cast_type = dtypes.get(c, None)
else:
# single dtype or None
cast_type = dtypes

if self.na_filter:
col_na_values, col_na_fvalues = _get_na_values(
c, na_values, na_fvalues)
else:
col_na_values, col_na_fvalues = set(), set()

coerce_type = True
if conv_f is not None:
# conv_f applied to data before inference
if cast_type is not None:
warnings.warn(("Both a converter and dtype were specified "
"for column {0} - only the converter will "
"be used").format(c), ParserWarning,
stacklevel=7)

try:
values = lib.map_infer(values, conv_f)
except ValueError:
mask = lib.ismember(values, na_values).view(np.uint8)
values = lib.map_infer_mask(values, conv_f, mask)
coerce_type = False

cvals, na_count = self._convert_types(
values, set(col_na_values) | col_na_fvalues, coerce_type)
cvals, na_count = self._infer_types(
values, set(col_na_values) | col_na_fvalues,
try_num_bool=False)
else:
# skip inference if specified dtype is object
try_num_bool = not (cast_type and is_object_dtype(cast_type))

# general type inference and conversion
cvals, na_count = self._infer_types(
values, set(col_na_values) | col_na_fvalues,
try_num_bool)

# type specified in dtype param
if cast_type and not is_dtype_equal(cvals, cast_type):
cvals = self._cast_types(cvals, cast_type, c)

if issubclass(cvals.dtype.type, np.integer) and self.compact_ints:
cvals = lib.downcast_int64(
@@ -1326,7 +1350,23 @@ def _convert_to_ndarrays(self, dct, na_values, na_fvalues, verbose=False,
print('Filled %d NA values in column %s' % (na_count, str(c)))
return result

def _convert_types(self, values, na_values, try_num_bool=True):
def _infer_types(self, values, na_values, try_num_bool=True):
Member: While you are at it, can you add a docstring here?

"""
Infer types of values, possibly casting

Parameters
----------
values : ndarray
na_values : set
try_num_bool : bool, default True
try to cast values to numeric (first preference) or boolean

Returns
-------
converted : ndarray
na_count : int
"""

na_count = 0
if issubclass(values.dtype.type, (np.number, np.bool_)):
mask = lib.ismember(values, na_values)
@@ -1340,6 +1380,7 @@ def _convert_types(self, values, na_values, try_num_bool=True):
if try_num_bool:
try:
result = lib.maybe_convert_numeric(values, na_values, False)
na_count = isnull(result).sum()
except Exception:
result = values
if values.dtype == np.object_:
@@ -1356,6 +1397,38 @@ def _convert_types(self, values, na_values, try_num_bool=True):

return result, na_count
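``_infer_types`` is internal, but the ``try_num_bool`` switch is visible through the public API: requesting an object dtype skips numeric/boolean inference entirely. A sketch under that assumption, with illustrative data:

    from io import StringIO
    import pandas as pd

    data = "a,b\nTrue,1\nFalse,2"

    # Default: inference converts 'a' to bool and 'b' to int64.
    pd.read_csv(StringIO(data), engine='python').dtypes

    # dtype=object takes the try_num_bool=False path, so both
    # columns stay as raw strings in object arrays.
    pd.read_csv(StringIO(data), engine='python', dtype=object).dtypes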

def _cast_types(self, values, cast_type, column):
"""
Cast values to specified type

Parameters
----------
values : ndarray
cast_type : string or np.dtype
dtype to cast values to
column : string
column name - used only for error reporting

Returns
-------
converted : ndarray
"""

if is_categorical_dtype(cast_type):
# XXX this is for consistency with
# c-parser which parses all categories
# as strings
if not is_object_dtype(values):
values = _astype_nansafe(values, str)
values = Categorical(values)
else:
try:
values = _astype_nansafe(values, cast_type, copy=True)
except ValueError:
raise ValueError("Unable to convert column %s to "
"type %s" % (column, cast_type))
return values
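Given the ``Categorical`` branch above, a numeric column cast to ``'category'`` should end up with string categories, mirroring the C parser's behavior. An illustrative sketch:

    from io import StringIO
    import pandas as pd

    data = "a\n1\n2\n1"

    df = pd.read_csv(StringIO(data), engine='python', dtype='category')
    # categories are '1' and '2' as strings, not ints, because the
    # values are passed through _astype_nansafe(values, str) first
    df['a'].cat.categories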

def _do_date_conversions(self, names, data):
# returns data, columns
if self.parse_dates is not None:
@@ -1784,6 +1857,7 @@ def __init__(self, f, **kwds):

self.verbose = kwds['verbose']
self.converters = kwds['converters']
self.dtype = kwds['dtype']

self.compact_ints = kwds['compact_ints']
self.use_unsigned = kwds['use_unsigned']
@@ -1982,7 +2056,7 @@ def read(self, rows=None):
# DataFrame with the right metadata, even though it's length 0
names = self._maybe_dedup_names(self.orig_names)
index, columns, col_dict = _get_empty_meta(
names, self.index_col, self.index_names)
names, self.index_col, self.index_names, self.dtype)
columns = self._maybe_make_multi_index_columns(
columns, self.col_names)
return index, columns, col_dict
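Passing ``self.dtype`` through to ``_get_empty_meta`` means a header-only input now yields correctly typed empty columns. A sketch with illustrative data:

    from io import StringIO
    import pandas as pd

    data = "a,b\n"  # header only, no data rows

    df = pd.read_csv(StringIO(data), engine='python',
                     dtype={'a': 'int64', 'b': 'float64'})
    df.dtypes  # a int64, b float64, even though len(df) == 0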
@@ -2033,15 +2107,25 @@ def get_chunk(self, size=None):

def _convert_data(self, data):
# apply converters
clean_conv = {}

for col, f in compat.iteritems(self.converters):
if isinstance(col, int) and col not in self.orig_names:
col = self.orig_names[col]
clean_conv[col] = f
def _clean_mapping(mapping):
"converts col numbers to names"
clean = {}
for col, v in compat.iteritems(mapping):
if isinstance(col, int) and col not in self.orig_names:
col = self.orig_names[col]
clean[col] = v
return clean

clean_conv = _clean_mapping(self.converters)
if not isinstance(self.dtype, dict):
# handles single dtype applied to all columns
clean_dtypes = self.dtype
else:
clean_dtypes = _clean_mapping(self.dtype)

return self._convert_to_ndarrays(data, self.na_values, self.na_fvalues,
self.verbose, clean_conv)
self.verbose, clean_conv,
clean_dtypes)
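Because ``_clean_mapping`` now normalizes both mappings, ``dtype`` (like ``converters``) should accept column positions as well as names for its keys. A hedged sketch mixing the two styles:

    from io import StringIO
    import pandas as pd

    data = "a,b\n1,2\n3,4"

    # dtype keyed by position (0 -> 'a'), converters keyed by name;
    # _clean_mapping resolves the positional key to 'a' internally.
    df = pd.read_csv(StringIO(data), engine='python',
                     dtype={0: 'float64'}, converters={'b': str})
    df.dtypes  # a float64, b object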

def _to_recarray(self, data, columns):
dtypes = []