diff --git a/doc/source/io.rst b/doc/source/io.rst index ee319092c6dd5..b1c151def26af 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -157,6 +157,9 @@ dtype : Type name or dict of column -> type, default ``None`` Data type for data or columns. E.g. ``{'a': np.float64, 'b': np.int32}`` (unsupported with ``engine='python'``). Use `str` or `object` to preserve and not interpret dtype. + + .. versionadded:: 0.20.0 support for the Python parser. + engine : {``'c'``, ``'python'``} Parser engine to use. The C engine is faster while the python engine is currently more feature-complete. @@ -473,10 +476,9 @@ However, if you wanted for all the data to be coerced, no matter the type, then using the ``converters`` argument of :func:`~pandas.read_csv` would certainly be worth trying. -.. note:: - The ``dtype`` option is currently only supported by the C engine. - Specifying ``dtype`` with ``engine`` other than 'c' raises a - ``ValueError``. + .. versionadded:: 0.20.0 support for the Python parser. + + The ``dtype`` option is supported by the 'python' engine. .. note:: In some cases, reading in abnormal data with columns containing mixed dtypes diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 65b62601c7022..6e3559bee728d 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -22,8 +22,17 @@ New features ~~~~~~~~~~~~ +``read_csv`` supports ``dtype`` keyword for python engine +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +The ``dtype`` keyword argument in the :func:`read_csv` function for specifying the types of parsed columns + is now supported with the ``'python'`` engine (:issue:`14295`). See the :ref:`io docs <io.dtypes>` for more information. +.. ipython:: python + + data = "a,b\n1,2\n3,4" + pd.read_csv(StringIO(data), engine='python').dtypes + pd.read_csv(StringIO(data), engine='python', dtype={'a':'float64', 'b':'object'}).dtypes ..
_whatsnew_0200.enhancements.other: diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 929b360854d5b..0736535ce2d67 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -17,11 +17,15 @@ zip, string_types, map, u) from pandas.types.common import (is_integer, _ensure_object, is_list_like, is_integer_dtype, - is_float, - is_scalar) + is_float, is_dtype_equal, + is_object_dtype, + is_scalar, is_categorical_dtype) +from pandas.types.missing import isnull +from pandas.types.cast import _astype_nansafe from pandas.core.index import Index, MultiIndex, RangeIndex from pandas.core.series import Series from pandas.core.frame import DataFrame +from pandas.core.categorical import Categorical from pandas.core.common import AbstractMethodError from pandas.core.config import get_option from pandas.io.date_converters import generic_parser @@ -111,8 +115,9 @@ are duplicate names in the columns. dtype : Type name or dict of column -> type, default None Data type for data or columns. E.g. {'a': np.float64, 'b': np.int32} - (Unsupported with engine='python'). Use `str` or `object` to preserve and - not interpret dtype. + Use `str` or `object` to preserve and not interpret dtype. + If converters are specified, they will be applied INSTEAD + of dtype conversion. %s converters : dict, default None Dict of functions for converting values in certain columns. 
Keys can either @@ -421,6 +426,7 @@ def _read(filepath_or_buffer, kwds): 'true_values': None, 'false_values': None, 'converters': None, + 'dtype': None, 'skipfooter': 0, 'keep_default_na': True, @@ -461,7 +467,6 @@ def _read(filepath_or_buffer, kwds): 'buffer_lines': None, 'error_bad_lines': True, 'warn_bad_lines': True, - 'dtype': None, 'float_precision': None } @@ -476,7 +481,6 @@ def _read(filepath_or_buffer, kwds): 'buffer_lines', 'error_bad_lines', 'warn_bad_lines', - 'dtype', 'float_precision', ]) _deprecated_args = set([ @@ -834,9 +838,6 @@ def _clean_options(self, options, engine): " ignored as it is not supported by the 'python'" " engine.").format(reason=fallback_reason, option=arg) - if arg == 'dtype': - msg += " (Note the 'converters' option provides"\ - " similar functionality.)" raise ValueError(msg) del result[arg] @@ -1285,7 +1286,7 @@ def _agg_index(self, index, try_parse_dates=True): col_na_values, col_na_fvalues = _get_na_values( col_name, self.na_values, self.na_fvalues) - arr, _ = self._convert_types(arr, col_na_values | col_na_fvalues) + arr, _ = self._infer_types(arr, col_na_values | col_na_fvalues) arrays.append(arr) index = MultiIndex.from_arrays(arrays, names=self.index_names) @@ -1293,10 +1294,15 @@ def _agg_index(self, index, try_parse_dates=True): return index def _convert_to_ndarrays(self, dct, na_values, na_fvalues, verbose=False, - converters=None): + converters=None, dtypes=None): result = {} for c, values in compat.iteritems(dct): conv_f = None if converters is None else converters.get(c, None) + if isinstance(dtypes, dict): + cast_type = dtypes.get(c, None) + else: + # single dtype or None + cast_type = dtypes if self.na_filter: col_na_values, col_na_fvalues = _get_na_values( @@ -1304,17 +1310,35 @@ def _convert_to_ndarrays(self, dct, na_values, na_fvalues, verbose=False, else: col_na_values, col_na_fvalues = set(), set() - coerce_type = True if conv_f is not None: + # conv_f applied to data before inference + if cast_type is not 
None: + warnings.warn(("Both a converter and dtype were specified " + "for column {0} - only the converter will " + "be used").format(c), ParserWarning, + stacklevel=7) + try: values = lib.map_infer(values, conv_f) except ValueError: mask = lib.ismember(values, na_values).view(np.uint8) values = lib.map_infer_mask(values, conv_f, mask) - coerce_type = False - cvals, na_count = self._convert_types( - values, set(col_na_values) | col_na_fvalues, coerce_type) + cvals, na_count = self._infer_types( + values, set(col_na_values) | col_na_fvalues, + try_num_bool=False) + else: + # skip inference if specified dtype is object + try_num_bool = not (cast_type and is_object_dtype(cast_type)) + + # general type inference and conversion + cvals, na_count = self._infer_types( + values, set(col_na_values) | col_na_fvalues, + try_num_bool) + + # type specified in dtype param + if cast_type and not is_dtype_equal(cvals, cast_type): + cvals = self._cast_types(cvals, cast_type, c) if issubclass(cvals.dtype.type, np.integer) and self.compact_ints: cvals = lib.downcast_int64( @@ -1326,7 +1350,23 @@ def _convert_to_ndarrays(self, dct, na_values, na_fvalues, verbose=False, print('Filled %d NA values in column %s' % (na_count, str(c))) return result - def _convert_types(self, values, na_values, try_num_bool=True): + def _infer_types(self, values, na_values, try_num_bool=True): + """ + Infer types of values, possibly casting + + Parameters + ---------- + values : ndarray + na_values : set + try_num_bool : bool, default True + try to cast values to numeric (first preference) or boolean + + Returns + ------- + converted : ndarray + na_count : int + """ + na_count = 0 if issubclass(values.dtype.type, (np.number, np.bool_)): mask = lib.ismember(values, na_values) @@ -1340,6 +1380,7 @@ def _convert_types(self, values, na_values, try_num_bool=True): if try_num_bool: try: result = lib.maybe_convert_numeric(values, na_values, False) + na_count = isnull(result).sum() except Exception: result = 
values if values.dtype == np.object_: @@ -1356,6 +1397,38 @@ def _convert_types(self, values, na_values, try_num_bool=True): return result, na_count + def _cast_types(self, values, cast_type, column): + """ + Cast values to specified type + + Parameters + ---------- + values : ndarray + cast_type : string or np.dtype + dtype to cast values to + column : string + column name - used only for error reporting + + Returns + ------- + converted : ndarray + """ + + if is_categorical_dtype(cast_type): + # XXX this is for consistency with + # c-parser which parses all categories + # as strings + if not is_object_dtype(values): + values = _astype_nansafe(values, str) + values = Categorical(values) + else: + try: + values = _astype_nansafe(values, cast_type, copy=True) + except ValueError: + raise ValueError("Unable to convert column %s to " + "type %s" % (column, cast_type)) + return values + def _do_date_conversions(self, names, data): # returns data, columns if self.parse_dates is not None: @@ -1784,6 +1857,7 @@ def __init__(self, f, **kwds): self.verbose = kwds['verbose'] self.converters = kwds['converters'] + self.dtype = kwds['dtype'] self.compact_ints = kwds['compact_ints'] self.use_unsigned = kwds['use_unsigned'] @@ -1982,7 +2056,7 @@ def read(self, rows=None): # DataFrame with the right metadata, even though it's length 0 names = self._maybe_dedup_names(self.orig_names) index, columns, col_dict = _get_empty_meta( - names, self.index_col, self.index_names) + names, self.index_col, self.index_names, self.dtype) columns = self._maybe_make_multi_index_columns( columns, self.col_names) return index, columns, col_dict @@ -2033,15 +2107,25 @@ def get_chunk(self, size=None): def _convert_data(self, data): # apply converters - clean_conv = {} - - for col, f in compat.iteritems(self.converters): - if isinstance(col, int) and col not in self.orig_names: - col = self.orig_names[col] - clean_conv[col] = f + def _clean_mapping(mapping): + "converts col numbers to names" + clean = 
{} + for col, v in compat.iteritems(mapping): + if isinstance(col, int) and col not in self.orig_names: + col = self.orig_names[col] + clean[col] = v + return clean + + clean_conv = _clean_mapping(self.converters) + if not isinstance(self.dtype, dict): + # handles single dtype applied to all columns + clean_dtypes = self.dtype + else: + clean_dtypes = _clean_mapping(self.dtype) return self._convert_to_ndarrays(data, self.na_values, self.na_fvalues, - self.verbose, clean_conv) + self.verbose, clean_conv, + clean_dtypes) def _to_recarray(self, data, columns): dtypes = [] diff --git a/pandas/io/tests/parser/c_parser_only.py b/pandas/io/tests/parser/c_parser_only.py index 9cbe88d4032a3..c781b0549ee60 100644 --- a/pandas/io/tests/parser/c_parser_only.py +++ b/pandas/io/tests/parser/c_parser_only.py @@ -12,10 +12,9 @@ import pandas as pd import pandas.util.testing as tm -from pandas import DataFrame, Series, Index, MultiIndex, Categorical +from pandas import DataFrame from pandas import compat from pandas.compat import StringIO, range, lrange -from pandas.types.dtypes import CategoricalDtype class CParserTests(object): @@ -100,29 +99,13 @@ def test_dtype_and_names_error(self): self.read_csv(StringIO(data), sep=r'\s+', header=None, names=['a', 'b'], dtype={'a': np.int32}) - def test_passing_dtype(self): - # see gh-6607 + def test_unsupported_dtype(self): df = DataFrame(np.random.rand(5, 2), columns=list( 'AB'), index=['1A', '1B', '1C', '1D', '1E']) - with tm.ensure_clean('__passing_str_as_dtype__.csv') as path: + with tm.ensure_clean('__unsupported_dtype__.csv') as path: df.to_csv(path) - # see gh-3795: passing 'str' as the dtype - result = self.read_csv(path, dtype=str, index_col=0) - tm.assert_series_equal(result.dtypes, Series( - {'A': 'object', 'B': 'object'})) - - # we expect all object columns, so need to - # convert to test for equivalence - result = result.astype(float) - tm.assert_frame_equal(result, df) - - # invalid dtype - self.assertRaises(TypeError, 
self.read_csv, path, - dtype={'A': 'foo', 'B': 'float64'}, - index_col=0) - # valid but we don't support it (date) self.assertRaises(TypeError, self.read_csv, path, dtype={'A': 'datetime64', 'B': 'float64'}, @@ -141,11 +124,6 @@ def test_passing_dtype(self): dtype={'A': 'U8'}, index_col=0) - # see gh-12048: empty frame - actual = self.read_csv(StringIO('A,B'), dtype=str) - expected = DataFrame({'A': [], 'B': []}, index=[], dtype=str) - tm.assert_frame_equal(actual, expected) - def test_precise_conversion(self): # see gh-8002 tm._skip_if_32bit() @@ -178,104 +156,6 @@ def error(val): self.assertTrue(sum(precise_errors) <= sum(normal_errors)) self.assertTrue(max(precise_errors) <= max(normal_errors)) - def test_pass_dtype(self): - data = """\ -one,two -1,2.5 -2,3.5 -3,4.5 -4,5.5""" - - result = self.read_csv(StringIO(data), dtype={'one': 'u1', 1: 'S1'}) - self.assertEqual(result['one'].dtype, 'u1') - self.assertEqual(result['two'].dtype, 'object') - - def test_categorical_dtype(self): - # GH 10153 - data = """a,b,c -1,a,3.4 -1,a,3.4 -2,b,4.5""" - expected = pd.DataFrame({'a': Categorical(['1', '1', '2']), - 'b': Categorical(['a', 'a', 'b']), - 'c': Categorical(['3.4', '3.4', '4.5'])}) - actual = self.read_csv(StringIO(data), dtype='category') - tm.assert_frame_equal(actual, expected) - - actual = self.read_csv(StringIO(data), dtype=CategoricalDtype()) - tm.assert_frame_equal(actual, expected) - - actual = self.read_csv(StringIO(data), dtype={'a': 'category', - 'b': 'category', - 'c': CategoricalDtype()}) - tm.assert_frame_equal(actual, expected) - - actual = self.read_csv(StringIO(data), dtype={'b': 'category'}) - expected = pd.DataFrame({'a': [1, 1, 2], - 'b': Categorical(['a', 'a', 'b']), - 'c': [3.4, 3.4, 4.5]}) - tm.assert_frame_equal(actual, expected) - - actual = self.read_csv(StringIO(data), dtype={1: 'category'}) - tm.assert_frame_equal(actual, expected) - - # unsorted - data = """a,b,c -1,b,3.4 -1,b,3.4 -2,a,4.5""" - expected = pd.DataFrame({'a': 
Categorical(['1', '1', '2']), - 'b': Categorical(['b', 'b', 'a']), - 'c': Categorical(['3.4', '3.4', '4.5'])}) - actual = self.read_csv(StringIO(data), dtype='category') - tm.assert_frame_equal(actual, expected) - - # missing - data = """a,b,c -1,b,3.4 -1,nan,3.4 -2,a,4.5""" - expected = pd.DataFrame({'a': Categorical(['1', '1', '2']), - 'b': Categorical(['b', np.nan, 'a']), - 'c': Categorical(['3.4', '3.4', '4.5'])}) - actual = self.read_csv(StringIO(data), dtype='category') - tm.assert_frame_equal(actual, expected) - - def test_categorical_dtype_encoding(self): - # GH 10153 - pth = tm.get_data_path('unicode_series.csv') - encoding = 'latin-1' - expected = self.read_csv(pth, header=None, encoding=encoding) - expected[1] = Categorical(expected[1]) - actual = self.read_csv(pth, header=None, encoding=encoding, - dtype={1: 'category'}) - tm.assert_frame_equal(actual, expected) - - pth = tm.get_data_path('utf16_ex.txt') - encoding = 'utf-16' - expected = self.read_table(pth, encoding=encoding) - expected = expected.apply(Categorical) - actual = self.read_table(pth, encoding=encoding, dtype='category') - tm.assert_frame_equal(actual, expected) - - def test_categorical_dtype_chunksize(self): - # GH 10153 - data = """a,b -1,a -1,b -1,b -2,c""" - expecteds = [pd.DataFrame({'a': [1, 1], - 'b': Categorical(['a', 'b'])}), - pd.DataFrame({'a': [1, 2], - 'b': Categorical(['b', 'c'])}, - index=[2, 3])] - actuals = self.read_csv(StringIO(data), dtype={'b': 'category'}, - chunksize=2) - - for actual, expected in zip(actuals, expecteds): - tm.assert_frame_equal(actual, expected) - def test_pass_dtype_as_recarray(self): if compat.is_platform_windows() and self.low_memory: raise nose.SkipTest( @@ -295,66 +175,6 @@ def test_pass_dtype_as_recarray(self): self.assertEqual(result['one'].dtype, 'u1') self.assertEqual(result['two'].dtype, 'S1') - def test_empty_pass_dtype(self): - data = 'one,two' - result = self.read_csv(StringIO(data), dtype={'one': 'u1'}) - - expected = 
DataFrame({'one': np.empty(0, dtype='u1'), - 'two': np.empty(0, dtype=np.object)}) - tm.assert_frame_equal(result, expected, check_index_type=False) - - def test_empty_with_index_pass_dtype(self): - data = 'one,two' - result = self.read_csv(StringIO(data), index_col=['one'], - dtype={'one': 'u1', 1: 'f'}) - - expected = DataFrame({'two': np.empty(0, dtype='f')}, - index=Index([], dtype='u1', name='one')) - tm.assert_frame_equal(result, expected, check_index_type=False) - - def test_empty_with_multiindex_pass_dtype(self): - data = 'one,two,three' - result = self.read_csv(StringIO(data), index_col=['one', 'two'], - dtype={'one': 'u1', 1: 'f8'}) - - exp_idx = MultiIndex.from_arrays([np.empty(0, dtype='u1'), - np.empty(0, dtype='O')], - names=['one', 'two']) - expected = DataFrame( - {'three': np.empty(0, dtype=np.object)}, index=exp_idx) - tm.assert_frame_equal(result, expected, check_index_type=False) - - def test_empty_with_mangled_column_pass_dtype_by_names(self): - data = 'one,one' - result = self.read_csv(StringIO(data), dtype={ - 'one': 'u1', 'one.1': 'f'}) - - expected = DataFrame( - {'one': np.empty(0, dtype='u1'), 'one.1': np.empty(0, dtype='f')}) - tm.assert_frame_equal(result, expected, check_index_type=False) - - def test_empty_with_mangled_column_pass_dtype_by_indexes(self): - data = 'one,one' - result = self.read_csv(StringIO(data), dtype={0: 'u1', 1: 'f'}) - - expected = DataFrame( - {'one': np.empty(0, dtype='u1'), 'one.1': np.empty(0, dtype='f')}) - tm.assert_frame_equal(result, expected, check_index_type=False) - - def test_empty_with_dup_column_pass_dtype_by_indexes(self): - # see gh-9424 - expected = pd.concat([Series([], name='one', dtype='u1'), - Series([], name='one.1', dtype='f')], axis=1) - - data = 'one,one' - result = self.read_csv(StringIO(data), dtype={0: 'u1', 1: 'f'}) - tm.assert_frame_equal(result, expected, check_index_type=False) - - data = '' - result = self.read_csv(StringIO(data), names=['one', 'one'], - dtype={0: 'u1', 1: 'f'}) - 
tm.assert_frame_equal(result, expected, check_index_type=False) - def test_usecols_dtypes(self): data = """\ 1,2,3 @@ -400,16 +220,6 @@ def test_custom_lineterminator(self): tm.assert_frame_equal(result, expected) - def test_raise_on_passed_int_dtype_with_nas(self): - # see gh-2631 - data = """YEAR, DOY, a -2001,106380451,10 -2001,,11 -2001,106380451,67""" - self.assertRaises(ValueError, self.read_csv, StringIO(data), - sep=",", skipinitialspace=True, - dtype={'DOY': np.int64}) - def test_parse_ragged_csv(self): data = """1,2,3 1,2,3,4 @@ -561,49 +371,3 @@ def test_internal_null_byte(self): result = self.read_csv(StringIO(data), names=names) tm.assert_frame_equal(result, expected) - - def test_empty_dtype(self): - # see gh-14712 - data = 'a,b' - - expected = pd.DataFrame(columns=['a', 'b'], dtype=np.float64) - result = self.read_csv(StringIO(data), header=0, dtype=np.float64) - tm.assert_frame_equal(result, expected) - - expected = pd.DataFrame({'a': pd.Categorical([]), - 'b': pd.Categorical([])}, - index=[]) - result = self.read_csv(StringIO(data), header=0, - dtype='category') - tm.assert_frame_equal(result, expected) - - expected = pd.DataFrame(columns=['a', 'b'], dtype='datetime64[ns]') - result = self.read_csv(StringIO(data), header=0, - dtype='datetime64[ns]') - tm.assert_frame_equal(result, expected) - - expected = pd.DataFrame({'a': pd.Series([], dtype='timedelta64[ns]'), - 'b': pd.Series([], dtype='timedelta64[ns]')}, - index=[]) - result = self.read_csv(StringIO(data), header=0, - dtype='timedelta64[ns]') - tm.assert_frame_equal(result, expected) - - expected = pd.DataFrame(columns=['a', 'b']) - expected['a'] = expected['a'].astype(np.float64) - result = self.read_csv(StringIO(data), header=0, - dtype={'a': np.float64}) - tm.assert_frame_equal(result, expected) - - expected = pd.DataFrame(columns=['a', 'b']) - expected['a'] = expected['a'].astype(np.float64) - result = self.read_csv(StringIO(data), header=0, - dtype={0: np.float64}) - 
tm.assert_frame_equal(result, expected) - - expected = pd.DataFrame(columns=['a', 'b']) - expected['a'] = expected['a'].astype(np.int32) - expected['b'] = expected['b'].astype(np.float64) - result = self.read_csv(StringIO(data), header=0, - dtype={'a': np.int32, 1: np.float64}) - tm.assert_frame_equal(result, expected) diff --git a/pandas/io/tests/parser/dtypes.py b/pandas/io/tests/parser/dtypes.py new file mode 100644 index 0000000000000..18c37b31f6480 --- /dev/null +++ b/pandas/io/tests/parser/dtypes.py @@ -0,0 +1,274 @@ +# -*- coding: utf-8 -*- + +""" +Tests dtype specification during parsing +for all of the parsers defined in parsers.py +""" + +import numpy as np +import pandas as pd +import pandas.util.testing as tm + +from pandas import DataFrame, Series, Index, MultiIndex, Categorical +from pandas.compat import StringIO +from pandas.types.dtypes import CategoricalDtype +from pandas.io.common import ParserWarning + + +class DtypeTests(object): + def test_passing_dtype(self): + # see gh-6607 + df = DataFrame(np.random.rand(5, 2).round(4), columns=list( + 'AB'), index=['1A', '1B', '1C', '1D', '1E']) + + with tm.ensure_clean('__passing_str_as_dtype__.csv') as path: + df.to_csv(path) + + # see gh-3795: passing 'str' as the dtype + result = self.read_csv(path, dtype=str, index_col=0) + expected = df.astype(str) + tm.assert_frame_equal(result, expected) + + # for parsing, interpret object as str + result = self.read_csv(path, dtype=object, index_col=0) + tm.assert_frame_equal(result, expected) + + # we expect all object columns, so need to + # convert to test for equivalence + result = result.astype(float) + tm.assert_frame_equal(result, df) + + # invalid dtype + self.assertRaises(TypeError, self.read_csv, path, + dtype={'A': 'foo', 'B': 'float64'}, + index_col=0) + + # see gh-12048: empty frame + actual = self.read_csv(StringIO('A,B'), dtype=str) + expected = DataFrame({'A': [], 'B': []}, index=[], dtype=str) + tm.assert_frame_equal(actual, expected) + + def 
test_pass_dtype(self): + data = """\ +one,two +1,2.5 +2,3.5 +3,4.5 +4,5.5""" + + result = self.read_csv(StringIO(data), dtype={'one': 'u1', 1: 'S1'}) + self.assertEqual(result['one'].dtype, 'u1') + self.assertEqual(result['two'].dtype, 'object') + + def test_categorical_dtype(self): + # GH 10153 + data = """a,b,c +1,a,3.4 +1,a,3.4 +2,b,4.5""" + expected = pd.DataFrame({'a': Categorical(['1', '1', '2']), + 'b': Categorical(['a', 'a', 'b']), + 'c': Categorical(['3.4', '3.4', '4.5'])}) + actual = self.read_csv(StringIO(data), dtype='category') + tm.assert_frame_equal(actual, expected) + + actual = self.read_csv(StringIO(data), dtype=CategoricalDtype()) + tm.assert_frame_equal(actual, expected) + + actual = self.read_csv(StringIO(data), dtype={'a': 'category', + 'b': 'category', + 'c': CategoricalDtype()}) + tm.assert_frame_equal(actual, expected) + + actual = self.read_csv(StringIO(data), dtype={'b': 'category'}) + expected = pd.DataFrame({'a': [1, 1, 2], + 'b': Categorical(['a', 'a', 'b']), + 'c': [3.4, 3.4, 4.5]}) + tm.assert_frame_equal(actual, expected) + + actual = self.read_csv(StringIO(data), dtype={1: 'category'}) + tm.assert_frame_equal(actual, expected) + + # unsorted + data = """a,b,c +1,b,3.4 +1,b,3.4 +2,a,4.5""" + expected = pd.DataFrame({'a': Categorical(['1', '1', '2']), + 'b': Categorical(['b', 'b', 'a']), + 'c': Categorical(['3.4', '3.4', '4.5'])}) + actual = self.read_csv(StringIO(data), dtype='category') + tm.assert_frame_equal(actual, expected) + + # missing + data = """a,b,c +1,b,3.4 +1,nan,3.4 +2,a,4.5""" + expected = pd.DataFrame({'a': Categorical(['1', '1', '2']), + 'b': Categorical(['b', np.nan, 'a']), + 'c': Categorical(['3.4', '3.4', '4.5'])}) + actual = self.read_csv(StringIO(data), dtype='category') + tm.assert_frame_equal(actual, expected) + + def test_categorical_dtype_encoding(self): + # GH 10153 + pth = tm.get_data_path('unicode_series.csv') + encoding = 'latin-1' + expected = self.read_csv(pth, header=None, encoding=encoding) + 
expected[1] = Categorical(expected[1]) + actual = self.read_csv(pth, header=None, encoding=encoding, + dtype={1: 'category'}) + tm.assert_frame_equal(actual, expected) + + pth = tm.get_data_path('utf16_ex.txt') + encoding = 'utf-16' + expected = self.read_table(pth, encoding=encoding) + expected = expected.apply(Categorical) + actual = self.read_table(pth, encoding=encoding, dtype='category') + tm.assert_frame_equal(actual, expected) + + def test_categorical_dtype_chunksize(self): + # GH 10153 + data = """a,b +1,a +1,b +1,b +2,c""" + expecteds = [pd.DataFrame({'a': [1, 1], + 'b': Categorical(['a', 'b'])}), + pd.DataFrame({'a': [1, 2], + 'b': Categorical(['b', 'c'])}, + index=[2, 3])] + actuals = self.read_csv(StringIO(data), dtype={'b': 'category'}, + chunksize=2) + + for actual, expected in zip(actuals, expecteds): + tm.assert_frame_equal(actual, expected) + + def test_empty_pass_dtype(self): + data = 'one,two' + result = self.read_csv(StringIO(data), dtype={'one': 'u1'}) + + expected = DataFrame({'one': np.empty(0, dtype='u1'), + 'two': np.empty(0, dtype=np.object)}) + tm.assert_frame_equal(result, expected, check_index_type=False) + + def test_empty_with_index_pass_dtype(self): + data = 'one,two' + result = self.read_csv(StringIO(data), index_col=['one'], + dtype={'one': 'u1', 1: 'f'}) + + expected = DataFrame({'two': np.empty(0, dtype='f')}, + index=Index([], dtype='u1', name='one')) + tm.assert_frame_equal(result, expected, check_index_type=False) + + def test_empty_with_multiindex_pass_dtype(self): + data = 'one,two,three' + result = self.read_csv(StringIO(data), index_col=['one', 'two'], + dtype={'one': 'u1', 1: 'f8'}) + + exp_idx = MultiIndex.from_arrays([np.empty(0, dtype='u1'), + np.empty(0, dtype='O')], + names=['one', 'two']) + expected = DataFrame( + {'three': np.empty(0, dtype=np.object)}, index=exp_idx) + tm.assert_frame_equal(result, expected, check_index_type=False) + + def test_empty_with_mangled_column_pass_dtype_by_names(self): + data = 
'one,one' + result = self.read_csv(StringIO(data), dtype={ + 'one': 'u1', 'one.1': 'f'}) + + expected = DataFrame( + {'one': np.empty(0, dtype='u1'), 'one.1': np.empty(0, dtype='f')}) + tm.assert_frame_equal(result, expected, check_index_type=False) + + def test_empty_with_mangled_column_pass_dtype_by_indexes(self): + data = 'one,one' + result = self.read_csv(StringIO(data), dtype={0: 'u1', 1: 'f'}) + + expected = DataFrame( + {'one': np.empty(0, dtype='u1'), 'one.1': np.empty(0, dtype='f')}) + tm.assert_frame_equal(result, expected, check_index_type=False) + + def test_empty_with_dup_column_pass_dtype_by_indexes(self): + # see gh-9424 + expected = pd.concat([Series([], name='one', dtype='u1'), + Series([], name='one.1', dtype='f')], axis=1) + + data = 'one,one' + result = self.read_csv(StringIO(data), dtype={0: 'u1', 1: 'f'}) + tm.assert_frame_equal(result, expected, check_index_type=False) + + data = '' + result = self.read_csv(StringIO(data), names=['one', 'one'], + dtype={0: 'u1', 1: 'f'}) + tm.assert_frame_equal(result, expected, check_index_type=False) + + def test_raise_on_passed_int_dtype_with_nas(self): + # see gh-2631 + data = """YEAR, DOY, a +2001,106380451,10 +2001,,11 +2001,106380451,67""" + self.assertRaises(ValueError, self.read_csv, StringIO(data), + sep=",", skipinitialspace=True, + dtype={'DOY': np.int64}) + + def test_dtype_with_converter(self): + data = """a,b +1.1,2.2 +1.2,2.3""" + # dtype spec ignored if converter specified + with tm.assert_produces_warning(ParserWarning): + result = self.read_csv(StringIO(data), dtype={'a': 'i8'}, + converters={'a': lambda x: str(x)}) + expected = DataFrame({'a': ['1.1', '1.2'], 'b': [2.2, 2.3]}) + tm.assert_frame_equal(result, expected) + + def test_empty_dtype(self): + # see gh-14712 + data = 'a,b' + + expected = pd.DataFrame(columns=['a', 'b'], dtype=np.float64) + result = self.read_csv(StringIO(data), header=0, dtype=np.float64) + tm.assert_frame_equal(result, expected) + + expected = pd.DataFrame({'a': 
pd.Categorical([]), + 'b': pd.Categorical([])}, + index=[]) + result = self.read_csv(StringIO(data), header=0, + dtype='category') + tm.assert_frame_equal(result, expected) + + expected = pd.DataFrame(columns=['a', 'b'], dtype='datetime64[ns]') + result = self.read_csv(StringIO(data), header=0, + dtype='datetime64[ns]') + tm.assert_frame_equal(result, expected) + + expected = pd.DataFrame({'a': pd.Series([], dtype='timedelta64[ns]'), + 'b': pd.Series([], dtype='timedelta64[ns]')}, + index=[]) + result = self.read_csv(StringIO(data), header=0, + dtype='timedelta64[ns]') + tm.assert_frame_equal(result, expected) + + expected = pd.DataFrame(columns=['a', 'b']) + expected['a'] = expected['a'].astype(np.float64) + result = self.read_csv(StringIO(data), header=0, + dtype={'a': np.float64}) + tm.assert_frame_equal(result, expected) + + expected = pd.DataFrame(columns=['a', 'b']) + expected['a'] = expected['a'].astype(np.float64) + result = self.read_csv(StringIO(data), header=0, + dtype={0: np.float64}) + tm.assert_frame_equal(result, expected) + + expected = pd.DataFrame(columns=['a', 'b']) + expected['a'] = expected['a'].astype(np.int32) + expected['b'] = expected['b'].astype(np.float64) + result = self.read_csv(StringIO(data), header=0, + dtype={'a': np.int32, 1: np.float64}) + tm.assert_frame_equal(result, expected) diff --git a/pandas/io/tests/parser/test_parsers.py b/pandas/io/tests/parser/test_parsers.py index 6001c85ae76b1..6cca2e35e1135 100644 --- a/pandas/io/tests/parser/test_parsers.py +++ b/pandas/io/tests/parser/test_parsers.py @@ -22,6 +22,7 @@ from .compression import CompressionTests from .multithread import MultithreadTests from .python_parser_only import PythonParserTests +from .dtypes import DtypeTests class BaseParser(CommentTests, CompressionTests, @@ -29,7 +30,8 @@ class BaseParser(CommentTests, CompressionTests, IndexColTests, MultithreadTests, NAvaluesTests, ParseDatesTests, ParserTests, SkipRowsTests, - UsecolsTests, QuotingTests): + UsecolsTests, 
QuotingTests, + DtypeTests): def read_csv(self, *args, **kwargs): raise NotImplementedError diff --git a/pandas/io/tests/parser/test_unsupported.py b/pandas/io/tests/parser/test_unsupported.py index 5d60c20854a83..ffd1cfa9a2538 100644 --- a/pandas/io/tests/parser/test_unsupported.py +++ b/pandas/io/tests/parser/test_unsupported.py @@ -44,16 +44,6 @@ def test_c_engine(self): data = 'a b c\n1 2 3' msg = 'does not support' - # specify C-unsupported options with python-unsupported option - # (options will be ignored on fallback, raise) - with tm.assertRaisesRegexp(ValueError, msg): - read_table(StringIO(data), sep=None, - delim_whitespace=False, dtype={'a': float}) - with tm.assertRaisesRegexp(ValueError, msg): - read_table(StringIO(data), sep=r'\s', dtype={'a': float}) - with tm.assertRaisesRegexp(ValueError, msg): - read_table(StringIO(data), skipfooter=1, dtype={'a': float}) - # specify C engine with unsupported options (raise) with tm.assertRaisesRegexp(ValueError, msg): read_table(StringIO(data), engine='c', diff --git a/pandas/parser.pyx b/pandas/parser.pyx index 6b43dfbabc4a0..6760e822960f1 100644 --- a/pandas/parser.pyx +++ b/pandas/parser.pyx @@ -13,7 +13,7 @@ from cpython cimport (PyObject, PyBytes_FromString, PyUnicode_Check, PyUnicode_AsUTF8String, PyErr_Occurred, PyErr_Fetch) from cpython.ref cimport PyObject, Py_XDECREF -from io.common import ParserError, DtypeWarning, EmptyDataError +from io.common import ParserError, DtypeWarning, EmptyDataError, ParserWarning # Import CParserError as alias of ParserError for backwards compatibility. # Ultimately, we want to remove this import. See gh-12665 and gh-14479. 
@@ -987,7 +987,7 @@ cdef class TextReader: Py_ssize_t i, nused kh_str_t *na_hashset = NULL int start, end - object name, na_flist + object name, na_flist, col_dtype = None bint na_filter = 0 Py_ssize_t num_cols @@ -1043,14 +1043,34 @@ cdef class TextReader: else: na_filter = 0 + col_dtype = None + if self.dtype is not None: + if isinstance(self.dtype, dict): + if name in self.dtype: + col_dtype = self.dtype[name] + elif i in self.dtype: + col_dtype = self.dtype[i] + else: + if self.dtype.names: + # structured array + col_dtype = np.dtype(self.dtype.descr[i][1]) + else: + col_dtype = self.dtype + if conv: + if col_dtype is not None: + warnings.warn(("Both a converter and dtype were specified " + "for column {0} - only the converter will " + "be used").format(name), ParserWarning, + stacklevel=5) results[i] = _apply_converter(conv, self.parser, i, start, end, self.c_encoding) continue # Should return as the desired dtype (inferred or specified) col_res, na_count = self._convert_tokens( - i, start, end, name, na_filter, na_hashset, na_flist) + i, start, end, name, na_filter, na_hashset, + na_flist, col_dtype) if na_filter: self._free_na_set(na_hashset) @@ -1075,32 +1095,17 @@ cdef class TextReader: cdef inline _convert_tokens(self, Py_ssize_t i, int start, int end, object name, bint na_filter, kh_str_t *na_hashset, - object na_flist): - cdef: - object col_dtype = None - - if self.dtype is not None: - if isinstance(self.dtype, dict): - if name in self.dtype: - col_dtype = self.dtype[name] - elif i in self.dtype: - col_dtype = self.dtype[i] - else: - if self.dtype.names: - # structured array - col_dtype = np.dtype(self.dtype.descr[i][1]) - else: - col_dtype = self.dtype + object na_flist, object col_dtype): - if col_dtype is not None: - col_res, na_count = self._convert_with_dtype( - col_dtype, i, start, end, na_filter, - 1, na_hashset, na_flist) + if col_dtype is not None: + col_res, na_count = self._convert_with_dtype( + col_dtype, i, start, end, na_filter, + 1, 
na_hashset, na_flist) - # Fallback on the parse (e.g. we requested int dtype, - # but its actually a float). - if col_res is not None: - return col_res, na_count + # Fallback on the parse (e.g. we requested int dtype, + # but it's actually a float). + if col_res is not None: + return col_res, na_count if i in self.noconvert: return self._string_convert(i, start, end, na_filter, na_hashset)