add docs; test for conv cast

chris-b1 · chris-b1 · commit ab7e1e87e1d2 · 2016-10-30T16:39:33.000-05:00
diff --git a/doc/source/io.rst b/doc/source/io.rst
@@ -157,6 +157,9 @@ dtype : Type name or dict of column -> type, default ``None``
   Data type for data or columns. E.g. ``{'a': np.float64, 'b': np.int32}``
   (unsupported with ``engine='python'``). Use `str` or `object` to preserve and
   not interpret dtype.
+
+  .. versionadded:: 0.20.0 support for the Python parser.
+
 engine : {``'c'``, ``'python'``}
   Parser engine to use. The C engine is faster while the python engine is
   currently more feature-complete.
@@ -473,10 +476,8 @@ However, if you wanted for all the data to be coerced, no matter the type, then
 using the ``converters`` argument of :func:`~pandas.read_csv` would certainly be
 worth trying.
 
-.. note::
-    The ``dtype`` option is currently only supported by the C engine.
-    Specifying ``dtype`` with ``engine`` other than 'c' raises a
-    ``ValueError``.
+  .. versionadded:: 0.20.0 support for the Python parser.
+     The ``dtype`` option is supported by the 'python' engine.
 
 .. note::
    In some cases, reading in abnormal data with columns containing mixed dtypes
diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
@@ -31,7 +31,15 @@ Other enhancements
 ^^^^^^^^^^^^^^^^^^
 
 
+- The ``dtype`` keyword argument in the :func:`read_csv` function for specifying the types of parsed columns
+  is now supported with the ``'python'`` engine.  See the :ref:`io docs <io.dtypes>` for more information.
 
+.. ipython:: python
+
+   from io import StringIO
+   data = "a,b\n1,2\n3,4"
+   pd.read_csv(StringIO(data), engine='python').dtypes
+   pd.read_csv(StringIO(data), engine='python', dtype={'a':'float64', 'b':'object'}).dtypes
 
 .. _whatsnew_0200.api_breaking:
 
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
@@ -115,8 +115,11 @@
 dtype : Type name or dict of column -> type, default None
     Data type for data or columns. E.g. {'a': np.float64, 'b': np.int32}
     Use `str` or `object` to preserve and not interpret dtype.
-    If converters are specified, they will be applied AFTER
-    dtype conversion.
+    If converters are specified, they will be applied INSTEAD
+    of dtype conversion.
+
+    .. versionadded:: 0.20.0 support for the Python parser.
+
 %s
 converters : dict, default None
     Dict of functions for converting values in certain columns. Keys can either
@@ -1292,20 +1295,6 @@ def _agg_index(self, index, try_parse_dates=True):
 
         return index
 
-    def _apply_converter(self, values, conv_f, na_values, col_na_values,
-                         col_na_fvalues):
-        """ apply converter function to values, respecting NAs """
-        try:
-            values = lib.map_infer(values, conv_f)
-        except ValueError:
-            mask = lib.ismember(values, na_values).view(np.uint8)
-            values = lib.map_infer_mask(values, conv_f, mask)
-
-        cvals, na_count = self._infer_types(
-            values, set(col_na_values) | col_na_fvalues,
-            try_num_bool=False)
-        return cvals, na_count
-
     def _convert_to_ndarrays(self, dct, na_values, na_fvalues, verbose=False,
                              converters=None, dtypes=None):
         result = {}
@@ -1323,45 +1312,58 @@ def _convert_to_ndarrays(self, dct, na_values, na_fvalues, verbose=False,
             else:
                 col_na_values, col_na_fvalues = set(), set()
 
-            if conv_f is not None and cast_type is None:
-                # if type is not specified, apply the conversion first, without
-                # inference
-                cvals, na_count = self._apply_converter(
-                    values, conv_f, na_values,
-                    col_na_values, col_na_fvalues)
+            if conv_f is not None:
+                # conv_f applied to data before inference
+                # dtype isn't used if a converter is specified
+                try:
+                    values = lib.map_infer(values, conv_f)
+                except ValueError:
+                    mask = lib.ismember(values, na_values).view(np.uint8)
+                    values = lib.map_infer_mask(values, conv_f, mask)
+
+                cvals, na_count = self._infer_types(
+                    values, set(col_na_values) | col_na_fvalues,
+                    try_num_bool=False)
             else:
-                try_num_bool = True
-                if cast_type and is_object_dtype(cast_type):
-                    # skip inference if specified dtype is object
-                    try_num_bool = False
+                # skip inference if specified dtype is object
+                try_num_bool = not (cast_type and is_object_dtype(cast_type))
 
                 # general type inference and conversion
                 cvals, na_count = self._infer_types(
                     values, set(col_na_values) | col_na_fvalues,
                     try_num_bool)
 
+                # type specified in dtype param
+                if cast_type and not is_dtype_equal(cvals, cast_type):
+                    cvals = self._cast_types(cvals, cast_type, c)
+
             if issubclass(cvals.dtype.type, np.integer) and self.compact_ints:
                 cvals = lib.downcast_int64(
                     cvals, _parser.na_values,
                     self.use_unsigned)
 
-            if cast_type and not is_dtype_equal(cvals, cast_type):
-                # type specificed in dtype param
-
-                cvals = self._cast_types(cvals, cast_type, c)
-                # for consistency with c-parser, if a converter and dtype are
-                # specified, apply the converter last
-                if conv_f is not None:
-                    values, na_count = self._apply_converter(
-                        values, conv_f, na_values,
-                        col_na_values, col_na_fvalues)
-
             result[c] = cvals
             if verbose and na_count:
                 print('Filled %d NA values in column %s' % (na_count, str(c)))
         return result
 
     def _infer_types(self, values, na_values, try_num_bool=True):
+        """
+        Infer types of values, possibly casting
+
+        Parameters
+        ----------
+        values : ndarray
+        na_values : set
+        try_num_bool : bool, default True
+           try to cast values to numeric (first preference) or boolean
+
+        Returns
+        -------
+        converted : ndarray
+        na_count : int
+        """
+
         na_count = 0
         if issubclass(values.dtype.type, (np.number, np.bool_)):
             mask = lib.ismember(values, na_values)
@@ -1393,7 +1395,22 @@ def _infer_types(self, values, na_values, try_num_bool=True):
         return result, na_count
 
     def _cast_types(self, values, cast_type, column):
-        """ cast column to type specified in dtypes= param """
+        """
+        Cast values to specified type
+
+        Parameters
+        ----------
+        values : ndarray
+        cast_type : string or np.dtype
+           dtype to cast values to
+        column : string
+            column name - used only for error reporting
+
+        Returns
+        -------
+        converted : ndarray
+        """
+
         if is_categorical_dtype(cast_type):
             # XXX this is for consistency with
             # c-parser which parses all categories
diff --git a/pandas/io/tests/parser/dtypes.py b/pandas/io/tests/parser/dtypes.py
@@ -214,3 +214,13 @@ def test_raise_on_passed_int_dtype_with_nas(self):
         self.assertRaises(ValueError, self.read_csv, StringIO(data),
                           sep=",", skipinitialspace=True,
                           dtype={'DOY': np.int64})
+
+    def test_dtype_with_converter(self):
+        data = """a,b
+1.1,2.2
+1.2,2.3"""
+        result = self.read_csv(StringIO(data), dtype={'a': 'i8'},
+                               converters={'a': lambda x: str(x)})
+        # dtype spec ignored if converter is specified
+        expected = DataFrame({'a': ['1.1', '1.2'], 'b': [2.2, 2.3]})
+        tm.assert_frame_equal(result, expected)