-
-
Notifications
You must be signed in to change notification settings - Fork 18.5k
API: add dtype= option to python parser #14295
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
jorisvandenbossche
merged 24 commits into
pandas-dev:master
from
chris-b1:textreader-dtype
Nov 26, 2016
Merged
Changes from all commits
Commits
Show all changes
24 commits
Select commit
Hold shift + click to select a range
960441a
API: add dtype= option to python parser
chris-b1 7be7b42
remove unsupported test
chris-b1 65a94ae
add test/fix for dtype=object
chris-b1 6853587
float precision...
chris-b1 3024177
float precision fix
chris-b1 f9ff10e
add docs; test for conv cast
chris-b1 f5b23a6
Add warning if both converter and dtype specified
chris-b1 e0e5ae8
doc comments
chris-b1 a5821d3
doc updates
chris-b1 7c703fe
lint
chris-b1 d790bdf
API: add dtype= option to python parser
chris-b1 5462774
remove unsupported test
chris-b1 64c7214
add test/fix for dtype=object
chris-b1 26f42c2
float precision...
chris-b1 7fbe0a3
float precision fix
chris-b1 08315b8
add docs; test for conv cast
chris-b1 810e750
Add warning if both converter and dtype specified
chris-b1 10f5be3
doc comments
chris-b1 b2f7b94
doc updates
chris-b1 be2b43b
lint
chris-b1 47669d3
TST: move empty dtype tests from c_parser_only to dtype tests
jorisvandenbossche 34e3a96
Merge branch 'textreader-dtype' of https://github.com/chris-b1/pandas…
chris-b1 1706b39
issue ref
chris-b1 3abb0bd
fix merge conflict leftover
jorisvandenbossche File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -17,11 +17,15 @@ | |
zip, string_types, map, u) | ||
from pandas.types.common import (is_integer, _ensure_object, | ||
is_list_like, is_integer_dtype, | ||
is_float, | ||
is_scalar) | ||
is_float, is_dtype_equal, | ||
is_object_dtype, | ||
is_scalar, is_categorical_dtype) | ||
from pandas.types.missing import isnull | ||
from pandas.types.cast import _astype_nansafe | ||
from pandas.core.index import Index, MultiIndex, RangeIndex | ||
from pandas.core.series import Series | ||
from pandas.core.frame import DataFrame | ||
from pandas.core.categorical import Categorical | ||
from pandas.core.common import AbstractMethodError | ||
from pandas.core.config import get_option | ||
from pandas.io.date_converters import generic_parser | ||
|
@@ -111,8 +115,9 @@ | |
are duplicate names in the columns. | ||
dtype : Type name or dict of column -> type, default None | ||
Data type for data or columns. E.g. {'a': np.float64, 'b': np.int32} | ||
(Unsupported with engine='python'). Use `str` or `object` to preserve and | ||
not interpret dtype. | ||
Use `str` or `object` to preserve and not interpret dtype. | ||
If converters are specified, they will be applied INSTEAD | ||
of dtype conversion. | ||
%s | ||
converters : dict, default None | ||
Dict of functions for converting values in certain columns. Keys can either | ||
|
@@ -421,6 +426,7 @@ def _read(filepath_or_buffer, kwds): | |
'true_values': None, | ||
'false_values': None, | ||
'converters': None, | ||
'dtype': None, | ||
'skipfooter': 0, | ||
|
||
'keep_default_na': True, | ||
|
@@ -461,7 +467,6 @@ def _read(filepath_or_buffer, kwds): | |
'buffer_lines': None, | ||
'error_bad_lines': True, | ||
'warn_bad_lines': True, | ||
'dtype': None, | ||
'float_precision': None | ||
} | ||
|
||
|
@@ -476,7 +481,6 @@ def _read(filepath_or_buffer, kwds): | |
'buffer_lines', | ||
'error_bad_lines', | ||
'warn_bad_lines', | ||
'dtype', | ||
'float_precision', | ||
]) | ||
_deprecated_args = set([ | ||
|
@@ -834,9 +838,6 @@ def _clean_options(self, options, engine): | |
" ignored as it is not supported by the 'python'" | ||
" engine.").format(reason=fallback_reason, | ||
option=arg) | ||
if arg == 'dtype': | ||
msg += " (Note the 'converters' option provides"\ | ||
" similar functionality.)" | ||
raise ValueError(msg) | ||
del result[arg] | ||
|
||
|
@@ -1285,36 +1286,59 @@ def _agg_index(self, index, try_parse_dates=True): | |
col_na_values, col_na_fvalues = _get_na_values( | ||
col_name, self.na_values, self.na_fvalues) | ||
|
||
arr, _ = self._convert_types(arr, col_na_values | col_na_fvalues) | ||
arr, _ = self._infer_types(arr, col_na_values | col_na_fvalues) | ||
arrays.append(arr) | ||
|
||
index = MultiIndex.from_arrays(arrays, names=self.index_names) | ||
|
||
return index | ||
|
||
def _convert_to_ndarrays(self, dct, na_values, na_fvalues, verbose=False, | ||
converters=None): | ||
converters=None, dtypes=None): | ||
result = {} | ||
for c, values in compat.iteritems(dct): | ||
conv_f = None if converters is None else converters.get(c, None) | ||
if isinstance(dtypes, dict): | ||
cast_type = dtypes.get(c, None) | ||
else: | ||
# single dtype or None | ||
cast_type = dtypes | ||
|
||
if self.na_filter: | ||
col_na_values, col_na_fvalues = _get_na_values( | ||
c, na_values, na_fvalues) | ||
else: | ||
col_na_values, col_na_fvalues = set(), set() | ||
|
||
coerce_type = True | ||
if conv_f is not None: | ||
# conv_f applied to data before inference | ||
if cast_type is not None: | ||
warnings.warn(("Both a converter and dtype were specified " | ||
"for column {0} - only the converter will " | ||
"be used").format(c), ParserWarning, | ||
stacklevel=7) | ||
|
||
try: | ||
values = lib.map_infer(values, conv_f) | ||
except ValueError: | ||
mask = lib.ismember(values, na_values).view(np.uint8) | ||
values = lib.map_infer_mask(values, conv_f, mask) | ||
coerce_type = False | ||
|
||
cvals, na_count = self._convert_types( | ||
values, set(col_na_values) | col_na_fvalues, coerce_type) | ||
cvals, na_count = self._infer_types( | ||
values, set(col_na_values) | col_na_fvalues, | ||
try_num_bool=False) | ||
else: | ||
# skip inference if specified dtype is object | ||
try_num_bool = not (cast_type and is_object_dtype(cast_type)) | ||
|
||
# general type inference and conversion | ||
cvals, na_count = self._infer_types( | ||
values, set(col_na_values) | col_na_fvalues, | ||
try_num_bool) | ||
|
||
# type specified in dtype param | ||
if cast_type and not is_dtype_equal(cvals, cast_type): | ||
cvals = self._cast_types(cvals, cast_type, c) | ||
|
||
if issubclass(cvals.dtype.type, np.integer) and self.compact_ints: | ||
cvals = lib.downcast_int64( | ||
|
@@ -1326,7 +1350,23 @@ def _convert_to_ndarrays(self, dct, na_values, na_fvalues, verbose=False, | |
print('Filled %d NA values in column %s' % (na_count, str(c))) | ||
return result | ||
|
||
def _convert_types(self, values, na_values, try_num_bool=True): | ||
def _infer_types(self, values, na_values, try_num_bool=True): | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. While you are at it, can you add a docstring here? |
||
""" | ||
Infer types of values, possibly casting | ||
|
||
Parameters | ||
---------- | ||
values : ndarray | ||
na_values : set | ||
try_num_bool : bool, default True | ||
try to cast values to numeric (first preference) or boolean | ||
|
||
Returns | ||
------- | ||
converted : ndarray | ||
na_count : int | ||
""" | ||
|
||
na_count = 0 | ||
if issubclass(values.dtype.type, (np.number, np.bool_)): | ||
mask = lib.ismember(values, na_values) | ||
|
@@ -1340,6 +1380,7 @@ def _convert_types(self, values, na_values, try_num_bool=True): | |
if try_num_bool: | ||
try: | ||
result = lib.maybe_convert_numeric(values, na_values, False) | ||
na_count = isnull(result).sum() | ||
except Exception: | ||
result = values | ||
if values.dtype == np.object_: | ||
|
@@ -1356,6 +1397,38 @@ def _convert_types(self, values, na_values, try_num_bool=True): | |
|
||
return result, na_count | ||
|
||
def _cast_types(self, values, cast_type, column): | ||
""" | ||
Cast values to specified type | ||
|
||
Parameters | ||
---------- | ||
values : ndarray | ||
cast_type : string or np.dtype | ||
dtype to cast values to | ||
column : string | ||
column name - used only for error reporting | ||
|
||
Returns | ||
------- | ||
converted : ndarray | ||
""" | ||
|
||
if is_categorical_dtype(cast_type): | ||
# XXX this is for consistency with | ||
# c-parser which parses all categories | ||
# as strings | ||
if not is_object_dtype(values): | ||
values = _astype_nansafe(values, str) | ||
values = Categorical(values) | ||
else: | ||
try: | ||
values = _astype_nansafe(values, cast_type, copy=True) | ||
except ValueError: | ||
raise ValueError("Unable to convert column %s to " | ||
"type %s" % (column, cast_type)) | ||
return values | ||
|
||
def _do_date_conversions(self, names, data): | ||
# returns data, columns | ||
if self.parse_dates is not None: | ||
|
@@ -1784,6 +1857,7 @@ def __init__(self, f, **kwds): | |
|
||
self.verbose = kwds['verbose'] | ||
self.converters = kwds['converters'] | ||
self.dtype = kwds['dtype'] | ||
|
||
self.compact_ints = kwds['compact_ints'] | ||
self.use_unsigned = kwds['use_unsigned'] | ||
|
@@ -1982,7 +2056,7 @@ def read(self, rows=None): | |
# DataFrame with the right metadata, even though it's length 0 | ||
names = self._maybe_dedup_names(self.orig_names) | ||
index, columns, col_dict = _get_empty_meta( | ||
names, self.index_col, self.index_names) | ||
names, self.index_col, self.index_names, self.dtype) | ||
columns = self._maybe_make_multi_index_columns( | ||
columns, self.col_names) | ||
return index, columns, col_dict | ||
|
@@ -2033,15 +2107,25 @@ def get_chunk(self, size=None): | |
|
||
def _convert_data(self, data): | ||
# apply converters | ||
clean_conv = {} | ||
|
||
for col, f in compat.iteritems(self.converters): | ||
if isinstance(col, int) and col not in self.orig_names: | ||
col = self.orig_names[col] | ||
clean_conv[col] = f | ||
def _clean_mapping(mapping): | ||
"converts col numbers to names" | ||
clean = {} | ||
for col, v in compat.iteritems(mapping): | ||
if isinstance(col, int) and col not in self.orig_names: | ||
col = self.orig_names[col] | ||
clean[col] = v | ||
return clean | ||
|
||
clean_conv = _clean_mapping(self.converters) | ||
if not isinstance(self.dtype, dict): | ||
# handles single dtype applied to all columns | ||
clean_dtypes = self.dtype | ||
else: | ||
clean_dtypes = _clean_mapping(self.dtype) | ||
|
||
return self._convert_to_ndarrays(data, self.na_values, self.na_fvalues, | ||
self.verbose, clean_conv) | ||
self.verbose, clean_conv, | ||
clean_dtypes) | ||
|
||
def _to_recarray(self, data, columns): | ||
dtypes = [] | ||
|
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think you need a blank line here (to avoid warnings)