Skip to content

REF: shift ravel in infer_dtype #24560

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits on Jan 3, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 5 additions & 4 deletions pandas/_libs/lib.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -623,7 +623,7 @@ def clean_index_list(obj: list):
return obj, all_arrays

# don't force numpy coerce with nan's
inferred = infer_dtype(obj)
inferred = infer_dtype(obj, skipna=False)
if inferred in ['string', 'bytes', 'unicode', 'mixed', 'mixed-integer']:
return np.asarray(obj, dtype=object), 0
elif inferred in ['integer']:
Expand Down Expand Up @@ -1210,6 +1210,10 @@ def infer_dtype(value: object, skipna: bool=False) -> str:
values = construct_1d_object_array_from_listlike(value)

values = getattr(values, 'values', values)

+ # make contiguous
+ values = values.ravel()

if skipna:
values = values[~isnaobj(values)]

Expand All @@ -1220,9 +1224,6 @@ def infer_dtype(value: object, skipna: bool=False) -> str:
if values.dtype != np.object_:
values = values.astype('O')

- # make contiguous
- values = values.ravel()

n = len(values)
if n == 0:
return 'empty'
Expand Down
14 changes: 9 additions & 5 deletions pandas/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -165,7 +165,7 @@ def _ensure_arraylike(values):
ensure that we are arraylike if not already
"""
if not is_array_like(values):
inferred = lib.infer_dtype(values)
inferred = lib.infer_dtype(values, skipna=False)
if inferred in ['mixed', 'string', 'unicode']:
if isinstance(values, tuple):
values = list(values)
Expand Down Expand Up @@ -202,8 +202,10 @@ def _get_hashtable_algo(values):

if ndtype == 'object':

- # its cheaper to use a String Hash Table than Object
- if lib.infer_dtype(values) in ['string']:
+ # it's cheaper to use a String Hash Table than Object; we infer
+ # including nulls because that is the only difference between
+ # StringHashTable and ObjectHashtable
+ if lib.infer_dtype(values, skipna=False) in ['string']:
ndtype = 'string'
else:
ndtype = 'object'
Expand All @@ -220,8 +222,10 @@ def _get_data_algo(values, func_map):
values, dtype, ndtype = _ensure_data(values)
if ndtype == 'object':

- # its cheaper to use a String Hash Table than Object
- if lib.infer_dtype(values) in ['string']:
+ # it's cheaper to use a String Hash Table than Object; we infer
+ # including nulls because that is the only difference between
+ # StringHashTable and ObjectHashtable
+ if lib.infer_dtype(values, skipna=False) in ['string']:
ndtype = 'string'

f = func_map.get(ndtype, func_map['object'])
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/arrays/datetimes.py
Original file line number Diff line number Diff line change
Expand Up @@ -1652,7 +1652,7 @@ def sequence_to_dt64ns(data, dtype=None, copy=False,
# TODO: We do not have tests specific to string-dtypes,
# also complex or categorical or other extension
copy = False
if lib.infer_dtype(data) == 'integer':
if lib.infer_dtype(data, skipna=False) == 'integer':
data = data.astype(np.int64)
else:
# data comes back here as either i8 to denote UTC timestamps
Expand Down
4 changes: 2 additions & 2 deletions pandas/core/arrays/integer.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,8 +171,8 @@ def coerce_to_array(values, dtype, mask=None, copy=False):

values = np.array(values, copy=copy)
if is_object_dtype(values):
- inferred_type = lib.infer_dtype(values)
- if inferred_type is 'mixed' and isna(values).all():
+ inferred_type = lib.infer_dtype(values, skipna=True)
+ if inferred_type == 'empty':
values = np.empty(len(values))
values.fill(np.nan)
elif inferred_type not in ['floating', 'integer',
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/arrays/timedeltas.py
Original file line number Diff line number Diff line change
Expand Up @@ -594,7 +594,7 @@ def __floordiv__(self, other):
elif is_object_dtype(other):
result = [self[n] // other[n] for n in range(len(self))]
result = np.array(result)
if lib.infer_dtype(result) == 'timedelta':
if lib.infer_dtype(result, skipna=False) == 'timedelta':
result, _ = sequence_to_td64ns(result)
return type(self)(result)
return result
Expand Down
12 changes: 7 additions & 5 deletions pandas/core/dtypes/cast.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,8 @@ def trans(x):

if isinstance(dtype, string_types):
if dtype == 'infer':
- inferred_type = lib.infer_dtype(ensure_object(result.ravel()))
+ inferred_type = lib.infer_dtype(ensure_object(result.ravel()),
+ skipna=False)
if inferred_type == 'boolean':
dtype = 'bool'
elif inferred_type == 'integer':
Expand Down Expand Up @@ -460,7 +461,7 @@ def infer_dtype_from_array(arr, pandas_dtype=False):
return arr.dtype, np.asarray(arr)

# don't force numpy coerce with nan's
inferred = lib.infer_dtype(arr)
inferred = lib.infer_dtype(arr, skipna=False)
if inferred in ['string', 'bytes', 'unicode',
'mixed', 'mixed-integer']:
return (np.object_, arr)
Expand Down Expand Up @@ -941,10 +942,11 @@ def try_timedelta(v):

# We have at least a NaT and a string
# try timedelta first to avoid spurious datetime conversions
- # e.g. '00:00:01' is a timedelta but
- # technically is also a datetime
+ # e.g. '00:00:01' is a timedelta but technically is also a datetime
value = try_timedelta(v)
- if lib.infer_dtype(value) in ['mixed']:
+ if lib.infer_dtype(value, skipna=False) in ['mixed']:
+ # cannot skip missing values, as NaT implies that the string
+ # is actually a datetime
value = try_datetime(v)

return value
Expand Down
3 changes: 2 additions & 1 deletion pandas/core/dtypes/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -703,7 +703,8 @@ def is_datetime_arraylike(arr):
if isinstance(arr, ABCDatetimeIndex):
return True
elif isinstance(arr, (np.ndarray, ABCSeries)):
- return arr.dtype == object and lib.infer_dtype(arr) == 'datetime'
+ return (is_object_dtype(arr.dtype)
+ and lib.infer_dtype(arr, skipna=False) == 'datetime')
return getattr(arr, 'inferred_type', None) == 'datetime'


Expand Down
2 changes: 1 addition & 1 deletion pandas/core/dtypes/missing.py
Original file line number Diff line number Diff line change
Expand Up @@ -474,7 +474,7 @@ def _infer_fill_value(val):
if is_datetimelike(val):
return np.array('NaT', dtype=val.dtype)
elif is_object_dtype(val.dtype):
dtype = lib.infer_dtype(ensure_object(val))
dtype = lib.infer_dtype(ensure_object(val), skipna=False)
if dtype in ['datetime', 'datetime64']:
return np.array('NaT', dtype=_NS_DTYPE)
elif dtype in ['timedelta', 'timedelta64']:
Expand Down
8 changes: 4 additions & 4 deletions pandas/core/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -346,7 +346,7 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None,
# should not be coerced
# GH 11836
if is_integer_dtype(dtype):
inferred = lib.infer_dtype(data)
inferred = lib.infer_dtype(data, skipna=False)
if inferred == 'integer':
data = maybe_cast_to_integer_array(data, dtype,
copy=copy)
Expand Down Expand Up @@ -376,7 +376,7 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None,
else:
data = data.astype(dtype)
elif is_float_dtype(dtype):
inferred = lib.infer_dtype(data)
inferred = lib.infer_dtype(data, skipna=False)
if inferred == 'string':
pass
else:
Expand Down Expand Up @@ -414,7 +414,7 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None,
subarr = subarr.copy()

if dtype is None:
inferred = lib.infer_dtype(subarr)
inferred = lib.infer_dtype(subarr, skipna=False)
if inferred == 'integer':
try:
return cls._try_convert_to_int_index(
Expand Down Expand Up @@ -1718,7 +1718,7 @@ def inferred_type(self):
"""
Return a string of the type inferred from the values.
"""
return lib.infer_dtype(self)
return lib.infer_dtype(self, skipna=False)

@cache_readonly
def is_all_dates(self):
Expand Down
3 changes: 2 additions & 1 deletion pandas/core/indexes/multi.py
Original file line number Diff line number Diff line change
Expand Up @@ -2318,7 +2318,8 @@ def _partial_tup_index(self, tup, side='left'):
section = labs[start:end]

if lab not in lev:
- if not lev.is_type_compatible(lib.infer_dtype([lab])):
+ if not lev.is_type_compatible(lib.infer_dtype([lab],
+ skipna=False)):
raise TypeError('Level type mismatch: %s' % lab)

# short circuit
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/internals/construction.py
Original file line number Diff line number Diff line change
Expand Up @@ -667,7 +667,7 @@ def sanitize_array(data, index, dtype=None, copy=False,
subarr = np.array(data, dtype=object, copy=copy)

if is_object_dtype(subarr.dtype) and dtype != 'object':
inferred = lib.infer_dtype(subarr)
inferred = lib.infer_dtype(subarr, skipna=False)
if inferred == 'period':
try:
subarr = period_array(subarr)
Expand Down
3 changes: 2 additions & 1 deletion pandas/core/reshape/merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -947,7 +947,8 @@ def _maybe_coerce_merge_keys(self):
continue

# let's infer and see if we are ok
- elif lib.infer_dtype(lk) == lib.infer_dtype(rk):
+ elif (lib.infer_dtype(lk, skipna=False)
+ == lib.infer_dtype(rk, skipna=False)):
continue

# Check if we are trying to merge on obviously
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/reshape/tile.py
Original file line number Diff line number Diff line change
Expand Up @@ -416,7 +416,7 @@ def _convert_bin_to_numeric_type(bins, dtype):
------
ValueError if bins are not of a compat dtype to dtype
"""
bins_dtype = infer_dtype(bins)
bins_dtype = infer_dtype(bins, skipna=False)
if is_timedelta64_dtype(dtype):
if bins_dtype in ['timedelta', 'timedelta64']:
bins = to_timedelta(bins).view(np.int64)
Expand Down
4 changes: 2 additions & 2 deletions pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -875,7 +875,7 @@ def _get_with(self, key):
if isinstance(key, Index):
key_type = key.inferred_type
else:
key_type = lib.infer_dtype(key)
key_type = lib.infer_dtype(key, skipna=False)

if key_type == 'integer':
if self.index.is_integer() or self.index.is_floating():
Expand Down Expand Up @@ -1012,7 +1012,7 @@ def _set_with(self, key, value):
if isinstance(key, Index):
key_type = key.inferred_type
else:
key_type = lib.infer_dtype(key)
key_type = lib.infer_dtype(key, skipna=False)

if key_type == 'integer':
if self.index.inferred_type == 'integer':
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/sorting.py
Original file line number Diff line number Diff line change
Expand Up @@ -454,7 +454,7 @@ def sort_mixed(values):
return np.concatenate([nums, np.asarray(strs, dtype=object)])

sorter = None
if PY3 and lib.infer_dtype(values) == 'mixed-integer':
if PY3 and lib.infer_dtype(values, skipna=False) == 'mixed-integer':
# unorderable in py3 if mixed str/int
ordered = sort_mixed(values)
else:
Expand Down
2 changes: 1 addition & 1 deletion pandas/io/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -1300,7 +1300,7 @@ def _validate_usecols_arg(usecols):
elif not is_list_like(usecols):
raise ValueError(msg)
else:
usecols_dtype = lib.infer_dtype(usecols)
usecols_dtype = lib.infer_dtype(usecols, skipna=False)
if usecols_dtype not in ('empty', 'integer',
'string', 'unicode'):
raise ValueError(msg)
Expand Down
12 changes: 6 additions & 6 deletions pandas/io/pytables.py
Original file line number Diff line number Diff line change
Expand Up @@ -1952,7 +1952,7 @@ def set_atom(self, block, block_items, existing_col, min_itemsize,
return self.set_atom_complex(block)

dtype = block.dtype.name
inferred_type = lib.infer_dtype(block.values)
inferred_type = lib.infer_dtype(block.values, skipna=False)

if inferred_type == 'date':
raise TypeError(
Expand Down Expand Up @@ -1998,15 +1998,15 @@ def set_atom_string(self, block, block_items, existing_col, min_itemsize,
data = block.values

# see if we have a valid string type
inferred_type = lib.infer_dtype(data.ravel())
inferred_type = lib.infer_dtype(data.ravel(), skipna=False)
if inferred_type != 'string':

# we cannot serialize this data, so report an exception on a column
# by column basis
for i, item in enumerate(block_items):

col = block.iget(i)
inferred_type = lib.infer_dtype(col.ravel())
inferred_type = lib.infer_dtype(col.ravel(), skipna=False)
if inferred_type != 'string':
raise TypeError(
"Cannot serialize the column [%s] because\n"
Expand Down Expand Up @@ -2745,7 +2745,7 @@ def write_array(self, key, value, items=None):

# infer the type, warn if we have a non-string type here (for
# performance)
inferred_type = lib.infer_dtype(value.ravel())
inferred_type = lib.infer_dtype(value.ravel(), skipna=False)
if empty_array:
pass
elif inferred_type == 'string':
Expand Down Expand Up @@ -4512,7 +4512,7 @@ def _convert_index(index, encoding=None, errors='strict', format_type=None):
if isinstance(index, MultiIndex):
raise TypeError('MultiIndex not supported here!')

inferred_type = lib.infer_dtype(index)
inferred_type = lib.infer_dtype(index, skipna=False)

values = np.asarray(index)

Expand Down Expand Up @@ -4745,7 +4745,7 @@ def __init__(self, table, where=None, start=None, stop=None):

# see if we have a passed coordinate like
try:
inferred = lib.infer_dtype(where)
inferred = lib.infer_dtype(where, skipna=False)
if inferred == 'integer' or inferred == 'boolean':
where = np.asarray(where)
if where.dtype == np.bool_:
Expand Down
23 changes: 7 additions & 16 deletions pandas/io/sql.py
Original file line number Diff line number Diff line change
Expand Up @@ -857,27 +857,15 @@ def _harmonize_columns(self, parse_dates=None):
except KeyError:
pass # this column not in results

- def _get_notna_col_dtype(self, col):
- """
- Infer datatype of the Series col. In case the dtype of col is 'object'
- and it contains NA values, this infers the datatype of the not-NA
- values. Needed for inserting typed data containing NULLs, GH8778.
- """
- col_for_inference = col
- if col.dtype == 'object':
- notnadata = col[~isna(col)]
- if len(notnadata):
- col_for_inference = notnadata
-
- return lib.infer_dtype(col_for_inference)
-
def _sqlalchemy_type(self, col):

dtype = self.dtype or {}
if col.name in dtype:
return self.dtype[col.name]

- col_type = self._get_notna_col_dtype(col)
+ # Infer type of column, while ignoring missing values.
+ # Needed for inserting typed data containing NULLs, GH 8778.
+ col_type = lib.infer_dtype(col, skipna=True)

from sqlalchemy.types import (BigInteger, Integer, Float,
Text, Boolean,
Expand Down Expand Up @@ -1374,7 +1362,10 @@ def _sql_type_name(self, col):
if col.name in dtype:
return dtype[col.name]

- col_type = self._get_notna_col_dtype(col)
+ # Infer type of column, while ignoring missing values.
+ # Needed for inserting typed data containing NULLs, GH 8778.
+ col_type = lib.infer_dtype(col, skipna=True)

if col_type == 'timedelta64':
warnings.warn("the 'timedelta' type is not supported, and will be "
"written as integer values (ns frequency) to the "
Expand Down
4 changes: 2 additions & 2 deletions pandas/io/stata.py
Original file line number Diff line number Diff line change
Expand Up @@ -396,7 +396,7 @@ def parse_dates_safe(dates, delta=False, year=False, days=False):
to_datetime(d['year'], format='%Y').astype(np.int64))
d['days'] = days // NS_PER_DAY

elif infer_dtype(dates) == 'datetime':
elif infer_dtype(dates, skipna=False) == 'datetime':
if delta:
delta = dates.values - stata_epoch
f = lambda x: \
Expand Down Expand Up @@ -1867,7 +1867,7 @@ def _dtype_to_default_stata_fmt(dtype, column, dta_version=114,
if force_strl:
return '%9s'
if dtype.type == np.object_:
inferred_dtype = infer_dtype(column.dropna())
inferred_dtype = infer_dtype(column, skipna=True)
if not (inferred_dtype in ('string', 'unicode') or
len(column) == 0):
raise ValueError('Column `{col}` cannot be exported.\n\nOnly '
Expand Down
2 changes: 1 addition & 1 deletion pandas/plotting/_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -246,7 +246,7 @@ def _convert_1d(values, units, axis):
return values.asfreq(axis.freq)._ndarray_values
elif isinstance(values, Index):
return values.map(lambda x: get_datevalue(x, axis.freq))
elif lib.infer_dtype(values) == 'period':
elif lib.infer_dtype(values, skipna=False) == 'period':
# https://github.com/pandas-dev/pandas/issues/24304
# convert ndarray[period] -> PeriodIndex
return PeriodIndex(values, freq=axis.freq)._ndarray_values
Expand Down
Loading