diff --git a/asv_bench/benchmarks/frame_ctor.py b/asv_bench/benchmarks/frame_ctor.py
index 21b20cb123ed6..9def910df0bab 100644
--- a/asv_bench/benchmarks/frame_ctor.py
+++ b/asv_bench/benchmarks/frame_ctor.py
@@ -16,11 +16,11 @@ class FromDicts(object):
 
     def setup(self):
         N, K = 5000, 50
-        index = tm.makeStringIndex(N)
-        columns = tm.makeStringIndex(K)
-        frame = DataFrame(np.random.randn(N, K), index=index, columns=columns)
+        self.index = tm.makeStringIndex(N)
+        self.columns = tm.makeStringIndex(K)
+        frame = DataFrame(np.random.randn(N, K), index=self.index,
+                          columns=self.columns)
         self.data = frame.to_dict()
-        self.some_dict = list(self.data.values())[0]
         self.dict_list = frame.to_dict(orient='records')
         self.data2 = {i: {j: float(j) for j in range(100)}
                       for i in range(2000)}
@@ -31,8 +31,14 @@ def time_list_of_dict(self):
     def time_nested_dict(self):
         DataFrame(self.data)
 
-    def time_dict(self):
-        Series(self.some_dict)
+    def time_nested_dict_index(self):
+        DataFrame(self.data, index=self.index)
+
+    def time_nested_dict_columns(self):
+        DataFrame(self.data, columns=self.columns)
+
+    def time_nested_dict_index_columns(self):
+        DataFrame(self.data, index=self.index, columns=self.columns)
 
     def time_nested_dict_int64(self):
         # nested dict, integer indexes, regression described in #621
diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt
index ce63cb2473bc4..21a2a66831fd3 100644
--- a/doc/source/whatsnew/v0.23.0.txt
+++ b/doc/source/whatsnew/v0.23.0.txt
@@ -1135,6 +1135,10 @@ Reshaping
 - Bug in :func:`DataFrame.unstack` which casts int to float if ``columns`` is a ``MultiIndex`` with unused levels (:issue:`17845`)
 - Bug in :func:`DataFrame.unstack` which raises an error if ``index`` is a ``MultiIndex`` with unused labels on the unstacked level (:issue:`18562`)
 - Fixed construction of a :class:`Series` from a ``dict`` containing ``NaN`` as key (:issue:`18480`)
+- Fixed construction of a :class:`DataFrame` from a ``dict`` containing ``NaN`` as key (:issue:`18455`)
+- Suppressed error in the construction of a :class:`DataFrame` from a ``dict`` containing scalar values when the corresponding keys are not included in the passed index (:issue:`18600`)
+
+- Fixed (changed from ``object`` to ``float64``) dtype of :class:`DataFrame` initialized with axes, no data, and ``dtype=int`` (:issue:`19646`)
 - Bug in :func:`Series.rank` where ``Series`` containing ``NaT`` modifies the ``Series`` inplace (:issue:`18521`)
 - Bug in :func:`cut` which fails when using readonly arrays (:issue:`18773`)
 - Bug in :func:`DataFrame.pivot_table` which fails when the ``aggfunc`` arg is of type string. The behavior is now consistent with other methods like ``agg`` and ``apply`` (:issue:`18713`)
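
As a quick illustration (outside the patch itself) of the behaviour the three new whatsnew entries describe, distilled from the tests added further down in this diff:

    import numpy as np
    import pandas as pd

    # GH 18455: NaN now works as a dict key when constructing a DataFrame;
    # the NaN-labelled column and NaN-labelled row align correctly.
    d = {1: pd.Series([0, 1], index=['a', np.nan]),
         np.nan: pd.Series([2, 3], index=['a', np.nan])}
    print(pd.DataFrame(d))

    # GH 18600: a scalar value whose key is not among the requested columns
    # is now silently dropped instead of raising ValueError.
    print(pd.DataFrame({'a': 0.7}, columns=['b']))

    # GH 19646: an empty frame built from axes plus dtype=int comes out as
    # float64 (there is no "empty" integer column), no longer object.
    print(pd.DataFrame(index=range(3), columns=['a', 'b'], dtype=int).dtypes)
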
""" if columns is not None: - columns = _ensure_index(columns) + arrays = Series(data, index=columns, dtype=object) + data_names = arrays.index - # GH10856 - # raise ValueError if only scalars in dict + missing = arrays.isnull() if index is None: - extract_index(list(data.values())) - - # prefilter if columns passed - data = {k: v for k, v in compat.iteritems(data) if k in columns} - - if index is None: - index = extract_index(list(data.values())) - + # GH10856 + # raise ValueError if only scalars in dict + index = extract_index(arrays[~missing]) else: index = _ensure_index(index) - arrays = [] - data_names = [] - for k in columns: - if k not in data: - # no obvious "empty" int column - if dtype is not None and issubclass(dtype.type, - np.integer): - continue - - if dtype is None: - # 1783 - v = np.empty(len(index), dtype=object) - elif np.issubdtype(dtype, np.flexible): - v = np.empty(len(index), dtype=object) - else: - v = np.empty(len(index), dtype=dtype) - - v.fill(np.nan) + # no obvious "empty" int column + if missing.any() and not is_integer_dtype(dtype): + if dtype is None or np.issubdtype(dtype, np.flexible): + # 1783 + nan_dtype = object else: - v = data[k] - data_names.append(k) - arrays.append(v) + nan_dtype = dtype + v = construct_1d_arraylike_from_scalar(np.nan, len(index), + nan_dtype) + arrays.loc[missing] = [v] * missing.sum() else: keys = com._dict_keys_to_ordered_list(data) @@ -7253,8 +7237,6 @@ def _arrays_to_mgr(arrays, arr_names, index, columns, dtype=None): # figure out the index, if necessary if index is None: index = extract_index(arrays) - else: - index = _ensure_index(index) # don't force copy because getting jammed in an ndarray anyway arrays = _homogenize(arrays, index, dtype) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index d5cd22732f0a9..cf14bb8f2e534 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -7341,7 +7341,6 @@ def _where(self, cond, other=np.nan, inplace=False, axis=None, level=None, if not is_bool_dtype(dt): raise ValueError(msg.format(dtype=dt)) - cond = cond.astype(bool, copy=False) cond = -cond if inplace else cond # try to align with other diff --git a/pandas/core/internals.py b/pandas/core/internals.py index f5956aacf8646..b0a6086c450ef 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -4841,7 +4841,7 @@ def form_blocks(arrays, names, axes): items_dict = defaultdict(list) extra_locs = [] - names_idx = Index(names) + names_idx = _ensure_index(names) if names_idx.equals(axes[0]): names_indexer = np.arange(len(names_idx)) else: diff --git a/pandas/core/series.py b/pandas/core/series.py index f3630dc43fbd1..422d9c8f0ecf7 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -24,6 +24,7 @@ is_extension_array_dtype, is_datetime64tz_dtype, is_timedelta64_dtype, + is_object_dtype, is_list_like, is_hashable, is_iterator, @@ -38,7 +39,8 @@ maybe_upcast, infer_dtype_from_scalar, maybe_convert_platform, maybe_cast_to_datetime, maybe_castable, - construct_1d_arraylike_from_scalar) + construct_1d_arraylike_from_scalar, + construct_1d_object_array_from_listlike) from pandas.core.dtypes.missing import isna, notna, remove_na_arraylike from pandas.core.index import (Index, MultiIndex, InvalidIndexError, @@ -297,6 +299,7 @@ def _init_dict(self, data, index=None, dtype=None): # raises KeyError), so we iterate the entire dict, and align if data: keys, values = zip(*compat.iteritems(data)) + values = list(values) else: keys, values = [], [] @@ -4042,7 +4045,13 @@ def _try_cast(arr, take_fast_path): 
diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py
index 499751e864331..47b7d60e3b6e8 100644
--- a/pandas/tests/frame/test_constructors.py
+++ b/pandas/tests/frame/test_constructors.py
@@ -287,8 +287,50 @@ def test_constructor_dict(self):
         with tm.assert_raises_regex(ValueError, msg):
             DataFrame({'a': 0.7}, columns=['a'])
 
-        with tm.assert_raises_regex(ValueError, msg):
-            DataFrame({'a': 0.7}, columns=['b'])
+    @pytest.mark.parametrize("scalar", [2, np.nan, None, 'D'])
+    def test_constructor_invalid_items_unused(self, scalar):
+        # No error if invalid (scalar) value is in fact not used:
+        result = DataFrame({'a': scalar}, columns=['b'])
+        expected = DataFrame(columns=['b'])
+        tm.assert_frame_equal(result, expected)
+
+    @pytest.mark.parametrize("value", [2, np.nan, None, float('nan')])
+    def test_constructor_dict_nan_key(self, value):
+        # GH 18455
+        cols = [1, value, 3]
+        idx = ['a', value]
+        values = [[0, 3], [1, 4], [2, 5]]
+        data = {cols[c]: Series(values[c], index=idx) for c in range(3)}
+        result = DataFrame(data).sort_values(1).sort_values('a', axis=1)
+        expected = DataFrame(np.arange(6, dtype='int64').reshape(2, 3),
+                             index=idx, columns=cols)
+        tm.assert_frame_equal(result, expected)
+
+        result = DataFrame(data, index=idx).sort_values('a', axis=1)
+        tm.assert_frame_equal(result, expected)
+
+        result = DataFrame(data, index=idx, columns=cols)
+        tm.assert_frame_equal(result, expected)
+
+    @pytest.mark.parametrize("value", [np.nan, None, float('nan')])
+    def test_constructor_dict_nan_tuple_key(self, value):
+        # GH 18455
+        cols = Index([(11, 21), (value, 22), (13, value)])
+        idx = Index([('a', value), (value, 2)])
+        values = [[0, 3], [1, 4], [2, 5]]
+        data = {cols[c]: Series(values[c], index=idx) for c in range(3)}
+        result = (DataFrame(data)
+                  .sort_values((11, 21))
+                  .sort_values(('a', value), axis=1))
+        expected = DataFrame(np.arange(6, dtype='int64').reshape(2, 3),
+                             index=idx, columns=cols)
+        tm.assert_frame_equal(result, expected)
+
+        result = DataFrame(data, index=idx).sort_values(('a', value), axis=1)
+        tm.assert_frame_equal(result, expected)
+
+        result = DataFrame(data, index=idx, columns=cols)
+        tm.assert_frame_equal(result, expected)
 
     @pytest.mark.skipif(not PY36, reason='Insertion order for Python>=3.6')
     def test_constructor_dict_order_insertion(self):
@@ -753,7 +795,7 @@ def test_constructor_corner(self):
 
         # does not error but ends up float
         df = DataFrame(index=lrange(10), columns=['a', 'b'], dtype=int)
-        assert df.values.dtype == np.object_
+        assert df.values.dtype == np.dtype('float64')
 
         # #1783 empty dtype object
        df = DataFrame({}, columns=['foo', 'bar'])
@@ -761,7 +803,7 @@
 
         df = DataFrame({'b': 1}, index=lrange(10), columns=list('abc'),
                        dtype=int)
-        assert df.values.dtype == np.object_
+        assert df.values.dtype == np.dtype('float64')
 
     def test_constructor_scalar_inference(self):
         data = {'int': 1, 'bool': True,
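
These constructor tests exercise the dict-alignment rewrite of ``DataFrame._init_dict`` earlier in this diff. The essence of that rewrite, restated as a small standalone sketch using only public API (the variable names below are illustrative, not the actual internals):

    import numpy as np
    import pandas as pd

    data = {'a': [0, 1], 'c': [2, 3], 'ignored': [4, 5]}
    columns = pd.Index(['a', 'b', 'c'])

    # Wrapping the dict in an object-dtype Series indexed by the requested
    # columns both drops keys that were not asked for ('ignored') and flags
    # the columns that have no data ('b'); the old code needed a dict
    # comprehension plus an explicit loop over the columns for this.
    arrays = pd.Series(data, index=columns, dtype=object)
    missing = arrays.isnull()

    print(arrays.index.tolist())   # ['a', 'b', 'c']
    print(missing.tolist())        # [False, True, False]

The missing columns are then filled with a NaN column of an appropriate dtype via ``construct_1d_arraylike_from_scalar`` (object for flexible dtypes, the requested dtype otherwise), as shown in the ``frame.py`` hunk above.
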
diff --git a/pandas/tests/frame/test_reshape.py b/pandas/tests/frame/test_reshape.py
index 68df0982a1e3e..d89731dc09044 100644
--- a/pandas/tests/frame/test_reshape.py
+++ b/pandas/tests/frame/test_reshape.py
@@ -603,13 +603,7 @@ def test_unstack_unused_levels(self):
             cols = pd.MultiIndex.from_product([[0, 1], col_level])
             expected = pd.DataFrame(exp_data.reshape(3, 6),
                                     index=idx_level, columns=cols)
-            # Broken (GH 18455):
-            # tm.assert_frame_equal(result, expected)
-            diff = result - expected
-            assert(diff.sum().sum() == 0)
-            assert((diff + 1).sum().sum() == 8)
-
-            assert((result.columns.levels[1] == idx.levels[level]).all())
+            tm.assert_frame_equal(result, expected)
 
     @pytest.mark.parametrize("cols", [['A', 'C'], slice(None)])
     def test_unstack_unused_level(self, cols):
diff --git a/pandas/tests/io/test_excel.py b/pandas/tests/io/test_excel.py
index cbb5932a890dc..5ef6dc07a5c22 100644
--- a/pandas/tests/io/test_excel.py
+++ b/pandas/tests/io/test_excel.py
@@ -461,7 +461,7 @@ def test_read_one_empty_col_with_header(self, ext):
         )
         expected_header_none = DataFrame(pd.Series([0], dtype='int64'))
         tm.assert_frame_equal(actual_header_none, expected_header_none)
-        expected_header_zero = DataFrame(columns=[0], dtype='int64')
+        expected_header_zero = DataFrame(columns=[0])
         tm.assert_frame_equal(actual_header_zero, expected_header_zero)
 
     @td.skip_if_no('openpyxl')
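
Finally, the constructor benchmarks added to asv_bench/benchmarks/frame_ctor.py at the top of this diff are normally run through asv; for a quick ad-hoc check without an asv setup, the new ``time_nested_dict_index_columns`` case can be mirrored with ``timeit`` along these lines (sizes reduced here, and the names are illustrative only):

    import timeit

    import numpy as np
    import pandas as pd

    # Smaller than the benchmark's 5000 x 50 so it finishes quickly.
    N, K = 1000, 20
    index = pd.Index(['row_{}'.format(i) for i in range(N)])
    columns = pd.Index(['col_{}'.format(j) for j in range(K)])
    data = pd.DataFrame(np.random.randn(N, K), index=index,
                        columns=columns).to_dict()

    # Nested dict -> DataFrame with both axes supplied, as in the benchmark.
    elapsed = timeit.timeit(
        lambda: pd.DataFrame(data, index=index, columns=columns), number=20)
    print('{:.3f}s for 20 constructions'.format(elapsed))
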