BUG: Fix initialization of DataFrame from dict with NaN as key #18600

Merged: 3 commits, Apr 1, 2018
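
In short, the PR makes constructions like the following behave (a minimal sketch based on the tests added below; the exact frames are illustrative):

import numpy as np
import pandas as pd

# A dict with NaN as a key: the NaN-keyed entry previously got lost or
# misaligned during DataFrame construction (GH 18455).
data = {1: pd.Series([0, 3], index=['a', np.nan]),
        np.nan: pd.Series([1, 4], index=['a', np.nan]),
        3: pd.Series([2, 5], index=['a', np.nan])}
df = pd.DataFrame(data)  # NaN is now kept as a real column label

# A scalar value whose key is not in the passed columns: previously raised
# "If using all scalar values, you must pass an index" (GH 18600).
empty = pd.DataFrame({'a': 0.7}, columns=['b'])  # now an empty frame with column 'b'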
18 changes: 12 additions & 6 deletions asv_bench/benchmarks/frame_ctor.py
@@ -16,11 +16,11 @@ class FromDicts(object):

     def setup(self):
         N, K = 5000, 50
-        index = tm.makeStringIndex(N)
-        columns = tm.makeStringIndex(K)
-        frame = DataFrame(np.random.randn(N, K), index=index, columns=columns)
+        self.index = tm.makeStringIndex(N)
+        self.columns = tm.makeStringIndex(K)
+        frame = DataFrame(np.random.randn(N, K), index=self.index,
+                          columns=self.columns)
         self.data = frame.to_dict()
-        self.some_dict = list(self.data.values())[0]
         self.dict_list = frame.to_dict(orient='records')
         self.data2 = {i: {j: float(j) for j in range(100)}
                       for i in range(2000)}
@@ -31,8 +31,14 @@ def time_list_of_dict(self):
     def time_nested_dict(self):
         DataFrame(self.data)
 
-    def time_dict(self):
-        Series(self.some_dict)
+    def time_nested_dict_index(self):
+        DataFrame(self.data, index=self.index)
+
+    def time_nested_dict_columns(self):
+        DataFrame(self.data, columns=self.columns)
+
+    def time_nested_dict_index_columns(self):
+        DataFrame(self.data, index=self.index, columns=self.columns)
 
     def time_nested_dict_int64(self):
         # nested dict, integer indexes, regression described in #621
4 changes: 4 additions & 0 deletions doc/source/whatsnew/v0.23.0.txt
@@ -1135,6 +1135,10 @@ Reshaping
 - Bug in :func:`DataFrame.unstack` which casts int to float if ``columns`` is a ``MultiIndex`` with unused levels (:issue:`17845`)
 - Bug in :func:`DataFrame.unstack` which raises an error if ``index`` is a ``MultiIndex`` with unused labels on the unstacked level (:issue:`18562`)
 - Fixed construction of a :class:`Series` from a ``dict`` containing ``NaN`` as key (:issue:`18480`)
+- Fixed construction of a :class:`DataFrame` from a ``dict`` containing ``NaN`` as key (:issue:`18455`)
+- Suppressed error in the construction of a :class:`DataFrame` from a ``dict`` containing scalar values when the corresponding keys are not included in the passed ``columns`` (:issue:`18600`)
+
+- Fixed (changed from ``object`` to ``float64``) dtype of :class:`DataFrame` initialized with axes, no data, and ``dtype=int`` (:issue:`19646`)
 - Bug in :func:`Series.rank` where ``Series`` containing ``NaT`` modifies the ``Series`` inplace (:issue:`18521`)
 - Bug in :func:`cut` which fails when using readonly arrays (:issue:`18773`)
 - Bug in :func:`DataFrame.pivot_table` which fails when the ``aggfunc`` arg is of type string. The behavior is now consistent with other methods like ``agg`` and ``apply`` (:issue:`18713`)
50 changes: 16 additions & 34 deletions pandas/core/frame.py
@@ -27,6 +27,7 @@
 from pandas.core.dtypes.cast import (
     maybe_upcast,
     cast_scalar_to_array,
+    construct_1d_arraylike_from_scalar,
     maybe_cast_to_datetime,
     maybe_infer_to_datetimelike,
     maybe_convert_platform,
@@ -429,44 +430,27 @@ def _init_dict(self, data, index, columns, dtype=None):
         Needs to handle a lot of exceptional cases.
         """
         if columns is not None:
-            columns = _ensure_index(columns)
+            arrays = Series(data, index=columns, dtype=object)
Contributor:

do we have an asv that actually hits this path here, e.g. not-none columns and a dict as input? I am concerned that this Series conversion to object is going to cause issues (and an asv or 2 will determine this)

Member Author:

Added some, see below

+            data_names = arrays.index
Contributor:

this will be a perf issue

Member Author:

Maybe... but right now it seems to be worse...

     [d163de70]       [f7447b3f]
-      47.9±0.3ms       43.5±0.4ms     0.91  frame_ctor.FromDicts.time_frame_ctor_nested_dict
-      31.0±0.1ms       28.1±0.3ms     0.91  frame_ctor.FromDictwithTimestampOffsets.time_frame_ctor('BusinessDay', 2)
-        31.3±1ms       28.2±0.2ms     0.90  frame_ctor.FromDictwithTimestampOffsets.time_frame_ctor('BDay', 2)
-      31.8±0.3ms       28.0±0.4ms     0.88  frame_ctor.FromDictwithTimestampOffsets.time_frame_ctor('CustomBusinessDay', 2)
-      32.8±0.3ms       28.2±0.2ms     0.86  frame_ctor.FromDictwithTimestampOffsets.time_frame_ctor('Day', 1)

SOME BENCHMARKS HAVE CHANGED SIGNIFICANTLY.

Member Author:

There does seem to be a performance loss on very small dfs. E.g. for pd.DataFrame(data) with data = {1: [2], 3: [4], 5: [6]} I get results around 530 µs per loop before and 570 µs after. So we are talking about a ~10% gain on large dfs vs. a ~7.5% loss on small dfs.

... or I can avoid that Series and sort manually, at the cost of a bit of added complexity, probably ~10 LoCs.
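
A hypothetical way to reproduce that micro-measurement (the script is illustrative, not part of the PR; run it on each commit and compare):

import timeit

setup = "import pandas as pd; data = {1: [2], 3: [4], 5: [6]}"
n = 10000
# Average construction time for a very small DataFrame, in microseconds.
per_loop = timeit.timeit("pd.DataFrame(data)", setup=setup, number=n) / n
print("{:.1f} us per loop".format(per_loop * 1e6))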

Member Author:

uhm... those asv results also seem pretty unstable:
      before           after         ratio
     [d163de70]       [f7447b3f]
+      30.7±0.2ms         41.1±3ms     1.34  frame_ctor.FromDictwithTimestampOffsets.time_frame_ctor('QuarterBegin', 1)
+      29.4±0.1ms         39.1±4ms     1.33  frame_ctor.FromDictwithTimestampOffsets.time_frame_ctor('CBMonthBegin', 2)
+      30.7±0.7ms         40.4±3ms     1.32  frame_ctor.FromDictwithTimestampOffsets.time_frame_ctor('BDay', 2)
+      31.1±0.7ms         39.1±5ms     1.26  frame_ctor.FromDictwithTimestampOffsets.time_frame_ctor('SemiMonthEnd', 2)
+      30.4±0.1ms         38.0±4ms     1.25  frame_ctor.FromDictwithTimestampOffsets.time_frame_ctor('Hour', 2)
-        33.8±1ms       30.4±0.8ms     0.90  frame_ctor.FromDictwithTimestampOffsets.time_frame_ctor('Micro', 1)
-      48.3±0.6ms       42.6±0.8ms     0.88  frame_ctor.FromDicts.time_frame_ctor_nested_dict
-        23.8±1ms       20.5±0.2ms     0.86  frame_ctor.FromDictwithTimestampOffsets.time_frame_ctor('FY5253Quarter_1', 2)
-      41.2±0.7ms         30.4±2ms     0.74  frame_ctor.FromDictwithTimestampOffsets.time_frame_ctor('CustomBusinessHour', 2)
-      8.35±0.9ms      6.05±0.01ms     0.72  frame_ctor.FromDictwithTimestampOffsets.time_frame_ctor('FY5253_2', 2)
-        42.4±2ms       30.5±0.5ms     0.72  frame_ctor.FromDictwithTimestampOffsets.time_frame_ctor('BMonthEnd', 2)

SOME BENCHMARKS HAVE CHANGED SIGNIFICANTLY.

I'll try to sort manually and see how it goes.

Contributor:

i actually doubt we have good benchmarks on this; you are measuring the same benchmark here

we need benchmarks that construct with different dtypes

and reducing code complexity is paramount here (though of course don’t want to sacrifice perf)


-            # GH10856
-            # raise ValueError if only scalars in dict
+            missing = arrays.isnull()
             if index is None:
-                extract_index(list(data.values()))
-
-            # prefilter if columns passed
-            data = {k: v for k, v in compat.iteritems(data) if k in columns}
-
-            if index is None:
-                index = extract_index(list(data.values()))
+                # GH10856
+                # raise ValueError if only scalars in dict
Contributor:

do you need the .tolist()?

Member Author:

(removed)

+                index = extract_index(arrays[~missing])
             else:
                 index = _ensure_index(index)

-            arrays = []
-            data_names = []
-            for k in columns:
-                if k not in data:
-                    # no obvious "empty" int column
-                    if dtype is not None and issubclass(dtype.type,
-                                                        np.integer):
-                        continue
-
-                    if dtype is None:
-                        # 1783
-                        v = np.empty(len(index), dtype=object)
-                    elif np.issubdtype(dtype, np.flexible):
-                        v = np.empty(len(index), dtype=object)
-                    else:
-                        v = np.empty(len(index), dtype=dtype)
-
-                    v.fill(np.nan)
-                else:
-                    v = data[k]
-                data_names.append(k)
-                arrays.append(v)
+            # no obvious "empty" int column
+            if missing.any() and not is_integer_dtype(dtype):
+                if dtype is None or np.issubdtype(dtype, np.flexible):
Contributor:

why is the flexible needed here? is this actually hit by a test?

Contributor:

i would appreciate an actual explanation. we do not check for this dtype anywhere else in the codebase. so at the very least this needs a comment

Member Author:

Sure, I would also appreciate an explanation (on that code @ajcr wrote and you committed).
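
For what it's worth, np.flexible is the abstract parent of NumPy's fixed-width string, bytes and void dtypes, for which a NaN fill would be lossy; hence the fallback to object. A quick illustration (standard NumPy behavior, not code from this PR):

import numpy as np

print(np.issubdtype(np.dtype('U5'), np.flexible))   # True: fixed-width str
print(np.issubdtype(np.dtype('S10'), np.flexible))  # True: fixed-width bytes
print(np.issubdtype(np.dtype('V8'), np.flexible))   # True: void / raw records
print(np.issubdtype(np.float64, np.flexible))       # False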

+                    # 1783
+                    nan_dtype = object
+                else:
+                    nan_dtype = dtype
+                v = construct_1d_arraylike_from_scalar(np.nan, len(index),
+                                                       nan_dtype)
+                arrays.loc[missing] = [v] * missing.sum()

         else:
             keys = com._dict_keys_to_ordered_list(data)
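
The heart of the new code path is using a Series with object dtype as an ordered, NaN-key-tolerant mapping from column labels to column values; a standalone sketch of the idea (hypothetical data, simplified from the PR):

import numpy as np
import pandas as pd

data = {'a': [1, 2], np.nan: [3, 4]}      # dict with NaN as a key
columns = pd.Index(['a', np.nan, 'c'])    # requested columns, one absent

# Aligning through a Series keeps NaN keys intact and preserves the
# requested column order; absent columns become missing entries that
# are later filled with NaN arrays.
arrays = pd.Series(data, index=columns, dtype=object)
missing = arrays.isnull()
print(list(arrays.index[missing]))        # ['c'] -> gets an all-NaN column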
@@ -7253,8 +7237,6 @@ def _arrays_to_mgr(arrays, arr_names, index, columns, dtype=None):
     # figure out the index, if necessary
     if index is None:
         index = extract_index(arrays)
-    else:
-        index = _ensure_index(index)
 
     # don't force copy because getting jammed in an ndarray anyway
     arrays = _homogenize(arrays, index, dtype)
1 change: 0 additions & 1 deletion pandas/core/generic.py
@@ -7341,7 +7341,6 @@ def _where(self, cond, other=np.nan, inplace=False, axis=None, level=None,
                 if not is_bool_dtype(dt):
                     raise ValueError(msg.format(dtype=dt))
 
-        cond = cond.astype(bool, copy=False)
Contributor:

what caused you to change this?

Member Author:

It's useless (bool dtype is checked just above)... but it's admittedly unrelated to the rest of the PR (it just came out of debugging it).

         cond = -cond if inplace else cond
 
         # try to align with other
2 changes: 1 addition & 1 deletion pandas/core/internals.py
@@ -4841,7 +4841,7 @@ def form_blocks(arrays, names, axes):
     items_dict = defaultdict(list)
     extra_locs = []
 
-    names_idx = Index(names)
+    names_idx = _ensure_index(names)
     if names_idx.equals(axes[0]):
         names_indexer = np.arange(len(names_idx))
     else:
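The point of the swap: _ensure_index returns its argument unchanged when it is already an Index, skipping the re-wrapping (and possible inference) that Index(names) performs. A rough illustration (the import path is the 0.23-era internal location and may differ in other versions):

import pandas as pd
from pandas.core.indexes.base import _ensure_index

idx = pd.Index(['a', 'b', 'c'])
assert _ensure_index(idx) is idx                # already an Index: returned as-is
assert _ensure_index(['a', 'b']).equals(pd.Index(['a', 'b']))  # lists get wrapped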
13 changes: 11 additions & 2 deletions pandas/core/series.py
@@ -24,6 +24,7 @@
     is_extension_array_dtype,
     is_datetime64tz_dtype,
     is_timedelta64_dtype,
+    is_object_dtype,
     is_list_like,
     is_hashable,
     is_iterator,
@@ -38,7 +39,8 @@
     maybe_upcast, infer_dtype_from_scalar,
     maybe_convert_platform,
     maybe_cast_to_datetime, maybe_castable,
-    construct_1d_arraylike_from_scalar)
+    construct_1d_arraylike_from_scalar,
+    construct_1d_object_array_from_listlike)
 from pandas.core.dtypes.missing import isna, notna, remove_na_arraylike
 
 from pandas.core.index import (Index, MultiIndex, InvalidIndexError,
@@ -297,6 +299,7 @@ def _init_dict(self, data, index=None, dtype=None):
         # raises KeyError), so we iterate the entire dict, and align
         if data:
             keys, values = zip(*compat.iteritems(data))
+            values = list(values)
         else:
             keys, values = [], []

@@ -4042,7 +4045,13 @@ def _try_cast(arr, take_fast_path):

        try:
            subarr = maybe_cast_to_datetime(arr, dtype)
-            if not is_extension_type(subarr):
+            # Take care in creating object arrays (but iterators are not
+            # supported):
+            if is_object_dtype(dtype) and (is_list_like(subarr) and
Contributor:

this is pretty hard to read, but ok for now, see if can simplify in the future

Member Author:

Yes, for sure we will need some unified mechanism to process iterators

+                                           not (is_iterator(subarr) or
+                                                isinstance(subarr, np.ndarray))):
+                subarr = construct_1d_object_array_from_listlike(subarr)
+            elif not is_extension_type(subarr):
                 subarr = np.array(subarr, dtype=dtype, copy=copy)
         except (ValueError, TypeError):
             if is_categorical_dtype(dtype):
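The dedicated helper matters because np.array guesses dimensionality for nested list-likes, while preallocating a 1-D object array and assigning into it does not. A small sketch of the pitfall (standard NumPy semantics; the helper below is a simplified stand-in for the pandas internal):

import numpy as np

def construct_1d_object_array(values):
    # Always produce a 1-D object array with one slot per item of `values`,
    # regardless of whether the items are themselves list-like.
    result = np.empty(len(values), dtype=object)
    result[:] = values
    return result

pairs = [(1, 2), (3, 4)]
print(np.array(pairs, dtype=object).shape)     # (2, 2): inferred as 2-D
print(construct_1d_object_array(pairs).shape)  # (2,): one tuple per element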
50 changes: 46 additions & 4 deletions pandas/tests/frame/test_constructors.py
@@ -287,8 +287,50 @@ def test_constructor_dict(self):
         with tm.assert_raises_regex(ValueError, msg):
             DataFrame({'a': 0.7}, columns=['a'])
 
-        with tm.assert_raises_regex(ValueError, msg):
Contributor:

make a separate test (this change), with a comment

Member Author:

(done)

-            DataFrame({'a': 0.7}, columns=['b'])
+    @pytest.mark.parametrize("scalar", [2, np.nan, None, 'D'])
+    def test_constructor_invalid_items_unused(self, scalar):
+        # No error if invalid (scalar) value is in fact not used:
+        result = DataFrame({'a': scalar}, columns=['b'])
+        expected = DataFrame(columns=['b'])
+        tm.assert_frame_equal(result, expected)
+
+    @pytest.mark.parametrize("value", [2, np.nan, None, float('nan')])
+    def test_constructor_dict_nan_key(self, value):
+        # GH 18455
+        cols = [1, value, 3]
+        idx = ['a', value]
+        values = [[0, 3], [1, 4], [2, 5]]
+        data = {cols[c]: Series(values[c], index=idx) for c in range(3)}
+        result = DataFrame(data).sort_values(1).sort_values('a', axis=1)
+        expected = DataFrame(np.arange(6, dtype='int64').reshape(2, 3),
+                             index=idx, columns=cols)
+        tm.assert_frame_equal(result, expected)
+
+        result = DataFrame(data, index=idx).sort_values('a', axis=1)
+        tm.assert_frame_equal(result, expected)
+
+        result = DataFrame(data, index=idx, columns=cols)
+        tm.assert_frame_equal(result, expected)
+
+    @pytest.mark.parametrize("value", [np.nan, None, float('nan')])
+    def test_constructor_dict_nan_tuple_key(self, value):
+        # GH 18455
+        cols = Index([(11, 21), (value, 22), (13, value)])
+        idx = Index([('a', value), (value, 2)])
+        values = [[0, 3], [1, 4], [2, 5]]
+        data = {cols[c]: Series(values[c], index=idx) for c in range(3)}
+        result = (DataFrame(data)
+                  .sort_values((11, 21))
+                  .sort_values(('a', value), axis=1))
+        expected = DataFrame(np.arange(6, dtype='int64').reshape(2, 3),
+                             index=idx, columns=cols)
+        tm.assert_frame_equal(result, expected)
+
+        result = DataFrame(data, index=idx).sort_values(('a', value), axis=1)
+        tm.assert_frame_equal(result, expected)
+
+        result = DataFrame(data, index=idx, columns=cols)
+        tm.assert_frame_equal(result, expected)
+
     @pytest.mark.skipif(not PY36, reason='Insertion order for Python>=3.6')
     def test_constructor_dict_order_insertion(self):
@@ -753,15 +795,15 @@ def test_constructor_corner(self):

         # does not error but ends up float
         df = DataFrame(index=lrange(10), columns=['a', 'b'], dtype=int)
-        assert df.values.dtype == np.object_
+        assert df.values.dtype == np.dtype('float64')
Contributor:

why is this changing?

Member Author:

Because it was wrong: an int should not upcast to object (the passed dtype is currently not considered). issue? whatsnew?

Contributor:

hmm, yeah this looks suspect. I would make a new issue for this

         # #1783 empty dtype object
         df = DataFrame({}, columns=['foo', 'bar'])
         assert df.values.dtype == np.object_
 
         df = DataFrame({'b': 1}, index=lrange(10), columns=list('abc'),
                        dtype=int)
-        assert df.values.dtype == np.object_
+        assert df.values.dtype == np.dtype('float64')

     def test_constructor_scalar_inference(self):
         data = {'int': 1, 'bool': True,
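The float64 expectation follows from NaN having no integer representation: an all-missing frame requested with dtype=int must upcast, and float64 (not object) is the natural target. An illustration of the new behavior (mirrors the test above; not code from the PR):

import pandas as pd

# Axes but no data, dtype=int: the frame is filled with NaN, which cannot
# live in an integer array, so the values upcast to float64 rather than
# falling back to object as before.
df = pd.DataFrame(index=range(10), columns=['a', 'b'], dtype=int)
print(df.values.dtype)  # float64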
8 changes: 1 addition & 7 deletions pandas/tests/frame/test_reshape.py
@@ -603,13 +603,7 @@ def test_unstack_unused_levels(self):
         cols = pd.MultiIndex.from_product([[0, 1], col_level])
         expected = pd.DataFrame(exp_data.reshape(3, 6),
                                 index=idx_level, columns=cols)
-        # Broken (GH 18455):
-        # tm.assert_frame_equal(result, expected)
-        diff = result - expected
-        assert(diff.sum().sum() == 0)
-        assert((diff + 1).sum().sum() == 8)
-
-        assert((result.columns.levels[1] == idx.levels[level]).all())
+        tm.assert_frame_equal(result, expected)
 
     @pytest.mark.parametrize("cols", [['A', 'C'], slice(None)])
     def test_unstack_unused_level(self, cols):
2 changes: 1 addition & 1 deletion pandas/tests/io/test_excel.py
@@ -461,7 +461,7 @@ def test_read_one_empty_col_with_header(self, ext):
         )
         expected_header_none = DataFrame(pd.Series([0], dtype='int64'))
         tm.assert_frame_equal(actual_header_none, expected_header_none)
-        expected_header_zero = DataFrame(columns=[0], dtype='int64')
+        expected_header_zero = DataFrame(columns=[0])
Contributor:

why is this changing?

Member Author:

The test was wrong and worked by accident. The result is, and should be, of object dtype; the "expected" one only matched because the passed dtype wasn't being considered (see above).

Contributor:

ok again add this as an example in a new issue

Member Author:

Again #19646

         tm.assert_frame_equal(actual_header_zero, expected_header_zero)
 
     @td.skip_if_no('openpyxl')