Skip to content

Commit f06cfc3

Browse files
authored
Merge branch 'master' into nchmura-df-style-hide
2 parents 72ea765 + 94ef7b6 commit f06cfc3

32 files changed

+340
-93
lines changed

.travis.yml

+1-1
Original file line numberDiff line numberDiff line change
@@ -123,7 +123,7 @@ after_success:
123123

124124
after_script:
125125
- echo "after_script start"
126-
- source activate pandas && cd /tmp && python -c "import pandas; pandas.show_versions();"
126+
- source activate pandas && pushd /tmp && python -c "import pandas; pandas.show_versions();" && popd
127127
- if [ -e /tmp/single.xml ]; then
128128
ci/print_skipped.py /tmp/single.xml;
129129
fi

asv_bench/benchmarks/indexing.py

+19-4
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,9 @@ def time_getitem_list_like(self):
1919
def time_getitem_array(self):
2020
self.s[np.arange(10000)]
2121

22+
def time_getitem_lists(self):
23+
self.s[np.arange(10000).tolist()]
24+
2225
def time_iloc_array(self):
2326
self.s.iloc[np.arange(10000)]
2427

@@ -190,9 +193,15 @@ def setup(self):
190193
np.arange(1000)], names=['one', 'two'])
191194

192195
import string
193-
self.mistring = MultiIndex.from_product(
194-
[np.arange(1000),
195-
np.arange(20), list(string.ascii_letters)],
196+
197+
self.mi_large = MultiIndex.from_product(
198+
[np.arange(1000), np.arange(20), list(string.ascii_letters)],
199+
names=['one', 'two', 'three'])
200+
self.mi_med = MultiIndex.from_product(
201+
[np.arange(1000), np.arange(10), list('A')],
202+
names=['one', 'two', 'three'])
203+
self.mi_small = MultiIndex.from_product(
204+
[np.arange(100), list('A'), list('A')],
196205
names=['one', 'two', 'three'])
197206

198207
def time_series_xs_mi_ix(self):
@@ -215,8 +224,14 @@ def time_multiindex_get_indexer(self):
215224
(0, 16), (0, 17), (0, 18),
216225
(0, 19)], dtype=object))
217226

227+
def time_multiindex_large_get_loc(self):
228+
self.mi_large.get_loc((999, 19, 'Z'))
229+
230+
def time_multiindex_med_get_loc(self):
231+
self.mi_med.get_loc((999, 9, 'A'))
232+
218233
def time_multiindex_string_get_loc(self):
219-
self.mistring.get_loc((999, 19, 'Z'))
234+
self.mi_small.get_loc((99, 'A', 'A'))
220235

221236
def time_is_monotonic(self):
222237
self.miint.is_monotonic

ci/script_multi.sh

+1-1
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ if [ "$BUILD_TEST" ]; then
2727
echo "[running]"
2828
cd /tmp
2929
unset PYTHONPATH
30-
python -c "import pandas; pandas.test(['-n 2', '--skip-slow', '--skip-network', '-r xX'])"
30+
python -c 'import pandas; pandas.test(["-n 2", "--skip-slow", "--skip-network", "-r xX", "-m not single"])'
3131

3232
elif [ "$DOC" ]; then
3333
echo "We are not running pytest as this is a doc-build"

doc/source/install.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -202,7 +202,7 @@ installed), make sure you have `pytest
202202
Dependencies
203203
------------
204204

205-
* `setuptools <http://pythonhosted.org/setuptools>`__
205+
* `setuptools <https://setuptools.readthedocs.io/en/latest/>`__
206206
* `NumPy <http://www.numpy.org>`__: 1.7.1 or higher
207207
* `python-dateutil <http://labix.org/python-dateutil>`__: 1.5 or higher
208208
* `pytz <http://pytz.sourceforge.net/>`__: Needed for time zone support

doc/source/style.ipynb

+1-1
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
"\n",
1313
"<span style=\"color: red\">*Provisional: This is a new feature and still under development. We'll be adding features and possibly making breaking changes in future releases. We'd love to hear your feedback.*</span>\n",
1414
"\n",
15-
"This document is written as a Jupyter Notebook, and can be viewed or downloaded [here](http://nbviewer.ipython.org/github/pandas-dev/pandas/blob/master/doc/source/html-styling.ipynb).\n",
15+
"This document is written as a Jupyter Notebook, and can be viewed or downloaded [here](http://nbviewer.ipython.org/github/pandas-dev/pandas/blob/master/doc/source/style.ipynb).\n",
1616
"\n",
1717
"You can apply **conditional formatting**, the visual styling of a DataFrame\n",
1818
"depending on the data within, by using the ``DataFrame.style`` property.\n",

doc/source/whatsnew/v0.20.2.txt

+10-6
Original file line numberDiff line numberDiff line change
@@ -19,14 +19,15 @@ Highlights include:
1919
Enhancements
2020
~~~~~~~~~~~~
2121

22-
22+
- Unblocked access to additional compression types supported in pytables: 'blosc:blosclz', 'blosc:lz4', 'blosc:lz4hc', 'blosc:snappy', 'blosc:zlib', 'blosc:zstd' (:issue:`14478`)
2323

2424
.. _whatsnew_0202.performance:
2525

2626
Performance Improvements
2727
~~~~~~~~~~~~~~~~~~~~~~~~
2828

29-
29+
- Performance regression fix when indexing with a list-like (:issue:`16285`)
30+
- Performance regression fix for small MultiIndexes (:issue:`16319`)
3031

3132
.. _whatsnew_0202.bug_fixes:
3233

@@ -36,7 +37,7 @@ Bug Fixes
3637
Conversion
3738
^^^^^^^^^^
3839

39-
40+
- Bug in ``pd.to_numeric()`` in which empty data inputs were causing Python to crash (:issue:`16302`)
4041

4142

4243
Indexing
@@ -50,11 +51,15 @@ I/O
5051

5152
- :class:`pandas.io.formats.style.Styler` now has ``index`` parameter and corresponding method ``hide_index()`` to determine whether the index will be rendered in output (:issue:`14194`)
5253
- :class:`pandas.io.formats.style.Styler` now has ``hidden_cols`` parameter and corresponding method ``hide_columns()`` to determine whether columns will be hidden in output (:issue:`14194`)
54+
- Bug that would force importing of the clipboard routines unnecessarily, potentially causing an import error on startup (:issue:`16288`)
55+
5356

5457

5558
Plotting
5659
^^^^^^^^
5760

61+
- Bug in ``DataFrame.plot`` with a single column and a list-like ``color`` (:issue:`3486`)
62+
5863

5964

6065

@@ -67,13 +72,12 @@ Groupby/Resample/Rolling
6772
Sparse
6873
^^^^^^
6974

70-
71-
75+
- Bug in construction of SparseDataFrame from ``scipy.sparse.dok_matrix`` (:issue:`16179`)
7276

7377
Reshaping
7478
^^^^^^^^^
7579

76-
80+
- Bug in ``DataFrame.stack`` with unsorted levels in MultiIndex columns (:issue:`16323`)
7781

7882

7983
Numeric

pandas/_libs/hashtable.pxd

+1
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,7 @@ cdef struct Int64VectorData:
5252
cdef class Int64Vector:
5353
cdef Int64VectorData *data
5454
cdef ndarray ao
55+
cdef bint external_view_exists
5556

5657
cdef resize(self)
5758
cpdef to_array(self)

pandas/_libs/hashtable.pyx

+13
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,10 @@ cdef class Factorizer:
6464
>>> factorize(np.array([1,2,np.nan], dtype='O'), na_sentinel=20)
6565
array([ 0, 1, 20])
6666
"""
67+
if self.uniques.external_view_exists:
68+
uniques = ObjectVector()
69+
uniques.extend(self.uniques.to_array())
70+
self.uniques = uniques
6771
labels = self.table.get_labels(values, self.uniques,
6872
self.count, na_sentinel, check_null)
6973
mask = (labels == na_sentinel)
@@ -99,6 +103,15 @@ cdef class Int64Factorizer:
99103

100104
def factorize(self, int64_t[:] values, sort=False,
101105
na_sentinel=-1, check_null=True):
106+
"""
107+
Factorize values with nans replaced by na_sentinel
108+
>>> factorize(np.array([1,2,np.nan], dtype='O'), na_sentinel=20)
109+
array([ 0, 1, 20])
110+
"""
111+
if self.uniques.external_view_exists:
112+
uniques = Int64Vector()
113+
uniques.extend(self.uniques.to_array())
114+
self.uniques = uniques
102115
labels = self.table.get_labels(values, self.uniques,
103116
self.count, na_sentinel,
104117
check_null)

pandas/_libs/hashtable_class_helper.pxi.in

+39-7
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,7 @@ cdef class {{name}}Vector:
7171

7272
{{if dtype != 'int64'}}
7373
cdef:
74+
bint external_view_exists
7475
{{name}}VectorData *data
7576
ndarray ao
7677
{{endif}}
@@ -80,14 +81,15 @@ cdef class {{name}}Vector:
8081
sizeof({{name}}VectorData))
8182
if not self.data:
8283
raise MemoryError()
84+
self.external_view_exists = False
8385
self.data.n = 0
8486
self.data.m = _INIT_VEC_CAP
8587
self.ao = np.empty(self.data.m, dtype={{idtype}})
8688
self.data.data = <{{arg}}*> self.ao.data
8789

8890
cdef resize(self):
8991
self.data.m = max(self.data.m * 4, _INIT_VEC_CAP)
90-
self.ao.resize(self.data.m)
92+
self.ao.resize(self.data.m, refcheck=False)
9193
self.data.data = <{{arg}}*> self.ao.data
9294

9395
def __dealloc__(self):
@@ -99,13 +101,20 @@ cdef class {{name}}Vector:
99101
return self.data.n
100102

101103
cpdef to_array(self):
102-
self.ao.resize(self.data.n)
103-
self.data.m = self.data.n
104+
if self.data.m != self.data.n:
105+
if self.external_view_exists:
106+
# should never happen
107+
raise ValueError("should have raised on append()")
108+
self.ao.resize(self.data.n, refcheck=False)
109+
self.data.m = self.data.n
110+
self.external_view_exists = True
104111
return self.ao
105112

106113
cdef inline void append(self, {{arg}} x):
107114

108115
if needs_resize(self.data):
116+
if self.external_view_exists:
117+
raise ValueError("external reference but Vector.resize() needed")
109118
self.resize()
110119

111120
append_data_{{dtype}}(self.data, x)
@@ -120,15 +129,19 @@ cdef class StringVector:
120129

121130
cdef:
122131
StringVectorData *data
132+
bint external_view_exists
123133

124134
def __cinit__(self):
125135
self.data = <StringVectorData *>PyMem_Malloc(
126136
sizeof(StringVectorData))
127137
if not self.data:
128138
raise MemoryError()
139+
self.external_view_exists = False
129140
self.data.n = 0
130141
self.data.m = _INIT_VEC_CAP
131142
self.data.data = <char **> malloc(self.data.m * sizeof(char *))
143+
if not self.data.data:
144+
raise MemoryError()
132145

133146
cdef resize(self):
134147
cdef:
@@ -138,9 +151,10 @@ cdef class StringVector:
138151
m = self.data.m
139152
self.data.m = max(self.data.m * 4, _INIT_VEC_CAP)
140153

141-
# TODO: can resize?
142154
orig_data = self.data.data
143155
self.data.data = <char **> malloc(self.data.m * sizeof(char *))
156+
if not self.data.data:
157+
raise MemoryError()
144158
for i in range(m):
145159
self.data.data[i] = orig_data[i]
146160

@@ -164,6 +178,7 @@ cdef class StringVector:
164178
for i in range(self.data.n):
165179
val = self.data.data[i]
166180
ao[i] = val
181+
self.external_view_exists = True
167182
self.data.m = self.data.n
168183
return ao
169184

@@ -174,15 +189,20 @@ cdef class StringVector:
174189

175190
append_data_string(self.data, x)
176191

192+
cdef extend(self, ndarray[:] x):
193+
for i in range(len(x)):
194+
self.append(x[i])
177195

178196
cdef class ObjectVector:
179197

180198
cdef:
181199
PyObject **data
182200
size_t n, m
183201
ndarray ao
202+
bint external_view_exists
184203

185204
def __cinit__(self):
205+
self.external_view_exists = False
186206
self.n = 0
187207
self.m = _INIT_VEC_CAP
188208
self.ao = np.empty(_INIT_VEC_CAP, dtype=object)
@@ -193,19 +213,28 @@ cdef class ObjectVector:
193213

194214
cdef inline append(self, object o):
195215
if self.n == self.m:
216+
if self.external_view_exists:
217+
raise ValueError("external reference but Vector.resize() needed")
196218
self.m = max(self.m * 2, _INIT_VEC_CAP)
197-
self.ao.resize(self.m)
219+
self.ao.resize(self.m, refcheck=False)
198220
self.data = <PyObject**> self.ao.data
199221

200222
Py_INCREF(o)
201223
self.data[self.n] = <PyObject*> o
202224
self.n += 1
203225

204226
def to_array(self):
205-
self.ao.resize(self.n)
206-
self.m = self.n
227+
if self.m != self.n:
228+
if self.external_view_exists:
229+
raise ValueError("should have raised on append()")
230+
self.ao.resize(self.n, refcheck=False)
231+
self.m = self.n
232+
self.external_view_exists = True
207233
return self.ao
208234

235+
cdef extend(self, ndarray[:] x):
236+
for i in range(len(x)):
237+
self.append(x[i])
209238

210239
#----------------------------------------------------------------------
211240
# HashTable
@@ -362,6 +391,9 @@ cdef class {{name}}HashTable(HashTable):
362391

363392
if needs_resize(ud):
364393
with gil:
394+
if uniques.external_view_exists:
395+
raise ValueError("external reference to uniques held, "
396+
"but Vector.resize() needed")
365397
uniques.resize()
366398
append_data_{{dtype}}(ud, val)
367399
labels[i] = count

pandas/_libs/index.pyx

+32-1
Original file line numberDiff line numberDiff line change
@@ -553,7 +553,34 @@ cdef inline bint _is_utc(object tz):
553553
return tz is UTC or isinstance(tz, _du_utc)
554554

555555

556-
cdef class MultiIndexEngine(IndexEngine):
556+
cdef class MultiIndexObjectEngine(ObjectEngine):
557+
"""
558+
provide the same interface as the MultiIndexEngine
559+
but use the IndexEngine for computation
560+
561+
This provides good performance with smaller MI's
562+
"""
563+
def get_indexer(self, values):
564+
# convert a MI to an ndarray
565+
if hasattr(values, 'values'):
566+
values = values.values
567+
return super(MultiIndexObjectEngine, self).get_indexer(values)
568+
569+
cpdef get_loc(self, object val):
570+
571+
# convert a MI to an ndarray
572+
if hasattr(val, 'values'):
573+
val = val.values
574+
return super(MultiIndexObjectEngine, self).get_loc(val)
575+
576+
577+
cdef class MultiIndexHashEngine(ObjectEngine):
578+
"""
579+
Use a hashing based MultiIndex impl
580+
but use the IndexEngine for computation
581+
582+
This provides good performance with larger MI's
583+
"""
557584

558585
def _call_monotonic(self, object mi):
559586
# defer these back to the mi itself
@@ -584,6 +611,10 @@ cdef class MultiIndexEngine(IndexEngine):
584611
except TypeError:
585612
raise KeyError(val)
586613

614+
def get_indexer(self, values):
615+
self._ensure_mapping_populated()
616+
return self.mapping.lookup(values)
617+
587618
cdef _make_hash_table(self, n):
588619
return _hash.MultiIndexHashTable(n)
589620

0 commit comments

Comments
 (0)