@@ -97,7 +97,6 @@ from pandas._libs.missing cimport (
97
97
is_matching_na,
98
98
is_null_datetime64,
99
99
is_null_timedelta64,
100
- isnaobj,
101
100
)
102
101
from pandas._libs.tslibs.conversion cimport convert_to_tsobject
103
102
from pandas._libs.tslibs.nattype cimport (
@@ -1454,6 +1453,7 @@ def infer_dtype(value: object, skipna: bool = True) -> str:
1454
1453
ndarray values
1455
1454
bint seen_pdnat = False
1456
1455
bint seen_val = False
1456
+ flatiter it
1457
1457
1458
1458
if util.is_array(value ):
1459
1459
values = value
@@ -1491,24 +1491,22 @@ def infer_dtype(value: object, skipna: bool = True) -> str:
1491
1491
# This should not be reached
1492
1492
values = values.astype(object )
1493
1493
1494
- # for f-contiguous array 1000 x 1000, passing order="K" gives 5000x speedup
1495
- values = values.ravel(order = " K" )
1496
-
1497
- if skipna:
1498
- values = values[~ isnaobj(values)]
1499
-
1500
1494
n = cnp.PyArray_SIZE(values)
1501
1495
if n == 0 :
1502
1496
return " empty"
1503
1497
1504
1498
# Iterate until we find our first valid value. We will use this
1505
1499
# value to decide which of the is_foo_array functions to call.
1500
+ it = PyArray_IterNew(values)
1506
1501
for i in range (n):
1507
- val = values[i]
1502
+ # The PyArray_GETITEM and PyArray_ITER_NEXT are faster
1503
+ # equivalents to `val = values[i]`
1504
+ val = PyArray_GETITEM(values, PyArray_ITER_DATA(it))
1505
+ PyArray_ITER_NEXT(it)
1508
1506
1509
1507
# do not use checknull to keep
1510
1508
# np.datetime64('nat') and np.timedelta64('nat')
1511
- if val is None or util.is_nan(val):
1509
+ if val is None or util.is_nan(val) or val is C_NA :
1512
1510
pass
1513
1511
elif val is NaT:
1514
1512
seen_pdnat = True
@@ -1520,23 +1518,25 @@ def infer_dtype(value: object, skipna: bool = True) -> str:
1520
1518
if seen_val is False and seen_pdnat is True :
1521
1519
return " datetime"
1522
1520
# float/object nan is handled by the later logic below
1521
+ if seen_val is False and skipna:
1522
+ return " empty"
1523
1523
1524
1524
if util.is_datetime64_object(val):
1525
- if is_datetime64_array(values):
1525
+ if is_datetime64_array(values, skipna = skipna ):
1526
1526
return " datetime64"
1527
1527
1528
1528
elif is_timedelta(val):
1529
- if is_timedelta_or_timedelta64_array(values):
1529
+ if is_timedelta_or_timedelta64_array(values, skipna = skipna ):
1530
1530
return " timedelta"
1531
1531
1532
1532
elif util.is_integer_object(val):
1533
1533
# ordering matters here; this check must come after the is_timedelta
1534
1534
# check otherwise numpy timedelta64 objects would come through here
1535
1535
1536
- if is_integer_array(values):
1536
+ if is_integer_array(values, skipna = skipna ):
1537
1537
return " integer"
1538
- elif is_integer_float_array(values):
1539
- if is_integer_na_array(values):
1538
+ elif is_integer_float_array(values, skipna = skipna ):
1539
+ if is_integer_na_array(values, skipna = skipna ):
1540
1540
return " integer-na"
1541
1541
else :
1542
1542
return " mixed-integer-float"
@@ -1557,7 +1557,7 @@ def infer_dtype(value: object, skipna: bool = True) -> str:
1557
1557
return " time"
1558
1558
1559
1559
elif is_decimal(val):
1560
- if is_decimal_array(values):
1560
+ if is_decimal_array(values, skipna = skipna ):
1561
1561
return " decimal"
1562
1562
1563
1563
elif util.is_complex_object(val):
@@ -1567,8 +1567,8 @@ def infer_dtype(value: object, skipna: bool = True) -> str:
1567
1567
elif util.is_float_object(val):
1568
1568
if is_float_array(values):
1569
1569
return " floating"
1570
- elif is_integer_float_array(values):
1571
- if is_integer_na_array(values):
1570
+ elif is_integer_float_array(values, skipna = skipna ):
1571
+ if is_integer_na_array(values, skipna = skipna ):
1572
1572
return " integer-na"
1573
1573
else :
1574
1574
return " mixed-integer-float"
@@ -1586,15 +1586,18 @@ def infer_dtype(value: object, skipna: bool = True) -> str:
1586
1586
return " bytes"
1587
1587
1588
1588
elif is_period_object(val):
1589
- if is_period_array(values):
1589
+ if is_period_array(values, skipna = skipna ):
1590
1590
return " period"
1591
1591
1592
1592
elif is_interval(val):
1593
1593
if is_interval_array(values):
1594
1594
return " interval"
1595
1595
1596
+ cnp.PyArray_ITER_RESET(it)
1596
1597
for i in range (n):
1597
- val = values[i]
1598
+ val = PyArray_GETITEM(values, PyArray_ITER_DATA(it))
1599
+ PyArray_ITER_NEXT(it)
1600
+
1598
1601
if util.is_integer_object(val):
1599
1602
return " mixed-integer"
1600
1603
@@ -1823,10 +1826,11 @@ cdef class IntegerValidator(Validator):
1823
1826
1824
1827
1825
1828
# Note: only python-exposed for tests
1826
cpdef bint is_integer_array(ndarray values, bint skipna=True):
    """
    Run an IntegerValidator over ``values`` and return its verdict.

    Parameters
    ----------
    values : ndarray
    skipna : bool, default True
        Forwarded unchanged to the validator.

    Returns
    -------
    bool
        The result of ``IntegerValidator.validate(values)``.
    """
    # NOTE(review): len(values) is the first-axis length; confirm it is the
    #  intended count if ``values`` can be multi-dimensional.
    cdef:
        Py_ssize_t length = len(values)
        IntegerValidator checker

    checker = IntegerValidator(length, values.dtype, skipna=skipna)
    return checker.validate(values)
1831
1835
1832
1836
@@ -1837,10 +1841,10 @@ cdef class IntegerNaValidator(Validator):
1837
1841
or (util.is_nan(value) and util.is_float_object(value)))
1838
1842
1839
1843
1840
cdef bint is_integer_na_array(ndarray values, bint skipna=True):
    """
    Run an IntegerNaValidator over ``values`` and return its verdict.

    Parameters
    ----------
    values : ndarray
    skipna : bool, default True
        Forwarded unchanged to the validator.

    Returns
    -------
    bool
        The result of ``IntegerNaValidator.validate(values)``.
    """
    # NOTE(review): len(values) is the first-axis length; confirm it is the
    #  intended count if ``values`` can be multi-dimensional.
    cdef:
        Py_ssize_t length = len(values)
        IntegerNaValidator checker

    checker = IntegerNaValidator(length, values.dtype, skipna=skipna)
    return checker.validate(values)
1845
1849
1846
1850
@@ -1853,10 +1857,11 @@ cdef class IntegerFloatValidator(Validator):
1853
1857
return issubclass (self .dtype.type, np.integer)
1854
1858
1855
1859
1856
cdef bint is_integer_float_array(ndarray values, bint skipna=True):
    """
    Run an IntegerFloatValidator over ``values`` and return its verdict.

    Parameters
    ----------
    values : ndarray
    skipna : bool, default True
        Forwarded unchanged to the validator.

    Returns
    -------
    bool
        The result of ``IntegerFloatValidator.validate(values)``.
    """
    # NOTE(review): len(values) is the first-axis length; confirm it is the
    #  intended count if ``values`` can be multi-dimensional.
    cdef:
        Py_ssize_t length = len(values)
        IntegerFloatValidator checker

    checker = IntegerFloatValidator(length, values.dtype, skipna=skipna)
    return checker.validate(values)
1861
1866
1862
1867
@@ -1900,9 +1905,11 @@ cdef class DecimalValidator(Validator):
1900
1905
return is_decimal(value)
1901
1906
1902
1907
1903
cdef bint is_decimal_array(ndarray values, bint skipna=False):
    """
    Run a DecimalValidator over ``values`` and return its verdict.

    Parameters
    ----------
    values : ndarray
    skipna : bool, default False
        Forwarded unchanged to the validator.

    Returns
    -------
    bool
        The result of ``DecimalValidator.validate(values)``.
    """
    # NOTE(review): len(values) is the first-axis length; confirm it is the
    #  intended count if ``values`` can be multi-dimensional.
    cdef:
        Py_ssize_t length = len(values)
        DecimalValidator checker

    checker = DecimalValidator(length, values.dtype, skipna=skipna)
    return checker.validate(values)
1907
1914
1908
1915
@@ -1997,10 +2004,10 @@ cdef class Datetime64Validator(DatetimeValidator):
1997
2004
1998
2005
1999
2006
# Note: only python-exposed for tests
2000
cpdef bint is_datetime64_array(ndarray values, bint skipna=True):
    """
    Run a Datetime64Validator over ``values`` and return its verdict.

    Parameters
    ----------
    values : ndarray
    skipna : bool, default True
        Forwarded unchanged to the validator.

    Returns
    -------
    bool
        The result of ``Datetime64Validator.validate(values)``.
    """
    # NOTE(review): len(values) is the first-axis length; confirm it is the
    #  intended count if ``values`` can be multi-dimensional.
    cdef:
        Py_ssize_t length = len(values)
        Datetime64Validator checker

    checker = Datetime64Validator(length, skipna=skipna)
    return checker.validate(values)
2005
2012
2006
2013
@@ -2012,10 +2019,10 @@ cdef class AnyDatetimeValidator(DatetimeValidator):
2012
2019
)
2013
2020
2014
2021
2015
cdef bint is_datetime_or_datetime64_array(ndarray values, bint skipna=True):
    """
    Run an AnyDatetimeValidator over ``values`` and return its verdict.

    Parameters
    ----------
    values : ndarray
    skipna : bool, default True
        Forwarded unchanged to the validator.

    Returns
    -------
    bool
        The result of ``AnyDatetimeValidator.validate(values)``.
    """
    # NOTE(review): len(values) is the first-axis length; confirm it is the
    #  intended count if ``values`` can be multi-dimensional.
    cdef:
        Py_ssize_t length = len(values)
        AnyDatetimeValidator checker

    checker = AnyDatetimeValidator(length, skipna=skipna)
    return checker.validate(values)
2020
2027
2021
2028
@@ -2069,13 +2076,13 @@ cdef class AnyTimedeltaValidator(TimedeltaValidator):
2069
2076
2070
2077
2071
2078
# Note: only python-exposed for tests
2072
cpdef bint is_timedelta_or_timedelta64_array(ndarray values, bint skipna=True):
    """
    Infer with timedeltas and/or nat/none, via an AnyTimedeltaValidator.

    Parameters
    ----------
    values : ndarray
    skipna : bool, default True
        Forwarded unchanged to the validator.

    Returns
    -------
    bool
        The result of ``AnyTimedeltaValidator.validate(values)``.
    """
    # NOTE(review): len(values) is the first-axis length; confirm it is the
    #  intended count if ``values`` can be multi-dimensional.
    cdef:
        Py_ssize_t length = len(values)
        AnyTimedeltaValidator checker

    checker = AnyTimedeltaValidator(length, skipna=skipna)
    return checker.validate(values)
2080
2087
2081
2088
@@ -2105,20 +2112,28 @@ cpdef bint is_time_array(ndarray values, bint skipna=False):
2105
2112
return validator.validate(values)
2106
2113
2107
2114
2108
- cdef bint is_period_array(ndarray[object ] values):
2115
+ # FIXME: actually use skipna
2116
+ cdef bint is_period_array(ndarray values, bint skipna = True ):
2109
2117
"""
2110
2118
Is this an ndarray of Period objects (or NaT) with a single `freq`?
2111
2119
"""
2120
+ # values should be object-dtype, but ndarray[object] assumes 1D, while
2121
+ # this _may_ be 2D.
2112
2122
cdef:
2113
- Py_ssize_t i, n = len ( values)
2123
+ Py_ssize_t i, N = values.size
2114
2124
int dtype_code = - 10000 # i.e. c_FreqGroup.FR_UND
2115
2125
object val
2126
+ flatiter it
2116
2127
2117
- if len (values) == 0 :
2128
+ if N == 0 :
2118
2129
return False
2119
2130
2120
- for i in range (n):
2121
- val = values[i]
2131
+ it = PyArray_IterNew(values)
2132
+ for i in range (N):
2133
+ # The PyArray_GETITEM and PyArray_ITER_NEXT are faster
2134
+ # equivalents to `val = values[i]`
2135
+ val = PyArray_GETITEM(values, PyArray_ITER_DATA(it))
2136
+ PyArray_ITER_NEXT(it)
2122
2137
2123
2138
if is_period_object(val):
2124
2139
if dtype_code == - 10000 :
0 commit comments