-
-
Notifications
You must be signed in to change notification settings - Fork 18.5k
BUG: rank with +-inf, #6945 #17903
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
BUG: rank with +-inf, #6945 #17903
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -27,7 +27,7 @@ dtypes = [('object', 'object', 'Infinity()', 'NegInfinity()'), | |
{{if dtype == 'object'}} | ||
|
||
|
||
def rank_1d_{{dtype}}(object in_arr, bint retry=1, ties_method='average', | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Very odd that we had this arg here in the first place. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Btw, where should I put the note? Which file? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Put it in v0.22.0.txt: add a small sub-section in Other Enhancements with a mini-example. |
||
def rank_1d_{{dtype}}(object in_arr, ties_method='average', | ||
ascending=True, na_option='keep', pct=False): | ||
{{else}} | ||
|
||
|
@@ -40,7 +40,7 @@ def rank_1d_{{dtype}}(object in_arr, ties_method='average', ascending=True, | |
""" | ||
|
||
cdef: | ||
Py_ssize_t i, j, n, dups = 0, total_tie_count = 0 | ||
Py_ssize_t i, j, n, dups = 0, total_tie_count = 0, non_na_idx = 0 | ||
|
||
{{if dtype == 'object'}} | ||
ndarray sorted_data, values | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. you can put the sorted_nanmask for all dtypes There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes , you are right. I should add for all dtypes. |
||
|
@@ -50,6 +50,7 @@ def rank_1d_{{dtype}}(object in_arr, ties_method='average', ascending=True, | |
|
||
ndarray[float64_t] ranks | ||
ndarray[int64_t] argsorted | ||
ndarray[np.uint8_t, cast=True] sorted_mask | ||
|
||
{{if dtype == 'uint64'}} | ||
{{ctype}} val | ||
|
@@ -60,6 +61,7 @@ def rank_1d_{{dtype}}(object in_arr, ties_method='average', ascending=True, | |
float64_t sum_ranks = 0 | ||
int tiebreak = 0 | ||
bint keep_na = 0 | ||
bint isnan | ||
float count = 0.0 | ||
tiebreak = tiebreakers[ties_method] | ||
|
||
|
@@ -76,12 +78,6 @@ def rank_1d_{{dtype}}(object in_arr, ties_method='average', ascending=True, | |
|
||
keep_na = na_option == 'keep' | ||
|
||
{{if dtype != 'uint64'}} | ||
if ascending ^ (na_option == 'top'): | ||
nan_value = {{pos_nan_value}} | ||
else: | ||
nan_value = {{neg_nan_value}} | ||
|
||
{{if dtype == 'object'}} | ||
mask = missing.isnaobj(values) | ||
{{elif dtype == 'float64'}} | ||
|
@@ -90,56 +86,69 @@ def rank_1d_{{dtype}}(object in_arr, ties_method='average', ascending=True, | |
mask = values == iNaT | ||
{{endif}} | ||
|
||
# double sort first by mask and then by values to ensure nan values are | ||
# either at the beginning or the end. mask/(~mask) controls padding at | ||
# tail or the head | ||
{{if dtype != 'uint64'}} | ||
if ascending ^ (na_option == 'top'): | ||
nan_value = {{pos_nan_value}} | ||
order = (values, mask) | ||
else: | ||
nan_value = {{neg_nan_value}} | ||
order = (values, ~mask) | ||
np.putmask(values, mask, nan_value) | ||
{{else}} | ||
mask = np.zeros(shape=len(values), dtype=bool) | ||
order = (values, mask) | ||
{{endif}} | ||
|
||
n = len(values) | ||
ranks = np.empty(n, dtype='f8') | ||
|
||
{{if dtype == 'object'}} | ||
|
||
try: | ||
_as = values.argsort() | ||
_as = np.lexsort(keys=order) | ||
except TypeError: | ||
if not retry: | ||
raise | ||
|
||
valid_locs = (~mask).nonzero()[0] | ||
ranks.put(valid_locs, rank_1d_object(values.take(valid_locs), 0, | ||
ties_method=ties_method, | ||
ascending=ascending)) | ||
np.putmask(ranks, mask, np.nan) | ||
return ranks | ||
# lexsort on object array will raise TypeError for numpy version | ||
# earlier than 1.11.0. Use argsort with order argument instead. | ||
_dt = [('values', 'O'), ('mask', '?')] | ||
_values = np.asarray(list(zip(order[0], order[1])), dtype=_dt) | ||
_as = np.argsort(_values, kind='mergesort', order=('mask', 'values')) | ||
{{else}} | ||
if tiebreak == TIEBREAK_FIRST: | ||
# need to use a stable sort here | ||
_as = values.argsort(kind='mergesort') | ||
_as = np.lexsort(keys=order) | ||
if not ascending: | ||
tiebreak = TIEBREAK_FIRST_DESCENDING | ||
else: | ||
_as = values.argsort() | ||
_as = np.lexsort(keys=order) | ||
{{endif}} | ||
|
||
if not ascending: | ||
_as = _as[::-1] | ||
|
||
sorted_data = values.take(_as) | ||
sorted_mask = mask.take(_as) | ||
_indices = order[1].take(_as).nonzero()[0] | ||
non_na_idx = _indices[0] if len(_indices) > 0 else -1 | ||
argsorted = _as.astype('i8') | ||
|
||
{{if dtype == 'object'}} | ||
for i in range(n): | ||
sum_ranks += i + 1 | ||
dups += 1 | ||
|
||
isnan = sorted_mask[i] | ||
val = util.get_value_at(sorted_data, i) | ||
|
||
if (val is nan_value) and keep_na: | ||
if isnan and keep_na: | ||
ranks[argsorted[i]] = nan | ||
continue | ||
|
||
count += 1.0 | ||
|
||
if (i == n - 1 or | ||
are_diff(util.get_value_at(sorted_data, i + 1), val)): | ||
are_diff(util.get_value_at(sorted_data, i + 1), val) or | ||
i == non_na_idx - 1): | ||
if tiebreak == TIEBREAK_AVERAGE: | ||
for j in range(i - dups + 1, i + 1): | ||
ranks[argsorted[j]] = sum_ranks / dups | ||
|
@@ -164,18 +173,19 @@ def rank_1d_{{dtype}}(object in_arr, ties_method='average', ascending=True, | |
for i in range(n): | ||
sum_ranks += i + 1 | ||
dups += 1 | ||
|
||
val = sorted_data[i] | ||
|
||
{{if dtype != 'uint64'}} | ||
if (val == nan_value) and keep_na: | ||
isnan = sorted_mask[i] | ||
if isnan and keep_na: | ||
ranks[argsorted[i]] = nan | ||
continue | ||
{{endif}} | ||
|
||
count += 1.0 | ||
|
||
if i == n - 1 or sorted_data[i + 1] != val: | ||
if (i == n - 1 or sorted_data[i + 1] != val or | ||
i == non_na_idx - 1): | ||
if tiebreak == TIEBREAK_AVERAGE: | ||
for j in range(i - dups + 1, i + 1): | ||
ranks[argsorted[j]] = sum_ranks / dups | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
when you use the ipython directive, it executes the code, so you just need to