Implement Series.factorize() #1972
The diff has two hunks. The first adds an `itertools.chain` import:

```diff
@@ -24,6 +24,7 @@
 from collections.abc import Mapping
 from distutils.version import LooseVersion
 from functools import partial, wraps, reduce
+from itertools import chain
 from typing import Any, Generic, Iterable, List, Optional, Tuple, TypeVar, Union, cast

 import matplotlib
```
The second hunk adds `factorize()` to the Series class, right after `_fillna`. The signature and docstring:

```diff
@@ -1913,6 +1914,134 @@ def _fillna(self, value=None, method=None, axis=None, limit=None, part_cols=()):
             )
         )._kser_for(self._column_label)

+    def factorize(
+        self, sort: bool = True, na_sentinel: Optional[int] = -1
+    ) -> Tuple["Series", pd.Index]:
+        """
+        Encode the object as an enumerated type or categorical variable.
+
+        This method is useful for obtaining a numeric representation of an
+        array when all that matters is identifying distinct values.
+
+        Parameters
+        ----------
+        sort : bool, default True
+        na_sentinel : int or None, default -1
+            Value to mark "not found". If None, will not drop the NaN
+            from the uniques of the values.
+
+        Returns
+        -------
+        codes : Series
+            A Series that's an indexer into `uniques`.
+            ``uniques.take(codes)`` will have the same values as `values`.
+        uniques : Index
+            The unique valid values.
+
+            .. note::
+
+               Even if there's a missing value in `values`, `uniques` will
+               *not* contain an entry for it.
+
+        Examples
+        --------
+        >>> kser = ks.Series(['b', None, 'a', 'c', 'b'])
+        >>> codes, uniques = kser.factorize()
+        >>> codes
+        0    1
+        1   -1
+        2    0
+        3    2
+        4    1
+        dtype: int32
+        >>> uniques
+        Index(['a', 'b', 'c'], dtype='object')
+
+        >>> codes, uniques = kser.factorize(na_sentinel=None)
+        >>> codes
+        0    1
+        1    3
+        2    0
+        3    2
+        4    1
+        dtype: int32
+        >>> uniques
+        Index(['a', 'b', 'c', None], dtype='object')
+
+        >>> codes, uniques = kser.factorize(na_sentinel=-2)
+        >>> codes
+        0    1
+        1   -2
+        2    0
+        3    2
+        4    1
+        dtype: int32
+        >>> uniques
+        Index(['a', 'b', 'c'], dtype='object')
+        """
```
The body first asserts the supported argument values (only `sort=True` is handled at this point), then bounds the number of distinct values via `compute.max_rows` before collecting them to the driver:

```diff
+        assert (na_sentinel is None) or isinstance(na_sentinel, int)
+        assert sort is True
+        uniq_sdf = self._internal.spark_frame.select(self.spark.column).distinct()
+
+        # Check the number of uniques and construct the sorted `uniques_list`
+        max_compute_count = get_option("compute.max_rows")
+        if max_compute_count is not None:
+            uniq_pdf = uniq_sdf.limit(max_compute_count + 1).toPandas()
+            if len(uniq_pdf) > max_compute_count:
+                raise ValueError(
+                    "Current Series has more than {0} unique values. "
+                    "Please set 'compute.max_rows' by using 'databricks.koalas.config.set_option' "
+                    "to more than {0} rows. Note that this operation is considerably "
+                    "expensive; consider that before raising 'compute.max_rows'.".format(
+                        max_compute_count
+                    )
+                )
+        else:
+            raise ValueError(
+                "Please set 'compute.max_rows' by using 'databricks.koalas.config.set_option' "
+                "to restrict the total number of unique values of the current Series. "
+                "Note that this operation is considerably expensive; "
+                "consider that before raising 'compute.max_rows'."
+            )
```
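For a Series with more distinct values than the cap, the caller has to raise the option first. A hypothetical usage sketch (the limit 10000 is an arbitrary example value):

```python
import databricks.koalas as ks

# Expensive: all distinct values of the column are collected to the driver,
# so raise the cap deliberately, not by default.
ks.set_option("compute.max_rows", 10000)

kser = ks.Series(['b', None, 'a', 'c', 'b'])
codes, uniques = kser.factorize()
```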
The collected uniques are then sorted with missing values last, and each non-NA unique is assigned a code:

```diff
+        uniques_list = first_series(uniq_pdf).tolist()
+        uniques_list = sorted(uniques_list, key=lambda x: (pd.isna(x), x))
+
+        # Construct `unique_to_code`, mapping each non-NA unique to its code.
+        unique_to_code = {}
+        # Bind `na_sentinel_code` unconditionally: it must exist below even when
+        # na_sentinel is None and the Series contains no missing value.
+        na_sentinel_code = na_sentinel
+        code = 0
+        for unique in uniques_list:
+            if pd.isna(unique):
+                if na_sentinel is None:
+                    na_sentinel_code = code
+            else:
+                unique_to_code[unique] = code
+            code += 1
+
+        kvs = list(
+            chain(*[(F.lit(unique), F.lit(code)) for unique, code in unique_to_code.items()])
+        )
```
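In isolation, the `chain(*...)` idiom just interleaves keys and values into the flat alternating sequence that `F.create_map` expects; here with plain values standing in for the `F.lit` columns:

```python
from itertools import chain

unique_to_code = {"a": 0, "b": 1, "c": 2}
kvs = list(chain(*[(k, v) for k, v in unique_to_code.items()]))
print(kvs)  # ['a', 0, 'b', 1, 'c', 2]
```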
The mapping is applied in Spark: a map literal translates each value to its code, with a null check supplying the sentinel:

```diff
+        map_scol = F.create_map(kvs)
+        null_scol = F.when(self.spark.column.isNull(), F.lit(na_sentinel_code))
+        mapped_scol = map_scol.getItem(self.spark.column)
+
+        new_col = verify_temp_column_name(self.to_frame(), "__new_col__")
+        internal = self._internal.with_new_columns(
+            [null_scol.otherwise(mapped_scol).alias(new_col)]
+        )
```
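A self-contained sketch of this map-plus-null-fallback pattern in plain PySpark; the DataFrame, column name, and mapping below are invented for illustration:

```python
from itertools import chain

from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()
sdf = spark.createDataFrame([("b",), (None,), ("a",)], ["val"])

unique_to_code = {"a": 0, "b": 1}
kvs = list(chain(*[(F.lit(k), F.lit(c)) for k, c in unique_to_code.items()]))

map_scol = F.create_map(kvs)                   # map('a' -> 0, 'b' -> 1)
null_scol = F.when(F.col("val").isNull(), F.lit(-1))
mapped_scol = map_scol.getItem(F.col("val"))   # null when the key is absent

sdf.select(null_scol.otherwise(mapped_scol).alias("code")).show()
# Expected (row order may vary):
# +----+
# |code|
# +----+
# |   1|
# |  -1|
# |   0|
# +----+
```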
Finally, the codes Series and the uniques Index are assembled:

```diff
+        codes = first_series(DataFrame(internal))
+
+        if na_sentinel is not None:
+            # Drop the NaN from the returned uniques.
+            uniques_list = [x for x in uniques_list if not pd.isna(x)]
+
+        uniques = pd.Index(uniques_list)
+
+        return codes, uniques
```
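Worth noting: the two return values live in different worlds. `codes` remains a distributed Koalas Series, while `uniques` is a plain pandas Index, already collected on the driver by the `toPandas()` call above. A quick check:

```python
import databricks.koalas as ks

kser = ks.Series(['b', None, 'a', 'c', 'b'])
codes, uniques = kser.factorize()
print(type(codes))    # <class 'databricks.koalas.series.Series'>
print(type(uniques))  # <class 'pandas.core.indexes.base.Index'>
```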
The hunk closes with unchanged context:

```diff
+
     def dropna(self, axis=0, inplace=False, **kwargs) -> Optional["Series"]:
         """
         Return a new Series with missing values removed.
```