Skip to content

Commit ae10be6

Browse files
committed
Make is_monotonic/is_monotonic_decreasing distributed
1 parent da3740d commit ae10be6

File tree

3 files changed

+93
-35
lines changed

3 files changed

+93
-35
lines changed

databricks/koalas/base.py

Lines changed: 82 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -319,10 +319,10 @@ def is_monotonic(self):
319319
"""
320320
Return boolean if values in the object are monotonically increasing.
321321
322-
.. note:: the current implementation of is_monotonic_increasing uses Spark's
323-
Window without specifying partition specification. This leads to move all data into
324-
single partition in single machine and could cause serious
325-
performance degradation. Avoid this method against very large dataset.
322+
.. note:: the current implementation of is_monotonic requires shuffling
323+
and aggregating multiple times to check the order locally and globally,
324+
which is potentially expensive. In case of a multi-index, all data are
325+
transferred to a single node, which can easily cause an out-of-memory error.
326326
327327
Returns
328328
-------
@@ -385,12 +385,7 @@ def is_monotonic(self):
385385
>>> midx.is_monotonic
386386
False
387387
"""
388-
return self._is_monotonic().all()
389-
390-
def _is_monotonic(self):
391-
col = self._scol
392-
window = Window.orderBy(NATURAL_ORDER_COLUMN_NAME).rowsBetween(-1, -1)
393-
return self._with_new_scol((col >= F.lag(col, 1).over(window)) & col.isNotNull())
388+
return self._is_monotonic("increasing")
394389

395390
is_monotonic_increasing = is_monotonic
396391

@@ -399,10 +394,10 @@ def is_monotonic_decreasing(self):
399394
"""
400395
Return boolean if values in the object are monotonically decreasing.
401396
402-
.. note:: the current implementation of is_monotonic_decreasing uses Spark's
403-
Window without specifying partition specification. This leads to move all data into
404-
single partition in single machine and could cause serious
405-
performance degradation. Avoid this method against very large dataset.
397+
.. note:: the current implementation of is_monotonic_decreasing requires shuffling
398+
and aggregating multiple times to check the order locally and globally,
399+
which is potentially expensive. In case of a multi-index, all data are transferred
400+
to a single node, which can easily cause an out-of-memory error.
406401
407402
Returns
408403
-------
@@ -465,12 +460,80 @@ def is_monotonic_decreasing(self):
465460
>>> midx.is_monotonic_decreasing
466461
True
467462
"""
468-
return self._is_monotonic_decreasing().all()
463+
return self._is_monotonic("decreasing")
469464

470-
def _is_monotonic_decreasing(self):
471-
col = self._scol
472-
window = Window.orderBy(NATURAL_ORDER_COLUMN_NAME).rowsBetween(-1, -1)
473-
return self._with_new_scol((col <= F.lag(col, 1).over(window)) & col.isNotNull())
465+
def _is_locally_monotonic_spark_column(self, order):
466+
window = (
467+
Window.partitionBy(F.col("__partition_id"))
468+
.orderBy(NATURAL_ORDER_COLUMN_NAME)
469+
.rowsBetween(-1, -1)
470+
)
471+
472+
if order == "increasing":
473+
return (F.col("__origin") >= F.lag(F.col("__origin"), 1).over(window)) & F.col(
474+
"__origin"
475+
).isNotNull()
476+
else:
477+
return (F.col("__origin") <= F.lag(F.col("__origin"), 1).over(window)) & F.col(
478+
"__origin"
479+
).isNotNull()
480+
481+
def _is_monotonic(self, order):
482+
assert order in ("increasing", "decreasing")
483+
484+
sdf = self._internal.spark_frame
485+
486+
sdf = (
487+
sdf.select(
488+
F.spark_partition_id().alias(
489+
"__partition_id"
490+
), # Make sure we use the same partition id in the whole job.
491+
F.col(NATURAL_ORDER_COLUMN_NAME),
492+
self._scol.alias("__origin"),
493+
)
494+
.select(
495+
F.col("__partition_id"),
496+
F.col("__origin"),
497+
self._is_locally_monotonic_spark_column(order).alias(
498+
"__comparison_within_partition"
499+
),
500+
)
501+
.groupby(F.col("__partition_id"))
502+
.agg(
503+
F.min(F.col("__origin")).alias("__partition_min"),
504+
F.max(F.col("__origin")).alias("__partition_max"),
505+
F.min(F.coalesce(F.col("__comparison_within_partition"), F.lit(True))).alias(
506+
"__comparison_within_partition"
507+
),
508+
)
509+
)
510+
511+
# Now we're windowing the aggregation results without partition specification.
512+
# The number of rows here will be the same as the number of partitions, which is expected
513+
# to be small.
514+
window = Window.orderBy(F.col("__partition_id")).rowsBetween(-1, -1)
515+
if order == "increasing":
516+
comparison_col = F.col("__partition_min") >= F.lag(F.col("__partition_max"), 1).over(
517+
window
518+
)
519+
else:
520+
comparison_col = F.col("__partition_min") <= F.lag(F.col("__partition_max"), 1).over(
521+
window
522+
)
523+
524+
sdf = sdf.select(
525+
comparison_col.alias("__comparison_between_partitions"),
526+
F.col("__comparison_within_partition"),
527+
)
528+
529+
ret = sdf.select(
530+
F.min(F.coalesce(F.col("__comparison_between_partitions"), F.lit(True)))
531+
& F.min(F.coalesce(F.col("__comparison_within_partition"), F.lit(True)))
532+
).collect()[0][0]
533+
if ret is None:
534+
return True
535+
else:
536+
return ret
474537

475538
@property
476539
def ndim(self):

databricks/koalas/indexes.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1930,7 +1930,13 @@ def _comparator_for_monotonic_increasing(data_type):
19301930
else:
19311931
return compare_null_last
19321932

1933-
def _is_monotonic(self):
1933+
def _is_monotonic(self, order):
1934+
if order == "increasing":
1935+
return self._is_monotonic_increasing().all()
1936+
else:
1937+
return self._is_monotonic_decreasing().all()
1938+
1939+
def _is_monotonic_increasing(self):
19341940
scol = self._scol
19351941
window = Window.orderBy(NATURAL_ORDER_COLUMN_NAME).rowsBetween(-1, -1)
19361942
prev = F.lag(scol, 1).over(window)

databricks/koalas/indexing.py

Lines changed: 4 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -568,30 +568,19 @@ def _select_rows(self, rows_sel):
568568
if (start is None and rows_sel.start is not None) or (
569569
stop is None and rows_sel.stop is not None
570570
):
571-
inc, dec = (
572-
sdf.select(
573-
index_column._is_monotonic()._scol.alias("__increasing__"),
574-
index_column._is_monotonic_decreasing()._scol.alias("__decreasing__"),
575-
)
576-
.select(
577-
F.min(F.coalesce("__increasing__", F.lit(True))),
578-
F.min(F.coalesce("__decreasing__", F.lit(True))),
579-
)
580-
.first()
581-
)
582571
if start is None and rows_sel.start is not None:
583572
start = rows_sel.start
584-
if inc is not False:
573+
if index_column.is_monotonic_increasing is not False:
585574
cond.append(index_column._scol >= F.lit(start).cast(index_data_type))
586-
elif dec is not False:
575+
elif index_column.is_monotonic_decreasing is not False:
587576
cond.append(index_column._scol <= F.lit(start).cast(index_data_type))
588577
else:
589578
raise KeyError(rows_sel.start)
590579
if stop is None and rows_sel.stop is not None:
591580
stop = rows_sel.stop
592-
if inc is not False:
581+
if index_column.is_monotonic_increasing is not False:
593582
cond.append(index_column._scol <= F.lit(stop).cast(index_data_type))
594-
elif dec is not False:
583+
elif index_column.is_monotonic_decreasing is not False:
595584
cond.append(index_column._scol >= F.lit(stop).cast(index_data_type))
596585
else:
597586
raise KeyError(rows_sel.stop)

0 commit comments

Comments
 (0)