@@ -319,10 +319,10 @@ def is_monotonic(self):
319319 """
320320 Return boolean if values in the object are monotonically increasing.
321321
322- .. note:: the current implementation of is_monotonic_increasing uses Spark's
323- Window without specifying partition specification. This leads to move all data into
324- single partition in single machine and could cause serious
325- performance degradation. Avoid this method against very large dataset .
+        .. note:: the current implementation of is_monotonic requires shuffling
+            and aggregating multiple times to check the order locally and globally,
+            which is potentially expensive. In the case of a multi-index, all data is
+            transferred to a single node, which can easily cause an out-of-memory error.
 
         Returns
         -------
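The new note describes the strategy in words: verify the order within each chunk of data, then verify the order across chunk boundaries. A toy pure-Python analogue of that reasoning, assuming `chunks` is a list of lists standing in for Spark partitions (illustrative only, not the Koalas code path):

```python
def is_monotonic_increasing(chunks):
    # Local check: every chunk must be non-decreasing on its own.
    local = all(a <= b for chunk in chunks for a, b in zip(chunk, chunk[1:]))
    # Global check: each chunk's first value must not drop below the
    # previous chunk's last value.
    boundaries = all(
        prev[-1] <= nxt[0] for prev, nxt in zip(chunks, chunks[1:]) if prev and nxt
    )
    return local and boundaries

print(is_monotonic_increasing([[1, 2], [2, 5], [7]]))  # True
print(is_monotonic_increasing([[1, 4], [3, 5]]))       # False: boundary 4 > 3
```

If both checks pass, the whole sequence is monotonic, and only per-chunk aggregates ever need to leave their chunk.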
@@ -385,12 +385,7 @@ def is_monotonic(self):
         >>> midx.is_monotonic
         False
         """
-        return self._is_monotonic().all()
-
-    def _is_monotonic(self):
-        col = self._scol
-        window = Window.orderBy(NATURAL_ORDER_COLUMN_NAME).rowsBetween(-1, -1)
-        return self._with_new_scol((col >= F.lag(col, 1).over(window)) & col.isNotNull())
+        return self._is_monotonic("increasing")
 
     is_monotonic_increasing = is_monotonic
 
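The removed `_is_monotonic` relied on a `Window.orderBy(...)` with no `partitionBy`, which Spark can only execute by first moving every row into one partition. A standalone PySpark sketch that makes the bottleneck visible (the column names here are made up for illustration):

```python
# Sketch of the removed approach: an un-partitioned window. The physical
# plan contains 'Exchange SinglePartition', i.e. all data on one executor.
from pyspark.sql import SparkSession, Window
from pyspark.sql import functions as F

spark = SparkSession.builder.master("local[2]").getOrCreate()
sdf = spark.range(10).withColumn("__order", F.monotonically_increasing_id())

window = Window.orderBy("__order").rowsBetween(-1, -1)
check = (F.col("id") >= F.lag("id", 1).over(window)) & F.col("id").isNotNull()
sdf.select(check.alias("ok")).explain()  # look for 'Exchange SinglePartition'
```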
@@ -399,10 +394,10 @@ def is_monotonic_decreasing(self):
399394 """
400395 Return boolean if values in the object are monotonically decreasing.
401396
402- .. note:: the current implementation of is_monotonic_decreasing uses Spark's
403- Window without specifying partition specification. This leads to move all data into
404- single partition in single machine and could cause serious
405- performance degradation. Avoid this method against very large dataset .
+        .. note:: the current implementation of is_monotonic_decreasing requires shuffling
+            and aggregating multiple times to check the order locally and globally,
+            which is potentially expensive. In the case of a multi-index, all data is
+            transferred to a single node, which can easily cause an out-of-memory error.
 
         Returns
         -------
@@ -465,12 +460,80 @@ def is_monotonic_decreasing(self):
         >>> midx.is_monotonic_decreasing
         True
         """
-        return self._is_monotonic_decreasing().all()
+        return self._is_monotonic("decreasing")
 
-    def _is_monotonic_decreasing(self):
-        col = self._scol
-        window = Window.orderBy(NATURAL_ORDER_COLUMN_NAME).rowsBetween(-1, -1)
-        return self._with_new_scol((col <= F.lag(col, 1).over(window)) & col.isNotNull())
+    def _is_locally_monotonic_spark_column(self, order):
+        window = (
+            Window.partitionBy(F.col("__partition_id"))
+            .orderBy(NATURAL_ORDER_COLUMN_NAME)
+            .rowsBetween(-1, -1)
+        )
+
+        if order == "increasing":
+            return (F.col("__origin") >= F.lag(F.col("__origin"), 1).over(window)) & F.col(
+                "__origin"
+            ).isNotNull()
+        else:
+            return (F.col("__origin") <= F.lag(F.col("__origin"), 1).over(window)) & F.col(
+                "__origin"
+            ).isNotNull()
+
+    def _is_monotonic(self, order):
+        assert order in ("increasing", "decreasing")
+
+        sdf = self._internal.spark_frame
+
+        sdf = (
+            sdf.select(
+                F.spark_partition_id().alias(
+                    "__partition_id"
+                ),  # Make sure we use the same partition id in the whole job.
+                F.col(NATURAL_ORDER_COLUMN_NAME),
+                self._scol.alias("__origin"),
+            )
+            .select(
+                F.col("__partition_id"),
+                F.col("__origin"),
+                self._is_locally_monotonic_spark_column(order).alias(
+                    "__comparison_within_partition"
+                ),
+            )
+            .groupby(F.col("__partition_id"))
+            .agg(
+                F.min(F.col("__origin")).alias("__partition_min"),
+                F.max(F.col("__origin")).alias("__partition_max"),
+                F.min(F.coalesce(F.col("__comparison_within_partition"), F.lit(True))).alias(
+                    "__comparison_within_partition"
+                ),
+            )
+        )
+
+        # Now we're windowing the aggregation results without partition specification.
+        # The number of rows here will be the same as the number of partitions,
+        # which is expected to be small.
+        window = Window.orderBy(F.col("__partition_id")).rowsBetween(-1, -1)
+        if order == "increasing":
+            comparison_col = F.col("__partition_min") >= F.lag(F.col("__partition_max"), 1).over(
+                window
+            )
+        else:
+            comparison_col = F.col("__partition_min") <= F.lag(F.col("__partition_max"), 1).over(
+                window
+            )
+
+        sdf = sdf.select(
+            comparison_col.alias("__comparison_between_partitions"),
+            F.col("__comparison_within_partition"),
+        )
+
+        ret = sdf.select(
+            F.min(F.coalesce(F.col("__comparison_between_partitions"), F.lit(True)))
+            & F.min(F.coalesce(F.col("__comparison_within_partition"), F.lit(True)))
+        ).collect()[0][0]
+        if ret is None:
+            return True
+        else:
+            return ret
 
     @property
     def ndim(self):
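For reference, here is a self-contained PySpark sketch of the same two-phase technique the new `_is_monotonic` implements, outside of the Koalas internals. `__order` stands in for `NATURAL_ORDER_COLUMN_NAME`, and the input data and column names are made up; treat it as an illustration of the shape of the computation, not the actual code path:

```python
from pyspark.sql import SparkSession, Window
from pyspark.sql import functions as F

spark = SparkSession.builder.master("local[2]").getOrCreate()

sdf = (
    spark.createDataFrame([(v,) for v in [1, 2, 3, 5, 8]], ["value"])
    .withColumn("__order", F.monotonically_increasing_id())
    .withColumn("__pid", F.spark_partition_id())
)

# Phase 1: within each partition, compare every value with its predecessor.
w = Window.partitionBy("__pid").orderBy("__order").rowsBetween(-1, -1)
local_ok = (F.col("value") >= F.lag("value", 1).over(w)) & F.col("value").isNotNull()

agg = (
    sdf.select("__pid", "value", local_ok.alias("__local"))
    .groupBy("__pid")
    .agg(
        F.min("value").alias("__min"),
        F.max("value").alias("__max"),
        F.min(F.coalesce("__local", F.lit(True))).alias("__local"),
    )
)

# Phase 2: across partitions, each partition's min must be at least the
# previous partition's max. This window sees one row per partition, so the
# un-partitioned ordering is cheap.
w2 = Window.orderBy("__pid").rowsBetween(-1, -1)
between_ok = F.col("__min") >= F.lag("__max", 1).over(w2)

flags = agg.select(between_ok.alias("__between"), F.col("__local"))
result = flags.select(
    F.min(F.coalesce("__between", F.lit(True)))
    & F.min(F.coalesce("__local", F.lit(True)))
).first()[0]
print(result)  # True: the input is monotonically increasing
```

Only the per-partition aggregates cross the network, so the expensive un-partitioned window from the old implementation now runs over a handful of rows instead of the whole dataset.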