Commit b32203b

Refresh GitHub workflow matrix. (#2083)
Now that Spark 3.1 has been released, we should refresh the GitHub Actions workflow matrix to focus more on the newer versions. The new matrix:

| Python | PySpark | pandas | PyArrow |
| :---: | :---: | :---: | :---: |
| 3.5 | 2.3.4 | 0.23.4 | 0.16.0 |
| 3.6 | 2.3.4 | 0.24.2 | 0.10.0 |
| 3.6 | 2.4.7 | 0.24.2 | 0.14.1 |
| 3.7 | 2.4.7 | 0.25.3 | 0.15.1 |
| 3.7 | 3.0.2 | 1.0.5 | 1.0.1 |
| 3.7 | 3.1.1 | 1.1.5 | 2.0.0 |
| 3.8 | 3.0.2 | 1.1.5 | 2.0.0 |
| 3.8 | 3.1.1 | 1.2.3 | 3.0.0 |
1 parent 54a9502 commit b32203b

3 files changed, +30 -32 lines changed

.github/workflows/master.yml

Lines changed: 12 additions & 10 deletions
```diff
@@ -89,26 +89,26 @@ jobs:
           pandas-version: 0.24.2
           pyarrow-version: 0.14.1
           logger: databricks.koalas.usage_logging.usage_logger
-        - python-version: 3.6
-          spark-version: 2.4.7
-          pandas-version: 0.25.3
-          pyarrow-version: 0.15.1
-          default-index-type: 'distributed-sequence'
         - python-version: 3.7
           spark-version: 2.4.7
           pandas-version: 0.25.3
-          pyarrow-version: 0.14.1
-        - python-version: 3.7
-          spark-version: 2.4.7
-          pandas-version: 1.0.5
           pyarrow-version: 0.15.1
           default-index-type: 'distributed-sequence'
         - python-version: 3.7
           spark-version: 3.0.2
-          pandas-version: 0.25.3
+          pandas-version: 1.0.5
           pyarrow-version: 1.0.1
+        - python-version: 3.7
+          spark-version: 3.1.1
+          pandas-version: 1.1.5
+          pyarrow-version: 2.0.0
+          default-index-type: 'distributed-sequence'
         - python-version: 3.8
           spark-version: 3.0.2
+          pandas-version: 1.1.5
+          pyarrow-version: 2.0.0
+        - python-version: 3.8
+          spark-version: 3.1.1
           pandas-version: 1.2.3
           pyarrow-version: 3.0.0
           default-index-type: 'distributed-sequence'
@@ -151,6 +151,8 @@ jobs:
           fi
           conda install -c conda-forge --yes pandas==$PANDAS_VERSION pyarrow==$PYARROW_VERSION
           sed -i -e "/pandas/d" -e "/pyarrow/d" requirements-dev.txt
+          # Disable mypy check for PySpark 3.1
+          if [[ "SPARK_VERSION" > "3.1" ]]; then sed -i '/mypy/d' requirements-dev.txt; fi
           # sphinx-plotly-directive is not available on Conda.
           sed -i '/sphinx-plotly-directive/d' requirements-dev.txt
           conda install -c conda-forge --yes --file requirements-dev.txt
```
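The second hunk adds a step that drops `mypy` from `requirements-dev.txt` when the matrix entry targets PySpark 3.1, as the comment above the new line states. A rough Python sketch of that gating intent, as an illustration only: the helper name and the tuple-based comparison are assumptions, not what the workflow runs (the workflow uses the shell test shown in the diff).

```python
# Illustration of the intent "disable the mypy check for PySpark 3.1 (and newer)".
# Hypothetical helper; the workflow itself uses the bash conditional in the diff above.
def should_skip_mypy(spark_version: str) -> bool:
    major, minor = (int(part) for part in spark_version.split(".")[:2])
    return (major, minor) >= (3, 1)


assert should_skip_mypy("3.1.1") is True
assert should_skip_mypy("3.0.2") is False
assert should_skip_mypy("2.4.7") is False
```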

databricks/koalas/indexes/multi.py

Lines changed: 14 additions & 18 deletions
```diff
@@ -480,21 +480,19 @@ def _is_monotonic(self, order):
         return self._is_monotonic_decreasing().all()

     def _is_monotonic_increasing(self):
-        scol = self.spark.column
         window = Window.orderBy(NATURAL_ORDER_COLUMN_NAME).rowsBetween(-1, -1)
-        prev = F.lag(scol, 1).over(window)

         cond = F.lit(True)
         has_not_null = F.lit(True)
-        for field in self.spark.data_type[::-1]:
-            left = scol.getField(field.name)
-            right = prev.getField(field.name)
-            compare = MultiIndex._comparator_for_monotonic_increasing(field.dataType)
+        for scol in self._internal.index_spark_columns[::-1]:
+            data_type = self._internal.spark_type_for(scol)
+            prev = F.lag(scol, 1).over(window)
+            compare = MultiIndex._comparator_for_monotonic_increasing(data_type)
             # Since pandas 1.1.4, null value is not allowed at any levels of MultiIndex.
             # Therefore, we should check `has_not_null` over the all levels.
-            has_not_null = has_not_null & left.isNotNull()
-            cond = F.when(left.eqNullSafe(right), cond).otherwise(
-                compare(left, right, spark.Column.__gt__)
+            has_not_null = has_not_null & scol.isNotNull()
+            cond = F.when(scol.eqNullSafe(prev), cond).otherwise(
+                compare(scol, prev, spark.Column.__gt__)
             )

         cond = has_not_null & (prev.isNull() | cond)
@@ -524,21 +522,19 @@ def _comparator_for_monotonic_decreasing(data_type):
         return compare_disallow_null

     def _is_monotonic_decreasing(self):
-        scol = self.spark.column
         window = Window.orderBy(NATURAL_ORDER_COLUMN_NAME).rowsBetween(-1, -1)
-        prev = F.lag(scol, 1).over(window)

         cond = F.lit(True)
         has_not_null = F.lit(True)
-        for field in self.spark.data_type[::-1]:
-            left = scol.getField(field.name)
-            right = prev.getField(field.name)
-            compare = MultiIndex._comparator_for_monotonic_decreasing(field.dataType)
+        for scol in self._internal.index_spark_columns[::-1]:
+            data_type = self._internal.spark_type_for(scol)
+            prev = F.lag(scol, 1).over(window)
+            compare = MultiIndex._comparator_for_monotonic_increasing(data_type)
             # Since pandas 1.1.4, null value is not allowed at any levels of MultiIndex.
             # Therefore, we should check `has_not_null` over the all levels.
-            has_not_null = has_not_null & left.isNotNull()
-            cond = F.when(left.eqNullSafe(right), cond).otherwise(
-                compare(left, right, spark.Column.__lt__)
+            has_not_null = has_not_null & scol.isNotNull()
+            cond = F.when(scol.eqNullSafe(prev), cond).otherwise(
+                compare(scol, prev, spark.Column.__lt__)
             )

         cond = has_not_null & (prev.isNull() | cond)
```
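In both methods the change is the same: instead of packing the index into a single struct column and reading each level with `getField`, the loop now iterates over `self._internal.index_spark_columns` directly, lagging each level's column by one row and only falling back to the condition built from the later levels when the current level ties with the previous row. A standalone sketch of that lag-and-compare pattern on a plain Spark DataFrame follows; it is illustrative only — the column names and data are made up, and a bare `>` stands in for the type-dispatched comparator the real code selects.

```python
# A minimal, self-contained sketch of the lag-and-compare pattern above, run on a
# plain DataFrame whose "index levels" are just ordinary columns (illustrative data).
from pyspark.sql import SparkSession, Window
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(
    [("a", 1), ("a", 2), ("b", 1)], ["level_0", "level_1"]
).withColumn("order", F.monotonically_increasing_id())

# Each row is compared against the immediately preceding row only.
window = Window.orderBy("order").rowsBetween(-1, -1)

cond = F.lit(True)
has_not_null = F.lit(True)
prev = None
for name in ["level_0", "level_1"][::-1]:  # later (less significant) levels first
    scol = F.col(name)
    prev = F.lag(scol, 1).over(window)
    has_not_null = has_not_null & scol.isNotNull()
    # On a tie at this level, defer to the comparison of the later levels;
    # otherwise this level alone decides (a bare ">" replaces the type-dispatched
    # comparator used in the real implementation).
    cond = F.when(scol.eqNullSafe(prev), cond).otherwise(scol > prev)

# The first row has no predecessor (prev is null) and counts as monotonic so far.
cond = has_not_null & (prev.isNull() | cond)
df.select("level_0", "level_1", cond.alias("monotonic_vs_prev")).show()
```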

databricks/koalas/internal.py

Lines changed: 4 additions & 4 deletions
```diff
@@ -445,8 +445,8 @@ def __init__(
         |              zoo|              bar|     7|     8|     9|...
         +-----------------+-----------------+------+------+------+...

-        >>> internal.index_spark_columns
-        [Column<b'__index_level_0__'>, Column<b'__index_level_1__'>, Column<b'(a, x)'>]
+        >>> internal.index_spark_columns  # doctest: +SKIP
+        [Column<'__index_level_0__'>, Column<'__index_level_1__'>, Column<'(a, x)'>]

         >>> internal.index_names
         [('row_index_a',), ('row_index_b',), ('a', 'x')]
@@ -457,8 +457,8 @@ def __init__(
         >>> internal.column_labels
         [('a', 'y'), ('b', 'z')]

-        >>> internal.data_spark_columns
-        [Column<b'(a, y)'>, Column<b'(b, z)'>]
+        >>> internal.data_spark_columns  # doctest: +SKIP
+        [Column<'(a, y)'>, Column<'(b, z)'>]

         >>> internal.data_dtypes
         [dtype('int64'), dtype('int64')]
```
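These doctests are now marked `# doctest: +SKIP` because the printed `repr` of a Spark `Column` is not stable across the PySpark versions in the matrix: the old expected output shows the `Column<b'...'>` form, the new one the `Column<'...'>` form, so no single expected line can match every build. A small sketch of how the directive behaves (plain `doctest` semantics, not koalas-specific code):

```python
# The +SKIP directive tells doctest not to run the example at all, so output that
# differs between library versions (such as a repr) can never fail the test.
def column_repr_example():
    """
    >>> from pyspark.sql import functions as F  # doctest: +SKIP
    >>> F.col("__index_level_0__")              # doctest: +SKIP
    Column<'__index_level_0__'>
    """


if __name__ == "__main__":
    import doctest

    # Passes regardless of which PySpark (if any) is installed, because the
    # skipped examples are never executed.
    doctest.testmod()
```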
