Open
Description
Describe the bug
Reported via nvbug: when an AST ColumnReference refers to a string column, cuDF fails with an "Invalid, non-fixed-width type" error.
Steps/Code to reproduce bug
import datetime
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *
from pyspark.sql.window import Window
if __name__ == "__main__":
    # Reproduction script: builds a small DataFrame covering several column
    # types (including a string column), runs a left-semi join with a
    # non-equi condition, and projects isnotnull() over every original column.
    spark = SparkSession.builder.appName("ast_error_rep").getOrCreate()
    try:
        # Adaptive execution is turned off, and the RAPIDS plugin with AST
        # projection is turned on, for this session.
        spark.conf.set('spark.sql.adaptive.enabled', 'false')
        spark.conf.set('spark.rapids.sql.projectAstEnabled', 'true')
        spark.conf.set("spark.rapids.sql.enabled", "true")

        # Two populated rows plus one all-null row.
        data = [
            (True, -121, -5755, 1334097539, 381282419020311609,
             -1.1721174086335895e+38, 2.0819242151822298e+304, 'A3iMw',
             datetime.datetime(7266, 5, 8, 8, 38, 33, 297395),
             datetime.date(9496, 3, 16)),
            (True, 27, 14757, -1576346870, 8658683339966252738,
             7.71262355598091e+37, 2.1026643576591284e-206,
             'im86tIxX5puQJCBEeP',
             datetime.datetime(5660, 9, 29, 8, 7, 51, 885069),
             datetime.date(5764, 10, 27)),
            (None, None, None, None, None, None, None, None, None, None),
        ]

        # Each field's type matches its name; col_byte is ByteType so that it
        # agrees with the integer sample values (-121, 27) supplied above.
        schema = StructType([
            StructField('col_bool', BooleanType(), True),
            StructField('col_byte', ByteType(), True),
            StructField('col_short', ShortType(), True),
            StructField('col_int', IntegerType(), True),
            StructField('col_long', LongType(), True),
            StructField('col_float', FloatType(), True),
            StructField('col_double', DoubleType(), True),
            StructField('col_string', StringType(), True),
            StructField('col_timestamp', TimestampType(), True),
            StructField('col_date', DateType(), True),
        ])

        df = spark.createDataFrame(data, schema)
        # Capture the column names before the helper columns are added so the
        # final projection only covers the original schema fields.
        cols = df.columns

        # Attach a 1-based row id: order by a temporary increasing id, number
        # the rows, then drop the temporary column.
        df = df.withColumn("cidx", F.monotonically_increasing_id())
        window_spec = Window.orderBy("cidx")
        df = df.withColumn("rid", F.row_number().over(window_spec)).drop("cidx")

        # Self left-semi join on a non-equi condition, followed by an
        # isnotnull() projection of every original column plus "rid".
        join_condition = [
            (F.col("t1.col_int") > F.col("t2.col_int")),
            ~(F.col("t1.col_int").isNull()),
        ]
        df_bnlj_non_null_neg = (
            df.alias("t1")
            .join(df.alias("t2"), join_condition, how="leftsemi")
            .selectExpr(
                *[f"isnotnull({c}) as {c}_isnotnull" for c in cols],
                "rid",
            )
        )

        df_bnlj_non_null_neg.show()
        df_bnlj_non_null_neg.explain()
    finally:
        # Always release the session, even if the query above fails.
        spark.stop()
Error:
25/04/10 05:40:09 WARN TaskSetManager: Lost task 0.0 in stage 2.0 (TID 4) (10.176.9.147 executor 0): ai.rapids.cudf.CudfException: CUDF failure at: /home/jenkins/agent/workspace/jenkins-spark-rapids-jni_nightly-pre_release-431-cuda12/thirdparty/cudf/cpp/src/column/column_factories.cpp:159: Invalid, non-fixed-width type.
at ai.rapids.cudf.ast.CompiledExpression.computeColumn(Native Method)
at ai.rapids.cudf.ast.CompiledExpression.computeColumn(CompiledExpression.java:88)
at com.nvidia.spark.rapids.GpuProjectAstExec$$anon$1.$anonfun$next$15(basicPhysicalOperators.scala:765)
at com.nvidia.spark.rapids.RapidsPluginImplicits$MapsSafely.$anonfun$safeMap$1(implicits.scala:166)
at com.nvidia.spark.rapids.RapidsPluginImplicits$MapsSafely.$anonfun$safeMap$1$adapted(implicits.scala:163)
at scala.collection.immutable.List.foreach(List.scala:431)