From afc34ad57f8525fbd0f1f855c5263946ec689f91 Mon Sep 17 00:00:00 2001 From: Razvan-Daniel Mihai <84674+razvan@users.noreply.github.com> Date: Fri, 21 Mar 2025 16:25:05 +0200 Subject: [PATCH 01/14] fix(spark): JAVA_HOME needs to point to OpenJDK --- spark-k8s/Dockerfile | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/spark-k8s/Dockerfile b/spark-k8s/Dockerfile index 0ca95319d..b7532c8aa 100644 --- a/spark-k8s/Dockerfile +++ b/spark-k8s/Dockerfile @@ -306,6 +306,7 @@ RUN < Date: Fri, 21 Mar 2025 17:52:11 +0200 Subject: [PATCH 02/14] move env directive out of heredoc --- spark-k8s/Dockerfile | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/spark-k8s/Dockerfile b/spark-k8s/Dockerfile index b7532c8aa..379cee15b 100644 --- a/spark-k8s/Dockerfile +++ b/spark-k8s/Dockerfile @@ -319,10 +319,6 @@ microdnf install \ microdnf clean all rm -rf /var/cache/yum -# The base image (java-base) defines this to point to a JRE installation. -# Spark Connect requires it to point to a JDK installation. -ENV JAVA_HOME="/usr/lib/jvm/java-${JAVA_VERSION}-openjdk" - ln -s /usr/bin/python${PYTHON} /usr/bin/python ln -s /usr/bin/pip-${PYTHON} /usr/bin/pip @@ -336,6 +332,11 @@ chown -R ${STACKABLE_USER_UID}:0 /stackable chmod -R g=u /stackable EOF +# The base image (java-base) defines this to point to a JRE installation. +# Spark Connect requires it to point to a JDK installation. +ENV JAVA_HOME="/usr/lib/jvm/java-${JAVA_VERSION}-openjdk" + + # ---------------------------------------- # Attention: We are changing the group of all files in /stackable directly above # If you do any file based actions (copying / creating etc.) below this comment you From bea57685b5bf2fb99ed9723e879b6b4424174ce5 Mon Sep 17 00:00:00 2001 From: Razvan-Daniel Mihai <84674+razvan@users.noreply.github.com> Date: Mon, 24 Mar 2025 16:32:00 +0200 Subject: [PATCH 03/14] add spark-connect jars to dist/connect folder --- spark-k8s/Dockerfile | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/spark-k8s/Dockerfile b/spark-k8s/Dockerfile index 379cee15b..ef7e94dff 100644 --- a/spark-k8s/Dockerfile +++ b/spark-k8s/Dockerfile @@ -247,6 +247,15 @@ RUN curl -O https://repo.stackable.tech/repository/packages/jackson-dataformat-x && curl -O https://repo.stackable.tech/repository/packages/stax2-api/stax2-api-${STAX2_API}.jar \ && curl -O https://repo.stackable.tech/repository/packages/woodstox-core/woodstox-core-${WOODSTOX_CORE}.jar +WORKDIR /stackable/spark-${PRODUCT}/dist/connect + +# As of version 3.5.5, spark-connect jars are not included in the dist folder. +# To avoid classpath conflicts with existing spark applications, +# we create a new dist/connect folder, and copy them here. +RUN cp /stackable/spark-${PRODUCT}/connector/connect/server/target/spark-connect_*-${PRODUCT}.jar . \ + && cp /stackable/spark-${PRODUCT}/connector/connect/common/target/spark-connect-common_*-${PRODUCT}.jar . 
\ + && cp /stackable/spark-${PRODUCT}/connector/connect/client/jvm/target/spark-connect-client-jvm_2.12-${PRODUCT}.jar + WORKDIR /stackable/jmx RUN curl -O "https://repo.stackable.tech/repository/packages/jmx-exporter/jmx_prometheus_javaagent-${JMX_EXPORTER}.jar" From 02e379844d455a43a011c2d5d648ae76c1d3df2a Mon Sep 17 00:00:00 2001 From: Razvan-Daniel Mihai <84674+razvan@users.noreply.github.com> Date: Mon, 24 Mar 2025 16:49:28 +0200 Subject: [PATCH 04/14] fix typo --- spark-k8s/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spark-k8s/Dockerfile b/spark-k8s/Dockerfile index ef7e94dff..70225174a 100644 --- a/spark-k8s/Dockerfile +++ b/spark-k8s/Dockerfile @@ -254,7 +254,7 @@ WORKDIR /stackable/spark-${PRODUCT}/dist/connect # we create a new dist/connect folder, and copy them here. RUN cp /stackable/spark-${PRODUCT}/connector/connect/server/target/spark-connect_*-${PRODUCT}.jar . \ && cp /stackable/spark-${PRODUCT}/connector/connect/common/target/spark-connect-common_*-${PRODUCT}.jar . \ - && cp /stackable/spark-${PRODUCT}/connector/connect/client/jvm/target/spark-connect-client-jvm_2.12-${PRODUCT}.jar + && cp /stackable/spark-${PRODUCT}/connector/connect/client/jvm/target/spark-connect-client-jvm_2.12-${PRODUCT}.jar . WORKDIR /stackable/jmx From 7a259d2fd3784f16f073c07fcda3e374cc094afc Mon Sep 17 00:00:00 2001 From: Razvan-Daniel Mihai <84674+razvan@users.noreply.github.com> Date: Mon, 24 Mar 2025 18:30:45 +0200 Subject: [PATCH 05/14] added spark-connect-client image --- conf.py | 2 + spark-connect-client/Dockerfile | 81 +++++++ .../licenses/JMX_EXPORTER_LICENSE | 201 ++++++++++++++++++ spark-connect-client/licenses/SPARK_LICENSE | 201 ++++++++++++++++++ .../licenses/STACKABLE_LICENSE | 43 ++++ .../python/simple-connect-app.py | 21 ++ spark-connect-client/versions.py | 8 + 7 files changed, 557 insertions(+) create mode 100644 spark-connect-client/Dockerfile create mode 100644 spark-connect-client/licenses/JMX_EXPORTER_LICENSE create mode 100644 spark-connect-client/licenses/SPARK_LICENSE create mode 100644 spark-connect-client/licenses/STACKABLE_LICENSE create mode 100755 spark-connect-client/stackable/spark-connect-examples/python/simple-connect-app.py create mode 100644 spark-connect-client/versions.py diff --git a/conf.py b/conf.py index e863dd145..85a818d58 100644 --- a/conf.py +++ b/conf.py @@ -36,6 +36,7 @@ zookeeper = importlib.import_module("zookeeper.versions") tools = importlib.import_module("tools.versions") statsd_exporter = importlib.import_module("statsd_exporter.versions") +spark_connect_client = importlib.import_module("spark-connect-client.versions") products = [ {"name": "airflow", "versions": airflow.versions}, @@ -64,6 +65,7 @@ {"name": "zookeeper", "versions": zookeeper.versions}, {"name": "tools", "versions": tools.versions}, {"name": "statsd_exporter", "versions": statsd_exporter.versions}, + {"name": "spark-connect-client", "versions": spark_connect_client.versions}, ] open_shift_projects = { diff --git a/spark-connect-client/Dockerfile b/spark-connect-client/Dockerfile new file mode 100644 index 000000000..c5902e52b --- /dev/null +++ b/spark-connect-client/Dockerfile @@ -0,0 +1,81 @@ +# syntax=docker/dockerfile:1.10.0@sha256:865e5dd094beca432e8c0a1d5e1c465db5f998dca4e439981029b3b81fb39ed5 +# check=error=true + +# hadoop-builder: Provides Hadoop libraries +FROM stackable/image/spark-k8s AS spark-builder + +FROM stackable/image/java-base AS final + +ARG PRODUCT +ARG PYTHON +ARG RELEASE +ARG STACKABLE_USER_UID + +LABEL name="Stackable 
Spark Connect Examples" \ + maintainer="info@stackable.tech" \ + vendor="Stackable GmbH" \ + version="${PRODUCT}" \ + release="${RELEASE}" \ + summary="Spark Connect Examples" \ + description="Spark Connect Examples" + + +ENV HOME=/stackable +ENV SPARK_HOME=/stackable/spark +ENV PATH=$SPARK_HOME:$PATH:/bin:$JAVA_HOME/bin:$JAVA_HOME/jre/bin:$HOME/.local/bin +ENV PYSPARK_PYTHON=/usr/bin/python +ENV PYTHONPATH=$SPARK_HOME/python + +COPY spark-k8s/stackable /stackable +COPY spark-k8s/licenses /licenses + +COPY --chown=${STACKABLE_USER_UID}:0 --from=spark-builder /stackable/spark /stackable/spark +COPY --from=spark-builder /usr/bin/tini /usr/bin/tini + +RUN <" or with a notice of your own that is not confusingly similar to the notice in this License; and (iii) You may not claim that your original works are open source software unless your Modified License has been approved by Open Source Initiative (OSI) and You comply with its license review and certification process. diff --git a/spark-connect-client/stackable/spark-connect-examples/python/simple-connect-app.py b/spark-connect-client/stackable/spark-connect-examples/python/simple-connect-app.py new file mode 100755 index 000000000..c3b230871 --- /dev/null +++ b/spark-connect-client/stackable/spark-connect-examples/python/simple-connect-app.py @@ -0,0 +1,21 @@ +from pyspark.sql import SparkSession + +if __name__ == "__main__": + spark = ( + SparkSession.builder.appName("SampleConnectApp") + .remote("sc://localhost") + .getOrCreate() + ) + + # See https://issues.apache.org/jira/browse/SPARK-46032 + spark.addArtifacts("/stackable/spark/connect/spark-connect_2.12-3.5.5.jar") + + logFile = "/stackable/spark/README.md" + logData = spark.read.text(logFile).cache() + + numAs = logData.filter(logData.value.contains("a")).count() + numBs = logData.filter(logData.value.contains("b")).count() + + print("Lines with a: %i, lines with b: %i" % (numAs, numBs)) + + spark.stop() diff --git a/spark-connect-client/versions.py b/spark-connect-client/versions.py new file mode 100644 index 000000000..c0d2a9351 --- /dev/null +++ b/spark-connect-client/versions.py @@ -0,0 +1,8 @@ +versions = [ + { + "product": "3.5.5", + "spark-k8s": "3.5.5", + "java-base": "17", + "python": "3.11", + }, +] From e3265c9ab9ce0a895a3960b9e7c58bf880a39462 Mon Sep 17 00:00:00 2001 From: Razvan-Daniel Mihai <84674+razvan@users.noreply.github.com> Date: Tue, 25 Mar 2025 11:54:51 +0200 Subject: [PATCH 06/14] Use Temurin instead of OpenJDK --- spark-k8s/Dockerfile | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/spark-k8s/Dockerfile b/spark-k8s/Dockerfile index 70225174a..7eea4dd50 100644 --- a/spark-k8s/Dockerfile +++ b/spark-k8s/Dockerfile @@ -280,6 +280,8 @@ COPY shared/log4shell_scanner /bin/log4shell_scanner RUN /bin/log4shell_scanner s /stackable/spark-${PRODUCT}/dist # === +# TODO: java-base installs the Adoptium dnf repo and the Termurin jre which is not needed here. +# To reduce the size of this image, the Adoptium repo could be moved to stackable-base instead. FROM stackable/image/java-base AS final ARG PRODUCT @@ -299,7 +301,9 @@ LABEL name="Apache Spark" \ ENV HOME=/stackable ENV SPARK_HOME=/stackable/spark -ENV PATH=$SPARK_HOME:$PATH:/bin:$JAVA_HOME/bin:$JAVA_HOME/jre/bin:$HOME/.local/bin +# Override the java-base version of JAVA_HOME to point to the jdk. 
+ENV JAVA_HOME="/usr/lib/jvm/temurin-${JAVA_VERSION}-jdk" +ENV PATH=$SPARK_HOME/bin:$JAVA_HOME/bin:$PATH ENV PYSPARK_PYTHON=/usr/bin/python ENV PYTHONPATH=$SPARK_HOME/python @@ -313,18 +317,18 @@ COPY --from=spark-builder /usr/bin/tini /usr/bin/tini RUN < Date: Tue, 25 Mar 2025 12:33:55 +0200 Subject: [PATCH 07/14] cleanup spark-connect-client --- spark-connect-client/Dockerfile | 38 +--- .../licenses/JMX_EXPORTER_LICENSE | 201 ------------------ spark-connect-client/licenses/SPARK_LICENSE | 201 ------------------ .../licenses/STACKABLE_LICENSE | 43 ---- 4 files changed, 8 insertions(+), 475 deletions(-) delete mode 100644 spark-connect-client/licenses/JMX_EXPORTER_LICENSE delete mode 100644 spark-connect-client/licenses/SPARK_LICENSE delete mode 100644 spark-connect-client/licenses/STACKABLE_LICENSE diff --git a/spark-connect-client/Dockerfile b/spark-connect-client/Dockerfile index c5902e52b..6b8bb6955 100644 --- a/spark-connect-client/Dockerfile +++ b/spark-connect-client/Dockerfile @@ -1,10 +1,9 @@ # syntax=docker/dockerfile:1.10.0@sha256:865e5dd094beca432e8c0a1d5e1c465db5f998dca4e439981029b3b81fb39ed5 -# check=error=true -# hadoop-builder: Provides Hadoop libraries +# spark-builder: provides client libs for spark-connect FROM stackable/image/spark-k8s AS spark-builder -FROM stackable/image/java-base AS final +FROM stackable/image/java-base ARG PRODUCT ARG PYTHON @@ -17,36 +16,21 @@ LABEL name="Stackable Spark Connect Examples" \ version="${PRODUCT}" \ release="${RELEASE}" \ summary="Spark Connect Examples" \ - description="Spark Connect Examples" + description="Spark Connect client libraries for Python and the JVM, including some examples." ENV HOME=/stackable -ENV SPARK_HOME=/stackable/spark -ENV PATH=$SPARK_HOME:$PATH:/bin:$JAVA_HOME/bin:$JAVA_HOME/jre/bin:$HOME/.local/bin -ENV PYSPARK_PYTHON=/usr/bin/python -ENV PYTHONPATH=$SPARK_HOME/python -COPY spark-k8s/stackable /stackable -COPY spark-k8s/licenses /licenses - -COPY --chown=${STACKABLE_USER_UID}:0 --from=spark-builder /stackable/spark /stackable/spark -COPY --from=spark-builder /usr/bin/tini /usr/bin/tini +COPY spark-connect-client/stackable/spark-connect-examples /stackable/spark-connect-examples +COPY --chown=${STACKABLE_USER_UID}:0 --from=spark-builder /stackable/spark/connect /stackable/spark/connect RUN <" or with a notice of your own that is not confusingly similar to the notice in this License; and (iii) You may not claim that your original works are open source software unless your Modified License has been approved by Open Source Initiative (OSI) and You comply with its license review and certification process. From 1243a1e5cdf5a12ea47f597a8c817d1399c94a28 Mon Sep 17 00:00:00 2001 From: Razvan-Daniel Mihai <84674+razvan@users.noreply.github.com> Date: Tue, 25 Mar 2025 12:37:46 +0200 Subject: [PATCH 08/14] changelog --- CHANGELOG.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2897fdb86..63ea2aa3b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,14 @@ All notable changes to this project will be documented in this file. 
## [Unreleased] +### Added + +- spark-connect-client: A new image for Spark connect tests and demos ([#1034]) + +### Changed + +- spark-k8s: Replace OpenJDK with Temurin JDK and some cleanup ([#1034]) + ## [25.3.0] - 2025-03-21 ### Added From f979808adfee826e40d0d82634f88acbd4f06e5f Mon Sep 17 00:00:00 2001 From: Razvan-Daniel Mihai <84674+razvan@users.noreply.github.com> Date: Tue, 25 Mar 2025 14:43:33 +0200 Subject: [PATCH 09/14] make connect app configurable --- .../spark-connect-examples/python/simple-connect-app.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/spark-connect-client/stackable/spark-connect-examples/python/simple-connect-app.py b/spark-connect-client/stackable/spark-connect-examples/python/simple-connect-app.py index c3b230871..d3176b338 100755 --- a/spark-connect-client/stackable/spark-connect-examples/python/simple-connect-app.py +++ b/spark-connect-client/stackable/spark-connect-examples/python/simple-connect-app.py @@ -1,9 +1,12 @@ +import sys + from pyspark.sql import SparkSession if __name__ == "__main__": + remote: str = sys.argv[1] spark = ( - SparkSession.builder.appName("SampleConnectApp") - .remote("sc://localhost") + SparkSession.builder.appName("SimpleSparkConnectApp") + .remote(remote) .getOrCreate() ) From 91ba78d016838193db1eec249a6bc3044041d7e9 Mon Sep 17 00:00:00 2001 From: Razvan-Daniel Mihai <84674+razvan@users.noreply.github.com> Date: Tue, 25 Mar 2025 14:50:22 +0200 Subject: [PATCH 10/14] cleanup comment --- spark-k8s/Dockerfile | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/spark-k8s/Dockerfile b/spark-k8s/Dockerfile index 7eea4dd50..63934552f 100644 --- a/spark-k8s/Dockerfile +++ b/spark-k8s/Dockerfile @@ -318,9 +318,11 @@ COPY --from=spark-builder /usr/bin/tini /usr/bin/tini RUN < Date: Tue, 25 Mar 2025 14:54:18 +0200 Subject: [PATCH 11/14] changelog --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 63ea2aa3b..7354a9891 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,7 +10,7 @@ All notable changes to this project will be documented in this file. ### Changed -- spark-k8s: Replace OpenJDK with Temurin JDK and some cleanup ([#1034]) +- spark-k8s: Include spark-connect jars. Replace OpenJDK with Temurin JDK. Cleanup. ([#1034]) ## [25.3.0] - 2025-03-21 From 4de1d45e775ec9e8076d84f0d42d8444652e0b89 Mon Sep 17 00:00:00 2001 From: Razvan-Daniel Mihai <84674+razvan@users.noreply.github.com> Date: Tue, 25 Mar 2025 15:00:47 +0200 Subject: [PATCH 12/14] pr uri --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7354a9891..fd563b049 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,8 @@ All notable changes to this project will be documented in this file. - spark-k8s: Include spark-connect jars. Replace OpenJDK with Temurin JDK. Cleanup. 
([#1034]) +[#1034]: https://github.com/stackabletech/docker-images/pull/1034 + ## [25.3.0] - 2025-03-21 ### Added From e5bae9826b5922f9081f6e301293f647553ebbae Mon Sep 17 00:00:00 2001 From: Malte Sander Date: Fri, 28 Mar 2025 16:07:18 +0100 Subject: [PATCH 13/14] fix merge --- spark-k8s/Dockerfile | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/spark-k8s/Dockerfile b/spark-k8s/Dockerfile index 311833b34..703fd4ce3 100644 --- a/spark-k8s/Dockerfile +++ b/spark-k8s/Dockerfile @@ -244,15 +244,18 @@ RUN cp /stackable/spark-${PRODUCT}/connector/connect/server/target/spark-connect && cp /stackable/spark-${PRODUCT}/connector/connect/common/target/spark-connect-common_*-${PRODUCT}.jar . \ && cp /stackable/spark-${PRODUCT}/connector/connect/client/jvm/target/spark-connect-client-jvm_2.12-${PRODUCT}.jar . -WORKDIR /stackable/spark-${PRODUCT}/dist/extra-jars - COPY spark-k8s/stackable/jmx /stackable/jmx +WORKDIR /stackable/spark-${PRODUCT}/dist/extra-jars + RUN < Date: Fri, 28 Mar 2025 16:45:08 +0100 Subject: [PATCH 14/14] fixes --- spark-k8s/Dockerfile | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/spark-k8s/Dockerfile b/spark-k8s/Dockerfile index 703fd4ce3..84c13cd6e 100644 --- a/spark-k8s/Dockerfile +++ b/spark-k8s/Dockerfile @@ -267,9 +267,6 @@ curl --fail "https://repo.stackable.tech/repository/packages/jmx-exporter/jmx_pr -o "/stackable/jmx/jmx_prometheus_javaagent-${JMX_EXPORTER}.jar" ln -s "/stackable/jmx/jmx_prometheus_javaagent-${JMX_EXPORTER}.jar" /stackable/jmx/jmx_prometheus_javaagent.jar -# Symlink example jar, so that we can easily use it in tests -ln -s /stackable/spark-${PRODUCT}/dist/examples/jars/spark-examples_*.jar /stackable/spark-${PRODUCT}/dist/examples/jars/spark-examples.jar - chmod -R g=u /stackable/spark-${PRODUCT}/dist chmod -R g=u /stackable/spark-${PRODUCT}/assembly/target/bom.json chmod -R g=u /stackable/jmx @@ -332,6 +329,10 @@ rm -rf /var/cache/yum ln -s /usr/bin/python${PYTHON} /usr/bin/python ln -s /usr/bin/pip-${PYTHON} /usr/bin/pip + +# Symlink example jar, so that we can easily use it in tests +ln -s /stackable/spark/examples/jars/spark-examples_*.jar /stackable/spark/examples/jars/spark-examples.jar +chown -h ${STACKABLE_USER_UID}:0 /stackable/spark/examples/jars/spark-examples.jar EOF
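
As a rough illustration of what the patches above add up to, the sketch below shows how a Python client shipped in the spark-connect-client image is expected to talk to a Spark Connect server: the endpoint is passed on the command line (as introduced in "make connect app configurable"), and the spark-connect jar from the new /stackable/spark/connect folder is registered as an artifact to work around SPARK-46032. This is a hedged sketch, not part of any patch: the host name "spark-connect-server", the port 15002 and the exact jar file name are illustrative assumptions.

import sys

from pyspark.sql import SparkSession

if __name__ == "__main__":
    # Assumption: the Spark Connect endpoint is passed as the first argument,
    # e.g. "sc://spark-connect-server:15002" (15002 is the usual default port).
    remote = sys.argv[1] if len(sys.argv) > 1 else "sc://localhost:15002"

    spark = (
        SparkSession.builder.appName("ConnectClientSketch")
        .remote(remote)
        .getOrCreate()
    )

    # Workaround for https://issues.apache.org/jira/browse/SPARK-46032:
    # upload the spark-connect jar so the server can resolve the client classes.
    # The path assumes the /stackable/spark/connect layout created above.
    spark.addArtifacts("/stackable/spark/connect/spark-connect_2.12-3.5.5.jar")

    # Trivial round trip to confirm the session works end to end.
    df = spark.range(100)
    print("even numbers:", df.filter(df.id % 2 == 0).count())

    spark.stop()

Invoked, for example, as: python /stackable/spark-connect-examples/python/simple-connect-app.py sc://spark-connect-server:15002 — the actual service name and port depend on how the Spark Connect server is deployed.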