# - Required env variables: SPARK_HOME, HADOOP_VERSION, SPARK_DOWNLOAD_URL
# - Optional env variables: SPARK_VERSION, SCALA_VERSION

+import logging
import os
import subprocess
from pathlib import Path

import requests
from bs4 import BeautifulSoup

+LOGGER = logging.getLogger(__name__)
+

def get_all_refs(url: str) -> list[str]:
    """
@@ -31,16 +34,19 @@ def get_spark_version() -> str:
    """
    if (version := os.environ["SPARK_VERSION"]) != "":
        return version
+    LOGGER.info("Downloading Spark versions information")
    all_refs = get_all_refs("https://archive.apache.org/dist/spark/")
    stable_versions = [
        ref.removeprefix("spark-").removesuffix("/")
        for ref in all_refs
        if ref.startswith("spark-") and "incubating" not in ref and "preview" not in ref
    ]
    # Compare versions semantically
-    return max(
+    latest_version = max(
        stable_versions, key=lambda ver: [int(sub_ver) for sub_ver in ver.split(".")]
    )
+    LOGGER.info(f"Latest version: {latest_version}")
+    return latest_version


def download_spark(
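
The key= in the max() call above compares versions component-wise as integers rather than as strings. A minimal sketch with made-up version numbers (not taken from this commit) shows why that matters:

# Illustrative only: plain string comparison would rank "3.9.3" above "3.10.1",
# while the int-list key compares each dotted component numerically.
versions = ["3.9.3", "3.10.1", "3.4.2"]
assert max(versions) == "3.9.3"
assert max(versions, key=lambda ver: [int(sub_ver) for sub_ver in ver.split(".")]) == "3.10.1"
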
@@ -53,9 +59,11 @@ def download_spark(
    Downloads and unpacks spark
    The resulting spark directory name is returned
    """
+    LOGGER.info("Downloading and unpacking Spark")
    spark_dir_name = f"spark-{spark_version}-bin-hadoop{hadoop_version}"
    if scala_version:
        spark_dir_name += f"-scala{scala_version}"
+    LOGGER.info(f"Spark directory name: {spark_dir_name}")
    spark_url = spark_download_url / f"spark-{spark_version}" / f"{spark_dir_name}.tgz"

    tmp_file = Path("/tmp/spark.tar.gz")
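
To make the naming above concrete, here is a sketch with assumed values (3.5.0, Hadoop 3 and Scala 2.13 are illustrative, not taken from this commit):

# Hypothetical values, shown only to illustrate how the names compose
spark_version, hadoop_version, scala_version = "3.5.0", "3", "2.13"
spark_dir_name = f"spark-{spark_version}-bin-hadoop{hadoop_version}"
if scala_version:
    spark_dir_name += f"-scala{scala_version}"
# spark_dir_name == "spark-3.5.0-bin-hadoop3-scala2.13"
# and the tarball is fetched from <SPARK_DOWNLOAD_URL>/spark-3.5.0/spark-3.5.0-bin-hadoop3-scala2.13.tgz
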
@@ -80,11 +88,12 @@ def download_spark(
    return spark_dir_name


-def prepare_spark(spark_dir_name: str, spark_home: Path) -> None:
+def configure_spark(spark_dir_name: str, spark_home: Path) -> None:
    """
    Creates a ${SPARK_HOME} symlink to a versioned spark directory
    Creates a 10spark-config.sh symlink to source PYTHONPATH automatically
    """
+    LOGGER.info("Configuring Spark")
    subprocess.check_call(["ln", "-s", f"/usr/local/{spark_dir_name}", spark_home])

    # Add a link in the before_notebook hook in order to source PYTHONPATH automatically
@@ -95,13 +104,15 @@ def prepare_spark(spark_dir_name: str, spark_home: Path) -> None:


if __name__ == "__main__":
+    logging.basicConfig(level=logging.INFO)
+
    spark_version = get_spark_version()
    spark_dir_name = download_spark(
        spark_version=spark_version,
        hadoop_version=os.environ["HADOOP_VERSION"],
        scala_version=os.environ["SCALA_VERSION"],
        spark_download_url=Path(os.environ["SPARK_DOWNLOAD_URL"]),
    )
-    prepare_spark(
+    configure_spark(
        spark_dir_name=spark_dir_name, spark_home=Path(os.environ["SPARK_HOME"])
    )
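
For reference, the entry point reads all of its configuration from the environment variables listed at the top of the file. A hedged sketch of what that environment might look like (every value below is an assumption, not part of the commit):

# Illustrative environment; values are assumptions, not from the commit
import os

os.environ["SPARK_HOME"] = "/usr/local/spark"          # symlink created by configure_spark
os.environ["HADOOP_VERSION"] = "3"                      # selects the -bin-hadoop3 build
os.environ["SPARK_DOWNLOAD_URL"] = "https://archive.apache.org/dist/spark"
os.environ["SPARK_VERSION"] = ""                        # empty -> latest stable version is resolved
os.environ["SCALA_VERSION"] = ""                        # empty -> no -scala suffix in the directory name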