Skip to content

Commit

Permalink
[spark-rapids] Update spark rapids version to 24.04.0 (#1176)
Browse files Browse the repository at this point in the history
* 24.04 release

Signed-off-by: Suraj Aralihalli <suraj.ara16@gmail.com>

* upgrade drivers

Signed-off-by: Suraj Aralihalli <suraj.ara16@gmail.com>

* add update

Signed-off-by: Suraj Aralihalli <suraj.ara16@gmail.com>

* fix update

Signed-off-by: Suraj Aralihalli <suraj.ara16@gmail.com>

* remove allow release info change

Signed-off-by: Suraj Aralihalli <suraj.ara16@gmail.com>

* update tests to unblock CI

Signed-off-by: Suraj Aralihalli <suraj.ara16@gmail.com>

---------

Signed-off-by: Suraj Aralihalli <suraj.ara16@gmail.com>
  • Loading branch information
SurajAralihalli committed May 22, 2024
1 parent b71dd5e commit ddc335b
Show file tree
Hide file tree
Showing 2 changed files with 39 additions and 31 deletions.
26 changes: 8 additions & 18 deletions spark-rapids/spark-rapids.sh
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ else
fi

# Update SPARK RAPIDS config
readonly DEFAULT_SPARK_RAPIDS_VERSION="24.02.0"
readonly DEFAULT_SPARK_RAPIDS_VERSION="24.04.0"
readonly SPARK_RAPIDS_VERSION=$(get_metadata_attribute 'spark-rapids-version' ${DEFAULT_SPARK_RAPIDS_VERSION})
readonly XGBOOST_VERSION=$(get_metadata_attribute 'xgboost-version' ${DEFAULT_XGBOOST_VERSION})

Expand All @@ -51,8 +51,8 @@ readonly MASTER=$(/usr/share/google/get_metadata_value attributes/dataproc-maste
readonly RUNTIME=$(get_metadata_attribute 'rapids-runtime' 'SPARK')

# CUDA version and Driver version config
CUDA_VERSION=$(get_metadata_attribute 'cuda-version' '12.2.2') #12.2.2
NVIDIA_DRIVER_VERSION=$(get_metadata_attribute 'driver-version' '535.104.05') #535.104.05
CUDA_VERSION=$(get_metadata_attribute 'cuda-version' '12.4.1') #12.4.1
NVIDIA_DRIVER_VERSION=$(get_metadata_attribute 'driver-version' '550.54.15') #550.54.15
CUDA_VERSION_MAJOR="${CUDA_VERSION%.*}" #12.4

# EXCEPTIONS
Expand All @@ -66,15 +66,6 @@ if [[ "${OS_NAME}" == "ubuntu" ]]; then
CUDA_VERSION_MAJOR="${CUDA_VERSION%.*}" #12.1
fi
fi
# Change CUDA version for Debian 12 (Cuda 12.3.2 - Driver v545.23.08 is the latest version supported by Debian 12)
if [[ "${OS_NAME}" == "debian" ]]; then
DEBIAN_VERSION=$(lsb_release -r | awk '{print $2}') # 12
if [[ "${DEBIAN_VERSION}" == "12" ]]; then
CUDA_VERSION=$(get_metadata_attribute 'cuda-version' '12.3.2') #12.3.2
NVIDIA_DRIVER_VERSION=$(get_metadata_attribute 'driver-version' '545.23.08') #545.23.08
CUDA_VERSION_MAJOR="${CUDA_VERSION%.*}" #12.3
fi
fi

# Verify Secure boot
SECURE_BOOT="disabled"
Expand Down Expand Up @@ -209,11 +200,7 @@ function install_nvidia_gpu_driver() {
apt install -y ca-certificates-java
fi

## EXCEPTION
if [[ ${DEBIAN_VERSION} == 12 ]]; then
execute_with_retries "apt-get install -y -q nvidia-kernel-open-dkms"
fi

execute_with_retries "apt-get install -y -q nvidia-kernel-open-dkms"
execute_with_retries "apt-get install -y -q --no-install-recommends cuda-drivers-${NVIDIA_DRIVER_VERSION_PREFIX}"
execute_with_retries "apt-get install -y -q --no-install-recommends cuda-toolkit-${CUDA_VERSION_MAJOR//./-}"

Expand All @@ -239,9 +226,12 @@ function install_nvidia_gpu_driver() {
cp /var/cuda-repo-ubuntu${UBUNTU_VERSION}04-${CUDA_VERSION_MAJOR//./-}-local/cuda-*-keyring.gpg /usr/share/keyrings/
execute_with_retries "apt-get update"

execute_with_retries "apt-get install -y -q --no-install-recommends nvidia-driver-${NVIDIA_DRIVER_VERSION_PREFIX}-open"
execute_with_retries "apt-get install -y -q --no-install-recommends cuda-drivers-${NVIDIA_DRIVER_VERSION_PREFIX}"
execute_with_retries "apt-get install -y -q --no-install-recommends cuda-toolkit-${CUDA_VERSION_MAJOR//./-}"

modprobe nvidia

# enable a systemd service that updates kernel headers after reboot
setup_systemd_update_headers

Expand Down Expand Up @@ -428,7 +418,7 @@ function setup_gpu_yarn() {

if [[ ${OS_NAME} == debian ]] || [[ ${OS_NAME} == ubuntu ]]; then
export DEBIAN_FRONTEND=noninteractive
execute_with_retries "apt-get update"
execute_with_retries "apt-get --allow-releaseinfo-change update"
execute_with_retries "apt-get install -y -q pciutils"
elif [[ ${OS_NAME} == rocky ]] ; then
execute_with_retries "dnf -y -q install pciutils"
Expand Down
44 changes: 31 additions & 13 deletions spark-rapids/test_spark_rapids.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,14 +58,20 @@ def verify_spark_job_sql(self):
("STANDARD", ["w-0"], GPU_T4))
def test_spark_rapids(self, configuration, machine_suffixes, accelerator):

if self.getImageOs() == "rocky":
self.skipTest("Not supported for Rocky OS")

if self.getImageVersion() < pkg_resources.parse_version("2.0"):
self.skipTest("Not supported in pre 2.0 images")

if self.getImageVersion() == pkg_resources.parse_version("2.1") or self.getImageOs() == "rocky":
self.skipTest("Not supported in image2.1 or rocky images")
if self.getImageVersion() == pkg_resources.parse_version("2.0") and self.getImageOs() == "ubuntu":
self.skipTest("Not supported in image 2.0 for ubuntu")

if self.getImageVersion() == pkg_resources.parse_version("2.1"):
self.skipTest("Not supported in image 2.1 because of secure boot")

if self.getImageVersion() == pkg_resources.parse_version("2.2") and self.getImageOs() == "debian":
self.skipTest("The Debian version (12) for Dataproc 2.2 is not supported")
if self.getImageVersion() == pkg_resources.parse_version("2.2"):
self.skipTest("Not supported in image 2.2 because of secure boot")

optional_components = None
metadata = "gpu-driver-provider=NVIDIA,rapids-runtime=SPARK"
Expand All @@ -91,14 +97,20 @@ def test_spark_rapids(self, configuration, machine_suffixes, accelerator):
("STANDARD", ["w-0"], GPU_T4))
def test_spark_rapids_sql(self, configuration, machine_suffixes, accelerator):

if self.getImageOs() == "rocky":
self.skipTest("Not supported for Rocky OS")

if self.getImageVersion() < pkg_resources.parse_version("2.0"):
self.skipTest("Not supported in pre 2.0 images")

if self.getImageVersion() == pkg_resources.parse_version("2.1") or self.getImageOs() == "rocky":
self.skipTest("Not supported in image2.1 or rocky images")
if self.getImageVersion() == pkg_resources.parse_version("2.0") and self.getImageOs() == "ubuntu":
self.skipTest("Not supported in image 2.0 for ubuntu")

if self.getImageVersion() == pkg_resources.parse_version("2.2") and self.getImageOs() == "debian":
self.skipTest("The Debian version (12) for Dataproc 2.2 is not supported")
if self.getImageVersion() == pkg_resources.parse_version("2.1"):
self.skipTest("Not supported in image 2.1 because of secure boot")

if self.getImageVersion() == pkg_resources.parse_version("2.2"):
self.skipTest("Not supported in image 2.2 because of secure boot")

optional_components = None
metadata = "gpu-driver-provider=NVIDIA,rapids-runtime=SPARK"
Expand All @@ -120,18 +132,24 @@ def test_spark_rapids_sql(self, configuration, machine_suffixes, accelerator):
# Only need to do this once
self.verify_spark_job_sql()

@parameterized.parameters(("STANDARD", ["w-0"], GPU_T4, "12.1.1", "530.30.02"))
@parameterized.parameters(("STANDARD", ["w-0"], GPU_T4, "12.4.0", "550.54.14"))
def test_non_default_cuda_versions(self, configuration, machine_suffixes,
accelerator, cuda_version, driver_version):

if self.getImageOs() == "rocky":
self.skipTest("Not supported for Rocky OS")

if self.getImageVersion() < pkg_resources.parse_version("2.0"):
self.skipTest("Not supported in pre 2.0 images")

if self.getImageVersion() == pkg_resources.parse_version("2.1") or self.getImageOs() == "rocky":
self.skipTest("Not supported in image2.1 or rocky images")
if self.getImageVersion() == pkg_resources.parse_version("2.0") and self.getImageOs() == "ubuntu":
self.skipTest("Not supported in image 2.0 for ubuntu")

if self.getImageVersion() == pkg_resources.parse_version("2.1"):
self.skipTest("Not supported in image 2.1 because of secure boot")

if self.getImageVersion() == pkg_resources.parse_version("2.2") and self.getImageOs() == "debian":
self.skipTest("The Debian version (12) for Dataproc 2.2 is not supported")
if self.getImageVersion() == pkg_resources.parse_version("2.2"):
self.skipTest("Not supported in image 2.2 because of secure boot")

metadata = ("gpu-driver-provider=NVIDIA,rapids-runtime=SPARK"
",cuda-version={0},driver-version={1}".format(cuda_version, driver_version))
Expand Down

0 comments on commit ddc335b

Please sign in to comment.