initialization-actions/mlvm/mlvm.sh at master · GoogleCloudDataproc/initialization-actions · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
#!/bin/bash

# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# This initialization action will download a set of frequently-used Machine Learning
# libraries onto a Dataproc cluster, as well as GPU support and connectors for
# Google Cloud Storage, BigQuery and Spark-BigQuery.

set -euxo pipefail

readonly SPARK_JARS_DIR=/usr/lib/spark/jars
readonly CONNECTORS_DIR=/usr/local/share/google/dataproc/lib

readonly DEFAULT_INIT_ACTIONS_REPO=gs://dataproc-initialization-actions
readonly INIT_ACTIONS_REPO="$(/usr/share/google/get_metadata_value attributes/init-actions-repo ||
  echo ${DEFAULT_INIT_ACTIONS_REPO})"
readonly INIT_ACTIONS_DIR=$(mktemp -d -t dataproc-init-actions-XXXX)

readonly RAPIDS_RUNTIME="$(/usr/share/google/get_metadata_value attributes/rapids-runtime || echo "")"
readonly INCLUDE_GPUS="$(/usr/share/google/get_metadata_value attributes/include-gpus || echo "")"
readonly SPARK_BIGQUERY_VERSION="$(/usr/share/google/get_metadata_value attributes/spark-bigquery-connector-version ||
  echo "0.22.0")"

R_VERSION="$(R --version | sed -n 's/.*version[[:blank:]]\+\([0-9]\+\.[0-9]\).*/\1/p')"
readonly R_VERSION
readonly SPARK_NLP_VERSION="3.2.1" # Must include subminor version here

CONDA_PACKAGES=(
  "r-dplyr=1.0"
  "r-essentials=${R_VERSION}"
  "r-sparklyr=1.7"
  "scikit-learn=0.24"
  "xgboost=1.4"
)

# rapids-xgboost (part of the RAPIDS library) requires a custom build of
# xgboost that is incompatible with r-xgboost. As such, r-xgboost is not
# installed into the MLVM if RAPIDS support is desired.
if [[ -z ${RAPIDS_RUNTIME} ]]; then
  CONDA_PACKAGES+=("r-xgboost=1.4")
fi

PIP_PACKAGES=(
  "mxnet==1.8.*"
  "rpy2==3.4.*"
  "spark-nlp==${SPARK_NLP_VERSION}"
  "sparksql-magic==0.0.*"
  "tensorflow-datasets==4.4.*"
  "tensorflow-hub==0.12.*"
)

PIP_PACKAGES+=(
  "spark-tensorflow-distributor==1.0.0"
  "tensorflow==2.6.*"
  "tensorflow-estimator==2.6.*"
  "tensorflow-io==0.20"
  "tensorflow-probability==0.13.*"
)

readonly CONDA_PACKAGES
readonly PIP_PACKAGES

mkdir -p ${SPARK_JARS_DIR} ${CONNECTORS_DIR}

function execute_with_retries() {
  local -r cmd=$1
  for ((i = 0; i < 10; i++)); do
    if eval "$cmd"; then
      return 0
    fi
    sleep 5
  done
  echo "Cmd '${cmd}' failed."
  return 1
}

function download_spark_jar() {
  local -r url=$1
  local -r jar_name=${url##*/}
  curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \
    "${url}" -o "${SPARK_JARS_DIR}/${jar_name}"
}

function download_init_actions() {
  # Download initialization actions locally.
  mkdir "${INIT_ACTIONS_DIR}"/{gpu,rapids,dask}

  gsutil -m rsync -r "${INIT_ACTIONS_REPO}/rapids/" "${INIT_ACTIONS_DIR}/rapids/"
  gsutil -m rsync -r "${INIT_ACTIONS_REPO}/gpu/" "${INIT_ACTIONS_DIR}/gpu/"
  gsutil -m rsync -r "${INIT_ACTIONS_REPO}/dask/" "${INIT_ACTIONS_DIR}/dask/"

  find "${INIT_ACTIONS_DIR}" -name '*.sh' -exec chmod +x {} \;
}

function install_gpu_drivers() {
  "${INIT_ACTIONS_DIR}/gpu/install_gpu_driver.sh"
}

function install_conda_packages() {
  local base
  base=$(conda info --base)
  local -r mamba_env_name=mamba
  local -r mamba_env=${base}/envs/mamba
  local -r extra_packages="$(/usr/share/google/get_metadata_value attributes/CONDA_PACKAGES || echo "")"
  local -r extra_channels="$(/usr/share/google/get_metadata_value attributes/CONDA_CHANNELS || echo "")"

  conda config --add channels pytorch
  conda config --add channels conda-forge

  conda install pytorch==1.9.0 torchvision==0.10.0 torchaudio==0.9.0 -c pytorch -c conda-forge

  # Create a separate environment with mamba.
  # Mamba provides significant decreases in installation times.
  conda create -y -n ${mamba_env_name} mamba

  execute_with_retries "${mamba_env}/bin/mamba install -y ${CONDA_PACKAGES[*]} -p ${base}"

  if [[ -n "${extra_channels}" ]]; then
    for channel in ${extra_channels}; do
      "${mamba_env}/bin/conda" config --add channels "${channel}"
    done
  fi

  if [[ -n "${extra_packages}" ]]; then
    execute_with_retries "${mamba_env}/bin/mamba install -y ${extra_packages[*]} -p ${base}"
  fi

  # Clean up environment
  "${mamba_env}/bin/mamba" clean -y --all

  # Remove mamba env when done
  conda env remove -n ${mamba_env_name}
}

function install_pip_packages() {
  local -r extra_packages="$(/usr/share/google/get_metadata_value attributes/PIP_PACKAGES || echo "")"

  execute_with_retries "pip install ${PIP_PACKAGES[*]}"

  if [[ -n "${extra_packages}" ]]; then
    execute_with_retries "pip install ${extra_packages[*]}"
  fi
}

function install_dask() {
  "${INIT_ACTIONS_DIR}/dask/dask.sh"
}

function install_spark_nlp() {
  local -r name="spark-nlp_2.12"
  local -r repo_url="https://repo1.maven.org/maven2/com/johnsnowlabs/nlp"
  download_spark_jar "${repo_url}/${name}/${SPARK_NLP_VERSION}/${name}-${SPARK_NLP_VERSION}.jar"
}

function install_connectors() {
  local -r url="gs://spark-lib/bigquery/spark-bigquery-with-dependencies_2.12-${SPARK_BIGQUERY_VERSION}.jar"

  gsutil cp "${url}" "${CONNECTORS_DIR}/"

  local -r jar_name=${url##*/}

  # Update or create version-less connector link
  ln -s -f "${CONNECTORS_DIR}/${jar_name}" "${CONNECTORS_DIR}/spark-bigquery-connector.jar"
}

function install_rapids() {
  # Only install RAPIDS if "rapids-runtime" metadata exists and GPUs requested.
  if [[ -n ${RAPIDS_RUNTIME} ]]; then
    if [[ -n ${INCLUDE_GPUS} ]]; then
      "${INIT_ACTIONS_DIR}/rapids/rapids.sh"
    else
      echo "RAPIDS runtime declared but GPUs not included. Exiting."
      return 1
    fi
  fi
}

function main() {
  # Download initialization actions
  echo "Downloading initialization actions"
  download_init_actions

  # Install GPU Drivers
  echo "Installing GPU drivers"
  install_gpu_drivers

  # Install Dask
  install_dask

  # Install Spark Libraries
  echo "Installing Spark-NLP jars"
  install_spark_nlp

  # Install GCP Connectors
  echo "Installing GCP Connectors"
  install_connectors

  # Install RAPIDS
  echo "Installing rapids"
  install_rapids

  # Install Conda packages
  echo "Installing Conda packages"
  install_conda_packages

  # Install Pip packages
  echo "Installing Pip Packages"
  install_pip_packages
}

main