-
Notifications
You must be signed in to change notification settings - Fork 514
/
zeppelin.sh
executable file
·155 lines (127 loc) · 5.05 KB
/
zeppelin.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
#!/bin/bash
# Copyright 2015 Google, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This init script installs Apache Zeppelin on the master node of a Cloud
# Dataproc cluster. Zeppelin is also configured based on the size of your
# cluster and the versions of Spark/Hadoop which are installed.
# Fail fast and loud: -e exit on error, -u unset vars are errors,
# -x trace commands (init-action logs), pipefail for pipelines.
set -euxo pipefail
# Detect dataproc image version from its various names
# (older images export DATAPROC_VERSION, newer ones DATAPROC_IMAGE_VERSION).
if (! test -v DATAPROC_IMAGE_VERSION) && test -v DATAPROC_VERSION; then
  DATAPROC_IMAGE_VERSION="${DATAPROC_VERSION}"
fi
readonly NOT_SUPPORTED_MESSAGE="Zeppelin initialization action is not supported on Dataproc ${DATAPROC_IMAGE_VERSION}.
Use Zeppelin Optional Component instead: https://cloud.google.com/dataproc/docs/concepts/components/zeppelin"
# Only 1.x images are supported; bail out otherwise. (The failing [[ ]] is not
# the last command of the &&-list, so set -e does not fire on 1.x images.)
[[ "${DATAPROC_IMAGE_VERSION}" != 1.* ]] && echo "${NOT_SUPPORTED_MESSAGE}" && exit 1
# Node role ('Master' or 'Worker') from the instance metadata server.
readonly ROLE="$(/usr/share/google/get_metadata_value attributes/dataproc-role)"
readonly INTERPRETER_FILE='/etc/zeppelin/conf/interpreter.json'
readonly INIT_SCRIPT='/usr/lib/systemd/system/zeppelin-notebook.service'
function retry_apt_command() {
  # Run an apt command string, retrying up to 10 times with a 5s pause
  # to ride out transient apt/dpkg lock contention or repo flakiness.
  # Args:    $1 - full command line (evaluated with eval).
  # Returns: 0 on first success, 1 after all retries fail.
  local cmd="$1"
  local i
  for ((i = 0; i < 10; i++)); do
    if eval "$cmd"; then
      return 0
    fi
    sleep 5
  done
  return 1
}
function update_apt_get() {
  # Refresh the package index, retrying on transient apt failures.
  retry_apt_command 'apt-get update'
}
function install_apt_get() {
  # Install the given packages non-interactively, with retries.
  # Args:    $@ - package names.
  # Returns: 0 on success, 1 if all retries fail.
  # Note: "$*" joins the package names into one string for the eval'd
  # command line ("$@" assigned to a scalar is a ShellCheck SC2124 bug).
  local pkgs="$*"
  retry_apt_command "apt-get install -y ${pkgs}"
}
function err() {
  # Log a timestamped error message to stderr.
  # Args:    $* - message text (joined with spaces; "$@" inside a quoted
  #               string is SC2145 and expands inconsistently).
  # Returns: 1, so callers can chain with `|| err ...` or abort under set -e.
  echo "[$(date +'%Y-%m-%dT%H:%M:%S%z')]: $*" >&2
  return 1
}
function install_zeppelin() {
  # Install zeppelin from the distro backports repo. If the install fails
  # but the package is already present (dpkg -l succeeds), carry on.
  # NOTE(review): the original checked `$? != 0` *after* the `||` fallback,
  # which could never be non-zero under set -e — the error branch was dead.
  if ! retry_apt_command "apt-get install -t $(lsb_release -sc)-backports -y zeppelin" &&
    ! dpkg -l zeppelin; then
    err 'Failed to install zeppelin'
  fi
  # If both asm-3.1.jar and asm-5.0.4.jar are found in /usr/lib/zeppelin/lib for
  # Zeppelin 0.7, delete asm-5.0.4.jar. This is a temporary workaround before
  # we figure out the root cause of asm conflict.
  local zeppelin_version
  # Split declaration from assignment so a dpkg-query failure is not masked.
  zeppelin_version="$(dpkg-query --showformat='${Version}' --show zeppelin)"
  local asm_3_1=/usr/lib/zeppelin/lib/asm-3.1.jar
  local asm_5_0_4=/usr/lib/zeppelin/lib/asm-5.0.4.jar
  if [[ "${zeppelin_version}" == "0.7."* && -f "${asm_3_1}" && -f "${asm_5_0_4}" ]]; then
    rm "${asm_5_0_4}"
  fi
}
function configure_zeppelin() {
  # Configure the installed Zeppelin: patch interpreter.json (pre-0.8 only),
  # link Hive config, honor an optional custom port, and install the Python/R
  # libraries that Zeppelin notebooks commonly need.
  local zeppelin_version
  zeppelin_version="$(dpkg-query --showformat='${Version}' --show zeppelin)"
  # Only use sed to modify interpreter.json prior to Zeppelin 0.8.0.
  # The file format has changed in 0.8.0.
  # TODO(karthikpal): Evaluate which of these (if any) are necessary >= 0.8.0
  if dpkg --compare-versions "${zeppelin_version}" '<' 0.8.0; then
    # Wait up to 60s for ${INTERPRETER_FILE} to be available
    # (Zeppelin writes it on first startup).
    for i in {1..6}; do
      if [[ -r "${INTERPRETER_FILE}" ]]; then
        break
      else
        sleep 10
      fi
    done
    if [[ ! -r "${INTERPRETER_FILE}" ]]; then
      # err returns 1, which aborts the script under set -e.
      err "${INTERPRETER_FILE} is missing"
    fi
    # stop service, systemd will be configured
    service zeppelin stop
    # Set spark.yarn.isPython to fix Zeppelin pyspark in Dataproc 1.0.
    # Inserts the key right after "spark.app.name", preserving indentation.
    sed -i 's/\(\s*\)"spark\.app\.name[^,}]*/&,\n\1"spark.yarn.isPython": "true"/' \
      "${INTERPRETER_FILE}"
    # Unset Spark Executor memory to let the spark-defaults.conf set it.
    sed -i '/spark\.executor\.memory/d' "${INTERPRETER_FILE}"
    # Set BigQuery project ID if present.
    local project_id
    # '../project/project-id' walks up from the instance metadata path
    # to the project-level metadata.
    project_id="$(/usr/share/google/get_metadata_value ../project/project-id)"
    sed -i "s/\(\"zeppelin.bigquery.project_id\"\)[^,}]*/\1: \"${project_id}\"/" \
      "${INTERPRETER_FILE}"
  fi
  # Link in hive configuration.
  ln -s /etc/hive/conf/hive-site.xml /etc/zeppelin/conf
  # Optional custom web UI port via cluster metadata; '|| true' keeps
  # set -e from aborting when the attribute is absent.
  local zeppelin_port
  zeppelin_port="$(/usr/share/google/get_metadata_value attributes/zeppelin-port || true)"
  if [[ -n "${zeppelin_port}" ]]; then
    echo "export ZEPPELIN_PORT=${zeppelin_port}" \
      >>/etc/zeppelin/conf/zeppelin-env.sh
  fi
  # Install matplotlib. Note that this will work in Zeppelin, but not
  # in a vanilla Python interpreter, as that requires an X server to be running
  install_apt_get python-dev python-tk
  if ! command -v pip >/dev/null; then
    install_apt_get python-pip
  fi
  pip install --upgrade matplotlib
  # Install R libraries for Zeppelin 0.6.1+
  if dpkg --compare-versions "${zeppelin_version}" '>=' 0.6.1; then
    # TODO(pmkc): Add googlevis to Zeppelin Package recommendations
    install_apt_get r-cran-googlevis
    # TODO(Aniszewski): Fix SparkR (GitHub issue #198)
  fi
}
function main() {
  # All work happens on the master node; workers need no changes.
  if [[ "${ROLE}" != 'Master' ]]; then
    return 0
  fi
  update_apt_get || err 'Failed to update apt-get'
  install_zeppelin
  configure_zeppelin
  systemctl restart zeppelin
}
main