#!/bin/bash
# ------------------------------------------------------------------
# sudo nano install.sh
# install:
# Python3 libraries
# Jupyter notebook and lab
# Nodejs 12
# Hadoop-3.2.1
# Spark-2.4.5
# Zookeeper-3.6.3
# Kafka 2.5.0 (Scala 2.13 build)
# Apache-hive-3.1.2
# Postgresql 10
# kernels:
# Scala kernel - run on cluster, master = yarn http://master:8088
# Python3 kernel - run on cluster, spark://master:7077
# Sqlite3 kernel
# R kernel
# Julia kernel
# Bash kernel
# ------------------------------------------------------------------
hostAddress="master"
hostAddress1="worker1"
hostAddress2="worker2"
hostAddress3="worker3"
host=$(hostname)
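# The master/worker hostnames above are assumed to resolve on every node
# (e.g. via /etc/hosts); the per-host configuration further down is selected
# by comparing them against $(hostname).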
sudo add-apt-repository universe
sudo add-apt-repository multiverse
sudo apt-get update
sudo apt-get -y upgrade
sudo apt-get update
sudo apt-get -y install python python3-dev python3-pip htpdate net-tools build-essential libncursesw5-dev libgdbm-dev libc6-dev zlib1g-dev libsqlite3-dev tk-dev libssl-dev openssl libffi-dev libxpm-dev libxext-dev libbz2-dev libncurses5-dev libjpeg8 libjpeg62-dev libfreetype6 libfreetype6-dev curl dirmngr apt-transport-https lsb-release ca-certificates libcurl4-gnutls-dev libxml2-dev gcc g++ make
# openjdk-8-jdk
sudo apt -y install software-properties-common
sudo add-apt-repository ppa:linuxuprising/java -y
sudo apt-get -y install openjdk-8-jdk
sudo update-java-alternatives --list
sudo update-java-alternatives --set java-1.8.0-openjdk-arm64
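# Optional sanity check (a sketch, not required): confirm the default JVM is
# the 1.8 build that Hadoop/Spark/Hive below expect.
# java -version 2>&1 | head -n 1   # should mention 1.8.0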
# nodejs
curl -sL https://deb.nodesource.com/setup_12.x | sudo bash
sudo apt update
sudo apt -y install nodejs
# python libs
python3 -m pip install --upgrade --force-reinstall pip
pip3 install -U pip
pip3 install --upgrade pip
pip3 install numpy pandas ipython pillow jupyter jupyterlab plotly ipywidgets jupyter-dash jupyterlab-dash bokeh dash findspark notebook py4j pydoop
sudo apt -y install python3-matplotlib python3-scipy
sudo apt -y install ipython3
sudo apt -y install python3-sklearn python3-sklearn-lib
sudo apt -y install python-numpy python-scipy python-matplotlib ipython python-pandas python-sympy python-nose
sudo apt -y install libsasl2-dev gcc g++
pip3 install sasl thrift PyHive pyhs2
pip3 install thrift_sasl --user
# hadoop and spark
cd
wget https://archive.apache.org/dist/hadoop/core/hadoop-3.2.1/hadoop-3.2.1.tar.gz
wget https://archive.apache.org/dist/spark/spark-2.4.5/spark-2.4.5-bin-hadoop2.7.tgz
sudo tar -xvf hadoop-3.2.1.tar.gz -C /opt/
sudo tar -xvf spark-2.4.5-bin-hadoop2.7.tgz -C /opt/
cd /opt/
sudo mv hadoop-3.2.1 hadoop
sudo mv spark-2.4.5-bin-hadoop2.7 spark
# hive
cd
wget https://downloads.apache.org/hive/hive-3.1.2/apache-hive-3.1.2-bin.tar.gz
sudo tar -xvf apache-hive-3.1.2-bin.tar.gz
sudo mv apache-hive-3.1.2-bin hive
sudo mv hive /opt/
# zookeeper
cd
wget https://downloads.apache.org/zookeeper/zookeeper-3.6.3/apache-zookeeper-3.6.3-bin.tar.gz
tar -xzvf apache-zookeeper-3.6.3-bin.tar.gz
sudo mv apache-zookeeper-3.6.3-bin /opt/zookeeper
cd
cd /opt/
sudo chown -R pi:pi zookeeper
sudo mkdir /opt/zookeeper_data
sudo chown -R pi:pi zookeeper_data
sudo touch /opt/zookeeper/conf/zoo.cfg
sudo sh -c "echo '# see zoo_sample.cfg_old for information about parameters' >> /opt/zookeeper/conf/zoo.cfg"
sudo sh -c "echo 'tickTime=2000' >> /opt/zookeeper/conf/zoo.cfg"
sudo sh -c "echo 'dataDir=/opt/zookeeper_data' >> /opt/zookeeper/conf/zoo.cfg"
sudo sh -c "echo 'clientPort=2189' >> /opt/zookeeper/conf/zoo.cfg"
sudo sh -c "echo 'initLimit=20' >> /opt/zookeeper/conf/zoo.cfg"
sudo sh -c "echo 'syncLimit=5' >> /opt/zookeeper/conf/zoo.cfg"
sudo sh -c "echo '# this parameters are for a zookeeper cluster (assemble)' >> /opt/zookeeper/conf/zoo.cfg"
sudo sh -c "echo 'server.1=worker1:2888:3888' >> /opt/zookeeper/conf/zoo.cfg"
sudo sh -c "echo 'server.2=worker2:2888:3888' >> /opt/zookeeper/conf/zoo.cfg"
sudo sh -c "echo 'server.3=worker3:2888:3888' >> /opt/zookeeper/conf/zoo.cfg"
if [ "$host" = "$hostAddress1" ]; then
sudo touch /opt/zookeeper_data/myid
sudo sh -c "echo '1' >> /opt/zookeeper_data/myid"
else
printf '%s\n' "It is not the worker1 host"
fi
if [ "$host" = "$hostAddress2" ]; then
sudo touch /opt/zookeeper_data/myid
sudo sh -c "echo '2' >> /opt/zookeeper_data/myid"
else
printf '%s\n' "It is not the worker2 host"
fi
if [ "$host" = "$hostAddress3" ]; then
sudo touch /opt/zookeeper_data/myid
sudo sh -c "echo '3' >> /opt/zookeeper_data/myid"
else
printf '%s\n' "It is not the worker3 host"
fi
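# Optional sanity check once all three workers are configured (a sketch, not
# run automatically here):
# /opt/zookeeper/bin/zkServer.sh start
# /opt/zookeeper/bin/zkServer.sh status   # should report Mode: leader or follower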
# kafka
cd
wget https://archive.apache.org/dist/kafka/2.5.0/kafka_2.13-2.5.0.tgz
tar -xzvf kafka_2.13-2.5.0.tgz
sudo mv kafka_2.13-2.5.0 /opt/kafka
cd
cd /opt/
sudo chown -R pi:pi kafka
sudo mkdir /opt/kafka_data
sudo chown -R pi:pi /opt/kafka_data
sudo mv /opt/kafka/config/server.properties /opt/kafka/config/server.propertiesbak
if [ "$host" = "$hostAddress1" ]; then
cd
cd /opt/kafka/config/
sudo wget https://raw.githubusercontent.com/AndreiFAD/raspberry_pi_cluster/main/server.properties1
sudo mv server.properties1 /opt/kafka/config/server.properties
else
printf '%s\n' "It is not the worker1 host"
fi
if [ "$host" = "$hostAddress2" ]; then
cd
cd /opt/kafka/config/
sudo wget https://raw.githubusercontent.com/AndreiFAD/raspberry_pi_cluster/main/server.properties2
sudo mv server.properties2 /opt/kafka/config/server.properties
else
printf '%s\n' "It is not the worker2 host"
fi
if [ "$host" = "$hostAddress3" ]; then
cd
cd /opt/kafka/config/
sudo wget https://raw.githubusercontent.com/AndreiFAD/raspberry_pi_cluster/main/server.properties3
sudo mv server.properties3 /opt/kafka/config/server.properties
else
printf '%s\n' "It is not the worker3 host"
fi
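# Each worker downloads its own server.properties, so the per-broker settings
# differ per host. A quick check after ZooKeeper and Kafka have been started,
# assuming the brokers listen on the default port 9092 (not set explicitly here):
# /opt/kafka/bin/kafka-topics.sh --bootstrap-server worker1:9092 --list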
cd
sudo mkdir /opt/hadoop_tmp
sudo mkdir -p /opt/hadoop_tmp/hdfs/datanode
sudo mkdir -p /opt/hadoop_tmp/hdfs/namenode
sudo chown -R pi:pi /opt/spark
sudo chown -R pi:pi /opt/hive
sudo chown -R pi:pi /opt/hadoop
sudo chown -R pi:pi /opt/hadoop_tmp
sudo chown -R pi:pi /opt/kafka
sudo chown -R pi:pi /opt/kafka_data
sudo chown -R pi:pi /opt/zookeeper
sudo chown -R pi:pi /opt/zookeeper_data
sudo htpdate -a -l www.pool.ntp.org
sudo chown -R pi:pi /usr/src/
sudo chown -R pi:pi /usr/share/
sudo chown -R pi:pi /usr/local/
sudo chown -R pi:pi /home/pi/
sudo chown -R pi:pi /var/log
echo '# HADOOP - SPARK - HIVE variables' >> ~/.bashrc
echo 'export PYTHONHASHSEED=123' >> ~/.bashrc
echo 'export PYSPARK_PYTHON=/usr/bin/python3' >> ~/.bashrc
echo "export PYSPARK_DRIVER_PYTHON='juypyter'" >> ~/.bashrc
echo "export PYSPARK_DRIVER_PYTHON_OPTS='notebook --allow-root --ip 0.0.0.0 --no-browser'" >> ~/.bashrc
echo 'export JAVA_HOME=/usr/lib/jvm/java-8-openjdk-arm64' >> ~/.bashrc
echo 'export HADOOP_HOME=/opt/hadoop' >> ~/.bashrc
echo 'export PATH=$PATH:$HADOOP_HOME/bin:$HADOOP_HOME/sbin' >> ~/.bashrc
echo 'export SPARK_HOME=/opt/spark' >> ~/.bashrc
echo 'export PATH=$PATH:$SPARK_HOME/bin' >> ~/.bashrc
echo 'export HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop' >> ~/.bashrc
echo 'export LD_LIBRARY_PATH=$HADOOP_HOME/lib/native:$LD_LIBRARY_PATH' >> ~/.bashrc
echo 'export HADOOP_OPTS="-Djava.library.path=$HADOOP_HOME/lib/native"' >> ~/.bashrc
echo 'export HIVE_HOME=/opt/hive' >> ~/.bashrc
echo 'export PATH=$PATH:/opt/hive/bin' >> ~/.bashrc
echo 'export HIVE_CONF_DIR=/opt/hive/conf' >> ~/.bashrc
echo 'export PATH=$PATH:$HIVE_HOME/bin:$HADOOP_HOME/bin:$HADOOP_HOME/sbin' >> ~/.bashrc
echo 'export PATH=$PATH:~/.local/bin' >> ~/.bashrc
. ~/.bashrc
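# Hive 3.1.2 bundles guava-19.0.jar, which clashes with the guava 27 used by
# Hadoop 3.2.x; swapping in Hadoop's jar below avoids the usual
# NoSuchMethodError when Hive services start.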
sudo rm /opt/hive/lib/guava-19.0.jar
sudo cp /opt/hadoop/share/hadoop/common/lib/guava-27.0-jre.jar /opt/hive/lib/
cp /opt/hive/conf/hive-env.sh.template /opt/hive/conf/hive-env.sh
cd /opt/hive/lib/
sudo wget https://jdbc.postgresql.org/download/postgresql-42.2.24.jar
sudo mv /opt/hive/conf/hivemetastore-site.xml /opt/hive/conf/hivemetastore-site.xmlbak
sudo mv /opt/hive/conf/hiveserver2-site.xml /opt/hive/conf/hiveserver2-site.xmlbak
sudo mv /opt/hive/conf/hive-env.sh /opt/hive/conf/hive-env.shbak
cd
cd /opt/hive/conf/
sudo wget https://raw.githubusercontent.com/AndreiFAD/raspberry_pi_cluster/main/hivemetastore-site.xml
sudo wget https://raw.githubusercontent.com/AndreiFAD/raspberry_pi_cluster/main/hive-env.sh
sudo wget https://raw.githubusercontent.com/AndreiFAD/raspberry_pi_cluster/main/hiveserver2-site.xml
cd
sudo touch /opt/hadoop/etc/hadoop/workers
sudo sed -i '1d' /opt/hadoop/etc/hadoop/workers
echo 'master' >> /opt/hadoop/etc/hadoop/workers
echo 'worker1' >> /opt/hadoop/etc/hadoop/workers
echo 'worker2' >> /opt/hadoop/etc/hadoop/workers
echo 'worker3' >> /opt/hadoop/etc/hadoop/workers
sudo mv /opt/hadoop/etc/hadoop/capacity-scheduler.xml /opt/hadoop/etc/hadoop/capacity-scheduler.xmlbak
sudo mv /opt/hadoop/etc/hadoop/core-site.xml /opt/hadoop/etc/hadoop/core-site.xmlbak
sudo mv /opt/hadoop/etc/hadoop/hdfs-site.xml /opt/hadoop/etc/hadoop/hdfs-site.xmlbak
sudo mv /opt/hadoop/etc/hadoop/mapred-site.xml /opt/hadoop/etc/hadoop/mapred-site.xmlbak
sudo mv /opt/hadoop/etc/hadoop/yarn-site.xml /opt/hadoop/etc/hadoop/yarn-site.xmlbak
cd
cd /opt/hadoop/etc/hadoop/
sudo wget https://raw.githubusercontent.com/AndreiFAD/raspberry_pi_cluster/main/capacity-scheduler.xml
sudo wget https://raw.githubusercontent.com/AndreiFAD/raspberry_pi_cluster/main/core-site.xml
sudo wget https://raw.githubusercontent.com/AndreiFAD/raspberry_pi_cluster/main/hdfs-site.xml
sudo wget https://raw.githubusercontent.com/AndreiFAD/raspberry_pi_cluster/main/mapred-site.xml
sudo wget https://raw.githubusercontent.com/AndreiFAD/raspberry_pi_cluster/main/yarn-site.xml
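# The site files above come from the companion raspberry_pi_cluster repo and
# are expected to reference the same master/worker hostnames used in this script.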
cd
echo 'export JAVA_HOME=/usr/lib/jvm/java-8-openjdk-arm64' >> /opt/hadoop/etc/hadoop/hadoop-env.sh
cd
cd /opt/spark/conf/
sudo wget https://raw.githubusercontent.com/AndreiFAD/raspberry_pi_cluster/main/spark-defaults.conf
sudo cp spark-env.sh.template spark-env.sh
sudo sh -c "echo 'export SPARK_EXECUTOR_CORES=4' >> /opt/spark/conf/spark-env.sh"
sudo sh -c "echo 'export SPARK_EXECUTOR_MEMORY=2500M' >> /opt/spark/conf/spark-env.sh"
sudo sh -c "echo 'export SPARK_DRIVER_MEMORY=2500M' >> /opt/spark/conf/spark-env.sh"
sudo sh -c "echo \"export SPARK_MASTER_HOST='master'\" >> /opt/spark/conf/spark-env.sh"
sudo sh -c "echo 'export SPARK_WORKER_CORES=4' >> /opt/spark/conf/spark-env.sh"
sudo sh -c "echo 'export SPARK_WORKER_MEMORY=2500M' >> /opt/spark/conf/spark-env.sh"
sudo sh -c "echo 'export PYSPARK_PYTHON=/usr/bin/python3' >> /opt/spark/conf/spark-env.sh"
sudo sh -c "echo \"export PYSPARK_DRIVER_PYTHON='jupyter'\" >> /opt/spark/conf/spark-env.sh"
cd
sudo touch /opt/spark/conf/master
sudo sh -c "echo 'master' >> /opt/spark/conf/master"
cd
touch /opt/spark/conf/slaves
echo 'worker1' >> /opt/spark/conf/slaves
echo 'worker2' >> /opt/spark/conf/slaves
echo 'worker3' >> /opt/spark/conf/slaves
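# Note: in Spark 2.4 the standalone start scripts read conf/slaves for the
# worker list (the file was renamed to conf/workers in Spark 3.x).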
if [ "$host" = "$hostAddress" ]; then
# extra conf to jupyter
jupyter notebook -y --generate-config
cd "$HOME"
sudo mkdir -p notebooks
# spylon kernel scala
sudo apt -y install scala
pip3 install spylon-kernel pyspark
python3 -m spylon_kernel install --user
echo '{"argv": ["/usr/bin/python3", "-m", "spylon_kernel", "-f", "{connection_file}"], "display_name": "Scala", "env": {"PYTHONUNBUFFERED": "1", "SPARK_SUBMIT_OPTS": "-Dscala.usejavacp=true"}, "language": "scala", "name": "spylon-kernel"}' > /home/pi/.local/share/jupyter/kernels/spylon-kernel/kernel.json
# SQLite kernel
sudo apt-get install -y sqlite3
sudo git clone https://github.com/brownan/sqlite3-kernel.git
cd sqlite3-kernel
python3 setup.py install
python3 -m sqlite3_kernel.install --user
cd ..
sudo rm -rf sqlite3-kernel/
# Install TeX to convert Jupyter notebooks to other formats, e.g. PDF.
sudo apt install -y texlive-xetex
sudo apt install -y latexmk
# bash kernel
pip3 install bash_kernel
python3 -m bash_kernel.install --user
# R kernel
sudo apt-key adv --keyserver keyserver.ubuntu.com --recv-keys E298A3A825C0D65DFD57CBB651716619E084DAB9
sudo add-apt-repository 'deb https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/'
sudo add-apt-repository 'deb https://cloud.r-project.org/bin/linux/ubuntu xenial-cran40/'
sudo add-apt-repository 'deb https://cloud.r-project.org/bin/linux/ubuntu trusty/'
sudo apt update
sudo apt-get install -y gfortran libreadline6-dev libx11-dev libxt-dev \
libpng-dev libjpeg-dev libcairo2-dev xvfb \
libbz2-dev libzstd-dev liblzma-dev \
libcurl4-openssl-dev \
texinfo texlive texlive-fonts-extra \
screen wget libpcre2-dev build-essential libatomic1 gfortran perl wget m4 \
cmake pkg-config libopenblas-base libopenblas-dev libatlas3-base \
liblapack-dev libmpfr-dev libgmp3-dev gfortran
sudo apt-get install -y build-essential
sudo apt-get install -y fort77
sudo apt-get install -y xorg-dev
sudo apt-get install -y liblzma-dev libblas-dev gfortran
sudo apt-get install -y gcc-multilib
sudo apt-get install -y gobjc++
sudo apt-get install -y aptitude
sudo aptitude install -y libreadline-dev
sudo sed -i "s/# deb-src/deb-src/g" /etc/apt/sources.list
sudo apt-get update
sudo apt-get -y build-dep r-base-dev
cd /usr/local/src
sudo wget -c https://cran.r-project.org/src/base/R-4/R-4.1.0.tar.gz
sudo tar -xf R-4.1.0.tar.gz
cd R-4.1.0
sudo ./configure
sudo make -j9
sudo make install
cd ..
sudo rm -rf R-4.1.0*
cd
Rscript -e 'install.packages("IRkernel", repos="https://cloud.r-project.org")'
Rscript -e 'IRkernel::installspec(displayname = "R 4.1.0")'
echo '{ "argv": ["/usr/local/lib/R/bin/R", "--slave", "-e", "IRkernel::main()", "--args", "{connection_file}"], "display_name": "R 4.1.0", "language": "R"}' > /home/pi/.local/share/jupyter/kernels/ir/kernel.json
Rscript -e 'install.packages("dplyr", repo = "https://lib.ugent.be/CRAN/")'
Rscript -e 'install.packages("ggplot2", repo = "https://lib.ugent.be/CRAN/")'
Rscript -e 'install.packages("tidyr", repo = "https://lib.ugent.be/CRAN/")'
Rscript -e 'install.packages("shiny", repo = "https://lib.ugent.be/CRAN/")'
Rscript -e 'install.packages("caret", repo = "https://lib.ugent.be/CRAN/")'
Rscript -e 'install.packages("e1071", repo = "https://lib.ugent.be/CRAN/")'
Rscript -e 'install.packages("plotly", repo = "https://lib.ugent.be/CRAN/")'
Rscript -e 'install.packages("tidyquant", repo = "https://lib.ugent.be/CRAN/")'
Rscript -e 'install.packages("repr", repo = "https://lib.ugent.be/CRAN/")'
Rscript -e 'install.packages("evaluate", repo = "https://lib.ugent.be/CRAN/")'
Rscript -e 'install.packages("crayon", repo = "https://lib.ugent.be/CRAN/")'
Rscript -e 'install.packages("pbdZMQ", repo = "https://lib.ugent.be/CRAN/")'
Rscript -e 'install.packages("devtools", repo = "https://lib.ugent.be/CRAN/")'
Rscript -e 'install.packages("uuid", repo = "https://lib.ugent.be/CRAN/")'
Rscript -e 'install.packages("digest", repo = "https://lib.ugent.be/CRAN/")'
Rscript -e 'install.packages("stringi", repo = "https://lib.ugent.be/CRAN/")'
# julia kernel
cd /usr/local/src
sudo git clone https://github.com/JuliaLang/julia.git
cd julia
sudo git checkout v1.5.3
sudo make install
sudo ln -s /usr/local/src/julia/julia /usr/bin/julia
julia -e 'using Pkg; Pkg.add("IJulia");'
julia -e 'using IJulia;'
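# Optional: list the kernelspecs registered so far. The exact Julia entry name
# depends on the IJulia build, but spylon-kernel, sqlite3, bash and ir should
# all appear.
# jupyter kernelspec list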
# Jupyter nbextensions
jupyter lab clean
jupyter nbextension enable --py widgetsnbextension
jupyter labextension install jupyterlab-dash --no-build --minimize=False
jupyter labextension install @jupyter-widgets/jupyterlab-manager --no-build --minimize=False
jupyter labextension install bqplot --no-build --minimize=False
jupyter labextension install jupyter-leaflet --no-build --minimize=False
jupyter lab build --dev-build=False --minimize=False
printf '%s\n' "on the master host"
sudo apt update
sudo apt install -y postgresql postgresql-contrib
psql -V
sudo sed -i -e 's/^/#/' /etc/postgresql/10/main/pg_hba.conf
{
echo 'local all postgres trust'
echo 'local all all md5'
echo 'host all all 127.0.0.1/32 md5'
echo 'host all all ::1/128 md5'
echo 'local replication all peer'
echo 'host replication all 127.0.0.1/32 md5'
echo 'host replication all ::1/128 md5'
echo 'host all all 0.0.0.0/0 trust'
} | sudo tee -a /etc/postgresql/10/main/pg_hba.conf > /dev/null
sudo sh -c "echo \"listen_addresses = '*'\" >> /etc/postgresql/10/main/postgresql.conf"
sudo service postgresql restart
sudo echo "CREATE USER hive WITH PASSWORD 'hive';" | psql -U postgres
sudo echo "CREATE DATABASE metastore;" | psql -U postgres
sudo echo "GRANT ALL PRIVILEGES ON DATABASE metastore TO hive;" | psql -U postgres
else
printf '%s\n' "It is not the master host"
fi