The document provides instructions for setting up a Hadoop 3.1.1 and Spark 2.4.0 cluster on Ubuntu 18.04 using one master node and two slave nodes. It includes steps to configure static IP addresses, install Java, download and extract Hadoop and Spark, configure environment variables, copy configuration files to the master node, start Hadoop and Spark services, and verify the cluster is functioning properly. It also provides instructions for cloning the virtual machines to the slave nodes, formatting HDFS, and creating a simple Spark application to test the setup.


# Setup Hadoop-3.1.1 & Spark-2.4.0 cluster
# using Ubuntu 18.04

#1 declare the cluster nodes in /etc/hosts (same entries on every node)
sudo vi /etc/hosts
172.20.10.4 server   # the master node
172.20.10.5 slave1
172.20.10.6 slave2

#2 sudo vi /etc/netplan/50-cloud-init.yaml
# change to a static IP (note: netplan only reads files ending in .yaml)

network:
  version: 2
  ethernets:
    enp0s3:
      dhcp4: no
      addresses: [172.20.10.4/24]
      gateway4: 172.20.10.1
      nameservers:
        addresses: [8.8.8.8,8.8.4.4]

sudo netplan apply
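
# optional check (not in the original steps): confirm the static address is active
# and the gateway answers
ip addr show enp0s3
ping -c 1 172.20.10.1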


#3 change the hostname (edit /etc/hostname or use hostnamectl)
sudo hostnamectl set-hostname master
hostname
# make sure the hadoop user (ziyati) has sudo rights
sudo usermod -aG sudo ziyati

#4 connect to the master and install openssh-server

sudo apt-get remove --purge openssh-server
sudo apt-get install openssh-server
# if the install fails because of an openssh-client version mismatch,
# pin openssh-client to the version required by openssh-server, e.g.:
sudo apt-get install aptitude
sudo aptitude install openssh-client=1:7.6p1-4

# set up passwordless SSH (the cloned slaves will inherit this key pair)
ssh-keygen -t rsa -P ""
cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys
chmod 0600 ~/.ssh/authorized_keys
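
# if a slave is installed separately instead of being cloned later, push the public
# key to it manually (a sketch, assuming the user ziyati exists on the slaves):
ssh-copy-id ziyati@slave1
ssh-copy-id ziyati@slave2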

#5 Test secure connection


ssh ziyati@localhost
logout

#6 install java
sudo apt install openjdk-8-jdk
update-java-alternatives -l
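
# optional check that the JDK is installed and on the PATH
java -version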

# if problems occur during the install, clean the package state and retry:
sudo rm /var/lib/dpkg/updates/000*
sudo apt-get clean
sudo apt-get update
sudo apt-get install ttf-mscorefonts-installer

#7 download hadoop
curl -O http://mirror.cogentco.com/pub/apache/hadoop/common/hadoop-3.1.1/hadoop-3.1.1.tar.gz
tar -xzf hadoop-3.1.1.tar.gz
sudo mv hadoop-3.1.1 /usr/local/hadoop
mkdir -p /home/ziyati/hadoop_tmp/{data,name}
rm hadoop-3.1.1.tar.gz

#7.1 Set up hadoop environment variables.

echo export JAVA_HOME=/usr/lib/jvm/java-1.8.0-openjdk-amd64 >> ~/.bashrc


echo export PATH=\$JAVA_HOME/bin:\$PATH >> ~/.bashrc
echo export HADOOP_HOME=/usr/local/hadoop >> ~/.bashrc
echo export PATH=\$HADOOP_HOME/bin:\$HADOOP_HOME/sbin:\$PATH >> ~/.bashrc
echo export HADOOP_CONF_DIR=\$HADOOP_HOME"/etc/hadoop" >> ~/.bashrc

source ~/.bashrc
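
# optional check that the new variables are picked up and the hadoop binary resolves
echo $HADOOP_HOME
hadoop version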

#7.2 Copy the prepared configuration files for the master node into the Hadoop
# config directory (a sketch of what these files typically contain follows below)

cp master/* /usr/local/hadoop/etc/hadoop/
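
# The contents of the master/ directory are not shown in this document. Below is a
# minimal sketch (an assumption, not the author's exact files) of what these Hadoop
# configuration files usually contain, reusing the IP addresses, hostnames and
# hadoop_tmp directories set up above.

# etc/hadoop/core-site.xml
<configuration>
  <property>
    <name>fs.defaultFS</name>
    <value>hdfs://172.20.10.4:9000</value>
  </property>
</configuration>

# etc/hadoop/hdfs-site.xml
<configuration>
  <property>
    <name>dfs.replication</name>
    <value>2</value>
  </property>
  <property>
    <name>dfs.namenode.name.dir</name>
    <value>file:///home/ziyati/hadoop_tmp/name</value>
  </property>
  <property>
    <name>dfs.datanode.data.dir</name>
    <value>file:///home/ziyati/hadoop_tmp/data</value>
  </property>
</configuration>

# etc/hadoop/yarn-site.xml
<configuration>
  <property>
    <name>yarn.resourcemanager.hostname</name>
    <value>172.20.10.4</value>
  </property>
  <property>
    <name>yarn.nodemanager.aux-services</name>
    <value>mapreduce_shuffle</value>
  </property>
</configuration>

# etc/hadoop/workers   (Hadoop 3 uses "workers" instead of the old "slaves" file)
slave1
slave2

# etc/hadoop/hadoop-env.sh (append)
export JAVA_HOME=/usr/lib/jvm/java-1.8.0-openjdk-amd64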

#8 download spark
curl -O https://www-eu.apache.org/dist/spark/spark-2.4.0/spark-2.4.0-bin-hadoop2.7.tgz
tar -xzf spark-2.4.0-bin-hadoop2.7.tgz
sudo mv spark-2.4.0-bin-hadoop2.7 /usr/local/spark
rm spark-2.4.0-bin-hadoop2.7.tgz

#8.1 Set up spark environment variables.

echo export SPARK_HOME=/usr/local/spark >> ~/.bashrc


echo export PATH=\$SPARK_HOME/bin:\$PATH >> ~/.bashrc
echo export PATH=\$SPARK_HOME/sbin:\$PATH >> ~/.bashrc
source ~/.bashrc
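
# optional check that the spark binaries are on the PATH
spark-submit --version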

#8.2 Set up the Spark worker list (one worker node per line)

vi $SPARK_HOME/conf/slaves
172.20.10.5
172.20.10.6
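
# the document does not show conf/spark-env.sh; a minimal sketch (an assumption, not
# the author's exact file) that pins the standalone master and the JDK:
cp $SPARK_HOME/conf/spark-env.sh.template $SPARK_HOME/conf/spark-env.sh
echo export SPARK_MASTER_HOST=172.20.10.4 >> $SPARK_HOME/conf/spark-env.sh
echo export JAVA_HOME=/usr/lib/jvm/java-1.8.0-openjdk-amd64 >> $SPARK_HOME/conf/spark-env.sh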
#9 Clone the VM to slave1 and slave2
# on each clone, change the IP address and the hostname (example below)

# after changing the IP you have to re-apply the network configuration
sudo netplan apply
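
# for example, on slave1 (a sketch; repeat on slave2 with 172.20.10.6 and "slave2"):
sudo hostnamectl set-hostname slave1
sudo vi /etc/netplan/50-cloud-init.yaml   # set addresses: [172.20.10.5/24]
sudo netplan apply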

# Format HDFS (run once on the master, before the first start)
hdfs namenode -format

#10 Start hadoop services


cd /usr/local/hadoop/sbin
./start-dfs.sh
./start-yarn.sh

#11 Connect to the master to verify

http://172.20.10.4:9870   # HDFS NameNode web UI
http://172.20.10.4:8088   # YARN ResourceManager web UI

# start spark
cd /usr/local/spark/sbin

./start-all.sh
http://172.20.10.4:8080   # Spark standalone master web UI
# Spark must be launched from the master node

############
# Thanks !
############

# all services should now be up on the master

# check on slave1
jps
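
# with everything running, jps typically shows the standard daemons:
# on the master: NameNode, SecondaryNameNode, ResourceManager, Master
# on each slave: DataNode, NodeManager, Worker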

# Complete the environment with Anaconda


# Installing Jupyter
curl -O https://repo.anaconda.com/archive/Anaconda3-2019.03-Linux-x86_64.sh
bash Anaconda3-2019.03-Linux-x86_64.sh

# create a virtual env called jupyter


conda create -n jupyter
# activate it
source activate jupyter
conda install notebook

# start jupyter on the master, bound to its IP


jupyter notebook --ip 172.20.10.4
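
# the notebook server listens on port 8888 by default; open http://172.20.10.4:8888
# in a browser and paste the token printed in the terminal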

# Installing findspark
# findspark is a Python library that makes PySpark importable and usable like any
# other Python library (it adds it to sys.path at runtime).

pip install findspark

# Create your first Spark application (each numbered block below is a separate notebook cell)


# cell 1
import findspark
# cell 2
findspark.init()
# cell 3
import pyspark
# cell 4
sc = pyspark.SparkContext(master='spark://172.20.10.4:7077', appName='myApp')
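
# cell 5 (a quick sanity check, not in the original: run a small job on the cluster)
rdd = sc.parallelize(range(1000))
print(rdd.sum())   # expected: 499500
sc.stop()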
