
Project Gladiator

Retail project
Group 1

Prepared By:

Name PSNo.
Mahima Tendulkar 10671415
Rohan Badgujar 10671304
Rahul Kumar Singh 10671510

INDEX
========================================================

Sr No.  Content
1       Design and algorithm
2       Data ingestion and preparation
3       Transformations using Spark
4       Using Delta Lake for update and delete
5       Kafka-Spark structured streaming
6       Integration with AWS
7       Visualization
8       Conclusion

DESIGN AND ALGORITHM
========================================================

On Premises:
(architecture diagram)

On AWS:
(architecture diagram)

SCHEMA OF TABLES
==============================================
(schema diagram)

STEP 1: DATA LOADING AND PREPARATION
=========================================================

1. Creating tables in MySQL and loading data from .csv files

-- CREATE DATABASE project

create database project;
use project;

-- CREATE REGION TABLE

create table region(r_regionkey int(38) NOT NULL, r_name varchar(25) NOT NULL, r_comment varchar(152), primary key (r_regionkey));

-- LOAD DATA INTO REGION TABLE

load data infile '/var/lib/mysql-files/region.csv' into table region fields terminated by ',' enclosed by '"' lines terminated by '\n';

-- CREATE NATION TABLE

create table nation(n_nationkey int(38), n_name varchar(25), n_regionkey int(38), n_comment varchar(152), primary key (n_nationkey), foreign key (n_regionkey) references region(r_regionkey));

-- LOAD DATA INTO NATION TABLE

load data infile '/var/lib/mysql-files/nation.csv' into table nation fields terminated by ',' enclosed by '"' lines terminated by '\n';

-- CREATE CUSTOMER TABLE

create table customer(c_custkey int(38), c_name varchar(25), c_address varchar(40), c_nationkey int(38), c_phone varchar(15), c_acctbal decimal(12,2), c_mktsegment varchar(10), c_comment varchar(117), primary key (c_custkey), foreign key (c_nationkey) references nation(n_nationkey));

-- LOAD DATA INTO CUSTOMER TABLE

load data infile '/var/lib/mysql-files/customer.csv' into table customer fields terminated by ',' enclosed by '"' lines terminated by '\n';

-- CREATE ORDERS TABLE

create table orders(o_orderkey int(38) not null, o_custkey int(38) not null, o_orderstatus varchar(1) not null, o_totalprice float(12,2) not null, o_orderdate date not null, o_orderpriority varchar(15) not null, o_clerk varchar(15) not null, o_shippriority int(38) not null, o_comment varchar(79) not null, primary key (o_orderkey), foreign key (o_custkey) references customer(c_custkey));

-- LOAD DATA INTO ORDERS TABLE

load data infile '/var/lib/mysql-files/orders.csv' into table orders fields terminated by ',' enclosed by '"' lines terminated by '\n';

-- CREATE PART TABLE

create table part(p_partkey int(38), p_name varchar(55), p_mfgr varchar(25), p_brand varchar(10), p_type varchar(25), p_size int(38), p_container varchar(10), p_retailprice decimal(12,2), p_comment varchar(23), primary key (p_partkey));

-- LOAD DATA INTO PART TABLE

load data infile '/var/lib/mysql-files/part.csv' into table part fields terminated by ',' enclosed by '"' lines terminated by '\n';

-- CREATE SUPPLIER TABLE

create table supplier(s_suppkey int(38) NOT NULL, s_name varchar(25) NOT NULL, s_address varchar(40) NOT NULL, s_nationkey int(38) NOT NULL, s_phone varchar(40) NOT NULL, s_acctbal float(12,2) NOT NULL, s_comment varchar(101), primary key (s_suppkey), foreign key (s_nationkey) references nation(n_nationkey));

-- LOAD DATA INTO SUPPLIER TABLE

load data infile '/var/lib/mysql-files/supplier.csv' into table supplier fields terminated by ',' enclosed by '"' lines terminated by '\n';

-- CREATE PARTSUPP TABLE

create table partsupp(ps_partkey int(38), ps_suppkey int(38), ps_availqty int(38), ps_supplycost decimal(12,2), ps_comment varchar(199), primary key (ps_partkey, ps_suppkey), foreign key (ps_partkey) references part(p_partkey), foreign key (ps_suppkey) references supplier(s_suppkey));

-- LOAD DATA INTO PARTSUPP TABLE

load data infile '/var/lib/mysql-files/partsupp.csv' into table partsupp fields terminated by ',' enclosed by '"' lines terminated by '\n';

-- CREATE LINEITEM TABLE

create table lineitem(l_orderkey int(38), l_partkey int(38), l_suppkey int(38), l_linenumber int(38), l_quantity decimal(12,2), l_extendedprice decimal(12,2), l_discount decimal(12,2), l_tax decimal(12,2), l_returnflag varchar(1), l_linestatus varchar(1), l_shipdate date, l_commitdate date, l_receiptdate date, l_shipinstruct varchar(25), l_shipmode varchar(10), l_comment varchar(44), primary key (l_orderkey, l_linenumber), foreign key (l_orderkey) references orders(o_orderkey), foreign key (l_partkey, l_suppkey) references partsupp(ps_partkey, ps_suppkey), foreign key (l_partkey) references part(p_partkey), foreign key (l_suppkey) references supplier(s_suppkey));

-- LOAD DATA INTO LINEITEM TABLE

load data infile '/var/lib/mysql-files/lineitem.csv' into table lineitem fields terminated by ',' enclosed by '"' lines terminated by '\n';
2. Import data into Hive using Sqoop

sqoop import --connect jdbc:mysql://localhost/project --username hiveuser --password hivepassword --table region --create-hive-table --hive-table project.region --hive-import --hive-drop-import-delims -m 1

sqoop import --connect jdbc:mysql://localhost/project --username hiveuser --password hivepassword --table nation --create-hive-table --hive-table project.nation --hive-import --hive-drop-import-delims -m 1

sqoop import --connect jdbc:mysql://localhost/project --username hiveuser --password hivepassword --table customer --create-hive-table --hive-table project.customer --hive-import --hive-drop-import-delims -m 5

sqoop import --connect jdbc:mysql://localhost/project --username hiveuser --password hivepassword --table part --create-hive-table --hive-table project.part --hive-import --hive-drop-import-delims -m 5

sqoop import --connect jdbc:mysql://localhost/project --username hiveuser --password hivepassword --table lineitem --create-hive-table --hive-table project.lineitem --hive-import --hive-drop-import-delims -m 5

sqoop import --connect jdbc:mysql://localhost/project --username hiveuser --password hivepassword --table partsupp --create-hive-table --hive-table project.partsupp --hive-import --hive-drop-import-delims -m 5

sqoop import --connect jdbc:mysql://localhost/project --username hiveuser --password hivepassword --table orders --create-hive-table --hive-table project.orders --hive-import --hive-drop-import-delims -m 5

sqoop import --connect jdbc:mysql://localhost/project --username hiveuser --password hivepassword --table supplier --create-hive-table --hive-table project.supplier --hive-import --hive-drop-import-delims -m 5

3. Create dataframes in Spark using data in Hive

from pyspark.sql import *

spark.sql("use project")
spark.sql("show tables").show()

regiondf = spark.sql("select * from region")
nationdf = spark.sql("select * from nation")
partdf = spark.sql("select * from part")
ordersdf = spark.sql("select * from orders")
supplierdf = spark.sql("select * from supplier")
customerdf = spark.sql("select * from customer")
lineitemdf = spark.sql("select * from lineitem")
partsuppdf = spark.sql("select * from partsupp")

4. Create dataframes in Spark using data in MySQL

nation = spark.read.format("jdbc").option("url", "jdbc:mysql://localhost:3306/project").option("driver", "com.mysql.jdbc.Driver").option("dbtable", "nation").option("numPartitions", "5").option("user", "hiveuser").option("password", "hivepassword").load()

customer = spark.read.format("jdbc").option("url", "jdbc:mysql://localhost:3306/project").option("driver", "com.mysql.jdbc.Driver").option("dbtable", "customer").option("lowerBound", "1000").option("upperBound", "5000").option("numPartitions", "5").option("partitionColumn", "c_custkey").option("user", "hiveuser").option("password", "hivepassword").load()

region = spark.read.format("jdbc").option("url", "jdbc:mysql://localhost:3306/project").option("driver", "com.mysql.jdbc.Driver").option("dbtable", "region").option("lowerBound", "1000").option("upperBound", "5000").option("numPartitions", "5").option("partitionColumn", "r_regionkey").option("user", "hiveuser").option("password", "hivepassword").load()

supplier = spark.read.format("jdbc").option("url", "jdbc:mysql://localhost:3306/project").option("driver", "com.mysql.jdbc.Driver").option("dbtable", "supplier").option("lowerBound", "1000").option("upperBound", "5000").option("numPartitions", "5").option("partitionColumn", "s_suppkey").option("user", "hiveuser").option("password", "hivepassword").load()

partsupp = spark.read.format("jdbc").option("url", "jdbc:mysql://localhost:3306/project").option("driver", "com.mysql.jdbc.Driver").option("dbtable", "partsupp").option("lowerBound", "1000").option("upperBound", "5000").option("numPartitions", "5").option("partitionColumn", "ps_partkey").option("user", "hiveuser").option("password", "hivepassword").load()

orders = spark.read.format("jdbc").option("url", "jdbc:mysql://localhost:3306/project").option("driver", "com.mysql.jdbc.Driver").option("dbtable", "orders").option("lowerBound", "1000").option("upperBound", "5000").option("numPartitions", "5").option("partitionColumn", "o_orderkey").option("user", "hiveuser").option("password", "hivepassword").load()

lineitem = spark.read.format("jdbc").option("url", "jdbc:mysql://localhost:3306/project").option("driver", "com.mysql.jdbc.Driver").option("dbtable", "lineitem").option("lowerBound", "1000").option("upperBound", "5000").option("numPartitions", "5").option("partitionColumn", "l_orderkey").option("user", "hiveuser").option("password", "hivepassword").load()

part = spark.read.format("jdbc").option("url", "jdbc:mysql://localhost:3306/project").option("driver", "com.mysql.jdbc.Driver").option("dbtable", "part").option("lowerBound", "1000").option("upperBound", "5000").option("numPartitions", "5").option("partitionColumn", "p_partkey").option("user", "hiveuser").option("password", "hivepassword").load()

5. Data preparation
a. Checking for null values

select count(*) from lineitem where l_orderkey is NULL or l_partkey is NULL or l_suppkey is NULL or l_linenumber is NULL or l_quantity is NULL or l_extendedprice is NULL or l_discount is NULL or l_tax is NULL or l_returnflag is NULL or l_linestatus is NULL or l_shipdate is NULL or l_commitdate is NULL or l_receiptdate is NULL or l_shipinstruct is NULL or l_shipmode is NULL or l_comment is NULL;

b. Checking for bad records

select count(*) from customer where c_nationkey IS NULL;

+----------+
| count(*) |
+----------+
|        0 |
+----------+
1 row in set (0.09 sec)

select count(*) from customer where c_nationkey = '';

+----------+
| count(*) |
+----------+
|        0 |
+----------+
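The same checks can also be run directly on the Spark dataframes created in step 3. A minimal sketch, assuming customerdf is in scope (the same idiom generalizes to the other dataframes):

from pyspark.sql.functions import col, count, when

# Sketch: count NULLs per column of the customer dataframe from step 3.
null_counts = customerdf.select(
    [count(when(col(c).isNull(), c)).alias(c) for c in customerdf.columns]
)
null_counts.show()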

6. Record count validation in MySQL, Spark and Hive

Table      MySQL      Spark      Hive
Customer   150000     150000     150000
Orders     1500000    1500000    1500000
Lineitem   6001215    6001215    6001215
Part       200000     200000     200000
Partsupp   800000     800000     800000
Supplier   10000      10000      10000
Nation     25         25         25
Region     5          5          5
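The counts above were collected manually from each system. A minimal sketch of automating the Spark-side part of the comparison, assuming the JDBC dataframes from step 4 (nation, customer, orders, ...) are in scope:

# Sketch: compare Hive table counts with the corresponding MySQL (JDBC) dataframes.
jdbc_dfs = {"nation": nation, "customer": customer, "orders": orders}

for name, jdbc_df in jdbc_dfs.items():
    hive_count = spark.sql("select count(*) as c from {0}".format(name)).collect()[0]["c"]
    mysql_count = jdbc_df.count()
    print(name, "hive:", hive_count, "mysql:", mysql_count, "match:", hive_count == mysql_count)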

7. Partitioning and Bucketing

a. Partitioning using Hive

create table orders_part(o_orderkey int, o_custkey int, o_orderstatus string, o_totalprice double, o_orderdate string, o_orderpriority string, o_clerk string, o_shippriority int, o_comment string) partitioned by (year string, month string, day string);

insert overwrite table orders_part partition(year,month,day) select o_orderkey, o_custkey, o_orderstatus, o_totalprice, o_orderdate, o_orderpriority, o_clerk, o_shippriority, o_comment, substring(o_orderdate,1,4), substring(o_orderdate,6,2), substring(o_orderdate,9,2) from orders where o_orderkey<100;

Partitions were created in the following manner:

b. Bucketing using Hive

create table orders_bucket_1(o_orderkey int, o_custkey int, o_orderstatus string, o_totalprice double, o_orderpriority string, o_clerk string, o_shippriority int, o_comment string, o_orderdate string);

insert overwrite table orders_bucket_1 select o_orderkey, o_custkey, o_orderstatus, o_totalprice, o_orderpriority, o_clerk, o_shippriority, o_comment, o_orderdate from orders limit 1000;

create table orders_bucket_2(o_orderkey int, o_custkey int, o_orderstatus string, o_totalprice double, o_orderpriority string, o_clerk string, o_orderdate string, o_shippriority int, o_comment string) clustered by (o_orderdate) into 8 buckets;

insert overwrite table orders_bucket_2 select o_orderkey, o_custkey, o_orderstatus, o_totalprice, o_orderpriority, o_clerk, o_orderdate, o_shippriority, o_comment from orders;

Buckets were created in the following manner:

c. Partitioning using Spark

from pyspark.sql.functions import *

lineitemdf = spark.sql("select * from lineitem_ limit 10000")

lineitem = lineitemdf.select(col("l_shipdate"), year(col("l_shipdate")).alias("year"), month(col("l_shipdate")).alias("month"), dayofmonth(col("l_shipdate")).alias("day"))

lineitem.write.partitionBy("year","month","day").option("path", "/user/hive/warehouse/project3_rohan_hv.db/lineitem_part").save("lineitem_parts")

Partitions were created in the following manner:

d. Bucketing using Spark

lineitemdf = spark.sql("select * from lineitem limit 10000")

lineitemdf.write.partitionBy("l_shipmode").bucketBy(5, "l_linestatus").option("path", "/user/ak/data/project_data/line_buck").saveAsTable("line_buck")

Buckets were obtained in the following manner:

8. Observations and challenges

● Data load time

  Table      Sqoop     PySpark (from Hive)
  LINEITEM   20 sec    2 sec
  ORDERS     12 sec    1 sec

● The data format used was CSV, delimited by commas with '\n' as the line separator. Some string fields also contained commas, which caused column mismatches while loading. We overcame this by passing "--hive-drop-import-delims" during the Sqoop import; a Spark-side alternative relying on quoted-field handling is sketched below.
● As seen in point 6, the tables contain an enormous number of records. During partitioning and bucketing the number of records had to be limited due to system limitations.
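For reference, Spark's CSV reader can handle commas inside quoted string fields directly. A minimal sketch, assuming the raw customer.csv is readable from the local path used earlier (path and options are illustrative):

# Sketch: read a CSV whose string fields may contain commas, relying on the
# quote handling of Spark's CSV source instead of dropping delimiters.
customer_raw = spark.read \
    .option("header", "false") \
    .option("quote", '"') \
    .option("escape", '"') \
    .csv("file:///var/lib/mysql-files/customer.csv")

customer_raw.show(5)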

STEP 2: TRANSFORMATIONS USING SPARK
=============================================================
Query 1: The retail company wants to find the 10 customers holding the maximum account balances.

## sort
customerdf = spark.sql("select * from customer")
customerdf.sort(customerdf.c_acctbal.desc()).show(10)

Query 2: The retail company is planning to launch a scheme for customers holding a balance greater than 9990. Find how many customers are eligible for the scheme.

## filter

customerdf.filter(customerdf["c_acctbal"] > 9990).show()
customerdf.filter(customerdf["c_acctbal"] > 9990).count()

Query 3: The retail company wants to set up a new office but is unsure which location to choose. Help the company find out where most of its customers are located.

## join

nationdf = spark.sql("select * from nation")

customer_join_nation = customerdf.join(nationdf, customerdf.c_nationkey == nationdf.n_nationkey, 'left')
customer_join_nation.show(5)

## aggregation (group by)

customer_join_nation.select("c_nationkey", "n_name").groupBy("c_nationkey", "n_name").count().show()

Query 4: The retail company is interested in the details of customers from India who are not maintaining the minimum zero account balance.

## join, filter

cust_join_Natdf = customerdf.join(nationdf, customerdf.c_nationkey == nationdf.n_nationkey, 'left')

cust_join_Natdf.select('c_custkey', 'c_name', 'n_name', 'c_acctbal').filter((cust_join_Natdf.n_name == "INDIA") & (cust_join_Natdf.c_acctbal < 0)).show()

Query 5: The newly appointed CFO wants to know the average account balance maintained by customers in each region. Help him find the solution.

## join, filter, aggregation

cust_join_Natdf = customerdf.join(nationdf, customerdf.c_nationkey == nationdf.n_nationkey, "inner")

custNatRegdf = cust_join_Natdf.join(regiondf, cust_join_Natdf.n_regionkey == regiondf.r_regionkey, 'inner')

custNatRegdf.groupby('r_name').agg({'c_acctbal': 'avg'}).show()

Query 6: A technical architect at the retail company wants to maintain unique customer details. Help him out.

## dropDuplicates
customerdf.select('c_nationkey', 'c_mktsegment').dropDuplicates().show()

Query 7: The marketing team at the retail company wants to know the distribution of market segments across various nations. Help them get a clear overview of the market.

## crosstab
cust_join_Natdf.crosstab('n_name', 'c_mktsegment').show()

Query 8: The shipping department at the retail company wants to know the breakdown of services across the different modes of shipment. Help them analyze the required data.

lineitemdf.groupby('l_shipmode').agg({'l_extendedprice': 'mean'}).show()

Query 9: The retail company is planning to give Diwali gifts to its most trusted and loyal customers, who have made the maximum amount of purchases. Help them identify suitable customers for the gift.

from pyspark.sql.functions import sum as _sum

custjoinOrd = customerdf.join(ordersdf, customerdf.c_custkey == ordersdf.o_custkey, "inner")

total_Order_amt_df = custjoinOrd.select('c_name', 'o_totalprice').groupby('c_name').agg(_sum("o_totalprice").alias("total_order_amt"))

total_Order_amt_df.sort(total_Order_amt_df.total_order_amt.desc()).show(25)

STEP 3: USING DELTA LAKE FOR UPDATE AND DELETE
=============================================================

1. Creating a Delta table

import pyspark
from pyspark.sql.functions import *
from pyspark.sql import SparkSession

spark = pyspark.sql.SparkSession.builder.appName("delta") \
    .config("spark.jars.packages", "io.delta:delta-core_2.12:0.7.0") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") \
    .getOrCreate()

from delta.tables import *

ordersdf = spark.read.jdbc(
    url='jdbc:mysql://localhost:3306/retail',
    table='ORDERS',
    column='O_ORDERKEY',
    lowerBound=0,
    upperBound=3000,
    numPartitions=10,
    properties={
        'user': 'hiveuser',
        'password': 'hivepassword'
    }
)

spark.sql("use project")
spark.sql("show tables").show()

ordersdf = spark.sql("select * from orders")

ordersdf.write.format("delta").save("file:///home/ak/delta/project")

spark.sql("CREATE TABLE orders_delta USING DELTA LOCATION 'file:///home/ak/delta/project/'")

spark.sql("show tables").show()

deltaTable = DeltaTable.forPath(spark, "file:///home/ak/delta/project")

2. Updating records
Update the order status from F to Fail

deltaTable.toDF().show()

deltaTable.update(col("o_orderstatus") == "F", {"o_orderstatus": lit("Fail")})

deltaTable.toDF().show()

3. Deleting records

## delete the records where the order priority is 5-LOW

## before the delete operation
spark.sql("select * from orders_delta where o_orderkey=1200006").show()

## delete operation
deltaTable.delete("o_orderpriority='5-LOW'")
df1 = deltaTable.toDF()
df1.show()

## after the delete operation
spark.sql("select * from orders_delta where o_orderkey=1200006").show()

STEP 4: KAFKA-SPARK STRUCTURED STREAMING
=============================================================

cd kafka_2.11-2.4.1/bin

## new tab: start zookeeper
./zookeeper-server-start.sh ../config/zookeeper.properties

## new tab: start the kafka server
./kafka-server-start.sh ../config/server.properties

## new tab: create the topic "retail"
./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 1 --partitions 1 --topic retail

## in connect-file-source.properties (kafka_2.11-2.4.1/config folder) set file=<path to the data file> and topic=retail

## new tab: start the producer (file source connector)
./connect-standalone.sh ../config/connect-standalone.properties ../config/connect-file-source.properties

## code for spark structured streaming
pyspark --packages org.apache.spark:spark-sql-kafka-0-10_2.11:2.4.1


from pyspark.sql.functions import *

df = spark.readStream \
    .format("kafka") \
    .option("kafka.bootstrap.servers", "localhost:9092") \
    .option("subscribe", "retail") \
    .option("startingOffsets", "earliest") \
    .load()

# With startingOffsets you can choose whether to re-read all messages from the beginning (earliest) or only those arriving after the last one (latest).

df2 = df.selectExpr("timestamp", "CAST(value AS STRING)")

# split on commas that are not inside double quotes
pattern = ',(?=([^"]*"[^"]*")*[^"]*$)'

df_split = df2.withColumn("c_custkey", split(col("value"), pattern).getItem(0)) \
    .withColumn("c_name", split(col("value"), pattern).getItem(1)) \
    .withColumn("c_address", split(col("value"), pattern).getItem(2)) \
    .withColumn("c_nationkey", split(col("value"), pattern).getItem(3)) \
    .withColumn("c_phone", split(col("value"), pattern).getItem(4)) \
    .withColumn("c_acctbal", split(col("value"), pattern).getItem(5)) \
    .withColumn("c_mktsegment", split(col("value"), pattern).getItem(6)) \
    .withColumn("c_comment", split(col("value"), pattern).getItem(7))

df_sel = df_split.select("timestamp", "c_nationkey", "c_acctbal")

df_agg = df_sel.withWatermark("timestamp", "10 minutes") \
    .groupBy(df_sel['c_nationkey'], window("timestamp", "30 seconds", "18 seconds")) \
    .agg({"c_acctbal": "sum"}) \
    .withColumnRenamed("sum(c_acctbal)", "total_bal") \
    .orderBy('total_bal')

query = df_agg.writeStream \
    .outputMode("complete") \
    .format("console") \
    .option("truncate", "false") \
    .start() \
    .awaitTermination()

Observations and challenges:

● The Kafka dataframe generally consists of key, value, topic, partition, offset, timestamp and timestampType columns, which had to be split into the required format (the raw schema can be inspected as sketched below).
● The value column of the Kafka dataframe had to be split using a regex pattern so that commas inside quoted fields would not cause a column mismatch.
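A minimal sketch of inspecting the raw Kafka stream before splitting, assuming the same df as above (the console sink here is only for debugging):

# Sketch: the Kafka source exposes key, value, topic, partition, offset,
# timestamp and timestampType; key and value arrive as binary and are cast here.
df.printSchema()

raw_query = df.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)", "topic", "partition", "offset", "timestamp") \
    .writeStream \
    .format("console") \
    .option("truncate", "false") \
    .start()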

STEP 5: INTEGRATION WITH AWS SERVICES
============================================================

1. S3 integration with Hive

1) Put the data (supplier.csv) at a specific location in S3.

2) Go to the IAM user, select your username, and create an access key to get the access key ID and secret access key.

3) In core-site.xml add the lines below:

<property>
<name>fs.s3a.awsAccessKeyId</name>
<value>accesskey</value>
</property>

<property>
<name>fs.s3a.awsSecretAccessKey</name>
<value>secretaccesskey</value>
</property>

<property>
<name>fs.s3n.awsAccessKeyId</name>
<value>accesskey</value>
</property>

<property>
<name>fs.s3n.awsSecretAccessKey</name>
<value>secretaccesskey</value>
</property>

4) Download the jars below and put them in spark/jars as well as in a separate folder jar-s3-hive:

https://repo1.maven.org/maven2/com/jamesmurty/utils/java-xmlbuilder/0.4/java-xmlbuilder-0.4.jar

https://repo1.maven.org/maven2/commons-logging/commons-logging/1.1.1/commons-logging-1.1.1.jar

https://repo1.maven.org/maven2/org/apache/commons/commons-lang3/3.4/commons-lang3-3.4.jar

https://repo1.maven.org/maven2/commons-httpclient/commons-httpclient/3.1/commons-httpclient-3.1.jar

https://repo1.maven.org/maven2/commons-codec/commons-codec/1.3/commons-codec-1.3.jar

https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/1.11.892/aws-java-sdk-bundle-1.11.892.jar

https://repo1.maven.org/maven2/net/java/dev/jets3t/jets3t/0.9.4/jets3t-0.9.4.jar

https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/2.10.0/hadoop-aws-2.10.0.jar

5) Add the line below to hadoop-env.sh:

export HADOOP_CLASSPATH=$HADOOP_CLASSPATH:$HOME/jar-s3-hive/*

6) In spark-defaults.conf (keys shown as placeholders):

spark.hadoop.fs.s3a.impl org.apache.hadoop.fs.s3a.S3AFileSystem
spark.hadoop.fs.s3a.access.key <accesskey>
spark.hadoop.fs.s3a.secret.key <secretaccesskey>
spark.hadoop.fs.s3n.access.key <accesskey>
spark.hadoop.fs.s3n.secret.key <secretaccesskey>

7) In Hive:

set fs.s3a.awsAccessKeyId=<accesskey>;

set fs.s3a.awsSecretAccessKey=<secretaccesskey>;

use project;

create external table if not exists supplier_s3a(s_suppkey int, s_name string, s_address string, s_nationkey int, s_phone string, s_acctbal double, s_comment string) ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde' WITH SERDEPROPERTIES ("separatorChar"=",", "quoteChar"="\"") LOCATION 's3a://lti858/project3/';

create external table if not exists supplier_s3n(s_suppkey int, s_name string, s_address string, s_nationkey int, s_phone string, s_acctbal double, s_comment string) ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde' WITH SERDEPROPERTIES ("separatorChar"=",", "quoteChar"="\"") LOCATION 's3n://lti858/project3/';

2. S3 integration with Spark

# creating a dataframe in pyspark using data in s3

supplierdfs3n = spark.read.csv("s3n://lti858/project3/supplier.csv")
supplierdfs3n.show()

# writing data to s3 from a dataframe

nationdf = spark.sql("select * from nation")
nationdf.write.option("header", "true").csv("s3n://lti858/project3_write/")

3. Run Spark job on EMR

spark-submit \
  --master spark://localhost:7077 \
  --deploy-mode cluster \
  --supervise \
  --executor-memory 20G \
  --total-executor-cores 100 \
  /path/to/examples.py

# deploy-mode can be client for client mode
spark-submit \
  --master yarn \
  --deploy-mode cluster \
  --executor-memory 20G \
  --num-executors 50 \
  /path/to/examples.py

4. Create an S3 bucket using the AWS CLI

# The following command creates a bucket named my-bucket:

aws s3api create-bucket --bucket my-bucket --region us-east-1
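The same bucket can also be created programmatically with boto3; a minimal sketch using the bucket name and region from the CLI example (for regions other than us-east-1 a LocationConstraint must also be passed):

import boto3

# Sketch: create the bucket from Python instead of the CLI.
s3 = boto3.client('s3', region_name='us-east-1')
s3.create_bucket(Bucket='my-bucket')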

5. Redshift integration with Spark

## read data from redshift into a dataframe in spark

partdf = spark.read \
    .format("com.databricks.spark.redshift") \
    .option("url", "jdbc:redshift://redshifthost:5439/database?user=username&password=pass") \
    .option("dbtable", "part") \
    .option("tempdir", "s3n://lti858/Mahima_Tendulkar/temdir_redshift") \
    .load()

## write data into redshift from a dataframe in spark

ordersdf.write \
    .format("com.databricks.spark.redshift") \
    .option("url", "jdbc:redshift://redshifthost:5439/database?user=username&password=pass") \
    .option("dbtable", "orders") \
    .option("tempdir", "s3n://lti858/Retail_Project1/temdir_redshift") \
    .mode("error") \
    .save()

# Using IAM role based authentication

df.write \
    .format("com.databricks.spark.redshift") \
    .option("url", "jdbc:redshift://redshifthost:5439/database?user=username&password=pass") \
    .option("dbtable", "orders") \
    .option("tempdir", "s3n://lti858/Retail_Project1/temdir_redshift") \
    .option("aws_iam_role", "arn:aws:iam::123456789000:role/redshift_iam_role") \
    .mode("error") \
    .save()

6. Snowflake integration with Spark

# include the jars below in the spark/jars folder

https://repo1.maven.org/maven2/net/snowflake/spark-snowflake_2.11/2.8.2-spark_2.4/spark-snowflake_2.11-2.8.2-spark_2.4.jar

https://repo1.maven.org/maven2/net/snowflake/snowflake-jdbc/3.12.13/snowflake-jdbc-3.12.13.jar

## using the snowflake web UI, create the customer table in snowflake with the command below

use role accountadmin;
use snowdb;
use schema snowschema;
create or replace table customersnow as select * from SNOWFLAKE_SAMPLE_DATA.TPCH_SF1.CUSTOMER;

pyspark --packages net.snowflake:snowflake-jdbc:3.8.0,net.snowflake:spark-snowflake_2.11:2.4.14-spark_2.4

## read data from snowflake into a dataframe in spark

sfOptions = {
    "sfURL": "da67402.ap-south-1.aws.snowflakecomputing.com",
    "sfUser": "mahima",
    "sfPassword": "password",
    "sfDatabase": "SNOWDB",
    "sfSchema": "SNOWSCHEMA",
    "sfWarehouse": "COMPUTE_WH",
    "sfRole": "ACCOUNTADMIN",
    "sfaccount": "da67402"
}

SNOWFLAKE_SOURCE_NAME = "net.snowflake.spark.snowflake"

df = spark.read.format(SNOWFLAKE_SOURCE_NAME) \
    .options(**sfOptions) \
    .option("query", "select * from SNOWDB.SNOWSCHEMA.CUSTOMERSNOW") \
    .load()

df.show()

## write data from spark to snowflake

spark.sql("use project")
nationdf = spark.sql("select * from nation")
nationdf.write \
    .format(SNOWFLAKE_SOURCE_NAME) \
    .options(**sfOptions) \
    .option("dbtable", "nation") \
    .save()

7. AWS Lambda script for loading data from S3 to DynamoDB

import json
import urllib.parse
import boto3
import csv

print('Loading function')

s3 = boto3.client('s3')
dynamodb = boto3.client('dynamodb')

def lambda_handler(event, context):
    bucket = event['Records'][0]['s3']['bucket']['name']
    key = event['Records'][0]['s3']['object']['key']

    response = s3.get_object(Bucket=bucket, Key=key)
    data = response['Body'].read().decode('utf-8')
    rows = data.split("\n")
    nationr = list(csv.reader(rows))
    print(nationr)

    arr = dynamodb.list_tables()
    if 'Nation' not in arr['TableNames']:
        dynamodb.create_table(
            AttributeDefinitions=[{'AttributeName': nationr[0][0], 'AttributeType': 'N'}],
            TableName='Nation',
            KeySchema=[{'AttributeName': nationr[0][0], 'KeyType': 'HASH'}],
            BillingMode='PROVISIONED',
            ProvisionedThroughput={
                'ReadCapacityUnits': 5,
                'WriteCapacityUnits': 5
            })
        cols = [nationr[0][0], nationr[0][1], nationr[0][2], nationr[0][3]]
    else:
        cols = [i['AttributeName'] for i in dynamodb.describe_table(TableName='Nation')['Table']['AttributeDefinitions']]

    for row in nationr[1:]:
        try:
            dynamodb.put_item(TableName='Nation',
                Item={
                    cols[0]: {'N': row[0]},
                    nationr[0][1]: {'S': row[1]},
                    nationr[0][2]: {'S': row[2]},
                    nationr[0][3]: {'S': row[3]}
                })
        except IndexError:
            break
    return {
        'statusCode': 200
    }

## TEST:

{
  "Records": [
    {
      "eventVersion": "2.0",
      "eventSource": "aws:s3",
      "awsRegion": "us-east-1",
      "eventTime": "1970-01-01T00:00:00.000Z",
      "eventName": "ObjectCreated:Put",
      "userIdentity": {
        "principalId": "EXAMPLE"
      },
      "requestParameters": {
        "sourceIPAddress": "127.0.0.1"
      },
      "responseElements": {
        "x-amz-request-id": "EXAMPLE123456789",
        "x-amz-id-2": "EXAMPLE123/5678abcdefghijklambdaisawesome/mnopqrstuvwxyzABCDEFGH"
      },
      "s3": {
        "s3SchemaVersion": "1.0",
        "configurationId": "testConfigRule",
        "bucket": {
          "name": "lti858",
          "ownerIdentity": {
            "principalId": "EXAMPLE"
          },
          "arn": "arn:aws:s3:::lti858"
        },
        "object": {
          "key": "Retail_Project1/dynamodbwrite/nation.csv",
          "size": 1024,
          "eTag": "0123456789abcdef0123456789abcdef",
          "sequencer": "0A1B2C3D4E5F678901"
        }
      }
    }
  ]
}

STEP 6: VISUALIZATION
============================================================

Scenario 1: The marketing team at the retail company wants to know the distribution of market segments across various nations. Help them get a clear overview of the market.

cust_nation_df = customerdf.join(nationdf, customerdf.c_nationkey == nationdf.n_nationkey, 'left')

cust_nation_df.crosstab('n_name', 'c_mktsegment').show()

Market_Segment_Distribution_df = cust_nation_df.crosstab('n_name', 'c_mktsegment').write.format('csv').save("file:///home/ak/PROJECT3/outputCSV/Market_Segment_Distribution.csv", header='true')

Scenario 2: The shipping department at the retail company wants to know the breakdown of services across the different modes of shipment. Help them analyze the required data.

lineitemdf_ = spark.sql("select * from lineitem_ limit 1000")

shipmode_price_df_ = lineitemdf_.groupby('l_shipmode').agg({'l_extendedprice': 'mean'}).sort('l_shipmode')

shipmode_price_df_.write.format('csv').save("file:///home/ak/PROJECT3/outputCSV/ShipModeAvgPrice.csv", header='true')

shipmode_price_df_.show()

Scenario 3: In the past year (1998), which were the most sold items, with their counts, using the lineitem and part tables.

a = lineitemdf.select("l_partkey", "l_commitdate")
b = a.sort(a.l_commitdate.desc())
c = b.filter(b["l_commitdate"] > "1997-12-31")
d = partdf.select("p_partkey", "p_name")
e = c.join(d, c.l_partkey == d.p_partkey, 'left')
f = e.select("p_name", "l_partkey").groupBy('p_name').count().sort('count', ascending=False)
f.show()

Scenario 4: The retail company is planning to give Diwali gifts to its most trusted and loyal customers, who have made the maximum amount of purchases. Help them identify suitable customers for the gift.
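No code was reproduced for this scenario; a minimal sketch, reusing the customer/orders aggregation from Query 9 and writing the result out for plotting (the output file name is illustrative):

from pyspark.sql.functions import sum as _sum

# Sketch: top purchasers by total order amount, written to CSV for visualization.
custjoinOrd = customerdf.join(ordersdf, customerdf.c_custkey == ordersdf.o_custkey, "inner")

top_customers_df = custjoinOrd.groupBy("c_name") \
    .agg(_sum("o_totalprice").alias("total_order_amt")) \
    .orderBy("total_order_amt", ascending=False) \
    .limit(25)

top_customers_df.write.format("csv").save("file:///home/ak/PROJECT3/outputCSV/TopCustomers.csv", header="true")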

Scenario 5: Find the number of customers region-wise.

cust_join_natdf = customerdf.join(nationdf, customerdf.c_nationkey == nationdf.n_nationkey, 'left')

cust_nat_regdf = cust_join_natdf.join(regiondf, regiondf.r_regionkey == cust_join_natdf.n_regionkey, 'left')

import pyspark.sql.functions as f

finaldf = cust_nat_regdf.groupBy("r_name").count().select("r_name", f.col('count').alias('total_number_of_customers'))

finaldf.show()

Scenario 6:

CONCLUSION
========================================================
