Project Retail Group1
Retail project
Group 1
Prepared By:
Name PSNo.
Mahima Tendulkar 10671415
Rohan Badgujar 10671304
Rahul Kumar Singh 10671510
INDEX
========================================================
Sr No. Content
1 Design and algorithm
2 Data Ingestion and Preparation
3 Transformations using Spark
4 Using Delta Lake for update and delete
5 Kafka Spark Structured Streaming
6 Integration with AWS
7 Visualization
8 Conclusion
On AWS:
SCHEMA OF TABLES
==============================================
"hiveuser").option("password", "hivepassword").load()
"hiveuser").option("password", "hivepassword").load()
5. Data preparation
a. Checking for null values
select count(*) from lineitem where orderkey is NULL or partkey is NULL
or suppkey is NULL or linenumber is NULL or quantity is NULL or
extendedprice is NULL or discount is NULL or tax is NULL or returnflag
is NULL or linestatus is NULL or shipdate is NULL or commitdate is NULL
or receiptdate is NULL or shipinstruct is NULL or shipmode is NULL or
comment is NULL;
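The same check can also be done through the DataFrame API; a minimal sketch, assuming the lineitem table is available in the current Hive database (count and when come from pyspark.sql.functions):

# Hedged sketch: count NULLs per column instead of one combined count
from pyspark.sql.functions import col, count, when
lineitemdf = spark.sql("select * from lineitem")
lineitemdf.select([count(when(col(c).isNull(), c)).alias(c) for c in lineitemdf.columns]).show()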
Nation 25 25 25
Region 5 5 5
lineitem=lineitemdf.select(col("l_shipdate"), year(col("l_shipdate")).alias("year"), month(col("l_shipdate")).alias("month"), dayofmonth(col("l_shipdate")).alias("day"))
lineitem.write.partitionBy("year", "month", "day").option("path", "/user/hive/warehouse/project3_rohan_hv.db/lineitem_part").save("lineitem_parts")
lineitemdf.write.partitionBy("l_shipmode").bucketBy(5, "l_linestatus").option("path", "/user/ak/data/project_data/line_buck").saveAsTable("line_buck")
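To verify the layout, the partitioned output can be read back and filtered on the partition columns; a minimal sketch, assuming the path passed to save() above (the filter values are only illustrative):

# Hedged sketch: read the partitioned parquet output back and prune on year/month
lineitem_part_df = spark.read.parquet("lineitem_parts")
lineitem_part_df.filter((col("year") == 1998) & (col("month") == 1)).count()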
Buckets were obtained in the following manner:
● The data format used was CSV, delimited by commas with '\n' as the line separator.
However, certain fields also contained commas within the strings, which caused
column mismatches while loading. We overcame this challenge by passing
"--hive-drop-import-delims" to the Sqoop import.
● As can be observed from point 6, the data has an enormous number of records. During
partitioning and bucketing, the number of records had to be limited due to system
limitations.
##sort
customerdf=spark.sql("select * from customer")
customerdf.sort(customerdf.c_acctbal.desc()).show(10)
##filter
customerdf.filter(customerdf["c_acctbal"]>9990).show()
customerdf.filter(customerdf["c_acctbal"]>9990).count()
##join
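The join that produces customer_join_nation is not reproduced above; a minimal sketch, assuming the customer and nation DataFrames loaded earlier:
# Hedged sketch of the join used by the aggregation below
customer_join_nation = customerdf.join(nationdf, customerdf.c_nationkey == nationdf.n_nationkey, "inner")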
##aggregation(group by)
customer_join_nation.select("c_nationkey","n_name").groupBy("c_nationkey","n_name").count().show()
##join,filter,aggregation
cust_join_natdf=customerdf.join(nationdf, customerdf.c_nationkey==nationdf.n_nationkey, "inner")
cust_join_natdf.select('c_custkey','c_name','n_name','c_acctbal').filter((cust_join_natdf.n_name=="INDIA") & (cust_join_natdf.c_acctbal < 0)).show()
cust_nat_regdf=cust_join_natdf.join(regiondf, cust_join_natdf.n_regionkey==regiondf.r_regionkey, 'inner')
cust_nat_regdf.groupby('r_name').agg(sum('c_acctbal').alias('sum_acctbal'), mean('c_acctbal').alias('mean_acctbal')).show()
##dropDuplicates
customerdf.select('c_nationkey','c_mktsegment').dropDuplicates().show()
##crosstab
cust_join_natdf.crosstab('n_name','c_mktsegment').show()
lineitemdf.groupby('l_shipmode').agg({'l_extendedprice':'mean'}).show()
custjoinOrd=customerdf.join(orderdf, customerdf.c_custkey==orderdf.o_custkey, "inner")
total_Order_amt_df=custjoinOrd.select('c_name','o_totalprice').groupby('c_name').agg(sum("o_totalprice").alias("total_order_amt")).limit(25)
total_Order_amt_df.sort(total_Order_amt_df.total_order_amt.desc()).show()
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from delta.tables import DeltaTable

# Start a session with the Delta Lake package and SQL extensions enabled
spark = SparkSession.builder \
    .config("spark.jars.packages", "io.delta:delta-core_2.12:0.7.0") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") \
    .getOrCreate()

spark.sql("use project")
spark.sql("show tables").show()

# Write the orders data out in Delta format
ordersdf.write.format("delta").save("file:///home/ak/delta/project")
spark.sql("show tables").show()

# Load the Delta files back as a DeltaTable for updates and deletes
deltaTable = DeltaTable.forPath(spark, "file:///home/ak/delta/project")
2. Updating records
Update the order status from F to Fail
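The update itself goes through the DeltaTable API; a minimal sketch, assuming the status column is named o_orderstatus (as in the TPC-H orders schema):
# Hedged sketch: set o_orderstatus to 'Fail' wherever it is currently 'F'
deltaTable.update(condition = "o_orderstatus = 'F'", set = {"o_orderstatus": "'Fail'"})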
df1=deltaTable.toDF()
df1.show()
3. Deleting records
##delete operation
deltaTable.delete("o_orderpriority='5-LOW'")
df1=deltaTable.toDF()
df1.show()
cd kafka_2.11-2.4.1/bin
./zookeeper-server-start.sh ../config/zookeeper.properties
./kafka-server-start.sh ../config/server.properties
./connect-standalone.sh ../config/connect-standalone.properties ../config/connect-file-source.properties
df = spark.readStream \
.format("kafka") \
.option("kafka.bootstrap.servers", "localhost:9092") \
.option("subscribe", "retail") \
.option("startingOffsets", "earliest") \
.load()
# With startingOffsets you can choose whether to re-read all the messages from the beginning (earliest) or only those arriving after the last one (latest).
df2 = df.selectExpr("timestamp", "CAST(value AS STRING)")
# Split the CSV value on commas that fall outside quoted strings
csv_split = split(col("value"), ',(?=([^"]*"[^"]*")*[^"]*$)')
df_split = df2.withColumn("c_custkey", csv_split.getItem(0)) \
    .withColumn("c_name", csv_split.getItem(1)) \
    .withColumn("c_address", csv_split.getItem(2)) \
    .withColumn("c_nationkey", csv_split.getItem(3)) \
    .withColumn("c_phone", csv_split.getItem(4)) \
    .withColumn("c_acctbal", csv_split.getItem(5)) \
    .withColumn("c_mktsegment", csv_split.getItem(6)) \
    .withColumn("c_comment", csv_split.getItem(7))
df_sel = df_split.select("timestamp", "c_nationkey", "c_acctbal")
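The aggregation that feeds the streaming sink below is not reproduced here; a minimal sketch, assuming a running average of account balance per nation key:
# Hedged sketch: df_agg as used by the writeStream below (the grouping is an assumption)
df_agg = df_sel.groupBy("c_nationkey") \
    .agg(avg(col("c_acctbal").cast("double")).alias("avg_acctbal"))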
query = df_agg.writeStream \
    .outputMode("complete") \
    .format("console") \
    .option("truncate", "false") \
    .start()
query.awaitTermination()
<property>
<name>fs.s3a.awsSecretAccessKey</name>
<value>secretaccesskey</value>
</property>
<property>
<name>fs.s3n.awsAccessKeyId</name>
<value>accesskey</value>
</property>
<property>
<name>fs.s3n.awsSecretAccessKey</name>
<value>secretaccesskey</value>
</property>
4) Add the jars below to Spark's jars directory as well as to a separate folder jar-s3-hive:
https://repo1.maven.org/maven2/com/jamesmurty/utils/java-xmlbuilder/0.4/java-xmlbuilder-0.4.jar
https://repo1.maven.org/maven2/commons-logging/commons-logging/1.1.1/commons-logging-1.1.1.jar
https://repo1.maven.org/maven2/org/apache/commons/commons-lang3/3.4/commons-lang3-3.4.jar
https://repo1.maven.org/maven2/commons-httpclient/commons-httpclient/3.1/commons-httpclient-3.1.jar
https://repo1.maven.org/maven2/commons-codec/commons-codec/1.3/commons-codec-1.3.jar
https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/1.11.892/aws-java-sdk-bundle-1.11.892.jar
https://repo1.maven.org/maven2/net/java/dev/jets3t/jets3t/0.9.4/jets3t-0.9.4.jar
https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/2.10.0/hadoop-aws-2.10.0.jar
export HADOOP_CLASSPATH=$HADOOP_CLASSPATH:$HOME/jar-s3-hive/*
set fs.s3a.awsSecretAccessKey=secretaccesskey;
use project;
create external table if not exists supplier_s3a(s_suppkey int, s_name string,
s_address string, s_nationkey int, s_phone string, s_accbal double, s_comment string)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde' WITH SERDEPROPERTIES
("separatorChar"="," , "quoteChar"="\"" ) LOCATION 's3a://lti858/project3/';
create external table if not exists supplier_s3n(s_suppkey int, s_name string,
s_address string, s_nationkey int, s_phone string, s_accbal double, s_comment string)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde' WITH SERDEPROPERTIES
("separatorChar"="," , "quoteChar"="\"" ) LOCATION 's3n://lti858/project3/';
spark-submit \
--master spark://localhost:7077 \
--deploy-mode cluster \
--supervise \
--executor-memory 20G \
--total-executor-cores 100 \
/path/to/examples.py
spark-submit \
--master yarn \
--deploy-mode cluster \ # can be client for client mode
--executor-memory 20G \
--num-executors 50 \
/path/to/examples.py
.option("tempdir", "s3n://lti858/Retail_Project1/temdir_redshift") \
.option("aws_iam_role", "arn:aws:iam::123456789000:role/redshift_iam_role")\
.mode("error") \
.save()
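The options shown above are only the tail of the Redshift write; a minimal sketch of the complete call, assuming the com.databricks.spark.redshift connector and a hypothetical cluster endpoint and table name:
# Hedged sketch: full spark-redshift write (url and dbtable are hypothetical)
df.write \
    .format("com.databricks.spark.redshift") \
    .option("url", "jdbc:redshift://redshift-cluster.xxxxx.ap-south-1.redshift.amazonaws.com:5439/dev?user=awsuser&password=password") \
    .option("dbtable", "nation") \
    .option("tempdir", "s3n://lti858/Retail_Project1/temdir_redshift") \
    .option("aws_iam_role", "arn:aws:iam::123456789000:role/redshift_iam_role") \
    .mode("error") \
    .save()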
https://repo1.maven.org/maven2/net/snowflake/snowflake-jdbc/3.12.13/snowflake-jdbc-3.12.13.jar
pyspark --packages net.snowflake:snowflake-jdbc:3.8.0,net.snowflake:spark-snowflake_2.11:2.4.14-spark_2.4
# Snowflake Spark connector source name used by the read/write calls below
SNOWFLAKE_SOURCE_NAME = "net.snowflake.spark.snowflake"
sfOptions = {
    "sfURL" : "da67402.ap-south-1.aws.snowflakecomputing.com",
    "sfUser" : "mahima",
    "sfPassword" : "password",
    "sfDatabase" : "SNOWDB",
    "sfSchema" : "SNOWSCHEMA",
    "sfWarehouse" : "COMPUTE_WH",
    "sfRole" : "ACCOUNTADMIN",
    "sfaccount" : "da67402"
}
df = spark.read.format(SNOWFLAKE_SOURCE_NAME) \
.options(**sfOptions) \
.option("query","select * from SNOWDB.SNOWSCHEMA.CUSTOMERSNOW") \
.load()
df.show()
spark.sql("use project")
nationdf=spark.sql("select * from nation")
nationdf.write\
.format(SNOWFLAKE_SOURCE_NAME)\
.options(**sfOptions)\
.option("dbtable", "nation")\
.save()
import json
import urllib.parse
import boto3
import csv
print('Loading function')
s3 = boto3.client('s3')
dynamodb = boto3.client('dynamodb')
print(nationr)
'HASH'}],
BillingMode='PROVISIONED',
ProvisionedThroughput={
'ReadCapacityUnits': 5,
'WriteCapacityUnits': 5
})
cols = [nationr[0][0],nationr[0][1],nationr[0][2],nationr[0][3]]
else:
cols = [i['AttributeName'] for i in dynamodb.describe_table(TableName =
'Nation')['Table']['AttributeDefinitions']]
})
except IndexError:
break
return {
'statusCode' : 200
}
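Only fragments of the Lambda handler survive above; a minimal end-to-end sketch of the same flow (read the uploaded CSV from S3, create the Nation table on first run, then load every row into DynamoDB), where the hash key n_nationkey and the use of the header row for column names are assumptions:
# Hedged sketch of the handler; the fragments above come from a fuller version of this flow
def lambda_handler(event, context):
    # Locate the uploaded object from the S3 event record
    bucket = event['Records'][0]['s3']['bucket']['name']
    key = urllib.parse.unquote_plus(event['Records'][0]['s3']['object']['key'])
    body = s3.get_object(Bucket=bucket, Key=key)['Body'].read().decode('utf-8')
    nationr = list(csv.reader(body.splitlines()))
    print(nationr)
    # Create the table on first run (the hash key name is an assumption)
    if 'Nation' not in dynamodb.list_tables()['TableNames']:
        dynamodb.create_table(
            TableName='Nation',
            AttributeDefinitions=[{'AttributeName': 'n_nationkey', 'AttributeType': 'S'}],
            KeySchema=[{'AttributeName': 'n_nationkey', 'KeyType': 'HASH'}],
            BillingMode='PROVISIONED',
            ProvisionedThroughput={'ReadCapacityUnits': 5, 'WriteCapacityUnits': 5})
        dynamodb.get_waiter('table_exists').wait(TableName='Nation')
    # Use the CSV header row as column names and write every data row as an item
    cols = [nationr[0][0], nationr[0][1], nationr[0][2], nationr[0][3]]
    for row in nationr[1:]:
        try:
            dynamodb.put_item(TableName='Nation',
                              Item={cols[i]: {'S': str(row[i])} for i in range(4)})
        except IndexError:
            break
    return {'statusCode': 200}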
## TEST:
{
  "Records": [
    {
      "eventVersion": "2.0",
      "eventSource": "aws:s3",
      "awsRegion": "us-east-1",
      "eventTime": "1970-01-01T00:00:00.000Z",
      "eventName": "ObjectCreated:Put",
      "userIdentity": {
        "principalId": "EXAMPLE"
      },
      "requestParameters": {
        "sourceIPAddress": "127.0.0.1"
      },
      "responseElements": {
        "x-amz-request-id": "EXAMPLE123456789",
        "x-amz-id-2": "EXAMPLE123/5678abcdefghijklambdaisawesome/mnopqrstuvwxyzABCDEFGH"
      },
      "s3": {
        "s3SchemaVersion": "1.0",
        "configurationId": "testConfigRule",
        "bucket": {
          "name": "lti858",
          "ownerIdentity": {
            "principalId": "EXAMPLE"
          },
          "arn": "arn:aws:s3:::lti858"
        },
        "object": {
          "key": "Retail_Project1/dynamodbwrite/nation.csv",
          "size": 1024,
          "eTag": "0123456789abcdef0123456789abcdef",
          "sequencer": "0A1B2C3D4E5F678901"
        }
      }
    }
  ]
}
STEP 6: VISUALIZATION
============================================================
cust_nation_df.crosstab('n_name','c_mktsegment').show()
Market_Segment_Distribution_df = cust_nation_df.crosstab('n_name', 'c_mktsegment')
Market_Segment_Distribution_df.write.format('csv').save("file:///home/ak/PROJECT3/outputCSV/Market_Segment_Distribution.csv", header='true')
shipmode_price_df_ = lineitemdf_.groupby('l_shipmode').agg({'l_extendedprice': 'mean'}).sort(lineitemdf_.l_shipmode.asc())
shipmode_price_df_.write.format('csv').save("file:///home/ak/PROJECT3/outputCSV/ShipModeAvgPrice.csv", header='true')
shipmode_price_df_.show()
a = lineitemdf.select("l_partkey", "l_commitdate")
b = a.sort(a.l_commitdate.desc())
c = b.filter(b["l_commitdate"] > "1997-12-31")
d = partdf.select("p_partkey", "p_name")
e = c.join(d, c.l_partkey == d.p_partkey, 'left')
f = e.select("p_name", "l_partkey").groupBy('p_name').count().sort('count', ascending=False)
cust_join_natdf = customerdf.join(nationdf, customerdf.c_nationkey == nationdf.n_nationkey, 'left')
cust_nat_regdf = cust_join_natdf.join(regiondf, regiondf.r_regionkey == cust_join_natdf.n_regionkey, 'left')
finaldf = cust_nat_regdf.groupBy("r_name").count().select("r_name", col('count').alias('total_number_of_customers'))
finaldf.show()
Scenario 6:
CONCLUSION
========================================================