SQL Final Document
csv")
df.show()
df1.show()
cust.show()
prod.show()
df.createOrReplaceTempView("df")
df1.createOrReplaceTempView("df1")
cust.createOrReplaceTempView("cust")
prod.createOrReplaceTempView("prod")
sc.setLogLevel("ERROR")
spark.sql("select * from df order by id").show()
spark.sql("select * from df1 order by id").show()
====================================
Validate data
====================================
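As in the full program at the end of these notes, a quick look at every row confirms the load:
spark.sql("select * from df").show()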
====================================
Select two columns
====================================
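As in the full program at the end of these notes, project just id and tdate:
spark.sql("select id,tdate from df order by id").show()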
====================================
Multi Column filter
====================================
spark.sql("select id,tdate,category,spendby from df where category='Exercise'
and spendby='cash' ").show()
====================================
Multi Value Filter
====================================
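A sketch, assuming the two category values used elsewhere in these notes:
spark.sql("select * from df where category in ('Exercise','Gymnastics')").show()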
====================================
Like Filter
====================================
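A sketch, assuming a wildcard match on the category column:
spark.sql("select * from df where category like '%Gym%'").show()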
====================================
Not Filters
====================================
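A sketch using the not-equals operator:
spark.sql("select * from df where category != 'Exercise'").show()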
====================================
Not In Filters
====================================
spark.sql("select * from df where category not in
('Exercise','Gymnastics')").show()
====================================
Null Filters
====================================
spark.sql("select * from df where product is null").show()
====================================
Max Function
====================================
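A sketch, assuming a plain maximum over the amount column:
spark.sql("select max(amount) as max_amount from df").show()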
====================================
Count
====================================
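A sketch counting all rows:
spark.sql("select count(*) as count from df").show()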
====================================
Condition statement
====================================
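A sketch, assuming a CASE expression that flags cash purchases:
spark.sql("select id,amount,spendby,case when spendby='cash' then 1 else 0 end as status from df").show()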
====================================
Concat_ws data
====================================
spark.sql("select concat_ws('-',id,category,product) as concat from df
").show()
====================================
Lower Case data
====================================
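A sketch lower-casing the category column:
spark.sql("select id,lower(category) as category from df").show()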
====================================
Round the data
====================================
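A sketch rounding the amount column:
spark.sql("select id,round(amount) as amount from df").show()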
====================================
Trim the space
====================================
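A sketch trimming whitespace from the product column:
spark.sql("select id,trim(product) as product from df").show()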
====================================
Distinct the columns
====================================
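A sketch de-duplicating on two columns:
spark.sql("select distinct category,spendby from df").show()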
====================================
Substring/Split operation
====================================
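A sketch, assuming tdate is in MM-dd-yyyy form, so the year can be cut by position or by splitting on '-':
spark.sql("select id,substring(tdate,7,4) as sub_year,split(tdate,'-')[2] as split_year from df").show()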
====================================
Union all
====================================
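A sketch, assuming df and df1 share the same schema:
spark.sql("select * from df union all select * from df1").show()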
====================================
Aggregate sum with two columns
====================================
spark.sql("select category,spendby,sum(amount) as total from df group by
category,spendby").show()
====================================
Aggregate Count
====================================
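A sketch counting rows per category:
spark.sql("select category,count(*) as count from df group by category").show()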
====================================
Aggregate Max
====================================
spark.sql("select category, max(amount) as max from df group by
category").show()
====================================
Aggregate with Order Descending
====================================
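A sketch totalling per category and sorting largest first:
spark.sql("select category,sum(amount) as total from df group by category order by total desc").show()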
====================================
Window Row Number
====================================
spark.sql("SELECT category,amount, row_number() OVER ( partition by
category order by amount desc ) AS row_number FROM df").show()
====================================
Window Dense_rank Number
====================================
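A sketch, following the row_number pattern above:
spark.sql("SELECT category,amount, dense_rank() OVER ( partition by category order by amount desc ) AS dense_rank FROM df").show()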
====================================
Window Lead function
====================================
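A sketch pulling the next amount within each category:
spark.sql("SELECT category,amount, lead(amount) OVER ( partition by category order by amount desc ) AS lead FROM df").show()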
====================================
Window lag function
====================================
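A sketch pulling the previous amount within each category:
spark.sql("SELECT category,amount, lag(amount) OVER ( partition by category order by amount desc ) AS lag FROM df").show()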
====================================
Inner Join
====================================
spark.sql("select a.id,a.name,b.product from cust a join prod b on
a.id=b.id").show()
====================================
Left Join
====================================
spark.sql("select a.id,a.name,b.product from cust a left join prod b on
a.id=b.id").show()
====================================
Right Join
====================================
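Following the join pattern above:
spark.sql("select a.id,a.name,b.product from cust a right join prod b on a.id=b.id").show()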
====================================
Date format
====================================
spark.sql("select id,tdate,from_unixtime(unix_timestamp(tdate,'MM-dd-
yyyy'),'yyyy-MM-dd') as con_date from df").show()
====================================
Sub query
====================================
spark.sql("""
select id,tdate,from_unixtime(unix_timestamp(tdate,'MM-dd-yyyy'),'yyyy-MM-
dd') as con_date,amount,category,product,spendby from df)
group by con_date
""").show()
====================================
collect_list
====================================
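As in the full program at the end of these notes:
spark.sql("select category,collect_list(spendby) as col_spend from df group by category").show()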
====================================
collect_set
====================================
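A sketch, same as collect_list but with duplicates removed:
spark.sql("select category,collect_set(spendby) as col_spend from df group by category").show()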
====================================
Explode
====================================
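A sketch that rebuilds the collected list and flattens it back into one row per element:
spark.sql("select category,explode(col_spend) as spendby from (select category,collect_list(spendby) as col_spend from df group by category)").show()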
====================================
Complete Eclipse Code
====================================
package pack

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.sql.SparkSession

object obj {
  def main(args: Array[String]): Unit = {

    val conf = new SparkConf().setAppName("Revision").setMaster("local[*]")
    val sc = new SparkContext(conf)
    sc.setLogLevel("ERROR")

    val spark = SparkSession.builder().enableHiveSupport()
      .config("spark.sql.warehouse.dir", "file:///C:/hivewarehou/")
      .config("spark.sql.catalogImplementation", "hive")
      .getOrCreate()

    import spark.implicits._

    // Read the four CSV inputs
    val df = spark.read.option("header", "true").csv("file:///C:/data/df.csv")
    val df1 = spark.read.option("header", "true").csv("file:///C:/data/df1.csv")
    val cust = spark.read.option("header", "true").csv("file:///C:/data/cust.csv")
    val prod = spark.read.option("header", "true").csv("file:///C:/data/prod.csv")

    df.show()
    df1.show()
    cust.show()
    prod.show()

    // Register temp views for SQL access
    df.createOrReplaceTempView("df")
    df1.createOrReplaceTempView("df1")
    cust.createOrReplaceTempView("cust")
    prod.createOrReplaceTempView("prod")

    spark.sql("select * from df").show()
    spark.sql("select id,tdate from df order by id").show()
    spark.sql("select id,tdate,category from df where category='Exercise' order by id").show()
    spark.sql("select id,tdate,category,spendby from df where category='Exercise' and spendby='cash'").show()
    spark.sql("select id,tdate from df where category='Exercise' and spendby='cash'").show()
    spark.sql("select id,tdate,from_unixtime(unix_timestamp(tdate,'MM-dd-yyyy'),'yyyy-MM-dd') as con_date from df").show()

    // Sub query: convert tdate inside the subquery, then total amount per converted date
    spark.sql("""
      select con_date, sum(amount) as total from (
        select id,tdate,from_unixtime(unix_timestamp(tdate,'MM-dd-yyyy'),'yyyy-MM-dd') as con_date,amount,category,product,spendby from df
      ) group by con_date
    """).show()

    spark.sql("select category,collect_list(spendby) as col_spend from df group by category").show()
  }
}