Scnfinal

The document walks through Spark code that reads CSV data, processes it, and writes it to Snowflake across several days. The key steps are: 1. Day 1 reads a local CSV, adds an index (id) column and a batchid, and writes the result to Snowflake. 2. Day 2 reads the current max id and batchid from Snowflake, cross-joins them onto a new CSV to offset the ids and the batch number, and appends to the same table. 3. Day 5 reads another CSV, assigns batch ids with a modulo on the row index, offsets ids and batch ids against the Snowflake max values, and appends.
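Since every read and write below repeats the same Snowflake connection options, a shared options map keeps the snippets shorter. This is a minimal sketch, not part of the original listings: the option values are taken from the code below, and the name sfOptions is introduced here only for illustration. DataFrameReader and DataFrameWriter both accept a Map via .options(...).

val sfOptions = Map(
  "sfURL" -> "https://eogjppo-wl54107.snowflakecomputing.com",
  "sfAccount" -> "eogjppo",
  "sfUser" -> "zeyobronanalytics66",
  "sfPassword" -> "Zeyo@908",
  "sfDatabase" -> "zeyodb",
  "sfSchema" -> "PUBLIC",
  "sfRole" -> "ACCOUNTADMIN",
  "sfWarehouse" -> "COMPUTE_WH"
)

// Read the running max id/batchid:
//   spark.read.format("snowflake").options(sfOptions)
//     .option("query", "select max(id) as maxid, max(batchid) as maxbatch from zeyodb.public.dtab")
//     .load()
// Append a processed batch:
//   df.write.format("snowflake").options(sfOptions).option("dbtable", "dtab").mode("append").save()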


def main(args:Array[String]):Unit={

System.setProperty("hadoop.home.dir", "C:\\hadoop")

println("================Started1============")

val conf = new SparkConf().setAppName("revision").setMaster("local[*]")
val sc = new SparkContext(conf)
sc.setLogLevel("ERROR")
val spark = SparkSession.builder().getOrCreate()
import spark.implicits._

val day1 = spark.read.option("header","true")
.csv("file:///C:/data/scenaridata/data1.csv")

day1.show()

val finaldf = addColumnIndex(spark,day1).withColumn("batchid",lit(1))
.select("id","tdate","custnno","amt","state","batchid")

finaldf.show()

finaldf.write.format("snowflake")
.option("sfURL","https://eogjppo-wl54107.snowflakecomputing.com")
.option("sfAccount","eogjppo")
.option("sfUser","zeyobronanalytics66")
.option("sfPassword","Zeyo@908")
.option("sfDatabase","zeyodb")
.option("sfSchema","PUBLIC")
.option("sfRole","ACCOUNTADMIN")
.option("sfWarehouse","COMPUTE_WH")
.option("dbtable","dtab")
.save()
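// Notes (added for clarity): addColumnIndex is the helper defined in the Day 5
// listing further down; it appends a 1-based "id" column via zipWithIndex.
// This Day 1 write has no explicit .mode(), so Spark's default SaveMode
// (ErrorIfExists) applies -- the first run is expected to create dtab, and the
// later days append to it with .mode("append").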
=============
Day 2
=============
val max_id_batc = spark.read.format("snowflake")
.option("sfURL","https://eogjppo-wl54107.snowflakecomputing.com")
.option("sfAccount","eogjppo")
.option("sfUser","zeyobronanalytics66")
.option("sfPassword","Zeyo@908")
.option("sfDatabase","zeyodb")
.option("sfSchema","PUBLIC")
.option("sfRole","ACCOUNTADMIN")
.option("sfWarehouse","COMPUTE_WH")
.option("query","select max(id) as maxid,max(batchid) as maxbatch from
zeyodb.public.dtab")
.load()

val day1 = spark.read.option("header","true")
.csv("file:///C:/data/scenaridata/data2.csv")

day1.show()
max_id_batc.show()

val batchid = day1.withColumn("batchid", lit(1))

batchid.show()

val crossjoin = batchid.crossJoin(max_id_batc)
.withColumn("batchid",col("batchid")+col("MAXBATCH"))
.drop("MAXBATCH")
.sort("batchid")

crossjoin.show()

val indexcolumn = addColumnIndex(spark, crossjoin)
.withColumn("id",expr("id+MAXID"))
.drop("MAXID")
.select("id","tdate","custnno","amt","state","batchid")

indexcolumn.show()
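// How the offsets work (hypothetical values for illustration): max_id_batc is a
// single-row DataFrame, so the crossJoin simply attaches MAXID and MAXBATCH to
// every row. If Day 1 loaded 100 rows (MAXID = 100, MAXBATCH = 1), the Day 2 rows
// get batchid 1 + 1 = 2, and the fresh 1-based index from addColumnIndex becomes
// id 101, 102, ... so ids stay unique across the appended batches.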

indexcolumn.write.format("snowflake")
.option("sfURL","https://eogjppo-wl54107.snowflakecomputing.com")
.option("sfAccount","eogjppo")
.option("sfUser","zeyobronanalytics66")
.option("sfPassword","Zeyo@908")
.option("sfDatabase","zeyodb")
.option("sfSchema","PUBLIC")
.option("sfRole","ACCOUNTADMIN")
.option("sfWarehouse","COMPUTE_WH")
.option("dbtable","dtab")
.mode("append")
.save()

===========
Day 5
===========
package pack

import org.apache.spark._
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.Row
import org.apache.spark.sql.types._
import org.apache.spark.sql.functions.col
import org.apache.spark.sql.functions._
import org.apache.spark.sql.functions.udf
import scala.io.Source
import pack.urlobj
import org.apache.spark.sql._

object obj {

  def addColumnIndex(spark: SparkSession, df: DataFrame) = {
    spark.sqlContext.createDataFrame(
      df.rdd.zipWithIndex.map {
        case (row, index) => Row.fromSeq(row.toSeq :+ (index + 1))
      },
      StructType(df.schema.fields :+ StructField("id", LongType, false)))
  }
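  // Usage sketch (hypothetical sample data, not from the original): for a DataFrame
  // df with three rows, addColumnIndex(spark, df) returns the same rows with a new
  // non-nullable LongType "id" column holding 1, 2, 3. zipWithIndex is 0-based, so
  // the helper adds 1; the ids follow the DataFrame's current partition/row order.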

def main(args:Array[String]):Unit={

System.setProperty("hadoop.home.dir", "C:\\hadoop")

println("================Started1============")

val conf = new SparkConf().setAppName("revision").setMaster("local[*]")
val sc = new SparkContext(conf)
sc.setLogLevel("ERROR")
val spark = SparkSession.builder().getOrCreate()
import spark.implicits._

val df = spark.read.format("csv")
.option("header","true")
.load("file:///C:/data/scenaridata/data55.csv")

df.show(100)

val inbatch = addColumnIndex(spark, df)
.withColumnRenamed("id","batchid")

inbatch.show(100)

val total = inbatch.count

val limit = 10

val mod = total/limit

val moddf = inbatch.withColumn("batchid",col("batchid")%mod +1)
.sort("batchid")
.withColumn("batchid", expr("cast(batchid as int)"))

moddf.show(100)
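// Worked example (assuming 100 input rows, as the show(100) calls suggest):
// total = 100, limit = 10, so mod = total / limit = 10 (integer division).
// batchid % 10 + 1 then cycles through 1..10, spreading the rows into 10 batches
// of roughly equal size; if total is not an exact multiple of limit, batch sizes
// differ by at most one.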

val increid = addColumnIndex(spark, moddf)
.select("id","tdate","custnno","amt","state","batchid")

increid.show(100)

val snowdf = spark.read.format("snowflake")
.option("sfURL","https://eogjppo-wl54107.snowflakecomputing.com")
.option("sfAccount","eogjppo")
.option("sfUser","zeyobronanalytics66")
.option("sfPassword","Zeyo@908")
.option("sfDatabase","zeyodb")
.option("sfSchema","PUBLIC")
.option("sfRole","ACCOUNTADMIN")
.option("sfWarehouse","COMPUTE_WH")
.option("query","select max(id) as maxid,max(batchid) as maxbatch from zeyodb.public.dtab")
.load()

snowdf.show()

val finaldf1 = increid.crossJoin(snowdf)
.withColumn("id",expr("id+maxid"))
.withColumn("batchid",expr("batchid+maxbatch"))
.drop("MAXID","MAXBATCH")

finaldf1.show(100)
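// Note: Snowflake returns the aliased columns upper-cased (MAXID, MAXBATCH), but
// Spark's analyzer is case-insensitive by default, so expr("id+maxid") and
// drop("MAXID","MAXBATCH") both resolve the same columns.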

finaldf1.write.format("snowflake")
.option("sfURL","https://eogjppo-wl54107.snowflakecomputing.com")
.option("sfAccount","eogjppo")
.option("sfUser","zeyobronanalytics66")
.option("sfPassword","Zeyo@908")
.option("sfDatabase","zeyodb")
.option("sfSchema","PUBLIC")
.option("sfRole","ACCOUNTADMIN")
.option("sfWarehouse","COMPUTE_WH")
.option("dbtable","dtab")
.mode("append")
.save()

/*df.show()

val batchdf = df.withColumn("batchid", lit(1))

batchdf.show()

val increid = addColumnIndex(spark, batchdf)

increid.show()

val finaldf = increid.select("id","tdate","custnno","amt","state","batchid")

finaldf.show()

val snowdf = spark.read.format("snowflake")
.option("sfURL","https://eogjppo-wl54107.snowflakecomputing.com")
.option("sfAccount","eogjppo")
.option("sfUser","zeyobronanalytics66")
.option("sfPassword","Zeyo@908")
.option("sfDatabase","zeyodb")
.option("sfSchema","PUBLIC")
.option("sfRole","ACCOUNTADMIN")
.option("sfWarehouse","COMPUTE_WH")
.option("query","select max(id) as maxid,max(batchid)
as maxbatch from zeyodb.public.dtab")
.load()

snowdf.show()

val finaldf1 = finaldf.crossJoin(snowdf)
.withColumn("id",expr("id+maxid"))
.withColumn("batchid",expr("batchid+maxbatch"))
.drop("MAXID","MAXBATCH")

finaldf1.show(100)

finaldf1.write.format("snowflake")
.option("sfURL","https://eogjppo-wl54107.snowflakecomputing.com")
.option("sfAccount","eogjppo")
.option("sfUser","zeyobronanalytics66")
.option("sfPassword","Zeyo@908")
.option("sfDatabase","zeyodb")
.option("sfSchema","PUBLIC")
.option("sfRole","ACCOUNTADMIN")
.option("sfWarehouse","COMPUTE_WH")
.option("dbtable","dtab")
.mode("append")
.save()

*/
}
}
