The Rule of 10,000 Spark Jobs: Learning From Exceptions and Serializing Your Knowledge
// Imports below assume TransmogrifAI's package layout
import com.salesforce.op.UID
import com.salesforce.op.features.types._
import com.salesforce.op.stages.base.unary.{UnaryEstimator, UnaryModel}
import org.apache.spark.sql.Dataset

class MinMaxNormEstimator(uid: String = UID[MinMaxNormEstimator])
  extends UnaryEstimator[Real, Real](operationName = "minMaxNorm", uid = uid) {

  // Fit: compute the global min and max of the input column
  def fitFn(dataset: Dataset[Real#Value]): UnaryModel[Real, Real] = {
    val grouped = dataset.groupBy()
    val max = grouped.max().first().getDouble(0)
    val min = grouped.min().first().getDouble(0)
    new MinMaxNormEstimatorModel(min = min, max = max, operationName = operationName, uid = uid)
  }
}

class MinMaxNormEstimatorModel(val min: Double, val max: Double, operationName: String, uid: String)
  extends UnaryModel[Real, Real](operationName = operationName, uid = uid) {
  // Transform: scale each value into [0, 1]
  def transformFn: Real => Real = _.value.map(v => (v - min) / (max - min)).toReal
}
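For context, a minimal sketch of how such an estimator is wired into a workflow. setInput/getOutput follow TransmogrifAI's stage API; the income feature is purely illustrative:

// income: an existing FeatureLike[Real] defined elsewhere in the workflow
val normalized = new MinMaxNormEstimator().setInput(income).getOutput()
// min/max are computed later, when the workflow is fitted on data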
@RunWith(classOf[JUnitRunner])
class MinMaxNormEstimatorTest extends FlatSpec with Matchers {
  val conf = new SparkConf().setMaster("local[*]")
  implicit val spark = SparkSession.builder().config(conf).getOrCreate()
  import spark.implicits._

  "MinMaxNormEstimator" should "fit a min/max model" in {
    val data = spark.createDataset(Seq(-1.0, 0.0, 5.0, 1.0).map(Option(_)))
    val model = new MinMaxNormEstimator().fitFn(data)
    model match {
      case m: MinMaxNormEstimatorModel =>
        m.min shouldBe -1.0
        m.max shouldBe 5.0
      case m => fail(s"Unexpected model type: ${m.getClass}")
    }
  }
}
@RunWith(classOf[JUnitRunner])
class MinMaxNormEstimatorTest extends FlatSpec with Matchers with BeforeAndAfterAll {
  val conf = new SparkConf().setMaster("local[*]")
  implicit val spark = SparkSession.builder().config(conf).getOrCreate()
  import spark.implicits._

  // Stop the SparkSession once the whole suite has run
  override def afterAll(): Unit = {
    super.afterAll()
    spark.stop()
  }
  // …
}
trait TestSpark extends BeforeAndAfterAll { self: Suite =>
  val kryoClasses: Array[Class[_]] = Array()
  val conf: SparkConf = new SparkConf()
    .setMaster("local[*]")
    .registerKryoClasses(kryoClasses)
    .set("spark.serializer", classOf[org.apache.spark.serializer.KryoSerializer].getName)
    .set("spark.ui.enabled", false.toString) // disable the Spark application UI for faster tests
  implicit lazy val spark = SparkSession.builder().config(conf).getOrCreate()

  override def afterAll(): Unit = {
    super.afterAll()
    spark.stop()
  }
}

abstract class SparkFlatSpec extends FlatSpec with Matchers with TestSpark
@RunWith(classOf[JUnitRunner])
class MinMaxNormEstimatorTest extends SparkFlatSpec {
  import spark.implicits._
  val data = spark.createDataset(Seq(-1.0, 0.0, 5.0, 1.0).map(Option(_)))

  "MinMaxNormEstimator" should "fit a min/max model" in {
    new MinMaxNormEstimator().fitFn(data) match {
      case model: MinMaxNormEstimatorModel =>
        model.min shouldBe -1.0
        model.max shouldBe 5.0
      case model =>
        fail(s"Unexpected model type: ${model.getClass}")
    }
  }
}
More here:
https://github.com/apache/spark/blob/master/core/src/test/scala/org/apache/spark/SparkFunSuite.scala
More here: https://github.com/saurfang/sbt-spark-submit
For Gradle: https://github.com/salesforce/TransmogrifAI/blob/master/gradle/spark.gradle
// The core Spark ML abstractions: a Transformer is a pipeline stage that
// maps a Dataset to a DataFrame, i.e. a Dataset of untyped Rows
package org.apache.spark.ml

abstract class Transformer extends PipelineStage {
  def transform(dataset: Dataset[_]): DataFrame
}

type DataFrame = Dataset[Row]

trait Row extends Serializable {
  def get(i: Int): Any
}
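To make this contract concrete, here is a minimal, hypothetical Transformer written against these interfaces (column names are illustrative; a real implementation would also validate the input schema):

import org.apache.spark.ml.Transformer
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.functions.col
import org.apache.spark.sql.types.{DoubleType, StructField, StructType}

class ColumnDoubler(override val uid: String = Identifiable.randomUID("colDoubler"))
  extends Transformer {

  // Append a "valueDoubled" column computed from the "value" column
  override def transform(dataset: Dataset[_]): DataFrame =
    dataset.withColumn("valueDoubled", col("value") * 2)

  override def transformSchema(schema: StructType): StructType =
    StructType(schema.fields :+ StructField("valueDoubled", DoubleType, nullable = true))

  override def copy(extra: ParamMap): ColumnDoubler = defaultCopy(extra)
}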
package org.apache.spark.util

import scala.util.Try

// Lives in org.apache.spark.util to gain access to the package-private ClosureCleaner
case object ClosureUtils {
  def checkSerializable(closure: AnyRef): Try[Unit] = Try {
    ClosureCleaner.clean(closure, checkSerializable = true, cleanTransitively = true)
  }
}

package my.app

// Illustrative: NonSerializable is some third-party class we cannot modify
val x = new NonSerializable { def fun(d: Double): Double = ??? }
ClosureUtils.checkSerializable(x.fun _).get // fail fast at application start if the closure cannot be serialized
udf { d: Double => x.fun(d) } // if we got this far, it is safe to use x in a UDF
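Since checkSerializable returns a Try, callers can also match on the result explicitly, e.g. to fail with a clearer error or fall back; a small sketch:

import scala.util.{Failure, Success}

ClosureUtils.checkSerializable(x.fun _) match {
  case Success(_) => // safe to ship the closure to executors
  case Failure(e) => throw new IllegalArgumentException("Closure is not serializable", e)
}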
Read more:
https://docs.transmogrif.ai/en/stable/developer-guide/index.html#wrapping-a-non-serializable-external-library
// Works, but constructs a new NonSerializable on every call
udf { d: Double => new NonSerializable().fun(d) }

// Better: a singleton, constructed once per JVM (i.e. once per executor)
object NonSerializable {
  val instance = new NonSerializable
}
udf { d: Double => NonSerializable.instance.fun(d) }
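A third option, not on the slide but common in practice: hold the instance in a @transient lazy val inside a Serializable wrapper, so it is skipped during serialization and lazily re-created on each executor. A minimal sketch:

class NonSerializableHolder extends Serializable {
  @transient lazy val instance = new NonSerializable // re-created after deserialization
}
val holder = new NonSerializableHolder
udf { d: Double => holder.instance.fun(d) }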
val ds: Dataset[_] = ???
println("Dataset schema & plans:")
ds.printSchema()
ds.explain(extended = true)

Read more:
https://databricks.com/blog/2015/04/13/deep-dive-into-spark-sqls-catalyst-optimizer.html
https://databricks.com/blog/2016/05/23/apache-spark-as-a-compiler-joining-a-billion-rows-per-second-on-a-laptop.html
https://github.com/apache/spark/blob/master/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala#L780
// Apply the transformers one by one, as org.apache.spark.ml.Pipeline does,
// persisting every K stages so the Catalyst plan cannot grow unboundedly.
// Assumes in scope: var lastPersisted: Option[RDD[Row]] = None
transformers.zipWithIndex.foldLeft(data) { case (df, (transformer, i)) =>
  val persist = i > 0 && i % persistEveryKStages == 0
  val newDF = transformer.transform(df)
  if (!persist) newDF
  else {
    // Converting to RDD and back here to break up Catalyst [SPARK-13346]
    val persisted = newDF.rdd.persist()
    lastPersisted.foreach(_.unpersist())
    lastPersisted = Some(persisted)
    spark.createDataFrame(persisted, newDF.schema)
  }
}
https://github.com/salesforce/TransmogrifAI/blob/master/core/src/main/scala/com/salesforce/op/utils/stages/FitStagesUtil.scala#L150
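A related way to truncate a long Catalyst lineage, assuming Spark 2.1+: Dataset.checkpoint materializes the data and cuts both the logical plan and the RDD lineage, at the cost of a write to the checkpoint directory:

spark.sparkContext.setCheckpointDir("/tmp/spark-checkpoints") // path is illustrative
val truncated = df.checkpoint() // eager by default: materializes df and truncates its plan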
More here: https://spark.apache.org/docs/latest/tuning.html
https://github.com/salesforce/TransmogrifAI/blob/master/utils/src/main/scala/com/salesforce/op/utils/spark/OpSparkListener.scala
https://github.com/LucaCanali/sparkMeasure
val myListener = new MySparkListener(
  appName = spark.sparkContext.appName,
  appId = spark.sparkContext.applicationId
) {
  override def onApplicationEnd(applicationEnd: SparkListenerApplicationEnd): Unit = {
    super.onApplicationEnd(applicationEnd)
    val m = super.metrics
    log.info("Total run time: {}", m.appDurationPretty)
    MyApp.onApplicationEnd(m)
  }
}
spark.sparkContext.addSparkListener(myListener) // register the listener

// In MyApp:
def onApplicationEnd(metrics: AppMetrics): Unit = { /* consume the collected metrics */ }
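For reference, a self-contained sketch built only on vanilla Spark listener APIs (MySparkListener above is modeled on TransmogrifAI's OpSparkListener); it accumulates wall-clock time across completed stages:

import org.apache.spark.scheduler.{SparkListener, SparkListenerStageCompleted}

class StageTimeListener extends SparkListener {
  @volatile var totalStageTimeMs: Long = 0L

  override def onStageCompleted(event: SparkListenerStageCompleted): Unit = {
    val info = event.stageInfo
    for (submitted <- info.submissionTime; completed <- info.completionTime) {
      totalStageTimeMs += completed - submitted
    }
  }
}

spark.sparkContext.addSparkListener(new StageTimeListener)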
[Architecture slide] Einstein Platform, serving 6B+ predictions per day. Platform layers: Compute, Orchestration, Data Store, Model Lifecycle Management, Data Science Experience, Configuration Services, Infrastructure Metrics, Health Monitoring, ETL/GDPR/Data Processing, DL, and TransmogrifAI (Machine Learning). Applications on top: Lead Scoring, Engagement Scoring, Case Classification, Prediction Builder, ...