The Rule of 10,000 Spark Jobs: Learning From Exceptions and Serializing Your Knowledge
// Imports below assume TransmogrifAI's package layout
import com.salesforce.op.UID
import com.salesforce.op.features.types._
import com.salesforce.op.stages.base.unary.{UnaryEstimator, UnaryModel}
import org.apache.spark.sql.Dataset

class MinMaxNormEstimator(uid: String = UID[MinMaxNormEstimator])
  extends UnaryEstimator[Real, Real](operationName = "minMaxNorm", uid = uid) {

  // Fit: compute the global min and max of the input column
  def fitFn(dataset: Dataset[Real#Value]): UnaryModel[Real, Real] = {
    val grouped = dataset.groupBy()
    val max = grouped.max().first().getDouble(0)
    val min = grouped.min().first().getDouble(0)
    new MinMaxNormEstimatorModel(min = min, max = max, operationName = operationName, uid = uid)
  }
}

class MinMaxNormEstimatorModel(val min: Double, val max: Double, operationName: String, uid: String)
  extends UnaryModel[Real, Real](operationName = operationName, uid = uid) {
  // Transform: scale each value into [0, 1]
  def transformFn: Real => Real = _.value.map(v => (v - min) / (max - min)).toReal
}
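For context, a minimal sketch of how such an estimator is wired into a workflow. setInput/getOutput follow TransmogrifAI's stage API; the income feature is purely illustrative:

// income: an existing FeatureLike[Real] defined elsewhere in the workflow
val normalized = new MinMaxNormEstimator().setInput(income).getOutput()
// min/max are computed later, when the workflow is fitted on data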
@RunWith(classOf[JUnitRunner])
class MinMaxNormEstimatorTest extends FlatSpec with Matchers {
  val conf = new SparkConf().setMaster("local[*]")
  implicit val spark = SparkSession.builder().config(conf).getOrCreate()
  import spark.implicits._

  "MinMaxNormEstimator" should "fit a min/max model" in {
    val data = spark.createDataset(Seq(-1.0, 0.0, 5.0, 1.0).map(Option(_)))
    val model = new MinMaxNormEstimator().fitFn(data)
    model match {
      case m: MinMaxNormEstimatorModel =>
        m.min shouldBe -1.0
        m.max shouldBe 5.0
      case m => fail(s"Unexpected model type: ${m.getClass}")
    }
  }
}
@RunWith(classOf[JUnitRunner])
class MinMaxNormEstimatorTest extends FlatSpec with Matchers with BeforeAndAfterAll {
  val conf = new SparkConf().setMaster("local[*]")
  implicit val spark = SparkSession.builder().config(conf).getOrCreate()
  import spark.implicits._

  // Stop the SparkSession once the whole suite has run
  override def afterAll(): Unit = {
    super.afterAll()
    spark.stop()
  }
  // …
}
trait TestSpark extends BeforeAndAfterAll { self: Suite =>
  val kryoClasses: Array[Class[_]] = Array()
  val conf: SparkConf = new SparkConf()
    .setMaster("local[*]")
    .registerKryoClasses(kryoClasses)
    .set("spark.serializer", classOf[org.apache.spark.serializer.KryoSerializer].getName)
    .set("spark.ui.enabled", false.toString) // disable the Spark application UI for faster tests
  implicit lazy val spark = SparkSession.builder().config(conf).getOrCreate()

  override def afterAll(): Unit = {
    super.afterAll()
    spark.stop()
  }
}

abstract class SparkFlatSpec extends FlatSpec with Matchers with TestSpark
@RunWith(classOf[JUnitRunner])
class MinMaxNormEstimatorTest extends SparkFlatSpec {
  import spark.implicits._
  val data = spark.createDataset(Seq(-1.0, 0.0, 5.0, 1.0).map(Option(_)))

  "MinMaxNormEstimator" should "fit a min/max model" in {
    new MinMaxNormEstimator().fitFn(data) match {
      case model: MinMaxNormEstimatorModel =>
        model.min shouldBe -1.0
        model.max shouldBe 5.0
      case model =>
        fail(s"Unexpected model type: ${model.getClass}")
    }
  }
}
More here:
https://github.com/apache/spark/blob/master/core/src/test/scala/org/apache/spark/SparkFunSuite.scala
More here: https://github.com/saurfang/sbt-spark-submit
For Gradle: https://github.com/salesforce/TransmogrifAI/blob/master/gradle/spark.gradle
// The core Spark ML abstractions: a Transformer is a pipeline stage that
// maps a Dataset to a DataFrame, i.e. a Dataset of untyped Rows
package org.apache.spark.ml

abstract class Transformer extends PipelineStage {
  def transform(dataset: Dataset[_]): DataFrame
}

type DataFrame = Dataset[Row]

trait Row extends Serializable {
  def get(i: Int): Any
}
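To make this contract concrete, here is a minimal, hypothetical Transformer written against these interfaces (column names are illustrative; a real implementation would also validate the input schema):

import org.apache.spark.ml.Transformer
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.functions.col
import org.apache.spark.sql.types.{DoubleType, StructField, StructType}

class ColumnDoubler(override val uid: String = Identifiable.randomUID("colDoubler"))
  extends Transformer {

  // Append a "valueDoubled" column computed from the "value" column
  override def transform(dataset: Dataset[_]): DataFrame =
    dataset.withColumn("valueDoubled", col("value") * 2)

  override def transformSchema(schema: StructType): StructType =
    StructType(schema.fields :+ StructField("valueDoubled", DoubleType, nullable = true))

  override def copy(extra: ParamMap): ColumnDoubler = defaultCopy(extra)
}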
package org.apache.spark.util

import scala.util.Try

// Lives in org.apache.spark.util to gain access to the package-private ClosureCleaner
case object ClosureUtils {
  def checkSerializable(closure: AnyRef): Try[Unit] = Try {
    ClosureCleaner.clean(closure, checkSerializable = true, cleanTransitively = true)
  }
}

package my.app

// Illustrative: NonSerializable is some third-party class we cannot modify
val x = new NonSerializable { def fun(d: Double): Double = ??? }
ClosureUtils.checkSerializable(x.fun _).get // fail fast at application start if the closure cannot be serialized
udf { d: Double => x.fun(d) } // if we got this far, it is safe to use x in a UDF
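Since checkSerializable returns a Try, callers can also match on the result explicitly, e.g. to fail with a clearer error or fall back; a small sketch:

import scala.util.{Failure, Success}

ClosureUtils.checkSerializable(x.fun _) match {
  case Success(_) => // safe to ship the closure to executors
  case Failure(e) => throw new IllegalArgumentException("Closure is not serializable", e)
}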
Read more:
https://docs.transmogrif.ai/en/stable/developer-guide/index.html#wrapping-a-non-serializable-external-library
// Works, but constructs a new NonSerializable on every call
udf { d: Double => new NonSerializable().fun(d) }

// Better: a singleton, constructed once per JVM (i.e. once per executor)
object NonSerializable {
  val instance = new NonSerializable
}
udf { d: Double => NonSerializable.instance.fun(d) }
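A third option, not on the slide but common in practice: hold the instance in a @transient lazy val inside a Serializable wrapper, so it is skipped during serialization and lazily re-created on each executor. A minimal sketch:

class NonSerializableHolder extends Serializable {
  @transient lazy val instance = new NonSerializable // re-created after deserialization
}
val holder = new NonSerializableHolder
udf { d: Double => holder.instance.fun(d) }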
val ds: Dataset[_] = ???
println("Dataset schema & plans:")
ds.printSchema()
ds.explain(extended = true)

Read more:
https://databricks.com/blog/2015/04/13/deep-dive-into-spark-sqls-catalyst-optimizer.html
https://databricks.com/blog/2016/05/23/apache-spark-as-a-compiler-joining-a-billion-rows-per-second-on-a-laptop.html
https://github.com/apache/spark/blob/master/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala#L780
// Apply the transformers one by one, as org.apache.spark.ml.Pipeline does,
// persisting every K stages so the Catalyst plan cannot grow unboundedly.
// Assumes in scope: var lastPersisted: Option[RDD[Row]] = None
transformers.zipWithIndex.foldLeft(data) { case (df, (transformer, i)) =>
  val persist = i > 0 && i % persistEveryKStages == 0
  val newDF = transformer.transform(df)
  if (!persist) newDF
  else {
    // Converting to RDD and back here to break up Catalyst [SPARK-13346]
    val persisted = newDF.rdd.persist()
    lastPersisted.foreach(_.unpersist())
    lastPersisted = Some(persisted)
    spark.createDataFrame(persisted, newDF.schema)
  }
}
https://github.com/salesforce/TransmogrifAI/blob/master/core/src/main/scala/com/salesforce/op/utils/stages/FitStagesUtil.scala#L150
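A related way to truncate a long Catalyst lineage, assuming Spark 2.1+: Dataset.checkpoint materializes the data and cuts both the logical plan and the RDD lineage, at the cost of a write to the checkpoint directory:

spark.sparkContext.setCheckpointDir("/tmp/spark-checkpoints") // path is illustrative
val truncated = df.checkpoint() // eager by default: materializes df and truncates its plan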
More here: https://spark.apache.org/docs/latest/tuning.html
https://github.com/salesforce/TransmogrifAI/blob/master/utils/src/main/scala/com/salesforce/op/utils/spark/OpSparkListener.scala
https://github.com/LucaCanali/sparkMeasure
val myListener = new MySparkListener(
  appName = spark.sparkContext.appName,
  appId = spark.sparkContext.applicationId
) {
  override def onApplicationEnd(applicationEnd: SparkListenerApplicationEnd): Unit = {
    super.onApplicationEnd(applicationEnd)
    val m = super.metrics
    log.info("Total run time: {}", m.appDurationPretty)
    MyApp.onApplicationEnd(m)
  }
}
spark.sparkContext.addSparkListener(myListener) // register the listener

// In MyApp:
def onApplicationEnd(metrics: AppMetrics): Unit = { /* consume the collected metrics */ }
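For reference, a self-contained sketch built only on vanilla Spark listener APIs (MySparkListener above is modeled on TransmogrifAI's OpSparkListener); it accumulates wall-clock time across completed stages:

import org.apache.spark.scheduler.{SparkListener, SparkListenerStageCompleted}

class StageTimeListener extends SparkListener {
  @volatile var totalStageTimeMs: Long = 0L

  override def onStageCompleted(event: SparkListenerStageCompleted): Unit = {
    val info = event.stageInfo
    for (submitted <- info.submissionTime; completed <- info.completionTime) {
      totalStageTimeMs += completed - submitted
    }
  }
}

spark.sparkContext.addSparkListener(new StageTimeListener)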
[Architecture slide] Einstein Platform, serving 6B+ predictions per day. Platform layers: Compute, Orchestration, Data Store, Model Lifecycle Management, Data Science Experience, Configuration Services, Infrastructure Metrics, Health Monitoring, ETL/GDPR/Data Processing, DL, and TransmogrifAI (Machine Learning). Applications on top: Lead Scoring, Engagement Scoring, Case Classification, Prediction Builder, ...