PySpark Cheat Sheet
PySpark is an interface for Apache Spark in Python. It not only allows you to write Spark applications using Python APIs, but also provides the PySpark shell for interactively analyzing your data in a distributed environment.
Initializing Spark
SparkContext
>>> from pyspark import SparkContext
>>> sc = SparkContext(master='local[2]')

Inspect SparkContext
>>> sc.version               #Retrieve SparkContext version
>>> sc.pythonVer             #Retrieve Python version
>>> sc.master                #Master URL to connect to
>>> str(sc.sparkHome)        #Path where Spark is installed on worker nodes
>>> str(sc.sparkUser())      #Retrieve name of the Spark user running SparkContext
>>> sc.appName               #Return application name
>>> sc.applicationId         #Retrieve application ID
>>> sc.defaultParallelism    #Return default level of parallelism
>>> sc.defaultMinPartitions  #Default minimum number of partitions for RDDs

Configuration
>>> from pyspark import SparkConf, SparkContext
>>> conf = (SparkConf()
             .setMaster("local")
             .setAppName("MyApp")
             .set("spark.executor.memory", "1g"))
>>> sc = SparkContext(conf=conf)
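
Only one SparkContext can be active at a time, so if one is already running (for example in the PySpark shell), stop it before creating another with a new configuration; a minimal sketch:

>>> sc.stop()                       #Shut down the active SparkContext
>>> sc = SparkContext(conf=conf)    #Start a new one with the configuration above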

Using The Shell
In the PySpark shell, a special interpreter-aware SparkContext is already created in the variable called sc.

$ ./bin/spark-shell --master local[2]
$ ./bin/pyspark --master local[4] --py-files code.py

Set which master the context connects to with the --master argument, and add Python .zip, .egg or .py files to the runtime path by passing a comma-separated list to --py-files.
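
Dependencies can also be attached programmatically after the context is created, which is handy outside the shell; a minimal sketch (the file names are placeholders):

>>> sc.addPyFile("deps.py")         #Ship an extra Python module to every worker (placeholder name)
>>> sc.addFile("lookup.csv")        #Ship a data file to every worker (placeholder name)
>>> from pyspark import SparkFiles
>>> SparkFiles.get("lookup.csv")    #Absolute path of the shipped file on this node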
Loading Data
Parallelized Collections
>>> rdd = sc.parallelize([('a',7), ('a',2), ('b',2)])
>>> rdd2 = sc.parallelize([('a',2), ('d',1), ('b',1)])
>>> rdd3 = sc.parallelize(range(100))
>>> rdd4 = sc.parallelize([("a", ["x","y","z"]), ("b", ["p","r"])])

External Data
Read either one text file from HDFS, a local file system or any Hadoop-supported file system URI with textFile(), or read in a directory of text files with wholeTextFiles().

>>> textFile = sc.textFile("/my/directory/*.txt")
>>> textFile2 = sc.wholeTextFiles("/my/directory/")
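
The two calls differ in shape: textFile() produces an RDD of individual lines, while wholeTextFiles() produces (filename, file contents) pairs; a minimal sketch on the RDDs above:

>>> textFile.first()     #A single line of text from one of the files
>>> textFile2.first()    #A (path, whole-file-contents) tuple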
Retrieving RDD Information
Basic Information
>>> rdd.getNumPartitions()        #List the number of partitions
>>> rdd.count()                   #Count RDD instances
3
>>> rdd.countByKey()              #Count RDD instances by key
defaultdict(<type 'int'>, {'a': 2, 'b': 1})
>>> rdd.countByValue()            #Count RDD instances by value
defaultdict(<type 'int'>, {('b',2): 1, ('a',2): 1, ('a',7): 1})
>>> rdd.collectAsMap()            #Return (key,value) pairs as a dictionary
{'a': 2, 'b': 2}
>>> rdd3.sum()                    #Sum of RDD elements
4950
>>> sc.parallelize([]).isEmpty()  #Check whether RDD is empty
True

Summary
>>> rdd3.max()          #Maximum value of RDD elements
99
>>> rdd3.min()          #Minimum value of RDD elements
0
>>> rdd3.mean()         #Mean value of RDD elements
49.5
>>> rdd3.stdev()        #Standard deviation of RDD elements
28.866070047722118
>>> rdd3.variance()     #Compute variance of RDD elements
833.25
>>> rdd3.histogram(3)   #Compute histogram by bins
([0, 33, 66, 99], [33, 33, 34])
>>> rdd3.stats()        #Summary statistics (count, mean, stdev, max & min)
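
histogram() also accepts an explicit list of bucket boundaries instead of a bucket count; a minimal sketch on rdd3:

>>> rdd3.histogram([0, 25, 50, 75, 100])   #Counts per user-supplied bucket
([0, 25, 50, 75, 100], [25, 25, 25, 25])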

Applying Functions
>>> def g(x): print(x)
>>> rdd.foreach(g)      #Apply a function to all RDD elements
('a', 7)
('b', 2)
('a', 2)
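
map(), flatMap() and flatMapValues() are the transformation counterparts; a minimal sketch on the RDDs defined above (rdd5, referenced under Filtering below, is not defined in this sheet and is assumed here to come from a flatMap like this):

>>> rdd.map(lambda x: x + (x[1], x[0])).collect()     #Apply a function to each RDD element
[('a',7,7,'a'), ('a',2,2,'a'), ('b',2,2,'b')]
>>> rdd5 = rdd.flatMap(lambda x: x + (x[1], x[0]))    #Apply a function and flatten the result (assumed definition of rdd5)
>>> rdd5.collect()
['a',7,7,'a','a',2,2,'a','b',2,2,'b']
>>> rdd4.flatMapValues(lambda x: x).collect()         #flatMap over the values of rdd4 without changing the keys
[('a','x'), ('a','y'), ('a','z'), ('b','p'), ('b','r')]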

Selecting Data
Getting
>>> rdd.collect()   #Return a list with all RDD elements
[('a', 7), ('a', 2), ('b', 2)]
>>> rdd.take(2)     #Take first 2 RDD elements
[('a', 7), ('a', 2)]
>>> rdd.first()     #Take first RDD element
('a', 7)
>>> rdd.top(2)      #Take top 2 RDD elements
[('b', 2), ('a', 7)]
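
top() returns the largest elements in descending order; takeOrdered() is the ascending counterpart; a minimal sketch:

>>> rdd.takeOrdered(2)   #Take the 2 smallest RDD elements in ascending order
[('a', 2), ('a', 7)]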

Sampling
>>> rdd3.sample(False, 0.15, 81).collect()   #Return sampled subset of rdd3
[3,4,27,31,40,41,42,43,60,76,79,80,86,...]
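
sample() is a transformation that returns another RDD; takeSample() is an action that immediately returns a fixed-size list; a minimal sketch:

>>> rdd3.takeSample(False, 5, 81)   #Return a list of 5 sampled elements (no replacement, seed 81)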
Filtering
>>> rdd.filter(lambda x: "a" in x).collect()   #Filter the RDD
[('a',7), ('a',2)]
>>> rdd5.distinct().collect()                  #Return distinct RDD values
['a', 2, 'b', 7]
>>> rdd.keys().collect()                       #Return (key,value) RDD's keys
['a', 'a', 'b']

Iterating
>>> def g(x): print(x)
>>> rdd.foreach(g)      #Apply a function to all RDD elements
('a', 7)
('b', 2)
('a', 2)

Reshaping Data
Reducing
>>> rdd.reduceByKey(lambda x,y: x+y).collect()   #Merge the rdd values for each key
[('a',9), ('b',2)]
>>> rdd.reduce(lambda a,b: a+b)                  #Merge the rdd values
('a',7,'a',2,'b',2)

Grouping by
>>> rdd3.groupBy(lambda x: x % 2).mapValues(list).collect()   #Return RDD of grouped values
>>> rdd.groupByKey().mapValues(list).collect()                #Group rdd by key
[('a',[7,2]), ('b',[2])]
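
The output of the groupBy() call above is easier to read on a small range; a minimal sketch (the order of the groups may vary):

>>> sc.parallelize(range(6)).groupBy(lambda x: x % 2).mapValues(list).collect()
[(0, [0, 2, 4]), (1, [1, 3, 5])]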

Aggregating
>>> seqOp = (lambda x,y: (x[0]+y, x[1]+1))
>>> combOp = (lambda x,y: (x[0]+y[0], x[1]+y[1]))
>>> rdd3.aggregate((0,0), seqOp, combOp)                 #Aggregate RDD elements of each partition and then the results
(4950, 100)
>>> rdd.aggregateByKey((0,0), seqOp, combOp).collect()   #Aggregate the values of each RDD key
[('a',(9,2)), ('b',(2,1))]
>>> from operator import add
>>> rdd3.fold(0, add)                                    #Aggregate the elements of each partition, and then the results
4950
>>> rdd.foldByKey(0, add).collect()                      #Merge the values for each key
[('a',9), ('b',2)]
>>> rdd3.keyBy(lambda x: x+x).collect()                  #Create tuples of RDD elements by applying a function
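
The (0,0) zero value accumulates a (sum, count) pair, the usual single-pass way to compute a mean; a minimal sketch reusing seqOp and combOp from above:

>>> total, count = rdd3.aggregate((0,0), seqOp, combOp)
>>> total / float(count)    #Same value as rdd3.mean()
49.5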
Sort
>>> rdd2.sortBy(lambda x: x[1]).collect()   #Sort RDD by given function
[('d',1), ('b',1), ('a',2)]
>>> rdd2.sortByKey().collect()              #Sort (key, value) RDD by key
[('a',2), ('b',1), ('d',1)]

Mathematical Operations
>>> rdd.subtract(rdd2).collect()        #Return each rdd value not contained in rdd2
[('b',2), ('a',7)]
>>> rdd2.subtractByKey(rdd).collect()   #Return each (key,value) pair of rdd2 with no matching key in rdd
[('d',1)]
>>> rdd.cartesian(rdd2).collect()       #Return the Cartesian product of rdd and rdd2
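
cartesian() pairs every element of rdd with every element of rdd2, so the result grows multiplicatively; a minimal sketch:

>>> rdd.cartesian(rdd2).count()   #3 elements x 3 elements = 9 pairs
9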

Repartitioning
>>> rdd.repartition(4)   #New RDD with 4 partitions
>>> rdd.coalesce(1)      #Decrease the number of partitions in the RDD to 1
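
Both calls return a new RDD rather than changing rdd in place, so assign or chain the result; a minimal sketch (repartitioned is a hypothetical name):

>>> repartitioned = rdd.repartition(4)
>>> repartitioned.getNumPartitions()
4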
cheatsheetmaker.com © 2021