PySpark Cheat Sheet For RDD Operations
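The snippets below assume an active SparkSession named spark (and its SparkContext). A minimal setup sketch, using the same builder pattern as the Broadcasting section; the app name here is only a placeholder:

from pyspark.sql import SparkSession

# Create (or reuse) a SparkSession; the app name is a placeholder
spark = SparkSession.builder \
    .appName("RDD Cheat Sheet") \
    .getOrCreate()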
Creating RDDs

1. Parallelizing a Collection:

# Sample data
data = [1, 2, 3, 4, 5]

# Create an RDD from an in-memory Python collection
rdd = spark.sparkContext.parallelize(data)

# Show the RDD elements
print("Parallelized RDD:")
print(rdd.collect())
2. Loading Data from External Sources:

PySpark supports loading data from various external sources, including text files, CSV files, JSON files, and more. This method is suitable for larger datasets that are stored externally.

# Load text file as an RDD
text_file_path = "path/to/text/file.txt"
text_rdd = spark.sparkContext.textFile(text_file_path)

# Load CSV file as an RDD
csv_file_path = "path/to/csv/file.csv"
csv_rdd = spark.sparkContext.textFile(csv_file_path)

# Load JSON file as an RDD
json_file_path = "path/to/json/file.json"
json_rdd = spark.sparkContext.textFile(json_file_path)

# Show the first few elements of each RDD
print("Text RDD:", text_rdd.take(5))
print("CSV RDD:", csv_rdd.take(5))
print("JSON RDD:", json_rdd.take(5))
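textFile returns each line as a plain string, so CSV and JSON content still has to be parsed. A minimal sketch for the CSV case, assuming a simple comma-separated file with no quoted fields (parsing json_rdd with the json module follows the same pattern):

# Split each CSV line into a list of fields (no quoting or escaping handled)
parsed_csv_rdd = csv_rdd.map(lambda line: line.split(","))
print("Parsed CSV rows:", parsed_csv_rdd.take(5))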
RDD Transformations

1. map Transformation

# Sample RDD
rdd = spark.sparkContext.parallelize([1, 2, 3, 4, 5])

# Use map to double each element
doubled_rdd = rdd.map(lambda x: x * 2)

# Show the transformed RDD
print("Doubled RDD:")
print(doubled_rdd.collect())

2. filter Transformation

# Use filter to keep even numbers
even_rdd = rdd.filter(lambda x: x % 2 == 0)

# Show the filtered RDD
print("Even Numbers RDD:")
print(even_rdd.collect())

3. distinct Transformation

# Sample RDD with duplicates
duplicate_rdd = spark.sparkContext.parallelize([1, 2, 2, 3, 3, 3, 4, 5])

# Use distinct to get unique elements
distinct_rdd = duplicate_rdd.distinct()

# Show the distinct RDD
print("Distinct RDD:")
print(distinct_rdd.collect())
Actions

1. collect Action

# Sample RDD
rdd = spark.sparkContext.parallelize([1, 2, 3, 4, 5])

# Use collect to retrieve all elements
all_elements = rdd.collect()

# Print the collected elements
print("All Elements:")
print(all_elements)

2. take Action

# Use take to get the first 3 elements
first_three_elements = rdd.take(3)

# Print the first three elements
print("First Three Elements:")
print(first_three_elements)

3. reduce Action

# Use reduce to calculate the sum of elements
sum_result = rdd.reduce(lambda x, y: x + y)

# Print the sum result
print("Sum of Elements:", sum_result)

# Chain map with reduce to compute the sum of squares
sum_result = rdd.map(lambda x: x ** 2).reduce(lambda x, y: x + y)
print("Sum of Squares:", sum_result)

4. foreach Action

# Use foreach to print each element
# Note: foreach runs on the executors, so the output appears in executor
# logs (or directly in the console when running in local mode)
def print_element(element):
    print("Element:", element)

rdd.foreach(print_element)

5. saveAsTextFile Action

# Save the RDD elements to a text file
rdd.saveAsTextFile("path/to/output")
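Note that saveAsTextFile writes a directory of part files rather than a single file, and it fails if the path already exists. A small sketch of reading the output back, reusing the "path/to/output" placeholder above:

# textFile accepts a directory and reads every part file inside it
reloaded_rdd = spark.sparkContext.textFile("path/to/output")
print("Reloaded elements:", reloaded_rdd.collect())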
Pair RDDs

Pair RDD Transformations and Actions

1. reduceByKey Transformation

# Sample Pair RDD
pair_rdd = spark.sparkContext.parallelize([(1, 10), (2, 20), (1, 30), (2, 40)])

# Use reduceByKey to sum values for each key
sum_by_key = pair_rdd.reduceByKey(lambda x, y: x + y)

# Show the result
print("Sum by Key:")
print(sum_by_key.collect())

2. join Transformation

# Another Pair RDD
another_pair_rdd = spark.sparkContext.parallelize([(1, 'A'), (2, 'B')])

# Use join to combine Pair RDDs using keys
# join produces (key, (left_value, right_value)) pairs, e.g. (1, (10, 'A'))
joined_rdd = pair_rdd.join(another_pair_rdd)

# Show the joined result
print("Joined RDD:")
print(joined_rdd.collect())

3. sortByKey Transformation

# Use sortByKey to sort elements by key
sorted_rdd = pair_rdd.sortByKey()

# Show the sorted result
print("Sorted RDD:")
print(sorted_rdd.collect())
Caching

# Unpersist (remove from cache) if no longer needed
rdd.unpersist()
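The unpersist call above is the counterpart of cache or persist. A minimal sketch of caching the same rdd before it is released; the MEMORY_AND_DISK storage level is just one possible choice:

from pyspark import StorageLevel

# Mark the RDD for caching; the data is materialized the first time an action runs
rdd.persist(StorageLevel.MEMORY_AND_DISK)   # rdd.cache() is shorthand for MEMORY_ONLY
print("Cached count:", rdd.count())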
print("Sorted RDD:")
# Show the filtered RDD # Use foreach to print each element print(sorted_rdd.collect())
print("Even Numbers RDD:") def print_element(element):
print(even_rdd.collect()) print("Element:", element)
rdd.foreach(print_element)
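Besides the DataFrame broadcast join above, SparkContext.broadcast ships a read-only value to every executor once, which is handy in plain RDD code. A short sketch, reusing large_rdd from above; the lookup dictionary is made up for illustration:

# Hypothetical lookup table, broadcast once to all executors
lookup = spark.sparkContext.broadcast({1: 'A', 2: 'B'})

# Read the broadcast value inside a transformation via .value
labeled_rdd = large_rdd.map(lambda kv: (kv[0], kv[1], lookup.value.get(kv[0])))
print(labeled_rdd.collect())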
Error Handling

try:
    # Sample RDD (includes a zero so the division below can fail)
    rdd = spark.sparkContext.parallelize([1, 2, 0, 4, 5])

    # Transformation with potential error
    transformed_rdd = rdd.map(lambda x: 10 / x)

    # Action triggering computation
    result = transformed_rdd.collect()
except Exception as e:
    # The ZeroDivisionError is raised on the executors and reaches the driver
    # wrapped in a Spark exception, so catch broadly here
    print("Error:", e)