Apache Spark Notes
Load a CSV file, treating the first row as a header and inferring column types
# Read a CSV into a DataFrame: first row is the header, column types are inferred.
data = spark.read.options(header=True, inferSchema=True).csv("/path/to/file.csv")
Display the DataFrame as a table (Databricks notebooks)
# Render the DataFrame as an interactive table.
# NOTE(review): `display` is a Databricks-notebook helper, not part of PySpark itself —
# in plain PySpark use data.show() instead.
display(data)
Filter column with condition
data.filter(data["col"] == "value").show()
data.filter(data["col"] != "value1").show()
data.filter( (data.col1 > 1) & (data.col1 < 99) ).show()
data.filter( (data.col1 == "dog") | (data.col1 == "cat") ).show()
Create a new column whose values are an existing column multiplied by 2
data.withColumn("new_col",data["col"]*2).show()
Drop a column
data.drop("col").show()
Count rows after removing fully duplicate rows
# Count fully distinct rows; dropDuplicates() with no subset is equivalent to distinct().
data.dropDuplicates().count()
Count rows after removing duplicates based on a specific column
# Count rows after removing duplicates considering only the "col" column.
# Fix: the original line had an unbalanced trailing ")" — a SyntaxError.
data.dropDuplicates(["col"]).count()
Sort data by column
data.orderBy("col").show()
Get avg/max/min for col2 grouped by col
# Aggregate col2 per group of col.
# Fix: `from pyspark.sql.functions import min, max, avg` shadows the Python
# builtins min/max for the rest of the file; import the module under the
# conventional `F` alias instead.
from pyspark.sql import functions as F

data.groupBy("col").agg(
    F.avg(data.col2),
    F.max(data.col2),
    F.min(data.col2),
).show()