Spark in Action (5): DataFrame Basics - Handling Missing Values
By 阿新 · Published 2018-12-18
Drop Missing Values
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('aggs').getOrCreate()
# read the CSV, letting Spark infer column types and treat the first row as a header
df = spark.read.csv('sales_info.csv', inferSchema=True, header=True)
df.printSchema()
df.show()
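If sales_info.csv is not at hand, a minimal sketch like the following builds a comparable DataFrame to experiment with. The column names (Id, Name, Sales) and the values are assumptions for illustration only, not the contents of the original file:
# hypothetical toy data with nulls, standing in for sales_info.csv
toy_df = spark.createDataFrame(
    [(1, 'John', None),
     (2, None, 345.0),
     (3, None, None),
     (4, 'Cindy', 120.0)],
    ['Id', 'Name', 'Sales']
)
toy_df.show()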
# drop missing data
# drop if any by row
df.na.drop().show()
# keep only rows that have at least 2 non-null values
df.na.drop(thresh=2).show()
# use the how parameter: 'all' drops rows where every column is null,
# 'any' drops rows containing at least one null
df.na.drop(how='all').show()
df.na.drop(how='any').show()
# only consider nulls in the given subset of columns
df.na.drop(subset=['Sales']).show()
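Note that na.drop, like every DataFrame transformation, returns a new DataFrame; df itself is left unchanged, so assign the result if you want to keep working with the cleaned data. A minimal sketch:
# na.drop returns a new DataFrame; compare row counts before and after
clean_df = df.na.drop(subset=['Sales'])
print(df.count(), clean_df.count())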
Fill Missing Data
df.na.fill('FILL VALUE').show()  # a string value only fills string-type columns
df.na.fill(0).show()  # a numeric value only fills numeric columns
df.na.fill('No Name', subset=['Name']).show()  # restrict the fill to specific columns
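na.fill also accepts a dict mapping column names to fill values, so string and numeric columns can be handled in one call. A short sketch (filling Sales with 0 here is an assumed placeholder, not a recommendation):
# fill several columns at once, each with a type-appropriate value
df.na.fill({'Name': 'No Name', 'Sales': 0}).show()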
# fill nulls in Sales with the column mean
from pyspark.sql.functions import mean
mean_val = df.select(mean(df['Sales'])).collect()
mean_sales = mean_val[0][0]  # pull the plain number out of the Row object
df.na.fill(mean_sales, ['Sales']).show()
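The same mean imputation can be written a little more compactly by pulling the aggregate out with .first() instead of .collect(); a minimal sketch of that variant:
from pyspark.sql.functions import mean
# grab the mean as a plain Python number in one expression
mean_sales = df.select(mean(df['Sales'])).first()[0]
df.na.fill(mean_sales, subset=['Sales']).show()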