
Spark Python API Functions: pyspark API (3)

Table of Contents

•    1 histogram
•    2 mean
•    3 variance
•    4 stdev
•    5 sampleStdev
•    6 sampleVariance
•    7 countByValue
•    8 top
•    9 takeOrdered
•    10 take
•    11 first
•    12 collectAsMap
•    13 keys
•    14 values
•    15 reduceByKey
•    16 reduceByKeyLocally

histogram

# histogram (example #1)
x = sc.parallelize([1,3,1,2,3])
y = x.histogram(buckets=2)
print(x.collect())
print(y)

[1, 3, 1, 2, 3]
([1, 2, 3], [2, 3])

# histogram (example #2)
x = sc.parallelize([1,3,1,2,3])
y = x.histogram([0,0.5,1,1.5,2,2.5,3,3.5])
print(x.collect())
print(y)

[1, 3, 1, 2, 3]
([0, 0.5, 1, 1.5, 2, 2.5, 3, 3.5], [0, 0, 2, 0, 1, 0, 2])
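
With an integer argument, histogram() splits the range between the RDD's minimum and maximum into that many evenly spaced buckets and returns a pair (bucket boundaries, counts); with a sorted list, the list itself defines the boundaries. Every bucket is half-open [a, b) except the last one, which also includes its right edge. A minimal sketch (assuming the same sc as above) showing that closed last bucket:

# the maximum value 3 is counted in the last bucket [2, 3] rather than dropped
x = sc.parallelize([1,3,1,2,3])
print(x.histogram([1, 2, 3]))  # expected: ([1, 2, 3], [2, 3])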

mean

# mean
x = sc.parallelize([1,3,2])
y = x.mean()
print(x.collect())
print(y)

[1, 3, 2]
2.0

variance

# variance
x = sc.parallelize([1,3,2])
y = x.variance()  # divides by N
print(x.collect())
print(y)

[1, 3, 2]
0.666666666667
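
variance() is the population variance: the mean squared deviation from the mean, with the sum divided by N. A quick check in plain Python (no Spark needed):

# population variance by hand: divide by N
data = [1, 3, 2]
m = sum(data) / float(len(data))                           # 2.0
var = sum((v - m) ** 2 for v in data) / float(len(data))   # 0.6666...
print(var)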

stdev

# stdev
x = sc.parallelize([1,3,2])
y = x.stdev()  # divides by N
print(x.collect())
print(y)

[1, 3, 2]
0.816496580928

sampleStdev

# sampleStdev
x = sc.parallelize([1,3,2])
y = x.sampleStdev()  # divides by N-1
print(x.collect())
print(y)

[1, 3, 2]
1.0

sampleVariance

# sampleVariance
x = sc.parallelize([1,3,2])
y = x.sampleVariance()  # divides by N-1
print(x.collect())
print(y)

[1, 3, 2]
1.0
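
The four statistics above differ only in the divisor: variance() and stdev() divide by N (population statistics), while sampleVariance() and sampleStdev() divide by N-1 (sample statistics with Bessel's correction). When several of them are needed, stats() computes them all in a single pass over the RDD and returns a StatCounter. A minimal sketch, assuming the same sc as above:

# stats() gathers count, mean, stdev, variance, etc. in one pass
x = sc.parallelize([1,3,2])
st = x.stats()
print(st.mean())         # 2.0
print(st.stdev())        # 0.816...  (divides by N)
print(st.sampleStdev())  # 1.0       (divides by N-1)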

countByValue

# countByValue
x = sc.parallelize([1,3,1,2,3])
y = x.countByValue()
print(x.collect())
print(y)

[1, 3, 1, 2, 3]
defaultdict(<type 'int'>, {1: 2, 2: 1, 3: 2})
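
countByValue() is an action: the counts come back to the driver as a defaultdict (the <type 'int'> in the output above is Python 2's repr), so it is only appropriate when the number of distinct values is small. A plain dict is easy to get; a small sketch assuming the same sc:

# convert the driver-side defaultdict into a plain dict
x = sc.parallelize([1,3,1,2,3])
print(dict(x.countByValue()))  # expected: {1: 2, 2: 1, 3: 2}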

top

# top
x = sc.parallelize([1,3,1,2,3])
y = x.top(num=3)
print(x.collect())
print(y)

[1, 3, 1, 2, 3]
[3, 3, 2]
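
top() returns the num largest elements, sorted in descending order, and also accepts an optional key function. A sketch assuming the same sc as above:

# with a key function: ordering by -v picks the smallest values instead
x = sc.parallelize([1,3,1,2,3])
print(x.top(2))                    # expected: [3, 3]
print(x.top(2, key=lambda v: -v))  # expected: [1, 1]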

takeOrdered

# takeOrdered
x = sc.parallelize([1,3,1,2,3])
y = x.takeOrdered(num=3)
print(x.collect())
print(y)

[1, 3, 1, 2, 3]
[1, 1, 2]
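
takeOrdered() is the ascending counterpart of top(): it returns the num smallest elements and likewise takes an optional key. A sketch assuming the same sc:

# ordering by -v makes takeOrdered return the largest values, like top()
x = sc.parallelize([1,3,1,2,3])
print(x.takeOrdered(3, key=lambda v: -v))  # expected: [3, 3, 2]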

take

# take
x = sc.parallelize([1,3,1,2,3])
y = x.take(num=3)
print(x.collect())
print(y)

[1, 3, 1, 2, 3]
[1, 3, 1]
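
Unlike top() and takeOrdered(), take() does not sort: it returns the first num elements it finds, scanning as few partitions as possible, so the result reflects partition order rather than value order. Asking for more elements than the RDD holds is fine; a small sketch assuming the same sc:

# num larger than the RDD size simply returns every element
x = sc.parallelize([1,3,1,2,3])
print(x.take(10))  # expected: [1, 3, 1, 2, 3]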

first

# first
x = sc.parallelize([1,3,1,2,3])
y = x.first()
print(x.collect())
print(y)

[1, 3, 1, 2, 3]
1

collectAsMap

# collectAsMap
x = sc.parallelize([('C',3),('A',1),('B',2)])
y = x.collectAsMap()
print(x.collect())
print(y)

[('C', 3), ('A', 1), ('B', 2)]
{'A': 1, 'C': 3, 'B': 2}
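
collectAsMap() builds a Python dict on the driver, so duplicate keys collapse: only one value per key survives (the occurrence collected last wins). A sketch assuming the same sc as above:

# with duplicate keys the later occurrence overwrites the earlier one
x = sc.parallelize([('A',1),('A',2),('B',3)])
print(x.collectAsMap())  # expected: {'A': 2, 'B': 3} (key order may vary)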

keys

# keys
x = sc.parallelize([('C',3),('A',1),('B',2)])
y = x.keys()
print(x.collect())
print(y.collect())

[('C', 3), ('A', 1), ('B', 2)]
['C', 'A', 'B']

values

# values
x = sc.parallelize([('C',3),('A',1),('B',2)])
y = x.values()
print(x.collect())
print(y.collect())

[('C', 3), ('A', 1), ('B', 2)]
[3, 1, 2]
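
keys() and values() are both lazy transformations, shorthands for mapping over each (k, v) pair. A sketch assuming the same sc:

# equivalent map() formulations of keys() and values()
x = sc.parallelize([('C',3),('A',1),('B',2)])
print(x.map(lambda kv: kv[0]).collect())  # same as x.keys().collect()   -> ['C', 'A', 'B']
print(x.map(lambda kv: kv[1]).collect())  # same as x.values().collect() -> [3, 1, 2]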

reduceByKey

# reduceByKey
x = sc.parallelize([('B',1),('B',2),('A',3),('A',4),('A',5)])
y = x.reduceByKey(lambda agg, obj: agg + obj)
print(x.collect())
print(y.collect())

[('B', 1), ('B', 2), ('A', 3), ('A', 4), ('A', 5)]
[('A', 12), ('B', 3)]
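
reduceByKey() is a transformation: it merges the values of each key with an associative, commutative function, combining locally within each partition before the shuffle (like a MapReduce combiner), which is why it is usually preferred over groupByKey() followed by a reduce. Any such function works; a sketch assuming the same sc as above:

# a different reduction: the per-key maximum
x = sc.parallelize([('B',1),('B',2),('A',3),('A',4),('A',5)])
print(x.reduceByKey(max).collect())  # expected: [('A', 5), ('B', 2)] (order may vary)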

reduceByKeyLocally

# reduceByKeyLocally
x = sc.parallelize([('B',1),('B',2),('A',3),('A',4),('A',5)])
y = x.reduceByKeyLocally(lambda agg, obj: agg + obj)
print(x.collect())
print(y)

[('B', 1), ('B', 2), ('A', 3), ('A', 4), ('A', 5)]
{'A': 12, 'B': 3}
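
reduceByKeyLocally() performs the same per-key reduction but is an action: the merged results are returned to the driver as a plain Python dict, so it should only be used when the set of distinct keys fits comfortably in driver memory. A quick equivalence check, assuming the same sc as above:

# reduceByKeyLocally returns a driver-side dict equal to collecting reduceByKey
x = sc.parallelize([('B',1),('B',2),('A',3),('A',4),('A',5)])
local = x.reduceByKeyLocally(lambda a, b: a + b)
distributed = dict(x.reduceByKey(lambda a, b: a + b).collect())
print(local == distributed)  # expected: True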