大資料系列之實時計算Spark(十八)Python生成圖表
阿新 • • 發佈:2019-02-06
# Step 1: start the PySpark shell (it provides the `sc` and `spark` objects
# used below):
#
#     pyspark --master local[2]
#
# Step 2: enter the following code in that shell.

# Imports: Spark SQL rows plus the plotting / numeric libraries.
from pyspark.sql import Row
import matplotlib.pyplot as plt
import numpy as np
import pylab as P

# Reset Matplotlib's rc parameters to their built-in defaults.
plt.rcdefaults()

# Load the user records; each line is split on "::" into
# id, gender, age, occupation and zipcode (age is parsed as an int).
# NOTE(review): the path suggests the MovieLens 1M users file - confirm.
dataDir = "file:///home/zpx/ml-data/ml-1m/users.dat"
lines = sc.textFile(dataDir)
splitLines = lines.map(lambda l: l.split("::"))
usersRDD = splitLines.map(
    lambda p: Row(id=p[0], gender=p[1], age=int(p[2]),
                  occupation=p[3], zipcode=p[4])
)
usersDF = spark.createDataFrame(usersRDD)
usersDF.createOrReplaceTempView("users")
usersDF.show()

# --- Histogram ---------------------------------------------------------
ageDF = spark.sql("SELECT age FROM users")
ageList = ageDF.rdd.map(lambda p: p.age).collect()
ageDF.describe().show()
# Every chart starts on its own figure so that it is not drawn on top of
# the previous one when the earlier window is still open.
plt.figure()
plt.hist(ageList)
plt.title("Age distribution of the users\n")
plt.xlabel("Age")
plt.ylabel("Number of users")
plt.show(block=False)

# --- Density plot ------------------------------------------------------
from scipy.stats import gaussian_kde

# bw_method=0.5 sets the KDE bandwidth factor through the public API
# instead of overriding covariance_factor and calling a private method.
density = gaussian_kde(ageList, bw_method=0.5)
xAxisValues = np.linspace(0, 100, 1000)
plt.figure()
plt.title("Age density plot of the users\n")
plt.xlabel("Age")
plt.ylabel("Density")
plt.plot(xAxisValues, density(xAxisValues))
plt.show(block=False)

# --- Side-by-side subplots: histogram and box plot ---------------------
plt.figure()
plt.subplot(121)
plt.hist(ageList)
plt.title("Age distribution of the users\n")
plt.xlabel("Age")
plt.ylabel("Number of users")
plt.subplot(122)
plt.title("Summary of distribution\n")
plt.xlabel("Age")
plt.boxplot(ageList, vert=False)
plt.show(block=False)

# --- Horizontal bar chart: 10 most common occupations -------------------
occ10 = spark.sql("SELECT occupation, count(occupation) as usercount FROM users GROUP BY occupation ORDER BY usercount DESC LIMIT 10")
occ10.show()
occTuple = occ10.rdd.map(lambda p: (p.occupation, p.usercount)).collect()
# Transpose the (occupation, count) pairs into two parallel tuples.
occList, countList = zip(*occTuple)
occList  # bare expression: echoes the occupations when typed in the shell
y_pos = np.arange(len(occList))
plt.figure()
plt.barh(y_pos, countList, align='center', alpha=0.4)
plt.yticks(y_pos, occList)
plt.xlabel('Number of users')
plt.title('Top 10 user types\n')
plt.gcf().subplots_adjust(left=0.15)
plt.show(block=False)

# --- Stacked bar chart: gender split per occupation ---------------------
occGender = spark.sql("SELECT occupation, gender FROM users")
occGender.show()
occCrossTab = occGender.stat.crosstab("occupation", "gender")
occupationsCrossTuple = occCrossTab.rdd.map(
    lambda p: (p.occupation_gender, p.M, p.F)
).collect()
occList, mList, fList = zip(*occupationsCrossTuple)
N = len(occList)
ind = np.arange(N)
width = 0.75
plt.figure()
# Bars are pinned to align='center' so that ticks placed at `ind` sit
# directly under each bar whatever the Matplotlib version's default is.
p1 = plt.bar(ind, mList, width, color='r', align='center')
p2 = plt.bar(ind, fList, width, color='y', bottom=mList, align='center')
plt.ylabel('Count')
plt.title('Gender distribution by occupation\n')
plt.xticks(ind, occList, rotation=90)
plt.legend((p1[0], p2[0]), ('Male', 'Female'))
plt.gcf().subplots_adjust(bottom=0.25)
plt.show(block=False)

# --- Pie chart: 10 least common occupations ----------------------------
occupationsBottom10 = spark.sql("SELECT occupation,count(occupation) as usercount FROM users GROUP BY occupation ORDER BY usercount LIMIT 10")
occupationsBottom10Tuple = occupationsBottom10.rdd.map(
    lambda p: (p.occupation, p.usercount)
).collect()
occupationsBottom10List, countBottom10List = zip(*occupationsBottom10Tuple)
# plt.pie needs one explode offset per wedge; trim the offsets in case the
# query returns fewer than 10 occupations.
explode = (0, 0.3, 0.2, 0.15, 0.1, 0, 0, 0, 0, 0.1)[:len(countBottom10List)]
plt.figure()
plt.pie(countBottom10List, explode=explode, labels=occupationsBottom10List,
        autopct='%1.1f%%', shadow=True, startangle=90)
plt.title('Bottom 10 user types\n')
plt.show(block=False)
所用資料集下載:
連結:https://pan.baidu.com/s/1vUTt2GvPtlsNfqcWvo3lIA 密碼:izpa