python爬取雙色球資料+資料統計
阿新 • • 發佈:2019-02-10
# Lottery data scraping: fetch Double Color Ball (SSQ) draw results and
# write them to MySQL.
import requests
import re  # regular expressions
import xlwt
import time
import pymysql as MySQLdb

flag = True
# Retained for backward compatibility only; send_data() no longer fills it
# (it used to grow by one full table dump per scraped row).
allres = []

# Listing pages are numbered from 1.
_LIST_URL = "http://kaijiang.zhcw.com/zhcw/html/ssq/list_%d.html"
# Seconds to wait for the site before giving up on a request.
_REQUEST_TIMEOUT = 30

# One match per table row: draw date, issue number, six red balls, one blue
# ball. Compiled once instead of once per page.
_DRAW_RE = re.compile(
    r"<tr>.*?<td align=\"center\">(.*?)</td>"
    r".*?<td align=\"center\">(.*?)</td>"
    r".*?<td align=\"center\" style=\"padding-left:10px;\">"
    + r".*?<em class=\"rr\">(.*?)</em>" * 6
    + r".*?<em>(.*?)</em></td>",
    re.S | re.M,
)


def _fetch_html(page_num):
    """Download listing page `page_num` and return its text decoded as UTF-8."""
    response = requests.get(url=_LIST_URL % page_num, timeout=_REQUEST_TIMEOUT)
    response.encoding = 'utf-8'
    return response.text


def get_all_page():
    """Read the total number of listing pages from page 1.

    Stores the value in the module-level `all_page` and returns it.
    Raises IndexError when the pager markup is not found in the page.
    """
    global all_page
    html = _fetch_html(1)
    all_page = int(re.findall(r"class=\"pg\".*?<strong>(.*?)</strong>", html)[0])
    return all_page


def get_num():
    """Scrape every listing page and hand each draw to send_data().

    Uses the page count cached by get_all_page(); when that has not been
    called yet, it is called here instead of failing with NameError.
    """
    try:
        total_pages = all_page
    except NameError:
        total_pages = get_all_page()

    for page_num in range(1, total_pages + 1):
        html = _fetch_html(page_num)
        time.sleep(2)  # throttle requests to the site
        for row in _DRAW_RE.findall(html):
            kjrq, qs = row[0], row[1]
            red_ball = "|".join(str(ball) for ball in row[2:8])
            blue_ball = row[8]
            send_data(kjrq, qs, red_ball, blue_ball)


def send_data(kjrq, qs, red_ball, blue_ball):
    """Insert one draw into the `ssq` table unless its draw date already exists.

    Args:
        kjrq: draw date as scraped from the page.
        qs: issue number.
        red_ball: the six red balls joined with "|".
        blue_ball: the blue ball.

    Database errors are printed, not raised, so one bad row does not stop
    the scrape.
    """
    conn = MySQLdb.connect(
        host='localhost',
        port=3306,
        user='root',
        passwd='root',
        db='lottery_ticket',
        charset='utf8'
    )
    cur = conn.cursor()
    print(u'扒取到的最新期號為:%s' % kjrq)
    try:
        # Ask the database whether this draw exists. The previous check,
        # `kjrq in cur.fetchall()`, compared a string against row tuples and
        # therefore never matched, so every run re-inserted every draw.
        cur.execute("SELECT 1 FROM ssq WHERE kjrq = %s LIMIT 1", (kjrq,))
        if cur.fetchone() is not None:
            print(u'*****<<資料已經存在,不需要更新!>>*****')
        else:
            cur.execute(
                "INSERT INTO ssq(kjrq,qs,red_ball,blue_ball) VALUES (%s,%s,%s,%s)",
                (kjrq, qs, red_ball, blue_ball),
            )
            conn.commit()
            print(u'*****<<更新期號成功,更新內容是:%s>>*****' % str(kjrq))
    except MySQLdb.MySQLError as e:
        conn.rollback()  # do not leave a half-applied transaction behind
        print(e)
    finally:
        cur.close()
        conn.close()


if __name__ == '__main__':
    get_all_page()
    get_num()
彩票資料統計分析:
import numpy as np
import pandas as pd
import pymysql
from sklearn import linear_model
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
import matplotlib.pyplot as plt
from pyspark.sql import SparkSession
from pyspark.mllib.fpm import FPGrowth
from pylab import *  # provides `mpl`, used below to enable Chinese glyphs
import operator

mpl.rcParams['font.sans-serif'] = ['SimHei']

# Module-level containers filled by getData() / analysis().
alldata = []
red_balls = []
blue_balls = []
qs = []
kjrq = []


def _connect():
    """Open a connection to the local `lottery_ticket` database."""
    return pymysql.connect(
        host='localhost',
        port=3306,
        user='root',
        passwd='root',
        db='lottery_ticket',
        charset='utf8'
    )


def getData(nums=0):
    """Append draws from the `ssq` table to the global `alldata`, newest first.

    Args:
        nums: 0 (default) loads every draw; a value > 0 loads only the most
            recent `nums` draws.

    Each row is (kjrq, qs, red_ball, blue_ball). The columns are named
    explicitly so the positions analysis() indexes do not depend on the
    table's physical column order. Database errors are printed, not raised.
    """
    sql = "SELECT kjrq, qs, red_ball, blue_ball FROM ssq ORDER BY kjrq DESC"
    db = _connect()
    cur = db.cursor()
    try:
        if nums > 0:
            # Bind the limit as a parameter instead of concatenating it.
            cur.execute(sql + " LIMIT %s", (int(nums),))
        else:
            cur.execute(sql)
        alldata.extend(cur.fetchall())
    except pymysql.MySQLError as e:
        print(e)
    finally:
        cur.close()
        db.close()


def write2mysql(data=None):
    """Store frequent item sets in the `fpgroupth` table, skipping known ones.

    Args:
        data: mapping of item-set string -> occurrence count. None or an
            empty mapping is a no-op.

    Database errors are printed, not raised.
    """
    if not data:
        return
    db = _connect()
    cur = db.cursor()
    try:
        # The lookup now runs inside the try block so a failing SELECT no
        # longer leaks the cursor and connection.
        cur.execute("SELECT numbers FROM fpgroupth")
        frequent = {row[0] for row in cur.fetchall()}
        sql_insert = "INSERT INTO fpgroupth(numbers,`count`) VALUES (%s,%s)"
        for numbers, times in data.items():
            if numbers in frequent:
                print(u'*****<<資料已經存在,不需要更新!>>*****')
            else:
                cur.execute(sql_insert, (numbers, times))
        db.commit()
    except pymysql.MySQLError as e:
        db.rollback()
        print(e)
    finally:
        cur.close()
        db.close()


def bayes():
    """Fit a Bayesian ridge regression (red balls -> blue ball) and print two predictions."""
    reg = linear_model.BayesianRidge()
    # A regressor needs numeric targets; the blue balls are stored as read
    # from the database, so convert them here.
    reg.fit(red_balls, [float(b) for b in blue_balls])
    prediction = reg.predict([[4.0, 2.0, 5.0, 12.0, 20.0, 22.0],
                              [1.0, 7.0, 8.0, 15.0, 23.0, 31.0]])
    # str() is required: concatenating a str with an ndarray raises.
    print("貝葉斯分類器" + str(prediction))


def sGDClassifier():
    """Fit a linear SVM trained with SGD (hinge loss, L2 penalty) and print one prediction."""
    clf1 = SGDClassifier(loss="hinge", penalty="l2")
    clf1.fit(red_balls, blue_balls)
    print("sgd分類器:" + str(clf1.predict([[1.0, 2.0, 4.0, 12.0, 20.0, 22.0]])))


def svmsClassfier():
    """Fit a support vector classifier and print one prediction."""
    clf2 = svm.SVC()
    clf2.fit(red_balls, blue_balls)
    print("svm分類器:" + str(clf2.predict([[1.0, 2.0, 4.0, 12.0, 20.0, 22.0]])))


def randForest():
    """Fit a 10-tree random forest and print one prediction."""
    clf3 = RandomForestClassifier(n_estimators=10)
    clf3 = clf3.fit(red_balls, blue_balls)
    print("random分類器:" + str(clf3.predict([[1.0, 2.0, 4.0, 12.0, 25.0, 33.0]])))


def analysis():
    """Load the 10 most recent draws, fill the global lists and plot red-ball counts."""
    getData(nums=10)
    for res in alldata:
        red_balls.append([float(n) for n in str(res[2]).split("|")])
        blue_balls.append(res[3])
        qs.append(res[1])
        kjrq.append(res[0])
    # Other analyses that can be run here instead:
    #   sGDClassifier(), svmsClassfier(), randForest(), bayes(),
    #   trendAnaly(), redTrendAnaly(), fpgroupth(), singeShow(red=False)
    singeShow(red=True)


def redStatisticCount(red=True):
    """Count how often each ball number occurs in the loaded draws.

    Args:
        red: True counts the red balls, False counts the blue balls.

    Returns:
        dict mapping ball number (float) -> occurrence count (int), in
        ascending order of ball number.
    """
    if red:
        arr = np.array(red_balls, dtype=float)
    else:
        # The old np.reshape(values, len, 1) passed 1 as `order` and raised;
        # a flat array is all that counting needs.
        arr = np.array([float(b) for b in blue_balls], dtype=float)
    values, counts = np.unique(arr, return_counts=True)
    return {float(v): int(c) for v, c in zip(values, counts)}


def singeShow(red=True):
    """Plot the occurrence count of every ball number.

    Args:
        red: True plots the red balls, False plots the blue balls; the
            labels follow this choice.
    """
    res = redStatisticCount(red=red)
    # Materialise the dict views; matplotlib wants real sequences.
    X = list(res.keys())
    val1 = list(res.values())
    name = u"紅球" if red else u"藍球"
    plt.plot(X, val1, marker='o', mec='r', mfc='w', label=name + u'曲線圖')
    # Value label above each point.
    for a, b in zip(X, val1):
        plt.text(a, b, b, ha='center', va='bottom', fontsize=20)
    plt.legend()
    plt.xticks(X, X, rotation=45)
    plt.margins(0)
    plt.subplots_adjust(bottom=0.15)
    plt.xlabel(name + u"數字")
    plt.ylabel("出現的次數")
    plt.title(name + "的歷史次數統計")
    plt.show()


def trendAnaly():
    """Plot the blue ball of each loaded draw, oldest to newest."""
    # Build reversed copies; reversing the globals in place would flip the
    # order again on every later call.
    X = list(reversed(qs))
    blue = [int(b) for b in reversed(blue_balls)]
    plt.plot(X, blue, marker='o', mec='r', mfc='w', label=u'藍球走勢圖')
    # Value label above each point.
    for a, b in zip(X, blue):
        plt.text(a, b, b, ha='center', va='bottom', fontsize=20)
    plt.legend()
    plt.xticks(X, X, rotation=45)
    # Y axis ticks.
    plt.yticks(np.arange(-1, 17, 2))
    plt.margins(0)
    plt.subplots_adjust(bottom=0.15)
    plt.xlabel(u"期數")
    plt.ylabel("藍球號碼")
    plt.title("藍球的趨勢統計")
    plt.show()


def fpgroupth():
    """Mine frequent red-ball combinations with FP-Growth and store them in MySQL."""
    spark = SparkSession.builder \
        .appName("fpgroupth") \
        .master("local[*]") \
        .getOrCreate()
    try:
        rdd = spark.sparkContext.parallelize(red_balls, 10)
        model = FPGrowth.train(rdd, minSupport=0.005, numPartitions=10)
        result = model.freqItemsets().collect()
    finally:
        # Release the local Spark resources even when mining fails.
        spark.stop()

    dictdata = {}
    for r in result:
        # Keep only item sets with at least two numbers.
        if len(r[0]) > 1:
            dictdata[str(r[0])] = int(r[1])
            print(str(r[0]) + "==" + str(r[1]))
    # Sort by occurrence count, highest first.
    sort_dict = sorted(dictdata.items(), key=operator.itemgetter(1), reverse=True)
    print(sort_dict)
    write2mysql(data=dictdata)


def redTrendAnaly():
    """Plot each of the six red-ball positions across the loaded draws, oldest to newest."""
    # Reversed copies, so the globals keep their order between calls.
    X = list(reversed(qs))
    y = np.array(list(reversed(red_balls)))
    # Marker face colour for ball positions 1..6.
    face_colors = ('w', 'b', 'g', 'y', 'r', 'm')
    for idx, face in enumerate(face_colors):
        column = y[:, idx]
        plt.plot(X, column, marker='o', mec='r', mfc=face,
                 label=u'紅球%d走勢圖' % (idx + 1))
        # Value label above each point.
        for a, b in zip(X, column):
            plt.text(a, b, b, ha='center', va='bottom', fontsize=20)
    plt.legend()
    plt.xticks(X, X, rotation=45)
    # Y axis ticks.
    plt.yticks(np.arange(-1, 35, 2))
    plt.margins(0)
    plt.subplots_adjust(bottom=0.15)
    plt.xlabel(u"期數")
    # These two labels used to be copied from the blue-ball chart.
    plt.ylabel("紅球號碼")
    plt.title("紅球的趨勢統計")
    plt.show()


if __name__ == '__main__':
    analysis()
效果圖:
紅球各個數字的歷史出現次數
最近10期紅球走勢:
最近10期藍球走勢:
fpgroupth(FP-Growth)--挖掘頻繁模式效果
參考部落格:
http://blog.51cto.com/tdcqvip/2105499