彩票資料爬蟲收集,儲存在 Excel(.xls)檔案中
阿新 • • 發佈:2019-01-06
彩票資料爬蟲收集,透過 xlwt 儲存在 Excel(.xls)檔案中(每個年份一個工作表),程式碼如下:
# -*- coding: utf-8 -*-
# author:Apples
"""Scrape the SSQ lottery draw history from datachart.500.com into ``test.xls``.

Run as a script: one worksheet is created per year, one spreadsheet row per
draw, and one column per scraped field (see ``ssqclazz``).
"""
import time

from requests import get
from bs4 import BeautifulSoup
from user_agent import generate_user_agent


def request_content(start, end):
    """Download the history page for an issue range and return its data rows.

    Args:
        start: First issue id, substituted into the ``start`` query parameter.
        end: Last issue id, substituted into the ``end`` query parameter.

    Returns:
        The ``tr`` tags with class ``t_tr1`` found inside the first
        ``<tbody id="tdata">`` element of the page.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        IndexError: If the page contains no ``<tbody id="tdata">`` element.
    """
    url_link = 'https://datachart.500.com/ssq/history/newinc/history.php?start={0}&end={1}'.format(start, end)
    headers = {
        'User-Agent': generate_user_agent(device_type='desktop', os=('mac', 'linux', 'win', 'android'))
    }
    response = get(url_link, headers=headers, timeout=6)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page and
    # dying later with an unrelated IndexError.
    response.raise_for_status()
    page_content = BeautifulSoup(response.content, "html.parser")
    html_tag = page_content.find_all('tbody', id='tdata')[0]
    return html_tag.find_all('tr', 't_tr1')


class ssqclazz:
    """One draw record; each attribute holds the text of one table cell."""

    # Attribute names in the order the cells appear in a result row.
    _FIELD_NAMES = (
        'period',        # issue number
        'red_1',         # red balls
        'red_2',
        'red_3',
        'red_4',
        'red_5',
        'red_6',
        'blue_1',        # blue ball
        'happy_sunday',  # "happy Sunday" column
        'pool_prize',    # prize pool (yuan)
        'first_count',   # first prize: number of winning bets
        'first_prize',   # first prize: amount (yuan)
        'second_count',  # second prize: number of winning bets
        'second_prize',  # second prize: amount (yuan)
        'total_prize',   # total amount wagered (yuan)
        'lottery_date',  # draw date
    )

    def __init__(self):
        """Create an empty record with every field set to ``''``."""
        for name in self._FIELD_NAMES:
            setattr(self, name, '')

    def __str__(self):
        """Return all fields joined by commas, in ``_FIELD_NAMES`` order."""
        return ','.join(str(value) for value in self.values())

    def values(self):
        """Return the current field values as a list, in ``_FIELD_NAMES`` order."""
        return [getattr(self, name) for name in self._FIELD_NAMES]

    def tr_tag(self, tag):
        """Fill the record from one table row.

        Args:
            tag: A parsed ``<tr>`` element; its ``<td>`` children are read
                positionally, the n-th cell feeding the n-th field.

        Raises:
            IndexError: If the row has fewer cells than there are fields.
        """
        tds = tag.find_all('td')
        for index, name in enumerate(self._FIELD_NAMES):
            # ``.string`` is None when a cell contains nested markup.
            setattr(self, name, tds[index].string)


def _clean_cell(text):
    """Normalise one scraped cell value before it is written to the sheet.

    Surrounding whitespace (including the non-breaking space used for empty
    cells) is stripped, and thousands separators are removed from values that
    are otherwise all digits. Values stay strings so that leading zeros
    (ball numbers, issue ids) survive; anything else, such as a date, is
    returned unchanged apart from the stripping. ``None`` becomes ``''``.
    """
    if text is None:
        return ''
    text = text.strip()
    digits_only = text.replace(',', '')
    return digits_only if digits_only.isdigit() else text


def main():
    """Scrape every year since 2003 and save the result as ``test.xls``."""
    import xlwt

    wbk = xlwt.Workbook()
    lyear = time.localtime().tm_year
    ymin = 3  # SSQ went online in 2003
    ymax = lyear - 2000
    print('===抓取資料開始===,200%s-20%s' % (ymin, ymax))
    for year in range(ymin, ymax + 1):
        sheet = wbk.add_sheet(str(year), cell_overwrite_ok=True)
        # Issue ids are requested as two-digit year + three-digit sequence,
        # so the year is zero-padded (2003 -> "03001", not "3001").
        # NOTE(review): 300 is presumably just a safe upper bound on the
        # number of draws per year - confirm against the site.
        start = '{0:02d}001'.format(year)
        end = '{0:02d}300'.format(year)
        for row, tr in enumerate(request_content(start, end)):
            ssqobj = ssqclazz()
            ssqobj.tr_tag(tr)
            # One field per column. Re-parsing the comma-joined string with a
            # number regex would split "1,234,567" and "2019-01-01" into
            # several cells and shift every column after them.
            for column, value in enumerate(ssqobj.values()):
                sheet.write(row, column, _clean_cell(value))
            print(ssqobj)
        print()
        time.sleep(3)  # pause between requests to go easy on the server
    wbk.save('test.xls')
    print('抓取完畢!!!')


if __name__ == '__main__':
    main()