Python: a Hupu forum scraper
阿新 • Published: 2018-01-30
Python, as a high-level programming language, has been popular in the community for a while now. I picked it up myself, partly out of curiosity and partly to keep up with the times. As "Lu Xun" said: if you don't put what you learn to use, you're just messing around. So I used Python to write a scraper for the Hupu forum. The script is a bit rough; it is meant for beginner-level exchange and as a reference for myself later. I originally planned to write an analysis post based on the data, but I lost momentum and never finished it. Still, as a Spurs fan, I'm honored that our board ranks top three in heat.
Preparation: install Python, install MySQL, and optionally set up a virtual machine [used later to run the script on a server as a daily scheduled task].
1. Install Python: use 3.x; installation steps omitted.
2. Install MySQL: use 5.6 or above; installation steps omitted.
3. Virtual machine: any Linux distribution; installation steps omitted.
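Before moving on, it helps to confirm the environment actually works. Below is a minimal sanity-check sketch; the MySQL credentials and the spider database name are my assumptions, chosen to match the scripts that follow.
# environment_check.py - a minimal sanity check (credentials/db name are assumptions)
import sys
import pymysql

# the scripts below target Python 3.x
print('Python version:', sys.version)

# try to reach the local MySQL instance the scripts will write to
conn = pymysql.connect(host='localhost', port=3306, user='root',
                       passwd='1234', db='spider', charset='utf8')
with conn.cursor() as cur:
    cur.execute('SELECT VERSION()')
    print('MySQL version:', cur.fetchone()[0])  # expect 5.6 or above
conn.close()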
Requirements
Crawl Hupu forum posts and capture each post's content, author, heat (reads/replies), and so on.
Writing the scripts
The work is split into three parts: Part 1 parses the listing pages and extracts each post's author and read/reply counts; Part 2 fetches the content of each post; Part 3 extracts data about the post authors, to support later analysis. The scripts are below. One thing to keep in mind: encoding, encoding, encoding. Thank you!
Note: because of Hupu's anti-scraping measures, each sub-forum only exposes 10 readable pages (I failed to get around this, sorry!). My workaround is to put the scripts on a server and run them daily (for example via a cron job) so the data accumulates over time.
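On the encoding point: throughout the scripts the pages are fetched as raw bytes and handed to BeautifulSoup, and everything written to disk is explicitly UTF-8. A minimal sketch of that pattern (the URL and file name are just examples):
import requests
from bs4 import BeautifulSoup

resp = requests.get('https://bbs.hupu.com/spurs-1')  # example listing page
# pass the raw bytes to BeautifulSoup and let it sniff the charset,
# rather than relying on resp.text and requests' guessed encoding
soup = BeautifulSoup(resp.content, 'html.parser')

# when writing results to disk, state the encoding explicitly
with open('sample_page.txt', 'w', encoding='utf8') as f:
    f.write(soup.get_text())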
Part 1: crawl each post's title, author, creation time, reads/replies, author link, etc., and load them into a local MySQL database
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
import requests
import json
import time
import pymysql

forum_note_sum = []  # holds one dict per post (tiezi)
list_d = ['原創', '翻譯', '討論']  # if the first link is just one of these tags, take the second link instead


# num: index of the <li> record on the page; extract the post, author and other fields
def parent_li_web(num):
    forum_note_record = {}
    try:
        parent_tiezi = bs_obj.find('ul', class_='for-list').find_all('li')[num]
        div_one = parent_tiezi.find('div', class_='titlelink box')
        div_two = parent_tiezi.find('div', class_='author box')
        span_three = parent_tiezi.find('span', class_='ansour box').string.strip()
        div_four = parent_tiezi.find('div', class_='endreply box')
        subname = div_one.a.string
        sublink = 'https://bbs.hupu.com' + div_one.a['href']
        team_tmp = theme_tmp
        for i in list_d:
            if i == subname:
                subname = div_one.find_all('a')[1].string
                sublink = 'https://bbs.hupu.com' + div_one.find_all('a')[1]['href']
        forum_note_record.update({
            'subname': subname,
            'subname_link': sublink,
            'author': div_two.a.string,
            'author_link': div_two.a['href'],
            'author_create_time': div_two.find('a', style='color:#808080;cursor: initial; ').string,
            'read_reply_number': span_three,
            'last_reply_writer': div_four.span.string,
            'last_reply_time': div_four.a.string,
            'team_tmp': team_tmp
        })
        forum_note_sum.append(forum_note_record)
    except:
        return None


if __name__ == '__main__':
    # all_spurs_note
    begin_time = time.time()
    print('---------Script started at: {}------------'.format(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())))
    team_list = ['rockets', 'warriors', 'cavaliers', 'spurs', 'lakers', 'celtics', 'thunder', 'clippers',
                 'timberwolves', 'mavericks', 'knicks', 'bulls', 'nets', 'sixers', 'jazz', 'pacers', 'blazers',
                 'heat', 'suns', 'grizzlies', 'wizards', 'pelicans', 'bucks', 'kings', 'raptors', 'nuggets',
                 'hawks', 'hornets', 'pistons', 'magic']
    for li in team_list:
        theme_tmp = li
        for i in range(1, 11, 1):  # Hupu's anti-scraping caps browsing at 10 pages; run daily on Linux to accumulate
            url = 'https://bbs.hupu.com/{}-{}'.format(li, i)
            print(url)
            wb_string = requests.get(url)
            bs_obj = BeautifulSoup(wb_string.content, 'html.parser')
            with open('web_spider_original.txt', 'w', encoding='utf8') as f:  # keep the raw page for debugging
                f.write(str(bs_obj))
                f.write('\r' * 10 + '-----我是分割線-----' + '\r' * 10)
            for j in range(1, 61, 1):  # each listing page shows 60 posts
                parent_li_web(j)
    # dump everything collected to a local file, one JSON object per line
    with open('hupu_spider_spurs_load.txt', 'w', encoding='utf8') as f:
        for item in forum_note_sum:
            json.dump(item, f, ensure_ascii=False)
            f.write('\r')
    # insert into mysql
    conn = pymysql.connect(host='localhost', user='root', passwd='1234', db='spider', port=3306, charset='utf8')
    cur = conn.cursor()
    cur.execute('delete from hupuforum_spurs_note_daytmp')
    with open('hupu_spider_spurs_load.txt', 'r', encoding='utf8') as f:
        for item in f:
            item = json.loads(item)  # parse the JSON string back into a dict
            cur.execute('insert into hupuforum_spurs_note_daytmp(subname,subname_link,author,author_link,author_create_time,read_reply_number,last_reply_writer,last_reply_time,theme_title) values(%s,%s,%s,%s,%s,%s,%s,%s,%s)',
                        (item['subname'], item['subname_link'], item['author'], item['author_link'],
                         item['author_create_time'], item['read_reply_number'], item['last_reply_writer'],
                         item['last_reply_time'], item['team_tmp']))
    conn.commit()
    cur.close()
    conn.close()
    print('Finished! This run took {} seconds'.format(time.time() - begin_time))
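The script assumes the staging table hupuforum_spurs_note_daytmp already exists (and Part 2 additionally assumes hupuforum_spurs_note with an extra sub_text column). Below is a minimal sketch of a possible schema created via pymysql; the column names come from the INSERT above, while the column types and lengths are my assumptions.
import pymysql

# hypothetical DDL: column names match the INSERT statement, types are assumed
ddl = """
CREATE TABLE IF NOT EXISTS hupuforum_spurs_note_daytmp (
    subname            VARCHAR(255),
    subname_link       VARCHAR(255),
    author             VARCHAR(100),
    author_link        VARCHAR(255),
    author_create_time VARCHAR(50),
    read_reply_number  VARCHAR(50),
    last_reply_writer  VARCHAR(100),
    last_reply_time    VARCHAR(50),
    theme_title        VARCHAR(50)
) DEFAULT CHARSET=utf8;
"""

conn = pymysql.connect(host='localhost', port=3306, user='root',
                       passwd='1234', db='spider', charset='utf8')
with conn.cursor() as cur:
    cur.execute(ddl)
conn.commit()
conn.close()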
Part 2: add the post content and update the volatile fields
# coding=utf8
import time
import requests
from bs4 import BeautifulSoup
import pymysql

begin_time = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())
conn = pymysql.connect(host='localhost', port=3306, user='root', passwd='1234', db='spider', charset='utf8')
cur = conn.cursor()
sub_cur = conn.cursor()
# append newly discovered posts, then refresh reads/replies of the ones we already have
cur.execute('INSERT INTO hupuforum_spurs_note SELECT * FROM hupuforum_spurs_note_daytmp WHERE subname_link NOT IN (SELECT a.subname_link FROM hupuforum_spurs_note a);')
cur.execute('update hupuforum_spurs_note a,hupuforum_spurs_note_daytmp b set a.read_reply_number=b.read_reply_number,a.last_reply_writer=b.last_reply_writer,a.last_reply_time=b.last_reply_time where a.subname_link=b.subname_link')
conn.commit()
# fetch the body text for every post we have not filled in yet
cur.execute('select subname_link from hupuforum_spurs_note where sub_text is null;')
for url in cur.fetchall():
    url = list(url)
    try:
        wb_page = requests.get(url[0], timeout=2)  # some pages hang in practice, so set a timeout
        bs_obj = BeautifulSoup(wb_page.content, 'html.parser')
        tmp_text = bs_obj.select('#tpc > div > div.floor_box > table.case > tbody > tr > td > div.quote-content')
        sub_text = tmp_text[0].get_text(strip=True)
        sub_text = sub_text.replace('\'', '’')  # swap ASCII quotes so the formatted SQL stays valid
        sql = """update hupuforum_spurs_note set sub_text=\'{}\' where subname_link={};""".format(sub_text[:1000], str(url).replace('[', '').replace(']', ''))
        sub_cur.execute(sql)
        conn.commit()
        print('success')
    except IndexError as e:  # the selector matched nothing, i.e. the post page no longer exists
        sql = """update hupuforum_spurs_note set sub_text=\'{}\' where subname_link={};""".format('網頁不存在', str(url).replace('[', '').replace(']', ''))
        sub_cur.execute(sql)
        conn.commit()
    except pymysql.err.InternalError as e:  # the content contains emoji or other 4-byte UTF-8 characters
        sql = """update hupuforum_spurs_note set sub_text=\'{}\' where subname_link={};""".format('內容格式有誤,導致出錯!', str(url).replace('[', '').replace(']', ''))
        sub_cur.execute(sql)
        conn.commit()
    except requests.exceptions.ReadTimeout as e:  # the page timed out
        sql = """update hupuforum_spurs_note set sub_text=\'{}\' where subname_link={};""".format('網頁打開超時', str(url).replace('[', '').replace(']', ''))
        sub_cur.execute(sql)
        conn.commit()
    except Exception as e:  # catch-all for any other error
        sql = """update hupuforum_spurs_note set sub_text=\'{}\' where subname_link={};""".format('其他類型錯誤', str(url).replace('[', '').replace(']', ''))
        sub_cur.execute(sql)
        conn.commit()

conn.commit()
cur.close()
sub_cur.close()
conn.close()
end_time = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())
print('Finished. Started at: {}, ended at: {}'.format(begin_time, end_time))
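The pymysql.err.InternalError branch above simply discards posts containing emoji, because MySQL's utf8 charset only stores characters up to 3 bytes. If you would rather keep those posts, one option (my own suggestion, not part of the original workflow, and the VARCHAR(1000) width is an assumption based on the sub_text[:1000] truncation) is to switch both the connection and the column to utf8mb4:
import pymysql

# connect with utf8mb4 so 4-byte characters (emoji) survive the round trip
conn = pymysql.connect(host='localhost', port=3306, user='root',
                       passwd='1234', db='spider', charset='utf8mb4')
with conn.cursor() as cur:
    # widen the content column to utf8mb4 as well (assumed column type)
    cur.execute('ALTER TABLE hupuforum_spurs_note '
                'MODIFY sub_text VARCHAR(1000) CHARACTER SET utf8mb4;')
conn.commit()
conn.close()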
Part 3: crawl registered-user information
# coding=utf8
import time
import requests
from bs4 import BeautifulSoup
import pymysql

begin_time = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())
conn = pymysql.connect(host='localhost', port=3306, user='root', passwd='1234', db='spider', charset='utf8')
cur = conn.cursor()
sub_cur = conn.cursor()
# visit every distinct author profile page collected in Part 1
cur.execute('select distinct author_link from hupuforum_spurs_note;')
for author_url in cur.fetchall():
    try:
        author_url = list(author_url)
        wb_obj = requests.get(author_url[0], timeout=2)
        bs_obj = BeautifulSoup(wb_obj.content, 'html.parser')
        author = bs_obj.select('#main > div.personal > div.personal_right > h3 > div')[0].string
        author_visited = bs_obj.select('#main > div.personal > div.personal_right > h3 > span')[0].string.replace('有', '').replace('人次訪問', '')
        author_info = bs_obj.select('#main > div.personal > div.personal_right > div')[0].get_text(strip=True)
        sub_cur.execute('insert into hupuforum_authors_info(author,author_link,author_visited,author_info,author_status) values(%s,%s,%s,%s,%s)',
                        (author, author_url[0], author_visited, author_info, '正常'))
    except IndexError as e:  # profile page is missing or could not be parsed
        sub_cur.execute('insert into hupuforum_authors_info(author,author_link,author_visited,author_info,author_status) values(%s,%s,%s,%s,%s)',
                        ('', author_url[0], '', '', '無法訪問'))
    conn.commit()
conn.commit()
cur.close()
sub_cur.close()
conn.close()
end_time = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())
print('Finished. Started at: {}, ended at: {}'.format(begin_time, end_time))
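Once the three tables are populated, the "heat top three" comparison mentioned at the start boils down to a query over the accumulated posts. A minimal sketch of the kind of query I had in mind; counting accumulated posts per board is only a rough proxy for heat, since read_reply_number is stored as scraped text.
import pymysql

conn = pymysql.connect(host='localhost', port=3306, user='root',
                       passwd='1234', db='spider', charset='utf8')
with conn.cursor() as cur:
    # posts accumulated per team board so far - a rough proxy for "heat"
    cur.execute('SELECT theme_title, COUNT(*) AS posts '
                'FROM hupuforum_spurs_note '
                'GROUP BY theme_title '
                'ORDER BY posts DESC LIMIT 3;')
    for team, posts in cur.fetchall():
        print(team, posts)
conn.close()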