Python crawler examples
阿新 • Published: 2018-12-31
Here are two crawler examples I used when first learning Python: one scrapes JD.com reviews of Moutai liquor, the other scrapes domestic news from Sina. Both come from online tutorials, with the code slightly modified; they are offered for reference and study.
Both run in an Anaconda environment; as the imports show, they need the requests, BeautifulSoup (bs4), pandas (plus an Excel writer such as openpyxl for .xlsx output) and xlwt packages.
The first example crawls the Sina domestic news channel and saves the results to an Excel file:

import requests
from bs4 import BeautifulSoup
from datetime import datetime
import re
import json
import pandas

news_total = []
commentURL = 'http://comment5.news.sina.com.cn/page/info?version=1&format=js&channel=gn&newsid=comos-{}&group=&compress=0&ie=utf-8&oe=utf-8&page=1&page_size=20'
url = 'http://api.roll.news.sina.com.cn/zt_list?channel=news&cat_1=gnxw&cat_2==gdxw1||=gatxw||=zs-pl||=mtjj&level==1||=2&show_ext=1&show_all=1&show_num=22&tag=1&format=json&page={}&callback=newsloadercallback&_=1509373193047'

def parseListLinks(url):
    # Fetch one page of the rolling-news list API and scrape every article on it
    newsdetails = []
    res = requests.get(url)
    # The API returns JSONP; peel off the 'newsloadercallback(...);' wrapper before parsing
    jd = json.loads(res.text.strip().lstrip('newsloadercallback(').rstrip(');'))
    for ent in jd['result']['data']:
        newsdetails.append(getNewsDetail(ent['url']))
    return newsdetails

def getNewsDetail(newsurl):
    # Extract title, source, publish time, body, editor and comment count from one article page
    result = {}
    res = requests.get(newsurl)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    result['title'] = soup.select('#artibodyTitle')[0].text
    result['newssource'] = soup.select('.time-source span a')[0].text
    timesource = soup.select('.time-source')[0].contents[0].strip()
    # The page prints the time like '2018年01月01日10:30', so the format string keeps the Chinese characters
    dt1 = datetime.strptime(timesource, '%Y年%m月%d日%H:%M')
    result['dt'] = dt1.strftime('%Y-%m-%d-%H:%M')
    # Join all body paragraphs except the last one, which is the editor line
    result['article'] = ' '.join([p.text.strip() for p in soup.select('#artibody p')[:-1]])
    # strip() treats its argument as a character set, not a prefix; this removes the
    # '責任編輯:' ("responsible editor:") label because editor names don't contain those characters
    result['editor'] = soup.select('.article-editor')[0].text.strip('責任編輯:')
    result['comments'] = getCommentCounts(newsurl)
    print('Fetched one article')
    return result

def getCommentCounts(newsurl):
    # The comment API is keyed by the news id embedded in the article URL: doc-i<id>.shtml
    m = re.search('doc-i(.+).shtml', newsurl)
    newsid = m.group(1)
    comments = requests.get(commentURL.format(newsid))
    # The response is JavaScript of the form 'var data={...}'; strip the prefix before parsing
    jd = json.loads(comments.text.strip('var data='))
    return jd['result']['count']['total']

for i in range(1, 8):
    print('Crawling page ' + str(i) + '......')
    newsurl = url.format(i)
    newsary = parseListLinks(newsurl)
    news_total.extend(newsary)
print('Crawl finished')

df = pandas.DataFrame(news_total)
df.to_excel('news.xlsx')
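One caveat in the two JSONP-handling lines above: str.strip, lstrip and rstrip take a set of characters, not a literal prefix or suffix, so lstrip('newsloadercallback(') and strip('var data=') only work because the JSON payload happens not to begin or end with any of those characters. A regex that pulls out the braced payload is more explicit. Here is a minimal sketch (unwrap_jsonp is my own helper name, not from the tutorial):

import json
import re

def unwrap_jsonp(text):
    # Grab everything from the first '{' to the last '}' and parse it as JSON;
    # this handles both 'newsloadercallback({...});' and 'var data={...}' responses
    m = re.search(r'\{.*\}', text, re.S)
    return json.loads(m.group(0))

With this helper, both json.loads calls above could become, e.g., jd = unwrap_jsonp(res.text).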
The second example crawls JD.com reviews of Moutai liquor and writes them to an .xls spreadsheet with xlwt:

import requests
import time
import xlwt

# Spreadsheet setup: the tutorial includes these style lines so that Chinese text
# can be written to the sheet (the style object is created but never passed to write())
style = xlwt.XFStyle()
font = xlwt.Font()
font.name = 'SimSun'
style.font = font

# Create a workbook and add a sheet
w = xlwt.Workbook(encoding='utf-8')
ws = w.add_sheet('sheet 1', cell_overwrite_ok=True)

# Row the next comment will be written to
row = 1

# Write the header row
ws.write(0, 0, 'content')
ws.write(0, 1, 'userClientShow')
ws.write(0, 2, 'creationTime')
ws.write(0, 3, 'userLevelName')
ws.write(0, 4, 'productColor')
ws.write(0, 5, 'userLevelId')
ws.write(0, 6, 'score')
ws.write(0, 7, 'referenceName')
ws.write(0, 8, 'referenceTime')
ws.write(0, 9, 'isMobile')
ws.write(0, 10, 'nickname')

# Take one page of comments as a parsed JSON object and write each comment to the sheet
def write_json_to_xls(dat):
    global row
    for comment in dat['comments']:
        ws.write(row, 0, comment['content'])
        ws.write(row, 1, comment['userClientShow'])
        ws.write(row, 2, comment['creationTime'])
        ws.write(row, 3, comment['userLevelName'])
        ws.write(row, 4, comment['productColor'])
        ws.write(row, 5, comment['userLevelId'])
        ws.write(row, 6, comment['score'])
        ws.write(row, 7, comment['referenceName'])
        ws.write(row, 8, comment['referenceTime'])
        ws.write(row, 9, comment['isMobile'])
        ws.write(row, 10, comment['nickname'])
        row += 1

# Fetch the comments page by page
for i in range(1, 10 + 1):
    url = 'https://club.jd.com/comment/productPageComments.action?productId=1475512465&score=0&sortType=5&page=%d&pageSize=100&isShadowSku=0&fold=' % i
    try:
        json_req = requests.get(url)
        dat = json_req.json()
        write_json_to_xls(dat)
        print(u'Wrote one page of data')
    except Exception as e:
        print(u'Failed to fetch data:', e)
    time.sleep(0.5)

# Save the spreadsheet
w.save('result.xls')
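One fragility worth noting: write_json_to_xls indexes every field directly, so a single comment object missing an optional key (whether JD's comment objects always carry all eleven keys is an assumption I have not verified) raises a KeyError that the try/except catches, dropping the rest of that page. A more defensive variant, sketched against the same ws and row globals as above, uses dict.get with a default:

FIELDS = ['content', 'userClientShow', 'creationTime', 'userLevelName',
          'productColor', 'userLevelId', 'score', 'referenceName',
          'referenceTime', 'isMobile', 'nickname']

def write_json_to_xls(dat):
    global row
    for comment in dat['comments']:
        for col, field in enumerate(FIELDS):
            # .get returns '' instead of raising KeyError when a field is absent
            ws.write(row, col, comment.get(field, ''))
        row += 1

Since FIELDS matches the header row above, this also keeps the column order defined in one place.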