Python 3 + MySQL crawler for scraping Sina News
阿新 · Published: 2019-01-05
I. Install packages
1. pip install requests
2. pip install BeautifulSoup4
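A quick way to confirm both packages work is to fetch the Sina list page (the same URL the full example in section IV uses); a minimal sketch:

import requests

res = requests.get('http://news.sina.com.cn/china/')
res.encoding = 'utf-8'  # Sina serves UTF-8; set it explicitly to avoid garbled text
reshtml = res.text      # the HTML that the next section parses
print(reshtml[:200])    # peek at the start of the page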
II. Parse the page elements
soup = BeautifulSoup(reshtml, 'html.parser')
news_items = soup.select('.news-item')
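Putting the fetch and the selector together, a self-contained sketch that prints the headline, link, and capture time of each .news-item (these h2/a/.time selectors are the same ones the full example in section IV relies on):

import requests
from bs4 import BeautifulSoup

res = requests.get('http://news.sina.com.cn/china/')
res.encoding = 'utf-8'
soup = BeautifulSoup(res.text, 'html.parser')
for item in soup.select('.news-item'):
    if item.select('h2'):                    # some list items carry no headline; skip them
        print(item.select('h2')[0].text)     # headline
        print(item.select('a')[0]['href'])   # article URL
        print(item.select('.time')[0].text)  # capture time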
III. Install the database driver
1. pip install pymysql
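The example in section IV inserts into a news table that is never defined here; the sketch below creates a hypothetical schema inferred from the columns that INSERT uses (the column names match the code, but the types and lengths are guesses to adjust as needed):

import pymysql

# placeholder credentials; replace with your own
conn = pymysql.connect(host='localhost', user='username', passwd='password',
                       db='dbname', port=3306, charset='utf8')
cur = conn.cursor()
cur.execute("""
    CREATE TABLE IF NOT EXISTS news (
        id INT AUTO_INCREMENT PRIMARY KEY,
        title VARCHAR(255),
        captime VARCHAR(32),
        newsurl VARCHAR(255),
        newsdate VARCHAR(64),
        media_name VARCHAR(64),
        media_url VARCHAR(255),
        content TEXT,
        editor VARCHAR(64),
        commentcount INT
    ) DEFAULT CHARSET=utf8
""")
conn.commit()
conn.close()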
IV. Python crawler example
from bs4 import BeautifulSoup
import requests
import pymysql
import logging
import json
import re

def getnewsinfo(url):
    res = requests.get(url)
    res.encoding = 'utf-8'
    return res.text

logger = logging.getLogger("simpleExample")

soup = BeautifulSoup(getnewsinfo('http://news.sina.com.cn/china/'), 'html.parser')
newsTitlesHtml = soup.select('.news-item')
print(len(newsTitlesHtml))
i = 0
conn = pymysql.connect(host='localhost', user='username', passwd='password',
                       db='dbname', port=3306, charset='utf8')
cur = conn.cursor()
for newsTitleHtml in newsTitlesHtml:
    i = i + 1
    print('No.:', i)
    if len(newsTitleHtml.select('h2')) > 0:
        title = newsTitleHtml.select('h2')[0].text
        captime = newsTitleHtml.select('.time')[0].text
        newsurl = newsTitleHtml.select('a')[0]['href']
        print('Title:', title)
        print('Captured at:', captime)
        print('Link:', newsurl)
        # pull the news id out of URLs like .../doc-i...shtml with a regex;
        # lstrip/rstrip remove character sets, not prefixes/suffixes, so they can eat id characters
        match = re.search(r'doc-i(.*)\.shtml', newsurl)
        if match is None:
            continue
        newsid = match.group(1)
        pagesoup = BeautifulSoup(getnewsinfo(newsurl), 'html.parser')
        if len(pagesoup.select('.time-source')) > 0:
            newsdate = pagesoup.select('.time-source')[0].contents[0].strip()
            media_name = pagesoup.select('.time-source span a')[0].text
            media_url = pagesoup.select('.time-source span a')[0]['href']
            # the last <p> in #artibody is the editor line, so drop it
            contents = []
            for c in pagesoup.select('#artibody p')[:-1]:
                contents.append(c.text.strip())
            content = ' '.join(contents)
            editor = pagesoup.select('.article-editor')[0].text
            commentcountres = getnewsinfo('http://comment5.news.sina.com.cn/page/info?version=1&format=js'
                                          '&channel=gn&newsid=comos-' + newsid + '&group=&compress=0'
                                          '&ie=utf-8&oe=utf-8&page=1&page_size=20')
            # the endpoint returns JSONP ("var data={...}"); drop the prefix before parsing
            commentcount = json.loads(commentcountres.split('=', 1)[1])['result']['count']['total']
            print('Date:', newsdate)
            print('Source:', media_name)
            print('Source link:', media_url)
            print('Content:', content)
            print('Editor:', editor)
            print('Comments:', commentcount)
            try:
                # parameterized INSERT: avoids broken quoting and SQL injection
                sql = ("insert into news(title, captime, newsurl, newsdate, media_name, "
                       "media_url, content, editor, commentcount) "
                       "values(%s, %s, %s, %s, %s, %s, %s, %s, %s)")
                # execute once (calling it twice would insert duplicate rows)
                if cur.execute(sql, (title, captime, newsurl, newsdate, media_name,
                                     media_url, content, editor, commentcount)) == 1:
                    print("Insert succeeded")
                conn.commit()
            except Exception as e:
                conn.rollback()
                logger.error(e)
conn.close()
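The comment count comes from a JSONP endpoint that returns var data={...} rather than plain JSON. Factoring that lookup into a helper makes it easy to reuse; a sketch (get_comment_count is our name, but the endpoint, parameters, and JSON shape come from the listing above):

import json
import requests

def get_comment_count(newsid):
    url = ('http://comment5.news.sina.com.cn/page/info?version=1&format=js'
           '&channel=gn&newsid=comos-' + newsid + '&group=&compress=0'
           '&ie=utf-8&oe=utf-8&page=1&page_size=20')
    raw = requests.get(url).text
    # split on the first '=' to drop the "var data" prefix;
    # str.strip('var data=') removes a character set, not a prefix, so it is not reliable
    return json.loads(raw.split('=', 1)[1])['result']['count']['total']

# usage: get_comment_count(newsid), with newsid extracted as in the listing above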