
Scraping Sina news with Python 3 and MySQL

1. Install the packages

1. pip install requests

2. pip install BeautifulSoup4
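A quick sanity check (a sketch, not part of the crawler): if both imports succeed and print a version, the installs worked.

import requests
import bs4

# Both imports succeeding and printing versions confirms the installs
print(requests.__version__)
print(bs4.__version__)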

2. Parse the page elements

The headline blocks on the channel page carry the news-item class, so a CSS selector pulls them all out of the fetched HTML (reshtml):

newsTitlesHtml = BeautifulSoup(reshtml, 'html.parser').select('.news-item')
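Put together as a standalone sketch, fetching the channel page and printing each headline (the .news-item and h2 structure reflects Sina's markup at the time of writing and may have changed):

from bs4 import BeautifulSoup
import requests

res = requests.get('http://news.sina.com.cn/china/')
res.encoding = 'utf-8'
# select() returns a list, one element per headline block
for item in BeautifulSoup(res.text, 'html.parser').select('.news-item'):
    if len(item.select('h2')) > 0:
        print(item.select('h2')[0].text)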

3. Install the MySQL driver

1. pip install pymysql
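The crawler below writes into a news table. The original post does not give the schema, so here is a plausible sketch with one column per scraped field; the column types are assumptions you may need to adjust, and the credentials are placeholders.

import pymysql

# Hypothetical schema matching the insert statement in the crawler below
conn = pymysql.connect(host='localhost', user='username', passwd='password', db='dbname', port=3306, charset='utf8')
conn.cursor().execute("""
    create table if not exists news (
        id int auto_increment primary key,
        title varchar(255),
        captime varchar(32),
        newsurl varchar(255),
        newsdate varchar(64),
        media_name varchar(64),
        media_url varchar(255),
        content text,
        editor varchar(64),
        commentcount int
    )
""")
conn.close()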

4. Python crawler example

from bs4 import BeautifulSoup
import requests
import pymysql
import logging
import json
import re

def getnewsinfo(url):
    # Fetch a page and return its HTML decoded as UTF-8
    res = requests.get(url)
    res.encoding = 'utf-8'
    return res.text

logging.basicConfig()  # so logger.error output is actually printed
logger = logging.getLogger("simpleExample")
soup = BeautifulSoup(getnewsinfo('http://news.sina.com.cn/china/'), 'html.parser')
newsTitlesHtml = soup.select('.news-item')
print(len(newsTitlesHtml))  # how many headline blocks were found
i = 0
# Replace the placeholders with your own MySQL credentials and database name
conn = pymysql.connect(host='localhost', user='username', passwd='password', db='dbname', port=3306, charset='utf8')
cur = conn.cursor()
for newsTitleHtml in newsTitlesHtml:
    i = i + 1
    print('No.:', i)
    if len(newsTitleHtml.select('h2')) > 0:
        title = newsTitleHtml.select('h2')[0].text
        captime = newsTitleHtml.select('.time')[0].text
        newsurl = newsTitleHtml.select('a')[0]['href']
        print('News title:', title)
        print('Captured at:', captime)
        print('News URL:', newsurl)
        # rstrip('.shtml')/lstrip('doc-i') strip character *sets*, not
        # substrings, so extract the article id with a regex instead
        newsid = re.search('doc-i(.*).shtml', newsurl).group(1)
        pagesoup = BeautifulSoup(getnewsinfo(newsurl), 'html.parser')
        if len(pagesoup.select('.time-source')) > 0:
            newsdate = pagesoup.select('.time-source')[0].contents[0].strip()
            media_name = pagesoup.select('.time-source span a')[0].text
            media_url = pagesoup.select('.time-source span a')[0]['href']
            # Gather every body paragraph except the last one (the editor line)
            contents = []
            for c in pagesoup.select('#artibody p')[:-1]:
                contents.append(c.text.strip())
            content = ' '.join(contents)
            editor = pagesoup.select('.article-editor')[0].text
            # The comment API answers with "var data={...}"; drop the JS
            # prefix before parsing the JSON payload
            commentcountres = getnewsinfo('http://comment5.news.sina.com.cn/page/info?version=1&format=js'
                                          '&channel=gn&newsid=comos-' + newsid + '&group=&compress=0'
                                          '&ie=utf-8&oe=utf-8&page=1&page_size=20')
            commentcount = json.loads(commentcountres.split('=', 1)[1])['result']['count']['total']
            print('News date:', newsdate)
            print('Source:', media_name)
            print('Source URL:', media_url)
            print('Content:', content)
            print('Editor:', editor)
            print('Comment count:', commentcount)
            # Insert only when the article page parsed successfully, so every
            # field is defined; parameterized values avoid quoting problems
            # and SQL injection, and execute() is called exactly once
            try:
                sql = ("insert into news(title, captime, newsurl, newsdate, media_name, "
                       "media_url, content, editor, commentcount) "
                       "values(%s, %s, %s, %s, %s, %s, %s, %s, %s)")
                if cur.execute(sql, (title, captime, newsurl, newsdate, media_name,
                                     media_url, content, editor, commentcount)) == 1:
                    print("Insert succeeded")
                    conn.commit()
            except Exception as e:
                conn.rollback()
                logger.error(e)

cur.close()
conn.close()
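To verify the inserts, a small sketch that reads back the most recent rows; it assumes the auto-increment id column from the schema sketch above, and the credentials are again placeholders.

import pymysql

conn = pymysql.connect(host='localhost', user='username', passwd='password', db='dbname', port=3306, charset='utf8')
cur = conn.cursor()
cur.execute("select title, commentcount from news order by id desc limit 5")
for title, commentcount in cur.fetchall():
    print(title, commentcount)
cur.close()
conn.close()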