程式人生 > python requests bs4練習

python requests bs4練習

豆瓣評論分析:
# 1). 獲取豆瓣最新上映的所有電影的前10頁評論資訊(下方範例程式為節省時間, 每部電影只抓取前3頁);
# 2). 清洗資料;
# 3). 分析每個電影評論資訊分析繪製成詞雲, 儲存為png圖片,檔名為: 電影名.png;

import re

import jieba
import requests
import wordcloud
from bs4 import BeautifulSoup

def getpagecomments(id, pageNum):
    """Return the short-comment text of one Douban comment page as one string.

    id: Douban subject id of the movie, substituted into the URL.
    pageNum: 1-based page number; each page requests 20 comments, so the
        `start` offset sent to the site is (pageNum - 1) * 20.

    The text of every <span class="short"> element on the page is
    concatenated with no separator. An empty string is returned when the
    page contains no such element.
    """
    offset = (pageNum - 1) * 20
    page_url = "https://movie.douban.com/subject/%s/comments?start=%s&limit=20&sort=new_score&status=P" % (id, offset)
    html = requests.get(page_url).text
    short_spans = BeautifulSoup(html, 'html5lib').find_all('span', class_='short')
    return "".join(span.text for span in short_spans)

def getID():
    """Return title and id for every movie on the Douban now-playing page.

    Downloads the Xi'an "nowplaying" listing and, for every
    <li class="list-item"> element, builds a dict with two keys:
    'title' (the element's data-title attribute) and 'id' (its id
    attribute). The dicts are returned as a list in document order.
    """
    listing = requests.get('https://movie.douban.com/cinema/nowplaying/xian/')
    soup = BeautifulSoup(listing.text, 'html5lib')
    return [
        {'title': entry['data-title'], 'id': entry['id']}
        for entry in soup.find_all('li', class_='list-item')
    ]

# Kept from the original script; nothing in the visible code uses it.
threads = []

# Matches runs of CJK ideographs (U+4E00..U+9FA5) or ASCII letters; everything
# else (digits, punctuation, whitespace, emoji) is discarded. Compiled once
# because the pattern is the same for every movie.
pattern = re.compile(r'([\u4e00-\u9fa5]+|[a-zA-Z]+)')

movies_info = getID()
for movie in movies_info:
    # Collect the comment text of pages 1..3 for this movie.
    comments = "".join(
        getpagecomments(movie['id'], pageNum) for pageNum in range(1, 4)
    )

    # Clean the text, then let jieba segment it into words.
    newComments = "".join(pattern.findall(comments))
    result = jieba.lcut(newComments)
    print("切分結果:", result)

    wc = wordcloud.WordCloud(
        background_color='snow',
        # TrueType font file expected to exist under ./font.
        font_path='./font/msyh.ttf',
        min_font_size=5,
        max_font_size=55,
        width=200,
    )
    wc.generate(",".join(result))

    # A '/' inside the title would be interpreted as a directory separator
    # and make the save fail, so it is replaced before building the path.
    safe_title = movie['title'].replace('/', '_')
    wc.to_file('./font/%s.png' % safe_title)

在這裡插入圖片描述
爬取慕課網所有關於python的課程名及描述資訊, 並通過詞雲進行分析展示;
- 網址: https://www.imooc.com/search/course?words=python

import re

import jieba
import requests
import wordcloud
from bs4 import BeautifulSoup
def getclassinform(page):
    """Return course titles and descriptions from one imooc search page.

    page: 1-based page number of the "python" course search results
        (formatted into the URL with %d, so it must be an integer).

    The returned string is the text of every <a class="course-detail-title">
    element, followed by the text of the first <p> found inside every
    <div class="course-item"> element, all concatenated with no separator.
    Course items that contain no <p> are skipped instead of raising
    AttributeError.
    """
    url = 'https://www.imooc.com/search/course?words=python&page=%d' % page
    content = requests.get(url).text
    soup = BeautifulSoup(content, 'html5lib')

    # Titles first, then descriptions, matching the original output order.
    parts = [
        title.text
        for title in soup.find_all('a', attrs={'class': "course-detail-title"})
    ]
    for course in soup.find_all('div', attrs={'class': "course-item"}):
        intro = course.find('p')
        # find() returns None when the item has no <p>.
        if intro is not None:
            parts.append(intro.text)
    return "".join(parts)
# Concatenate the course text of search-result pages 1 and 2.
comments = "".join(getclassinform(page) for page in range(1, 3))

# Keep only runs of CJK ideographs or ASCII letters, glued back together.
pattern = re.compile(r'([\u4e00-\u9fa5]+|[a-zA-Z]+)')
newComments = "".join(pattern.findall(comments))

# Segment the cleaned text into words with jieba and show the tokens.
result = jieba.lcut(newComments)
print("切分結果:", result)

# Render the comma-joined tokens as a word cloud and save it as python.png.
cloud_options = {
    'background_color': 'snow',
    'font_path': './font/msyh.ttf',
    'min_font_size': 5,
    'max_font_size': 55,
    'width': 300,
}
wc = wordcloud.WordCloud(**cloud_options)
wc.generate(",".join(result))
wc.to_file('python.png')

在這裡插入圖片描述
python爬取今日百度熱點前10的新聞;

import requests
from bs4 import BeautifulSoup
def getnews(limit=10):
    """Print the titles of the top entries on the Baidu hot-news board.

    limit: maximum number of titles to print, in page order. Defaults to
        10, matching the stated goal of listing the top 10 news items;
        pass None to print every title found on the page.

    Returns None; the titles are written to standard output, one per line.
    """
    url = 'http://top.baidu.com/buzz?b=1'
    # The raw bytes are handed to BeautifulSoup, so it performs the
    # character-encoding detection itself.
    content = requests.get(url).content
    soup = BeautifulSoup(content, 'lxml')
    news = soup.find_all('a', class_='list-title')
    # Slicing with None as the upper bound keeps the whole list.
    for new in news[:limit]:
        print(new.text)
getnews()

在這裡插入圖片描述