python requests bs4練習
阿新 • • 發佈:2018-11-03
豆瓣評論分析:
# 1). 獲取豆瓣最新上映的所有電影的前10頁評論資訊;
# 2). 清洗資料;
# 3). 將每部電影的評論資訊分析後繪製成詞雲, 儲存為png圖片, 檔名為: 電影名.png;
import re

import jieba
import requests
import wordcloud
from bs4 import BeautifulSoup

# Number of comment pages fetched per movie (each page holds 20 comments).
# NOTE(review): the task statement asks for the first 10 pages, but this
# script has always fetched 3 -- confirm which one is intended.
PAGES_PER_MOVIE = 3

# Seconds to wait for a response; without it requests.get can block forever.
REQUEST_TIMEOUT = 10

# Keeps runs of CJK characters or ASCII letters and drops everything else
# (digits, punctuation, whitespace, emoji). Compiled once, not per movie.
WORD_PATTERN = re.compile(r'([\u4e00-\u9fa5]+|[a-zA-Z]+)')


def getpagecomments(id, pageNum):
    """Return the text of all short comments on one Douban comment page.

    Args:
        id: Douban subject id of the movie (parameter name kept for
            backward compatibility even though it shadows the builtin).
        pageNum: 1-based page number; each page holds 20 comments.

    Returns:
        The comment texts of that page concatenated into one string
        (empty when the page contains no ``<span class="short">`` tags).
    """
    start = (pageNum - 1) * 20
    url = "https://movie.douban.com/subject/%s/comments?start=%s&limit=20&sort=new_score&status=P" % (id, start)
    content = requests.get(url, timeout=REQUEST_TIMEOUT).text
    soup = BeautifulSoup(content, 'html5lib')
    return "".join(tag.text for tag in soup.find_all('span', class_='short'))


def getID():
    """Return the movies listed on Douban's "now playing" page.

    Returns:
        A list of dicts, one per ``<li class="list-item">`` element, each
        with the keys ``'title'`` (the ``data-title`` attribute) and
        ``'id'`` (the ``id`` attribute).
    """
    url = 'https://movie.douban.com/cinema/nowplaying/xian/'
    content = requests.get(url, timeout=REQUEST_TIMEOUT).text
    soup = BeautifulSoup(content, 'html5lib')
    return [
        {'title': item['data-title'], 'id': item['id']}
        for item in soup.find_all('li', class_='list-item')
    ]


movies_info = getID()
for movie in movies_info:
    # Gather the raw comment text of the first PAGES_PER_MOVIE pages.
    comments = "".join(
        getpagecomments(movie['id'], pageNum)
        for pageNum in range(1, PAGES_PER_MOVIE + 1)
    )

    # Clean: keep only Chinese/English word characters, then segment.
    newComments = "".join(WORD_PATTERN.findall(comments))
    result = jieba.lcut(newComments)
    print("切分結果:", result)

    if not result:
        # WordCloud.generate raises ValueError when there are no words;
        # skip this movie instead of aborting the remaining ones.
        continue

    wc = wordcloud.WordCloud(
        background_color='snow',
        font_path='./font/msyh.ttf',
        min_font_size=5,
        max_font_size=55,
        width=200,
    )
    wc.generate(",".join(result))

    # A path separator inside the title would point at a non-existent
    # sub-directory, so neutralise it before building the file name.
    safe_title = movie['title'].replace('/', '_').replace('\\', '_')
    wc.to_file('./font/%s.png' % safe_title)
爬取慕課網所有關於python的課程名及描述資訊, 並通過詞雲進行分析展示;
- 網址: https://www.imooc.com/search/course?words=python
import re

import jieba
import requests
import wordcloud
from bs4 import BeautifulSoup

# Number of search-result pages to crawl.
PAGE_COUNT = 2

# Seconds to wait for a response; without it requests.get can block forever.
REQUEST_TIMEOUT = 10

# Keeps runs of CJK characters or ASCII letters and drops everything else.
WORD_PATTERN = re.compile(r'([\u4e00-\u9fa5]+|[a-zA-Z]+)')


def getclassinform(page):
    """Return course titles and descriptions from one imooc search page.

    Args:
        page: 1-based page number of the "python" course search results.

    Returns:
        One string: the text of every ``<a class="course-detail-title">``
        followed by the text of the first ``<p>`` inside every
        ``<div class="course-item">``, all concatenated.
    """
    url = 'https://www.imooc.com/search/course?words=python&page=%d' % (page)
    content = requests.get(url, timeout=REQUEST_TIMEOUT).text
    soup = BeautifulSoup(content, 'html5lib')

    title_tags = soup.find_all('a', attrs={'class': "course-detail-title"})
    course_cards = soup.find_all('div', attrs={'class': "course-item"})

    pieces = [tag.text for tag in title_tags]
    for card in course_cards:
        description = card.find('p')
        # find() returns None when a card has no <p>; skip such cards
        # instead of failing with AttributeError on `.text`.
        if description is not None:
            pieces.append(description.text)
    return "".join(pieces)


comments = "".join(getclassinform(page) for page in range(1, PAGE_COUNT + 1))

# Clean: keep only Chinese/English word characters, then segment.
newComments = "".join(WORD_PATTERN.findall(comments))
result = jieba.lcut(newComments)
print("切分結果:", result)

if result:
    wc = wordcloud.WordCloud(
        background_color='snow',
        font_path='./font/msyh.ttf',
        min_font_size=5,
        max_font_size=55,
        width=300,
    )
    wc.generate(",".join(result))
    wc.to_file('python.png')
# else: nothing was scraped; WordCloud.generate would raise ValueError on
# an empty word list, so no image is written.
python爬取今日百度熱點前10的新聞;
import requests
from bs4 import BeautifulSoup


def getnews(limit=10):
    """Print the titles of today's top Baidu hot-topic news.

    Args:
        limit: Maximum number of titles to print, in page order. The
            default of 10 matches the stated goal (top 10); pass ``None``
            to print every title found on the page.
    """
    url = 'http://top.baidu.com/buzz?b=1'
    # Raw bytes (not .text) are handed to BeautifulSoup so that the parser
    # works out the page encoding itself.
    content = requests.get(url, timeout=10).content
    soup = BeautifulSoup(content, 'lxml')
    news = soup.find_all('a', class_='list-title')
    # Slicing with None as the upper bound keeps the whole list.
    for new in news[:limit]:
        print(new.text)


getnews()