詞雲分析的進一步理解
阿新 • • 發佈:2018-11-01
- 豆瓣電影
豆瓣評論分析:
1). 獲取豆瓣最新上映的所有電影的前10頁評論資訊;
2). 清洗資料;
3). 分析每部電影的評論資訊並繪製成詞雲, 儲存為png圖片, 檔名為: 電影名.png;
"""Build one word-cloud PNG per Douban "now playing" movie from its comments."""
import os
import re
from concurrent.futures import ThreadPoolExecutor
from itertools import repeat

import jieba
import numpy
import requests
import wordcloud
from bs4 import BeautifulSoup
from PIL import Image

# Number of comment pages fetched per movie (the task asks for the first 10).
PAGE_COUNT = 10
# Seconds to wait on any single HTTP request before giving up.
REQUEST_TIMEOUT = 10
# Directory the generated PNG files are written to.
OUTPUT_DIR = './img'


def get_movie(url):
    """Return a list of ``{'title': ..., 'id': ...}`` dicts scraped from *url*.

    Every ``<li class="list-item">`` element on the page contributes one
    entry, built from its ``data-title`` and ``id`` attributes.
    """
    content = requests.get(url, timeout=REQUEST_TIMEOUT).text
    soup = BeautifulSoup(content, 'html.parser')
    return [
        {'title': item['data-title'], 'id': item['id']}
        for item in soup.find_all('li', class_='list-item')
    ]


def get_info(id, pageNum):
    """Return the concatenated short comments on page *pageNum* of movie *id*.

    Pages are 1-based and hold 20 comments each, so page ``n`` starts at
    offset ``20 * (n - 1)``. Parameter names are kept as in the original
    script so that keyword callers keep working.
    """
    start = 20 * (pageNum - 1)
    url = 'https://movie.douban.com/subject/%s/comments?start=%s&limit=20&sort=new_score&status=P' % (id, start)
    content = requests.get(url, timeout=REQUEST_TIMEOUT).text
    soup = BeautifulSoup(content, 'html.parser')
    return ''.join(tag.text for tag in soup.find_all('span', class_='short'))


def word_cloud(comment, name):
    """Render *comment* as a word cloud and save it as ``./img/<name>.png``.

    Only runs of Chinese characters or ASCII letters are kept before the
    text is segmented with jieba. Nothing is written when no such text
    remains, because ``WordCloud.generate`` rejects empty input.
    """
    pattern = re.compile(r'([\u4e00-\u9fa5]+|[a-zA-Z]+)')
    cleaned = ''.join(re.findall(pattern, comment))
    words = jieba.lcut(cleaned)
    if not words:
        return

    cloud_mask = numpy.array(Image.open('./image.jpg'))
    wc = wordcloud.WordCloud(
        background_color='snow',
        mask=cloud_mask,
        font_path='./msyh.ttf',
        min_font_size=5,
        max_font_size=50,
        width=260,
        height=260,
    )
    wc.generate(','.join(words))

    os.makedirs(OUTPUT_DIR, exist_ok=True)
    # A '/' in the title would otherwise be read as a directory separator.
    safe_name = name.replace('/', '_')
    wc.to_file('%s/%s.png' % (OUTPUT_DIR, safe_name))


def main():
    """Fetch comments for every listed movie and draw one cloud per movie."""
    url = 'https://movie.douban.com/cinema/nowplaying/xian/'
    movies = get_movie(url)
    pages = range(1, PAGE_COUNT + 1)
    # One pool for the whole run; the context manager shuts it down on exit.
    with ThreadPoolExecutor(max_workers=10) as pool:
        for movie in movies:
            # map() takes the callable plus one iterable per argument; it
            # yields the page results in page order.
            page_texts = pool.map(get_info, repeat(movie['id']), pages)
            word_cloud(''.join(page_texts), movie['title'])


if __name__ == '__main__':
    main()
- 慕課網
爬取慕課網所有關於python的課程名及描述資訊, 並通過詞雲進行分析展示;
"""Draw a word cloud from the names and descriptions of imooc Python courses."""
import re

import jieba
import numpy
import requests
import wordcloud
from bs4 import BeautifulSoup
from PIL import Image

# Seconds to wait on any single HTTP request before giving up.
REQUEST_TIMEOUT = 10


def get_html(url):
    """Return the body of *url* as text."""
    return requests.get(url, timeout=REQUEST_TIMEOUT).text


def get_name(text):
    """Extract course entries from the search-result HTML in *text*.

    For each ``<div class="course-item-detail">`` the runs of Chinese
    characters in its markup are collected; the first run is stored under
    ``'name'`` and the remaining runs (a list) under ``'info'``. Elements
    with no Chinese text are skipped instead of raising ``IndexError``.
    """
    soup = BeautifulSoup(text, 'html5lib')
    info_li = []
    for tag in soup.find_all('div', class_="course-item-detail"):
        chinese_runs = re.findall(r'[\u4E00-\u9FA5]+', str(tag))
        if not chinese_runs:
            continue
        info_li.append({'name': chinese_runs[0], 'info': chinese_runs[1:]})
    return info_li


def word_cloud(text):
    """Segment *text* with jieba and save the word cloud to ``./muke.png``.

    Nothing is written when segmentation yields no words, because
    ``WordCloud.generate`` rejects empty input.
    """
    words = jieba.lcut(text)
    if not words:
        return

    cloud_mask = numpy.array(Image.open('./image.jpg'))
    wc = wordcloud.WordCloud(
        width=500,
        mask=cloud_mask,
        max_font_size=50,
        min_font_size=5,
        background_color='snow',
        font_path='./msyh.ttf',
    )
    wc.generate(','.join(words))
    wc.to_file('./muke.png')


def main():
    """Scrape the first two result pages and render a single cloud."""
    pieces = []
    for page in range(1, 3):
        url = 'https://www.imooc.com/search/course?words=python&page=%d' % page
        for course in get_name(get_html(url)):
            pieces.append(course['name'])
            pieces.append(''.join(course['info']))
    word_cloud(''.join(pieces))


if __name__ == '__main__':
    main()
執行結果:
- python爬取今日百度熱點前10的新聞
"""Print the titles of today's top Baidu hot-topic entries."""
from urllib.request import urlopen

from bs4 import BeautifulSoup

# Seconds to wait for the HTTP response before giving up.
REQUEST_TIMEOUT = 10


def get_html(url):
    """Return the page at *url* decoded to text.

    The response is closed as soon as it has been read. GBK is used for
    decoding because it is a superset of GB2312: the same pages decode
    identically, and extended characters no longer raise.
    """
    with urlopen(url, timeout=REQUEST_TIMEOUT) as response:
        return response.read().decode('gbk')


def get_info(text, limit=10):
    """Return the strings of the first *limit* ``<a class="list-title">`` tags.

    Fewer than *limit* items are returned when the page has fewer matching
    anchors, rather than raising ``IndexError``.
    """
    soup = BeautifulSoup(text, 'html5lib')
    anchors = soup.find_all('a', class_='list-title')
    return [anchor.string for anchor in anchors[:limit]]


def main():
    """Fetch the hot-topic board and print one title per line."""
    url = 'http://top.baidu.com/buzz?b=341'
    for title in get_info(get_html(url)):
        print(title)


if __name__ == '__main__':
    main()
執行結果: