python 爬取《延禧攻略》所有的演員參演的電視劇
阿新 • 發佈:2018-12-11
# -*- coding: utf-8 -*-
# @Time    : 18-9-23 上午11:22
# @Author  : LiMeng
# @Email   : [email protected]
# @File    : yanxigonglvu.py
# Software : PyCharm
"""Scrape the cast of "Story of Yanxi Palace" from tvzn.com and draw a
word cloud of the TV series its lead actors have appeared in.

Side effects: writes all actor names to ./data.txt, reads ./mask.jpeg and
./simhei.ttf, and shows the word cloud in a matplotlib window.
"""
import collections

import matplotlib.pyplot as plt
import requests
from bs4 import BeautifulSoup
from wordcloud import WordCloud


def get():
    """Crawl the cast page, collect series titles and actor names, then plot.

    Raises whatever ``requests`` raises on network failure, and
    ``AttributeError`` if the page layout changes and a ``find`` returns None.
    """
    url = 'http://www.tvzn.com/14784/yanyuanbiao.html'
    res = requests.get(url=url)
    soup = BeautifulSoup(res.content, 'lxml')

    dianshiju_list = []  # titles of TV series the lead actors appeared in
    name_list = []       # names of the supporting actors

    # Lead actors: each <li> links to the actor's own page.
    for li in soup.find('ul', attrs={'class': 'gclearfix'}).findAll('li'):
        actor_link = li.find('a', attrs={'class': 'mh-actor'})
        href = actor_link.attrs['href']
        # Second-level crawl: fetch the actor's page to list the TV series
        # they have appeared in.
        res1 = requests.get('http://www.tvzn.com/' + href)
        soup1 = BeautifulSoup(res1.text, 'lxml')
        series_ul = soup1.find(
            'ul',
            attrs={'class': 'tn-avatar-list tn-helper-reset tn-helper-clearfix'})
        for text in series_ul.strings:  # every text node under the <ul>
            # Skip whitespace-only nodes; they are truthy NavigableStrings
            # and would otherwise pollute the word-frequency count.
            if text.strip():
                dianshiju_list.append(text)

    # Supporting actors: names live in <p> tags under the name list.
    for li in soup.find('div', attrs={'class': 'mh-name-list'}).findAll('li'):
        for p in li.findAll('p'):
            name_list.append(p.find(attrs={'class': 'mh-actor'}).getText())

    # Persist the names, space-separated (trailing space kept for
    # compatibility with the original output format).
    with open('./data.txt', 'w') as fout:
        for name in name_list:
            fout.write(name)
            fout.write(' ')

    # Build the word cloud from series-title frequencies.
    word_count = collections.Counter(dianshiju_list)
    # NOTE: scipy.misc.imread was removed in SciPy 1.2; plt.imread is the
    # drop-in replacement for loading the mask image.
    bg_pic = plt.imread('mask.jpeg')
    wordcloud = WordCloud(font_path='./simhei.ttf',
                          mask=bg_pic,
                          background_color='white',
                          width=1000,
                          height=860,
                          margin=2).generate_from_frequencies(word_count)
    plt.imshow(wordcloud)
    plt.axis('off')
    plt.show()


if __name__ == '__main__':
    get()