1. 程式人生 > python 爬取《延禧攻略》所有的演員參演的電視劇

python 爬取《延禧攻略》所有的演員參演的電視劇

# -*- coding: utf-8 -*-
#@Time :18-9-23 上午11:22
#@Author : LiMeng
#@Email : [email protected]
#@File : yanxigonglvu.py
#Software:PyCharm
import collections
from urllib.parse import urljoin

import jieba
import matplotlib.pyplot as plt
import ppretty
import requests
from bs4 import BeautifulSoup
from scipy.misc import imread
from wordcloud import WordCloud
def _fetch_soup(url, timeout):
    """Download *url* and return it parsed by BeautifulSoup with the lxml parser.

    Raises requests.RequestException on connection problems, timeouts or a
    non-2xx HTTP status.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    # Hand the raw bytes to BeautifulSoup so it can detect the page encoding
    # itself; response.text may fall back to ISO-8859-1 when the server sends
    # no charset, which garbles Chinese text.
    return BeautifulSoup(response.content, 'lxml')


def _lead_actor_links(soup):
    """Return the href of every lead-actor link found on the cast page."""
    cast_list = soup.find('ul', attrs={'class': 'gclearfix'})
    if cast_list is None:
        return []
    links = []
    for item in cast_list.find_all('li'):
        anchor = item.find('a', attrs={'class': 'mh-actor'})
        # Some list items may carry no actor link; skip them instead of crashing.
        if anchor is not None and anchor.get('href'):
            links.append(anchor['href'])
    return links


def _titles_on_actor_page(actor_soup):
    """Return the non-blank text fragments of the works list on an actor page."""
    works = actor_soup.find(
        'ul',
        attrs={'class': 'tn-avatar-list tn-helper-reset tn-helper-clearfix'},
    )
    if works is None:
        return []
    # stripped_strings drops whitespace-only nodes (newlines between tags),
    # which would otherwise be counted as the most frequent "title".
    return list(works.stripped_strings)


def _supporting_actor_names(soup):
    """Return the text of every supporting-actor entry on the cast page."""
    name_box = soup.find('div', attrs={'class': 'mh-name-list'})
    if name_box is None:
        return []
    names = []
    for item in name_box.find_all('li'):
        for paragraph in item.find_all('p'):
            actor_tag = paragraph.find(attrs={'class': 'mh-actor'})
            if actor_tag is not None:
                names.append(actor_tag.get_text())
    return names


def get(url='http://www.tvzn.com/14784/yanyuanbiao.html', timeout=10):
    """Scrape a cast page and draw a word cloud of the lead actors' other works.

    Steps:
      1. Fetch the cast page at *url*.
      2. Follow each lead actor's link and collect the text entries of the
         works list on that actor's page.
      3. Write the supporting actors' names, each followed by a space, to
         ``./data.txt``.
      4. Count the collected titles and display them as a word cloud shaped by
         ``mask.jpeg`` and rendered with ``./simhei.ttf``.

    Args:
        url: Address of the cast page to scrape.
        timeout: Seconds to wait for each HTTP request.

    Raises:
        requests.RequestException: If the cast page itself cannot be fetched.
        ValueError: If no titles could be collected from any actor page.
    """
    soup = _fetch_soup(url, timeout)
    # Actor hrefs are resolved against the site root, so relative,
    # root-relative and absolute hrefs all produce a valid address.
    site_root = urljoin(url, '/')

    titles = []
    for href in _lead_actor_links(soup):
        actor_url = urljoin(site_root, href)
        try:
            actor_soup = _fetch_soup(actor_url, timeout)
        except requests.RequestException as exc:
            # One unreachable actor page should not abort the whole run.
            print('skipping {}: {}'.format(actor_url, exc))
            continue
        titles.extend(_titles_on_actor_page(actor_soup))

    names = _supporting_actor_names(soup)
    # Explicit UTF-8 so Chinese names are written correctly on every platform.
    with open('./data.txt', 'w', encoding='utf-8') as out:
        out.writelines(name + ' ' for name in names)

    if not titles:
        raise ValueError('no titles were found; cannot build a word cloud')

    word_count = collections.Counter(titles)
    # matplotlib's imread is used here because scipy.misc.imread was removed
    # from SciPy 1.2 onwards.
    bg_pic = plt.imread('mask.jpeg')
    wordcloud = WordCloud(
        font_path='./simhei.ttf',
        mask=bg_pic,
        background_color="white",
        width=1000,
        height=860,
        margin=2,
    ).generate_from_frequencies(word_count)
    plt.imshow(wordcloud)
    plt.axis('off')
    plt.show()
if __name__ == "__main__":
    # Script entry point: run the scrape-and-plot routine.
    get()