1. 程式人生 > >爬蟲大作業

爬蟲大作業

rec att open search bs4 fun ret utf spa

import requests
import re
from bs4 import BeautifulSoup
import jieba.analyse
from PIL import Image, ImageSequence
import numpy as np
import matplotlib.pyplot as plt
from wordcloud import WordCloud, ImageColorGenerator

# Get the total number of pages
def getnum(url):
    """Return the total number of listing pages advertised on ``url``.

    The page is fetched, decoded as gb2312, and the text of the first
    ``.page-next`` element is searched for the "共N頁" marker.

    Args:
        url: Address of a listing page that contains the pager element.

    Returns:
        The page count as a string of digits (callers convert with ``int``).

    Raises:
        IndexError: If the page has no ``.page-next`` element.
        AttributeError: If the pager text does not contain the marker.
    """
    res = requests.get(url, timeout=10)  # a timeout keeps a stalled server from hanging the crawl
    res.encoding = 'gb2312'
    soup = BeautifulSoup(res.text, 'html.parser')
    pager_text = soup.select(".page-next")[0].text
    # NOTE(review): the page is decoded as gb2312 (Simplified Chinese) while
    # this pattern is written with Traditional characters -- confirm that it
    # matches the live page text.
    return re.search(r"共(\d+)頁.*", pager_text).group(1)


# Get all the links on a single page
def getpageurl(url):
    """Visit every movie link found on the listing page ``url``.

    For each ``.list-page ul li`` entry that contains an anchor, the anchor's
    ``href`` is appended to the site root, the resulting URL is printed, and
    ``getinfromation`` (defined later in this file) is called on it.

    Args:
        url: Address of one listing page.
    """
    res = requests.get(url, timeout=10)
    res.encoding = 'gb2312'
    soup = BeautifulSoup(res.text, 'html.parser')
    for item in soup.select(".list-page ul li"):
        anchors = item.select("a")
        if not anchors:
            # Some <li> entries carry no link; skip them.
            continue
        pageurl = 'http://www.ckck.tv/' + anchors[0].attrs['href']
        print(pageurl)
        getinfromation(pageurl)


# Get the information of a page
def getinfromation(url):
    """Scrape one movie page and append its details to ``gzccNews.txt``.

    The title, leading actors, year and region are read from the
    ``.content .movie ul`` section, printed, and appended to the text file as
    one line (the four fields written back to back, then a newline).

    Args:
        url: Address of a single movie detail page.

    Raises:
        IndexError: If the expected ``h1``/``li`` elements are missing.
        AttributeError: If one of the field patterns does not match.
    """
    res = requests.get(url, timeout=10)  # a timeout keeps a stalled server from hanging the crawl
    res.encoding = 'gb2312'
    soup = BeautifulSoup(res.text, 'html.parser')

    title = soup.select(".content .movie ul h1")[0].text
    print("電影:", title)

    # NOTE(review): the page is decoded as gb2312 (Simplified Chinese) while
    # the patterns below are written with Traditional characters -- confirm
    # that they match the live page text.
    fields = soup.select(".content .movie ul li")
    cast_text = fields[1].text
    name = re.search("【主 演】:(.*)", cast_text).group(1)
    print("主演:", name)

    # The fifth <li> holds both the year and the region.
    year_region_text = fields[4].text
    date = re.search("【年 代】:(.*) 【地 區】:", year_region_text).group(1)
    print("年代:", date)
    diqu = re.search("【地 區】:(.*)", year_region_text).group(1)
    print("地區:", diqu)

    # Write the extracted tag contents to the file; ``with`` closes it even
    # if a write fails.
    with open('gzccNews.txt', 'a', encoding='utf-8') as f:
        f.write(title)
        f.write(name)
        f.write(date)
        f.write(diqu)
        f.write("\n")


# Generate the word cloud
def getpicture():
    """Build a word cloud from ``gzccNews.txt`` and save it as ``output.png``.

    Keywords are extracted with jieba's TextRank (top 50, with weights),
    printed, and rendered inside the shape of ``input.jpg`` using the colours
    of that image. The result is written to ``output.png`` and then displayed
    with matplotlib.
    """
    # Read the whole file in one go. Mixing line iteration with read() would
    # consume the first line in the loop header and lose it.
    with open('gzccNews.txt', 'r', encoding='utf-8') as f:
        lyric = f.read()

    # textrank(..., withWeight=True) yields (word, weight) pairs.
    result = jieba.analyse.textrank(lyric, topK=50, withWeight=True)
    keywords = {pair[0]: pair[1] for pair in result}
    print(keywords)

    image = Image.open('input.jpg')
    graph = np.array(image)
    wc = WordCloud(font_path='./fonts/simhei.ttf', background_color='White',
                   max_words=50, mask=graph)
    wc.generate_from_frequencies(keywords)

    # Recolour the cloud with the colours of the mask image.
    image_color = ImageColorGenerator(graph)
    wc.recolor(color_func=image_color)

    # Save before showing: plt.show() blocks until the window is closed.
    wc.to_file('output.png')
    plt.imshow(wc)
    plt.axis("off")
    plt.show()


if __name__ == "__main__":
    url = 'http://www.ckck.tv/xj/Index.html'
    a = getnum(url)
    getpageurl(url)
    # Index.html is page 1; the remaining pages are List_4_2 .. List_4_<a>.
    # range() excludes its end value, so + 1 is needed to reach the last page.
    for i in range(2, int(a) + 1):
        page = 'http://www.ckck.tv/xj/List_4_{}.html'.format(i)
        getpageurl(page)
    getpicture()

首先定義獲取總頁數、獲取單個頁面所有鏈接、獲取頁面信息、生成詞雲等函數。過程中在獲取所有頁面的所有鏈接時出現了一點問題,原因歸結於標籤定位(選擇器)不正確。這次爬取的是一個電影網站,將網站裡面的電影名、主演、年代、地區抓取下來並寫入文件,然後據此生成詞雲。

技術分享圖片

技術分享圖片

技術分享圖片

爬蟲大作業