scrapy實戰爬取電影天堂相關資訊
阿新 • • 發佈:2019-01-30
# encoding: utf-8
import scrapy
from scrapy import Selector
from scrapy import Request
from pacong.items import MovieNews, MovieChina, MovieOuMei, MovieRiHan;
class DmozSpider(scrapy.Spider):
    """Spider for dytt8.net ("Movie Paradise").

    Crawls the homepage category links, follows pagination inside each
    category listing, and scrapes every movie's detail page into one of
    four category-specific item classes (news / China / Europe-US /
    Japan-Korea), which a pipeline then persists.
    """
    name = "movie"
    start_urls = [
        "http://www.dytt8.net"
    ]
    url = "http://www.dytt8.net"
    # Listing/detail pages live on this mirror domain; relative hrefs
    # scraped from the site are resolved against it.
    xq_url = "http://www.ygdy8.net"

    def parse(self, response):
        """Parse the homepage and follow the first few category links.

        Only the first 5 <li> entries of the nav list are real category
        links (newest / China / Europe-US / Japan-Korea ...).
        """
        selector = Selector(response)
        nav_items = selector.xpath('//ul').xpath('li')
        for index, item in enumerate(nav_items):
            if index >= 5:
                continue
            hrefs = item.xpath('a/@href').extract()
            titles = item.xpath('a/text()').extract()
            # Guard: some <li> entries carry no anchor at all; the
            # original unguarded [0] indexing raised IndexError there.
            if not (hrefs and titles):
                continue
            href = hrefs[0]
            title = titles[0]
            if 'http://' in href:
                # The combined index page duplicates the per-category
                # listings, so skip it to avoid double-crawling.
                if href != "http://www.ygdy8.net/html/gndy/index.html":
                    yield Request(href, callback=self.parseItems,
                                  meta={'title': title, 'url': href})
            else:
                full_url = self.xq_url + href
                yield Request(full_url, callback=self.parseItems,
                              meta={'title': title, 'url': full_url})

    def parseItems(self, response):
        """Parse one listing page: queue pagination plus detail pages."""
        title = response.meta['title']
        page_url = response.meta['url']
        selector = Selector(response)
        # Pagination is exposed through <select name="sldd">, whose
        # <option> values are relative page filenames.
        page_options = selector.xpath('//select[@name="sldd"]')
        for href in page_options.xpath('option/@value').extract():
            # Replace the last path segment of the current URL with the
            # option value to build the next page's URL.
            next_url = page_url[0:page_url.rindex('/') + 1] + href
            yield Request(next_url, callback=self.parseItems,
                          meta={'title': title, 'url': next_url})
        # NOTE: the served HTML differs from what browser dev tools
        # show — the real markup nests the movie tables under ul/td.
        movies = selector.xpath('//div[@class="co_content8"]').xpath('ul/td/table')
        for movie in movies:
            names = movie.xpath('tr/td/b/a/text()').extract()
            hrefs = movie.xpath('tr/td/b/a/@href').extract()
            times = movie.xpath('tr/td/font/text()').extract()
            abstracts = movie.xpath('tr/td[@colspan="2"]/text()').extract()
            # Guard: skip table rows that are not real movie entries
            # instead of crashing on empty extract() results.
            if not (names and hrefs and times and abstracts):
                continue
            yield Request(self.xq_url + hrefs[-1], callback=self.parseDetail,
                          meta={'name': names[-1], "time": times[0],
                                'title': title, 'zonghe': abstracts[0]})

    def parseDetail(self, response):
        """Parse a movie detail page and emit a category-specific item."""
        selector = Selector(response)
        zoom = selector.xpath('//div[@id="Zoom"]')
        img = ','.join(zoom.xpath('td/img/@src').extract())
        downloadUrl = zoom.xpath('td/table/tbody/tr/td/a/text()').extract()
        title = response.meta['title']
        if downloadUrl:
            downloadUrl = downloadUrl[0]
        # Dispatch table replaces the four copy-pasted if/elif branches;
        # all four item classes share the same field names.
        item_classes = {
            u"最新影片": MovieNews,
            u"其它電影": MovieRiHan,
            u"歐美電影": MovieOuMei,
            u"國內電影": MovieChina,
        }
        item_cls = item_classes.get(title)
        if item_cls is not None:
            item = item_cls()
            item['movie_name'] = response.meta['name']
            item['movie_time'] = response.meta['time']
            item['movie_image'] = img
            item['movie_abstract'] = response.meta['zonghe']
            item['movie_download'] = downloadUrl
            yield item
嗯，有點小基礎的同學應該不用我翻譯程式碼，下載跑一下估計就明白了。
我大概描述一下吧：首先 parse 方法爬取首頁的分類連結，我只取了 4 個分類，分別是最新、國內、歐美、日韓。然後根據分類連結爬取每個分類下的電影列表，也就是 parseItems 方法，並在裡面實現了下一頁迴圈爬取的迭代；最後根據爬取到的電影詳情頁連結去爬取詳情頁的資料，並存在 mongodb 資料庫裡面。最後執行爬取了 7000 多條電影資料。