
Scraping all playlist info from a music platform with Python

# coding: utf-8
import requests
import os
from lxml import etree
import json
from spider_project.proxies import proxies  # local module with a list of proxy dicts (sketched below)
import random


class WangYiYunSpider:
    '''Scrape the information of every playlist on the site'''

    def __init__(self):
        self.root_url = 'http://music.163.com'
        self.start_url = 'http://music.163.com/discover/playlist'
        self.classname_list = []  # names of all subcategories
        self.class_url = 'http://music.163.com/discover/playlist/?cat={}'
        self.class_url_list = []  # listing URLs of all subcategories
        self.playlist_urls = []  # playlist URLs for the current subcategory
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
        self.playlist_info = []  # parsed details for the current subcategory
        self.classname = ''  # current subcategory name, also used as the output filename
        self.proxies = proxies

    def parse_url(self, url=None):
        '''Request a page through a random proxy and return its decoded HTML text.'''
        if url is None:
            url = self.start_url
        print(url)
        resp = requests.get(url,
                            headers=self.headers,
                            proxies=random.choice(self.proxies))
        resp.encoding = 'utf-8'
        return resp.text

    def get_cate_name_list(self, html):
        '''Collect every subcategory name from the category list box.'''
        dl_list = html.xpath('//div[@id="cateListBox"]//dl')
        for dl in dl_list:
            # each <dl> is one major category; its <dd>/<a> nodes are the subcategories
            classname_list = dl.xpath('./dd/a/text()')
            self.classname_list.extend(classname_list)

    def get_class_url(self):
        '''Build the listing URL for every subcategory.'''
        for classname in self.classname_list:
            self.class_url_list.append(self.class_url.format(classname))

    def get_playlist(self, html):
        '''Collect the playlist links on this page and return the next-page URL, or None.'''

        # hrefs are relative, e.g. /playlist?id=2174792139, so root_url is
        # prepended before the detail pages are requested
        playlist_url = html.xpath('//ul[@id="m-pl-container"]//a[@class="msk"]/@href')
        self.playlist_urls.extend(playlist_url)

        try:
            next_url = html.xpath('//a[@class="zbtn znxt"]/@href')[0]
            print('next_url:%s' % next_url)
        except IndexError:
            # no "next page" link means this is the last page
            return None
        else:
            return self.root_url + next_url

    def get_playlist_info(self):
        '''Request every playlist detail page and extract its information.'''
        for url in self.playlist_urls:
            # the stored hrefs are relative, so prepend the site root
            url = self.root_url + url
            html_str = self.parse_url(url=url)
            html = etree.HTML(html_str)

            # extract the song list first
            songs = []
            songs_li = html.xpath('//div[@id="song-list-pre-cache"]//li')
            for li in songs_li:
                song_info = {
                    'song_name': li.xpath('.//text()'),
                    'song_link': li.xpath('./a/@href')
                }
                songs.append(song_info)

            info_dict = {
                'class': self.classname,
                'title': html.xpath('//title/text()'),
                'url': url,
                'author': html.xpath('//div[@class="user f-cb"]/a[@class="s-fc7"]/text()'),
                'create_time': html.xpath('//div[@class="user f-cb"]/span[@class="u-icn u-icn-84 "]/text()'),
                'tags': html.xpath('//div[@class="tags f-cb"]/b//text()'),
                'description': html.xpath('//p[@id="album-desc-dot"]/text()'),
                'transmit': html.xpath('//a[@class="u-btni u-btni-share "]/i/text()'),
                'store': html.xpath('//a[@class="u-btni u-btni-fav "]/i/text()'),
                'comments': html.xpath('//span[@id="cnt_comment_count"]/text()'),
                'played_times': html.xpath('//strong[@id="play-count"]/text()'),
                'songs': songs
            }
            self.playlist_info.append(info_dict)
        # reset the URL list for the next subcategory
        self.playlist_urls = []

    def save_playlist_info(self):
        '''Save the collected playlist info to one JSON file per subcategory.'''
        with open('{}.json'.format(self.classname), 'w', encoding='utf-8') as f:
            f.write(json.dumps(self.playlist_info, ensure_ascii=False, indent=4))
        # reset for the next subcategory, otherwise earlier results leak into later files
        self.playlist_info = []

    def run(self):
        '''Main program logic.'''
        # request the start URL
        html_str = self.parse_url()
        html = etree.HTML(html_str)

        # collect all subcategory names
        self.get_cate_name_list(html)

        # build each subcategory's listing URL
        self.get_class_url()

        # iterate over the URL list to fetch each subcategory's first page
        '''
        one folder per major category
        one JSON file per subcategory
        one record per playlist
        one field per song inside its playlist record
        '''
        for url in self.class_url_list:
            print(url)
            # request the subcategory listing page
            html_str = self.parse_url(url=url)
            html = etree.HTML(html_str)
            # the subcategory name doubles as the output filename
            self.classname = html.xpath('//span[@class="f-ff2 d-flag"]/text()')[0]

            # collect playlist links and the next-page URL
            next_url = self.get_playlist(html)

            # keep requesting and parsing until there is no next page
            while next_url is not None:
                html_str = self.parse_url(url=next_url)
                html = etree.HTML(html_str)
                # collect playlist links and the next-page URL again
                next_url = self.get_playlist(html)

            # request each playlist detail page and extract its information
            self.get_playlist_info()
            # save the extracted information to disk
            self.save_playlist_info()


if __name__ == '__main__':
    spider = WangYiYunSpider()
    spider.run()
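
The import from spider_project.proxies at the top refers to a local module that the post does not show. A minimal sketch of what it might contain, assuming it is simply a list of requests-style proxy dicts (the addresses below are hypothetical placeholders, not real servers):

# spider_project/proxies.py -- hypothetical sketch
# each entry has the shape that requests' proxies= parameter expects,
# so random.choice(proxies) can be passed straight to requests.get()
proxies = [
    {'http': 'http://10.0.0.1:8080', 'https': 'http://10.0.0.1:8080'},
    {'http': 'http://10.0.0.2:8080', 'https': 'http://10.0.0.2:8080'},
]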

NetEase Cloud Music organizes its content as major category - subcategory - playlist - song list, so our crawl follows the same hierarchy. Most websites can be scraped with a broadly similar approach; it mostly comes down to a contest between the crawler author and the target site's backend engineers, though the crawler tends to win in the end.
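
For reference, each record in the per-subcategory JSON file comes out shaped roughly like the sketch below. The field names follow info_dict in the code above; the values are placeholders, and most fields are lists because lxml's xpath() returns a list of matches:

{
    "class": "...",
    "title": ["..."],
    "url": "http://music.163.com/playlist?id=2174792139",
    "author": ["..."],
    "create_time": ["..."],
    "tags": ["...", "..."],
    "description": ["..."],
    "transmit": ["..."],
    "store": ["..."],
    "comments": ["..."],
    "played_times": ["..."],
    "songs": [
        {"song_name": ["..."], "song_link": ["/song?id=..."]}
    ]
}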