Crawling all playlist information from a music platform with Python
阿新 · Published: 2019-02-06
# coding: utf-8
import requests
import json
import random
from lxml import etree
from spider_project.proxies import proxies  # the author's own proxy list module


class WangYiYunSpider:
    '''Crawl the information of every playlist on the site'''

    def __init__(self):
        self.root_url = 'http://music.163.com'
        self.start_url = 'http://music.163.com/discover/playlist'
        self.classname_list = []  # all subcategory names
        self.class_url = 'http://music.163.com/discover/playlist/?cat={}'
        self.class_url_list = []  # all subcategory urls
        self.playlist_urls = []   # urls of every playlist in the current subcategory
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
                          '(KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
        self.playlist_info = []
        self.classname = ''
        self.proxies = proxies

    def parse_url(self, url=None):
        print(url)
        if url is None:
            # the start page goes through a randomly chosen proxy
            resp = requests.get(self.start_url, headers=self.headers,
                                proxies=random.choice(self.proxies))
        else:
            resp = requests.get(url, headers=self.headers)
        resp.encoding = 'utf-8'
        return resp.text

    def get_cate_name_list(self, html):
        '''Collect every subcategory name from the category list box'''
        dl_list = html.xpath('//div[@id="cateListBox"]//dl')
        for dl in dl_list:
            # cate_name = dl.xpath('./dt/text()')[0]
            classname_list = dl.xpath('./dd/a/text()')
            self.classname_list.extend(classname_list)

    def get_class_url(self):
        '''Build the listing url of every subcategory'''
        for classname in self.classname_list:
            self.class_url_list.append(self.class_url.format(classname))

    def get_playlist(self, html):
        '''Collect playlist links on this page and return the next-page url'''
        # hrefs look like /playlist?id=2174792139 and get root_url prepended later
        playlist_url = html.xpath('//ul[@id="m-pl-container"]//a[@class="msk"]/@href')
        self.playlist_urls.extend(playlist_url)
        try:
            next_url = html.xpath('//a[@class="zbtn znxt"]/@href')[0]
            print('next_url:%s' % next_url)
        except IndexError:
            # no "next" button means this was the last page
            return None
        else:
            return self.root_url + next_url

    def get_playlist_info(self):
        '''Request each playlist detail page and extract its fields'''
        for url in self.playlist_urls:
            url = self.root_url + url
            html_str = self.parse_url(url=url)
            html = etree.HTML(html_str)
            songs = []
            songs_li = html.xpath('//div[@id="song-list-pre-cache"]//li')
            for li in songs_li:
                song_info = {
                    'song_name': li.xpath('.//text()'),
                    'song_link': li.xpath('./a/@href')
                }
                songs.append(song_info)
            info_dict = {
                'class': self.classname,
                'title': html.xpath('//title/text()'),
                'url': url,
                'author': html.xpath('//div[@class="user f-cb"]/a[@class="s-fc7"]/text()'),
                'create_time': html.xpath('//div[@class="user f-cb"]/span[@class="u-icn u-icn-84 "]/text()'),
                'tags': html.xpath('//div[@class="tags f-cb"]/b//text()'),
                'description': html.xpath('//p[@id="album-desc-dot"]/text()'),
                'transmit': html.xpath('//a[@class="u-btni u-btni-share "]/i/text()'),
                'store': html.xpath('//a[@class="u-btni u-btni-fav "]/i/text()'),
                'comments': html.xpath('//span[@id="cnt_comment_count"]/text()'),
                'played_times': html.xpath('//strong[@id="play-count"]/text()'),
                'songs': songs
            }
            self.playlist_info.append(info_dict)
        # clear the url list so the next subcategory starts fresh
        self.playlist_urls = []

    def save_playlist_info(self):
        '''Save the current subcategory's playlist info to a json file'''
        with open('{}.json'.format(self.classname), 'a', encoding='utf-8') as f:
            f.write(json.dumps(self.playlist_info, ensure_ascii=False, indent=4))
        # reset so the next subcategory's file does not accumulate earlier records
        self.playlist_info = []

    def run(self):
        '''Main crawl logic'''
        # request the start url
        html_str = self.parse_url()
        html = etree.HTML(html_str)
        # collect all subcategory names
        self.get_cate_name_list(html)
        # build each subcategory's url
        self.get_class_url()
        '''
        Plan: one folder per top-level category,
        one json file per subcategory,
        one record per playlist,
        one entry per song inside the playlist.
        '''
        # visit the first page of each subcategory
        for url in self.class_url_list:
            print(url)
            html_str = self.parse_url(url=url)
            html = etree.HTML(html_str)
            # subcategory name, used as the output file name
            self.classname = html.xpath('//span[@class="f-ff2 d-flag"]/text()')[0]
            # collect playlist links and the next-page url
            next_url = self.get_playlist(html)
            # keep paging until there is no next page
            while next_url is not None:
                html_str = self.parse_url(url=next_url)
                html = etree.HTML(html_str)
                next_url = self.get_playlist(html)
            # request every collected playlist url and parse the detail page
            self.get_playlist_info()
            # save the detail-page info
            self.save_playlist_info()


if __name__ == '__main__':
    spider = WangYiYunSpider()
    spider.run()
The NetEase Cloud Music site is organised as top-level category → subcategory → playlist → song list, so the crawl walks the same hierarchy. Most site crawlers follow roughly this pattern; the real contest is between the crawler and the target site's backend engineers, but the crawler tends to win in the end.
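To make that hierarchy concrete, here is the rough shape of one record appended to playlist_info and written to the subcategory's json file. This is an abridged sketch with placeholder values rather than real crawl output; lxml's xpath() returns lists, which is why most fields arrive wrapped in one:

# Abridged shape of one playlist record (placeholder values, some fields omitted).
playlist_record = {
    'class': '<subcategory name>',   # also the name of the json file
    'title': ['<playlist title>'],
    'url': 'http://music.163.com/playlist?id=<playlist id>',
    'author': ['<creator name>'],
    'tags': ['<tag 1>', '<tag 2>'],
    'played_times': ['<play count>'],
    'songs': [
        {'song_name': ['<song title>'], 'song_link': ['/song?id=<song id>']},
    ],
}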