1. 程式人生 > >【爬蟲入門5】爬取酷狗TOP500

【爬蟲入門5】爬取酷狗TOP500

# coding: utf-8
import time
import requests
from bs4 import BeautifulSoup

class spider_KG_top500(object):
    """Scraper for the Kugou TOP500 ranking pages.

    Each ranking page is fetched over HTTP, parsed with BeautifulSoup, and
    every song row found on it is printed as a dict with the keys ``rank``,
    ``name``, ``singer`` and ``song_time``.
    """

    # Last page index that will be requested; a higher index ends the crawl.
    MAX_PAGES = 23
    # Ranking page URL; %s is replaced with the 1-based page index.
    URL_TEMPLATE = r'http://www.kugou.com/yy/rank/home/%s-8888.html?from=rank'
    # Browser-like User-Agent header sent with every request.
    HEADERS = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.104 Safari/537.36 Core/1.53.4882.400 QQBrowser/9.7.13059.400'}
    # Seconds to wait on the server so a stalled connection cannot hang forever.
    REQUEST_TIMEOUT = 10

    def __init__(self):
        """Print a welcome banner; no per-instance state is set up."""
        print('Welcome to spider_KG_top500')

    @staticmethod
    def _split_title(title):
        """Split *title* at its first '-' into two whitespace-stripped strings.

        A title without any '-' yields an empty second part instead of
        raising, and everything after the first '-' is kept in the second
        part rather than being dropped.
        """
        left, _, right = title.partition('-')
        return left.strip(), right.strip()

    def get_song_info(self, page_num):
        """Fetch ranking pages 1..page_num and print one record per song.

        Args:
            page_num: Number of pages to crawl, starting from page 1. The
                crawl stops early (printing 'Spider end!') once the page
                index exceeds ``MAX_PAGES``.

        Returns:
            A list of the record dicts that were printed, in page order.
            Empty when ``page_num`` is zero or negative.

        Exceptions raised by ``requests.get`` (connection failures,
        timeouts) are not caught here and propagate to the caller.
        """
        records = []

        for page in range(1, page_num + 1):
            if page > self.MAX_PAGES:
                print('Spider end!')
                break

            res = requests.get(self.URL_TEMPLATE % page,
                               headers=self.HEADERS,
                               timeout=self.REQUEST_TIMEOUT)
            res.encoding = 'utf-8'
            soup = BeautifulSoup(res.text, 'html.parser')

            rank_tags = soup.select('.pc_temp_num')
            # The '>' child combinators in this selector must be surrounded
            # by spaces.
            info_tags = soup.select('.pc_temp_songlist > ul > li > a')
            time_tags = soup.select('.pc_temp_time')

            # zip() stops at the shortest list, so a row missing any of the
            # three elements truncates the page's output rather than failing.
            for rank_tag, info_tag, time_tag in zip(rank_tags, info_tags, time_tags):
                # NOTE(review): the part before '-' is stored as 'name' and
                # the part after as 'singer', as in the original code; confirm
                # this matches the site's title layout.
                name, singer = self._split_title(info_tag['title'])
                data = {
                    'rank': rank_tag.text.strip(),
                    'name': name,
                    'singer': singer,
                    'song_time': time_tag.text.strip(),
                }
                print('Page %s:\n%s'%(page, data))
                records.append(data)

        return records

if __name__ == '__main__':
    # Keep prompting until the user enters something int() accepts.
    while True:
        try:
            page_num = int(input('please input how many page to spider: '))
            break
        except ValueError:
            # Only a non-numeric entry is retried. EOF on stdin (EOFError)
            # now propagates instead of being swallowed and re-prompting
            # forever.
            print('please input 數字: ')
    # perf_counter() is the clock intended for measuring elapsed durations;
    # unlike time.time() it is unaffected by system clock adjustments.
    start_time = time.perf_counter()
    spider = spider_KG_top500()
    spider.get_song_info(page_num)
    end_time = time.perf_counter()
    spend_time = end_time - start_time
    print('spend: %ss'%spend_time)