【爬蟲入門5】爬取酷狗TOP500
阿新 • • 發佈:2018-12-26
# -*- coding: utf-8 -*-
"""Crawl the Kugou ranking pages and print rank, title parts and duration."""
import time

import requests
from bs4 import BeautifulSoup

# Highest ranking page that is ever requested; larger requests are truncated.
MAX_PAGE = 23
# Seconds to wait for the server before a request is abandoned.
REQUEST_TIMEOUT = 10
# URL of one ranking page; ``%s`` is replaced by the 1-based page number.
RANK_URL = r'http://www.kugou.com/yy/rank/home/%s-8888.html?from=rank'
# Browser-like User-Agent sent with every request.
HEADERS = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/53.0.2785.104 Safari/537.36 '
        'Core/1.53.4882.400 QQBrowser/9.7.13059.400'
    ),
}


class spider_KG_top500(object):
    """Spider that downloads Kugou ranking pages and prints every song row."""

    def __init__(self):
        """Print a welcome banner."""
        print('Welcome to spider_KG_top500')

    @staticmethod
    def _split_title(title):
        """Split ``title`` at its first hyphen into two stripped strings.

        A title without a hyphen yields the whole title and an empty
        string instead of raising ``IndexError``.
        """
        before, _, after = title.partition('-')
        return before.strip(), after.strip()

    def get_song_info(self, page_num):
        """Fetch ranking pages 1..``page_num`` and print one dict per song.

        Args:
            page_num: Number of pages to crawl. Values above ``MAX_PAGE``
                are truncated to ``MAX_PAGE`` and ``'Spider end!'`` is
                printed once the last allowed page has been processed.
                Zero or negative values crawl nothing.

        Raises:
            requests.RequestException: If a page cannot be downloaded,
                including when the server does not answer within
                ``REQUEST_TIMEOUT`` seconds.
        """
        last_page = min(page_num, MAX_PAGE)
        for current_page in range(1, last_page + 1):
            res = requests.get(
                RANK_URL % current_page,
                headers=HEADERS,
                timeout=REQUEST_TIMEOUT,
            )
            res.encoding = 'utf-8'
            soup = BeautifulSoup(res.text, 'html.parser')

            ranks = soup.select('.pc_temp_num')
            # Original author's note: the tags in the selector below must be
            # separated by spaces.
            links = soup.select('.pc_temp_songlist > ul > li > a')
            durations = soup.select('.pc_temp_time')

            for rank, link, duration in zip(ranks, links, durations):
                # NOTE(review): the text before the hyphen is stored as
                # 'name' and the text after it as 'singer', as the original
                # did; confirm this matches the order used by the site.
                name, singer = self._split_title(link['title'])
                data = {
                    'rank': rank.text.strip(),
                    'name': name,
                    'singer': singer,
                    'song_time': duration.text.strip(),
                }
                print('Page %s:\n%s' % (current_page, data))

        if page_num > MAX_PAGE:
            print('Spider end!')


if __name__ == '__main__':
    # Keep asking until the user enters something int() accepts.
    while True:
        try:
            page_num = int(input('please input how many page to spider: '))
            break
        except ValueError:
            print('please input 數字: ')

    start_time = time.time()
    spider = spider_KG_top500()
    spider.get_song_info(page_num)
    end_time = time.time()
    spend_time = end_time - start_time
    print('spend: %ss' % spend_time)