Python 爬蟲:使用 urllib + 正則表達式
阿新 • • 發佈:2018-11-19
學習使用 urllib + 正則表達式爬取熊貓TV的直播頁面,獲取各直播間的主播名稱和直播人氣,並按人氣由高到低排序。
from urllib import request
import re
class Spider():
    """Scrape a panda.tv category page and print its streams ranked by viewers.

    The pipeline run by :meth:`start` is: download the page, cut it into
    per-stream HTML fragments with regular expressions, extract the title
    and viewer-count text of every fragment, sort by viewer count
    (descending) and print one ``title---number`` line per stream.
    """

    # Page that is downloaded by __fetch_content.
    url = 'https://www.panda.tv/cate/dota2'
    # One match per stream card; group 1 is the card's inner HTML
    # (non-greedy, so it stops at the first closing </div>).
    root_pattern = r'<div class="video-info">([\s\S]*?)</div>'
    # Title text: everything between a closing </i> and the next </span>.
    title_pattern = r'</i>([\s\S]*?)</span>'
    # Viewer-count text inside the "video-number" span.
    number_pattern = r'<span class="video-number">([\s\S]*?)</span>'

    def __fetch_content(self):
        """Download ``self.url`` and return the response body as a str.

        The body is decoded as UTF-8. The response is used as a context
        manager so the underlying connection is always closed.
        """
        with request.urlopen(self.url) as response:
            raw = response.read()
        return str(raw, encoding='utf-8')

    def __analysis(self, htmls):
        """Split the page into stream cards and collect raw regex matches.

        Returns a list of dicts ``{'title': [...], 'number': [...]}`` where
        each value is the (possibly empty) list returned by ``re.findall``
        for that card.
        """
        return [
            {
                'title': re.findall(self.title_pattern, card),
                'number': re.findall(self.number_pattern, card),
            }
            for card in re.findall(self.root_pattern, htmls)
        ]

    def __refine(self, total):
        """Reduce every raw card to its first title/number, stripped.

        Cards for which either pattern matched nothing are skipped instead
        of raising ``IndexError``. Returns a list of
        ``{'title': str, 'number': str}`` dicts in the original order.
        """
        refined = []
        for item in total:
            titles, numbers = item['title'], item['number']
            if not titles or not numbers:
                # Incomplete card (markup differs from what the patterns
                # expect): nothing meaningful to show or rank.
                continue
            refined.append({
                'title': titles[0].strip(),
                'number': numbers[0].strip(),
            })
        return refined

    def __sort(self, refine_total):
        """Return a new list of the rooms ordered by viewer count, highest first."""
        return sorted(refine_total, key=self.__sort_seed, reverse=True)

    def __sort_seed(self, each_tiem):
        """Sort key: convert a room's viewer-count text to a float.

        The first decimal number in ``each_tiem['number']`` is used (so
        "1.5" keeps its fractional part). If the text contains the
        ten-thousand unit, in either its Simplified ("万") or Traditional
        ("萬") form, the value is multiplied by 10000. Text without any
        digits yields ``0.0`` so such rooms sort last instead of crashing.
        """
        text = each_tiem['number']
        match = re.search(r'\d+(?:\.\d+)?', text)
        if match is None:
            return 0.0
        number = float(match.group())
        if '万' in text or '萬' in text:
            number *= 10000
        return number

    def __show(self, refine_total):
        """Print one ``title---number`` line per room, in the given order."""
        for room in refine_total:
            print(room['title'] + '---' + room['number'])

    def start(self):
        """Run the whole pipeline: fetch, parse, clean, sort and print."""
        htmls = self.__fetch_content()
        total = self.__analysis(htmls)
        refine_total = list(self.__refine(total))
        refine_total = self.__sort(refine_total)
        self.__show(refine_total)
# Script entry: create the spider and run it once. start() downloads
# Spider.url, parses it and prints the rooms sorted by viewer count.
# NOTE(review): this runs at import time as well, since there is no
# `if __name__ == "__main__":` guard.
spider = Spider()
spider.start()