1. 程式人生 > python爬蟲,使用urllib + 正則

python爬蟲,使用urllib + 正則

學習使用 urllib + 正則表達式爬取熊貓TV的直播列表,獲取直播人和直播人氣,並按人氣由高到低排序。

from urllib import  request
import re
class Spider():
    """Scrape a Panda TV category page and print its rooms by popularity.

    The pipeline run by :meth:`start` is: download the page, cut out every
    ``video-info`` block with regular expressions, extract the title and the
    viewer-count text from each block, sort by viewer count (highest first)
    and print one ``title---number`` line per room.
    """

    # Category page that is downloaded by __fetch_content.
    url = 'https://www.panda.tv/cate/dota2'
    # Captures the inner HTML of one room's "video-info" <div>; the lazy
    # quantifier stops at the first closing </div>.
    root_pattern = r'<div class="video-info">([\s\S]*?)</div>'
    # Captures the text between a closing </i> and the next </span>.
    title_pattern = r'</i>([\s\S]*?)</span>'
    # Captures the text inside the "video-number" <span>.
    number_pattern = r'<span class="video-number">([\s\S]*?)</span>'

    def __fetch_content(self):
        """Download ``self.url`` and return the body decoded as UTF-8 text.

        Errors raised by ``urlopen`` or by the decoding step propagate to
        the caller unchanged.
        """
        # The context manager closes the HTTP response even when reading or
        # decoding fails (the response used to be left open).
        with request.urlopen(self.url) as response:
            return str(response.read(), encoding='utf-8')

    def __analysis(self, htmls):
        """Return one ``{'title': [...], 'number': [...]}`` dict per room.

        Both values are the raw ``re.findall`` result lists, so either may
        be empty when a block lacks the expected markup.
        """
        total = []
        for room_html in re.findall(self.root_pattern, htmls):
            title = re.findall(self.title_pattern, room_html)
            number = re.findall(self.number_pattern, room_html)
            total.append({'title': title, 'number': number})
        return total

    def __refine(self, total):
        """Yield ``{'title': str, 'number': str}`` with whitespace stripped.

        Only the first match of each field is kept.  Rooms where either
        field was not found are skipped instead of raising ``IndexError``.
        """
        return (
            {
                'title': each_item['title'][0].strip(),
                'number': each_item['number'][0].strip(),
            }
            for each_item in total
            if each_item['title'] and each_item['number']
        )

    def __sort(self, refine_total):
        """Return a new list ordered by viewer count, highest first."""
        return sorted(refine_total, key=self.__sort_seed, reverse=True)

    def __sort_seed(self, each_item):
        """Return the numeric viewer count used as the sort key.

        The first (optionally decimal) number found in the ``'number'`` text
        is used, multiplied by 10000 when the text carries the ten-thousand
        unit.  Text without any digits yields ``0.0``.
        """
        text = each_item['number']
        # r'\d*' also matched the empty string and stopped at the decimal
        # point, so "1.5萬" was read as 1 and digit-less text crashed float().
        found = re.search(r'\d+(?:\.\d+)?', text)
        if found is None:
            return 0.0
        number = float(found.group())
        # Accept both the traditional and the simplified ten-thousand unit.
        if '萬' in text or '万' in text:
            number *= 10000
        return number

    def __show(self, refine_total):
        """Print one ``title---number`` line per room, in the given order."""
        for room in refine_total:
            print(room['title'] + '---' + room['number'])

    def start(self):
        """Run the whole fetch -> parse -> clean -> sort -> print pipeline."""
        htmls = self.__fetch_content()
        total = self.__analysis(htmls)
        refine_total = list(self.__refine(total))
        refine_total = self.__sort(refine_total)
        self.__show(refine_total)


# Only hit the network when executed as a script, not when imported.
if __name__ == '__main__':
    spider = Spider()
    spider.start()