1. 程式人生 > 其它 >對Python3解析html的幾種操作方式小結

對Python3解析html的幾種操作方式小結

解析html是爬蟲之後一個重要的資料處理環節。以下記錄解析html的幾種方式。

先介紹基礎的輔助函式,主要用於獲取html並輸出解析後的結果(注意:文中範例程式碼使用了urllib2、SGMLParser等Python 2的模組與寫法)

    # The parser is passed in as an argument so it can be swapped easily below.
    def get_html(url, paraser=bs4_paraser):
        """Download ``url`` and return ``paraser`` applied to the response body.

        Args:
            url: Address of the page to fetch.
            paraser: Callable that receives the (decompressed) body and returns
                the parsed result.  The default is looked up when this ``def``
                is executed, so ``bs4_paraser`` must already be defined.

        Returns:
            Whatever ``paraser`` returns, or ``None`` when the response status
            code is not 200.
        """
        # NOTE: the Host header is fixed to www.360kan.com, so these headers
        # only fit URLs on that host.
        headers = {
            'Accept': '*/*',
            'Accept-Encoding': 'gzip, deflate, sdch',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'Host': 'www.360kan.com',
            'Proxy-Connection': 'keep-alive',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36'
        }
        request = urllib2.Request(url, headers=headers)
        response = urllib2.urlopen(request)
        try:
            if response.code != 200:
                return None
            body = response.read()
            content_encoding = response.info().get('Content-Encoding', '') or ''
        finally:
            # Always release the connection, even if reading fails.
            response.close()
        # Only gunzip when the server says the body is gzip-compressed;
        # feeding an uncompressed body to GzipFile raises an error.
        # Other encodings (e.g. deflate) are passed through unchanged.
        if 'gzip' in content_encoding.lower():
            body = gzip.GzipFile(fileobj=StringIO.StringIO(body)).read()
        return paraser(body)
     
     
    # Fetch the sample page with the lxml-based parser and print every row
    # of the parsed result.
    page_url = 'http://www.360kan.com/m/haPkY0osd0r5UB.html'
    value = get_html(page_url, paraser=lxml_parser)
    for row in value:
        print(row)
    

1,lxml.html的方式進行解析,

The lxml XML toolkit is a Pythonic binding for the C libraries libxml2 and
libxslt. It is unique in that it combines the speed and XML feature
completeness of these libraries with the simplicity of a native Python API,
mostly compatible but superior to the well-known ElementTree API. The latest
release works with all CPython versions from 2.6 to 3.5. See the introduction
for more information about background and goals of the lxml project. Some
common questions are answered in the FAQ.

官網

    def lxml_parser(page):
        """Extract the review entries from ``page`` with lxml XPath queries.

        Args:
            page: HTML markup handed to ``etree.HTML``.

        Returns:
            A list with one dict per ``div.item`` found below any
            ``div.yingping-list-wrap``.  A complete entry has the keys
            ``title``, ``title_href``, ``score``, ``time`` and ``people``.
            An item without a title block yields an empty dict, which is
            what ``bs4_paraser`` produces for the same input.
        """
        data = []
        doc = etree.HTML(page)
        if doc is None:
            # NOTE(review): etree.HTML appears to return None for some empty
            # inputs -- confirm against the installed lxml version.
            return data
        for wrap in doc.xpath('//div[@class="yingping-list-wrap"]'):
            # Every review is one div.item inside the wrapper.
            for item in wrap.xpath('.//div[@class="item"]'):
                value = {}
                # Title section of the review.
                titles = item.xpath('.//div[@class="g-clear title-wrap"][1]')
                if titles:
                    head = titles[0]
                    value['title'] = head.xpath('./a/text()')[0]
                    value['title_href'] = head.xpath('./a/@href')[0]
                    score_text = head.xpath('./div/span/span/@style')[0]
                    score_text = re.search(r'\d+', score_text).group()
                    # Floor division keeps the score an int on Python 2 and 3.
                    value['score'] = int(score_text) // 20
                    # Time of the review.
                    value['time'] = head.xpath('./div/span[@class="time"]/text()')[0]
                    # Number of people who liked it.
                    likes_text = head.xpath('./div[@class="num"]/span/text()')[0]
                    value['people'] = int(re.search(r'\d+', likes_text).group())
                data.append(value)
        return data

2,使用BeautifulSoup,不多說了,大家網上找資料看看

    def bs4_paraser(html):
        """Extract the review entries from ``html`` with BeautifulSoup.

        Args:
            html: HTML markup handed to ``BeautifulSoup`` with the
                ``'html.parser'`` backend.

        Returns:
            A list with one dict per ``div.item`` inside the first
            ``div.yingping-list-wrap``.  A complete entry has the keys
            ``title``, ``title_href``, ``score``, ``time`` and ``people``;
            an item without a title block yields an empty dict.
        """
        all_value = []
        soup = BeautifulSoup(html, 'html.parser')
        # Review section of the page (only the first wrapper is used).
        all_div = soup.find_all('div', attrs={'class': 'yingping-list-wrap'}, limit=1)
        for row in all_div:
            # Every review is one div.item inside the wrapper.
            for r in row.find_all('div', attrs={'class': 'item'}):
                # A fresh dict per item, so entries never share state.
                value = {}
                # Title section of the review.
                title = r.find_all('div', attrs={'class': 'g-clear title-wrap'}, limit=1)
                if title:
                    head = title[0]
                    value['title'] = head.a.string
                    value['title_href'] = head.a['href']
                    score_text = head.div.span.span['style']
                    score_text = re.search(r'\d+', score_text).group()
                    # Floor division keeps the score an int on Python 2 and 3.
                    value['score'] = int(score_text) // 20
                    # Time of the review.
                    value['time'] = head.div.find_all('span', attrs={'class': 'time'})[0].string
                    # Number of people who liked it.
                    likes_text = head.find_all('div', attrs={'class': 'num'})[0].span.string
                    value['people'] = int(re.search(r'\d+', likes_text).group())
                all_value.append(value)
        return all_value
    

3,使用SGMLParser,主要是通過start、end tag的方式進行解析,解析過程比較明朗,但是有點麻煩,而且該案例的場景不太適合該方法,(哈哈)

    class CommentParaser(SGMLParser):
        """Collect review entries from the page using SGMLParser tag callbacks.

        Boolean flags record which of the tracked ``<div>`` classes are
        currently open, and one dict is filled per ``div.item``.  Finished
        dicts are appended to ``self.data``; the keys written are ``href``,
        ``title``, ``score``, ``time`` and ``people``.

        NOTE: the link is stored under ``href``, whereas ``lxml_parser`` and
        ``bs4_paraser`` in this file use ``title_href``.
        """

        def __init__(self):
            SGMLParser.__init__(self)
            # Flags for the tracked <div> classes that are currently open.
            self.__start_div_yingping = False
            self.__start_div_item = False
            self.__start_div_gclear = False
            self.__start_div_ratingwrap = False
            self.__start_div_num = False
            # True while inside the title <a>.
            self.__start_a = False
            # span state: 0 = none, 1 = inside span.rating, 2 = inside
            # span.time, 3 = score span seen, 4 = span inside div.num.
            self.__span_state = 0
            # Entry being filled, and the list of finished entries.
            self.__value = {}
            self.data = []

        def start_div(self, attrs):
            """Raise the flag that matches the class of an opening ``<div>``."""
            for k, v in attrs:
                if k != 'class':
                    continue
                if v == 'yingping-list-wrap':
                    self.__start_div_yingping = True
                elif v == 'item':
                    self.__start_div_item = True
                elif v == 'g-clear title-wrap':
                    self.__start_div_gclear = True
                elif v == 'rating-wrap g-clear':
                    self.__start_div_ratingwrap = True
                elif v == 'num':
                    self.__start_div_num = True

        def end_div(self):
            """Lower the innermost raised flag; closing an item stores its entry."""
            if not self.__start_div_yingping:
                return
            if not self.__start_div_item:
                self.__start_div_yingping = False
                return
            if not self.__start_div_gclear:
                # The item itself is closing: keep the entry and start a new one.
                self.data.append(self.__value)
                self.__value = {}
                self.__start_div_item = False
                return
            if self.__start_div_num or self.__start_div_ratingwrap:
                self.__start_div_num = False
                self.__start_div_ratingwrap = False
            else:
                self.__start_div_gclear = False

        def start_a(self, attrs):
            """Record the link target of an ``<a>`` inside the title block."""
            if self.__start_div_yingping and self.__start_div_item and self.__start_div_gclear:
                self.__start_a = True
                for k, v in attrs:
                    if k == 'href':
                        self.__value['href'] = v

        def end_a(self):
            """Leave the title link."""
            if self.__start_div_yingping and self.__start_div_item and self.__start_div_gclear and self.__start_a:
                self.__start_a = False

        def start_span(self, attrs):
            """Advance the span state; read the score from the inner rating span."""
            if not (self.__start_div_yingping and self.__start_div_item and self.__start_div_gclear):
                return
            if self.__start_div_ratingwrap:
                if self.__span_state != 1:
                    for k, v in attrs:
                        if k == 'class' and v == 'rating':
                            self.__span_state = 1
                        elif k == 'class' and v == 'time':
                            self.__span_state = 2
                else:
                    # Span nested in span.rating: its style carries the score.
                    # A missing style, or one without digits, leaves the score
                    # unset instead of raising.
                    for k, v in attrs:
                        if k == 'style' and v:
                            match = re.search(r'\d+', v)
                            if match:
                                # Floor division keeps the score an int.
                                self.__value['score'] = int(match.group()) // 20
                    self.__span_state = 3
            elif self.__start_div_num:
                self.__span_state = 4

        def end_span(self):
            """Reset the span state on every closing ``</span>``."""
            self.__span_state = 0

        def handle_data(self, data):
            """Store text as title, time or like count, depending on the state."""
            if self.__start_a:
                self.__value['title'] = data
            elif self.__span_state == 2:
                self.__value['time'] = data
            elif self.__span_state == 4:
                # Text without digits (e.g. whitespace) is ignored.
                match = re.search(r'\d+', data)
                if match:
                    self.__value['people'] = int(match.group())
    def sgl_parser(html):
        """Feed ``html`` to a new ``CommentParaser`` and return its ``data`` list."""
        extractor = CommentParaser()
        extractor.feed(html)
        return extractor.data
    

4,HTMLParser,與3原理相似,就是呼叫的方法不太一樣,基本上可以共用,

    class CommentHTMLParser(HTMLParser.HTMLParser):
        """Collect review entries from the page using HTMLParser callbacks.

        Boolean flags record which of the tracked ``<div>`` classes are
        currently open, and one dict is filled per ``div.item``.  Finished
        dicts are appended to ``self.data``; the keys written are ``href``,
        ``title``, ``score``, ``time`` and ``people``.

        NOTE: the link is stored under ``href``, whereas ``lxml_parser`` and
        ``bs4_paraser`` in this file use ``title_href``.
        """

        def __init__(self):
            HTMLParser.HTMLParser.__init__(self)
            # Flags for the tracked <div> classes that are currently open.
            self.__start_div_yingping = False
            self.__start_div_item = False
            self.__start_div_gclear = False
            self.__start_div_ratingwrap = False
            self.__start_div_num = False
            # True while inside the title <a>.
            self.__start_a = False
            # span state: 0 = none, 1 = inside span.rating, 2 = inside
            # span.time, 3 = score span seen, 4 = span inside div.num.
            self.__span_state = 0
            # Entry being filled, and the list of finished entries.
            self.__value = {}
            self.data = []

        def handle_starttag(self, tag, attrs):
            """Update the state for an opening ``div``, ``a`` or ``span`` tag."""
            if tag == 'div':
                for k, v in attrs:
                    if k != 'class':
                        continue
                    if v == 'yingping-list-wrap':
                        self.__start_div_yingping = True
                    elif v == 'item':
                        self.__start_div_item = True
                    elif v == 'g-clear title-wrap':
                        self.__start_div_gclear = True
                    elif v == 'rating-wrap g-clear':
                        self.__start_div_ratingwrap = True
                    elif v == 'num':
                        self.__start_div_num = True
                return
            # <a> and <span> only matter inside the title block of an item.
            if not (self.__start_div_yingping and self.__start_div_item and self.__start_div_gclear):
                return
            if tag == 'a':
                self.__start_a = True
                for k, v in attrs:
                    if k == 'href':
                        self.__value['href'] = v
            elif tag == 'span':
                if self.__start_div_ratingwrap:
                    if self.__span_state != 1:
                        for k, v in attrs:
                            if k == 'class' and v == 'rating':
                                self.__span_state = 1
                            elif k == 'class' and v == 'time':
                                self.__span_state = 2
                    else:
                        # Span nested in span.rating: its style carries the
                        # score.  A missing/valueless style, or one without
                        # digits, leaves the score unset instead of raising.
                        for k, v in attrs:
                            if k == 'style' and v:
                                match = re.search(r'\d+', v)
                                if match:
                                    # Floor division keeps the score an int.
                                    self.__value['score'] = int(match.group()) // 20
                        self.__span_state = 3
                elif self.__start_div_num:
                    self.__span_state = 4

        def handle_endtag(self, tag):
            """Unwind the state for a closing ``div``, ``a`` or ``span`` tag."""
            if tag == 'div':
                if not self.__start_div_yingping:
                    return
                if not self.__start_div_item:
                    self.__start_div_yingping = False
                elif not self.__start_div_gclear:
                    # The item itself is closing: keep the entry, start anew.
                    self.data.append(self.__value)
                    self.__value = {}
                    self.__start_div_item = False
                elif self.__start_div_num or self.__start_div_ratingwrap:
                    self.__start_div_num = False
                    self.__start_div_ratingwrap = False
                else:
                    self.__start_div_gclear = False
            elif tag == 'a':
                if self.__start_div_yingping and self.__start_div_item and self.__start_div_gclear and self.__start_a:
                    self.__start_a = False
            elif tag == 'span':
                self.__span_state = 0

        def handle_data(self, data):
            """Store text as title, time or like count, depending on the state."""
            if self.__start_a:
                self.__value['title'] = data
            elif self.__span_state == 2:
                self.__value['time'] = data
            elif self.__span_state == 4:
                # Text without digits (e.g. whitespace) is ignored.
                match = re.search(r'\d+', data)
                if match:
                    self.__value['people'] = int(match.group())
    def html_parser(html):
        """Feed ``html`` to a new ``CommentHTMLParser`` and return its ``data`` list."""
        extractor = CommentHTMLParser()
        extractor.feed(html)
        return extractor.data
    

3,4對於該案例來說確實是不太適合,趁現在有空記錄下來,供學習使用!

以上這篇對Python3 解析html的幾種操作方式小結就是小編分享給大家的全部內容了,希望能給大家一個參考,也希望大家多多支援指令碼之家。