Crawler 10: Baidu Tieba
阿新 • Published: 2018-11-01
""" __title__ = '' __author__ = 'Thompson' __mtime__ = '2018/8/21' # code is far away from bugs with the god animal protecting I love animals. They taste delicious. ┏┓ ┏┓ ┏┛┻━━━┛┻┓ ┃ ☃ ┃ ┃ ┳┛ ┗┳ ┃ ┃ ┻ ┃ ┗━┓ ┏━┛ ┃ ┗━━━┓ ┃ 神獸保佑 ┣┓ ┃ 永無BUG! ┏┛ ┗┓┓┏━┳┓┏┛ ┃┫┫ ┃┫┫ ┗┻┛ ┗┻┛ """ """ __title__ = '' __author__ = 'Thompson' __mtime__ = '2018/8/21' # code is far away from bugs with the god animal protecting I love animals. They taste delicious. ┏┓ ┏┓ ┏┛┻━━━┛┻┓ ┃ ☃ ┃ ┃ ┳┛ ┗┳ ┃ ┃ ┻ ┃ ┗━┓ ┏━┛ ┃ ┗━━━┓ ┃ 神獸保佑 ┣┓ ┃ 永無BUG! ┏┛ ┗┓┓┏━┳┓┏┛ ┃┫┫ ┃┫┫ ┗┻┛ ┗┻┛ """ from urllib import parse from urllib import request from lxml import etree import csv import codecs def ba_spider(): url = 'https://tieba.baidu.com/f?' headers = {} headers['User-Agent'] = "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)" ba_name = input('請輸入貼吧的名字:') word = {'kw': ba_name} begin_page = int(input('起始頁碼:')) end_page = int(input('終止頁碼:')) for page in range(begin_page, end_page+1): word['pn'] = (page-1)*50 wd = parse.urlencode(word) end_url = url + wd req = request.Request(end_url, headers=headers) response = request.urlopen(req) html = response.read().decode() temp = etree.HTML(html) links = temp.xpath("//li[contains(@class,'j_thread_list clearfix')]") print(len(links)) base_url = "https://tieba.baidu.com" for link in links: # 回覆數 pv = link.xpath('./div/div[1]/span[@class="threadlist_rep_num center_text"]/text()')[0] title = link.xpath('./div/div[2]/div[1]/div[1]/a/text()')[0] teizi_url = base_url + link.xpath('./div/div[2]/div[1]/div[1]/a/@href')[0] author = link.xpath('./div/div[2]/div[1]/div[2]/span[1]/span[1]/a/text()') if len(author) > 0: author = author[0] else: author = link.xpath('./div/div[2]/div[1]/div[2]/span[1]/span[2]/a/text()') author = author[0] print('author:', author) with codecs.open('data/tieba_'+ba_name+'.csv', 'a', encoding='utf-8') as file: wr = csv.writer(file) wr.writerow([title, author, pv, teizi_url]) print('Success') ba_spider()