spider———迴圈爬取花火所有期刊
阿新 • • 發佈:2018-12-18
熟悉soup和xpath方法,尋找HTML標籤
"""Scrape every issue of the "花火" (Huahuo) magazine from m.feiyanqing.com.

Walks the mobile-site catalog page, downloads each listed article
(following multi-page articles), saves every article as a UTF-8 .txt
file, and organises the files into one folder per issue.
"""
import os
import shutil
import time
import urllib.request

from bs4 import BeautifulSoup
from lxml import etree

# Site root; all hrefs scraped from the pages are site-relative.
BASE_URL = 'https://m.feiyanqing.com'


def handle_request(url):
    """Return a urllib Request for *url* carrying a desktop-browser User-Agent.

    The site rejects the default Python UA, so a Chrome UA is spoofed.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36',
    }
    return urllib.request.Request(url=url, headers=headers)


def _fetch_html(url):
    """Download *url* and return its body decoded as UTF-8 text."""
    request = handle_request(url)
    return urllib.request.urlopen(request).read().decode('utf8')


def _fetch_soup(url):
    """Download *url* and return a BeautifulSoup over its HTML."""
    return BeautifulSoup(_fetch_html(url), 'lxml')


def get_content(text_src):
    """Return the full body text of the article at *text_src*.

    Articles may span several pages: page 1 lives at the original URL and
    pages 2..n at ``<name>_<i>.<ext>``.  When present, the pagination
    widget (``.pagelink li b``) holds the total page count; otherwise the
    article is a single page.
    """
    soup = _fetch_soup(text_src)
    pager = soup.select('.pagelink > ul > li > b')

    if not pager:
        # Single-page article: each paragraph followed by one newline.
        return ''.join(p.text + '\n' for p in soup.select('.zw > p'))

    pages = int(pager[0].text)
    # rsplit keeps any dots inside the path intact (the original
    # split('.') indexing broke on URLs with more than three dots).
    base, ext = text_src.rsplit('.', 1)

    parts = []
    for i in range(1, pages + 1):
        # Page 1 has no "_1" suffix; subsequent pages do.
        page_url = text_src if i == 1 else '%s_%d.%s' % (base, i, ext)
        page_soup = _fetch_soup(page_url)
        # Multi-page articles use a blank line between paragraphs.
        parts.append(''.join(p.text + '\n' + '\n'
                             for p in page_soup.select('.zw > p')))
    return ''.join(parts)


def parse_catalog_text(catalog_text, title):
    """Download every article listed in one issue's catalog page.

    *catalog_text* is the issue page HTML; each article is written to
    ``<name>.txt`` and then moved into the *title* directory.
    """
    tree = etree.HTML(catalog_text)
    hrefs = tree.xpath('//div[@class="boxcon"]/ul//h3/a/@href')
    names = tree.xpath('//div[@class="boxcon"]/ul//h3/a/text()')

    for href, text_name in zip(hrefs, names):
        text_src = BASE_URL + href
        print("正在下載--%s--...." % text_name)
        content = get_content(text_src)
        filename = '%s.txt' % text_name
        # 'with' guarantees the handle is closed even if writing fails
        # (the original leaked it when get_content raised).
        with open(filename, 'w', encoding='utf8') as fp:
            fp.write(text_name + content)
        print("結束下載--%s--.." % text_name)
        time.sleep(2)  # be polite to the server
        shutil.move(filename, title)


def parse_catalog_content(catalog_content, dirname):
    """Prompt for how many issues to fetch, then download each one.

    Each issue gets its own directory (named after the catalog entry),
    which is moved under *dirname* once fully downloaded.
    """
    soup = BeautifulSoup(catalog_content, 'lxml')
    catalog = soup.select('.boxcon > ul > li > p > a')
    num = int(input("請輸入下載總目錄個數--"))

    for entry in catalog[:num]:
        title = entry.string
        if not os.path.exists(title):
            os.mkdir(title)
        catalog_src = BASE_URL + entry.attrs['href']
        catalog_text = _fetch_html(catalog_src)
        parse_catalog_text(catalog_text, title)
        shutil.move(title, dirname)
        time.sleep(2)  # be polite to the server


def main():
    """Entry point: create the output root and start the catalog crawl."""
    dirname = '花火'
    if not os.path.exists(dirname):
        os.mkdir(dirname)
    url = BASE_URL + '/huahuo/'
    catalog_content = _fetch_html(url)
    parse_catalog_content(catalog_content, dirname)


if __name__ == '__main__':
    main()