爬取站長之家免費簡歷
阿新 • • 發佈:2020-10-09
爬取站長之家免費簡歷
# Download the free resume templates listed on sc.chinaz.com.
#
# The script walks the paginated listing, opens every template's detail page,
# and stores the archive behind its download link inside ``dir_name``.
# Entries that cannot be parsed or fetched are reported on stderr and skipped
# so that one bad page does not abort the whole run.
import os
import re
import sys
from urllib.parse import urljoin

import requests
from lxml import etree

# Directory the downloaded archives are written to.
dir_name = './簡歷模板'

# Listing URL template for page 2 and later; page 1 has its own URL.
url = 'http://sc.chinaz.com/jianli/free_%d.html'
first_page_url = 'http://sc.chinaz.com/jianli/free.html'

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36',
}

# Listing pages 1..20 are visited (range end is exclusive).
FIRST_PAGE = 1
LAST_PAGE = 20

# Seconds to wait for a server before giving up on a single request.
REQUEST_TIMEOUT = 15

# Characters replaced in file names because common file systems reject them
# (and '/' or '\\' would otherwise point outside ``dir_name``).
_UNSAFE_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|\r\n\t]')


def _fetch(target):
    """Return the response for a GET of *target*.

    Raises ``requests.RequestException`` on connection problems, timeouts,
    or an HTTP error status.
    """
    response = requests.get(target, headers=headers, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    return response


def _listing_url(page):
    """Return the listing URL for the 1-based *page* number."""
    if page == 1:
        return first_page_url
    return url % page


def _fix_mojibake(text):
    """Undo a UTF-8 page having been decoded as ISO-8859-1.

    If *text* cannot be round-tripped that way (for example because it was
    already decoded correctly), it is returned unchanged instead of raising.
    """
    try:
        return text.encode('iso-8859-1').decode('utf-8')
    except UnicodeError:
        return text


def _safe_filename(name):
    """Return *name* with path separators and reserved characters replaced."""
    return _UNSAFE_FILENAME_CHARS.sub('_', name).strip()


def _download_resume(detail_url):
    """Download the archive linked from one detail page.

    Returns True when a file was written, False when the page did not
    contain the expected title or download link.
    """
    detail_tree = etree.HTML(_fetch(detail_url).text)
    if detail_tree is None:
        return False

    titles = detail_tree.xpath('//div[@class="bgwhite"]/div[1]/h1/text()')
    links = detail_tree.xpath('//div[@id="down"]/div[2]//li[6]/a/@href')
    if not titles or not links:
        return False

    resume_name = _fix_mojibake(titles[0])
    file_stem = _safe_filename(resume_name)
    if not file_stem:
        return False

    # Resolve against the detail page so relative and protocol-relative
    # download links still yield a fetchable URL.
    resume_url = urljoin(detail_url, links[0])
    file_path = os.path.join(dir_name, file_stem + '.rar')

    print(resume_name, '開始下載')
    rar_content = _fetch(resume_url).content
    with open(file_path, 'wb') as f:
        f.write(rar_content)
    print(resume_name, '下載完成')
    return True


def _download_page(page):
    """Download every resume linked from listing page number *page*."""
    listing_url = _listing_url(page)
    try:
        listing_tree = etree.HTML(_fetch(listing_url).text)
    except requests.RequestException as exc:
        print('skipping listing page', listing_url, exc, file=sys.stderr)
        return
    if listing_tree is None:
        return

    for entry in listing_tree.xpath('//div[@id="container"]/div'):
        hrefs = entry.xpath('./a/@href')
        if not hrefs:
            # Not every child <div> is guaranteed to wrap a link.
            continue
        detail_url = urljoin(listing_url, hrefs[0])
        try:
            if not _download_resume(detail_url):
                print('skipping unparsable page', detail_url, file=sys.stderr)
        except requests.RequestException as exc:
            print('skipping', detail_url, exc, file=sys.stderr)


def main():
    """Create the output directory and download all listing pages."""
    os.makedirs(dir_name, exist_ok=True)
    for page in range(FIRST_PAGE, LAST_PAGE + 1):
        _download_page(page)
    print('全部下載完成')


if __name__ == '__main__':
    main()