用 Python 爬取網頁的列表頁
私募基金公示列表頁:
開啟網頁,按右鍵選擇「檢查」,在 Network 分頁中檢視請求的 url,找出翻頁時 url 的變化規律:
發現翻頁時只有 page 這個引數在變化(例如 page=0、page=1 ……),其餘部分保持不變,因此只要依序替換 page 的值即可取得每一頁。
# -*- coding: utf-8 -*-
"""
Created on Sat Oct 27 11:21:11 2018

@author: Belinda
"""
import csv
import time
from multiprocessing import *

import requests
from lxml import etree
def spider():
    """Scrape the first four pages of the private-fund list into the CSV.

    For each page index 0-3 the listing URL is fetched, the response text is
    parsed as HTML with lxml, and every row of the ``fundlist`` table is
    turned into a tuple of six strings (id, fundName, managerName,
    mandatorName, establishDate, recordTime). Each tuple is printed and
    written with ``writer.writerow``.

    ``writer`` is not a parameter: it is looked up as a module-level name and
    is expected to be the ``csv.writer`` created by the ``__main__`` block
    before this function is called.

    Returns:
        None.

    Raises:
        requests.exceptions.RequestException: if a page cannot be fetched,
            including when the 30-second timeout expires.
    """
    headers = {
        # The literal used to be split with a backslash line continuation;
        # it is rebuilt here from adjacent literals joined by single spaces.
        'User-Agent': ('Mozilla/5.0 (Window NT 10.0; WOW64) '
                       'AppleWebKit/537.36 (KTML,like Gecko) '
                       'Chrome/46.0.2490.80 Safari/537.36'),
    }
    # Only the ``page`` query argument changes from one listing page to the next.
    # NOTE(review): the path contains "/api/" - confirm that a GET on this URL
    # really returns HTML containing the #fundlist table.
    url_template = ('http://gs.amac.org.cn/amac-infodisc/api/pof/fund'
                    '?rand=0.49229080398526315&page={}&size=20')
    # Row-relative XPath of each output column, in CSV column order.
    cell_paths = (
        'td[1]/text()',    # id
        'td[2]/a/text()',  # fundName
        'td[3]/a/text()',  # managerName
        'td[4]/text()',    # mandatorName
        'td[5]/text()',    # establishDate
        'td[6]/text()',    # recordTime
    )

    for page in range(0, 4):
        # A timeout keeps one unresponsive request from hanging the whole run.
        response = requests.get(url_template.format(page),
                                headers=headers, timeout=30)
        time.sleep(1)  # wait one second before processing / the next request

        # Build an lxml tree from the fetched page so that the data can be
        # extracted with XPath.
        selector = etree.HTML(response.text)

        # Select every <tr> of the table body. The expression previously
        # ended in ``tr[1]``, which matches only the first row, so at most
        # one fund per page was ever written.
        simu_list = selector.xpath('//*[@id="fundlist"]/tbody/tr')
        for simu in simu_list:
            # Join the text nodes of each cell; a missing cell yields ''.
            item = tuple(''.join(simu.xpath(path)) for path in cell_paths)
            print(item)
            writer.writerow(item)
if __name__ == '__main__':
    # Script entry point: open the output CSV, make sure it has a header row,
    # run the scraper, then report completion.
    # ``with`` guarantees the file is flushed and closed even if spider()
    # raises part-way through.
    with open("./simuwang.csv", 'a+', encoding="utf-8", newline="") as fp:
        # spider() reads ``writer`` as a module-level name, so it is bound
        # here at module scope rather than passed as an argument.
        writer = csv.writer(fp)

        # The file is opened in append mode, so write the column-name row only
        # when the file is still empty; otherwise every run would add another
        # header line in the middle of the data.
        fp.seek(0, 2)
        if fp.tell() == 0:
            writer.writerow(('id', 'fundName', 'managerName', 'mandatorName',
                             'establishDate', 'recordTime'))

        # Pages are fetched sequentially; the earlier multiprocessing.Pool
        # experiment was commented out and has been dropped.
        spider()
    print("爬取結束!")