python爬取拉勾網之selenium
阿新 • • 發佈:2018-12-01
重點程式碼解釋:
1.呼叫lxml的etree實現xpath方法呼叫,xpath相對正則比較簡單,可以不在使用Beauitfulsoup定位
from lxml import etree
2.介面可視化與否:設定無介面(headless)模式可以減少執行時的資源消耗
opt=webdriver.ChromeOptions()
# 把chrome設定成無介面模式,不論windows還是linux都可以,自動適配對應引數
opt.add_argument('--headless')#無介面模式(舊寫法 opt.set_headless() 在 Selenium 4 中已被移除)
self.driver=webdriver.Chrome(options=opt)
3.載入資料時到xpath定位的位置進行爬取
#此句話大致意思:執行 driver 時最多等待 20 秒,什麼時候載入到 xpath 定位的位置,什麼時候停止等待並開始執行頁面內容爬取
WebDriverWait(driver=self.driver,timeout=20).until(EC.presence_of_all_elements_located((By.XPATH,'//*[@id="s_position_list"]/div[2]/div/a[6]')))
4.python 中join()函式strip() 函式和 split() 函式的詳解及例項詳細內容請到我的另一篇部落格
檢視
content = "".join(html.xpath("//dd[@class='job_bt']//text()")).strip()
‘’’
語法: ‘sep’.join(seq)
引數說明
sep:分隔符。可以為空
seq:要連線的元素序列、字串、元組、字典
上面的語法即:以sep作為分隔符,將seq所有的元素合併成一個新的字串
返回值:返回一個以分隔符sep連線各個元素後生成的字串
‘’’
"""Crawl Python job postings from Lagou (lagou.com) with Selenium.

The spider pages through the job-list pages, opens each posting in a new
browser tab, extracts the job fields with lxml/XPath, and can persist
them to MySQL through ``MySQLPipeline``.
"""

from selenium import webdriver
import lxml
from lxml import etree
import re
import time
import pymysql
import urllib.request
import requests
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By


class LagouSpider(object):
    """Selenium-driven spider for the Lagou Python job listings."""

    # XPath of the "next page" button on the job-list page (used both to
    # detect that the page has rendered and to advance the pagination).
    NEXT_BTN_XPATH = '//*[@id="s_position_list"]/div[2]/div/a[6]'

    def __init__(self):
        opt = webdriver.ChromeOptions()
        # Headless mode works on both Windows and Linux and saves display
        # resources.  NOTE: set_headless() was removed in Selenium 4;
        # add_argument("--headless") works on Selenium 3 and 4 alike.
        opt.add_argument("--headless")
        self.driver = webdriver.Chrome(options=opt)
        self.url = "https://www.lagou.com/zhaopin/Python/"

    def run(self):
        """Open the list page and walk every result page until the last one."""
        self.driver.get(self.url)
        while True:
            source = self.driver.page_source
            # Wait (at most 20 s) until the pager button is present, i.e.
            # the list page has finished rendering.
            # BUG FIX: presence_of_all_elements_located takes a single
            # (By, value) locator TUPLE, not two positional arguments --
            # the original call raised TypeError at runtime.
            WebDriverWait(driver=self.driver, timeout=20).until(
                EC.presence_of_all_elements_located(
                    (By.XPATH, self.NEXT_BTN_XPATH)))
            self.parse_list_page(source)
            # Advance to the next page; stop once the button is disabled.
            # (find_element_by_xpath was removed in Selenium 4; the
            # find_element(By.XPATH, ...) form works in 3 and 4.)
            next_btn = self.driver.find_element(By.XPATH, self.NEXT_BTN_XPATH)
            if "pager_next_disabled" in next_btn.get_attribute("class"):
                break
            next_btn.click()
            time.sleep(1)

    def parse_list_page(self, source):
        """Extract every job-detail URL from a list page and crawl each one.

        :param source: raw HTML of one job-list page.
        """
        html = etree.HTML(source)
        links = html.xpath(
            '//*[@id="s_position_list"]/ul/li/div[1]/div[1]/div[1]/a/@href')
        for link in links:
            self.request_detail_page(link)

    def request_detail_page(self, url):
        """Open *url* in a new tab, scrape it, then return to the list tab.

        :param url: absolute URL of one job-detail page.
        """
        self.driver.execute_script("window.open('%s')" % url)
        # Switch the driver handle into the freshly opened tab.
        self.driver.switch_to.window(self.driver.window_handles[1])
        # Wait until the job title has rendered before reading the page.
        WebDriverWait(driver=self.driver, timeout=20).until(
            EC.presence_of_element_located(
                (By.XPATH, "//div[@class='job-name']/span[@class='name']")))
        source = self.driver.page_source
        self.parse_detail_page(source)
        # Close the detail tab and switch back to the list tab.
        self.driver.close()
        self.driver.switch_to.window(self.driver.window_handles[0])

    def parse_detail_page(self, source):
        """Parse one job-detail page and pull out the individual fields.

        :param source: raw HTML of one job-detail page.
        """
        html = etree.HTML(source)
        positionName = html.xpath(
            "//div[@class='position-head']/div/div[1]/div/span/text()")[0]
        job_request_spans = html.xpath(
            "//div[@class='position-head']/div/div[1]/dd/p[1]/span")
        salary = job_request_spans[0].xpath(".//text()")[0].strip()
        # Lagou renders these fields with surrounding whitespace and "/"
        # separators; strip both so only the value remains.
        city = job_request_spans[1].xpath(".//text()")[0].strip()
        city = re.sub(r"[\s/]", "", city)
        work_years = job_request_spans[2].xpath(".//text()")[0].strip()
        work_years = re.sub(r"[\s/]", "", work_years)
        education = job_request_spans[3].xpath(".//text()")[0].strip()
        education = re.sub(r"[\s/]", "", education)
        # Join all text nodes of the description block into one string.
        content = "".join(html.xpath("//dd[@class='job_bt']//text()")).strip()
        # NOTE(review): persistence is disabled, matching the original --
        # uncomment to store each record in MySQL.
        # mysql = MySQLPipeline()
        # mysql.process_item(positionName, salary, city, work_years,
        #                    education, content)


class MySQLPipeline(object):
    """Persist scraped job records into the local ``lagou`` MySQL database."""

    def __init__(self):
        self.conn = pymysql.connect(host="localhost", user="root",
                                    password="root", db="lagou",
                                    charset="utf8")
        self.cursor = self.conn.cursor()

    def process_item(self, positionName, salary, city, work_years,
                     education, content):
        """Insert one job record (parameterized SQL avoids injection)."""
        insert_sql = '''
        insert into lagou_table(positionName,salary,city,work_years,education,content)
        values(%s,%s,%s,%s,%s,%s)
        '''
        self.cursor.execute(insert_sql, (positionName, salary, city,
                                         work_years, education, content))
        self.conn.commit()

    def close_spider(self, spider=None):
        """Release the cursor and the connection.

        *spider* is accepted (and ignored) for Scrapy-style callers; a
        default of ``None`` keeps direct no-argument calls working too.
        """
        self.cursor.close()
        self.conn.close()


if __name__ == "__main__":
    spider = LagouSpider()
    spider.run()