
This is a set of crawler templates you can look up at any time, handy to keep around while writing a crawler.


urllib.request

from urllib.request import Request, urlopen
from urllib.parse import urlencode

url = ''
headers = {
    'user-agent': '',
    'cookie': '',
    'x-requested-with': ''
}

def fetch(filename):
    request = Request(url=url, headers=headers)
    response = urlopen(request)   # send the request
    assert response.code == 200
    resp_text = response.read()   # read the response body
    # write the data: open with 'wb' for images/binary, plain 'w' is enough for text
    with open(filename, 'wb') as f:
        f.write(resp_text)

if __name__ == '__main__':
    fetch('filename')  # call it
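The template imports urlencode but never uses it. Below is a minimal sketch of a POST request with urlencode; the endpoint and form fields are hypothetical, and the headers dict defined above is reused. Form data has to be encoded to bytes before it is passed to Request.

from urllib.request import Request, urlopen
from urllib.parse import urlencode

def post_example():
    # hypothetical endpoint and form fields, for illustration only
    data = urlencode({'keyword': 'python', 'page': 1}).encode('utf-8')
    # a Request that carries data is sent as POST; headers is the dict defined above
    request = Request(url='http://example.com/search', data=data, headers=headers)
    response = urlopen(request)
    return response.read()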
    
 #######################################################

import requests

def getHTMLText(url):
    try:
        r = requests.get(url, timeout=30)
        # for a POST request:
        # r = requests.post(url, data=data, headers=headers)
        r.raise_for_status()  # raises HTTPError if the status code is not 200
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        return "request failed"

if __name__ == "__main__":
    url = "http://www.baidu.com"
    print(getHTMLText(url))
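The same pattern extends to requests that need query parameters, a User-Agent header, or a JSON response; a minimal sketch, with a hypothetical example URL:

import requests

def get_json(url, params=None):
    headers = {'User-Agent': 'Mozilla/5.0'}   # many sites reject the default requests UA
    try:
        r = requests.get(url, params=params, headers=headers, timeout=30)
        r.raise_for_status()
        return r.json()                       # parse a JSON response body
    except requests.RequestException:
        return None

# hypothetical example call
# get_json('https://httpbin.org/get', params={'q': 'python'})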

bs4

from bs4 import BeautifulSoup
import urllib.request

url = ''
headers = {
    'user-agent': '',
    'cookie': '',
    'x-requested-with': ''
}
req = urllib.request.Request(url, headers=headers)
resp = urllib.request.urlopen(req)
html_text = resp.read()

# parse the raw HTML into a BeautifulSoup document tree
html = BeautifulSoup(html_text, 'lxml')
with open('filename', 'w', encoding='utf-8') as f:
    f.write(html.prettify())
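The template only re-serializes the page; a minimal extraction sketch follows, continuing from the parsed html object above. The tag and class names are hypothetical placeholders.

# extract data from the parsed tree (tag and class names are hypothetical)
title = html.find('title').get_text()
links = [a.get('href') for a in html.find_all('a')]   # every <a href=...>
items = html.select('div.item > span')                # CSS selector syntax
for span in items:
    print(span.get_text(strip=True))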
    

xpath

# -*- coding: utf-8 -*-
import requests
from lxml import etree
import os

if __name__ == '__main__':
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36'
    }
    url = 'http://sc.chinaz.com/jianli/free.html'
    page_text = requests.get(url=url, headers=headers).text
    tree = etree.HTML(page_text)
    div_list = tree.xpath('//div[@id="main"]/div/div')

    if not os.path.exists('./muban'):
        os.mkdir('./muban')

    url_list = []
    name_list = []
    for div in div_list:
        # detail page of one resume template
        muban_href = div.xpath('./a/@href')[0]
        detail_text = requests.get(url=muban_href, headers=headers).text
        download_tree = etree.HTML(detail_text)
        # template name from the breadcrumb; re-decode to fix the mojibake
        name = download_tree.xpath('//div[@class="bread clearfix"]/a[3]/text()')[0] + '.rar'
        download_name = name.encode('iso-8859-1').decode('utf-8')
        # first mirror in the download list
        download_url = download_tree.xpath('//div[@class="clearfix mt20 downlist"]/ul/li[1]/a/@href')[0]
        url_list.append(download_url)
        name_list.append(download_name)

    for name, url in zip(name_list, url_list):
        print(name, url)
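The template creates ./muban but never saves anything into it; a minimal sketch of the missing download step, reusing url_list, name_list and headers from above:

# download each archive into ./muban (continuation of the script above)
for name, url in zip(name_list, url_list):
    data = requests.get(url=url, headers=headers).content   # binary body of the .rar
    path = os.path.join('./muban', name)
    with open(path, 'wb') as f:
        f.write(data)
    print('saved', path)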



########################################
import requests
from lxml import etree

class RequestError(Exception):
    pass

class ParseError(Exception):
    pass

def get(url):
    headers = {
        'user-agent': ''
    }
    resp = requests.get(url, headers=headers)
    if resp.status_code == 200:
        parse(resp.text)
    else:
        raise RequestError('request failed')

def parse(html):
    root = etree.HTML(html)
    divs = root.xpath('//div')
    '''
    //      search relative to the whole document
    ./      relative to the current node
    .//     search anywhere below the current node
    //title/text()  extract text
    //img/@src      extract an attribute
    to read the page's content type and charset, grab the first <meta> tag
    '''
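Following the note in the docstring, here is a minimal sketch of reading the charset from the first <meta> tag; it continues the module above (etree is already imported) and tries the two common meta forms:

def get_charset(html):
    root = etree.HTML(html)
    # <meta charset="utf-8">
    charset = root.xpath('//meta[1]/@charset')
    if charset:
        return charset[0]
    # <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
    content = root.xpath('//meta[@http-equiv="Content-Type"]/@content')
    if content and 'charset=' in content[0]:
        return content[0].split('charset=')[-1]
    return None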

re

import re
import os
import requests

headers = {
    'user-agent': '',
    'cookie': '',
    'x-requested-with': ''
}
url = ''
resp = requests.get(url=url, headers=headers)
resp.encoding = 'utf-8'
assert resp.status_code == 200
html = resp.text
with open('filename.html', 'w', encoding='utf-8') as f:
    f.write(html)

# lazy-loaded pages often keep the real address in src2, so try that pattern first
compile_ = re.compile(r'<img src2="(.*?)" alt="(.*?)">')
compile_2 = re.compile(r'<img src="(.*?)" alt="(.*?)">')
imgs = compile_.findall(html)
if len(imgs) == 0:
    imgs = compile_2.findall(html)
print(len(imgs), imgs, sep='\n')
next_url = re.findall(r'<a href="(.*?)" class="nextpage">下一頁</a>', html, re.S)  # link to the next page
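The matches are only printed; a minimal sketch of the download step, reusing imgs and headers from above. The alt text is used as the file name and the directory name is arbitrary.

# save each matched image; imgs holds (src, alt) tuples from the regexes above
os.makedirs('./imgs', exist_ok=True)
for src, alt in imgs:
    img_data = requests.get(url=src, headers=headers).content
    filename = os.path.join('./imgs', alt + os.path.splitext(src)[1])
    with open(filename, 'wb') as f:
        f.write(img_data)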

'''
Regex notes
^   start of the string
$   end of the string
\w  letter, digit or underscore
\b  word boundary
\s  whitespace
\W  anything that is not a letter, digit or underscore
\S  anything that is not whitespace
\D  anything that is not a digit
\B  a position that is not a word boundary
'''
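A small sketch exercising a few of these metacharacters; the input string is made up:

import re

sample = 'price: 199 CNY, code_42'
print(re.findall(r'\d+', sample))        # ['199', '42'] - runs of digits
print(re.findall(r'\w+', sample))        # words: letters, digits, underscores
print(re.search(r'^price', sample))      # ^ anchors the match at the start
print(re.search(r'\bcode_42\b', sample)) # \b marks word boundaries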




selenium

# Template 1
from selenium import webdriver
import unittest
from selenium.webdriver.support.wait import WebDriverWait

class TestLogin(unittest.TestCase):
    # pick the browser
    def setUp(self):
        self.driver = webdriver.Firefox(executable_path=r"F:\Program Files (x86)\Mozilla Firefox\geckodriver.exe")
        # open the url
        self.driver.get("http://192.168.1.151:8080/login?from=%2F")

    # perform the login
    def test_login(self):
        username = "test001"
        password = "pass001"

        # locate the username field
        WebDriverWait(self.driver, 10).until(lambda x: x.find_element_by_id('j_username')).clear()
        WebDriverWait(self.driver, 10).until(lambda x: x.find_element_by_id('j_username')).send_keys(username)
        # locate the password field
        WebDriverWait(self.driver, 10).until(lambda x: x.find_element_by_name('j_password')).clear()
        WebDriverWait(self.driver, 10).until(lambda x: x.find_element_by_name('j_password')).send_keys(password)
        # click the login button
        self.driver.find_element_by_id("yui-gen1-button").click()
        # assert that the login succeeded
        currUrl = self.driver.current_url
        print("currUrl: " + currUrl)
        if currUrl == "http://192.168.1.151:8080/":
            print("success")
        else:
            print("failure")

    # close the browser
    def tearDown(self):
        self.driver.quit()


if __name__ == "__main__":
    unittest.main()
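The lambda-based waits above work, but selenium also ships expected_conditions for the same job; a minimal sketch of the equivalent explicit wait, using the same locators as the template:

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def wait_and_type(driver, locator, text, timeout=10):
    # block until the element is present, then fill it in
    element = WebDriverWait(driver, timeout).until(
        EC.presence_of_element_located(locator))
    element.clear()
    element.send_keys(text)

# usage inside test_login:
# wait_and_type(self.driver, (By.ID, 'j_username'), username)
# wait_and_type(self.driver, (By.NAME, 'j_password'), password)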
##### scraping a job-listing site
import re
import json
import time

import requests
from selenium import webdriver
from selenium.webdriver import Chrome, ActionChains
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.remote.webelement import WebElement
from selenium.webdriver.common.by import By
from selenium.webdriver.support import ui,expected_conditions


from utils.get_headers import user_agent

headers = user_agent()

chrome = Chrome(executable_path='chromedriver')

def get_all_city():
    url = 'https://www.zhaopin.com/citymap'
    resp = requests.get(url,headers=headers)
    if resp.status_code == 200:
        # resp.encoding = 'gbk'
        html = resp.text
        s = re.search(r'<script>__INITIAL_STATE__=(.*?)</script>', html)
        json_data = s.group(1)   # the captured JSON blob, not the whole <script> tag
        data = json.loads(json_data)
        cityMapList = data['cityList']['cityMapList']
        for letter, citys in cityMapList.items():
            print(f'---{letter}---')
            for city in citys:
                yield city


def get_city_job(url):
    chrome.get(url)

    # locate the search box
    search = chrome.find_element_by_xpath('//input[@class="zp-search__input"]')

    search.send_keys('python')
    # scroll the window (left commented out)
    # chrome.execute_script('window.scrollTop(2000,document.body.scrollwidth)')
    btn = chrome.find_element_by_class_name('zp-search__btn--blue')
    btn.click()
    # the results open in a second browser window, so switch to it
    chrome.switch_to.window(chrome.window_handles[1])
    time.sleep(10)  # pause here and log in manually if the site asks for it
    divs = chrome.find_elements_by_class_name('iteminfo')
    for div in divs:
        title = div.find_element(By.XPATH, './/span[@class="iteminfo__line1__jobname__name"]').text
        salary = div.find_element(By.XPATH, './/p[@class="iteminfo__line2__jobdesc__salary"]').text
        # the welfare tags under .//div[@class="iteminfo__line3__welfare"] can be read the same way
        print(title, salary)
def get_city_jobs2(url):
    pass

if __name__ == '__main__':
    for city in get_all_city():   # get_all_city is a generator, so it has to be iterated
        print(city)
    get_city_job('https://www.zhaopin.com/citymap')
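The scraped title and salary are only printed; a minimal sketch of persisting them, assuming a hypothetical list of (title, salary) tuples collected inside get_city_job:

import csv

def save_jobs(rows, path='jobs.csv'):
    # rows is a list of (title, salary) tuples collected in get_city_job
    with open(path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['title', 'salary'])
        writer.writerows(rows)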