[Anti-Crawler] YunSuo Server-Security Anti-Crawling
阿新 • Published: 2021-10-14
What is YunSuo?
YunSuo (雲鎖) is actually a server-security product, and anti-crawling is not its main business. It does, however, include CC-attack (HTTP-flood) protection, and crawler traffic looks exactly like a low-frequency CC attack. As a result, a direct request to the target site does not return the target content; you get a JavaScript verification page instead.
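You can see the block with a quick probe. This is a minimal sketch, assuming the same target site as the spider below; the marker strings ("security_verify_", "YunSuoAutoJump") are the ones that spider checks for, and requests stands in for whatever HTTP client you use:

# Minimal probe: a direct GET returns the YunSuo JS challenge, not the listing.
import requests

resp = requests.get("http://scjdglj.hlbe.gov.cn/chufa/qiangzhi2/",
                    headers={"User-Agent": "Mozilla/5.0"})
print(resp.status_code)
# True when the request was intercepted by the challenge page:
print("security_verify_" in resp.text or "YunSuoAutoJump" in resp.text)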
How does YunSuo block crawlers?
(1) It bans high-frequency IPs.
(2) It sets a verification cookie via an in-page JavaScript challenge (sketched below).
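The cookie challenge works like this: the blocked response embeds a small script (YunSuoAutoJump, reproduced inside the spider below) that hex-encodes the screen size and the current URL, writes the URL into an srcurl cookie, and reloads the page with a security_verify_data query parameter. The encoding is trivial to port to Python; this sketch just renames the page's stringToHex:

# Python port of the challenge page's stringToHex(): concatenate the
# unpadded lowercase hex of each character code, with no separators.
def string_to_hex(s: str) -> str:
    return "".join(format(ord(c), "x") for c in s)

# "1920,1080" encodes to the exact value the spider below sends
# as security_verify_data:
print(string_to_hex("1920,1080"))  # -> 313932302c31303830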
# -*- coding: UTF-8 -*-
import copy
import datetime
import os
import sys
import time
import traceback

import scrapy
from bs4 import BeautifulSoup
from urllib.parse import urlencode

from spiders.market_supervision_penalty.govement_penalty_base_spider import govement_penalty_base_spider
from spiders.base_spiders.base_spider import *
from utils.common_util import *  # provides pyv8_engine_service, deepCopy, getSetcookie2Str
from utils.date_util import current_datetime
from config.proxy.config import *


class nmg_market_gov_hlbe(govement_penalty_base_spider):
    name = "nmg_market_gov_hlbe"

    def __init__(self, increment=None, *args, **kwargs):
        super(nmg_market_gov_hlbe, self).__init__(*args, **kwargs)
        self.increment = increment
        self.headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Host': 'scjdglj.hlbe.gov.cn',
            'Referer': 'http://scjdglj.hlbe.gov.cn/chufa/qiangzhi2/',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36',
        }

    def start_requests(self):
        index_url = "http://scjdglj.hlbe.gov.cn/chufa/qiangzhi2/"
        yield scrapy.Request(url=index_url, method='GET', headers=self.headers,
                             encoding="utf-8", dont_filter=True)

    def parse(self, response):
        """Solve the YunSuo JS challenge: run the page's YunSuoAutoJump() logic
        to build the srcurl cookie and the security_verify_data value."""
        resp_url = response.url
        resp_meta = copy.deepcopy(response.meta)
        try:
            resp_js = '''
                var screen = {width: 1920, height: 1080};
                var cookie = null;
                var location = null;
                var window = {location: {href: 'http://scjdglj.hlbe.gov.cn/chufa/qiangzhi2/'}};
                function stringToHex(str) {
                    var val = "";
                    for (var i = 0; i < str.length; i++) {
                        if (val == "") val = str.charCodeAt(i).toString(16);
                        else val += str.charCodeAt(i).toString(16);
                    }
                    return val;
                }
                function YunSuoAutoJump() {
                    var width = screen.width;
                    var height = screen.height;
                    var screendate = width + "," + height;
                    var curlocation = window.location.href;
                    if (-1 == curlocation.indexOf("_security_verify_")) {
                        cookie = "srcurl=" + stringToHex(window.location.href) + ";path=/;";
                    }
                    location = stringToHex(screendate);
                    return [location, cookie];
                }
            '''
            # The JS returns [hex(screen size), "srcurl=<hex url>;path=/;"];
            # keep only the "srcurl=<hex url>" cookie pair.
            scurl = pyv8_engine_service(resp_js, functionName='YunSuoAutoJump').split(',')[1].split(';')[0]
            header = deepCopy(self.headers)
            cookie = response.headers["Set-Cookie"].decode().split(";")[0]
            header["cookie"] = cookie + ';' + scurl
            # 313932302c31303830 is stringToHex("1920,1080"), the spoofed screen size.
            cookie_url = 'http://scjdglj.hlbe.gov.cn/chufa/qiangzhi2/?security_verify_data=313932302c31303830'
            yield scrapy.Request(url=cookie_url, method='GET', headers=header,
                                 meta={**resp_meta, 'cookie': cookie},
                                 encoding="utf-8", dont_filter=True, callback=self.parse_scurl)
        except Exception:
            traceback.print_exc()
            self.logger.info(f"parse error url: {resp_url}")

    def parse_scurl(self, response):
        """Check whether verification succeeded, then resume the crawl where it left off."""
        resp_url = response.url
        resp_meta = copy.deepcopy(response.meta)
        try:
            resp_soup = BeautifulSoup(response.text, 'html5lib')
            cookie_str, cookie_dict = getSetcookie2Str(response)
            if "security_session_mid_verify" in cookie_str:
                # Verified: carry the full cookie string on all further requests,
                # and resume whichever step recorded itself in meta.
                header = deepCopy(self.headers)
                header["cookie"] = cookie_str
                if "list" in str(resp_meta):
                    yield scrapy.Request(url=resp_meta['list'], method='GET', headers=header,
                                         encoding="utf-8", dont_filter=True,
                                         callback=self.parse_list, meta=resp_meta)
                elif "detail" in str(resp_meta):
                    yield scrapy.Request(url=resp_meta['detail'], method='GET', headers=header,
                                         encoding="utf-8", dont_filter=True,
                                         callback=self.parse_detail, meta=resp_meta)
                else:
                    yield scrapy.Request(url='http://scjdglj.hlbe.gov.cn/chufa/qiangzhi2/',
                                         method='GET', headers=header, encoding="utf-8",
                                         dont_filter=True, callback=self.parse_number,
                                         meta=resp_meta)
            else:
                # Not verified yet: retry with the session cookie from parse().
                header = deepCopy(self.headers)
                header["cookie"] = resp_meta['cookie']
                yield scrapy.Request(url=resp_url, method='GET', headers=header,
                                     meta=resp_meta, encoding="utf-8", dont_filter=True,
                                     callback=self.parse_scurl)
        except Exception:
            traceback.print_exc()
            self.logger.info(f"parse error url: {resp_url}")

    def parse_number(self, response):
        """Fan out over the paginated list (the source crawls 2 pages either way)."""
        resp_url = response.url
        resp_meta = copy.deepcopy(response.meta)
        try:
            search_number = 2 if self.increment else 2
            for index in range(1, search_number + 1):
                if index > 1:
                    send_url = 'http://scjdglj.hlbe.gov.cn/chufa/qiangzhi2/{}/'.format(index)
                    yield scrapy.Request(url=send_url, method='GET', headers=self.headers,
                                         encoding="utf-8", dont_filter=True,
                                         callback=self.parse_list)
                else:
                    yield scrapy.Request(url=resp_url, method='GET', headers=self.headers,
                                         encoding="utf-8", dont_filter=True,
                                         callback=self.parse_list)
        except Exception:
            traceback.print_exc()
            self.logger.info(f"parse error url: {resp_url}")

    def parse_list(self, response):
        """Extract detail links from a list page; if the challenge reappears, re-verify."""
        resp_url = response.url
        try:
            resp_soup = BeautifulSoup(response.text, 'html5lib')
            if "security_verify_" not in response.text:
                detail_list = resp_soup.select('div.w670 li')[1:21]
                for detail in detail_list:
                    if "href" in str(detail):
                        detail_url = response.urljoin(detail.select_one('a')['href'])
                        meta = {
                            'pub_time': detail.select('div')[-1].text.strip(),
                            'docno': detail.select_one('div').text,
                            'fileno': detail.select('div')[1].text,
                        }
                        yield scrapy.Request(url=detail_url, method='GET', headers=self.headers,
                                             meta=meta, encoding="utf-8", dont_filter=True,
                                             callback=self.parse_detail)
            else:
                # Challenge hit again mid-crawl: go back through parse() and
                # record which list page to resume afterwards.
                yield scrapy.Request(url=resp_url, method='GET', headers=self.headers,
                                     meta={"list": resp_url}, encoding="utf-8",
                                     dont_filter=True, callback=self.parse)
        except Exception:
            traceback.print_exc()
            self.logger.info(f"parse error url: {resp_url}")
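For readers who only want the handshake without the Scrapy scaffolding, here is a minimal sketch using requests. It assumes the same target site; the cookie name (security_session_mid_verify), the srcurl cookie, and the security_verify_data value are taken from the spider above, and everything else is illustrative rather than a definitive implementation:

import requests

BASE = "http://scjdglj.hlbe.gov.cn/chufa/qiangzhi2/"

def string_to_hex(s: str) -> str:
    # Same encoding as the page's stringToHex(): unpadded lowercase hex.
    return "".join(format(ord(c), "x") for c in s)

session = requests.Session()
session.headers["User-Agent"] = "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"

first = session.get(BASE)  # step 1: challenge page + session cookie
if "security_verify_" in first.text:
    # step 2: replay YunSuoAutoJump(): echo the URL as srcurl,
    # report the "screen size" via security_verify_data.
    session.cookies.set("srcurl", string_to_hex(BASE))
    session.get(BASE, params={"security_verify_data": string_to_hex("1920,1080")})

final = session.get(BASE)  # step 3: should now be the real listing
print("security_session_mid_verify" in session.cookies.get_dict(), len(final.text))

Note the design choice the spider makes on top of this: because the verification cookie can expire mid-crawl, parse_list checks every response for "security_verify_" and, on a hit, routes back through parse() with the interrupted URL stored in meta, so the crawl resumes exactly where it was blocked.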