[Python] [爬蟲] 5.批量政府網站的招投標、中標資訊爬取和推送的自動化爬蟲——網頁下載器
阿新 • • 發佈:2018-11-09
目錄
1.Intro
檔名:pageDownloader.py
模組名:網頁下載器
引用庫:
selenium | random | sys |
socket | time | urllib2 |
自定義引用檔案:configManager、pageResolver、dataDisposer、Console_Color
Console_Color原始碼:
#!/usr/bin/env python
# -*- coding:utf-8 -*-
'''
Author: YSW
Time: 2018-7-20
Version: 1.0
Describe: Console colour printing helpers (ANSI escape sequences)
Update: None
'''
import sys

# Python 2 only: switch the interpreter default encoding to UTF-8.
# reload()/setdefaultencoding() do not exist on Python 3, so the hack is
# skipped there instead of crashing at import time.
if sys.version_info[0] == 2:
    reload(sys)
    sys.setdefaultencoding('utf-8')

# Escape sequence layout:
#   opening part: \033[<display type>;<foreground>;<background>m
#   closing part: \033[0m
#   full form   : <\033[display;fore;back m><text to print><\033[0m>
# Any spaces inside the text are printed as well.

# Display type parameters
DISPLAY_TYPE = {
    "預設值": 0,
    "高亮": 1,
    "非粗體": 22,
    "下劃線": 4,
    "非下劃線": 24,
    "閃爍": 5,
    "非閃爍": 25,
    "反顯": 7,
    "非反顯": 27,
}

# Foreground colour parameters
FOREGROUND_COLOR = {
    "黑色": 30,
    "紅色": 31,
    "綠色": 32,
    "黃色": 33,
    "藍色": 34,
    "洋紅": 35,
    "青色": 36,
    "白色": 37,
}

# Background colour parameters
BACKGROUND_COLOR = {
    "黑色": 40,
    "紅色": 41,
    "綠色": 42,
    "黃色": 43,
    "藍色": 44,
    "洋紅": 45,
    "青色": 46,
    "白色": 47,
}


def _as_key(name):
    '''
    Normalise a colour/display name to the native ``str`` type used as
    dictionary keys above.
    :param name: name given as text or as UTF-8 encoded bytes
    :return: the name as a native ``str``
    '''
    if isinstance(name, str):
        return name
    if isinstance(name, bytes):
        # Python 3 bytes -> text
        return name.decode('utf-8')
    # Python 2 unicode -> UTF-8 byte string
    return name.encode('utf-8')


def _parameters_3(type, forecolor, backcolor):
    '''
    Look up the three numeric codes.
    :param type: display type name
    :param forecolor: foreground colour name
    :param backcolor: background colour name
    :return: tuple (display code, foreground code, background code)
    :raises KeyError: if a name is not in the corresponding table
    '''
    display_type = DISPLAY_TYPE[_as_key(type)]
    foreground_color = FOREGROUND_COLOR[_as_key(forecolor)]
    back_color = BACKGROUND_COLOR[_as_key(backcolor)]
    return display_type, foreground_color, back_color


def _parameters_2(type, forecolor):
    '''
    Look up the two numeric codes.
    :param type: display type name
    :param forecolor: foreground colour name
    :return: tuple (display code, foreground code)
    :raises KeyError: if a name is not in the corresponding table
    '''
    display_type = DISPLAY_TYPE[_as_key(type)]
    foreground_color = FOREGROUND_COLOR[_as_key(forecolor)]
    return display_type, foreground_color


def print_color_back(str, type, forecolor, backcolor):
    '''
    Print text with a foreground and a background colour.
    :param str: text to print
    :param type: display type name
    :param forecolor: foreground colour name
    :param backcolor: background colour name
    :return: None (the text is written to stdout)
    '''
    display_type, foreground_color, back_color = _parameters_3(type, forecolor, backcolor)
    head = "\033[{0};{1};{2}m".format(display_type, foreground_color, back_color)
    end = "\033[0m"
    str_color = head + str + end
    print(str_color)


def print_color(str, type="預設值", forecolor="綠色"):
    '''
    Print coloured text.
    :param str: text to print
    :param type: display type name
    :param forecolor: foreground colour name
    :return: None (the text is written to stdout)
    '''
    display_type, foreground_color = _parameters_2(type, forecolor)
    head = "\033[{0};{1}m".format(display_type, foreground_color)
    end = "\033[0m"
    str_color = head + str + end
    print(str_color)


def print_color_line(str, sign, step, type="預設值", forecolor="綠色"):
    '''
    Print coloured text framed by a line of symbols above and below.
    :param str: text to print
    :param sign: symbol used to build the frame lines
    :param step: number of times the symbol is repeated
    :param type: display type name
    :param forecolor: foreground colour name
    :return: None (the text is written to stdout)
    '''
    display_type, foreground_color = _parameters_2(type, forecolor)
    head = "\033[{0};{1}m".format(display_type, foreground_color)
    end = "\033[0m"
    # The opening and closing sequences are printed on their own lines.
    print(head)
    print(sign*int(step))
    print(str)
    print(sign*int(step))
    print(end)
功能:大部分網站都是JS動態載入的,所以主要通過selenium獲取網頁原始碼,urllib2獲取靜態頁面原始碼。
2.Source
#!/usr/bin/env python
# -*- coding: utf-8 -*-
'''
# Author : YSW
# Time : 2018/6/6 14:03
# File : pageDownloader.py
# Version : 2.0
# Describe: 網頁下載器
# Update :
1.重構了網頁下載器,只儲存當天的資料
2.優化了排序演算法
'''
from selenium import webdriver
# 引入配置物件DesiredCapabilities
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
import configManager
import random
import sys
import socket
import time
import pageResolver
import dataDisposer
import urllib2
from Lib import Console_Color
# Set the default encoding to UTF-8 to avoid garbled Chinese characters
# (Python 2 idiom: sys must be reloaded before setdefaultencoding is callable).
defaultencoding = 'utf-8'
if sys.getdefaultencoding() != defaultencoding:
    reload(sys)
    sys.setdefaultencoding(defaultencoding)
# Request headers used for static-page fetches; the User-Agent is picked at
# random from configManager.headers once, when this module is imported.
HEADERS = {
    "User-Agent": random.choice(configManager.headers)
}
# URL settings read from configManager
# (presumably tender sites and winning-bid sites respectively — confirm in configManager).
URL = configManager.urlData
URL_ZB = configManager.urlData_ZB
class DownLoader(object):
def __init__(self, headers):
    '''
    Store the browser header and prepare the PhantomJS capabilities.
    :param headers: value used as the PhantomJS user agent setting
    '''
    self.headers = headers
    # Start from a copy of the stock PhantomJS capabilities
    capabilities = dict(DesiredCapabilities.PHANTOMJS)
    # Disguise the headless browser with the supplied user agent
    capabilities["phantomjs.page.settings.userAgent"] = headers
    # Skip image loading so pages are fetched much faster
    capabilities["phantomjs.page.settings.loadImages"] = False
    self.dcap = capabilities
@staticmethod
def get_url(url, proxy_dict):
    '''
    Fetch a static page through the given proxy.
    :param url: URL of the static page
    :param proxy_dict: proxy settings with 'ip', 'port' and 'protocol' keys
    :return: raw body of the response as returned by read()
    '''
    proxy_address = "{0}:{1}".format(proxy_dict['ip'], proxy_dict['port'])
    proxy_handler = urllib2.ProxyHandler({proxy_dict['protocol']: proxy_address})
    # NOTE: install_opener() makes this proxy the process-wide default for
    # urllib2.urlopen(); kept as-is because other code may rely on it.
    urllib2.install_opener(urllib2.build_opener(proxy_handler))
    request = urllib2.Request(url=url, headers=HEADERS)
    response = urllib2.urlopen(request)
    try:
        return response.read()
    finally:
        # Always release the connection; it was previously left open.
        response.close()
def current_time_parse(self, current_date):
    '''
    Split a date-like object into its month and day.
    :param current_date: object exposing ``month`` and ``day`` attributes
    :return: tuple (month, day)
    '''
    return current_date.month, current_date.day
def check_exist(self, tender_table, condition1, value1, condition2, value2):
    '''
    Tell whether a record is still missing from the table.
    :param tender_table: table object exposing ``find(query)``
    :param condition1: first field name
    :param value1: value expected for the first field
    :param condition2: second field name
    :param value2: value expected for the second field
    :return: True when no record matches both conditions, False otherwise
    '''
    query = {condition1: value1, condition2: value2}
    matches = list(tender_table.find(query))
    return len(matches) == 0
#### 招投標資料 ####
def downloader_ynsggzxxt(self, url, proxy_dict):
    '''
    Downloader for the Yunnan public resources trading centre e-service
    system (construction projects): collects the records published today
    and stores the ones not yet in the table.
    :param url: listing page opened in PhantomJS
    :param proxy_dict: proxy settings with 'ip', 'port' and 'protocol' keys
    :return: None
    '''
    website_name = "雲南省公共資源交易中心電子服務系統_工程建設"
    Console_Color.print_color("[+] 當前網站:{0}".format(website_name), forecolor="青色")
    # Format every argument on its own: previously only the second string was
    # formatted, so PhantomJS received the literal '--proxy={0}:{1}'.
    service_args = [
        '--proxy={0}:{1}'.format(proxy_dict['ip'], proxy_dict['port']),
        '--proxy-type={0}'.format(proxy_dict['protocol']),
    ]
    driver = webdriver.PhantomJS(desired_capabilities=self.dcap, service_args=service_args)
    data_list = []
    try:
        driver.get(url)
        resolver = pageResolver.Resolver()
        dataSaver = dataDisposer.DataStore('ynsggzxxt')
        tender_table = dataSaver.tender_table()
        # (month, day) of the current date; only these two parts are compared
        today = tuple(int(part) for part in self.current_time_parse(dataDisposer.current_time()))
        print("[*] 開始儲存資料")
        for page_num in range(1, 201):
            # Socket timeout plus a pause between pages to avoid being detected
            socket.setdefaulttimeout(20)
            time.sleep(3)
            print("[+] 正在抓取第{0}頁資訊".format(page_num))
            html = driver.page_source
            print("[+] 抓取成功")
            resolve_result = resolver.resovler_ynsggzxxt(html, page_num)
            try:
                finished = False
                # Newest first: the first record not from today ends the crawl
                for resolve in sorted(resolve_result, key=lambda x: x['釋出時間'], reverse=True):
                    published = self.current_time_parse(resolve['釋出時間'])
                    if tuple(int(part) for part in published) != today:
                        print("[+] 獲取完成")
                        finished = True
                        break
                    data_list.append(resolve)
                if finished:
                    break
                # Click "next page" to load the following batch of records
                driver.find_element_by_class_name("mmggxlh").find_element_by_link_text('下一頁').click()
            except Exception:
                # Best effort: a missing next-page link (last page) or any
                # parsing problem simply ends the crawl.
                break
    finally:
        # Always terminate PhantomJS, even when scraping fails midway
        driver.quit()

    def describe(item):
        # One-line summary shared by the console listing and the save log
        return "[+] 專案編號:{0},公告標題:{1},釋出時間:{2},截止時間:{3},狀態:{4},連結:{5}".format(
            item["專案編號"], item["公告標題"].encode('utf-8'), item["釋出時間"],
            item["截止時間"], item["狀態"].encode('utf-8'), item["連結"])

    for resolve in data_list:
        Console_Color.print_color(describe(resolve))
    for resolve in data_list:
        if self.check_exist(tender_table, condition1="專案編號", value1=resolve["專案編號"],
                            condition2="公告標題", value2=resolve["公告標題"]):
            # Store only records that are not in the table yet
            dataSaver.insert_data(resolve)
            print(describe(resolve))
            print("[+] 儲存成功")
def downloader_ynsggzxxt_zf(self, url, proxy_dict):
    '''
    Downloader for the Yunnan public resources trading centre e-service
    system (government procurement): collects the records published today
    and stores the ones not yet in the table.
    :param url: listing page opened in PhantomJS
    :param proxy_dict: proxy settings with 'ip', 'port' and 'protocol' keys
    :return: None
    '''
    website_name = "雲南省公共資源交易中心電子服務系統_政府採購"
    Console_Color.print_color("[+] 當前網站:{0}".format(website_name), forecolor="青色")
    # Format every argument on its own: previously only the second string was
    # formatted, so PhantomJS received the literal '--proxy={0}:{1}'.
    service_args = [
        '--proxy={0}:{1}'.format(proxy_dict['ip'], proxy_dict['port']),
        '--proxy-type={0}'.format(proxy_dict['protocol']),
    ]
    driver = webdriver.PhantomJS(desired_capabilities=self.dcap, service_args=service_args)
    data_list = []
    try:
        driver.get(url)
        resolver = pageResolver.Resolver()
        dataSaver = dataDisposer.DataStore('ynsggzxxt_zf')
        tender_table = dataSaver.tender_table()
        # (month, day) of the current date; only these two parts are compared
        today = tuple(int(part) for part in self.current_time_parse(dataDisposer.current_time()))
        print("[*] 開始儲存資料")
        for page_num in range(1, 201):
            # Socket timeout plus a pause between pages to avoid being detected
            socket.setdefaulttimeout(20)
            time.sleep(3)
            print("[+] 正在抓取第{0}頁資訊".format(page_num))
            html = driver.page_source
            print("[+] 抓取成功")
            resolve_result = resolver.resovler_ynsggzxxt_zf(html, page_num)
            try:
                finished = False
                # Newest first: the first record not from today ends the crawl
                for resolve in sorted(resolve_result, key=lambda x: x['釋出時間'], reverse=True):
                    published = self.current_time_parse(resolve['釋出時間'])
                    if tuple(int(part) for part in published) != today:
                        print("[+] 獲取完成")
                        finished = True
                        break
                    data_list.append(resolve)
                if finished:
                    break
                # Click "next page" to load the following batch of records
                driver.find_element_by_class_name("mmggxlh").find_element_by_link_text('下一頁').click()
            except Exception:
                # Best effort: a missing next-page link (last page) or any
                # parsing problem simply ends the crawl.
                break
    finally:
        # Always terminate PhantomJS, even when scraping fails midway
        driver.quit()

    def describe(item):
        # One-line summary shared by the console listing and the save log
        return "[+] 專案編號:{0},公告標題:{1},釋出時間:{2},截止時間:{3},狀態:{4},連結:{5}".format(
            item["專案編號"], item["公告標題"].encode('utf-8'), item["釋出時間"],
            item["截止時間"], item["狀態"].encode('utf-8'), item["連結"])

    for resolve in data_list:
        Console_Color.print_color(describe(resolve))
    for resolve in data_list:
        if self.check_exist(tender_table, condition1="專案編號", value1=resolve["專案編號"],
                            condition2="公告標題", value2=resolve["公告標題"]):
            # Store only records that are not in the table yet
            dataSaver.insert_data(resolve)
            print(describe(resolve))
            print("[+] 儲存成功")
def downloader_ynsggzzw(self, url, proxy_dict):
    '''
    Downloader for the (old) Yunnan public resources trading centre site,
    construction projects: collects the records published today and stores
    the ones not yet in the table.
    :param url: listing page opened in PhantomJS
    :param proxy_dict: proxy settings with 'ip', 'port' and 'protocol' keys
    :return: None
    '''
    website_name = "雲南省公共資源交易中心網_工程建設"
    Console_Color.print_color("[+] 當前網站:{0}".format(website_name), forecolor="青色")
    # Format every argument on its own: previously only the second string was
    # formatted, so PhantomJS received the literal '--proxy={0}:{1}'.
    service_args = [
        '--proxy={0}:{1}'.format(proxy_dict['ip'], proxy_dict['port']),
        '--proxy-type={0}'.format(proxy_dict['protocol']),
    ]
    driver = webdriver.PhantomJS(desired_capabilities=self.dcap, service_args=service_args)
    data_list = []
    try:
        driver.get(url)
        resolver = pageResolver.Resolver()
        dataSaver = dataDisposer.DataStore('ynsggzzw')
        tender_table = dataSaver.tender_table()
        # (month, day) of the current date; only these two parts are compared
        today = tuple(int(part) for part in self.current_time_parse(dataDisposer.current_time()))
        print("[*] 開始儲存資料")
        for page_num in range(1, 101):
            # Socket timeout plus a pause between pages to avoid being detected
            socket.setdefaulttimeout(20)
            time.sleep(3)
            print("[+] 正在抓取第{0}頁資訊".format(page_num))
            html = driver.page_source
            print("[+] 抓取成功")
            resolve_result = resolver.resovler_ynsggzzw(html, page_num)
            try:
                finished = False
                # Newest first: the first record not from today ends the crawl
                for resolve in sorted(resolve_result, key=lambda x: x['釋出時間'], reverse=True):
                    published = self.current_time_parse(resolve['釋出時間'])
                    if tuple(int(part) for part in published) != today:
                        print("[+] 獲取完成")
                        finished = True
                        break
                    data_list.append(resolve)
                if finished:
                    break
                # Click "next page" to load the following batch of records
                driver.find_element_by_class_name("mmggxlh").find_element_by_link_text('下一頁').click()
            except Exception:
                # Best effort: a missing next-page link (last page) or any
                # parsing problem simply ends the crawl.
                break
    finally:
        # Always terminate PhantomJS, even when scraping fails midway
        driver.quit()

    def describe(item):
        # One-line summary shared by the console listing and the save log
        return "[+] 專案編號:{0},公告標題:{1},釋出時間:{2},連結:{3}".format(
            item["專案編號"], item["公告標題"].encode('utf-8'), item["釋出時間"], item["連結"])

    for resolve in data_list:
        Console_Color.print_color(describe(resolve))
    for resolve in data_list:
        if self.check_exist(tender_table, condition1="專案編號", value1=resolve["專案編號"],
                            condition2="公告標題", value2=resolve["公告標題"]):
            # Store only records that are not in the table yet
            dataSaver.insert_data(resolve)
            print(describe(resolve))
            print("[+] 儲存成功")
def downloader_kmsgg(self, url, proxy_dict):
    '''
    Downloader for the Kunming public resources trading centre site
    (government procurement): collects the records published today and
    stores the ones not yet in the table.
    :param url: listing page opened in PhantomJS
    :param proxy_dict: proxy settings with 'ip', 'port' and 'protocol' keys
    :return: None
    '''
    website_name = "昆明市公共資源交易中心網_政府採購"
    Console_Color.print_color("[+] 當前網站:{0}".format(website_name), forecolor="青色")
    # Format every argument on its own: previously only the second string was
    # formatted, so PhantomJS received the literal '--proxy={0}:{1}'.
    service_args = [
        '--proxy={0}:{1}'.format(proxy_dict['ip'], proxy_dict['port']),
        '--proxy-type={0}'.format(proxy_dict['protocol']),
    ]
    driver = webdriver.PhantomJS(desired_capabilities=self.dcap, service_args=service_args)
    data_list = []
    try:
        driver.get(url)
        resolver = pageResolver.Resolver()
        dataSaver = dataDisposer.DataStore('kmsgg')
        tender_table = dataSaver.tender_table()
        # (month, day) of the current date; only these two parts are compared
        today = tuple(int(part) for part in self.current_time_parse(dataDisposer.current_time()))
        print("[*] 開始儲存資料")
        for page_num in range(1, 51):
            # Socket timeout plus a pause between pages to avoid being detected
            socket.setdefaulttimeout(20)
            time.sleep(3)
            print("[+] 正在抓取第{0}頁資訊".format(page_num))
            html = driver.page_source
            print("[+] 抓取成功")
            resolve_result = resolver.resovler_kmsgg(html, page_num)
            try:
                finished = False
                # Newest first: the first record not from today ends the crawl
                for resolve in sorted(resolve_result, key=lambda x: x['釋出時間'], reverse=True):
                    published = self.current_time_parse(resolve['釋出時間'])
                    if tuple(int(part) for part in published) != today:
                        print("[+] 獲取完成")
                        finished = True
                        break
                    data_list.append(resolve)
                if finished:
                    break
                # Click "next page" to load the following batch of records
                driver.find_element_by_id("btnAjax_NextPage").click()
            except Exception:
                # Best effort: a missing next-page button (last page) or any
                # parsing problem simply ends the crawl.
                break
    finally:
        # Always terminate PhantomJS, even when scraping fails midway
        driver.quit()

    def describe(item):
        # One-line summary shared by the console listing and the save log
        return "[+] 編號:{0},工程名稱:{1},釋出時間:{2},結束時間:{3},狀態:{4},連結:{5}".format(
            item["編號"], item["工程名稱"], item["釋出時間"],
            item["結束時間"], item["狀態"], item["連結"])

    for resolve in data_list:
        Console_Color.print_color(describe(resolve))
    for resolve in data_list:
        if self.check_exist(tender_table, condition1="編號", value1=resolve["編號"],
                            condition2="工程名稱", value2=resolve["工程名稱"]):
            # Store only records that are not in the table yet
            dataSaver.insert_data(resolve)
            print(describe(resolve))
            print("[+] 儲存成功")
def downloader_kmsgg_gc(self, url, proxy_dict):
    '''
    Downloader for the Kunming public resources trading centre site
    (construction projects): collects the records published today and
    stores the ones not yet in the table.
    :param url: listing page opened in PhantomJS
    :param proxy_dict: proxy settings with 'ip', 'port' and 'protocol' keys
    :return: None
    '''
    website_name = "昆明市公共資源交易中心網_工程建設"
    Console_Color.print_color("[+] 當前網站:{0}".format(website_name), forecolor="青色")
    # Format every argument on its own: previously only the second string was
    # formatted, so PhantomJS received the literal '--proxy={0}:{1}'.
    service_args = [
        '--proxy={0}:{1}'.format(proxy_dict['ip'], proxy_dict['port']),
        '--proxy-type={0}'.format(proxy_dict['protocol']),
    ]
    driver = webdriver.PhantomJS(desired_capabilities=self.dcap, service_args=service_args)
    data_list = []
    try:
        driver.get(url)
        resolver = pageResolver.Resolver()
        dataSaver = dataDisposer.DataStore('kmsgg_gc')
        tender_table = dataSaver.tender_table()
        # (month, day) of the current date; only these two parts are compared
        today = tuple(int(part) for part in self.current_time_parse(dataDisposer.current_time()))
        print("[*] 開始儲存資料")
        for page_num in range(1, 51):
            # Socket timeout plus a pause between pages to avoid being detected
            socket.setdefaulttimeout(20)
            time.sleep(3)
            print("[+] 正在抓取第{0}頁資訊".format(page_num))
            html = driver.page_source
            print("[+] 抓取成功")
            resolve_result = resolver.resovler_kmsgg_gc(html, page_num)
            try:
                finished = False
                # Newest first: the first record not from today ends the crawl
                for resolve in sorted(resolve_result, key=lambda x: x['釋出時間'], reverse=True):
                    published = self.current_time_parse(resolve['釋出時間'])
                    if tuple(int(part) for part in published) != today:
                        print("[+] 獲取完成")
                        finished = True
                        break
                    data_list.append(resolve)
                if finished:
                    break
                # Click "next page" to load the following batch of records
                driver.find_element_by_id("btnAjax_NextPage").click()
            except Exception:
                # Best effort: a missing next-page button (last page) or any
                # parsing problem simply ends the crawl.
                break
    finally:
        # Always terminate PhantomJS, even when scraping fails midway
        driver.quit()

    def describe(item):
        # One-line summary shared by the console listing and the save log
        return "[+] 編號:{0},工程名稱:{1},釋出時間:{2},結束時間:{3},狀態:{4},連結:{5}".format(
            item["編號"], item["工程名稱"], item["釋出時間"],
            item["結束時間"], item["狀態"], item["連結"])

    for resolve in data_list:
        Console_Color.print_color(describe(resolve))
    for resolve in data_list:
        if self.check_exist(tender_table, condition1="編號", value1=resolve["編號"],
                            condition2="工程名稱", value2=resolve["工程名稱"]):
            # Store only records that are not in the table yet
            dataSaver.insert_data(resolve)
            print(describe(resolve))
            print("[+] 儲存成功")
def downloader_ynszfcgw(self, url, proxy_dict):
    '''
    Downloader for the Yunnan government procurement site: collects the
    records published today and stores the ones not yet in the table.
    :param url: listing page opened in PhantomJS
    :param proxy_dict: proxy settings with 'ip', 'port' and 'protocol' keys
    :return: None
    '''
    website_name = "雲南省政府採購網"
    Console_Color.print_color("[+] 當前網站:{0}".format(website_name), forecolor="青色")
    # Format every argument on its own: previously only the second string was
    # formatted, so PhantomJS received the literal '--proxy={0}:{1}'.
    service_args = [
        '--proxy={0}:{1}'.format(proxy_dict['ip'], proxy_dict['port']),
        '--proxy-type={0}'.format(proxy_dict['protocol']),
    ]
    # NOTE (original author): for unknown reasons PhantomJS cannot open this site
    driver = webdriver.PhantomJS(desired_capabilities=self.dcap, service_args=service_args)
    data_list = []
    try:
        driver.get(url)
        resolver = pageResolver.Resolver()
        dataSaver = dataDisposer.DataStore('ynszfcgw')
        tender_table = dataSaver.tender_table()
        # (month, day) of the current date; only these two parts are compared
        today = tuple(int(part) for part in self.current_time_parse(dataDisposer.current_time()))
        print("[*] 開始儲存資料")
        for page_num in range(1, 201):
            # Socket timeout plus a pause between pages to avoid being detected
            socket.setdefaulttimeout(20)
            time.sleep(3)
            print("[+] 正在抓取第{0}頁資訊".format(page_num))
            html = driver.page_source
            print("[+] 抓取成功")
            resolve_result = resolver.resovler_ynszfcgw(html, page_num)
            try:
                finished = False
                # Newest first: the first record not from today ends the crawl
                for resolve in sorted(resolve_result, key=lambda x: x['釋出時間'], reverse=True):
                    published = self.current_time_parse(resolve['釋出時間'])
                    if tuple(int(part) for part in published) != today:
                        print("[+] 獲取完成")
                        finished = True
                        break
                    data_list.append(resolve)
                if finished:
                    break
                # Click "next page" to load the following batch of records
                driver.find_element_by_xpath('//a[@data-page="next"]').click()
            except Exception:
                # Best effort: a missing next-page link (last page) or any
                # parsing problem simply ends the crawl.
                break
    finally:
        # Always terminate PhantomJS, even when scraping fails midway
        driver.quit()

    def describe(item):
        # One-line summary shared by the console listing and the save log
        return "[+] 區劃:{0},編號:{1},工程名稱:{2},釋出時間:{3},連結:{4}".format(
            item["區劃"].encode('utf-8'), item["編號"].encode('utf-8'),
            item["工程名稱"].encode('utf-8'), item["釋出時間"], item["連結"])

    for resolve in data_list:
        Console_Color.print_color(describe(resolve))
    for resolve in data_list:
        if self.check_exist(tender_table, condition1="編號", value1=resolve["編號"],
                            condition2="工程名稱", value2=resolve["工程名稱"]):
            # Store only records that are not in the table yet
            dataSaver.insert_data(resolve)
            print(describe(resolve))
            print("[+] 儲存成功")
#### 中標資料 ####
def downloader_ynsggzxxt_gc_zb(self, url, proxy_dict):
    '''
    Downloader for the Yunnan public resources trading information site,
    construction projects, winning-bid announcements: collects the records
    published today and stores the ones not yet in the table.
    :param url: listing page opened in PhantomJS
    :param proxy_dict: proxy settings with 'ip', 'port' and 'protocol' keys;
                       also forwarded to the page resolver
    :return: None
    '''
    website_name = "雲南省公共資源交易資訊網_工程建設_中標公告"
    print(url)
    Console_Color.print_color("[+] 當前網站:{0}".format(website_name), forecolor="青色")
    # Format every argument on its own: previously only the second string was
    # formatted, so PhantomJS received the literal '--proxy={0}:{1}'.
    service_args = [
        '--proxy={0}:{1}'.format(proxy_dict['ip'], proxy_dict['port']),
        '--proxy-type={0}'.format(proxy_dict['protocol']),
    ]
    driver = webdriver.PhantomJS(desired_capabilities=self.dcap, service_args=service_args)
    data_list = []
    try:
        driver.get(url)
        resolver = pageResolver.Resolver()
        dataSaver = dataDisposer.DataStore('ynsggzxxt_gc_zb')
        tender_table = dataSaver.tender_table()
        # (month, day) of the current date; only these two parts are compared
        today = tuple(int(part) for part in self.current_time_parse(dataDisposer.current_time()))
        print("[*] 開始儲存資料")
        for page_num in range(1, 201):
            # Socket timeout plus a pause between pages to avoid being detected
            socket.setdefaulttimeout(20)
            time.sleep(3)
            print("[+] 正在抓取第{0}頁資訊".format(page_num))
            html = driver.page_source
            print("[+] 抓取成功")
            resolve_result = resolver.resovler_ynsggzxxt_gc_zb(html, page_num, proxy_dict)
            try:
                finished = False
                # Newest first: the first record not from today ends the crawl
                for resolve in sorted(resolve_result, key=lambda x: x['釋出時間'], reverse=True):
                    published = self.current_time_parse(resolve['釋出時間'])
                    if tuple(int(part) for part in published) != today:
                        print("[+] 獲取完成")
                        finished = True
                        break
                    data_list.append(resolve)
                if finished:
                    break
                # Click "next page" to load the following batch of records
                driver.find_element_by_class_name("mmggxlh").find_element_by_link_text('下一頁').click()
            except Exception:
                # Best effort: a missing next-page link (last page) or any
                # parsing problem simply ends the crawl.
                break
    finally:
        # Always terminate PhantomJS, even when scraping fails midway
        driver.quit()

    def describe(item):
        # One-line summary shared by the console listing and the save log
        return "[+] 專案名稱:{0},釋出時間:{1},連結:{2},中標人:{3},中標價:{4}".format(
            item["公告名稱"].encode('utf-8'), item["釋出時間"], item["連結"].encode('utf-8'),
            item["中標公司"], item["中標價格"])

    for resolve in data_list:
        Console_Color.print_color(describe(resolve))
    for resolve in data_list:
        if self.check_exist(tender_table, condition1="連結", value1=resolve["連結"],
                            condition2="公告名稱", value2=resolve["公告名稱"]):
            # Store only records that are not in the table yet
            dataSaver.insert_data(resolve)
            print(describe(resolve))
            print("[+] 儲存成功")
def downloader_ynsggzxxt_zf_zb(self, url, proxy_dict):
    '''
    Downloader for the Yunnan public resources trading information site,
    government procurement, winning-bid results: collects the records
    published today and stores the ones not yet in the table.
    :param url: listing page opened in PhantomJS
    :param proxy_dict: proxy settings with 'ip', 'port' and 'protocol' keys
    :return: None
    '''
    website_name = "雲南省公共資源交易資訊網_政府採購_中標結果"
    Console_Color.print_color("[+] 當前網站:{0}".format(website_name), forecolor="青色")
    # Format every argument on its own: previously only the second string was
    # formatted, so PhantomJS received the literal '--proxy={0}:{1}'.
    service_args = [
        '--proxy={0}:{1}'.format(proxy_dict['ip'], proxy_dict['port']),
        '--proxy-type={0}'.format(proxy_dict['protocol']),
    ]
    driver = webdriver.PhantomJS(desired_capabilities=self.dcap, service_args=service_args)
    data_list = []
    try:
        driver.get(url)
        resolver = pageResolver.Resolver()
        dataSaver = dataDisposer.DataStore('ynsggzxxt_zf_zb')
        tender_table = dataSaver.tender_table()
        # (month, day) of the current date; only these two parts are compared
        today = tuple(int(part) for part in self.current_time_parse(dataDisposer.current_time()))
        print("[*] 開始儲存資料")
        for page_num in range(1, 201):
            # Socket timeout plus a pause between pages to avoid being detected
            socket.setdefaulttimeout(20)
            time.sleep(3)
            print("[+] 正在抓取第{0}頁資訊".format(page_num))
            html = driver.page_source
            print("[+] 抓取成功")
            resolve_result = resolver.resovler_ynsggzxxt_zf_zb(html, page_num)
            try:
                finished = False
                # Newest first: the first record not from today ends the crawl
                for resolve in sorted(resolve_result, key=lambda x: x['釋出時間'], reverse=True):
                    published = self.current_time_parse(resolve['釋出時間'])
                    if tuple(int(part) for part in published) != today:
                        print("[+] 獲取完成")
                        finished = True
                        break
                    data_list.append(resolve)
                if finished:
                    break
                # Click "next page" to load the following batch of records
                driver.find_element_by_class_name("mmggxlh").find_element_by_link_text('下一頁').click()
            except Exception:
                # Best effort: a missing next-page link (last page) or any
                # parsing problem simply ends the crawl.
                break
    finally:
        # Always terminate PhantomJS, even when scraping fails midway
        driver.quit()

    def describe(item):
        # One-line summary shared by the console listing and the save log
        return "[+] 專案名稱:{0},釋出時間:{1},連結:{2}".format(
            item["公告名稱"].encode('utf-8'), item["釋出時間"], item["連結"].encode('utf-8'))

    for resolve in data_list:
        Console_Color.print_color(describe(resolve))
    for resolve in data_list:
        if self.check_exist(tender_table, condition1="連結", value1=resolve["連結"],
                            condition2="公告名稱", value2=resolve["公告名稱"]):
            # Store only records that are not in the table yet
            dataSaver.insert_data(resolve)
            print(describe(resolve))
            print("[+] 儲存成功")
def downloader_ynsggzzw_gc_zb(self, url, proxy_dict):
    '''
    Downloader for the Yunnan public resources trading centre, construction
    projects, winning-bid results: collects the records published today and
    stores the ones not yet in the table.
    :param url: listing page opened in PhantomJS
    :param proxy_dict: proxy settings with 'ip', 'port' and 'protocol' keys;
                       also forwarded to the page resolver
    :return: None
    '''
    website_name = "雲南省公共資源交易中心_工程建設_中標結果"
    Console_Color.print_color("[+] 當前網站:{0}".format(website_name), forecolor="青色")
    # Format every argument on its own: previously only the second string was
    # formatted, so PhantomJS received the literal '--proxy={0}:{1}'.
    service_args = [
        '--proxy={0}:{1}'.format(proxy_dict['ip'], proxy_dict['port']),
        '--proxy-type={0}'.format(proxy_dict['protocol']),
    ]
    driver = webdriver.PhantomJS(desired_capabilities=self.dcap, service_args=service_args)
    data_list = []
    try:
        driver.get(url)
        resolver = pageResolver.Resolver()
        dataSaver = dataDisposer.DataStore('ynsggzzw_gc_zb')
        tender_table = dataSaver.tender_table()
        # (month, day) of the current date; only these two parts are compared
        today = tuple(int(part) for part in self.current_time_parse(dataDisposer.current_time()))
        print("[*] 開始儲存資料")
        for page_num in range(1, 101):
            # Socket timeout plus a pause between pages to avoid being detected
            socket.setdefaulttimeout(20)
            time.sleep(3)
            print("[+] 正在抓取第{0}頁資訊".format(page_num))
            html = driver.page_source
            print("[+] 抓取成功")
            resolve_result = resolver.resovler_ynsggzzw_gc_zb(html, page_num, proxy_dict)
            try:
                finished = False
                # Newest first: the first record not from today ends the crawl
                for resolve in sorted(resolve_result, key=lambda x: x['釋出時間'], reverse=True):
                    published = self.current_time_parse(resolve['釋出時間'])
                    if tuple(int(part) for part in published) != today:
                        print("[+] 獲取完成")
                        finished = True
                        break
                    data_list.append(resolve)
                if finished:
                    break
                # Click "next page" to load the following batch of records
                driver.find_element_by_class_name("mmggxlh").find_element_by_link_text('下一頁').click()
            except Exception:
                # Best effort: a missing next-page link (last page) or any
                # parsing problem simply ends the crawl.
                break
    finally:
        # Always terminate PhantomJS, even when scraping fails midway
        driver.quit()

    def describe(item):
        # One-line summary shared by the console listing and the save log
        return "[+] 公告標題:{0},釋出時間:{1},中標人:{2},中標價:{3},連結:{4}".format(
            item["公告標題"].encode('utf-8'), item["釋出時間"],
            item["中標公司"].encode('utf-8'), item["中標價格"].encode('utf-8'), item["連結"])

    for resolve in data_list:
        Console_Color.print_color(describe(resolve))
    for resolve in data_list:
        if self.check_exist(tender_table, condition1="連結", value1=resolve["連結"],
                            condition2="公告標題", value2=resolve["公告標題"]):
            # Store only records that are not in the table yet
            dataSaver.insert_data(resolve)
            print(describe(resolve))
            print("[+] 儲存成功")
def downloader_ynsggzzw_zf_zb(self, url, proxy_dict):
    '''
    Downloader for the Yunnan public resources trading centre, government
    procurement, result notices: collects the records published today and
    stores the ones not yet in the table.
    :param url: listing page opened in PhantomJS
    :param proxy_dict: proxy settings with 'ip', 'port' and 'protocol' keys
    :return: None
    '''
    website_name = "雲南省公共資源交易中心_政府採購_結果公示"
    Console_Color.print_color("[+] 當前網站:{0}".format(website_name), forecolor="青色")
    # Format every argument on its own: previously only the second string was
    # formatted, so PhantomJS received the literal '--proxy={0}:{1}'.
    service_args = [
        '--proxy={0}:{1}'.format(proxy_dict['ip'], proxy_dict['port']),
        '--proxy-type={0}'.format(proxy_dict['protocol']),
    ]
    driver = webdriver.PhantomJS(desired_capabilities=self.dcap, service_args=service_args)
    data_list = []
    try:
        driver.get(url)
        resolver = pageResolver.Resolver()
        dataSaver = dataDisposer.DataStore('ynsggzzw_zf_zb')
        tender_table = dataSaver.tender_table()
        # (month, day) of the current date; only these two parts are compared
        today = tuple(int(part) for part in self.current_time_parse(dataDisposer.current_time()))
        print("[*] 開始儲存資料")
        for page_num in range(1, 101):
            # Socket timeout plus a pause between pages to avoid being detected
            socket.setdefaulttimeout(20)
            time.sleep(3)
            print("[+] 正在抓取第{0}頁資訊".format(page_num))
            html = driver.page_source
            print("[+] 抓取成功")
            resolve_result = resolver.resovler_ynsggzzw_zf_zb(html, page_num)
            try:
                finished = False
                # Newest first: the first record not from today ends the crawl
                for resolve in sorted(resolve_result, key=lambda x: x['釋出時間'], reverse=True):
                    published = self.current_time_parse(resolve['釋出時間'])
                    if tuple(int(part) for part in published) != today:
                        print("[+] 獲取完成")
                        finished = True
                        break
                    data_list.append(resolve)
                if finished:
                    break
                # Click "next page" to load the following batch of records
                driver.find_element_by_class_name("mmggxlh").find_element_by_link_text('下一頁').click()
            except Exception:
                # Best effort: a missing next-page link (last page) or any
                # parsing problem simply ends the crawl.
                break
    finally:
        # Always terminate PhantomJS, even when scraping fails midway
        driver.quit()

    def describe(item):
        # One-line summary shared by the console listing and the save log
        return "[+] 公告標題:{0},釋出時間:{1},連結:{2}".format(
            item["公告標題"].encode('utf-8'), item["釋出時間"], item["連結"])

    for resolve in data_list:
        Console_Color.print_color(describe(resolve))
    for resolve in data_list:
        if self.check_exist(tender_table, condition1="連結", value1=resolve["連結"],
                            condition2="公告標題", value2=resolve["公告標題"]):
            # Store only records that are not in the table yet
            dataSaver.insert_data(resolve)
            print(describe(resolve))
            print("[+] 儲存成功")
def downloader_kmsgg_gc_zb(self, url, proxy_dict):
    '''
    Downloader for the Kunming public resources trading platform public
    service system, construction projects, winning-bid result notices:
    collects the records published today and stores the ones not yet in
    the table.
    :param url: listing page opened in PhantomJS
    :param proxy_dict: proxy settings with 'ip', 'port' and 'protocol' keys
    :return: None
    '''
    website_name = "昆明市公共資源交易平臺公共服務系統_工程建設_中標結果公示"
    Console_Color.print_color("[+] 當前網站:{0}".format(website_name), forecolor="青色")
    # Format every argument on its own: previously only the second string was
    # formatted, so PhantomJS received the literal '--proxy={0}:{1}'.
    service_args = [
        '--proxy={0}:{1}'.format(proxy_dict['ip'], proxy_dict['port']),
        '--proxy-type={0}'.format(proxy_dict['protocol']),
    ]
    driver = webdriver.PhantomJS(desired_capabilities=self.dcap, service_args=service_args)
    data_list = []
    try:
        driver.get(url)
        resolver = pageResolver.Resolver()
        dataSaver = dataDisposer.DataStore('kmsgg_gc_zb')
        tender_table = dataSaver.tender_table()
        # (month, day) of the current date; only these two parts are compared
        today = tuple(int(part) for part in self.current_time_parse(dataDisposer.current_time()))
        print("[*] 開始儲存資料")
        for page_num in range(1, 51):
            # Socket timeout plus a pause between pages to avoid being detected
            socket.setdefaulttimeout(20)
            time.sleep(3)
            print("[+] 正在抓取第{0}頁資訊".format(page_num))
            html = driver.page_source
            print("[+] 抓取成功")
            resolve_result = resolver.resolver_kmsgg_gc_zb(html, page_num)
            try:
                finished = False
                # Newest first: the first record not from today ends the crawl
                for resolve in sorted(resolve_result, key=lambda x: x['釋出時間'], reverse=True):
                    published = self.current_time_parse(resolve['釋出時間'])
                    if tuple(int(part) for part in published) != today:
                        print("[+] 獲取完成")
                        finished = True
                        break
                    data_list.append(resolve)
                if finished:
                    break
                # Click "next page" to load the following batch of records
                driver.find_element_by_id("btnAjax_NextPage").click()
            except Exception:
                # Best effort: a missing next-page button (last page) or any
                # parsing problem simply ends the crawl.
                break
    finally:
        # Always terminate PhantomJS, even when scraping fails midway
        driver.quit()

    def describe(item):
        # One-line summary shared by the console listing and the save log
        return "[+] 編號:{0},工程名稱:{1},釋出時間:{2},連結:{3}".format(
            item["編號"], item["工程名稱"], item["釋出時間"], item["連結"])

    for resolve in data_list:
        Console_Color.print_color(describe(resolve))
    for resolve in data_list:
        if self.check_exist(tender_table, condition1="編號", value1=resolve["編號"],
                            condition2="工程名稱", value2=resolve["工程名稱"]):
            # Store only records that are not in the table yet
            dataSaver.insert_data(resolve)
            print(describe(resolve))
            print("[+] 儲存成功")
def downloader_kmsgg_zf_zb(self, url, proxy_dict):
    '''
    Downloader for the Kunming public resource trading platform public
    service system: government procurement, result announcements.

    Opens ``url`` in PhantomJS through the given proxy, walks up to 50
    result pages, collects the rows whose publish month/day match the
    current date, prints them, and stores them through ``dataDisposer``.

    :param url: address of the listing page to open
    :param proxy_dict: dict providing the 'ip', 'port' and 'protocol' keys
    :return: None
    '''
    website_name = "昆明市公共資源交易平臺公共服務系統_政府採購_結果公示"
    Console_Color.print_color("[+] 當前網站:{0}".format(website_name), forecolor="青色")
    # Proxy settings. Each switch is formatted separately: formatting only
    # the last list element would leave '--proxy={0}:{1}' as a literal and
    # put the IP into '--proxy-type'.
    service_args = [
        '--proxy={0}:{1}'.format(proxy_dict['ip'], proxy_dict['port']),
        '--proxy-type={0}'.format(proxy_dict['protocol']),
    ]
    # Create the driver with the browser header capabilities and the proxy
    driver = webdriver.PhantomJS(desired_capabilities=self.dcap, service_args=service_args)
    data_list = []
    try:
        driver.get(url)
        # Page resolver object
        resolver = pageResolver.Resolver()
        # Data store object and its table object
        dataSaver = dataDisposer.DataStore('kmsgg_zf_zb')
        tender_table = dataSaver.tender_table()
        # Current time, used to keep only rows published today
        current_date = dataDisposer.current_time()
        print("[*] 開始儲存資料")
        # Loop-invariant setup: socket timeout and the current month/day
        socket.setdefaulttimeout(20)
        current_month, current_day = self.current_time_parse(current_date)
        for page_num in range(1, 51):
            # Pause between pages to reduce the chance of being detected
            time.sleep(3)
            print("[+] 正在抓取第{0}頁資訊".format(page_num))
            html = driver.page_source
            print("[+] 抓取成功")
            # Parse the page
            resolve_result = resolver.resolver_kmsgg_zf_zb(html, page_num)
            try:
                reached_older_rows = False
                # Newest first, so the first non-matching row ends the crawl
                for resolve in sorted(resolve_result, key=lambda x: x['釋出時間'], reverse=True):
                    resolve_month, resolve_day = self.current_time_parse(resolve['釋出時間'])
                    if (int(current_month) == int(resolve_month)
                            and int(current_day) == int(resolve_day)):
                        data_list.append(resolve)
                    else:
                        print("[+] 獲取完成")
                        reached_older_rows = True
                        break
                if reached_older_rows:
                    break
                # Click "next page" to load the following page of results
                driver.find_element_by_id("btnAjax_NextPage").click()
            except Exception:
                # Deliberate best effort: a missing next-page button (last
                # page) or an unparsable row simply ends pagination; rows
                # collected so far are still printed and stored below.
                break
    finally:
        # Always shut the browser down, even when scraping fails
        driver.quit()
    record_format = "[+] 編號:{0},工程名稱:{1},釋出時間:{2},連結:{3}"
    # First pass: print every collected row
    for resolve in data_list:
        Console_Color.print_color(record_format.format(
            resolve["編號"], resolve["工程名稱"], resolve["釋出時間"], resolve["連結"]))
    # Second pass: store the rows for which check_exist() is truthy
    for resolve in data_list:
        projectNumber = resolve["編號"]
        project_name = resolve["工程名稱"]
        # NOTE(review): presumably check_exist() returns True when the row
        # is NOT stored yet -- confirm against its definition.
        if self.check_exist(tender_table, condition1="編號", value1=projectNumber,
                            condition2="工程名稱", value2=project_name):
            # Store the record
            dataSaver.insert_data(resolve)
            print(record_format.format(
                projectNumber, project_name, resolve["釋出時間"], resolve["連結"]))
            print("[+] 儲存成功")
def downloader_kmsgg_gc_by(self, url, proxy_dict):
    '''
    Downloader for the Kunming public resource trading platform public
    service system: engineering construction, addendum notices.

    Opens ``url`` in PhantomJS through the given proxy, walks up to 50
    result pages, collects the rows whose publish month/day match the
    current date, prints them, and stores them through ``dataDisposer``.

    :param url: address of the listing page to open
    :param proxy_dict: dict providing the 'ip', 'port' and 'protocol' keys
    :return: None
    '''
    website_name = "昆明市公共資源交易平臺公共服務系統_工程建設_補遺通知"
    Console_Color.print_color("[+] 當前網站:{0}".format(website_name), forecolor="青色")
    # Proxy settings. Each switch is formatted separately: formatting only
    # the last list element would leave '--proxy={0}:{1}' as a literal and
    # put the IP into '--proxy-type'.
    service_args = [
        '--proxy={0}:{1}'.format(proxy_dict['ip'], proxy_dict['port']),
        '--proxy-type={0}'.format(proxy_dict['protocol']),
    ]
    # Create the driver with the browser header capabilities and the proxy
    driver = webdriver.PhantomJS(desired_capabilities=self.dcap, service_args=service_args)
    data_list = []
    try:
        driver.get(url)
        # Page resolver object
        resolver = pageResolver.Resolver()
        # Data store object and its table object
        dataSaver = dataDisposer.DataStore('kmsgg_gc_by')
        tender_table = dataSaver.tender_table()
        # Current time, used to keep only rows published today
        current_date = dataDisposer.current_time()
        print("[*] 開始儲存資料")
        # Loop-invariant setup: socket timeout and the current month/day
        socket.setdefaulttimeout(20)
        current_month, current_day = self.current_time_parse(current_date)
        for page_num in range(1, 51):
            # Pause between pages to reduce the chance of being detected
            time.sleep(3)
            print("[+] 正在抓取第{0}頁資訊".format(page_num))
            html = driver.page_source
            print("[+] 抓取成功")
            # Parse the page
            resolve_result = resolver.resolver_kmsgg_gc_by(html, page_num)
            try:
                reached_older_rows = False
                # Newest first, so the first non-matching row ends the crawl
                for resolve in sorted(resolve_result, key=lambda x: x['釋出時間'], reverse=True):
                    resolve_month, resolve_day = self.current_time_parse(resolve['釋出時間'])
                    if (int(current_month) == int(resolve_month)
                            and int(current_day) == int(resolve_day)):
                        data_list.append(resolve)
                    else:
                        print("[+] 獲取完成")
                        reached_older_rows = True
                        break
                if reached_older_rows:
                    break
                # Click "next page" to load the following page of results
                driver.find_element_by_id("btnAjax_NextPage").click()
            except Exception:
                # Deliberate best effort: a missing next-page button (last
                # page) or an unparsable row simply ends pagination; rows
                # collected so far are still printed and stored below.
                break
    finally:
        # Always shut the browser down, even when scraping fails
        driver.quit()
    record_format = "[+] 編號:{0},工程名稱:{1},釋出時間:{2},連結:{3}"
    # First pass: print every collected row
    for resolve in data_list:
        Console_Color.print_color(record_format.format(
            resolve["編號"], resolve["工程名稱"], resolve["釋出時間"], resolve["連結"]))
    # Second pass: store the rows for which check_exist() is truthy
    for resolve in data_list:
        projectNumber = resolve["編號"]
        project_name = resolve["工程名稱"]
        # NOTE(review): presumably check_exist() returns True when the row
        # is NOT stored yet -- confirm against its definition.
        if self.check_exist(tender_table, condition1="編號", value1=projectNumber,
                            condition2="工程名稱", value2=project_name):
            # Store the record
            dataSaver.insert_data(resolve)
            print(record_format.format(
                projectNumber, project_name, resolve["釋出時間"], resolve["連結"]))
            print("[+] 儲存成功")
def downloader_kmsgg_zf_by(self, url, proxy_dict):
'''
昆明市公共資源交易平臺公共服務系統_政府採購_補遺通知下載器
'''
website_name = "昆明市公共資源交易平臺公共服務系統_政府採購_補遺通知"
Console_Color.print_color("[+] 當前網站:{0}".format(website_name), forecolor="青色")
# 設定代理
proxyIP = proxy_dict['ip']
proxyPort = proxy_dict['port']
proxyProtocol = proxy_dict['protocol']
service_args = ['--proxy={0}:{1}', '--proxy-type={2}'.format(proxyIP, proxyPort, proxyProtocol)]
# 初始化driver物件,傳入瀏覽器報頭引數和代理IP,並獲取網頁URL
driver = webdriver.PhantomJS(desired_capabilities=self.dcap, service_args=service_args)
driver.get(url)
# 建立網頁解析器物件
resolver = pageResolver.Resolver()
# 建立資料儲存物件
dataSaver = dataDisposer.DataStore('kmsgg_zf_by')
# 建立表物件
tender_table = dataSaver.tender_table()
# 獲取當前時間
current_date = dataDisposer.current_time()
data_list = []
print("[*] 開始儲存資料")
for page_