# 【專案】爬取+匯入+定時器 (Project: scrape + import + scheduler)
# 需求 (Requirements):
#爬取資料
#檢查資料庫是否存在
#不:儲存資料庫
#是:不儲存
#每個月執行一次
#_*_ coding=utf-8 _*_
from html.parser import HTMLParser
import requests
import re
import pymysql
import time
import schedule
# Base host, prepended to the relative links scraped from result pages.
host = "http://alk.12348.gov.cn"
# Paginated case-list search endpoint (POST).
url = "http://alk.12348.gov.cn/LawMultiSearch/Search"
# POST form body; "pageIndexNow" is added per request inside get_info().
data ={"searchField":u"案例全文",
"checkDatabaseID":"44,66",
"pageSizeNow":"10"}
# Request headers for the search endpoint.
# NOTE(review): the Cookie value (session id) is hard-coded and will expire —
# confirm whether the endpoint still responds without a fresh session.
headers = {"Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
"Cookie":"ASP.NET_SessionId=uczdhbfwhdoe4ndbxmu0bune; lang=zh; __jsluid=3f2b0dcf29ea3a7dc2bd7dd00b48444e; ResultColumnInfo=H4sIAAAAAAAEAIuuVkrOT0lVslJyd3ePVNJRykvMBfFMTJRBAso7V19bcmnruY8bdhz4q2ygbGoIVJKcX5pXomRlZmBcq4Ok3c0Zod3MTBkkANZ+4cLVf1uWbV+8cNGFC5e2AlnXsBhmalYbCwBF/u1RjAAAAA==; Hm_lvt_d7682ab43891c68a00de46e9ce5b76aa=1540436998; Hm_lpvt_d7682ab43891c68a00de46e9ce5b76aa=1540442209",
"Host": "alk.12348.gov.cn",
"Origin":"http://alk.12348.gov.cn",
"Referer": "http://alk.12348.gov.cn/LawMultiSearch?checkDatabaseID=44%2C66",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36",
"X-Requested-With": "XMLHttpRequest"
}
# Connect to the MySQL database; `conn` and `cursor` are module-level
# globals shared with screen_data().
# NOTE(review): credentials are hard-coded in source — consider moving them
# to configuration / environment variables.
conn = pymysql.connect(
host='192.168.88.52',
port=3306,
db= 'manage_system',
user='root',
passwd='123456',
charset='utf8',
use_unicode=True)
cursor = conn.cursor()
def get_info():
    """Scrape every result page of the law-case search endpoint.

    Repeatedly POSTs the module-level ``data`` form with an incrementing
    ``pageIndexNow`` until a page yields no regex matches, and collects the
    extracted rows.

    Returns:
        list[tuple]: one ``(link, title_attr, title_text, date, type,
        access_count)`` tuple per result row; whatever was collected so far
        if a request fails.
    """
    # Local import so the module-level name `html` used elsewhere (and the
    # old local variable in the original code) cannot shadow the module.
    import html as html_lib

    # This pattern matches HTML with ALL whitespace removed (note the fused
    # attribute names like `atarget`); compile it once, outside the loop.
    p = '<tr><td><atarget="_blank"href="(.+?)"title="(.+?)">(.+?)</a></td><tdstyle="text-align:center">(.+?)</td><tdclass="tc">(.+?)</td><tdstyle="text-align:center">(.+?)</td></tr>'
    pattern = re.compile(p)
    n = 1
    ms = []
    while True:
        data["pageIndexNow"] = str(n)
        res = requests.post(url, data=data, headers=headers)
        # A non-200 response ends the scrape; report the status code.
        if res.status_code != 200:
            print(res.status_code)
            break
        # Un-escape HTML entities. HTMLParser.unescape() was deprecated in
        # Python 3.4 and removed in 3.9; html.unescape() is the replacement.
        page = html_lib.unescape(res.text)
        # Remove ALL whitespace so the regex above matches. (The original
        # looped over the string character-by-character with strip(), which
        # had the same net effect but built the result in quadratic time,
        # and its replace("/n", "") was a typo for "\n" that never matched.)
        text = "".join(page.split())
        m = pattern.findall(text)
        # An empty match list means we ran past the last page.
        if not m:
            print(u"頁數:", n - 1)
            break
        ms.extend(m)
        n += 1
    return ms
#檢查資料庫是否存在
#ms=['連結.','標題','2018-09-07','型別','int ']
def screen_data():
    """Insert newly scraped case rows into ``tb_law_notar_case_list``.

    Calls :func:`get_info` for the full scrape result, then inserts every
    row whose absolute link is not already stored, committing after each
    insert. Rows whose link already exists are reported and skipped.

    Uses the module-level ``conn``/``cursor`` globals for database access.
    """
    rows = get_info()
    # Fetch the set of already-stored links ONCE. The original re-queried
    # the entire table for every scraped row (one round-trip plus an O(n)
    # list scan per row); it also shadowed the module-level request payload
    # `data` with the query result.
    cursor.execute('SELECT link FROM tb_law_notar_case_list')
    existing = {row[0] for row in cursor.fetchall()}
    for m in rows:
        link = host + m[0]
        if link in existing:
            print("exist!!!")
            continue
        # m = (relative link, title attr, title text, date, type, count);
        # m[2] (the duplicated title text) is intentionally unused.
        value = (link, m[1], m[3], m[4], m[5])
        cursor.execute(
            'INSERT INTO tb_law_notar_case_list (`link`, `gzname`, `gztime`, `gztype`, `accessnum`) VALUES (%s,%s,%s,%s,%s)',
            value)
        conn.commit()
        print('-----插入成功')
        # Record the new link so in-batch duplicates are also skipped,
        # matching the original's per-row re-query behavior.
        existing.add(link)
if __name__ == "__main__":
    # Run once immediately. (The original also called get_info() here and
    # discarded the result — screen_data() calls it again internally, so the
    # whole site was scraped twice at startup.)
    screen_data()
    # Then re-run roughly monthly. schedule requires the plural `.days` for
    # intervals other than 1; `every(30).day` raises schedule.IntervalError.
    schedule.every(30).days.do(screen_data)
    while True:
        schedule.run_pending()
        time.sleep(1)