1. 程式人生 > 人民法院重大事件抓取

人民法院重大事件抓取

findall replace repl href win7 sql swd blog -a

時間:2017-8-3 23:30

Url:http://www.court.gov.cn

執行環境:Python 3.4 + MySQL + Windows 7

import urllib.request
import re
import pymysql
from time import sleep
# Scrape the paginated listing at http://www.court.gov.cn/fabu-gengduo-15.html
# and store each entry's title, absolute URL and date string in the MySQL
# table PeopleCourt.lawcase.
#
# NOTE(review): the published copy of this script had lost all string quotes
# and most line breaks; the literals and indentation below are reconstructed
# from context. In particular, the placement of sleep(2) (once per page rather
# than once per row) is an assumption -- confirm against the author's intent.
#
# NOTE(review): the connection does not set a charset. The scraped titles are
# presumably Chinese text, so the connection/table charset may need to be
# utf8/utf8mb4 -- verify against the target server before relying on this.

BASE_URL = "http://www.court.gov.cn"
LIST_URL = BASE_URL + "/fabu-gengduo-15.html?page="

# Request header sent with every page fetch.
header = {"User-Agent": "Mozilla/5.0"}

# Captures the page number in the "last" pager link of the listing.
PAGE_COUNT_RE = re.compile(
    r'<li class="last"><a href="/fabu-gengduo-15\.html\?page=(.*?)">'
)

# Captures (title, relative href, date) for every entry on a listing page.
# re.S lets ".*?" span the line breaks between the link and the date element.
ITEM_RE = re.compile(
    r'<a title="(.*?)" target="_blank" href="(.*?)">.*?</a>'
    r'.*?<i class="date">(.*?)</i>',
    re.S,
)

# Placeholders let the driver escape scraped text instead of splicing it into
# the statement, which broke on quotes in titles and allowed SQL injection.
INSERT_SQL = "insert INTO lawcase(title,url,time) VALUES (%s,%s,%s)"


def _fetch_html(url):
    """Return the decoded response body of *url*, sent with ``header``."""
    req = urllib.request.Request(url, headers=header)
    # The context manager closes the HTTP response once it has been read.
    with urllib.request.urlopen(req) as res:
        return res.read().decode()


con = pymysql.connect(host="127.0.0.1", user="root", passwd="root")
try:
    with con.cursor() as cur:
        try:
            cur.execute("create database PeopleCourt")
        except pymysql.MySQLError:
            # Most likely the database already exists from an earlier run.
            # Any other failure surfaces at select_db() just below.
            pass
        # Reuse the open connection instead of opening (and leaking) a second.
        con.select_db("PeopleCourt")

        try:
            cur.execute(
                "create TABLE lawcase(title char(100),url char(100),time char(50))"
            )
        except pymysql.MySQLError:
            print("Table existed")

        first_page = _fetch_html(LIST_URL + "1")
        reg_page = PAGE_COUNT_RE.findall(first_page)
        if not reg_page:
            # Fail with a clear message rather than an IndexError on [0].
            raise RuntimeError(
                "Could not find the last-page link on " + LIST_URL + "1"
            )
        print("page:" + str(reg_page[0]))

        for page in range(1, int(reg_page[0]) + 1):
            print("Grab page:" + str(page))
            data = _fetch_html(LIST_URL + str(page))
            rows = [
                (title.replace("\n", ""), BASE_URL + href, date)
                for title, href, date in ITEM_RE.findall(data)
            ]
            cur.executemany(INSERT_SQL, rows)
            # PyMySQL does not autocommit by default; without this the
            # inserted rows can be discarded when the connection closes.
            con.commit()
            # Pause between page requests to avoid hammering the site.
            sleep(2)
finally:
    con.close()

print("Ok")

數據庫截圖:

技術分享

人民法院重大事件抓取