python3 urllib爬蟲抓取記錄
阿新 • • 發佈:2018-12-26
# Goal: issue a GET request, scrape every article title from the CSDN blog
# page, and save the titles under the "csdn" directory.
import re
import os
from urllib import request

# Download the whole page; the context manager closes the connection even if
# reading or decoding fails.
with request.urlopen('http://blog.csdn.net/a519395243') as response:
    data = response.read().decode()

# Extract all article titles with a regular expression.
# The article id is matched with \d+ : the earlier [1-9]{8} rejected every id
# that contained the digit 0 or was not exactly eight digits long.
ruler = re.compile(
    r'<span class="link_title"><a href="/a519395243/article/details/\d+">(.*?)</a>',
    re.S,
)
match = ruler.findall(data)

# Output location.
path = 'csdn'
file_path = path + '/csdn.txt'

# Nothing is created on disk when no title was found (same as before).
if match:
    # Create the target directory once instead of checking on every iteration.
    os.makedirs(path, exist_ok=True)

    # Open the file a single time in append mode; an explicit encoding keeps
    # non-ASCII titles from depending on the platform default.
    with open(file_path, 'a+', encoding='utf-8') as f:
        for x in match:
            # Strip "\r\n" pairs and spaces from the captured title.
            content = x.replace('\r\n', '').replace(' ', '')
            # NOTE(review): titles are written back-to-back with no separator,
            # as in the original script - confirm whether a newline is wanted.
            f.write(content)
# Simulate a browser GET request: add an HTTP header to the Request object so
# the request carries a browser-style User-Agent.
from urllib import request

req = request.Request('http://blog.csdn.net/a519395243')
req.add_header('User-Agent','Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36')

# Read the body inside a context manager so the connection is always closed.
with request.urlopen(req) as response:
    data = response.read().decode()

print(data)
# 目的:模擬登入 csdn
import gzip
import re
import urllib.request
import urllib.parse
import http.cookiejar
def ungzip(data):
    """Return *data* gunzipped, or unchanged when it cannot be gunzipped.

    Args:
        data: Raw response body as bytes.

    Returns:
        The decompressed bytes when ``gzip.decompress`` succeeds, otherwise
        the original ``data`` object.
    """
    # Local import: zlib is needed only to name the exception caught below.
    import zlib

    print("嘗試解壓縮...")
    try:
        unpacked = gzip.decompress(data)
    except (OSError, EOFError, zlib.error):
        # Only decompression failures mean "not gzip": a bad header raises
        # OSError, a truncated stream EOFError, a corrupt deflate stream
        # zlib.error. Anything else (e.g. KeyboardInterrupt) now propagates
        # instead of being silently swallowed by a bare except.
        print("未經壓縮,無需解壓")
        return data
    print("解壓完畢")
    return unpacked
def getLt(data):
    """Extract the value of the ``lt`` form field from HTML text.

    Args:
        data: Decoded HTML text to search.

    Returns:
        The text between the quotes of the first ``name="lt" value="..."``
        occurrence.

    Raises:
        IndexError: If no such field is present in ``data``.
    """
    # [^"]* stops at the value's closing quote. The earlier greedy (.*) ran on
    # to the LAST quote on the line, so any attribute following value="..."
    # ended up inside the captured token.
    found = re.search(r'name="lt" value="([^"]*)"', data)
    if found is None:
        # Same exception type the old strlist[0] lookup raised, with a
        # message that says what was missing.
        raise IndexError('no name="lt" value="..." field found')
    return found.group(1)
def getExecution(data):
    """Extract the value of the ``execution`` form field from HTML text.

    Args:
        data: Decoded HTML text to search.

    Returns:
        The text between the quotes of the first
        ``name="execution" value="..."`` occurrence.

    Raises:
        IndexError: If no such field is present in ``data``.
    """
    # [^"]* stops at the value's closing quote. The earlier greedy (.*) ran on
    # to the LAST quote on the line, so any attribute following value="..."
    # ended up inside the captured token.
    found = re.search(r'name="execution" value="([^"]*)"', data)
    if found is None:
        # Same exception type the old strlist[0] lookup raised, with a
        # message that says what was missing.
        raise IndexError('no name="execution" value="..." field found')
    return found.group(1)
def getOpener(head):
    """Build a cookie-aware URL opener that sends the given headers.

    Args:
        head: Mapping of header name to header value.

    Returns:
        The opener produced by ``urllib.request.build_opener`` with an
        ``HTTPCookieProcessor`` backed by a fresh ``CookieJar``; its
        ``addheaders`` is the list of ``(name, value)`` pairs from ``head``.
    """
    # Cookie handling: a new jar is created for every opener.
    jar = http.cookiejar.CookieJar()
    cookie_handler = urllib.request.HTTPCookieProcessor(jar)
    opener = urllib.request.build_opener(cookie_handler)

    # addheaders expects a list of (name, value) tuples.
    opener.addheaders = [(name, value) for name, value in head.items()]
    return opener
# Header values can be captured with a browser developer tool such as Firebug.
header = {
    'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    # Advertise gzip only: gzip is the sole encoding this script decompresses,
    # so also offering deflate/sdch/br invited response bodies it could not
    # decode.
    'Accept-Encoding':'gzip',
    'Accept-Language':'zh-CN,zh;q=0.8',
    'Connection':'keep-alive',
    'Host':'passport.csdn.net',
    'Upgrade-Insecure-Requests':'1',
    'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
    # NOTE(review): this looks like a cookie string captured from a real
    # browser session (it carries JSESSIONID etc.) - presumably stale; treat it
    # as a secret and replace it with your own before running.
    # NOTE(review): the UE value appears to have been mangled by e-mail
    # obfuscation when the page was scraped - confirm against the original.
    'Cookie':'uuid_tt_dd=-6281662822437337065_20171128; __message_district_code=440000; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%22160058ffb5850c-0d114986a51ac1-6a11157a-1440000-160058ffb598c9%22%2C%22%24device_id%22%3A%22160058ffb5850c-0d114986a51ac1-6a11157a-1440000-160058ffb598c9%22%2C%22props%22%3A%7B%22%24latest_utm_source%22%3A%22news0%22%7D%7D; kd_user_id=1f003860-eec5-424d-8a20-498a00b6ab73; UM_distinctid=160068870b25ec-07ca748d26f527-6a11157a-15f900-160068870b3750; UN=a519395243; UE=" [email protected]"; BT=1512011174110; shown_offset=20; Hm_lvt_3f9df99a208b69b45eb52cfbe2dc3bf8=1511939807,1512007982,1512022346,1512026346; Hm_lpvt_3f9df99a208b69b45eb52cfbe2dc3bf8=1512026346; __message_sys_msg_id=0; __message_gu_msg_id=0; __message_cnel_msg_id=0; __message_in_school=0; JSESSIONID=8669679CFA8B508DD860D5C76BDA9E69.tomcat1; LSSC=LSSC-55438-kdj63iwrBuHfcdst9TBrRIONZeKOQh-passport.csdn.net; Hm_lvt_6bcd52f51e9b3dce32bec4a3997715ac=1512011295,1512022345,1512026346,1512029753; Hm_lpvt_6bcd52f51e9b3dce32bec4a3997715ac=1512032481; dc_tos=p083q9; dc_session_id=1512031760278'
}
# Login flow: fetch the login form, read its hidden "lt" and "execution"
# fields, then POST them back together with the credentials.
url = 'https://passport.csdn.net/account/verify'
opener = getOpener(header)

# GET the login page; the context manager closes the response once it is read.
with opener.open(url) as op:
    data = op.read()
data = ungzip(data)

# Decode the page once and reuse the text for both field lookups.
page = data.decode()
lt = getLt(page)
execution = getExecution(page)

# Placeholder credentials - replace with a real account name and password.
username = "帳號"
password = "密碼"
postDict = {
    'lt': lt,
    'username': username,
    'password': password,
    '_eventId': 'submit',
    'execution': execution,
}
postData = urllib.parse.urlencode(postDict).encode()

# Passing a body makes this request a POST; the same opener is reused so the
# cookies set by the first response are available to it.
with opener.open(url, postData) as op:
    data = op.read()
data = ungzip(data)
print(data.decode())