1. 程式人生 > python3 urllib爬蟲抓取記錄

python3 urllib爬蟲抓取記錄


# 目的:GET請求 抓取csdn部落格頁面所有文章標題,並儲存在csdn目錄下
import re
import os
from urllib import request

# Download the blog index page. decode() with no argument assumes the body is
# UTF-8. The context manager closes the HTTP response once the body is read.
with request.urlopen('http://blog.csdn.net/a519395243') as response:
    data = response.read().decode()

# Capture the link text of every article title on the page.
# The article id is matched with [0-9]+ rather than [1-9]{8}: the old class
# excluded the digit 0 and fixed the length at eight, so any article whose id
# contained a 0 (or had a different length) was silently skipped.
ruler = re.compile(
    r'<span class="link_title">'
    r'<a href="/a519395243/article/details/[0-9]+">(.*?)</a>',
    re.S,
)
match = ruler.findall(data)

# Nothing is created on disk when the page yields no titles.
if match:
    # Create the output directory once instead of re-checking per title.
    path = 'csdn'
    os.makedirs(path, exist_ok=True)
    file_path = path + '/csdn.txt'

    # Open the file a single time in append mode; `with` guarantees it is
    # closed even if a write fails. The encoding is explicit so the output
    # does not depend on the platform's default locale encoding.
    with open(file_path, 'a+', encoding='utf-8') as f:
        for x in match:
            # Strip CRLF pairs and spaces that surround the title in the HTML.
            content = x.replace('\r\n', '').replace(' ', '')
            # One title per line; without a separator the titles ran
            # together into a single undelimited string.
            f.write(content + '\n')


#模擬瀏覽器傳送GET請求,通過往Request物件新增HTTP頭,偽裝成瀏覽器
from urllib import request

# Build a GET request for the blog page and attach a desktop-Chrome
# User-Agent header so the request presents itself as a browser.
req = request.Request('http://blog.csdn.net/a519395243')
req.add_header('User-Agent','Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36')

# Use the response as a context manager so it is closed as soon as the body
# has been read (previously the response object was never closed).
# decode() with no argument assumes the body is UTF-8.
with request.urlopen(req) as response:
    data = response.read().decode()
print(data)



# 目的:模擬登入 csdn
import gzip  
import re  
import urllib.request  
import urllib.parse  
import http.cookiejar  
  
def ungzip(data):
    """Return *data* gunzipped, or unchanged when it cannot be gunzipped.

    Progress messages are printed to stdout.

    Args:
        data: Response body as bytes.

    Returns:
        The result of ``gzip.decompress(data)`` when that succeeds,
        otherwise the original ``data`` object.
    """
    # Local import: zlib is needed only to name the error that a corrupt
    # deflate stream raises.
    import zlib

    try:
        print("嘗試解壓縮...")
        data = gzip.decompress(data)
        print("解壓完畢")
    except (OSError, EOFError, zlib.error):
        # OSError (incl. gzip.BadGzipFile): not gzip data or bad checksum;
        # EOFError: truncated stream; zlib.error: corrupt deflate payload.
        # Anything else (e.g. KeyboardInterrupt, a wrong argument type) is a
        # real problem and is no longer swallowed by a bare `except:`.
        print("未經壓縮,無需解壓")

    return data
          
def getLt(data):
    """Extract the value of the ``lt`` field from page text.

    Args:
        data: Decoded page text (presumably the CSDN login form HTML —
            confirm against the caller).

    Returns:
        The text inside the quotes of the first
        ``name="lt" value="..."`` occurrence.

    Raises:
        IndexError: If no such field is present. This is the same exception
            type the earlier ``findall(...)[0]`` lookup raised.
    """
    # Non-greedy capture: stop at the first closing quote. The greedy `.*`
    # ran on to the LAST quote of the line, so any attribute following
    # value="..." on the same line ended up inside the returned token.
    found = re.search(r'name="lt" value="(.*?)"', data)
    if found is None:
        raise IndexError('no name="lt" field found in page text')
    return found.group(1)

def getExecution(data):
    """Extract the value of the ``execution`` field from page text.

    Args:
        data: Decoded page text (presumably the CSDN login form HTML —
            confirm against the caller).

    Returns:
        The text inside the quotes of the first
        ``name="execution" value="..."`` occurrence.

    Raises:
        IndexError: If no such field is present. This is the same exception
            type the earlier ``findall(...)[0]`` lookup raised.
    """
    # Non-greedy capture: stop at the first closing quote. The greedy `.*`
    # ran on to the LAST quote of the line, so any attribute following
    # value="..." on the same line ended up inside the returned token.
    found = re.search(r'name="execution" value="(.*?)"', data)
    if found is None:
        raise IndexError('no name="execution" field found in page text')
    return found.group(1)

def getOpener(head):
    """Build a cookie-aware URL opener that sends the given headers.

    Args:
        head: Mapping of header name to header value.

    Returns:
        The opener produced by ``urllib.request.build_opener`` with an
        ``HTTPCookieProcessor`` wrapping a new ``CookieJar``. Its
        ``addheaders`` attribute is replaced by the ``(name, value)`` pairs
        of ``head``, in the mapping's iteration order.
    """
    # A fresh jar per opener: cookies are handled by the processor built
    # around it.
    cookie_handler = urllib.request.HTTPCookieProcessor(http.cookiejar.CookieJar())
    opener = urllib.request.build_opener(cookie_handler)
    # Assigning addheaders replaces the opener's default header list.
    opener.addheaders = list(head.items())
    return opener
# Request headers for passport.csdn.net; the values can be captured from a
# real browser session (for example with Firebug).
header = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    # Advertise gzip only. ungzip() is the only decoder this script has, so
    # offering deflate/sdch/br invited responses that could not be decoded
    # and would then fail in data.decode().
    'Accept-Encoding': 'gzip',
    'Accept-Language': 'zh-CN,zh;q=0.8',
    'Connection': 'keep-alive',
    'Host': 'passport.csdn.net',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
    # NOTE(review): this is a cookie string captured from one browser
    # session and hard-coded here; the values are presumably stale and should
    # be replaced with your own. It is split into one literal per cookie
    # purely for readability; adjacent literals concatenate to one string.
    'Cookie': (
        'uuid_tt_dd=-6281662822437337065_20171128; '
        '__message_district_code=440000; '
        'sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%22160058ffb5850c-0d114986a51ac1-6a11157a-1440000-160058ffb598c9%22%2C%22%24device_id%22%3A%22160058ffb5850c-0d114986a51ac1-6a11157a-1440000-160058ffb598c9%22%2C%22props%22%3A%7B%22%24latest_utm_source%22%3A%22news0%22%7D%7D; '
        'kd_user_id=1f003860-eec5-424d-8a20-498a00b6ab73; '
        'UM_distinctid=160068870b25ec-07ca748d26f527-6a11157a-15f900-160068870b3750; '
        'UN=a519395243; '
        # The e-mail address was redacted in the published listing; the
        # placeholder is kept as-is and must be replaced with a real value.
        'UE="[email protected]"; '
        'BT=1512011174110; '
        'shown_offset=20; '
        'Hm_lvt_3f9df99a208b69b45eb52cfbe2dc3bf8=1511939807,1512007982,1512022346,1512026346; '
        'Hm_lpvt_3f9df99a208b69b45eb52cfbe2dc3bf8=1512026346; '
        '__message_sys_msg_id=0; '
        '__message_gu_msg_id=0; '
        '__message_cnel_msg_id=0; '
        '__message_in_school=0; '
        'JSESSIONID=8669679CFA8B508DD860D5C76BDA9E69.tomcat1; '
        'LSSC=LSSC-55438-kdj63iwrBuHfcdst9TBrRIONZeKOQh-passport.csdn.net; '
        'Hm_lvt_6bcd52f51e9b3dce32bec4a3997715ac=1512011295,1512022345,1512026346,1512029753; '
        'Hm_lpvt_6bcd52f51e9b3dce32bec4a3997715ac=1512032481; '
        'dc_tos=p083q9; '
        'dc_session_id=1512031760278'
    ),
}

url = 'https://passport.csdn.net/account/verify'
opener = getOpener(header)

# Step 1: GET the page and pull out the two form values (lt, execution)
# that are sent back in the POST below. The response is closed as soon as
# its body has been read.
with opener.open(url) as op:
    data = ungzip(op.read())
page = data.decode()  # decode once and reuse for both lookups
lt = getLt(page)
execution = getExecution(page)

# Step 2: POST the credentials together with the values from step 1.
# Replace the two placeholders with a real account name and password.
username = "帳號"
password = "密碼"
postDict = {
    'lt': lt,
    'username': username,
    'password': password,
    '_eventId': 'submit',
    'execution': execution,
}
postData = urllib.parse.urlencode(postDict).encode()

# Passing a data argument makes opener.open() issue a POST.
with opener.open(url, postData) as op:
    data = ungzip(op.read())
print(data.decode())