爬蟲基礎_張三
1.爬蟲入門程式
import cookielib
import urllib2
# Target page for the most basic crawl.
url = "http://www.baidu.com"
# With no request body supplied, urlopen issues a GET; the response body is
# not read in this snippet.
response1 = urllib2.urlopen(url=url)
2. 爬蟲程式新增 data、headers,然後發送 POST 請求
import urllib
import urllib2
# Log in by sending URL-encoded form fields together with a custom User-Agent.
url = 'http://www.server.com/login'

# Request headers: present a browser-style User-Agent string.
user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
headers = {'User-Agent': user_agent}

# Form fields, URL-encoded into the request body.
values = {
    'username': 'cqc',
    'password': 'XXXX',
}
data = urllib.urlencode(values)

# Supplying a data payload makes urllib2 send the request as a POST.
request = urllib2.Request(url, data=data, headers=headers)
response = urllib2.urlopen(request)
page = response.read()
3.爬蟲程式新增cookie
import cookielib
import urllib2
# File that the cookies are meant to be stored in (relative path 'cookie.txt').
filename = 'cookie.txt'
# MozillaCookieJar holds the cookies and is bound to the file name above.
# NOTE: this snippet never calls cookie.save(), so no file is written here.
cookie = cookielib.MozillaCookieJar(filename=filename)
# Wrap the jar in urllib2's cookie handler, then build an opener from it.
handler = urllib2.HTTPCookieProcessor(cookiejar=cookie)
opener = urllib2.build_opener(handler)
4.正則表示式
import re
# Compile the regular expression into a Pattern object.
pattern = re.compile(r'xxxxx')
# Match from the start of the string; the result is None when nothing matches.
# (The original referenced the undefined name `patter`, raising NameError.)
paxg = pattern.match("xxxx")
print(paxg)