python爬蟲學習(一)requests爬取dytt下載地址
阿新 • • 發佈:2021-06-23
當網站的 HTTPS 安全證書無法通過驗證時,可以在 requests.get 中傳入 verify=False 來跳過證書驗證(這會失去對中間人攻擊的防護,只建議在練習時使用)。另外,dytt 頁面使用的字元編碼是 gb2312,所以要把 resp.encoding 設為 gb2312 才能正確解碼。
import requests

# Index page whose HTML is downloaded below.
domain = "https://dy.dytt8.net/index.htm"

# verify=False turns off TLS certificate verification for this request.
resp = requests.get(domain, verify=False)

# Decode the body as gb2312 before reading resp.text.
resp.encoding = 'gb2312'
print(resp.text)
使用 re 模組以正則表示式從首頁 HTML 中擷取各子頁面的連結,並把拼接好的子頁面地址儲存到列表 child_href_list 中
# obj isolates the first <ul> after the section heading; obj1 then captures
# every single-quoted href found inside that list.
obj = re.compile(r"最新電影更新:.*?<ul>(?P<ul>.*?)</ul>", flags=re.DOTALL)
obj1 = re.compile(r"<a href='(?P<link>.*?)'", flags=re.DOTALL)

# Build the URL of each child page by prefixing the site root to the href
# with its leading/trailing slashes removed.
child_href_list = [
    "https://dy.dytt8.net/" + anchor.group('link').strip("/")
    for section in obj.finditer(resp.text)
    for anchor in obj1.finditer(section.group('ul'))
]
print(child_href_list)
獲取子頁面內容
新增一個正則表示式,用來從子頁面中擷取電影名和下載地址
# Detail-page pattern: group "movie" is the text between the title label and
# the next "<br />"; group "magnet" is the href of the first
# target="_blank" anchor after it. DOTALL lets ".*?" span line breaks.
obj2 = re.compile(
    r'◎片 名(?P<movie>.*?)<br />.*?<a target="_blank" href="(?P<magnet>.*?)">',
    flags=re.DOTALL,
)
將爬取內容儲存到csv檔案中
# Open dytt.csv in append mode so existing rows are kept; newline="" leaves
# line-ending handling to the csv module.
f = open(
    "dytt.csv",
    mode="a+",
    newline="",
    encoding='utf8',
)
csv_writer = csv.writer(f)
完整程式碼
import csv
import re
from urllib.parse import urljoin

import requests

# Request headers sent with every fetch. "XXXXXXX" looks like a placeholder;
# presumably it should be replaced with a real browser User-Agent string.
header = {
    "User-Agent": "XXXXXXX"
}
domain = "https://dy.dytt8.net/index.htm"

# Seconds to wait for the server before giving up, so a stalled connection
# cannot hang the script indefinitely.
REQUEST_TIMEOUT = 10

# obj isolates the first <ul> after the section heading, obj1 captures each
# single-quoted href inside it, and obj2 captures the film title and the
# href of the first target="_blank" link on a detail page.
# NOTE(review): the heading/label literals below are written in Traditional
# Chinese characters, which the gb2312 codec cannot represent. If the live
# pages use Simplified text these patterns will never match -- confirm
# against the actual HTML before relying on them.
obj = re.compile(r"最新電影更新:.*?<ul>(?P<ul>.*?)</ul>", re.S)
obj1 = re.compile(r"<a href='(?P<link>.*?)'", re.S)
obj2 = re.compile(r'◎片 名(?P<movie>.*?)<br />.*?<a target="_blank" href="(?P<magnet>.*?)">', re.S)

# verify=False disables TLS certificate verification for the request.
resp = requests.get(domain, verify=False, headers=header, timeout=REQUEST_TIMEOUT)
resp.encoding = 'gb2312'

# Resolve every href against the index URL; urljoin copes with
# root-relative, page-relative and already-absolute links alike.
child_href_list = [
    urljoin(domain, anchor.group('link'))
    for section in obj.finditer(resp.text)
    for anchor in obj1.finditer(section.group('ul'))
]

# The with-block guarantees the CSV is flushed and closed even if one of the
# child-page requests raises part-way through.
with open("dytt.csv", mode="a+", newline="", encoding='utf8') as f:
    csv_writer = csv.writer(f)
    for href in child_href_list:
        child_resp = requests.get(href, verify=False, headers=header, timeout=REQUEST_TIMEOUT)
        child_resp.encoding = 'gb2312'
        for match in obj2.finditer(child_resp.text):
            dic = match.groupdict()
            # str.strip() already removes U+3000 ideographic spaces along
            # with ordinary whitespace, so one call is enough.
            dic['movie'] = dic['movie'].strip()
            print(dic)
            # Write the columns explicitly: title first, then the link.
            csv_writer.writerow([dic['movie'], dic['magnet']])