1. 程式人生 > 其它 >python爬蟲學習(一)requests爬取dytt下載地址

python爬蟲學習(一)requests爬取dytt下載地址

當網站使用 HTTPS 而其安全證書無法通過驗證時,可以在請求中加上 verify=False 來跳過證書驗證(這會降低安全性,僅適合學習用途)。另外,因為 dytt 頁面使用的字元編碼是 gb2312,所以解碼時也要指定 gb2312,否則中文會變成亂碼。

import requests
# Index page of the site to crawl.
domain = "https://dy.dytt8.net/index.htm"
# verify=False skips TLS certificate verification; the timeout keeps the
# call from blocking forever if the server never answers.
resp = requests.get(domain, verify=False, timeout=10)
# The page is encoded as gb2312, so decode it with the same charset.
resp.encoding = 'gb2312'
print(resp.text)

通過 re 模組(正則表示式)從首頁內容中提取對應的 URL,並把每個子頁面的完整地址儲存到一個列表(list)中

import re

# Capture the first <ul> that follows the section heading; its <li> entries
# hold the links to the individual movie pages.
# NOTE(review): the heading literal is written in Simplified Chinese because
# the page is decoded as gb2312, which cannot represent the Traditional
# character "電" -- a Traditional heading could never match. Confirm the
# exact heading text (including the colon) against the live page.
obj = re.compile(r"最新电影更新:.*?<ul>(?P<ul>.*?)</ul>", re.S)
# Capture the target of every single-quoted href inside that <ul>.
obj1 = re.compile(r"<a href='(?P<link>.*?)'", re.S)

result = obj.finditer(resp.text)
child_href_list = []
for i in result:
    ul = i.group('ul')
    # Extract the sub-page links
    result1 = obj1.finditer(ul)
    for i1 in result1:
        # Links are site-relative; drop the surrounding "/" before joining
        # them onto the site root.
        child_href = "https://dy.dytt8.net/" + i1.group('link').strip("/")
        child_href_list.append(child_href)
print(child_href_list)

獲取子頁面內容

新增用來從子頁面中提取電影名(movie)和下載地址(magnet)的正則表示式

# Sub-page pattern: "movie" captures the text between the title label and the
# next <br />; "magnet" captures the href of the first target="_blank" anchor
# that follows. DOTALL lets ".*?" run across line breaks in the HTML.
obj2 = re.compile(
    r'◎片  名(?P<movie>.*?)<br />.*?<a target="_blank" href="(?P<magnet>.*?)">',
    flags=re.DOTALL,
)

將爬取內容儲存到csv檔案中

import csv

# Open the output file in append mode; newline="" leaves line-ending handling
# to the csv module. Close f (or use a with-block) once all rows are written.
f = open("dytt.csv", mode="a+", newline="", encoding='utf8')
csv_writer = csv.writer(f)

完整程式碼

import requests
import re
import csv
# Headers sent with every request; replace the placeholder with a real
# browser User-Agent string before running.
header = {
    "User-Agent": "XXXXXXX"
}
# Index page that lists the latest movies.
domain = "https://dy.dytt8.net/index.htm"
# Seconds to wait for the server before giving up on a request.
REQUEST_TIMEOUT = 10

# Capture the first <ul> that follows the section heading on the index page.
# NOTE(review): the heading literal is written in Simplified Chinese because
# pages are decoded as gb2312, which cannot represent the Traditional
# character "電" -- a Traditional heading could never match. Confirm the
# exact heading text (including the colon) against the live page.
obj = re.compile(r"最新电影更新:.*?<ul>(?P<ul>.*?)</ul>", re.S)
# Capture the target of every single-quoted href inside that <ul>.
obj1 = re.compile(r"<a href='(?P<link>.*?)'", re.S)
# On a movie page: "movie" is the text after the title label up to <br />,
# "magnet" is the href of the first target="_blank" anchor after it.
obj2 = re.compile(r'◎片  名(?P<movie>.*?)<br />.*?<a target="_blank" href="(?P<magnet>.*?)">', re.S)


def fetch_html(url):
    """Download *url* and return its body decoded as gb2312.

    Certificate verification is disabled (verify=False), and a timeout is
    set so a silent server cannot hang the crawl.
    """
    resp = requests.get(url, verify=False, headers=header,
                        timeout=REQUEST_TIMEOUT)
    resp.encoding = 'gb2312'
    return resp.text


def parse_child_links(index_html):
    """Return the absolute movie-page URLs found in the index page HTML.

    Returns an empty list when the heading or the <ul> is not present.
    """
    child_href_list = []
    for ul_match in obj.finditer(index_html):
        for link_match in obj1.finditer(ul_match.group('ul')):
            # Links are site-relative; drop the surrounding "/" before
            # joining them onto the site root.
            link = link_match.group('link').strip("/")
            child_href_list.append("https://dy.dytt8.net/" + link)
    return child_href_list


def parse_movies(page_html):
    """Return a list of {'movie': ..., 'magnet': ...} dicts from a movie page."""
    movies = []
    for match in obj2.finditer(page_html):
        dic = match.groupdict()
        # str.strip() with no argument also removes the ideographic space
        # (U+3000) that pads the title.
        dic['movie'] = dic['movie'].strip()
        movies.append(dic)
    return movies


def main():
    """Crawl the index page and append each movie's name and link to dytt.csv."""
    child_href_list = parse_child_links(fetch_html(domain))
    # The with-block guarantees the file is flushed and closed even if a
    # request fails midway; newline="" leaves line endings to the csv module.
    with open("dytt.csv", mode="a+", newline="", encoding='utf8') as f:
        csv_writer = csv.writer(f)
        for href in child_href_list:
            for dic in parse_movies(fetch_html(href)):
                print(dic)
                # Write the record as one CSV row: movie name, then link.
                csv_writer.writerow([dic['movie'], dic['magnet']])


if __name__ == "__main__":
    main()