爬蟲實戰入門

阿新 • • 發佈：2022-12-01

爬蟲0（正則表示式查詢，get）

11月15日

##獲取豆瓣電影top250



import requests
import re


# Fetch the Douban Top 250 listing. The page is served through plain GET
# requests (as observed in the browser's F12 network panel).
url = 'https://movie.douban.com/top250'
headers = {
    'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:106.0) Gecko/20100101 Firefox/106.0'
}

# Compiled once, outside the loop. Captures the text of the first
# <span class="title"> that follows each <div class="info">.
obj = re.compile(r'<div class="info">.*?<span class="title">(?P<name>.*?)</span>', re.S)

# Every page lists 25 movies, so the full Top 250 needs 10 requests with
# start = 0, 25, ..., 225. The query parameters are rebuilt per page.
for i in range(0, 226, 25):
    param = {
        'start': i,
        'filter': ''
    }
    # The context manager closes every response as soon as its text has been
    # read; a single close() after the loop would only release the last one.
    with requests.get(url, headers=headers, params=param) as resp:
        text = resp.text

    # Print every movie title found on this page.
    for name in obj.finditer(text):
        print(name.group('name'))

獲取豆瓣電影top250

使用requests庫和re庫（正則表示式）

爬蟲1 （正則表示式查詢，進入子連結，get）

11月16日

import requests
import re
import urllib3
# Every request below disables certificate verification (verify=False), so
# silence the resulting urllib3 warnings once, before the first request,
# instead of inside the per-movie loop.
urllib3.disable_warnings()

# Placeholder kept from the original script; it is not passed to any request.
headers = {

}
url = 'https://www.dytt89.com/'

# All patterns are compiled once up front rather than on every iteration.
# NOTE(review): the greedy ".*" makes this capture the LAST <ul>...</ul> after
# the heading rather than the nearest one - confirm that this is intended.
obj = re.compile(r'2022必看熱片.*<ul>(?P<ul>.*?)</ul>', re.S)
# Relative link of each movie inside the captured list.
obj2 = re.compile(r"href='/(?P<son_link>.*?)' title=")
# Movie name, taken from the detail page's <title>.
obj3 = re.compile(r"<title>(?P<name>.*?)_電影天堂</title>", re.S)
# Text of the download anchor inside a table row of the detail page.
obj4 = re.compile(r"<tr>.*?<td style=.*?<a href=\".*?\">(?P<link_xunlei>.*?)</a>.*?</tr>.*?</tbody>", re.S)

# Download the home page; the context manager closes the connection.
with requests.get(url, verify=False) as resp:
    resp.encoding = 'gb2312'
    resp_text = resp.text

# Keep the list captured by the last match. Defaulting to an empty string
# means a page without the heading yields no sub-links instead of raising
# NameError further down.
ul_text = ''
for it in obj.finditer(resp_text):
    ul_text = it.group('ul')

# Collect the relative links of all movies in that list.
son_link_list = [match.group('son_link') for match in obj2.finditer(ul_text)]

for son_link_tem in son_link_list:
    # `url` already ends with "/", and the pattern strips the leading "/".
    url_tem = url + son_link_tem
    print(url_tem)

    # Fetch the detail page and close each response as soon as it is read.
    with requests.get(url_tem, verify=False) as resp:
        resp.encoding = 'gb2312'
        resp_text = resp.text

    for name in obj3.finditer(resp_text):
        print(name.group("name"))

    # Only the first download link on the page is reported.
    link = obj4.search(resp_text)
    if link is not None:
        print(link.group("link_xunlei"))
        print('\n')

採用requests庫和re庫

通過requests傳送get或post請求，得到網頁原始碼，然後用正則表示式搜尋原始碼文字，找到電影子連結並進入，拿到電影名稱以及它的磁力連結

爬蟲2（找到菜名，菜價 post）

（post方式）

import requests
import re

# Query the xinfadi endpoint with a POST request and print product names and
# prices extracted from the response text.
url = 'http://www.xinfadi.com.cn/getCat.html'
headers = {
    'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:106.0) Gecko/20100101 Firefox/106.0'
}
# Form payload sent with the POST request.
# NOTE(review): "1186" is presumably a product category id - confirm.
dw = {
    'prodCatid': "1186"
}

# One pattern per field; each captures the quoted value following its key.
obj1 = re.compile(r'"prodName":"(?P<prodName>.*?)"', re.S)
obj2 = re.compile(r'"lowPrice":"(?P<lowPrice>.*?)"', re.S)
obj3 = re.compile(r'"highPrice":"(?P<highPrice>.*?)"', re.S)

# The context manager closes the response once its body has been read; the
# original script never released it.
with requests.post(url, headers=headers, data=dw) as resp:
    text = resp.text

# Output order is unchanged: all names first, then all lowest prices, then
# all highest prices (the values are not paired per product).
for find in obj1.finditer(text):
    print(find.group('prodName'))
for find in obj2.finditer(text):
    print(find.group('lowPrice'))
for find in obj3.finditer(text):
    print(find.group('highPrice'))

抓包找post

模擬傳送請求得到響應，然後對響應分析

找出了菜名，菜的最低價、最高價

當然這個沒排序，學會匯出表格的話應該還湊活用。。

爬蟲3（豆瓣top250 demo版本，採用bs4中的BeautifulSoup函式）

##採用bs4中的BeautifulSoup 通過標籤來查詢內容

#首先還是獲得頁面原始碼
import requests
from bs4 import  BeautifulSoup
# Download the first page of the Douban Top 250 and print its titles by
# looking elements up by tag instead of by regular expression.
url = 'https://movie.douban.com/top250'
headers = {
    'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:106.0) Gecko/20100101 Firefox/106.0'
}
resp = requests.get(url, headers=headers)

# Parse the page so tags can be queried; "html.parser" names the parser that
# BeautifulSoup should use for this HTML text.
page = BeautifulSoup(resp.text, "html.parser")

# find() returns only the first matching tag, find_all() returns every match.
movie_list = page.find("ol", class_='grid_view')
title_spans = movie_list.find_all('span', class_='title')

for span in title_spans:
    # .text is the content enclosed by the tag; each title is followed by
    # one blank line.
    print(span.text, end='\n\n')

# Release the connection.
resp.close()

採用bs4庫中的BeautifulSoup類

通過標籤來查詢內容

爬蟲4：

使用了正則表示式和BeautifulSoup

報錯了

爬整個網站的圖：（太爽了）

import requests
from bs4 import  BeautifulSoup
import re

# Crawl the netbian.com index pages, follow every relative link in each
# page's "list" block and print the image URLs found on the detail pages.
url_raw = 'http://www.netbian.com/'
headers = {
    'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:106.0) Gecko/20100101 Firefox/106.0'
}

# Compiled once instead of on every (nested) iteration.
href_pattern = re.compile(r'<a href="(?P<href>.*?)"', re.S)
src_pattern = re.compile(r'src="(?P<link>.*?)"', re.S)

for i in range(1, 251, 1):
    # Page 1 is the site root; later pages are index_<n>.htm.
    son_url = f'index_{i}.htm' if i != 1 else ''
    url = url_raw + son_url

    # Fetch the index page; the context manager closes each response.
    with requests.get(url, headers=headers) as resp:
        resp.encoding = 'gbk'
        page = BeautifulSoup(resp.text, 'html.parser')

    listing = page.find('div', class_='list')
    if listing is None:
        # No list block on this page, so there is nothing to follow.
        continue

    for href in href_pattern.finditer(str(listing)):
        target = href.group("href")
        # Skip empty hrefs (indexing them used to raise IndexError) and
        # links starting with "h", which the original script also ignored.
        if not target or target.startswith('h'):
            continue
        link = url_raw + target.strip('/')

        # Follow the sub-link, sending the same User-Agent as for the index
        # pages (the original omitted the headers here).
        with requests.get(link, headers=headers) as resp:
            resp.encoding = 'gbk'
            detail = BeautifulSoup(resp.text, 'html.parser')

        # Either container may be missing; skip such pages instead of
        # crashing with AttributeError on None.
        endpage = detail.find('div', class_='endpage')
        pic = endpage.find('div', class_="pic") if endpage is not None else None
        if pic is None:
            continue

        for img in src_pattern.finditer(str(pic)):
            print(img.group("link"))