Python爬蟲段子網全程式碼
阿新 • • 發佈:2018-12-29
程式碼如下:
# Scrape joke listing pages from duanziwang.com and forward each joke to a
# WeChat friend whenever a text message containing a page number arrives.
import itchat
import requests
from bs4 import BeautifulSoup

itchat.auto_login(enableCmdQR='-1', hotReload=True)


def send(url):
    """Scrape the listing page at *url* and send every joke over WeChat.

    Args:
        url: Address of one listing page.

    Returns:
        The ``href`` of the page's element with class ``next``, or ``None``
        when the page has no such element or the recipient was not found.

    Raises:
        requests.RequestException: Propagated from ``requests.get`` (for
            example when the request times out).
    """
    users = itchat.search_friends(name=u'傳送物件微信名')
    if not users:
        # No friend matched the configured name; users[0] would raise.
        return None
    recipient = users[0]['UserName']

    # A timeout keeps a stalled server from blocking the message handler.
    start_html = requests.get(url, timeout=10)
    soup = BeautifulSoup(start_html.text, 'lxml')

    for post in soup.find_all(attrs={'class': 'post'}):
        title = post.find(attrs={'class': 'post-title'})
        body = post.find(attrs={'class': 'post-content'})
        if title is None or body is None:
            # Skip entries that lack a title or a body instead of crashing.
            continue
        # Title and body are joined with no separator, as one message.
        itchat.send(title.get_text() + body.get_text(), toUserName=recipient)

    # find() yields None when nothing matches, so guard before reading href.
    next_link = soup.find(attrs={'class': 'next'})
    return next_link.get('href') if next_link is not None else None


@itchat.msg_register(itchat.content.TEXT)
def print_content(msg):
    """Treat an all-digit text message as a page number and send that page.

    Args:
        msg: Message object supplied by itchat; only ``msg['Text']`` is read.
    """
    y = str(msg['Text'])
    if y.isdigit():
        url = 'http://duanziwang.com/category/%E7%BB%8F%E5%85%B8%E6%AE%B5%E5%AD%90/{}/'
        send(url.format(y))


itchat.run()
1、微信登入
itchat.auto_login(enableCmdQR='-1',hotReload=True)
2、獲取傳送的物件
users=itchat.search_friends(name=u'傳送物件微信名')
userName=users[0]['UserName']
3、獲取要爬取的網頁物件
start_html = requests.get(url)
soup = BeautifulSoup(start_html.text, 'lxml')
4、解析網頁內容並傳送
# Every element on the page whose class is "post".
posts = soup.find_all(attrs={'class': 'post'})
# href of the element whose class is "next".
url = soup.find(attrs={'class': 'next'}).get('href')
for post in posts:
    title = str(post.find(attrs={'class': 'post-title'}).get_text())
    body = str(post.find(attrs={'class': 'post-content'}).get_text())
    # Title and body are concatenated with no separator into one message.
    itchat.send(title + body, toUserName=userName)
其中
url=soup.find(attrs={'class': 'next'}).get('href')
是獲取下一頁的連結,本想實現微信輸入下一頁,自動傳送下一頁網頁內容,但是沒能實現。
5、獲取對方微信傳送的內容,根據內容拼接網頁連結,呼叫傳送函式。
@itchat.msg_register(itchat.content.TEXT)
def print_content(msg):
    """Treat an all-digit text message as a page number and send that page.

    Args:
        msg: Message object supplied by itchat; only ``msg['Text']`` is read.
    """
    y = str(msg['Text'])
    # Only an all-digit message is a page number; any other text is ignored
    # rather than being formatted into the URL and fetched.
    if y.isdigit():
        url = 'http://duanziwang.com/category/%E7%BB%8F%E5%85%B8%E6%AE%B5%E5%AD%90/{}/'
        url = url.format(y)
        send(str(url))
6、由於這個網頁總共40多頁,所以微信輸入的數字要在這之內,輸入其他內容,不會爬取網頁內容。另外,為了方便測試,可以將傳送物件改為檔案傳輸助手,程式碼如下:
import itchat
import requests
from bs4 import BeautifulSoup
# Log in to WeChat before any handler is registered or message is sent.
# NOTE(review): per the itchat docs, enableCmdQR draws the login QR code in
# the terminal and hotReload reuses a cached session; confirm that the string
# '-1' is accepted the same way as the integer -1.
itchat.auto_login(enableCmdQR='-1',hotReload=True)
def send(url):
    """Scrape the listing page at *url* and send every joke to 'filehelper'.

    Args:
        url: Address of one listing page.

    Returns:
        The ``href`` of the page's element with class ``next``, or ``None``
        when the page has no such element.

    Raises:
        requests.RequestException: Propagated from ``requests.get`` (for
            example when the request times out).
    """
    # 'filehelper' is passed straight to itchat.send as the recipient, so no
    # friend lookup through search_friends is needed in this variant.
    # A timeout keeps a stalled server from blocking the message handler.
    start_html = requests.get(url, timeout=10)
    soup = BeautifulSoup(start_html.text, 'lxml')

    for post in soup.find_all(attrs={'class': 'post'}):
        title = post.find(attrs={'class': 'post-title'})
        body = post.find(attrs={'class': 'post-content'})
        if title is None or body is None:
            # Skip entries that lack a title or a body instead of crashing.
            continue
        # Title and body are joined with no separator, as one message.
        itchat.send(title.get_text() + body.get_text(), toUserName='filehelper')

    # find() yields None when nothing matches, so guard before reading href.
    next_link = soup.find(attrs={'class': 'next'})
    return next_link.get('href') if next_link is not None else None
@itchat.msg_register(itchat.content.TEXT)
def print_content(msg):
    """Send the requested page when the message text is all digits.

    Args:
        msg: Message object supplied by itchat; only ``msg['Text']`` is read.
    """
    text = str(msg['Text'])
    if not text.isdigit():
        # Anything that is not a page number is ignored.
        return
    page_url = 'http://duanziwang.com/category/%E7%BB%8F%E5%85%B8%E6%AE%B5%E5%AD%90/{}/'.format(text)
    send(page_url)
# Hand control to itchat so that the handler registered with msg_register is
# invoked for incoming messages (presumably blocks until logout — confirm).
itchat.run()