1. 程式人生 > >Python爬蟲段子網全程式碼

Python爬蟲段子網全程式碼

程式碼如下:

import itchat
import requests
from bs4 import BeautifulSoup

# Log in to WeChat before the message handler below is registered.
# NOTE(review): the meaning of enableCmdQR='-1' and hotReload=True is defined
# by itchat, not by this script -- confirm against the itchat documentation.
itchat.auto_login(enableCmdQR='-1',hotReload=True)

def send(url):
    """Scrape the jokes on one listing page and send each to a WeChat friend.

    Every element with class ``post`` becomes one message made of its
    ``post-title`` text followed by its ``post-content`` text.

    Args:
        url: Address of the listing page to fetch.

    Returns:
        The ``href`` of the page's ``next`` link, or ``None`` when the friend
        cannot be found, the request fails, or the page has no such link.
    """
    # Resolve the recipient first so no request is made when nobody could
    # receive the result. The name literal reads as a placeholder for the
    # friend's WeChat name -- presumably it must be edited before use.
    users = itchat.search_friends(name=u'傳送物件微信名')
    if not users:
        print('send: no WeChat friend matched the configured name')
        return None
    user_name = users[0]['UserName']

    try:
        # A timeout keeps the message handler from hanging on a dead server.
        response = requests.get(url, timeout=10)
        response.raise_for_status()
    except requests.RequestException as exc:
        print('send: could not fetch {}: {}'.format(url, exc))
        return None

    soup = BeautifulSoup(response.text, 'lxml')
    for post in soup.find_all(attrs={'class': 'post'}):
        title = post.find(attrs={'class': 'post-title'})
        body = post.find(attrs={'class': 'post-content'})
        if title is None or body is None:
            # find() yields None for a missing element; skip incomplete
            # posts instead of failing on .get_text().
            continue
        itchat.send(title.get_text() + body.get_text(), toUserName=user_name)

    # A page without a "next" element makes find() return None; calling
    # .get('href') on it unconditionally used to raise AttributeError.
    next_link = soup.find(attrs={'class': 'next'})
    return next_link.get('href') if next_link is not None else None
@itchat.msg_register(itchat.content.TEXT)
def print_content(msg):
    """Handle a text message: a numeric message requests that page of jokes.

    Args:
        msg: Incoming message; only its ``'Text'`` entry is read.

    Messages whose text is not made up solely of digits are ignored.
    """
    page = str(msg['Text'])
    if not page.isdigit():
        return
    # The digits become the page number at the end of the category URL.
    url = 'http://duanziwang.com/category/%E7%BB%8F%E5%85%B8%E6%AE%B5%E5%AD%90/{}/'
    send(url.format(page))
# Hand control to itchat so the handler registered above can receive messages
# (presumably blocks until logout -- confirm against the itchat documentation).
itchat.run()

1、微信登入

 itchat.auto_login(enableCmdQR='-1',hotReload=True)

2、獲取傳送的對象(接收訊息的微信好友)

users=itchat.search_friends(name=u'傳送物件微信名')
userName=users[0]['UserName']

3、獲取要爬取的網頁,並建立解析用的物件


   start_html = requests.get(url)   
   soup = BeautifulSoup(start_html.text, 'lxml')

4、解析網頁內容並傳送

 # Excerpt from the body of send(): `soup` and `userName` are set earlier there.
 posts=soup.find_all(attrs={'class': 'post'})
 url=soup.find(attrs={'class': 'next'}).get('href')
 for post in posts:
     # One message per post: title text followed by content text.
     content=str(post.find(attrs={'class': 'post-title'}).get_text())+\
             str(post.find(attrs={'class': 'post-content'}).get_text())
     # The send belongs to the loop body, not to the continued expression.
     itchat.send(content, toUserName=userName)

其中

url=soup.find(attrs={'class': 'next'}).get('href')

是獲取下一頁的連結。本想實現在微信輸入「下一頁」後,自動傳送下一頁的網頁內容,但是沒能實現。要注意:若頁面上沒有 class 為 next 的元素(例如最後一頁),soup.find(...) 會回傳 None,這一行就會丟出 AttributeError。
5、獲取對方在微信傳送的內容,根據內容拼接網頁連結,再呼叫傳送函式 send。

@itchat.msg_register(itchat.content.TEXT)
def print_content(msg):
    """Handle a text message: a numeric message requests that page of jokes.

    Args:
        msg: Incoming message; only its ``'Text'`` entry is read.

    Messages whose text is not made up solely of digits are ignored, as in
    the complete listings of this script.
    """
    page = str(msg['Text'])
    if not page.isdigit():
        # Without this guard any chat text would be spliced into the URL
        # and fetched.
        return
    url = 'http://duanziwang.com/category/%E7%BB%8F%E5%85%B8%E6%AE%B5%E5%AD%90/{}/'
    send(url.format(page))

6、由於這個網站總共只有40多頁,所以在微信輸入的數字必須在這個頁數範圍內;輸入非數字的內容時,不會爬取網頁內容。另外,為了方便測試,可以將傳送對象改為檔案傳輸助手(filehelper),程式碼如下:

import itchat
import requests
from bs4 import BeautifulSoup

# Log in to WeChat before the message handler below is registered.
# NOTE(review): the meaning of enableCmdQR='-1' and hotReload=True is defined
# by itchat, not by this script -- confirm against the itchat documentation.
itchat.auto_login(enableCmdQR='-1',hotReload=True)

def send(url):
    """Scrape the jokes on one listing page and send each to 'filehelper'.

    Every element with class ``post`` becomes one message made of its
    ``post-title`` text followed by its ``post-content`` text. The recipient
    is the fixed user name ``'filehelper'``, so no friend lookup is needed.

    Args:
        url: Address of the listing page to fetch.

    Returns:
        The ``href`` of the page's ``next`` link, or ``None`` when the
        request fails or the page has no such link.
    """
    try:
        # A timeout keeps the message handler from hanging on a dead server.
        response = requests.get(url, timeout=10)
        response.raise_for_status()
    except requests.RequestException as exc:
        print('send: could not fetch {}: {}'.format(url, exc))
        return None

    soup = BeautifulSoup(response.text, 'lxml')
    for post in soup.find_all(attrs={'class': 'post'}):
        title = post.find(attrs={'class': 'post-title'})
        body = post.find(attrs={'class': 'post-content'})
        if title is None or body is None:
            # find() yields None for a missing element; skip incomplete
            # posts instead of failing on .get_text().
            continue
        itchat.send(title.get_text() + body.get_text(), toUserName='filehelper')

    # A page without a "next" element makes find() return None; calling
    # .get('href') on it unconditionally used to raise AttributeError.
    next_link = soup.find(attrs={'class': 'next'})
    return next_link.get('href') if next_link is not None else None
@itchat.msg_register(itchat.content.TEXT)
def print_content(msg):
    """Handle a text message: a numeric message requests that page of jokes.

    Args:
        msg: Incoming message; only its ``'Text'`` entry is read.

    Messages whose text is not made up solely of digits are ignored.
    """
    page = str(msg['Text'])
    if not page.isdigit():
        return
    # The digits become the page number at the end of the category URL.
    url = 'http://duanziwang.com/category/%E7%BB%8F%E5%85%B8%E6%AE%B5%E5%AD%90/{}/'
    send(url.format(page))
# Hand control to itchat so the handler registered above can receive messages
# (presumably blocks until logout -- confirm against the itchat documentation).
itchat.run()