
Boosting Blog View Counts with a Python Crawler

Version-2.0

I updated a new version in my spare time. The changes are as follows:
1. Added fetching of a user's articles across multiple list pages
2. Added randomness to the click order
3. Added randomness to the timing between clicks
4. Added randomness to which articles are clicked
5. Added multiple rounds of clicks

Shortcomings

1. The User-Agent is not yet set
2. No proxy IPs are used yet (a possible approach to both points is sketched below)
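A minimal sketch of how both points might be handled with requests; the User-Agent strings and proxy addresses below are placeholder assumptions, not values from this post. The helper could then stand in for the bare requests.get() calls inside click_article_url further down:

import random
import requests

#Placeholder User-Agent strings (assumed; collect real ones separately)
USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)',
]

#Placeholder proxies (assumed; replace with working proxy addresses)
PROXIES = [
    {'http': 'http://127.0.0.1:8080', 'https': 'http://127.0.0.1:8080'},
]

def fetch(url):
    #Rotate the UA header and the proxy on every request
    headers = {'User-Agent': random.choice(USER_AGENTS)}
    proxies = random.choice(PROXIES)
    return requests.get(url, headers=headers, proxies=proxies, timeout=10)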

import requests
from bs4 import BeautifulSoup
import time
import random

#Fetch a user's article URLs across multiple list pages
def get_writer_article_list(base_url, page_num):
    all_article_list = []
    for i in range(page_num):
        index = i + 1
        print('cur index is ' + str(index))
        #List pages are numbered starting from 1
        cur_page_url = base_url + str(index)
        all_article_list = get_article_list(cur_page_url) + all_article_list
    return all_article_list
        

#Fetch the URLs of all articles on a single list page
def get_article_list(base_url):
    web_data = requests.get(base_url)
    soup = BeautifulSoup(web_data.text,'lxml')
    divs = soup.find_all('div', class_='article-item-box csdn-tracking-statistics')
    
    url_list = []
    for div in divs:
        label = div.find_all('a')
        url = label[0].get('href')
        url_list.append(url)
    return url_list

#Generate a random list of URLs for one round of clicks
def click_random(url_list, min_random_rate):
    new_url_list = []
    max_url_count = len(url_list)
    #Each round clicks between min_random_rate*N and N URLs, sampled with replacement
    min_url_count = int(max_url_count * min_random_rate)
    term_url_count = random.randint(min_url_count, max_url_count)
    for i in range(term_url_count):
        random_index = random.randint(0, max_url_count - 1)
        new_url_list.append(url_list[random_index])
    return new_url_list


#Run multiple rounds of clicks with randomized pauses between clicks and rounds
def click_article_url(term_num,click_random_start,click_random_end,term_random_start,term_random_end,all_list):
    for i in range(term_num):
        term_url_list = click_random(all_list, 0.7)
        for url in term_url_list:
            requests.get(url)
            print('click for ' + url)
            #Random pause between individual clicks
            click_sleep_time = random.randint(click_random_start, click_random_end)
            time.sleep(click_sleep_time)
            print('sleep for ' + str(click_sleep_time))
        finished_terms = i + 1
        print('finish the term of ' + str(finished_terms))
        #Random pause between rounds
        term_sleep_time = random.randint(term_random_start, term_random_end)
        time.sleep(term_sleep_time)
        print('sleep for the term ' + str(term_sleep_time))

base_url1 = "https://blog.csdn.net/xxx1/article/list/"
base_url2 = "https://blog.csdn.net/xxx2/article/list/"

url_list_1 = get_writer_article_list(base_url1,2)
url_list_2 = get_writer_article_list(base_url2,2)

all_list = url_list_1 + url_list_2

#200 rounds, 8-50 s between clicks, 30-60 s between rounds
click_article_url(200,8,50,30,60,all_list)

Version-1.0

During a lunch break I used Python to write a small toy that boosts the read count of blog articles. It is still immature and I will iterate on it soon.

Current overall approach: fetch the user's article list page, parse the HTML, extract the URL of every article on that page into a list, then loop over the URLs in the list and request each one to register the clicks.

import requests
from bs4 import BeautifulSoup
import time

#Crawl the blog of my follower 周英俊
base_url = "https://blog.csdn.net/qq_38835878/article/list/1"

web_data = requests.get(base_url)
soup = BeautifulSoup(web_data.text,'lxml')
divs = soup.find_all('div', class_='article-item-box csdn-tracking-statistics')

url_list = []
for div in divs:
    label = div.find_all('a')
    url = label[0].get('href')
    url_list.append(url)

#Visit each article in order with a fixed 61-second pause
for url in url_list:
    requests.get(url)
    print('request for ' + url)
    time.sleep(61)
    print('sleep for 61s')

Shortcomings

  1. No pagination support
  2. No proxy IPs
  3. No User-Agent set
  4. Behavior is highly regular: the session length never varies
  5. The click pattern is obvious: all articles are visited in sequential order (a minimal alternative is sketched after this list).
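Points 4 and 5 are what the randomization in Version-2.0 addresses. As a lighter alternative for the v1 loop, the order and timing could be randomized directly; this is only a sketch that reuses the url_list built above, and the sleep bounds are arbitrary assumptions:

import random

random.shuffle(url_list)   #visit the articles in a random order
for url in url_list:
    requests.get(url)
    print('request for ' + url)
    #Jitter the pause so each "session" has a different length
    sleep_time = random.uniform(45, 90)
    time.sleep(sleep_time)
    print('sleep for ' + str(round(sleep_time, 1)) + 's')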