
Scraping Douban movie reviews with Python

Adapted from someone else's code. It scrapes the short reviews of a single movie; without a simulated login it can only crawl about 6 pages.

# -*- encoding:utf-8 -*-

import requests
from bs4 import BeautifulSoup
import re
import random
import io
import sys
import time

# use a session so that login cookies would be kept across requests
s = requests.session()


# fetch a pool of proxy IPs so our own IP does not get banned
def get_ip_list(url, headers):
    web_data = requests.get(url, headers=headers)
    soup = BeautifulSoup(web_data.text, 'lxml')
    ips = soup.find_all('tr')
    ip_list = []
    for i in range(1, len(ips)):  # skip the table header row
        ip_info = ips[i]
        tds = ip_info.find_all('td')
        ip_list.append(tds[1].text + ':' + tds[2].text)  # IP column + ':' + port column
    return ip_list


# pick a random IP from the proxy pool
def get_random_ip(ip_list):
    proxy_list = []
    for ip in ip_list:
        proxy_list.append('http://' + ip)
    proxy_ip = random.choice(proxy_list)
    proxies = {'http': proxy_ip}
    return proxies
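The dictionary returned here is the format `requests` expects for its `proxies` argument: scheme mapped to proxy URL. Note that because only an `'http'` key is set, requests to https:// URLs (such as Douban's) bypass the proxy entirely. A minimal usage sketch, with made-up addresses rather than real proxies:

# minimal sketch: pick one proxy from a placeholder pool and request through it
sample_ips = ['112.85.164.23:9999', '117.69.12.81:8118']  # placeholders, not real proxies
proxies = get_random_ip(sample_ips)  # e.g. {'http': 'http://112.85.164.23:9999'}
resp = requests.get('http://movie.douban.com/subject/26322642/comments',
                    proxies=proxies, timeout=10)
print(resp.status_code)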



# extract the comment texts and the link to the next page
def get_data(html):
    soup = BeautifulSoup(html, "lxml")
    comment_list = soup.select('.comment > p')
    next_page = soup.select('.next')[0].get('href')
    return comment_list, next_page
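Two details about `get_data`: the href of the `.next` element is a relative query string (something like `?start=20&limit=20&sort=new_score&status=P`), which is why the main loop below appends it directly to the base URL, and `soup.select('.next')[0]` raises an `IndexError` once there is no next page. A slightly more defensive variant, assuming the same page structure:

# defensive variant: return None for next_page when there is no ".next" link
def get_data_safe(html):
    soup = BeautifulSoup(html, "lxml")
    comment_list = soup.select('.comment > p')
    next_links = soup.select('.next')
    next_page = next_links[0].get('href') if next_links else None
    return comment_list, next_page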


if __name__ == "__main__":
    sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf8')
    absolute = 'https://movie.douban.com/subject/26322642/comments'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.89 Safari/537.36'}
    loginUrl = 'https://www.douban.com/accounts/login?source=movie'
    formData = {
        "redir": "https://movie.douban.com/subject/26322642/comments?start=201&limit=20&sort=new_score&status=P&percent_type=",
        "form_email": "
[email protected]
", "form_password": "yyf15997588668", "login": u'登入' } # 獲取動態ip url = 'http://www.xicidaili.com/nn/' ip_list = get_ip_list(url, headers=headers) proxies = get_random_ip(ip_list) current_page = absolute next_page = "" comment_list = [] temp_list = [] num = 0 ans = 0 while (1): ans+=1 print("爬取第" + str(ans) + "頁") time.sleep(5) html = s.get(current_page, headers=headers, proxies=proxies).content temp_list, next_page = get_data(html) if ans is 7: break current_page = absolute + next_page comment_list = comment_list + temp_list # time.sleep(1 + float(random.randint(1, 100)) / 20) num = num + 1 # 每20次更新一次ip if num % 20 == 0: proxies = get_random_ip(ip_list) print(current_page) # 將爬取的評論寫入txt檔案中 with open("F:\comments.txt", 'a')as f: ark = 0 for node in comment_list: comment = node.get_text().strip().replace("\n", "") f.write(comment + "\n") ark += 1 print("寫了" + str(ark) + "個") f.close()