scrapy爬取網站案例
阿新 • • 發佈:2020-08-06
scrapy爬取網站案例
爬取抽屜網資料,存到Redis和MySQL中,實現持久化
MySQL實現
# settings.py
ITEM_PIPELINES = {
    'firstscrapy.pipelines.DrawerMysqlPipeline': 305,
}


# pipelines.py
# NOTE: `pymysql` must be imported at the top of pipelines.py.
class DrawerMysqlPipeline:
    """Scrapy item pipeline that inserts every scraped item into MySQL.

    Each item is written as one row of the ``drawer`` table (columns
    ``title``, ``url`` and ``img_url``).
    """

    def __init__(self):
        """Create the pipeline; the database is only opened in ``open_spider``."""
        self.conn = None
        self.cursor = None

    def open_spider(self, spider):
        """Open the MySQL connection and cursor when the spider starts.

        ``pymysql.connect`` accepts: host=None, user=None, password="",
        database=None, port=0, unix_socket=None, charset=''.
        """
        self.conn = pymysql.connect(
            host='127.0.0.1',
            port=3306,
            user='root',
            password='123456',
            database='drawer',
            charset='utf8',
        )
        self.cursor = self.conn.cursor(cursor=pymysql.cursors.DictCursor)

    def process_item(self, item, spider):
        """Insert ``item`` into the ``drawer`` table and return it unchanged.

        Raises:
            KeyError: if ``item`` lacks ``title``, ``url`` or ``img_url``.
            Exception: any database error is re-raised after a rollback.
        """
        sql = 'insert into drawer (title,url,img_url)values (%s,%s,%s)'  # SQL statement
        params = [item['title'], item['url'], item['img_url']]
        try:
            self.cursor.execute(sql, params)  # run the statement
            self.conn.commit()  # commit so the row is actually persisted
        except Exception:
            # Do not leave a half-finished transaction on the shared
            # connection; undo it and let Scrapy report the failure.
            self.conn.rollback()
            raise
        return item

    def close_spider(self, spider):
        """Close the cursor and the connection if they were opened."""
        if self.cursor is not None:
            self.cursor.close()
            self.cursor = None
        if self.conn is not None:
            self.conn.close()
            self.conn = None
Redis實現
import json

from redis import Redis


class DrawerRedisPipeline:
    """Scrapy item pipeline that pushes every item onto a Redis list as JSON."""

    def __init__(self):
        """Create the pipeline; the Redis client is only built in ``open_spider``."""
        self.conn = None

    def open_spider(self, spider):
        """Create the Redis client when the spider starts."""
        self.conn = Redis(host='127.0.0.1', port=6379)

    def process_item(self, item, spider):
        """Left-push the JSON-encoded item onto ``drawer_news`` and return it."""
        self.conn.lpush('drawer_news', json.dumps(dict(item)))
        return item

    def close_spider(self, spider):
        """Release the client's pooled connections when the spider finishes."""
        if self.conn is not None:
            # The client keeps sockets in its connection pool; drop them
            # explicitly instead of waiting for garbage collection.
            self.conn.connection_pool.disconnect()
            self.conn = None
爬取cnblogs文章,把標題和連結地址打印出來
import scrapy


class CnblogsSpider(scrapy.Spider):
    """Print author, title, link and publish time of the cnblogs home page posts."""

    name = 'cnblogs'
    allowed_domains = ['www.cnblogs.com']
    start_urls = ['https://www.cnblogs.com/']

    def parse(self, response, **kwargs):
        """Collect one dict per ``<article class="post-item">`` and print them.

        After the individual records, the number of records is printed.
        """
        # Relative XPath for each output field, in the order the keys are emitted.
        field_xpaths = {
            'author': './/a[@class="post-item-author"]/span/text()',
            'title': './/a[@class="post-item-title"]/text()',
            'link': './/a[@class="post-item-title"]/@href',
            'delivery_time': './/span[@class="post-meta-item"]/span/text()',
        }
        records = [
            {
                field: node.xpath(path).extract_first()
                for field, path in field_xpaths.items()
            }
            for node in response.xpath('//article[@class="post-item"]')
        ]
        for record in records:
            print(record)
        print(len(records))
資料持久化
儲存到MySQL資料庫中
# items.py
class ArticleItem(scrapy.Item):
    """Scrapy item describing one cnblogs article."""

    # Values read from the article listing.
    author = scrapy.Field()
    title = scrapy.Field()
    link = scrapy.Field()
    delivery_time = scrapy.Field()

    # Article body markup.
    content = scrapy.Field()
# cnblogs.py
import scrapy
from scrapy.http.request import Request
from firstscrapy.items import ArticleItem
# spider.py
import scrapy
from scrapy.http.request import Request
from firstscrapy.items import ArticleItem
class CnblogsSpider(scrapy.Spider):
    """Crawl the cnblogs listing pages and fetch each article's body markup.

    ``parse`` handles a listing page: it builds one ``ArticleItem`` per post
    and schedules a request for the post itself. ``content_parse`` handles the
    post page and completes the item.
    """

    name = 'cnblogs'
    allowed_domains = ['www.cnblogs.com']
    start_urls = ['https://www.cnblogs.com/']
    # Number of the most recently scheduled listing page.
    page_num = 1

    def content_parse(self, response, **kwargs):
        """Attach the ``#cnblogs_post_body`` markup to the item and return it."""
        item = response.meta.get('item')
        content = response.css('#cnblogs_post_body').extract_first()
        # extract_first() returns None when nothing matches; store an empty
        # string rather than the literal text 'None'.
        item['content'] = content or ''
        return item

    def parse(self, response, **kwargs):
        """Yield one detail request per listed article, then the next listing page."""
        for article in response.xpath('//article[@class="post-item"]'):
            item = ArticleItem()
            item['author'] = article.xpath('.//a[@class="post-item-author"]/span/text()').extract_first()
            item['title'] = article.xpath('.//a[@class="post-item-title"]/text()').extract_first()
            item['link'] = article.xpath('.//a[@class="post-item-title"]/@href').extract_first()
            item['delivery_time'] = article.xpath('.//span[@class="post-meta-item"]/span/text()').extract_first()
            if not item['link']:
                # Without a link there is no page to fetch the content from.
                continue
            # Schedule the detail page right away: each article is requested
            # exactly once and no state is shared between listing pages.
            yield Request(item['link'], meta={'item': item}, callback=self.content_parse)

        if self.page_num < 20:
            self.page_num += 1
            next_url = f'https://www.cnblogs.com/sitehome/p/{self.page_num}'
            yield Request(url=next_url, callback=self.parse)
自動給抽屜點贊
執行方案:
<1>先用 selenium獲取使用者登入的cookie
<2>再通過requests物件自動點贊
import json
import os
import time

from selenium import webdriver

# Login credentials. They can be overridden through the environment.
# NOTE(review): real credentials should not live in source code; the literal
# defaults are kept only so the script behaves as before when the variables
# are unset.
login_phone = os.environ.get("CHOUTI_PHONE", "18395806407")
login_password = os.environ.get("CHOUTI_PASSWORD", "wang931219peng")

# Load the browser driver.
bro = webdriver.Chrome(executable_path='./chromedriver.exe')
try:
    # Implicit wait of 10 s for element lookups.
    bro.implicitly_wait(10)
    bro.get("https://dig.chouti.com/")  # open the site in the browser

    # Find the login button on the page and open the login form.
    login_btn = bro.find_element_by_id("login_btn")
    login_btn.click()

    username = bro.find_element_by_name("phone")
    password = bro.find_element_by_name("password")
    username.send_keys(login_phone)
    time.sleep(1)
    password.send_keys(login_password)
    time.sleep(1)

    button = bro.find_element_by_css_selector("button.login-btn")
    button.click()
    # Pause before reading the cookies so the login request can complete.
    time.sleep(10)

    cookie_list = bro.get_cookies()
finally:
    # Always shut the browser down, even when a step above fails.
    bro.quit()

print(cookie_list)

# Reduce the cookie records to a plain name -> value mapping.
cookie = {entry['name']: entry['value'] for entry in cookie_list}

with open('cookie.txt', mode='w', encoding='utf-8') as fw:
    # Stored in a file here; MySQL or Redis would work as well.
    json.dump(cookie, fw)
import json
import time

import requests

# Seconds to wait for any single HTTP request before giving up.
REQUEST_TIMEOUT = 10

# Load the cookies saved to cookie.txt.
with open('cookie.txt', mode='r', encoding='utf-8') as fr:
    cookie = json.load(fr)

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36',
    'Referer': 'https://dig.chouti.com/'
}

res = requests.get(
    "https://dig.chouti.com/top/24hr?_=1596712494547",
    headers=headers,
    timeout=REQUEST_TIMEOUT,
)
# Fail with a clear HTTP error instead of a confusing JSON/KeyError later.
res.raise_for_status()

# Collect the id of every entry in the listing.
id_list = [entry['id'] for entry in res.json()['data']]

for link_id in id_list:
    # Up-vote the entry.
    ret = requests.post(
        'https://dig.chouti.com/link/vote',
        headers=headers,
        cookies=cookie,
        data={'linkId': link_id},
        timeout=REQUEST_TIMEOUT,
    )
    print(ret.text)

    # Post a comment on the same entry.
    ret = requests.post(
        "https://dig.chouti.com/comments/create",
        headers=headers,
        cookies=cookie,
        data={
            'content': '信春哥,得永生',
            'linkId': link_id,
            'parentId': 0
        },
        timeout=REQUEST_TIMEOUT,
    )
    # Pause between entries to avoid hammering the site.
    time.sleep(5)