Scrapy tutorial: scraping a movie ranking list and saving it to JSON, CSV, and MySQL
阿新 • Published 2021-12-08
1. Install the package
pip install scrapy
2. In a terminal, cd to your working directory and create the project with: scrapy startproject <project-name>
scrapy startproject maoyan
cd maoyan
scrapy genspider maoyan maoyan.com
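(Note that genspider takes a bare domain, not a full URL.) The generated maoyan.py is just a skeleton, roughly like this depending on your Scrapy version:

import scrapy

class MaoyanSpider(scrapy.Spider):
    name = 'maoyan'
    allowed_domains = ['maoyan.com']
    start_urls = ['https://maoyan.com/']

    def parse(self, response):
        pass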
The directory layout after creation looks roughly like this:
|-ProjectName          # project folder
  |-ProjectName        # project package
    |-items.py         # data structure definitions
    |-middlewares.py   # middlewares
    |-pipelines.py     # data processing
    |-settings.py      # global configuration
    |-spiders
      |-__init__.py
      |-maoyan.py      # spider file
  |-scrapy.cfg         # basic project configuration file
3. Configure settings.py as follows:
# Project name
BOT_NAME = 'maoyan'

SPIDER_MODULES = ['maoyan.spiders']
NEWSPIDER_MODULE = 'maoyan.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36'

# ROBOTSTXT_OBEY: whether to respect robots.txt; defaults to True.
# Set it to False, otherwise many pages cannot be crawled.
ROBOTSTXT_OBEY = False

# CONCURRENT_REQUESTS: maximum number of concurrent requests
#CONCURRENT_REQUESTS = 32

# Download delay in seconds; throttles how often the crawler fetches pages
DOWNLOAD_DELAY = 3

# DEFAULT_REQUEST_HEADERS: default request headers. The USER_AGENT above is
# also sent as a header; adjust these to suit the site you are crawling.
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# ITEM_PIPELINES: item pipelines; 300 is the priority (lower value = higher priority).
# This must point at this project's pipeline class:
ITEM_PIPELINES = {
    'maoyan.pipelines.MaoyanPipeline': 300,
}

# Export encoding; without this, JSON output is garbled
FEED_EXPORT_ENCODING = 'utf-8'
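As an alternative to the -o flags used in step 9 below, Scrapy 2.1+ can declare exports directly in settings.py via the FEEDS setting; a minimal sketch (the output paths are my own choice):

FEEDS = {
    'output/movies.json': {'format': 'json', 'encoding': 'utf8'},
    'output/movies.csv': {'format': 'csv'},
}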
The USER_AGENT string can be copied from your browser's developer tools (Network tab, request headers).
4. Define the item in items.py:
import scrapy

class MaoyanItem(scrapy.Item):
    move_name = scrapy.Field()    # movie title
    peaple_name = scrapy.Field()  # starring actors
    move_time = scrapy.Field()    # release date
    describe = scrapy.Field()     # description
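A scrapy.Item works like a dict restricted to the declared fields; a quick sketch (the values are made up):

item = MaoyanItem()
item['move_name'] = '霸王別姬'
item['peaple_name'] = '張國榮,張豐毅,鞏俐'
print(dict(item))       # {'move_name': '霸王別姬', 'peaple_name': '張國榮,張豐毅,鞏俐'}
# item['rating'] = 9.6  # KeyError: 'rating' is not a declared field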
5. Create a maoyan_spider.py file under spiders/
6. Write the spider in maoyan_spider.py:
import re

import scrapy
from maoyan.items import MaoyanItem


class MaoyanSpiderSpider(scrapy.Spider):
    name = 'maoyan_spider'            # spider name
    allowed_domains = ['maoyan.com']  # allowed domain
    # entry URL
    start_urls = ['https://www.maoyan.com/board/4?timeStamp=1638539026134&channelId=40011&index=1&signKey=a675982b76014e4a8b7f3beb5afe7441&sVersion=1&webdriver=false']

    def parse(self, response):
        # //*[@id="app"]/div/div/div[1]/dl/dd[3]/div
        move_list = response.xpath("//*[@id='app']/div/div/div[1]/dl//div/div/div[1]")
        for d in move_list:
            maoyan_item = MaoyanItem()  # item object holding the scraped fields
            # //*[@id="app"]/div/div/div[1]/dl/dd[3]/div/div/div[1]/p[1]/a  name
            # //*[@id="app"]/div/div/div[1]/dl/dd[3]/div/div/div[1]/p[3]   time
            # //*[@id="app"]/div/div/div[1]/dl/dd[3]/div/div/div[1]/p[2]   actors
            n_list = []
            p_list = []
            # movie title: split the <a> tag's HTML on double quotes;
            # the fourth piece is the title attribute's value
            aaa = d.xpath(".//p[1]/a").extract_first().split('"')
            for aa in aaa:
                n_list.append(aa)
            maoyan_item['move_name'] = n_list[3]
            # starring actors
            bbb = d.xpath(".//p[2]").extract_first().split('\n')
            for bb in bbb:
                p_list.append(bb)
            maoyan_item['peaple_name'] = p_list[1].replace('主演:', '').strip()
            # release date
            move_time1 = d.xpath(".//p[3]").extract()
            for t in move_time1:
                ccc = re.search(r"(\d{4}-\d{1,2}-\d{1,2})", t).group(0)
                maoyan_item['move_time'] = ccc
            print(maoyan_item)
            yield maoyan_item  # hand the item over to the scheduler/pipelines
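For what it's worth, the quote-splitting above is brittle (it breaks if the attribute order in the HTML changes). A more robust sketch that selects attributes and text directly with XPath; this is not the original author's code, but it fills the same fields:

# inside the for-loop over move_list, as a drop-in replacement:
maoyan_item['move_name'] = d.xpath(".//p[1]/a/@title").extract_first()
star_text = d.xpath("normalize-space(.//p[2])").extract_first() or ''
maoyan_item['peaple_name'] = star_text.replace('主演:', '').strip()
time_text = d.xpath("normalize-space(.//p[3])").extract_first() or ''
m = re.search(r"\d{4}-\d{1,2}-\d{1,2}", time_text)
maoyan_item['move_time'] = m.group(0) if m else ''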
7. Manually create a main.py to launch the crawl (you can also run the command directly):
from scrapy import cmdline
cmdline.execute('scrapy crawl maoyan_spider'.split())
8. Run main.py.
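An alternative to cmdline.execute is running the crawl in-process with CrawlerProcess; a minimal sketch, assuming the spider class from step 6:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from maoyan.spiders.maoyan_spider import MaoyanSpiderSpider

process = CrawlerProcess(get_project_settings())
process.crawl(MaoyanSpiderSpider)
process.start()  # blocks until the crawl finishes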
9. Storage options: JSON, CSV, MySQL
1) Save to JSON (mind the output path):
scrapy crawl maoyan_spider -o test.json
2) Save to CSV (mind the output path):
scrapy crawl maoyan_spider -o test.csv
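Note that -o appends to an existing file; recent Scrapy releases (2.x, if I recall the version correctly) also accept -O to overwrite instead:

scrapy crawl maoyan_spider -O test.json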
3) Save to a MySQL database
#settings.py
# mongo_host = '192.168.x.xxx'
# mongo_port = 27017
# mongo_db_name = 'maoyan'
# mongo_db_collection = 'maoyan_movie'
MYSQL_HOST = 'localhost'
MYSQL_DBNAME = 'maoyan_sql'
MYSQL_USER = 'root'
MYSQL_PASSWD = '1234'
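The pipeline below assumes the maoyan_sql database and a move table already exist. A one-off sketch to create them with pymysql; the column types are my own guess, only the names come from the pipeline's INSERT:

import pymysql

conn = pymysql.connect(host='localhost', user='root', passwd='1234', charset='utf8mb4')
cur = conn.cursor()
cur.execute("CREATE DATABASE IF NOT EXISTS maoyan_sql DEFAULT CHARACTER SET utf8mb4")
cur.execute("""
    CREATE TABLE IF NOT EXISTS maoyan_sql.move (
        id INT AUTO_INCREMENT PRIMARY KEY,
        move_name VARCHAR(255),      -- movie title
        peaple_name VARCHAR(255),    -- starring actors
        move_time VARCHAR(32)        -- release date string
    )
""")
conn.commit()
conn.close()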
#pipelines.py
import pymysql

from maoyan import settings


def dbHandle():
    # helper used by the commented-out alternative pipeline below
    conn = pymysql.connect(
        host="localhost",
        user="root",
        passwd="1234",
        charset="utf8mb4",
        use_unicode=True)
    return conn


class MaoyanPipeline:
    def __init__(self):
        # connect to the database
        self.connect = pymysql.connect(
            host=settings.MYSQL_HOST,
            db=settings.MYSQL_DBNAME,
            user=settings.MYSQL_USER,
            passwd=settings.MYSQL_PASSWD,
            charset='utf8',
            use_unicode=True)
        # inserts, deletes, queries, and updates all go through this cursor
        self.cursor = self.connect.cursor()

    def process_item(self, item, spider):
        try:
            # insert a row
            self.cursor.execute(
                """insert into move(move_name, peaple_name, move_time)
                   values (%s, %s, %s)""",
                (item['move_name'], item['peaple_name'], item['move_time']))
            # commit the SQL statement
            self.connect.commit()
        except BaseException as e:
            # log the error
            print("error:------------", e, "-----------------")
        return item

# MongoDB version, for reference:
# def __init__(self):
#     host = mongo_host
#     port = mongo_port
#     dbname = mongo_db_name
#     sheetname = mongo_db_collection
#     client = pymongo.MongoClient(host=host, port=port)
#     mydb = client[dbname]
#     self.post = mydb[sheetname]  # handle used for reads/writes
# def process_item(self, item, spider):
#     data = dict(item)  # convert to a dict first, then insert
#     self.post.insert(data)
#     return item
#
# Alternative pipeline using dbHandle():
# class HellospiderPipeline(object):
#     def process_item(self, item, spider):
#         dbObject = dbHandle()
#         cursor = dbObject.cursor()
#         cursor.execute("USE maoyan_sql")
#         # insert into the database
#         sql = "INSERT INTO move(move_name, peaple_name, move_time) VALUES(%s, %s, %s)"
#         try:
#             cursor.execute(sql, (item['move_name'], item['peaple_name'], item['move_time']))
#             cursor.connection.commit()
#         except BaseException as e:
#             print("error here >>>>>>>>>>>>>", e, "<<<<<<<<<<<<<")
#             dbObject.rollback()
#         return item
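One small refinement worth adding (not in the original post): release the connection when the spider shuts down, via the pipeline's close_spider hook:

# add to MaoyanPipeline in pipelines.py
def close_spider(self, spider):
    self.cursor.close()
    self.connect.close()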
The inserted rows can then be inspected in the database.
Appendix (the following is for reference only):
#middlewares.py
# IP proxy middleware
import base64


class my_proxy(object):
    def process_request(self, request, spider):
        request.meta['proxy'] = 'http-xxx.com:port'  # proxy host and port
        proxy_name_pass = b'username:password'       # proxy credentials, as bytes
        encode_pass_name = base64.b64encode(proxy_name_pass)  # base64-encode
        request.headers['Proxy-Authorization'] = 'Basic ' + encode_pass_name.decode()
# after defining a middleware, you must enable it in settings.py
#settings.py
DOWNLOADER_MIDDLEWARES = {
    # 'maoyan.middlewares.MaoyanDownloaderMiddleware': 543,
    'maoyan.middlewares.my_proxy': 543
}
#middlewares.py
# random User-Agent middleware
import random


class my_useragent(object):
    def process_request(self, request, spider):
        USER_AGENT_LIST = [...]  # fill in with UA strings; lists are easy to find online
        agent = random.choice(USER_AGENT_LIST)
        request.headers['User-Agent'] = agent
#settings.py (the priorities must not be the same)
DOWNLOADER_MIDDLEWARES = {
    'maoyan.middlewares.my_proxy': 543,
    'maoyan.middlewares.my_useragent': 544,
}
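To check that both middlewares actually take effect, a throwaway spider pointed at httpbin.org (a public request-echo service) will log the headers the server received; a quick sketch:

import scrapy

class HeaderCheckSpider(scrapy.Spider):
    name = 'header_check'
    start_urls = ['https://httpbin.org/headers']

    def parse(self, response):
        # httpbin echoes the request headers back as JSON
        self.logger.info(response.text)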