Scrapy 爬取MT論壇 所有主題帖,原因論壇搜尋功能很不好使。爬到本地搜尋。
阿新 • • 發佈:2020-11-30
在spiders下建立mt.py 寫入:
import re

import scrapy


class itemSpider(scrapy.Spider):
    """Crawl every board of the MT forum (bbs.binmt.cc) and append each
    board's thread titles and URLs to one local text file per board, so
    threads can be searched offline (the forum's own search is unreliable).
    """

    name = 'mt'
    start_urls = ['https://bbs.binmt.cc/forum.php']

    # Directory where the per-board .txt files are written.
    # NOTE(review): hard-coded absolute Windows path — adjust per machine.
    save_dir = r'D:\Study\pythonProject\scrapy\paqu_mt_luntan\mt_luntan\mt'

    @staticmethod
    def _safe_name(title):
        # Board/thread titles may contain characters that are illegal in
        # Windows file names (\ / : * ? " < > |); replace them with '_'
        # so open() does not raise OSError.
        return re.sub(r'[\\/:*?"<>|]', '_', title)

    def parse(self, response):
        """Forum index: collect every board (name + link) and follow each.

        Creates/updates one <board>.txt file per board, then schedules a
        request per board handled by :meth:`fenlei`.
        """
        names = response.xpath('//*[@class="comiis_fl_g"]/dl/dt/a//text()').extract()
        links = response.xpath('//*[@class="comiis_fl_g"]/dl/dt/a//@href').extract()
        for board in names:
            path = '{}\\{}.txt'.format(self.save_dir, self._safe_name(board))
            with open(path, 'a', encoding='utf-8') as f:
                # Trailing newline keeps repeated runs from gluing the
                # board name to itself (append mode).
                f.write(board + '\n')
        for url in links:
            yield scrapy.Request(url, callback=self.fenlei)

    def fenlei(self, response):
        """Board page: append 'title,url' lines, then follow the next page."""
        zhuban = response.xpath('//*[@class="comiis_infotit cl"]/h1/a//text()').extract_first()
        titles = response.xpath('//*[@class="comiis_postlist cl"]/h2/span/a[2]//text()').extract()
        addrs = response.xpath('//*[@class="comiis_postlist cl"]/h2/span/a[2]//@href').extract()
        # str() keeps the original behaviour of formatting a missing board
        # name as 'None' rather than crashing.
        path = '{}\\{}.txt'.format(self.save_dir, self._safe_name(str(zhuban)))
        with open(path, 'a', encoding='utf-8') as f:
            # zip() pairs each title with its address and stays safe even
            # if the two extracted lists differ in length.
            for title, addr in zip(titles, addrs):
                f.write(title + ',' + addr + '\n')
        # The pager's '下一頁' (next page) anchor has no stable index in the
        # pagination bar, so probe candidate positions until it is found.
        for idx in range(10, 100):
            href = response.xpath(
                '//*[@id="fd_page_bottom"]/div/a[{}]//@href'.format(idx)).extract_first()
            label = response.xpath(
                '//*[@id="fd_page_bottom"]/div/a[{}]//text()'.format(idx)).extract_first()
            if label == '下一頁':
                yield scrapy.Request(response.urljoin(href), callback=self.fenlei)
                break
        else:
            # for/else: report the end exactly once when no next page
            # exists (the original printed this for every non-matching
            # anchor, up to ~90 times per page).
            print('結束!')
然後在 middlewares.py 寫入隨機請求頭:
from fake_useragent import UserAgent


class NovelUserAgentMiddleWare(object):
    """Downloader middleware that attaches a random User-Agent to each request."""

    def __init__(self):
        # Keep the UserAgent factory and read `.random` per request.
        # BUG FIX: the original stored `UserAgent(...).random` once here,
        # which froze a single UA string for the whole crawl.
        self.ua_factory = UserAgent(verify_ssl=False)

    def process_request(self, request, spider):
        """Overwrite the request's User-Agent with a fresh random value."""
        ua = self.ua_factory.random
        print('User-Agent:' + ua)
        # BUG FIX: must assign, not setdefault() — Scrapy's built-in
        # UserAgentMiddleware (priority 400) has already set the header by
        # the time this middleware (priority 544) runs, so setdefault()
        # was a silent no-op.
        request.headers['User-Agent'] = ua
然後最後在settings中寫入:
# Scrapy project settings for the MT-forum crawler.

# Do not honour robots.txt.
ROBOTSTXT_OBEY = False

# Small delay between requests; fetching too fast tends to return
# garbled (mis-encoded) text from this forum.
DOWNLOAD_DELAY = 0.25

# Aggressive concurrency limits for a fast crawl.
CONCURRENT_REQUESTS = 100
CONCURRENT_REQUESTS_PER_DOMAIN = 100
CONCURRENT_REQUESTS_PER_IP = 100

# No session state is needed for scraping public pages.
COOKIES_ENABLED = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

DOWNLOADER_MIDDLEWARES = {
    # Random User-Agent per request.
    'mt_luntan.middlewares.NovelUserAgentMiddleWare': 544,
    # Random proxy IP — replace 'ImagesRename' with your own project name.
    #'ImagesRename.middlewares.NovelProxyMiddleWare': 543,
}
加入延遲,太快容易出現抓取到的文字亂碼。
最後看下結果:
自己搜尋想看的帖子,在後面複製地址就可以直達看了。