
Crawling all thread posts on the MT forum with Scrapy, because the forum's own search barely works. Crawl everything to local files and search there.

Create mt.py under spiders and add:

import scrapy


class itemSpider(scrapy.Spider):
    name = 'mt'
    start_urls = ['https://bbs.binmt.cc/forum.php']

    def parse(self, response):
        # Board names and board links from the forum index
        urls1 = response.xpath('//*[@class="comiis_fl_g"]/dl/dt/a//text()').extract()
        urls2 = response.xpath('//*[@class="comiis_fl_g"]/dl/dt/a//@href').extract()
        for i in urls1:
            # Create one .txt file per board (raw string avoids backslash escapes in the Windows path)
            with open(r'D:\Study\pythonProject\scrapy\paqu_mt_luntan\mt_luntan\mt\{}.txt'.format(i), 'a', encoding='utf-8') as f:
                f.write(i)
        for i in urls2:
            yield scrapy.Request(i, callback=self.fenlei)

    def fenlei(self, response):
        # Board name, thread titles, and thread links on the current listing page
        zhuban = response.xpath('//*[@class="comiis_infotit cl"]/h1/a//text()').extract_first()
        zhuti = response.xpath('//*[@class="comiis_postlist cl"]/h2/span/a[2]//text()').extract()
        dizhi = response.xpath('//*[@class="comiis_postlist cl"]/h2/span/a[2]//@href').extract()
        # Append one "title,link" line per thread to the board's file
        with open(r'D:\Study\pythonProject\scrapy\paqu_mt_luntan\mt_luntan\mt\{}.txt'.format(zhuban), 'a', encoding='utf-8') as f:
            for i in range(len(zhuti)):
                f.write(zhuti[i] + ',' + dizhi[i])
                f.write('\n')
        # The pager's "next page" link has no fixed position, so scan the <a> tags
        # until one whose text is 下一頁 (next page) turns up, then follow it
        for i in range(10, 100):
            next_page1 = response.xpath('//*[@id="fd_page_bottom"]/div/a[{}]//@href'.format(i)).extract_first()
            next_page1_biaoti = response.xpath('//*[@id="fd_page_bottom"]/div/a[{}]//text()'.format(i)).extract_first()
            if next_page1_biaoti == '下一頁':
                next_page1 = response.urljoin(next_page1)
                yield scrapy.Request(next_page1, callback=self.fenlei)
                break
        else:
            # No next-page link found in any pager slot: last page reached
            print('Done!')
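With the spider in place, run it from the project root with scrapy crawl mt. It can also be driven from a plain Python script; a minimal sketch using Scrapy's CrawlerProcess (the file name run.py is just an assumption):

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# run.py (hypothetical name) -- same effect as `scrapy crawl mt` on the command line
process = CrawlerProcess(get_project_settings())  # loads settings.py
process.crawl('mt')  # spider name defined in mt.py
process.start()      # blocks until the crawl finishes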

Then, in middlewares.py, add a random User-Agent middleware:

from fake_useragent import UserAgent
class NovelUserAgentMiddleWare(object):
    # Random User-Agent downloader middleware
    def __init__(self):
        self.ua = UserAgent(verify_ssl=False)

    def process_request(self, request, spider):
        # Pick a fresh random User-Agent for each request. Assign directly
        # rather than setdefault: Scrapy's built-in UserAgentMiddleware
        # (priority 500) has already set the header by the time this runs at 544
        ua = self.ua.random
        print('User-Agent: ' + ua)
        request.headers['User-Agent'] = ua

Finally, add the following to settings.py:

ROBOTSTXT_OBEY = False
DOWNLOAD_DELAY = 0.25

CONCURRENT_REQUESTS = 100

CONCURRENT_REQUESTS_PER_DOMAIN = 100

CONCURRENT_REQUESTS_PER_IP = 100

COOKIES_ENABLED = False

DOWNLOADER_MIDDLEWARES = {

   'mt_luntan.middlewares.NovelUserAgentMiddleWare': 544,  # random User-Agent
   #'ImagesRename.middlewares.NovelProxyMiddleWare': 543,  # random proxy IP; replace ImagesRename with your own project name
}
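The commented-out entry refers to a random-proxy middleware. A minimal sketch of what it might look like, assuming a hypothetical PROXIES list (Scrapy sends a request through whatever request.meta['proxy'] is set to):

import random

PROXIES = ['http://127.0.0.1:8888', 'http://127.0.0.1:8889']  # hypothetical placeholder proxies

class NovelProxyMiddleWare(object):
    def process_request(self, request, spider):
        # Route each request through a randomly chosen proxy
        request.meta['proxy'] = random.choice(PROXIES)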

Keep the download delay in place; crawling too fast tends to produce garbled text in the scraped pages.
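If a fixed delay feels too blunt, Scrapy's AutoThrottle extension can adjust the delay dynamically from observed response times; a minimal sketch for settings.py (the values are assumptions to tune):

AUTOTHROTTLE_ENABLED = True
AUTOTHROTTLE_START_DELAY = 0.25  # initial delay, matching DOWNLOAD_DELAY above
AUTOTHROTTLE_MAX_DELAY = 10      # ceiling if the server slows down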

Finally, a look at the result:

Search the local files for the post you want, then copy the address next to the title to jump straight to it.
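Since every board ends up as a plain "title,url" text file, a few lines of Python are enough to search them all; a minimal sketch (the keyword and base path are examples, adjust to your own setup):

import glob
import os

base = r'D:\Study\pythonProject\scrapy\paqu_mt_luntan\mt_luntan\mt'
keyword = '刷機'  # hypothetical search term

for path in glob.glob(os.path.join(base, '*.txt')):
    with open(path, encoding='utf-8') as f:
        for line in f:
            if keyword in line and ',' in line:
                # Titles may contain commas, so split on the last one
                title, url = line.rstrip('\n').rsplit(',', 1)
                print(os.path.basename(path), title, url)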