爬取愛問知識人,問題及回答
主要原始碼:
aiwen_spider.py:
import scrapy
from aiwen.items import AiwenItem
class aiwenSpider(scrapy.Spider):
    """Crawl iask.sina.com.cn "good answer" listings and scrape Q&A pairs.

    ``parse`` walks the listing pages (following pagination back into
    itself) and dispatches every question-detail link to ``content``,
    which yields one AiwenItem per answer found on the page.
    """

    name = "aiwen"
    # Must be a list of bare domain names — a string with slashes/scheme
    # makes the offsite middleware drop every request.
    allowed_domains = ["iask.sina.com.cn"]
    start_urls = [
        "https://iask.sina.com.cn/c/80-goodAnswer-1-new.html",
    ]

    # Single shared UA string; the site rejects the default Scrapy agent.
    user_agent = (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/17.17134"
    )

    def parse(self, response):
        """Extract question links on a listing page and follow pagination."""
        headers = {"User-Agent": self.user_agent}
        # Question detail links are site-relative; prefix the host.
        links = response.xpath(
            '//div[@class="list-body-con current"]/ul/li/div/'
            'div[@class="question-title"]/a/@href'
        ).extract()
        for href in links:
            url = "https://iask.sina.com.cn" + href
            yield scrapy.Request(
                url, callback=self.content, dont_filter=True, headers=headers
            )
        # Pagination links feed back into parse to crawl every page.
        page_links = response.xpath('//div[@class="page mt30"]/a/@href').extract()
        for href in page_links:
            url = "https://iask.sina.com.cn" + href
            yield scrapy.Request(
                url, callback=self.parse, dont_filter=True, headers=headers
            )

    def content(self, response):
        """Parse one question-detail page into AiwenItem objects.

        Yields one item per answer; the original loop overwrote
        item['answer'] on each iteration and yielded only the last one.
        """
        question = response.xpath('//p[@class="title-text"]/text()').extract_first()
        answers = response.xpath(
            '//div[@class="new-answer-text new-answer-cut new-pre-answer-text"]'
            "/pre/text()"
        ).extract()
        for answer in answers:
            item = AiwenItem()
            item["question"] = question
            item["answer"] = answer
            yield item
# -*- coding: utf-8 -*-
# Define here the models for your scraped items.
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class AiwenItem(scrapy.Item):
    """Container for one scraped question/answer pair."""

    question = scrapy.Field()  # question title text
    answer = scrapy.Field()    # one answer's body text
main.py:
# coding=utf-8
from scrapy import cmdline

if __name__ == "__main__":
    # Equivalent to typing "scrapy crawl aiwen" on the command line.
    cmdline.execute("scrapy crawl aiwen".split())
pipeline.py:
# -*- coding: utf-8 -*-
# Define your item pipelines here.
# Don't forget to add your pipeline to the ITEM_PIPELINES setting.
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import pymysql
class AiwenPipeline(object):
    """Persist each scraped answer into a local MySQL table."""

    def __init__(self):
        # Fixed: the original defined `init` (missing dunder underscores),
        # so Scrapy never ran this and self.conn was never created.
        self.conn = pymysql.connect(
            host="localhost",
            user="root",
            password="123",
            db="test",
            charset="utf8",
        )
        # Recreate the table from scratch at the start of every crawl.
        with self.conn.cursor() as cursor:
            cursor.execute("DROP TABLE IF EXISTS aiwen")
            cursor.execute("""CREATE TABLE aiwen(aiwen text(1000) )""")
        self.conn.commit()

    def process_item(self, item, spider):
        """Insert one answer row; returns the item for downstream pipelines."""
        # Parameterized query instead of %-formatting + manual escaping:
        # the driver handles quoting, which also prevents SQL injection
        # from scraped (untrusted) text.
        with self.conn.cursor() as cursor:
            cursor.execute(
                "INSERT INTO aiwen(aiwen) VALUES (%s);", (item["answer"],)
            )
        self.conn.commit()
        return item
學習總結:
1.在這個小任務中,鞏固了scrapy框架的使用,同時也掌握了xpath的使用