記錄一下 xpath 提取不到多層巢狀 iframe 內容的問題
阿新 • • 發佈:2018-12-20
今天爬取中彩網福彩3D(http://www.zhcw.com/3d/)的時候,碰到 iframe 巢狀,xpath 始終取不到值(如下圖)。無論怎麼取值,結果都為 null。後來發現 iframe 裡有一個連結(應該就是 iframe 實際載入的頁面位址,也就是下面 spider 中 start_urls 所用的 url),直接進入這個 url,就可以取到值了。好了,問題解決。查閱網上資料,聽說也可以正面攻克(直接在原頁面處理巢狀的 iframe),但比較麻煩,不推薦花時間去做。最後附上本人程式碼:爬蟲框架用的是 scrapy,儲存用的是 MySQL 資料庫。
items
import scrapy
class Lottery3DItem(scrapy.Item):
    """Container for one scraped 3D lottery draw.

    Declares five fields: the draw date, the issue number and the three
    drawn ball numbers.
    """

    date = scrapy.Field()   # draw date
    issue = scrapy.Field()  # issue number
    blue1 = scrapy.Field()  # first blue ball number
    blue2 = scrapy.Field()  # second blue ball number
    blue3 = scrapy.Field()  # third blue ball number
spider
# -*- coding: utf-8 -*-
import scrapy
from ..items import Lottery3DItem
class LotterySpider(scrapy.Spider):
    """Spider that walks the paginated 3D lottery result lists.

    Starts at ``list_1.html`` and, after parsing each page, requests
    ``list_<index>.html`` with an incremented index. Crawling stops at the
    first page that yields no data rows.
    """

    name = 'lottery'
    allowed_domains = ['zhcw.com']
    start_urls = ['http://kaijiang.zhcw.com/zhcw/html/3d/list_1.html']
    # Number of the list page requested most recently; incremented in parse().
    index = 1
    # Not referenced by this class; kept so the class attributes are unchanged.
    items = []

    def parse(self, response):
        """Yield one ``Lottery3DItem`` per data row, then request the next page.

        Args:
            response: the downloaded list page.

        Yields:
            ``Lottery3DItem`` objects for each data row, followed by a
            ``scrapy.Request`` for the next list page when the current page
            contained at least one data row.
        """
        rows = response.xpath("//tr")
        # Skip the first two rows and the last row (presumably the table
        # header rows and the pager row -- confirm against the page markup).
        # Slicing, unlike three unconditional pop() calls, does not raise
        # IndexError when the page has fewer than three rows.
        data_rows = rows[2:-1]
        if not data_rows:
            # Nothing to extract: treat this as the end of the listing rather
            # than requesting ever-higher page numbers.
            return

        for node in data_rows:
            item = Lottery3DItem()
            item["date"] = node.xpath("./td[1]/text()").extract_first()
            item["issue"] = node.xpath("./td[2]/text()").extract_first()
            item["blue1"] = node.xpath("./td[3]/em[1]/text()").extract_first()
            item["blue2"] = node.xpath("./td[3]/em[2]/text()").extract_first()
            item["blue3"] = node.xpath("./td[3]/em[3]/text()").extract_first()
            yield item

        self.index += 1
        next_url = "http://kaijiang.zhcw.com/zhcw/html/3d/list_{}.html".format(self.index)
        yield scrapy.Request(url=next_url, callback=self.parse)
pipeline
import pymysql
class Lottery3DPipeline(object):
    """Item pipeline that inserts each scraped draw into the ``lottery_3d`` table."""

    # NOTE(review): the default credentials below are the values that were
    # hard-coded in this file. They should be moved out of source control
    # (e.g. project settings or environment variables); the keyword
    # parameters exist so callers can already override them.
    def __init__(self, host='103.27.5.156', user='developer',
                 passwd='Developer!123', db='spider', charset='utf8'):
        """Open the MySQL connection and create the cursor used for inserts.

        Args:
            host: MySQL server address.
            user: MySQL user name.
            passwd: MySQL password.
            db: database name.
            charset: connection character set.
        """
        self.conn = pymysql.connect(host=host, user=user, passwd=passwd, db=db, charset=charset)
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        """Insert one item as a row and commit.

        Args:
            item: mapping with the keys ``date``, ``issue``, ``blue1``,
                ``blue2`` and ``blue3``.
            spider: the spider that produced the item (unused).

        Returns:
            The same ``item``, unchanged.

        Raises:
            pymysql.MySQLError: if the insert or commit fails; the
                transaction is rolled back before re-raising.
        """
        sql = "insert into lottery_3d(date, issue, blue1, blue2, blue3) VALUES(%s, %s, %s, %s, %s)"
        params = (item['date'], item['issue'], item['blue1'], item['blue2'], item['blue3'])
        try:
            self.cursor.execute(sql, params)
            self.conn.commit()
        except pymysql.MySQLError:
            # Roll back so the failed statement does not leave the shared
            # connection inside a broken transaction for later items.
            self.conn.rollback()
            raise
        return item

    def close_spider(self, spider):
        """Release the cursor and close the connection when the spider finishes."""
        try:
            self.cursor.close()
        finally:
            # Close the connection even if closing the cursor fails.
            self.conn.close()