1. 程式人生 > Scrapy爬蟲-pipeline.py

Scrapy爬蟲-pipeline.py

req lac ica sts pre __init__ filename self pipe

一.文件保存

1.分類保存

def process_item(self, item, spider):
    """Save one crawled article into a per-category text file.

    Creates novel1/<category>/ on demand and appends the article
    title and body to novel1/<category>/<article_name>.txt.
    Returns the item unchanged so later pipelines still run.
    """
    category_dir = os.path.join("novel1", item['category'])
    # makedirs(exist_ok=True) also creates the parent "novel1" directory,
    # which the original bare os.mkdir would fail on, and is race-safe.
    os.makedirs(category_dir, exist_ok=True)
    fname = os.path.join(category_dir, item['article_name'] + '.txt')
    # "with" guarantees the handle is closed even if a write raises;
    # the original kept an open handle on self and leaked it.
    with codecs.open(fname, 'a', 'utf-8') as f:
        f.write(item['content_name'] + '\n')
        f.write(item['content'] + '\n')
    return item

2.直接保存

 1     def __init__(self):
 2         self.filename=codecs.open(face.json,wb+,utf-8)
 3 
 4     def process_item(self, item, spider):
 5         line = json.dumps(dict(item), ensure_ascii=False,sort_keys=True, indent=4) + "
,\n" 6 self.filename.write(line) 7 return item 8 9 def spider_closed(self, spider): 10 self.filename.close()

3.圖片下載(settings.py文件中設置保存路徑 IMAGES_STORE=os.path.join(os.path.dirname(os.path.dirname(__file__)),'images') )

 1 class Img699PicPipeline(object):
 2     def
process_item(self, item, spider): 3 return item 4 5 6 class Images699Pipeline(ImagesPipeline): 7 def get_media_requests(self, item, info): 8 # 這個方法是在發送下載請求之前調用的,其實這個方法本身就是去發送下載請求的 9 request_objs=super(Images699Pipeline, self).get_media_requests(item,info) 10 for request_obj in request_objs: 11 request_obj.item=item 12 return request_objs 13 14 def file_path(self, request, response=None, info=None): 15 # 這個方法是在圖片將要被存儲的時候調用,來獲取這個圖片存儲的路徑 16 path=super(Images699Pipeline, self).file_path(request,response,info) 17 category=request.item.get(category) 18 image_store=settings.IMAGES_STORE 19 category_path=os.path.join(image_store,category) 20 if not os.path.exists(category_path): 21 os.makedirs(category_path) 22 image_name=path.replace("full/","") 23 image_path=os.path.join(category_path,image_name) 24 return image_path

二、內容去重

 1 class DuplicatesPipeline(object):
 2     def __init__(self):
 3         self.face_set = set()
 4 
 5     def process_item(self, item, spider):
 6         for materail in item[materials]:
 7             id=materail[id]
 8             if id in self.face_set:
 9                 raise DropItem("Duplicate book found:%s" % item)
10             self.face_set.add(id)
11         return item

Scrapy爬蟲-pipeline.py