1. 程式人生 > python爬蟲:編寫多程序爬蟲學習筆記

python爬蟲:編寫多程序爬蟲學習筆記

# -*- coding: utf-8 -*-
"""
Created on Sat Oct 22 21:01:23 2016
@author: hhxsym
"""
import requests
import json
import os
import pymongo
import time
from bs4 import BeautifulSoup
from multiprocessing import Pool  #程序呼叫的包
inpath = "C:\\Users\\hhxsym\\Desktop\\課程群Python爬蟲"
# On Python 2 a plain literal is a byte string; without decoding it the
# Chinese path cannot be opened. On Python 3 the literal is already text.
if isinstance(inpath, bytes):
    inpath = inpath.decode("utf8")
os.chdir(inpath)

# Connect to MongoDB.
# NOTE(review): the client is created at import time. With the "spawn" start
# method (Windows) every worker re-imports this module and gets its own
# client; PyMongo documents that clients should not be shared across fork(),
# so confirm before running this on a platform that forks.
client = pymongo.MongoClient('localhost', 27017)
sense = client['sense']           # database
url_list = sense['url_list']      # collection inside that database


def get_city_urls():
    """Return the city links found in the locally saved ``city.html``.

    The page is read from the current working directory rather than fetched
    over the network.

    Returns:
        A list with the ``href`` attribute of every anchor matched by the
        destination-navigation CSS selector (``None`` for anchors without
        an ``href``).
    """
    # Read raw bytes and let BeautifulSoup detect the document encoding
    # instead of relying on the platform's default text encoding.
    with open('city.html', 'rb') as f:
        response = f.read()
    soup = BeautifulSoup(response, 'lxml')
    # CSS structural selector -- the spaces around '>' matter.
    anchors = soup.select(
        '#destination_nav > div > div > div > dl.dl-list > dt > a')
    return [anchor.get("href") for anchor in anchors]


def _strip_jsonp(text):
    """Return the JSON text wrapped in a ``callback(<json>)`` JSONP body.

    The outermost parentheses are located instead of slicing fixed offsets,
    so a trailing ``;``/newline or a different callback name is tolerated.

    Raises:
        ValueError: if ``text`` contains no parenthesised payload.
    """
    start = text.index('(') + 1
    end = text.rindex(')')
    return text[start:end]


def get_page_list(city, page=1):
    """Fetch one listing page for a city and store each entry in MongoDB.

    Args:
        city: City URL (or identifier); only its last path segment is used
            to build the listing URL.
        page: Page number sent as the ``page`` query parameter.

    Every element of ``val.data`` in the JSONP response is inserted into
    the ``url_list`` collection as one document and printed.
    """
    now = time.strftime('%Y-%m-%d %H:%M:%S')
    # rstrip('/') keeps a trailing slash from producing an empty city id.
    city_id = city.rstrip('/').split('/')[-1]
    url = 'http://www.senseluxury.com/destinations_list/%s' % city_id
    payload = {'page': page, 'callback': 'jsonp'}
    # requests.get(url, params): the dict is encoded into the query string.
    responses = requests.get(url, params=payload)
    print(responses.status_code)

    # (1) the body is JSONP text; (2) json.loads turns it into a dict.
    wb_data = json.loads(_strip_jsonp(responses.text))
    # Show both types to compare (1) and (2).
    print("%s %s" % (type(responses.text), type(wb_data)))

    for item in wb_data['val']['data']:
        data = {
            'title': item['title'],
            # Relative link joined with the site root; the original computed
            # this value but never stored it.
            'url': 'http://www.senseluxury.com' + item['url'],
            'id': item['id'],
            # NOTE(review): as written this replaces a space with a space;
            # the first argument was probably '&nbsp;' before the page was
            # HTML-rendered -- confirm against the original script.
            'server': item['server'].replace(' ', ' ').split(),
            'memo': item['memo'],
            'price': item['price'],   # key was misspelled 'prie'
            'address': item['address'],
            'subject': item['subject'],
            'create_time': now,
        }
        # insert_one adds the auto-generated '_id' to the dict in place.
        url_list.insert_one(data)
        print(data)


if __name__ == '__main__':
    city_urls = get_city_urls()
    print(city_urls)
    pool = Pool(processes=4)    # number of worker processes
    try:
        # pool.map(function, iterable): one call per city URL.
        pool.map(get_page_list, city_urls)
    finally:
        pool.close()   # no further tasks will be submitted
        pool.join()    # wait for the workers to exit before the parent does

# How to spot a JSON endpoint: browser -> right click "Inspect" -> Network
# -> XHR -> trigger the page (e.g. go to another page) -> select the request
# under Name -> Response -> check whether a JSON-formatted string appears.
# http://jsoneditoronline.org/ is an online formatter for viewing nested JSON.