爬取汽車之家北京二手車資訊
阿新 • • 發佈:2018-12-07
爬取汽車之家北京二手車資訊
經測試,該網站:https://www.che168.com/beijing/list/ 反爬機制較低,僅需要偽造請求頭並設定爬取速率,但是100頁之後需要登入,登入之後再爬要慎重,一不小心就會永久封號。爬取的資料以各種型別存放,下面展示儲存到mysql資料庫中:
程式碼解析:
程式原始碼自提Github:https://github.com/H-Ang/carsSpider
爬蟲主程式
# Autohome (che168.com) crawler for Beijing second-hand car listings.
import requests
from lxml import etree
from data_save import *
import time


class Car_second():
    """Plain record holding one second-hand car listing."""
    name = ''          # listing title
    gonglishu = ''     # mileage text (公里數)
    brought_year = ''  # year of first registration
    location = ''      # city/district of the listing
    img_url = ''       # thumbnail image URL
    price = ''         # price text, e.g. "12.8" + unit suffix


def getInfors(url, i):
    """Fetch one listing page and return its cars as a list of Car_second.

    url -- full page URL to fetch
    i   -- 1-based page number, used only for progress logging
    """
    print("Page %d is saving." % i)
    # Forged browser headers: the site rejects the default requests UA.
    headers = {
        "Cache-Control": "no-cache",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36",
        "Referer": "https://www.che168.com/beijing/list/",
    }
    # timeout added so a stalled connection cannot hang the whole crawl;
    # raise_for_status fails loudly on 4xx/5xx instead of parsing an error page.
    response = requests.get(url=url, headers=headers, timeout=10)
    response.raise_for_status()
    ob_xml = etree.HTML(response.text)
    # Listing anchors; advert tiles carry class "adv-img" and are excluded.
    infos = ob_xml.xpath('//*[@id="viewlist_ul"]//li[not(contains(@class,"adv-img"))]/a')
    secondCars = []
    for info in infos:
        # Lazy-loaded images keep the real URL in @src2; fall back to @src.
        # (Evaluate the xpath once instead of twice as before.)
        src2 = info.xpath('.//img/@src2')
        img = src2[0] if src2 else info.xpath('.//img/@src')[0]
        name = info.xpath('.//h4/text()')[0]
        price = info.xpath('.//span[@class="price"]/text()')[0] + info.xpath('.//em/text()')[0]
        # Detail line is "mileage/year/location" — split into the 3 fields.
        myl = info.xpath('.//p/text()')[0].split('/')
        secondCar = Car_second()
        secondCar.name = name
        secondCar.img_url = img
        secondCar.gonglishu = myl[0]
        secondCar.brought_year = myl[1]
        secondCar.location = myl[2]
        secondCar.price = price
        secondCars.append(secondCar)
    return secondCars


if __name__ == '__main__':
    url = 'https://www.che168.com/beijing/a0_0msdgscncgpi1ltocsp{}exx0/'
    # Pages beyond 100 require login; crawling while logged in risks a ban.
    for i in range(1, 101):
        car_infors = getInfors(url.format(i), i)
        time.sleep(0.95)  # throttle to stay under the site's rate limit
        # savdFile(car_infors)
        saveMysql(car_infors)
儲存資料
def savdFile(datas):
    """Append every car record in *datas* to a UTF-8 text file.

    Each record is written as "name/mileage/year/location\nprice圖片地址:url"
    followed by a blank line. (Name keeps the original "savd" typo because
    callers already use it.)
    """
    # NOTE(review): hardcoded Windows path — presumably the author's data
    # drive; make this configurable for real use.
    with open('J:\DATAs\北京市二手車(汽車之家)\data.txt', 'a+', encoding='utf-8') as f:
        for data in datas:
            writeCont = (data.name + "/" + data.gonglishu + "/"
                         + data.brought_year + "/" + data.location + "\n"
                         + data.price + "圖片地址:" + data.img_url)
            f.write(writeCont + '\n\n')
    print('儲存完成。')


# --- Persist records to MySQL via the SQLAlchemy ORM ---
from sqlalchemy import Column, create_engine, Integer, String
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker

Base = declarative_base()


class Car(Base):
    """ORM model for one second-hand car row in table ``second_cars``."""
    __tablename__ = "second_cars"
    id = Column(Integer, primary_key=True, autoincrement=True, nullable=False)
    carName = Column(String(100))
    gonglishu = Column(String(20))
    brought_year = Column(String(10))
    location = Column(String(10))
    image_url = Column(String(200))
    price = Column(String(10))


def saveMysql(datas):
    """Insert every record in *datas* into MySQL in a single transaction.

    Creates the table on first use via ``Base.metadata.create_all``.
    """
    # SECURITY NOTE(review): credentials are hardcoded in the URL, and the
    # "[email protected]" fragment looks like it was mangled by the blog's
    # email-obfuscation — verify the real user:password@host before running.
    connect = create_engine(
        "mysql+pymysql://root:[email protected]:3306/second_cars",
        encoding='utf-8', echo=True)
    Base.metadata.create_all(connect)
    DBsession = sessionmaker(bind=connect)
    session = DBsession()
    try:
        for data in datas:
            session.add(Car(
                carName=data.name,
                gonglishu=data.gonglishu,
                brought_year=data.brought_year,
                price=data.price,
                location=data.location,
                image_url=data.img_url,
            ))
        session.commit()
    finally:
        # Always release the connection, even if the commit raises.
        session.close()
反思
儲存到mysql資料庫時,建立新物件並傳參時有點複雜,我曾經記得有種很簡單明瞭的方法,現在怎麼也想不起來,望指教哈。