爬蟲高階應用03---資料庫和Json獲取
阿新 • • 發佈:2018-12-17
爬蟲基礎day03—資料庫和動態獲取
一、和MySQL資料庫建立連線
import pymysql


def main():
    """Insert one sample row into the ``beautiful`` table.

    Opens a connection to the local MySQL server (database ``spider``),
    executes a single parameterized INSERT, and commits it. The
    transaction is rolled back if anything fails, and the connection is
    always closed before returning.

    Raises:
        pymysql.MySQLError: if connecting, executing, or committing fails.
    """
    # Connection settings for the local MySQL instance.
    db = pymysql.connect(
        host='localhost',
        port=3306,
        user='root',
        password='123456',
        db='spider',
        charset='utf8')
    try:
        # The cursor is a context manager; it is closed on leaving the block.
        with db.cursor() as cursor:
            # The statement must be a plain string (a trailing comma here
            # would turn it into a tuple). Values are passed separately so
            # the driver escapes them.
            sql = "INSERT INTO beautiful(url, content) VALUES (%s, %s)"
            cursor.execute(sql, ('123', '測試'))
        # Nothing is persisted until the transaction is committed.
        db.commit()
    except Exception:
        # Undo the partial transaction, then let the caller see the error.
        db.rollback()
        raise
    finally:
        db.close()


if __name__ == '__main__':
    main()
注意:為防止重複插入資料,可為欄位建立唯一索引(主鍵同樣具有唯一性約束)。
格式:alter table 表 add unique(欄位)
示例:alter table money add primary key(id);
解釋:為money表的id欄位增加主鍵索引
二、動態獲取(以蘑菇街為例)
import json

import requests
from lxml import etree


def get_one_page():
    """Fetch the raw JSONP response of the first Mogujie search page.

    Returns:
        The response body decoded as UTF-8 when the server answers with
        HTTP 200, otherwise ``None``.

    Raises:
        requests.RequestException: on network failure or timeout.
    """
    url = 'https://list.mogujie.com/search?callback=jQuery21104432147899441732_1540347837433&_version=8193&ratio=3%3A4&cKey=15&page=1&sort=pop&ad=0&fcid=50206&action=trousers&acm=3.mce.1_10_1hepw.109731.0.ubj8Qr7mesgMd.pos_1-m_406086-sd_119-mf_15261_1047900-idx_0-mfs_4-dm1_5000&ptp=1._mf1_1239_15261.0.0.wdmwVEI3&_=1540347837434'
    headers = {
        "User-Agent": "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)"
    }
    # Without a timeout, requests can block forever on a stalled server.
    response = requests.get(url, headers=headers, timeout=10)
    if response.status_code == 200:
        return response.content.decode('utf-8')
    return None


def get_real_content(html):
    """Strip the JSONP wrapper ``callback( ... );`` and return the JSON text.

    Args:
        html: Raw response text, or ``None``.

    Returns:
        The text between the first ``(`` and the last ``)``, or ``None``
        when the input is empty, 128 characters or shorter, or has no
        parenthesised payload.
    """
    if not html or len(html) <= 128:
        return None
    # Slice from the first '(' to the last ')' instead of splitting on '(':
    # the JSON payload itself may contain parentheses or ");" inside string
    # values, which splitting/replacing would corrupt.
    start = html.find('(')
    end = html.rfind(')')
    if start == -1 or end <= start:
        return None
    return html[start + 1:end]


def main():
    """Download the page, unwrap the JSONP payload, and print its status code."""
    html = get_one_page()
    html_content = get_real_content(html)
    print(html_content)
    if html_content is None:
        # Nothing usable was fetched; json.loads(None) would raise TypeError.
        return
    result = json.loads(html_content)
    print(result['status']['code'])


if __name__ == '__main__':
    main()