1. 程式人生 > 其它 >Python爬取資料並輸出到資料庫

Python爬取資料並輸出到資料庫

程式碼如下:

  1 import pymysql
  2 import requests
  3 import json
  4 import time
  5 
  6 
  7 def daorumysql(items):
  8     # 連線資料庫
  9     db = pymysql.connect(host='localhost', port=3306, user='root', password='123456', db='jingdong', charset='utf8')
 10     cursor = db.cursor()
 11     # 測試,列印一下mysql版本
12 cursor.execute("select version()") 13 14 sql = 'insert into shishi(id,guid,content,creationTime,isTop,referenceTime,firstCategory,secondCategory,thirdCategory,replyCount,score,nickname,userClient,productColor,productSize,plusAvailable,productSales,days,afterDays) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
' 15 16 index = 1 # 行數 17 for data in items: # items是十條資料 data是其中一條(一條下有三個內容) 18 # 執行sql語句 19 values = ( 20 data[0], data[1], data[2], data[3], data[4], data[5], data[6], data[7], data[8], data[9], data[10], data[11], 21 data[12], data[13], data[14], data[15], data[16], data[17], data[18])
22 cursor.execute(sql, values) 23 db.commit() 24 25 cursor.close() 26 db.close() 27 28 29 def start(page): 30 # 獲取URL 31 # score 評價等級 page=0 第一頁 producitid 商品類別 32 # url = 'https://club.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98&productId=100014348492&score=0&sortType=5&page='+str(page)+'&pageSize=3100&isShadowSku=0&fold=1' 33 34 url = 'https://club.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98&productId=100034710036&score=1&sortType=5&page=' + str( 35 page) + '&pageSize=10&isShadowSku=0&fold=1' 36 37 headers = { 38 "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Mobile Safari/537.36" 39 } 40 time.sleep(2) 41 test = requests.get(url=url, headers=headers) 42 t = test.text 43 data = json.loads(t.lstrip('fetchJSON_comment98vv12345(').rstrip(');')) 44 return data 45 46 47 def parse(data): 48 items = data['comments'] 49 for i in items: 50 yield ( 51 i['id'], # id 52 i['guid'], 53 i['content'], # 內容 54 i['creationTime'], # 時間 55 i['isTop'], 56 i['referenceTime'], 57 i['firstCategory'], 58 i['secondCategory'], 59 i['thirdCategory'], 60 i['replyCount'], 61 i['score'], 62 i['nickname'], 63 i['userClient'], 64 i['productColor'], 65 i['productSize'], 66 i['plusAvailable'], 67 i['productSales'], 68 i['days'], 69 i['afterDays'] 70 ) 71 72 73 def create(): 74 # 連線資料庫 75 db = pymysql.connect(host='localhost', port=3306, user='root', password='ym123', db='jingdong', charset='utf8') 76 cursor = db.cursor() 77 # 測試,列印一下mysql版本 78 cursor.execute("select version()") 79 data = cursor.fetchone() 80 print("Database Version:%s" % data) 81 82 # 執行建立分詞表的SQL語句 83 sql1 = """ 84 CREATE TABLE shishi ( 85 id varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL, 86 guid varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL, 87 content varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL, 88 creationTime varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL, 89 isTop varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL, 90 referenceTime varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL, 91 firstCategory varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL, 92 secondCategory varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL, 93 thirdCategory varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL, 94 replyCount varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL, 95 score varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL, 96 nickname varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL, 97 userClient varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL, 98 productColor varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL, 99 productSize varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL, 100 plusAvailable varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL, 101 productSales varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL, 102 days varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL, 103 afterDays varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL 104 )ENGINE = InnoDB CHARACTER SET = utf8 COLLATE = utf8_general_ci ROW_FORMAT = Dynamic; 105 """ 106 # 執行SQL語句 107 cursor.execute(sql1) 108 print("建立shishi表成功") 109 110 111 def main(): 112 create() 113 114 j = 1 # 頁面數 115 116 for i in range(0, 100): 117 time.sleep(1.5) 118 # 記得time反爬 其實我在爬取的時候沒有使用代理ip也沒給我封 不過就當這是個習慣吧 119 first = start(j) 120 test = parse(first) 121 122 daorumysql(test) 123 124 print('' + str(j) + '頁抓取完畢\n') 125 j = j + 1 126 127 128 if __name__ == '__main__': 129 main()