pandas切割mongodb資料
阿新 • • 發佈:2018-11-16
import pandas as pd import numpy as np import missingno as mso from pandas import Series,DataFrame import pymongo import pymysql import time #連線資料庫 def select_mysql(): db = pymysql.connect(host='localhost', port=3306, user='root', password='123456', db='test', charset='utf8') cur = db.cursor() sql = "select cid from spider_category_task " # sql = "select cid from spider_category_task where cid = 110205"#測試 cur.execute(sql) cid_list = cur.fetchall() db.close() for cid in cid_list: cid =cid[0] find_pymongo_data(cid) def find_pymongo_data(category): a = 25 ''' 傳入一個類目名稱,從mongo裡面查出該類目的所有資料,並把 它轉換為pandas裡面的DataFrame物件 :param key_word: 類目名稱 :return: ''' mongo_user = "root" mongo_pwd = "123456*" client = pymongo.MongoClient(host = 'localhost',port = 27017 ) #資料庫登入需要賬號密碼的話 client.test.authenticate(mongo_user,mongo_pwd) # test代表資料庫名稱 db = client['test']#獲得資料庫的控制代碼 coll = db['20181022']#獲得collection的控制代碼 coll.create_index('category')#新增索引 #載入資料 # data = DataFrame(list(coll.find({'key_word':key_word})))#根據搜尋的關鍵詞拿出資料 mongo_data = DataFrame(list(coll.find({'category':category}))) client.close() try: if len(mongo_data) > 0:#查詢結果不為空執行 mongo_data = mongo_data[['price','sold']]#取裡面的價格和銷量 data = mongo_data[~mongo_data['sold'].isin([''])]#取出裡面的價格不為空的資料 print(data) data = data.astype({'price': 'float', 'sold': 'int'})#價格轉為浮點型.銷量轉為整數型 splite_price(data,category,a) except Exception as e: pass def splite_price(data,category, a): ''' :param data: 從mongo裡面獲取DataFrame物件的資料 :return: ''' try: data['group'] = pd.qcut(data.price,a)#價格切割為25段,如果切不了,依次減一,遞迴繼續切 # 求每個價格分段對應的sales的和 datas = data[['sold', 'group']].groupby('group').sum().reset_index() except Exception as e: if a > 0: a -= 1 splite_price(data,category, a) else: print('一段也切不了') # print((data['group']))#二位陣列 price_group =datas['group'] sold_group = datas['sold'] price_group_list = [] for i in price_group: i = (str(i)).replace('(','') i = i.replace(']','') price_list = i.split(',') price_group_list.append(price_list) # low_price = price_list[0] # high_price =price_list[1] data_list = zip(price_group_list,sold_group)#打包,兩個列表合為一個列表(資料一一對應) data_list = (list(data_list)) print(data_list) # insert_mysql(data_list,category) # def insert_mysql(data_list,category): ''' 插入到資料庫裡面 :param data_list: :param category: :return: ''' # print(data_list) pass datetime = time.strftime('%Y%m%d') # data_list = splite_price() db = pymysql.connect(host = 'localhost',port = 3306,user = 'test',password = 'root*',db = 'test',charset = 'utf8') cur = db.cursor() for data in data_list: min_price = data[0][0] max_price = data[0][1] min_price = float(min_price) max_price = float(max_price) sold = data[1] # print(sold) sql = "insert into price_break_test (min_price,max_price,cate_id,sold,date_time) values(%s,%s,'%s','%s','%s')"%(min_price,max_price,category,sold,datetime) cur.execute(sql) db.commit() db.close() select_mysql()