1. 程式人生 > >pandas切割mongodb資料

pandas切割mongodb資料

import pandas as pd
import numpy as np
import missingno as mso
from pandas import Series,DataFrame
import pymongo
import pymysql
import time
#連線資料庫
def select_mysql():
    db = pymysql.connect(host='localhost', port=3306, user='root', password='123456', db='test',
                         charset='utf8')
    cur = db.cursor()
    sql = "select cid from spider_category_task "
    # sql = "select cid from spider_category_task where cid = 110205"#測試
    cur.execute(sql)
    cid_list = cur.fetchall()
    db.close()
    for cid in cid_list:
        cid =cid[0]
        find_pymongo_data(cid)

def find_pymongo_data(category):
    a = 25
    '''
    傳入一個類目名稱,從mongo裡面查出該類目的所有資料,並把
    它轉換為pandas裡面的DataFrame物件
    :param key_word: 類目名稱
    :return:
    '''
    mongo_user = "root"
    mongo_pwd = "123456*"
    client = pymongo.MongoClient(host = 'localhost',port = 27017 )
    #資料庫登入需要賬號密碼的話
    client.test.authenticate(mongo_user,mongo_pwd) # test代表資料庫名稱
    db = client['test']#獲得資料庫的控制代碼
    coll = db['20181022']#獲得collection的控制代碼

    coll.create_index('category')#新增索引
    #載入資料
    # data = DataFrame(list(coll.find({'key_word':key_word})))#根據搜尋的關鍵詞拿出資料
    mongo_data = DataFrame(list(coll.find({'category':category})))
    client.close()
    try:
        if len(mongo_data) > 0:#查詢結果不為空執行
            mongo_data = mongo_data[['price','sold']]#取裡面的價格和銷量
            data = mongo_data[~mongo_data['sold'].isin([''])]#取出裡面的價格不為空的資料
            print(data)
            data = data.astype({'price': 'float', 'sold': 'int'})#價格轉為浮點型.銷量轉為整數型
            splite_price(data,category,a)
    except Exception as e:
        pass


def splite_price(data,category, a):
    '''
    :param data: 從mongo裡面獲取DataFrame物件的資料
    :return:
    '''
    try:
        data['group'] = pd.qcut(data.price,a)#價格切割為25段,如果切不了,依次減一,遞迴繼續切
        # 求每個價格分段對應的sales的和
        datas = data[['sold', 'group']].groupby('group').sum().reset_index()
    except Exception as e:
        if a > 0:
            a -= 1
            splite_price(data,category, a)
        else:
            print('一段也切不了')


    # print((data['group']))#二位陣列


    price_group =datas['group']
    sold_group = datas['sold']
    price_group_list = []
    for i in price_group:
        i = (str(i)).replace('(','')
        i = i.replace(']','')
        price_list = i.split(',')
        price_group_list.append(price_list)
        # low_price = price_list[0]
        # high_price =price_list[1]
    data_list = zip(price_group_list,sold_group)#打包,兩個列表合為一個列表(資料一一對應)
    data_list = (list(data_list))
    print(data_list)
#     insert_mysql(data_list,category)
#
def insert_mysql(data_list,category):
    '''
    插入到資料庫裡面
    :param data_list: 
    :param category: 
    :return: 
    '''
    # print(data_list)
    pass
    datetime = time.strftime('%Y%m%d')
    # data_list = splite_price()
    db = pymysql.connect(host = 'localhost',port = 3306,user = 'test',password = 'root*',db = 'test',charset = 'utf8')
    cur = db.cursor()
    for data in data_list:
        min_price = data[0][0]
        max_price = data[0][1]
        min_price = float(min_price)
        max_price = float(max_price)
        sold = data[1]
        # print(sold)
        sql = "insert into price_break_test (min_price,max_price,cate_id,sold,date_time) values(%s,%s,'%s','%s','%s')"%(min_price,max_price,category,sold,datetime)
        cur.execute(sql)
    db.commit()
    db.close()


select_mysql()