python3 ks檢驗 求平均值方差標準差中位數 dataframe使用engine寫入資料庫 pandas使用
阿新 • • 發佈:2018-11-02
需求是這樣的:將兩個資料集進行ks檢驗,算中位數方差標準差等資料,最後輸出到資料庫中
import psycopg2
import os
import pandas as pd
from scipy.stats import ks_2samp
import numpy as np
from sqlalchemy import create_engine
class GPCommand(object):
    """Small wrapper around a psycopg2 connection used to pull order data.

    The connection settings below are placeholders and must be replaced
    with real values before the script can run.
    """

    def __init__(self):
        self.hostname = 'ip'
        self.username = 'name'
        self.password = 'password'
        self.database = 'database'
        # Populated by connectGp(); kept as None until then so that
        # closeMysql() is safe to call even if connecting failed.
        self.connect = None
        self.cursor = None

    def connectGp(self):
        """Open the database connection and create a cursor.

        Returns:
            'con_successful' on success, otherwise a string starting with
            'con_error' followed by the failure details.
        """
        try:
            self.connect = psycopg2.connect(
                host=self.hostname,
                user=self.username,
                password=self.password,
                dbname=self.database,
            )
            self.cursor = self.connect.cursor()
        except psycopg2.Error as exc:
            # The original formatted sys.exc_info() without importing sys,
            # which raised NameError here; format the exception directly.
            error = 'Failed to setup Postgres environment.\n{0}'.format(exc)
            print('connect gp error.'+'\n' + '資料庫連線失敗')
            return 'con_error' + error
        print("connect gp successful."+'\n' + '資料庫連線成功')
        return 'con_successful'

    def closeMysql(self):
        """Close the cursor and the connection if they were opened.

        The name is historical; the connection is made with psycopg2.
        """
        if self.cursor is not None:
            self.cursor.close()
            self.cursor = None
        if self.connect is not None:
            self.connect.close()
            self.connect = None
        print("資料庫已關閉")

    def select_data(self):
        """Run the two order queries and return their rows.

        Returns:
            (rows1, rows2): rows1 holds (order_no, cust_no, item_num) for
            every order confirmed since 2015-01-01; rows2 holds
            (cust_no, order_no, order_num, item_num) restricted to
            customers with more than 5 orders in that period.

        Raises:
            RuntimeError: if connectGp() has not succeeded yet.
            Exception: any error raised while executing the queries is
                printed and re-raised (previously the process was killed
                with os._exit(0), i.e. a *success* exit status).
        """
        if self.cursor is None:
            raise RuntimeError('select_data() called before a successful connectGp()')
        select_sql1 = (
            "select distinct a.order_no,a.cust_no,b.item_num from dw_edw.edw_ord_fct_order_info a "
            "left join (select order_no,count(distinct item_no) as item_num "
            "from dw_edw.edw_ord_fct_order_items_info group by order_no)b on a.order_no=b.order_no where a.customer_confirm_date>='2015-01-01 00:00:00' "
        )
        select_sql2 = (
            "select t1.*,t2.order_num,t3.item_num from( select distinct cust_no,order_no from dw_edw.edw_ord_fct_order_info "
            "where customer_confirm_date>='2015-01-01 00:00:00' )t1 left join ( select cust_no,count(distinct order_no) as order_num "
            "from dw_edw.edw_ord_fct_order_info where customer_confirm_date>='2015-01-01 00:00:00' group by cust_no )t2 on t1.cust_no=t2.cust_no "
            "left join (select order_no,count(distinct item_no) as item_num from dw_edw.edw_ord_fct_order_items_info group by order_no)t3 "
            "on t1.order_no=t3.order_no where t2.order_num>5 "
        )
        try:
            self.cursor.execute(select_sql1)
            rows1 = self.cursor.fetchall()
            self.cursor.execute(select_sql2)
            rows2 = self.cursor.fetchall()
        except Exception as exc:
            print(exc)
            raise
        return rows1, rows2


def kt(df1, df2, x):
    """KS-test one customer's item counts against the whole population.

    Args:
        df1: frame with an 'item_num1' column (the reference sample).
        df2: frame with 'cust_no2' and 'item_num2' columns.
        x: position of the customer among the sorted distinct 'cust_no2'
            values of df2.

    Returns:
        (cust_no, pvalue) where pvalue comes from a two-sample
        Kolmogorov-Smirnov test of df1['item_num1'] against that
        customer's 'item_num2' values.

    Raises:
        IndexError: if x is out of range.
    """
    # groupby sorts the keys, so position x is stable across calls.
    customers = df2.groupby('cust_no2').size().index
    cust_no = customers[x]
    sample = df2.loc[df2['cust_no2'] == cust_no, 'item_num2']
    return cust_no, ks_2samp(df1['item_num1'], sample).pvalue


def _ks_pvalues_by_customer(df1, df2):
    """Return [(cust_no, pvalue), ...] for every customer in df2, in one groupby pass."""
    population = df1['item_num1']
    return [
        (cust_no, ks_2samp(population, group['item_num2']).pvalue)
        for cust_no, group in df2.groupby('cust_no2')
    ]


def _group_stats(df, key):
    """Return per-group mean, variance, standard deviation and median of df."""
    return df.groupby(key).agg(['mean', 'var', 'std', 'median'])


def insert_data(data):
    """Append the DataFrame to dw_ana.market_sales_precision_ks_p_test.

    Failures are printed rather than raised (best-effort write, as in the
    original script).
    """
    # Placeholder URL (user:password@host:port/database). The scraped
    # original had the "password@ip" part mangled by e-mail obfuscation.
    engine = create_engine('postgresql://name:password@ip:port/database')
    try:
        data.to_sql('market_sales_precision_ks_p_test', schema='dw_ana',
                    con=engine, index=False, if_exists='append')
    except Exception as e:
        print(e)


def main():
    """Fetch the order data, KS-test every customer, and store the p-values."""
    gpCommand = GPCommand()
    status = gpCommand.connectGp()
    if status != 'con_successful':
        # Nothing can be queried without a connection; exit non-zero with
        # the error details returned by connectGp().
        raise SystemExit(status)
    try:
        rows1, rows2 = gpCommand.select_data()
    finally:
        # Release the connection as soon as the rows are in memory, and
        # also when a query fails.
        gpCommand.closeMysql()

    # rows1 columns: order_no, cust_no, item_num
    df1 = pd.DataFrame({'cust_no1': [row[1] for row in rows1],
                        'item_num1': [row[2] for row in rows1]})
    # rows2 columns: cust_no, order_no, order_num, item_num
    df2 = pd.DataFrame({'cust_no2': [row[0] for row in rows2],
                        'item_num2': [row[3] for row in rows2]})

    # Per-customer mean / variance / standard deviation / median.
    # NOTE(review): as in the original script these are computed but never
    # stored or printed -- decide whether they should be persisted.
    stats1 = _group_stats(df1, 'cust_no1')
    stats2 = _group_stats(df2, 'cust_no2')

    # Distribution analysis: KS test of each customer against everyone.
    # NOTE(review): item_num comes from a LEFT JOIN and may be NULL; confirm
    # how missing values should be treated by the test.
    df_pdata = pd.DataFrame(_ks_pvalues_by_customer(df1, df2),
                            columns=['cust_no', 'p'])
    insert_data(df_pdata)


if __name__ == "__main__":
    main()