1. 程式人生 > 實用技巧 >python DataFrame 交併差集

python DataFrame 交併差集

def _build_mysql_handle(param):
    """Create a mysql.MYSQL object from one parameter mapping.

    Reads the keys 'db_ip', 'db_port', 'db_user', 'db_pwd' and 'db_name'
    from ``param``; the port value is converted with int() before use.
    """
    return mysql.MYSQL(
        host=param['db_ip'],
        port=int(param['db_port']),
        user=param['db_user'],
        pwd=param['db_pwd'],
        db=param['db_name'],
    )


# One handle per side of the comparison: source table and target table.
smysqldb = _build_mysql_handle(source_param)
tmysqldb = _build_mysql_handle(target_param)
# Query template that lists column metadata for one table from
# information_schema; fill in {table_name} and {database_name} with
# str.format before running it.
desc_sql = (
    "SELECT c.COLUMN_NAME AS filed_name,"
    "c.COLUMN_TYPE AS filed_type,"
    "c.DATA_TYPE as data_type,"
    "c.CHARACTER_MAXIMUM_LENGTH as char_length"
    " FROM information_schema. TABLES t"
    " INNER JOIN information_schema. COLUMNS c"
    " ON t.TABLE_NAME = c.TABLE_NAME"
    " AND t.TABLE_SCHEMA = c.TABLE_SCHEMA"
    " WHERE t.TABLE_NAME = '{table_name}'"
    " AND t.TABLE_SCHEMA = '{database_name}'"
)
# DataFrame column labels, in the same order as the SELECT aliases above.
filed_columns = [
    'filed_name',
    'filed_type',
    'data_type',
    'char_length',
]

# Column metadata of the source table, one row per column.
source_desc_sql = desc_sql.format(
    table_name=source_param['table_name'],
    database_name=source_param['db_name'],
)
source_rows = smysqldb.ExecQuery(source_desc_sql)
df1 = pd.DataFrame(source_rows, columns=filed_columns)

# Column metadata of the target table, built the same way.
target_desc_sql = desc_sql.format(
    table_name=target_param['table_name'],
    database_name=target_param['db_name'],
)
target_rows = tmysqldb.ExecQuery(target_desc_sql)
df2 = pd.DataFrame(target_rows, columns=filed_columns)

df1內容

df2內容

可以看出df2比df1多兩個欄位 etl_date,real_pay_success_time

1)inner join、left join、right join、outer join

# No `on=` is given, so each merge joins on every column the two frames share.

# Inner join: only the rows present in both df1 and df2 (the intersection).
inner_df = df1.merge(df2, how='inner')
print(inner_df)

# Left join: every row of df1 is kept.
left_df = df1.merge(df2, how='left')
print(left_df)

# Right join: every row of df2 is kept.
right_df = df1.merge(df2, how='right')
print(right_df)

# Outer join: the union, i.e. all rows from df1 and from df2.
outer_df = df1.merge(df2, how='outer')
print(outer_df)

2)求差集

df1-df2

# df1 - df2: df2 is stacked twice so each of its keys occurs at least twice;
# keep=False then drops every repeated key, leaving only keys unique to df1.
# NOTE: a key that repeats inside df1 itself is dropped as well.
diff_keys = ['filed_name', 'filed_type']
stacked = pd.concat([df1, df2, df2])
df = stacked.drop_duplicates(subset=diff_keys, keep=False)
print(df)

df2-df1

# df2 - df1: df1 is stacked twice so each of its keys occurs at least twice;
# keep=False then drops every repeated key, leaving only keys unique to df2.
# NOTE: a key that repeats inside df2 itself is dropped as well.
diff_keys = ['filed_name', 'filed_type']
stacked = pd.concat([df2, df1, df1])
df = stacked.drop_duplicates(subset=diff_keys, keep=False)
print(df)

當df1的欄位全部包含在df2中時(如本例),上面的df2-df1等同於下面的寫法;一般情況下,下面的寫法求的是df1與df2的對稱差集(只出現在其中一方的資料)

# Symmetric difference of df1 and df2 on (filed_name, filed_type): stack both
# frames once and drop every key that occurs more than once (keep=False), so
# only keys that occur exactly once in the stacked data remain. This matches
# df2 - df1 only when every df1 key also exists in df2.
# DataFrame.append was removed in pandas 2.0; pd.concat is its replacement.
fileds_df = pd.concat([df1, df2]).drop_duplicates(subset=['filed_name', 'filed_type'], keep=False)
print(fileds_df)
# Same computation with the stacking order reversed (df2's rows come first).
fileds_df = pd.concat([df2, df1]).drop_duplicates(subset=['filed_name', 'filed_type'], keep=False)
print(fileds_df)