python DataFrame 交併差集
阿新 • • 發佈:2020-08-19
# Open one connection to the source database and one to the target database.
smysqldb = mysql.MYSQL(
    host=source_param['db_ip'],
    port=int(source_param['db_port']),
    user=source_param['db_user'],
    pwd=source_param['db_pwd'],
    db=source_param['db_name'],
)
tmysqldb = mysql.MYSQL(
    host=target_param['db_ip'],
    port=int(target_param['db_port']),
    user=target_param['db_user'],
    pwd=target_param['db_pwd'],
    db=target_param['db_name'],
)

# Query template that lists name, column type, data type and max char length
# of every column of one table, read from information_schema.
# NOTE(review): the table and schema names are spliced in with str.format, so
# they must come from trusted configuration, not from user input.
desc_sql = (
    "SELECT c.COLUMN_NAME AS filed_name,c.COLUMN_TYPE AS filed_type,c.DATA_TYPE as data_type,c.CHARACTER_MAXIMUM_LENGTH as char_length FROM information_schema. TABLES t INNER JOIN information_schema. COLUMNS c ON t.TABLE_NAME = c.TABLE_NAME"
    + " AND t.TABLE_SCHEMA = c.TABLE_SCHEMA WHERE t.TABLE_NAME = '{table_name}' AND t.TABLE_SCHEMA = '{database_name}'"
)

# DataFrame column labels, in the same order as the SELECT list above.
filed_columns = ['filed_name', 'filed_type', 'data_type', 'char_length']

# df1 describes the source table, df2 the target table.
# Presumably ExecQuery returns the result rows as 4-item sequences - confirm
# against the mysql helper module.
df1 = pd.DataFrame(
    smysqldb.ExecQuery(
        desc_sql.format(
            table_name=source_param['table_name'],
            database_name=source_param['db_name'],
        )
    ),
    columns=filed_columns,
)
df2 = pd.DataFrame(
    tmysqldb.ExecQuery(
        desc_sql.format(
            table_name=target_param['table_name'],
            database_name=target_param['db_name'],
        )
    ),
    columns=filed_columns,
)
df1內容
df2內容
可以看出df2比df1多兩個欄位 etl_date,real_pay_success_time
1)inner join、left join、right join、outer join
# pd.merge without `on=` joins on every column name the two frames share.
inner_df = pd.merge(df1, df2, how='inner')  # intersection: rows present in both df1 and df2
print(inner_df)
left_df = pd.merge(df1, df2, how='left')  # keeps every row of df1
print(left_df)
right_df = pd.merge(df1, df2, how='right')  # keeps every row of df2
print(right_df)
outer_df = pd.merge(df1, df2, how='outer')  # union: every row found in df1 or df2
print(outer_df)
2)求差集
df1-df2
# df1 - df2, compared on the (filed_name, filed_type) pair.
# df2 is stacked twice so each of its keys occurs at least twice; keep=False
# then discards every key that occurs more than once, leaving only the keys
# found solely in df1 (assuming a key is not repeated inside df1 itself).
df = pd.concat([df1, df2, df2])
df = df.drop_duplicates(subset=['filed_name', 'filed_type'], keep=False)
print(df)
df2-df1
# df2 - df1, compared on the (filed_name, filed_type) pair.
# df1 is stacked twice so each of its keys is dropped by keep=False; only keys
# found solely in df2 remain (assuming a key is not repeated inside df2).
df = pd.concat([df2, df1, df1]).drop_duplicates(subset=['filed_name', 'filed_type'], keep=False)
print(df)
由於本例中 df1 的資料是 df2 的子集,上面的 df2-df1 等同於下面的寫法(一般情況下,下面的寫法得到的是對稱差集,即只出現在其中一個 DataFrame 的資料):
# Symmetric difference on (filed_name, filed_type): keep=False removes every
# key that occurs more than once, so only keys present in exactly one of the
# two frames remain.
# pd.concat is used because DataFrame.append was removed in pandas 2.0;
# pd.concat([a, b]) stacks the rows the same way a.append(b) did.
fileds_df = pd.concat([df1, df2]).drop_duplicates(subset=['filed_name', 'filed_type'], keep=False)
print(fileds_df)
# Swapping the operands selects the same rows; only the row order can differ.
fileds_df = pd.concat([df2, df1]).drop_duplicates(subset=['filed_name', 'filed_type'], keep=False)
print(fileds_df)