Tencent Advertising Algorithm Competition Top 4: SDD Model 1 (FM_keras)
阿新 • Published 2018-12-15
FM-family neural networks built on Keras
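For reference, the second-order factorization machine (FM) term that this model's pairwise-interaction layers emulate is, for feature values x_i with embedding vectors v_i in R^k:

    \hat{y}_{\mathrm{FM}}(x) = w_0 + \sum_{i=1}^{n} w_i x_i + \sum_{i=1}^{n} \sum_{j=i+1}^{n} \langle v_i, v_j \rangle \, x_i x_j

Rather than summing those pairwise terms into a single scalar as plain FM does, the network below keeps the full element-wise product vectors (multiply) as well as the scalar dot products (dot) of every field pair, and lets an MLP combine them.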
import numpy as np
import pandas as pd
import time
import gc
import sys
import itertools
import os

from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

import tensorflow as tf  # required by the custom loss/metric functions below
import keras
from keras.layers import *
from keras.models import *
from keras.callbacks import *
from keras.optimizers import *
from keras.applications import *
from keras.regularizers import *
from keras import backend as KK
# from keras.engine.topology import Layer
from keras.metrics import categorical_accuracy
from keras.utils import multi_gpu_model

gpus_num = 2
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"

# single-valued categorical fields and their maximum encoded values
# (new_AID's maximum is appended after the data is read)
single_emb = ['LBS', 'age', 'carrier', 'consumptionAbility', 'education', 'gender',
              'house', 'os', 'ct', 'marriageStatus', 'advertiserId', 'campaignId',
              'creativeId', 'adCategoryId', 'productId', 'productType',
              'creativeSize', 'new_AID']
singel_max = [898, 5, 3, 2, 7, 3, 1, 4, 64, 27, 197, 479, 831, 74, 83, 3, 14]

# multi-valued fields: padded sequence length and maximum encoded value per field
model_col_name = ['interest1', 'interest2', 'interest3', 'interest4', 'interest5',
                  'kw1', 'kw2', 'kw3', 'topic1', 'topic2', 'topic3', 'aid_insterest1']
model_col_max_length = [33, 33, 10, 10, 33, 5, 5, 5, 5, 5, 5, 33]
model_col_max_value = [122, 82, 10, 10, 136, 796741, 121958, 58782, 9999, 9999, 9463, 95379]

# pre-built embedding-feature files
super_f_name = ['final_sdd_embedding_feature_aid_neg.csv',
                'final_sdd_embedding_feature_aid_pos.csv',
                'final_sdd_embedding_feature_adCategoryId_adCategoryId_neg.csv',
                'final_sdd_embedding_feature_adCategoryId_adCategoryId_pos.csv']
super_f_length = [29, 51, 29, 30]
super_f_max_value = [2230, 2230, 282, 282]
super_fuck_name = ['sdd_aid_neg', 'sdd_aid_pos', 'sdd_cate_neg', 'sdd_cate_pos']


def binary_crossentropy_with_ranking(y_true, y_pred):
    """Trying to combine ranking loss with numeric precision."""
    # first get the log loss like normal
    logloss = KK.mean(KK.binary_crossentropy(y_true, y_pred), axis=-1)
    # next, build a rank loss
    # clip the probabilities to keep stability
    y_pred_clipped = KK.clip(y_pred, KK.epsilon(), 1 - KK.epsilon())
    # translate into the raw scores before the logit
    y_pred_score = KK.log(y_pred_clipped / (1 - y_pred_clipped))
    # determine what the maximum score for a zero outcome is
    y_pred_score_zerooutcome_max = KK.max(tf.boolean_mask(y_pred_score, (y_true < 1)))
    # determine how much each score is above or below it
    rankloss = y_pred_score - y_pred_score_zerooutcome_max
    # only keep losses for positive outcomes
    rankloss = tf.boolean_mask(rankloss, tf.equal(y_true, 1))
    # only keep losses where the score is below the max
    rankloss = KK.square(KK.clip(rankloss, -100, 0))
    # average the loss over the positive outcomes (+1 guards against division by zero)
    rankloss = KK.sum(rankloss, axis=-1) / (KK.sum(KK.cast(y_true > 0, tf.float32)) + 1)
    return (rankloss + 1) * logloss
    # an alternative to try:
    # return logloss


# PFA: probability of false alert for a binary classifier
def binary_PFA(y_true, y_pred, threshold=KK.variable(value=0.5)):
    y_pred = KK.cast(y_pred >= threshold, 'float32')
    # N = total number of negative labels
    N = KK.sum(1 - y_true)
    # FP = total number of false alerts, alerts from the negative class labels
    FP = KK.sum(y_pred - y_pred * y_true)
    return FP / N


# PTA: probability of true alert for a binary classifier
def binary_PTA(y_true, y_pred, threshold=KK.variable(value=0.5)):
    y_pred = KK.cast(y_pred >= threshold, 'float32')
    # P = total number of positive labels
    P = KK.sum(y_true)
    # TP = total number of correct alerts, alerts from the positive class labels
    TP = KK.sum(y_pred * y_true)
    return TP / P


def auc(y_true, y_pred):
    # approximate AUC by sweeping 1000 thresholds over the ROC curve
    ptas = tf.stack([binary_PTA(y_true, y_pred, k) for k in np.linspace(0, 1, 1000)], axis=0)
    pfas = tf.stack([binary_PFA(y_true, y_pred, k) for k in np.linspace(0, 1, 1000)], axis=0)
    pfas = tf.concat([tf.ones((1,)), pfas], axis=0)
    binSizes = -(pfas[1:] - pfas[:-1])
    s = ptas * binSizes
    return KK.sum(s, axis=0)
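A note on the auc metric just defined: it approximates the area under the ROC curve with a Riemann sum over 1000 thresholds. binary_PTA(t) is the true-positive rate and binary_PFA(t) the false-positive rate at threshold t, and the leading 1 concatenated onto pfas supplies the FPR at threshold 0, so the bin widths are the successive drops in FPR:

    \mathrm{AUC} = \int_0^1 \mathrm{TPR} \, d(\mathrm{FPR}) \approx \sum_{k=1}^{1000} \mathrm{TPR}(t_k) \left[ \mathrm{FPR}(t_{k-1}) - \mathrm{FPR}(t_k) \right], \qquad t_k \in \mathrm{linspace}(0, 1, 1000)

This tensor-only formulation can run on the GPU during training; the exact sklearn roc_auc_score is still computed on the validation set after training, at the end of the script.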
def log_loss(y_true, y_pred):
    # plain log loss, tracked as a metric during training
    logloss = KK.sum(KK.binary_crossentropy(y_true, y_pred), axis=-1)
    return logloss


def Mean_layer(x):
    # mean-pool the embedding vectors of a multi-valued field
    return KK.mean(x, axis=1)


def build_model(len_sdd, len_uid_count):
    field_sizes = len(single_emb) + len(model_col_name)
    emb_n = 128
    # FFM_layers = []
    inputs = []
    flatten_layers = []
    columns = range(len(single_emb))

    # ------ field embeddings (feed the second-order terms) ------ #
    for c in columns:
        inputs_c = Input(shape=(1,), dtype='int32', name='input_%s' % single_emb[c])
        num_c = singel_max[c] + 1
        inputs.append(inputs_c)
        print(num_c, c)
        embed_c = Embedding(num_c, emb_n, input_length=1,
                            name='embed_%s' % single_emb[c])(inputs_c)
        flatten_c = Reshape((emb_n,))(embed_c)
        flatten_layers.append(flatten_c)
        # FFM_temp = [Embedding(num_c, emb_n, input_length=1)(inputs_c) for i_i in range(field_sizes)]
        # FFM_layers.append(FFM_temp)

    # dense blocks of statistical features, projected to emb_n
    inputs_length = Input(shape=(len_sdd,))
    emb_length = Dense(emb_n, activation='relu')(inputs_length)
    inputs.append(inputs_length)
    inputs_uid_count = Input(shape=(len_uid_count,))
    emb_uid_count = Dense(emb_n, activation='relu')(inputs_uid_count)
    inputs.append(inputs_uid_count)
    # flatten_c = Reshape((emb_n,))(emb_length)
    # flatten_layers.append(flatten_c)

    # mean-pooled embeddings for the multi-valued fields
    for f in range(len(model_col_name)):
        inputs_f = Input(shape=(model_col_max_length[f],), name='input_%s' % model_col_name[f])
        num_f = model_col_max_value[f] + 1
        inputs.append(inputs_f)
        # print(num_f, f)
        embed_f = Embedding(num_f, emb_n, input_length=model_col_max_length[f],
                            name='embed_%s' % model_col_name[f])(inputs_f)
        embed_f = Lambda(Mean_layer)(embed_f)
        flatten_f = Reshape((emb_n,))(embed_f)
        flatten_layers.append(flatten_f)

    for f in range(len(super_fuck_name)):
        inputs_f = Input(shape=(super_f_length[f],), name='input_%s' % super_fuck_name[f])
        num_f = super_f_max_value[f] + 1
        inputs.append(inputs_f)
        # print(num_f, f)
        embed_f = Embedding(num_f, emb_n, input_length=super_f_length[f],
                            name='embed_%s' % super_fuck_name[f])(inputs_f)
        embed_f = Lambda(Mean_layer)(embed_f)
        flatten_f = Reshape((emb_n,))(embed_f)
        flatten_layers.append(flatten_f)
        # FFM_temp = [Lambda(Mean_layer)(Embedding(num_f, emb_n, input_length=model_col_max_length[f])(inputs_f)) for i_i in range(field_sizes)]
        # FFM_layers.append(FFM_temp)

    # FFM (left as a commented-out experiment)
    # FFM_product = []
    # for ff_i in range(field_sizes):
    #     for ff_j in range(ff_i + 1, field_sizes):
    #         FFM_product.append(Reshape((emb_n,))(multiply([FFM_layers[ff_i][ff_j], FFM_layers[ff_j][ff_i]])))
    # FFM_second_order = add(FFM_product)
    # FFM_second_order = concatenate(FFM_product)

    # ------ second-order terms: element-wise products of every field pair ------ #
    fm_layers = []
    for em1, em2 in itertools.combinations(flatten_layers, 2):
        dot_layer = Reshape((emb_n,))(multiply([em1, em2]))
        fm_layers.append(dot_layer)
    fm_layers = concatenate(fm_layers)

    # scalar dot products of every field pair (Keras 2: dot() replaces merge(mode='dot'))
    fm_2layers = []
    for em1, em2 in itertools.combinations(flatten_layers, 2):
        dot_layer = dot([em1, em2], axes=1)
        fm_2layers.append(dot_layer)
    fm_2layers = concatenate(fm_2layers)

    # y_first_order = add(flatten_layers)
    # y_first_order = BatchNormalization()(y_first_order)
    # y_first_order = Dropout(0.8)(y_first_order)

    # ------ deep part ------ #
    y_deep = concatenate(flatten_layers)
    # y_deep = Dense(256)(y_deep)  # adding BN made it worse; PReLU beats ReLU; 0.5 gives 7483
    # y_deep = Activation('relu', name='output_1')(y_deep)
    # y_deep = Dropout(0.5)(y_deep)
    # y_deep = Dense(128)(y_deep)
    # y_deep = Activation('relu', name='output_2')(y_deep)
    # y_deep = Dropout(0.5)(y_deep)

    concat_input = concatenate([fm_layers, fm_2layers, y_deep, emb_length, emb_uid_count], axis=1)
    concat_input = Dense(256)(concat_input)  # adding BN made it worse; PReLU beats ReLU; 0.5 gives 7483
    concat_input = Activation('relu', name='output_3')(concat_input)
    concat_input = Dropout(0.5)(concat_input)
    concat_input = Dense(128)(concat_input)
    concat_input = Activation('relu', name='output_4')(concat_input)
    concat_input = Dropout(0.5)(concat_input)
    # new_input = concatenate([fm_layers, y_deep, FFM_second_order], axis=1)

    outp = Dense(1, activation='sigmoid')(concat_input)
    model = Model(inputs=inputs, outputs=outp, name='model')

    # optimizer_adam = Adam(lr=0.002)
    optimizer_adam = Adam(lr=0.005, decay=0.0005, amsgrad=True)
    parallel_model = multi_gpu_model(model, gpus=gpus_num)
    parallel_model.compile(optimizer=optimizer_adam, loss='binary_crossentropy',
                           metrics=[auc, log_loss])
    # if loss_flag == 0:
    #     model.compile(loss='binary_crossentropy', optimizer=optimizer_adam, metrics=[auc, log_loss])
    # elif loss_flag == 1:
    #     model.compile(loss=binary_crossentropy_with_ranking, optimizer=optimizer_adam, metrics=[auc, log_loss])
    # model.summary()
    return parallel_model
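A side note on cost: the two itertools.combinations loops build C(34, 2) = 561 pairwise layers (18 single-valued + 12 multi-valued + 4 sdd_* fields), so the graph grows quadratically with the field count. If only the summed second-order term is needed, the standard FM identity sum_{i<j} v_i ⊙ v_j = ½[(sum_i v_i)² − sum_i v_i²] yields the same quantity in linear time. A minimal sketch of that alternative (fm_pairwise_sum is my name, not part of the original script):

# Sketch of the O(n*k) FM pairwise-interaction identity, as an alternative
# to the O(n^2) itertools.combinations loops in build_model above.
from keras.layers import Lambda, add
from keras import backend as KK

def fm_pairwise_sum(embeddings):
    # embeddings: list of (batch, emb_n) tensors, one per field.
    # Returns a (batch, emb_n) tensor equal to the sum over all field pairs
    # of the element-wise product of their embeddings.
    summed = add(embeddings)                                          # sum_i v_i
    square_of_sum = Lambda(KK.square)(summed)                         # (sum_i v_i)^2
    sum_of_square = add([Lambda(KK.square)(e) for e in embeddings])   # sum_i v_i^2
    # 0.5 * [(sum_i v_i)^2 - sum_i v_i^2] == sum_{i<j} v_i * v_j
    return Lambda(lambda t: 0.5 * (t[0] - t[1]))([square_of_sum, sum_of_square])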
# ------------------------------------------------------------------------------
# read data
t1 = time.time()
single_onehot = pd.read_csv("./SDD_data/final_sdd_single_onehot_embedding_feature2_mix_test12.csv", dtype='float32')
single_onehot['new_AID'] = LabelEncoder().fit_transform(single_onehot['aid'].apply(int))
print("read single_onehot over", time.time() - t1)
tem_max = single_onehot['new_AID'].max()
print("NEWID max", tem_max)
singel_max.append(tem_max)

# columns are [aid, uid, label, features...]; label == -1 marks the test rows
train_set = single_onehot[single_onehot['label'] != -1].values[:, 3:]     # train data
label_train = single_onehot[single_onehot['label'] != -1].values[:, 2:3]  # label data
test_set = single_onehot[single_onehot['label'] == -1].values[:, 3:]      # test data
subfile = single_onehot[single_onehot['label'] == -1][['aid', 'uid', 'label']]
del single_onehot; gc.collect()
print("seg over", time.time() - t1)

ramdom_seed = 1
spilt_prob = 0.05
train_x, evals_x, train_y, evals_y = train_test_split(train_set, label_train,
                                                      test_size=spilt_prob,
                                                      random_state=ramdom_seed)
del train_set; gc.collect()
print('split data over!', time.time() - t1)

# one input array per single-valued feature; the last slice covers the
# remaining column(s) in one block
X_train = []
X_valid = []
X_test = []
for i in range(len(single_emb) - 1):
    X_train.append(train_x[:, i:i + 1])
    X_valid.append(evals_x[:, i:i + 1])
    X_test.append(test_set[:, i:i + 1])
X_train.append(train_x[:, (len(single_emb) - 1):])
X_valid.append(evals_x[:, (len(single_emb) - 1):])
X_test.append(test_set[:, (len(single_emb) - 1):])
print('input data over!', time.time() - t1)
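Everything from here on appends to X_train / X_valid / X_test in the same order that build_model declares its Input layers (18 single-valued features, then the two dense statistical blocks, then the 12 multi-valued fields, then the 4 sdd_* blocks); Keras matches the lists positionally, so any reordering silently mis-feeds the model. A small sanity check one could run before fitting (check_inputs is a hypothetical helper, not in the original):

# Sketch: verify that the assembled input arrays line up with the model's
# Input layers, in count and in declared width.
def check_inputs(model, X):
    assert len(model.inputs) == len(X), (len(model.inputs), len(X))
    for tensor, arr in zip(model.inputs, X):
        width = int(tensor.shape[1])  # width declared by the Input layer
        assert arr.shape[1] == width, (tensor.name, arr.shape, width)

# usage, once all blocks below have been appended:
# check_inputs(model, X_train)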
# ------ read user statistical features (dense block of len_sdd columns) ------ #
temp_file1 = pd.read_csv("user_statis_feature_train.csv", dtype='float32')
temp_file2 = pd.read_csv("user_statis_feature_test.csv", dtype='float32')
temp_file = pd.concat([temp_file1, temp_file2])
temp_file = temp_file.fillna(temp_file.mean())
del temp_file1, temp_file2; gc.collect()

from sklearn import preprocessing
len_sdd = 0
for i in list(temp_file):
    print(i, len_sdd)
    len_sdd = len_sdd + 1
    scaler = preprocessing.StandardScaler().fit(temp_file[i].values.reshape(-1, 1))
    temp_file[i] = scaler.transform(temp_file[i].values.reshape(-1, 1))
temp_train = temp_file.values[:45539700, :]  # train data
temp_test = temp_file.values[45539700:, :]   # test data
# temp_test = temp_test[11729073:, :]
print('temp_test', temp_test.shape)
del temp_file; gc.collect()
# the shared random_state keeps these rows aligned with the earlier split
temp_train_x, temp_evals_x, temp_train_y, temp_evals_y = train_test_split(
    temp_train, label_train, test_size=spilt_prob, random_state=ramdom_seed)
X_train.append(temp_train_x)
X_valid.append(temp_evals_x)
X_test.append(temp_test)
del temp_train_x, temp_evals_x; gc.collect()

# ------ uid count features ------ #
temp_file = pd.read_csv("SDD_data/sdd_uid_count.csv")
temp_file = temp_file.fillna(temp_file.mean())
len_uid_count = 0
for i in list(temp_file):
    print(i, len_uid_count)
    len_uid_count = len_uid_count + 1
    scaler = preprocessing.StandardScaler().fit(temp_file[i].values.reshape(-1, 1))
    temp_file[i] = scaler.transform(temp_file[i].values.reshape(-1, 1))
temp_train = temp_file.values[:45539700, :]  # train data
temp_test = temp_file.values[45539700:, :]   # test data
print('temp_test', temp_test.shape)
del temp_file; gc.collect()
temp_train_x, temp_evals_x, temp_train_y, temp_evals_y = train_test_split(
    temp_train, label_train, test_size=spilt_prob, random_state=ramdom_seed)
X_train.append(temp_train_x)
X_valid.append(temp_evals_x)
X_test.append(temp_test)
del temp_train_x, temp_evals_x; gc.collect()

# ------ multi-valued embedding features ------ #
for i in range(len(model_col_name)):
    temp_file = pd.read_csv("./SDD_data/final_sdd_embedding_feature_mix_chusai_%s.csv" % model_col_name[i], dtype='float32')
    print("read %s over" % model_col_name[i], time.time() - t1)
    temp_train = temp_file[temp_file['label'] != -1].values[:, 3:]  # train data
    temp_test = temp_file[temp_file['label'] == -1].values[:, 3:]   # test data
    del temp_file; gc.collect()
    temp_train_x, temp_evals_x, temp_train_y, temp_evals_y = train_test_split(
        temp_train, label_train, test_size=spilt_prob, random_state=ramdom_seed)
    del temp_train; gc.collect()
    X_train.append(temp_train_x)
    X_valid.append(temp_evals_x)
    X_test.append(temp_test)
    del temp_train_x, temp_evals_x; gc.collect()

# ------ FFD features (the super_f_* embedding files) ------ #
for _name in super_f_name:
    temp_file = pd.read_csv("./SDD_data/%s" % _name, dtype='float32')
    temp_train = temp_file.values[:45539700, 1:]  # train data
    temp_test = temp_file.values[45539700:, 1:]   # test data
    print('temp_test', temp_test.shape)
    del temp_file; gc.collect()
    temp_train_x, temp_evals_x, temp_train_y, temp_evals_y = train_test_split(
        temp_train, label_train, test_size=spilt_prob, random_state=ramdom_seed)
    X_train.append(temp_train_x)
    X_valid.append(temp_evals_x)
    X_test.append(temp_test)
    del temp_train_x, temp_evals_x; gc.collect()
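The train/test row boundary 45539700 is hard-coded in three of the blocks above. Assuming every feature CSV preserves single_onehot's row order (train rows first, then test rows, which the script already relies on), the boundary could be captured once from the label column instead of being repeated as a magic number; a hedged sketch (n_train_rows is my name, not in the original):

# Sketch, under the assumption that every feature CSV keeps single_onehot's
# row order. Capture the boundary once, before `del single_onehot` above:
#
#   n_train_rows = int((single_onehot['label'] != -1).sum())  # 45539700 here
#   temp_train = temp_file.values[:n_train_rows, :]
#   temp_test  = temp_file.values[n_train_rows:, :]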
model = build_model(len_sdd, len_uid_count)
batch_size = 6000
model.fit(X_train, train_y, batch_size=batch_size,
          validation_data=(X_valid, evals_y), epochs=1, shuffle=True)
print('fit model over!', time.time() - t1)

y_pred_d = model.predict(X_valid, batch_size=6000)
print('predict over!', time.time() - t1)

# note: this import shadows the custom log_loss metric defined above,
# which is harmless now that training has finished
from sklearn.metrics import roc_auc_score, log_loss
print('AUC:', roc_auc_score(evals_y, y_pred_d))
print('log_loss:', log_loss(evals_y, y_pred_d))
print('compute AUC over!', time.time() - t1)

pre1 = model.predict(X_test, batch_size=6000)
subfile['label'] = pre1
subfile.to_csv("./sdd_results/submission_FM_1.csv", header=True, index=False)