Tencent Advertising Algorithm Competition Top 4: SDD Model 1 (FM_keras)
阿新 • Published 2018-12-15
FM-family neural networks built on Keras
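For reference, the second-order factorization machine (FM) term that this model's pairwise-interaction layers emulate is, for feature values x_i with embedding vectors v_i in R^k:

    \hat{y}_{\mathrm{FM}}(x) = w_0 + \sum_{i=1}^{n} w_i x_i + \sum_{i=1}^{n} \sum_{j=i+1}^{n} \langle v_i, v_j \rangle \, x_i x_j

Rather than summing those pairwise terms into a single scalar as plain FM does, the network below keeps the full element-wise product vectors (multiply) as well as the scalar dot products (dot) of every field pair, and lets an MLP combine them.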
import numpy as np
import pandas as pd
import time
import gc
import sys
import itertools
import os

from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

import tensorflow as tf  # required by the custom loss/metric functions below
import keras
from keras.layers import *
from keras.models import *
from keras.callbacks import *
from keras.optimizers import *
from keras.applications import *
from keras.regularizers import *
from keras import backend as KK
# from keras.engine.topology import Layer
from keras.metrics import categorical_accuracy
from keras.utils import multi_gpu_model

gpus_num = 2
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"

# single-valued categorical fields and their maximum encoded values
# (new_AID's maximum is appended after the data is read)
single_emb = ['LBS', 'age', 'carrier', 'consumptionAbility', 'education', 'gender',
              'house', 'os', 'ct', 'marriageStatus', 'advertiserId', 'campaignId',
              'creativeId', 'adCategoryId', 'productId', 'productType',
              'creativeSize', 'new_AID']
singel_max = [898, 5, 3, 2, 7, 3, 1, 4, 64, 27, 197, 479, 831, 74, 83, 3, 14]

# multi-valued fields: padded sequence length and maximum encoded value per field
model_col_name = ['interest1', 'interest2', 'interest3', 'interest4', 'interest5',
                  'kw1', 'kw2', 'kw3', 'topic1', 'topic2', 'topic3', 'aid_insterest1']
model_col_max_length = [33, 33, 10, 10, 33, 5, 5, 5, 5, 5, 5, 33]
model_col_max_value = [122, 82, 10, 10, 136, 796741, 121958, 58782, 9999, 9999, 9463, 95379]

# pre-built embedding-feature files
super_f_name = ['final_sdd_embedding_feature_aid_neg.csv',
                'final_sdd_embedding_feature_aid_pos.csv',
                'final_sdd_embedding_feature_adCategoryId_adCategoryId_neg.csv',
                'final_sdd_embedding_feature_adCategoryId_adCategoryId_pos.csv']
super_f_length = [29, 51, 29, 30]
super_f_max_value = [2230, 2230, 282, 282]
super_fuck_name = ['sdd_aid_neg', 'sdd_aid_pos', 'sdd_cate_neg', 'sdd_cate_pos']


def binary_crossentropy_with_ranking(y_true, y_pred):
    """Trying to combine ranking loss with numeric precision."""
    # first get the log loss like normal
    logloss = KK.mean(KK.binary_crossentropy(y_true, y_pred), axis=-1)
    # next, build a rank loss
    # clip the probabilities to keep stability
    y_pred_clipped = KK.clip(y_pred, KK.epsilon(), 1 - KK.epsilon())
    # translate into the raw scores before the logit
    y_pred_score = KK.log(y_pred_clipped / (1 - y_pred_clipped))
    # determine what the maximum score for a zero outcome is
    y_pred_score_zerooutcome_max = KK.max(tf.boolean_mask(y_pred_score, (y_true < 1)))
    # determine how much each score is above or below it
    rankloss = y_pred_score - y_pred_score_zerooutcome_max
    # only keep losses for positive outcomes
    rankloss = tf.boolean_mask(rankloss, tf.equal(y_true, 1))
    # only keep losses where the score is below the max
    rankloss = KK.square(KK.clip(rankloss, -100, 0))
    # average the loss over the positive outcomes (+1 guards against division by zero)
    rankloss = KK.sum(rankloss, axis=-1) / (KK.sum(KK.cast(y_true > 0, tf.float32)) + 1)
    return (rankloss + 1) * logloss
    # an alternative to try:
    # return logloss


# PFA: probability of false alert for a binary classifier
def binary_PFA(y_true, y_pred, threshold=KK.variable(value=0.5)):
    y_pred = KK.cast(y_pred >= threshold, 'float32')
    # N = total number of negative labels
    N = KK.sum(1 - y_true)
    # FP = total number of false alerts, alerts from the negative class labels
    FP = KK.sum(y_pred - y_pred * y_true)
    return FP / N


# PTA: probability of true alert for a binary classifier
def binary_PTA(y_true, y_pred, threshold=KK.variable(value=0.5)):
    y_pred = KK.cast(y_pred >= threshold, 'float32')
    # P = total number of positive labels
    P = KK.sum(y_true)
    # TP = total number of correct alerts, alerts from the positive class labels
    TP = KK.sum(y_pred * y_true)
    return TP / P


def auc(y_true, y_pred):
    # approximate AUC by sweeping 1000 thresholds over the ROC curve
    ptas = tf.stack([binary_PTA(y_true, y_pred, k) for k in np.linspace(0, 1, 1000)], axis=0)
    pfas = tf.stack([binary_PFA(y_true, y_pred, k) for k in np.linspace(0, 1, 1000)], axis=0)
    pfas = tf.concat([tf.ones((1,)), pfas], axis=0)
    binSizes = -(pfas[1:] - pfas[:-1])
    s = ptas * binSizes
    return KK.sum(s, axis=0)
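A note on the auc metric just defined: it approximates the area under the ROC curve with a Riemann sum over 1000 thresholds. binary_PTA(t) is the true-positive rate and binary_PFA(t) the false-positive rate at threshold t, and the leading 1 concatenated onto pfas supplies the FPR at threshold 0, so the bin widths are the successive drops in FPR:

    \mathrm{AUC} = \int_0^1 \mathrm{TPR} \, d(\mathrm{FPR}) \approx \sum_{k=1}^{1000} \mathrm{TPR}(t_k) \left[ \mathrm{FPR}(t_{k-1}) - \mathrm{FPR}(t_k) \right], \qquad t_k \in \mathrm{linspace}(0, 1, 1000)

This tensor-only formulation can run on the GPU during training; the exact sklearn roc_auc_score is still computed on the validation set after training, at the end of the script.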
def log_loss(y_true, y_pred):
    # plain log loss, tracked as a metric during training
    logloss = KK.sum(KK.binary_crossentropy(y_true, y_pred), axis=-1)
    return logloss


def Mean_layer(x):
    # mean-pool the embedding vectors of a multi-valued field
    return KK.mean(x, axis=1)


def build_model(len_sdd, len_uid_count):
    field_sizes = len(single_emb) + len(model_col_name)
    emb_n = 128
    # FFM_layers = []
    inputs = []
    flatten_layers = []
    columns = range(len(single_emb))

    # ------ field embeddings (feed the second-order terms) ------ #
    for c in columns:
        inputs_c = Input(shape=(1,), dtype='int32', name='input_%s' % single_emb[c])
        num_c = singel_max[c] + 1
        inputs.append(inputs_c)
        print(num_c, c)
        embed_c = Embedding(num_c, emb_n, input_length=1,
                            name='embed_%s' % single_emb[c])(inputs_c)
        flatten_c = Reshape((emb_n,))(embed_c)
        flatten_layers.append(flatten_c)
        # FFM_temp = [Embedding(num_c, emb_n, input_length=1)(inputs_c) for i_i in range(field_sizes)]
        # FFM_layers.append(FFM_temp)

    # dense blocks of statistical features, projected to emb_n
    inputs_length = Input(shape=(len_sdd,))
    emb_length = Dense(emb_n, activation='relu')(inputs_length)
    inputs.append(inputs_length)
    inputs_uid_count = Input(shape=(len_uid_count,))
    emb_uid_count = Dense(emb_n, activation='relu')(inputs_uid_count)
    inputs.append(inputs_uid_count)
    # flatten_c = Reshape((emb_n,))(emb_length)
    # flatten_layers.append(flatten_c)

    # mean-pooled embeddings for the multi-valued fields
    for f in range(len(model_col_name)):
        inputs_f = Input(shape=(model_col_max_length[f],), name='input_%s' % model_col_name[f])
        num_f = model_col_max_value[f] + 1
        inputs.append(inputs_f)
        # print(num_f, f)
        embed_f = Embedding(num_f, emb_n, input_length=model_col_max_length[f],
                            name='embed_%s' % model_col_name[f])(inputs_f)
        embed_f = Lambda(Mean_layer)(embed_f)
        flatten_f = Reshape((emb_n,))(embed_f)
        flatten_layers.append(flatten_f)

    for f in range(len(super_fuck_name)):
        inputs_f = Input(shape=(super_f_length[f],), name='input_%s' % super_fuck_name[f])
        num_f = super_f_max_value[f] + 1
        inputs.append(inputs_f)
        # print(num_f, f)
        embed_f = Embedding(num_f, emb_n, input_length=super_f_length[f],
                            name='embed_%s' % super_fuck_name[f])(inputs_f)
        embed_f = Lambda(Mean_layer)(embed_f)
        flatten_f = Reshape((emb_n,))(embed_f)
        flatten_layers.append(flatten_f)
        # FFM_temp = [Lambda(Mean_layer)(Embedding(num_f, emb_n, input_length=model_col_max_length[f])(inputs_f)) for i_i in range(field_sizes)]
        # FFM_layers.append(FFM_temp)

    # FFM (left as a commented-out experiment)
    # FFM_product = []
    # for ff_i in range(field_sizes):
    #     for ff_j in range(ff_i + 1, field_sizes):
    #         FFM_product.append(Reshape((emb_n,))(multiply([FFM_layers[ff_i][ff_j], FFM_layers[ff_j][ff_i]])))
    # FFM_second_order = add(FFM_product)
    # FFM_second_order = concatenate(FFM_product)

    # ------ second-order terms: element-wise products of every field pair ------ #
    fm_layers = []
    for em1, em2 in itertools.combinations(flatten_layers, 2):
        dot_layer = Reshape((emb_n,))(multiply([em1, em2]))
        fm_layers.append(dot_layer)
    fm_layers = concatenate(fm_layers)

    # scalar dot products of every field pair (Keras 2: dot() replaces merge(mode='dot'))
    fm_2layers = []
    for em1, em2 in itertools.combinations(flatten_layers, 2):
        dot_layer = dot([em1, em2], axes=1)
        fm_2layers.append(dot_layer)
    fm_2layers = concatenate(fm_2layers)

    # y_first_order = add(flatten_layers)
    # y_first_order = BatchNormalization()(y_first_order)
    # y_first_order = Dropout(0.8)(y_first_order)

    # ------ deep part ------ #
    y_deep = concatenate(flatten_layers)
    # y_deep = Dense(256)(y_deep)  # adding BN made it worse; PReLU beats ReLU; 0.5 gives 7483
    # y_deep = Activation('relu', name='output_1')(y_deep)
    # y_deep = Dropout(0.5)(y_deep)
    # y_deep = Dense(128)(y_deep)
    # y_deep = Activation('relu', name='output_2')(y_deep)
    # y_deep = Dropout(0.5)(y_deep)

    concat_input = concatenate([fm_layers, fm_2layers, y_deep, emb_length, emb_uid_count], axis=1)
    concat_input = Dense(256)(concat_input)  # adding BN made it worse; PReLU beats ReLU; 0.5 gives 7483
    concat_input = Activation('relu', name='output_3')(concat_input)
    concat_input = Dropout(0.5)(concat_input)
    concat_input = Dense(128)(concat_input)
    concat_input = Activation('relu', name='output_4')(concat_input)
    concat_input = Dropout(0.5)(concat_input)
    # new_input = concatenate([fm_layers, y_deep, FFM_second_order], axis=1)

    outp = Dense(1, activation='sigmoid')(concat_input)
    model = Model(inputs=inputs, outputs=outp, name='model')

    # optimizer_adam = Adam(lr=0.002)
    optimizer_adam = Adam(lr=0.005, decay=0.0005, amsgrad=True)
    parallel_model = multi_gpu_model(model, gpus=gpus_num)
    parallel_model.compile(optimizer=optimizer_adam, loss='binary_crossentropy',
                           metrics=[auc, log_loss])
    # if loss_flag == 0:
    #     model.compile(loss='binary_crossentropy', optimizer=optimizer_adam, metrics=[auc, log_loss])
    # elif loss_flag == 1:
    #     model.compile(loss=binary_crossentropy_with_ranking, optimizer=optimizer_adam, metrics=[auc, log_loss])
    # model.summary()
    return parallel_model
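A side note on cost: the two itertools.combinations loops build C(34, 2) = 561 pairwise layers (18 single-valued + 12 multi-valued + 4 sdd_* fields), so the graph grows quadratically with the field count. If only the summed second-order term is needed, the standard FM identity sum_{i<j} v_i ⊙ v_j = ½[(sum_i v_i)² − sum_i v_i²] yields the same quantity in linear time. A minimal sketch of that alternative (fm_pairwise_sum is my name, not part of the original script):

# Sketch of the O(n*k) FM pairwise-interaction identity, as an alternative
# to the O(n^2) itertools.combinations loops in build_model above.
from keras.layers import Lambda, add
from keras import backend as KK

def fm_pairwise_sum(embeddings):
    # embeddings: list of (batch, emb_n) tensors, one per field.
    # Returns a (batch, emb_n) tensor equal to the sum over all field pairs
    # of the element-wise product of their embeddings.
    summed = add(embeddings)                                          # sum_i v_i
    square_of_sum = Lambda(KK.square)(summed)                         # (sum_i v_i)^2
    sum_of_square = add([Lambda(KK.square)(e) for e in embeddings])   # sum_i v_i^2
    # 0.5 * [(sum_i v_i)^2 - sum_i v_i^2] == sum_{i<j} v_i * v_j
    return Lambda(lambda t: 0.5 * (t[0] - t[1]))([square_of_sum, sum_of_square])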
# ------------------------------------------------------------------------------
# read data
t1 = time.time()
single_onehot = pd.read_csv("./SDD_data/final_sdd_single_onehot_embedding_feature2_mix_test12.csv", dtype='float32')
single_onehot['new_AID'] = LabelEncoder().fit_transform(single_onehot['aid'].apply(int))
print("read single_onehot over", time.time() - t1)
tem_max = single_onehot['new_AID'].max()
print("NEWID max", tem_max)
singel_max.append(tem_max)

# columns are [aid, uid, label, features...]; label == -1 marks the test rows
train_set = single_onehot[single_onehot['label'] != -1].values[:, 3:]     # train data
label_train = single_onehot[single_onehot['label'] != -1].values[:, 2:3]  # label data
test_set = single_onehot[single_onehot['label'] == -1].values[:, 3:]      # test data
subfile = single_onehot[single_onehot['label'] == -1][['aid', 'uid', 'label']]
del single_onehot; gc.collect()
print("seg over", time.time() - t1)

ramdom_seed = 1
spilt_prob = 0.05
train_x, evals_x, train_y, evals_y = train_test_split(train_set, label_train,
                                                      test_size=spilt_prob,
                                                      random_state=ramdom_seed)
del train_set; gc.collect()
print('split data over!', time.time() - t1)

# one input array per single-valued feature; the last slice covers the
# remaining column(s) in one block
X_train = []
X_valid = []
X_test = []
for i in range(len(single_emb) - 1):
    X_train.append(train_x[:, i:i + 1])
    X_valid.append(evals_x[:, i:i + 1])
    X_test.append(test_set[:, i:i + 1])
X_train.append(train_x[:, (len(single_emb) - 1):])
X_valid.append(evals_x[:, (len(single_emb) - 1):])
X_test.append(test_set[:, (len(single_emb) - 1):])
print('input data over!', time.time() - t1)
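Everything from here on appends to X_train / X_valid / X_test in the same order that build_model declares its Input layers (18 single-valued features, then the two dense statistical blocks, then the 12 multi-valued fields, then the 4 sdd_* blocks); Keras matches the lists positionally, so any reordering silently mis-feeds the model. A small sanity check one could run before fitting (check_inputs is a hypothetical helper, not in the original):

# Sketch: verify that the assembled input arrays line up with the model's
# Input layers, in count and in declared width.
def check_inputs(model, X):
    assert len(model.inputs) == len(X), (len(model.inputs), len(X))
    for tensor, arr in zip(model.inputs, X):
        width = int(tensor.shape[1])  # width declared by the Input layer
        assert arr.shape[1] == width, (tensor.name, arr.shape, width)

# usage, once all blocks below have been appended:
# check_inputs(model, X_train)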
# ------ read user statistical features (dense block of len_sdd columns) ------ #
temp_file1 = pd.read_csv("user_statis_feature_train.csv", dtype='float32')
temp_file2 = pd.read_csv("user_statis_feature_test.csv", dtype='float32')
temp_file = pd.concat([temp_file1, temp_file2])
temp_file = temp_file.fillna(temp_file.mean())
del temp_file1, temp_file2; gc.collect()

from sklearn import preprocessing
len_sdd = 0
for i in list(temp_file):
    print(i, len_sdd)
    len_sdd = len_sdd + 1
    scaler = preprocessing.StandardScaler().fit(temp_file[i].values.reshape(-1, 1))
    temp_file[i] = scaler.transform(temp_file[i].values.reshape(-1, 1))
temp_train = temp_file.values[:45539700, :]  # train data
temp_test = temp_file.values[45539700:, :]   # test data
# temp_test = temp_test[11729073:, :]
print('temp_test', temp_test.shape)
del temp_file; gc.collect()
# the shared random_state keeps these rows aligned with the earlier split
temp_train_x, temp_evals_x, temp_train_y, temp_evals_y = train_test_split(
    temp_train, label_train, test_size=spilt_prob, random_state=ramdom_seed)
X_train.append(temp_train_x)
X_valid.append(temp_evals_x)
X_test.append(temp_test)
del temp_train_x, temp_evals_x; gc.collect()

# ------ uid count features ------ #
temp_file = pd.read_csv("SDD_data/sdd_uid_count.csv")
temp_file = temp_file.fillna(temp_file.mean())
len_uid_count = 0
for i in list(temp_file):
    print(i, len_uid_count)
    len_uid_count = len_uid_count + 1
    scaler = preprocessing.StandardScaler().fit(temp_file[i].values.reshape(-1, 1))
    temp_file[i] = scaler.transform(temp_file[i].values.reshape(-1, 1))
temp_train = temp_file.values[:45539700, :]  # train data
temp_test = temp_file.values[45539700:, :]   # test data
print('temp_test', temp_test.shape)
del temp_file; gc.collect()
temp_train_x, temp_evals_x, temp_train_y, temp_evals_y = train_test_split(
    temp_train, label_train, test_size=spilt_prob, random_state=ramdom_seed)
X_train.append(temp_train_x)
X_valid.append(temp_evals_x)
X_test.append(temp_test)
del temp_train_x, temp_evals_x; gc.collect()

# ------ multi-valued embedding features ------ #
for i in range(len(model_col_name)):
    temp_file = pd.read_csv("./SDD_data/final_sdd_embedding_feature_mix_chusai_%s.csv" % model_col_name[i], dtype='float32')
    print("read %s over" % model_col_name[i], time.time() - t1)
    temp_train = temp_file[temp_file['label'] != -1].values[:, 3:]  # train data
    temp_test = temp_file[temp_file['label'] == -1].values[:, 3:]   # test data
    del temp_file; gc.collect()
    temp_train_x, temp_evals_x, temp_train_y, temp_evals_y = train_test_split(
        temp_train, label_train, test_size=spilt_prob, random_state=ramdom_seed)
    del temp_train; gc.collect()
    X_train.append(temp_train_x)
    X_valid.append(temp_evals_x)
    X_test.append(temp_test)
    del temp_train_x, temp_evals_x; gc.collect()

# ------ FFD features (the super_f_* embedding files) ------ #
for _name in super_f_name:
    temp_file = pd.read_csv("./SDD_data/%s" % _name, dtype='float32')
    temp_train = temp_file.values[:45539700, 1:]  # train data
    temp_test = temp_file.values[45539700:, 1:]   # test data
    print('temp_test', temp_test.shape)
    del temp_file; gc.collect()
    temp_train_x, temp_evals_x, temp_train_y, temp_evals_y = train_test_split(
        temp_train, label_train, test_size=spilt_prob, random_state=ramdom_seed)
    X_train.append(temp_train_x)
    X_valid.append(temp_evals_x)
    X_test.append(temp_test)
    del temp_train_x, temp_evals_x; gc.collect()
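The train/test row boundary 45539700 is hard-coded in three of the blocks above. Assuming every feature CSV preserves single_onehot's row order (train rows first, then test rows, which the script already relies on), the boundary could be captured once from the label column instead of being repeated as a magic number; a hedged sketch (n_train_rows is my name, not in the original):

# Sketch, under the assumption that every feature CSV keeps single_onehot's
# row order. Capture the boundary once, before `del single_onehot` above:
#
#   n_train_rows = int((single_onehot['label'] != -1).sum())  # 45539700 here
#   temp_train = temp_file.values[:n_train_rows, :]
#   temp_test  = temp_file.values[n_train_rows:, :]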
model = build_model(len_sdd, len_uid_count)
batch_size = 6000
model.fit(X_train, train_y, batch_size=batch_size,
          validation_data=(X_valid, evals_y), epochs=1, shuffle=True)
print('fit model over!', time.time() - t1)

y_pred_d = model.predict(X_valid, batch_size=6000)
print('predict over!', time.time() - t1)

# note: this import shadows the custom log_loss metric defined above,
# which is harmless now that training has finished
from sklearn.metrics import roc_auc_score, log_loss
print('AUC:', roc_auc_score(evals_y, y_pred_d))
print('log_loss:', log_loss(evals_y, y_pred_d))
print('compute AUC over!', time.time() - t1)

pre1 = model.predict(X_test, batch_size=6000)
subfile['label'] = pre1
subfile.to_csv("./sdd_results/submission_FM_1.csv", header=True, index=False)