1. 程式人生 > 實用技巧 >多模型的安卓惡意軟體分類

多模型的安卓惡意軟體分類

最近看到一篇論文,題為HYDRA: A multimodal deep learning framework for malware classification。本篇論文提到了一個多模式的惡意軟體分類框架,具體實現時,就是一個多輸入單輸出的網路框架。框架示意圖如下

​ 於是催生了本次實驗。在前幾篇博文中,在做惡意軟體分類時,最後都會加上特徵融合,並且效果都不錯。此次實驗旨在比較論文框架與特徵融合。基於安卓惡意軟體分類,所用特徵為API,opcode的n-gram,許可權。這也是論文模型的3輸入。

論文框架

​ 先基於opcode的n-gram以及許可權特徵做二輸入分類,看看效果,再加入API特徵,驗證模型的擴充套件性。模型用keras的函式式api搭建,程式碼如下:

# two-input (opcode 3-gram + permissions), single-output Keras model for Android malware classification
from sklearn.datasets.samples_generator import make_blobs
from sklearn.metrics import accuracy_score
from tensorflow.keras.models import load_model
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.utils import plot_model
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import concatenate
from tensorflow.keras.layers import *
from numpy import argmax
from tensorflow.keras.preprocessing.text import Tokenizer
import tensorflow.keras.preprocessing.text as T
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
import numpy as np
import pandas as pd

# Read the two feature tables, shuffle the samples, then split them 80/20.
subtrainfeature1 = pd.read_csv("D:\\android\\dataset\\3_gram.csv")
subtrainfeature2 = pd.read_csv("D:\\android\\dataset\\permissions.csv")

# The labels come from the permissions table; its "Class" column is removed
# before that table is turned into a feature matrix.
labels = subtrainfeature2["Class"]
train_data_1 = subtrainfeature1.values
subtrainfeature2 = subtrainfeature2.drop(columns=["Class"])
train_data_2 = subtrainfeature2.values

import numpy as np

# One shared random permutation keeps labels and both matrices row-aligned.
# NOTE(review): the permutation is unseeded, so the split differs on every run.
index = np.random.permutation(len(labels))
labels = labels[index]
train_data_1, train_data_2 = train_data_1[index], train_data_2[index]

# The first 80% of the shuffled rows are for training, the remainder for testing.
p1 = int(len(labels) * 0.8)
train_labels, test_labels = labels[:p1], labels[p1:]
data_1_train, data_1_test = train_data_1[:p1], train_data_1[p1:]
data_2_train, data_2_test = train_data_2[:p1], train_data_2[p1:]

# Model branch 1: opcode 3-gram features.
# Keras documents `shape` as a tuple; `(343)` without the trailing comma is the
# plain integer 343, so the comma is required here.
input1 = Input(shape=(343,))

# Stack of ReLU layers narrowing the 343 inputs down to a 30-unit output.
# No `input_dim` is passed to the first Dense layer: the Input tensor already
# fixes the feature dimension.
model1_x = input1
for units in (200, 150, 150, 150, 100, 100, 100, 50, 50, 50, 30):
    model1_x = Dense(units, activation='relu')(model1_x)

# Model branch 2: permission features.
# This branch must bind its own names, `input2` and `model2_x`: the fusion
# layer concatenates `model1_x` with `model2_x`, and the Model is built from
# `[input1, input2]`. Rebinding `input1`/`model1_x` here would discard the
# 3-gram branch and leave those two names undefined.
#
# The input width is taken from the permission matrix itself, so it always
# matches the data that is later fed to this input.
input2 = Input(shape=(train_data_2.shape[1],))

# Same layer stack as the 3-gram branch, ending in a 30-unit output.
model2_x = input2
for units in (200, 150, 150, 150, 100, 100, 100, 50, 50, 50, 30):
    model2_x = Dense(units, activation='relu')(model2_x)

# Fully connected head: join the two branch outputs and classify.
full = concatenate([model1_x, model2_x])
for width in (60, 60, 30):
    full = Dense(width, activation='relu')(full)

# A single sigmoid unit produces the binary prediction.
output = Dense(1, activation='sigmoid')(full)

# Build the two-input model and write a diagram of it to an image file.
model = Model(inputs=[input1, input2], outputs=output)
plot_model(model, to_file='model_andorid.jpg', show_shapes=True)

# Adam optimizer, binary cross-entropy loss, accuracy reported as 'acc'.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])

# Training
from sklearn.model_selection import StratifiedKFold  # not used in the code shown here

# Each input list is ordered [3-gram matrix, permission matrix], matching the
# order of the model's inputs.
train_inputs = [data_1_train, data_2_train]
test_inputs = [data_1_test, data_2_test]

# The test split is passed as validation data for the 50 training epochs.
history = model.fit(
    train_inputs,
    train_labels,
    batch_size=64,
    epochs=50,
    validation_data=(test_inputs, test_labels),
)

模型示意圖如下:

貼上api提取的程式碼:

import os
from androguard.misc import AnalyzeAPK
from androguard.core.androconf import load_api_specific_resource_module
from collections import *
import re
import os
import pandas as pd

# Directory whose files are analysed as malware samples (written out with Class = 1).
malware_dir = "D:\\android\\dataset\\drebin-1"
# Directory whose files are analysed as benign samples (written out with Class = 0).
kind_dir = "D:\\android\\dataset\\Benign_2016\\"

# Androguard's 'api_permission_mappings' resource. get_apis compares its keys
# against "<class>-<method>-<descriptor>" strings.
permmap = load_api_specific_resource_module('api_permission_mappings')

 def get_apis(file_path):
        out = AnalyzeAPK(file_path)
        dx = out[2]
        
        cc = Counter([])
        dd = Counter([])
        for meth_analysis in dx.get_methods():
            meth = meth_analysis.get_method()
            cc[meth.get_name()]+=1
            name  = meth.get_class_name() + "-" + meth.get_name() + "-" + str(meth.get_descriptor())
            for k,v in permmap.items():
                if name == k:
                    dd[meth.get_name()]+=1
        return cc,dd  
def file_api_count(file_path):
    """Return the two per-file API counters for one APK.

    Args:
        file_path: Path of the APK, forwarded to ``get_apis``.

    Returns:
        A pair ``(permission_counts, frequent)``. ``permission_counts`` is the
        second counter returned by ``get_apis``, unchanged. ``frequent`` holds
        the entries of the first counter whose count is at least 100.
    """
    all_counts, permission_counts = get_apis(file_path)
    frequent = Counter(
        {name: hits for name, hits in all_counts.items() if hits >= 100}
    )
    return permission_counts, frequent

# Per-file API counters, keyed by file name. The "less" tables receive the
# first value returned by file_api_count (permission-mapped APIs) and the
# "more" tables the second (APIs occurring at least 100 times in the file).
mapapi_mal_less = defaultdict(Counter)
mapapi_mal_more = defaultdict(Counter)
mapapi_kind_less = defaultdict(Counter)
# Must exist before the benign directory is processed, which assigns into it.
mapapi_kind_more = defaultdict(Counter)


def _count_directory(directory, less, more):
    """Run file_api_count on every entry of *directory* and store both results."""
    for count, file_name in enumerate(os.listdir(directory), start=1):
        print("counting  the {0} file...".format(str(count)))
        apk_dir = os.path.join(directory, file_name)
        less[file_name], more[file_name] = file_api_count(apk_dir)


_count_directory(malware_dir, mapapi_mal_less, mapapi_mal_more)
_count_directory(kind_dir, mapapi_kind_less, mapapi_kind_more)
    
# Sum the per-file "more" counters over the benign files, then the malware files.
cc = Counter()
for per_file_counts in (mapapi_kind_more, mapapi_mal_more):
    for api_counts in per_file_counts.values():
        cc.update(api_counts)

# Keep the APIs whose total is at least 100, printing each one as it is kept.
selectedfeatures = {}
for api_name, total in cc.items():
    if total < 100:
        continue
    selectedfeatures[api_name] = total
    print(api_name, total)
# Number of selected features.
tc = len(selectedfeatures)
        
# Save the APIs that are unrelated to the permission features (not used later).
# One row per file: benign files get Class 0, malware files get Class 1.
# A row only carries the selected features present in that file; pandas leaves
# the other columns of that row empty (NaN).
dataframelist = []
for class_label, per_file_counts in ((0, mapapi_kind_more), (1, mapapi_mal_more)):
    for api_counts in per_file_counts.values():
        standard = {"Class": class_label}
        standard.update(
            (feature, num)
            for feature, num in api_counts.items()
            if feature in selectedfeatures
        )
        dataframelist.append(standard)

df = pd.DataFrame(dataframelist)
df.to_csv("D:\\android\\dataset\\api_more.csv", index=False)

# Save the APIs that are related to the permission features.
# Collect every API name seen in any benign or malware file, de-duplicated and
# kept in first-seen order; these names become the CSV feature columns.
selectfeature2 = list(dict.fromkeys(
    item
    for per_file_counts in (mapapi_kind_less, mapapi_mal_less)
    for api_counts in per_file_counts.values()
    for item in api_counts
))

# One row per file: benign files get Class 0, malware files get Class 1.
# Every row lists every feature, with an explicit 0 for APIs the file does not
# use, so the written table contains no empty cells.
dataframelist2 = []
for class_label, per_file_counts in ((0, mapapi_kind_less), (1, mapapi_mal_less)):
    for api_counts in per_file_counts.values():
        standard = {"Class": class_label}
        for feature in selectfeature2:
            standard[feature] = api_counts.get(feature, 0)
        dataframelist2.append(standard)

df2 = pd.DataFrame(dataframelist2)
df2.to_csv("D:\\android\\dataset\\api_less.csv", index=False)

50輪訓練結果的最後十輪精確度如下

0.9850,0.9750,0.9750,0.9775,0.9800,0.9775,0.9725,0.9775,0.9750,0.9775

加入api特徵,程式碼與上文類似,其模型示意圖如下:

50輪訓練結果的最後十輪精確度如下:

0.9875,0.9875,0.9875,0.9875,0.9875,0.9875,0.9875,0.9900,0.9900,0.9900

特徵融合

​ 嘗試之前的方法,將特徵融合在一起,看看精確度

先嘗試兩特徵融合,opcode n-gram和許可權特徵,程式碼在上一篇博文中可以找到,就不貼了,最終基於深度學習的十輪交叉驗證精確度如下:

[0.99, 0.98, 0.985, 1.0, 0.995, 0.97, 0.98, 0.995, 0.97, 0.9849246]

加入api特徵後,精確度如下:

[0.99, 0.99, 0.985, 1.0, 1.0, 0.99, 0.985, 0.98, 0.975, 0.9798995]

總結

​ 比較結果如下:

可見多模型精度不如單模型特徵融合,但是穩定性勝於特徵融合