1. 程式人生 > >Python資料探勘入門與實戰:第一章

Python資料探勘入門與實戰:第一章

程式碼來源於:https://github.com/hLvMxM/Learning_Data_Mining_with_Python/blob/master/Chapter%201/ch1_affinity.ipynb
其中的註釋是我在學習過程中自行加上的,
便於初學者看懂。
分析所用的資料檔為:affinity_dataset.txt

0 0 1 1 1
1 1 0 1 0
1 0 1 1 0
0 0 1 1 1
0 1 0 0 1
0 1 0 0 0
1 0 0 0 1
1 0 0 0 1
0 0 0 1 1
0 0 1 1 1
1 1 0 0 1
0 1 0 0 0
0 0 0 0 1
0 0 1 0 1
0 1 0 0 1
0 0 1 1 1
1 0 0 0 1
0 0 1 1 1
1 1 0 0 0
0 1 0 0 0
0 0 1 0 0
0 1 0 0 1
0 1 0 0 0
0 1 0 0 1
0 0 1 1 1
0 0 1 1 0
0 0 1 0 1
0 0 0 0 1
0 1 0 0 0
0 1 0 1 0
1 1 1 0 1
1 1 0 0 1
0 0 1 1 1
0 0 1 0 1
0 0 1 1 1
0 0 1 1 0
0 1 1 0 1
0 0 1 1 0
0 1 0 0 1
0 0 0 0 1
0 0 1 0 1
1 1 0 1 1
1 0 0 0 1
0 0 1 1 1
0 1 0 0 0
0 1 0 1 1
0 1 0 0 0
0 1 0 0 0
0 0 1 1 0
0 0 1 1 1
0 1 0 1 0
0 1 1 0 0
0 0 1 1 0
0 0 1 1 1
1 0 0 0 0
0 1 0 1 0
1 0 0 0 1
0 1 0 0 0
0 0 0 0 1
0 0 1 1 1
0 1 1 1 1
1 1 0 0 0
0 0 1 0 1
1 0 0 0 1
1 1 0 0 0
0 1 1 0 0
0 0 0 0 1
0 1 0 0 0
0 0 1 1 1
0 1 0 0 1
1 0 0 0 1
1 0 0 0 1
0 1 0 0 1
0 0 1 1 1
1 0 1 0 1
1 1 0 0 1
0 1 0 0 1
1 1 1 0 1
0 0 1 1 1
1 0 0 0 0
0 0 1 1 1
1 1 0 1 0
0 0 1 0 0
0 0 1 0 1
0 1 0 0 0
1 1 0 0 0
0 0 0 1 0
0 0 0 1 1
0 1 0 0 0
0 1 0 0 0
1 1 0 0 1
0 0 1 0 0
0 1 0 0 1
1 1 0 1 0
1 0 0 0 1
0 1 0 0 0
0 0 1 1 0
0 1 1 0 0
0 0 1 1 0
0 0 0 0 1

程式碼為:

# @Time    : 2018/12/3 上午10:13
# @Author  : 鄭超
# @Desc    :
# In [1]:
import numpy as np
from operator import itemgetter
from collections import defaultdict

dataset_filename = "affinity_dataset.txt"
# Load the whitespace-separated 0/1 matrix; every row is one sample and every
# column is one feature (item).
X = np.loadtxt(dataset_filename)
n_samples, n_features = X.shape  # (number of rows, number of columns)
# Human-readable item name for each column, in column order.
features = ["bread", "milk", "cheese", "apples", "bananas"]

# Counters keyed by (premise, conclusion) column-index pairs.
valid_rules = defaultdict(int)  # premise present and conclusion present
invalid_rules = defaultdict(int)  # premise present but conclusion absent
# Counter keyed by premise index: number of samples containing that premise.
num_occurences = defaultdict(int)

for sample in X:
    for premise in range(n_features):
        if sample[premise] == 0:
            # The premise item is absent from this sample, so no rule applies.
            continue
        num_occurences[premise] += 1
        for conclusion in range(n_features):
            if conclusion == premise:
                # A rule from an item to itself is meaningless; skip it.
                continue
            # Pick the counter that matches whether the rule held in this sample.
            outcome = valid_rules if sample[conclusion] == 1 else invalid_rules
            outcome[(premise, conclusion)] += 1

# Support of a rule is the number of samples in which the rule held.
support = valid_rules
# Confidence of a rule is its support divided by how often its premise occurred.
confidence = defaultdict(float)
for (premise, conclusion), hits in valid_rules.items():
    confidence[(premise, conclusion)] = hits / num_occurences[premise]
# In [8]:
# Report every rule that has a confidence entry, together with its support.
for (premise, conclusion), rule_confidence in confidence.items():
    premise_name, conclusion_name = features[premise], features[conclusion]
    print("Rule: If a person buys {0} they will also buy {1}".format(premise_name, conclusion_name))
    print(" - Confidence: {0:.3f}".format(rule_confidence))
    print(" - Support: {0}".format(support[(premise, conclusion)]))
    print("")


def print_rule(premise, conclusion, support, confidence, features):
    """Print one association rule with its confidence and support.

    Args:
        premise: Index into ``features`` of the item on the "if" side.
        conclusion: Index into ``features`` of the item on the "then" side.
        support: Mapping looked up with ``(premise, conclusion)``; the value
            is printed as-is.
        confidence: Mapping looked up with ``(premise, conclusion)``; the
            value is printed with three decimal places.
        features: Sequence of item names, indexed by ``premise`` and
            ``conclusion``.
    """
    rule = (premise, conclusion)
    headline = "Rule: If a person buys {0} they will also buy {1}".format(features[premise], features[conclusion])
    print(headline)
    print(" - Confidence: {0:.3f}".format(confidence[rule]))
    # The trailing newline leaves a blank line between consecutive rules.
    print(" - Support: {0}\n".format(support[rule]))


"""輸出支援度最高的前五個元素"""
# Rank the (rule, support) pairs by support value, largest first.
sorted_support = sorted(support.items(), key=itemgetter(1), reverse=True)
# Slice instead of indexing with range(5), so that a dataset yielding fewer
# than five rules prints what exists instead of raising IndexError.
for index, entry in enumerate(sorted_support[:5]):
    print("Rule #{0}".format(index + 1))
    premise, conclusion = entry[0]
    print_rule(premise, conclusion, support, confidence, features)

# Visual separator between the two rankings.
print("*" * 60)

# Print the five rules with the highest confidence.
# Rank the (rule, confidence) pairs by confidence value, largest first.
sorted_confidence = sorted(confidence.items(), key=itemgetter(1), reverse=True)
# Slice instead of indexing with range(5), so that a dataset yielding fewer
# than five rules prints what exists instead of raising IndexError.
for index, entry in enumerate(sorted_confidence[:5]):
    print("Rule #{0}".format(index + 1))
    premise, conclusion = entry[0]
    print_rule(premise, conclusion, support, confidence, features)