Python資料探勘入門與實戰:第一章
阿新 • • 發佈:2018-12-04
程式碼來源於:https://github.com/hLvMxM/Learning_Data_Mining_with_Python/blob/master/Chapter%201/ch1_affinity.ipynb
其中的註釋是我在學習過程中自行加上的,便於初學者看懂。
分析所用的資料檔為:affinity_dataset.txt(檔案中每一列代表一筆交易,共 5 欄,依序對應 bread、milk、cheese、apples、bananas;1 表示有購買,0 表示未購買。以下內容在排版時被合併成一行):
0 0 1 1 1 1 1 0 1 0 1 0 1 1 0 0 0 1 1 1 0 1 0 0 1 0 1 0 0 0 1 0 0 0 1 1 0 0 0 1 0 0 0 1 1 0 0 1 1 1 1 1 0 0 1 0 1 0 0 0 0 0 0 0 1 0 0 1 0 1 0 1 0 0 1 0 0 1 1 1 1 0 0 0 1 0 0 1 1 1 1 1 0 0 0 0 1 0 0 0 0 0 1 0 0 0 1 0 0 1 0 1 0 0 0 0 1 0 0 1 0 0 1 1 1 0 0 1 1 0 0 0 1 0 1 0 0 0 0 1 0 1 0 0 0 0 1 0 1 0 1 1 1 0 1 1 1 0 0 1 0 0 1 1 1 0 0 1 0 1 0 0 1 1 1 0 0 1 1 0 0 1 1 0 1 0 0 1 1 0 0 1 0 0 1 0 0 0 0 1 0 0 1 0 1 1 1 0 1 1 1 0 0 0 1 0 0 1 1 1 0 1 0 0 0 0 1 0 1 1 0 1 0 0 0 0 1 0 0 0 0 0 1 1 0 0 0 1 1 1 0 1 0 1 0 0 1 1 0 0 0 0 1 1 0 0 0 1 1 1 1 0 0 0 0 0 1 0 1 0 1 0 0 0 1 0 1 0 0 0 0 0 0 0 1 0 0 1 1 1 0 1 1 1 1 1 1 0 0 0 0 0 1 0 1 1 0 0 0 1 1 1 0 0 0 0 1 1 0 0 0 0 0 0 1 0 1 0 0 0 0 0 1 1 1 0 1 0 0 1 1 0 0 0 1 1 0 0 0 1 0 1 0 0 1 0 0 1 1 1 1 0 1 0 1 1 1 0 0 1 0 1 0 0 1 1 1 1 0 1 0 0 1 1 1 1 0 0 0 0 0 0 1 1 1 1 1 0 1 0 0 0 1 0 0 0 0 1 0 1 0 1 0 0 0 1 1 0 0 0 0 0 0 1 0 0 0 0 1 1 0 1 0 0 0 0 1 0 0 0 1 1 0 0 1 0 0 1 0 0 0 1 0 0 1 1 1 0 1 0 1 0 0 0 1 0 1 0 0 0 0 0 1 1 0 0 1 1 0 0 0 0 1 1 0 0 0 0 0 1
程式碼為:
# @Time    : 2018/12/3 10:13 AM
# @Author  : 鄭超
# @Desc    : Affinity analysis (support / confidence) of a market-basket dataset.
from collections import defaultdict
from operator import itemgetter

import numpy as np

dataset_filename = "affinity_dataset.txt"
# Each column of the dataset file is treated as one product; column i is features[i].
features = ["bread", "milk", "cheese", "apples", "bananas"]


def count_rules(X):
    """Count how often every "premise -> conclusion" rule holds or fails.

    Args:
        X: 2-D array-like of 0/1 values, one row per sample and one column
            per feature. A 1-D input is treated as a single sample.

    Returns:
        A tuple ``(valid_rules, invalid_rules, num_occurences)`` of
        ``defaultdict(int)`` objects:
        * ``valid_rules[(premise, conclusion)]`` - number of samples where
          the premise column is non-zero and the conclusion column equals 1;
        * ``invalid_rules[(premise, conclusion)]`` - number of samples where
          the premise column is non-zero and the conclusion column is not 1;
        * ``num_occurences[premise]`` - number of samples where the premise
          column is non-zero.
    """
    X = np.atleast_2d(np.asarray(X))
    n_features = X.shape[1]

    valid_rules = defaultdict(int)      # rule applied and held
    invalid_rules = defaultdict(int)    # rule applied but did not hold
    num_occurences = defaultdict(int)   # how often each premise occurred

    for sample in X:
        for premise in range(n_features):
            if sample[premise] == 0:
                continue
            num_occurences[premise] += 1
            for conclusion in range(n_features):
                if premise == conclusion:
                    # A rule "x -> x" carries no information; skip it.
                    continue
                if sample[conclusion] == 1:
                    valid_rules[(premise, conclusion)] += 1
                else:
                    invalid_rules[(premise, conclusion)] += 1
    return valid_rules, invalid_rules, num_occurences


def compute_confidence(valid_rules, num_occurences):
    """Return the confidence of every rule present in ``valid_rules``.

    Confidence is the number of times the rule held divided by the number
    of times its premise occurred.

    Args:
        valid_rules: mapping ``(premise, conclusion) -> count``.
        num_occurences: mapping ``premise -> count``.

    Returns:
        ``defaultdict(float)`` mapping ``(premise, conclusion)`` to its
        confidence.
    """
    confidence = defaultdict(float)
    for (premise, conclusion), held in valid_rules.items():
        confidence[(premise, conclusion)] = held / num_occurences[premise]
    return confidence


def print_rule(premise, conclusion, support, confidence, features):
    """Print one rule together with its confidence and support.

    Args:
        premise: column index of the rule's premise.
        conclusion: column index of the rule's conclusion.
        support: mapping ``(premise, conclusion) -> support count``.
        confidence: mapping ``(premise, conclusion) -> confidence``.
        features: sequence of feature names indexed by column.
    """
    premise_name = features[premise]
    conclusion_name = features[conclusion]
    print("Rule: If a person buys {0} they will also buy {1}".format(premise_name, conclusion_name))
    print(" - Confidence: {0:.3f}".format(confidence[(premise, conclusion)]))
    print(" - Support: {0}\n".format(support[(premise, conclusion)]))


def print_top_rules(ranked_rules, support, confidence, features, top_n=5):
    """Print the first ``top_n`` rules of an already sorted rule list.

    Args:
        ranked_rules: sequence of ``((premise, conclusion), value)`` pairs,
            best rule first (e.g. ``sorted(d.items(), ...)``).
        support: mapping ``(premise, conclusion) -> support count``.
        confidence: mapping ``(premise, conclusion) -> confidence``.
        features: sequence of feature names indexed by column.
        top_n: maximum number of rules to print.
    """
    # Slicing instead of indexing 0..top_n-1 keeps this safe when fewer than
    # ``top_n`` rules exist.
    for index, (rule, _) in enumerate(ranked_rules[:top_n]):
        print("Rule #{0}".format(index + 1))
        premise, conclusion = rule
        print_rule(premise, conclusion, support, confidence, features)


if __name__ == "__main__":
    # ndmin=2 keeps the array two-dimensional even if the file holds a
    # single sample, so the shape can always be unpacked.
    X = np.loadtxt(dataset_filename, ndmin=2)
    n_samples, n_features = X.shape

    valid_rules, invalid_rules, num_occurences = count_rules(X)
    # Support is the number of times a rule held in the dataset.
    support = valid_rules
    confidence = compute_confidence(valid_rules, num_occurences)

    # Print every discovered rule.
    for premise, conclusion in confidence:
        print_rule(premise, conclusion, support, confidence, features)

    # The five rules with the highest support (sorted by value, descending).
    sorted_support = sorted(support.items(), key=itemgetter(1), reverse=True)
    print_top_rules(sorted_support, support, confidence, features)

    print("*" * 60)

    # The five rules with the highest confidence (sorted by value, descending).
    sorted_confidence = sorted(confidence.items(), key=itemgetter(1), reverse=True)
    print_top_rules(sorted_confidence, support, confidence, features)