# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np
from numpy import float64
# Sample data: 16 paired observations of height (cm) and weight (kg).
Height_cm = np.array([164, 167, 168, 169, 169, 170, 170, 170, 171, 172, 172, 173, 173, 175, 176, 178], dtype=float64)
Weight_kg = np.array([54,  57,  58,  60,  61,  60,  61,  62,  62,  64,  62,  62,  64,  56,  66,  70], dtype=float64)
# hw is a two-column table (one column per variable: height and weight);
# each row is one observation, labelled by its 0-based position.
hw = pd.DataFrame({'Height_cm': Height_cm, 'Weight_kg': Weight_kg})
# Call form of print works on both Python 2 and Python 3 for a single argument.
print(hw)
# Output:
#     Height_cm  Weight_kg
# 0       164.0       54.0
# 1       167.0       57.0
# 2       168.0       58.0
# 3       169.0       60.0
# 4       169.0       61.0
# 5       170.0       60.0
# 6       170.0       61.0
# 7       170.0       62.0
# 8       171.0       62.0
# 9       172.0       64.0
# 10      172.0       62.0
# 11      173.0       62.0
# 12      173.0       64.0
# 13      175.0       56.0
# 14      176.0       66.0
# 15      178.0       70.0
import pandas as pd
from sklearn import preprocessing
import numpy as np
from numpy import float64
from matplotlib import pyplot as plt

# preprocessing.scale standardizes a variable to z-scores, (X - mean) / std,
# so the values end up centred on 0 with unit variance. An observation is
# flagged when its absolute z-score exceeds 2, i.e. it lies more than two
# standard deviations away from the mean of that variable.
is_height_outlier = abs(preprocessing.scale(hw['Height_cm'])) > 2
is_weight_outlier = abs(preprocessing.scale(hw['Weight_kg'])) > 2
# Element-wise OR: an observation (height, weight) is an outlier as soon as
# either of its two variables is one. The result is a boolean array.
is_outlier = is_height_outlier | is_weight_outlier
color = ['g', 'r']
# pch holds 1 for an outlier and 0 otherwise. The flags are converted to plain
# ints first because indexing a list with a numpy.bool_ is deprecated in NumPy.
pch = [int(flag) for flag in is_outlier]
# One colour per observation: 'r' for outliers, 'g' for the rest.
cValue = [color[code] for code in pch]
fig = plt.figure()
plt.title('Scatter Plot')
# Scatter plot of weight against height; s is the marker size.
plt.scatter(hw['Height_cm'], hw['Weight_kg'], s=40, c=cValue)




# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np
from numpy import float64
from matplotlib import pyplot as plt
from scipy.spatial import distance
from pandas import Series
from mpl_toolkits.mplot3d  import Axes3D
# Sample data: 13 observations of height (cm), weight (kg) and age.
Height_cm = np.array([164, 167, 168, 168, 169, 169, 169, 170, 172, 173, 175, 176, 178], dtype=float64)
Weight_kg = np.array([55,  57,  58,  56,  57,  61,  61,  61,  64,  62,  56,  66,  70], dtype=float64)
Age = np.array([13,  12,  14,  17,  15,  14,  16,  16,  13,  15,  16,  14,  16], dtype=float64)
# hw is a three-column table (height, weight, age); each row is one
# observation, labelled by its 0-based position.
hw = pd.DataFrame({'Height_cm': Height_cm, 'Weight_kg': Weight_kg, 'Age': Age})
# Call form of print works on both Python 2 and Python 3 for a single argument.
print(len(hw))  # 13

n_outliers = 2  # number of observations to flag as outliers

# Rank the observations by squared Mahalanobis distance from the sample mean.
# The mean vector and the inverse covariance matrix are the same for every
# row, so they are computed once instead of once per observation.
# DataFrame.as_matrix() no longer exists in current pandas; .values works on
# old and new releases alike.
center = hw.mean().values
inv_cov = np.linalg.inv(hw.cov().values)
# m_dist_order is a list of row labels, ordered from the largest distance to
# the smallest.
m_dist_order = Series(
    [float(distance.mahalanobis(row, center, inv_cov) ** 2) for row in hw.values]
).sort_values(ascending=False).index.tolist()

# Flag the n_outliers observations with the largest distances. The list is
# sized from the data rather than from a hard-coded row count.
is_outlier = [False] * len(hw)
for idx in m_dist_order[:n_outliers]:
    is_outlier[idx] = True

color = ['g', 'r']
# pch holds 1 for an outlier and 0 otherwise.
pch = [int(flag) for flag in is_outlier]
# One colour per observation: 'r' for outliers, 'g' for the rest.
cValue = [color[code] for code in pch]

fig = plt.figure()
# Figure.gca(projection=...) is not accepted by newer matplotlib releases;
# add_subplot creates the 3D axes on both old and new versions.
ax1 = fig.add_subplot(111, projection='3d')
ax1.set_title('Scatter Plot')
ax1.scatter(hw['Height_cm'], hw['Weight_kg'], hw['Age'], s=40, c=cValue)



percentage_to_remove = 20    # Remove 20% of points
# Floor division makes the count identical on Python 2 and Python 3. The
# original `round(int * int / 100)` truncated on Python 2 (the division was
# already integral before round() saw it) but rounds to nearest on Python 3;
# truncation is what the sample listing after this block reflects.
number_to_remove = len(hw) * percentage_to_remove // 100

# Rank the observations by squared Mahalanobis distance from the sample mean,
# largest first. The mean and inverse covariance are loop-invariant, so they
# are computed once; .values replaces the removed DataFrame.as_matrix().
center = hw.mean().values
inv_cov = np.linalg.inv(hw.cov().values)
m_dist_order = Series(
    [float(distance.mahalanobis(row, center, inv_cov) ** 2) for row in hw.values]
).sort_values(ascending=False).index.tolist()

# Drop the most distant rows; the remainder stays in descending-distance order.
rows_to_keep_index = m_dist_order[int(number_to_remove):]
my_dataframe = hw.loc[rows_to_keep_index]
print(my_dataframe)
# Output:
#      Age  Height_cm  Weight_kg
# 3   17.0      168.0       56.0
# 1   12.0      167.0       57.0
# 0   13.0      164.0       55.0
# 11  14.0      176.0       66.0
# 6   16.0      169.0       61.0
# 8   13.0      172.0       64.0
# 7   16.0      170.0       61.0
# 5   14.0      169.0       61.0
# 4   15.0      169.0       57.0
# 2   14.0      168.0       58.0
# 9   15.0      173.0       62.0



import pandas as pd
from sklearn import preprocessing
import numpy as np
from numpy import float64
from matplotlib import pyplot as plt
from scipy.spatial import distance
from pandas import Series

# Sample data: 21 (x, y) points.
x = np.array([4,  8, 10, 16, 17, 22, 27, 33, 38, 40, 47, 48, 53, 55, 63, 71, 76, 85, 85, 92, 96], dtype=float64)
y = np.array([6, 22, 32, 34, 42, 51, 59, 63, 64, 69, 70, 20, 70, 63, 63, 55, 46, 41, 33, 19,  6], dtype=float64)
hw = pd.DataFrame({'x': x, 'y': y})

percentage_of_outliers = 10    # Mark 10% of points as outliers
# Floor division keeps the count identical on Python 2 and Python 3; the
# original `round(int * int / 100)` truncated on Python 2 but rounds to
# nearest on Python 3.
number_of_outliers = len(hw) * percentage_of_outliers // 100

# Rank the points by squared Mahalanobis distance from the sample mean,
# largest first. The mean and inverse covariance are loop-invariant, so they
# are computed once; .values replaces the removed DataFrame.as_matrix().
center = hw.mean().values
inv_cov = np.linalg.inv(hw.cov().values)
m_dist_order = Series(
    [float(distance.mahalanobis(row, center, inv_cov) ** 2) for row in hw.values]
).sort_values(ascending=False).index.tolist()

# Everything after the first number_of_outliers labels is treated as a
# regular (non-outlier) point.
rows_not_outliers = m_dist_order[int(number_of_outliers):]
my_dataframe = hw.loc[rows_not_outliers]

# Start with every point flagged, then clear the flag for each row that was
# kept as a non-outlier. The list is sized from the data rather than from a
# hard-coded row count.
is_outlier = [True] * len(hw)
for idx in rows_not_outliers:
    is_outlier[idx] = False
color = ['g', 'r']
# pch holds 1 for an outlier and 0 otherwise.
pch = [int(flag) for flag in is_outlier]
# One colour per point: 'r' for outliers, 'g' for the rest.
cValue = [color[code] for code in pch]
fig = plt.figure()
plt.title('Scatter Plot')
# Scatter plot of y against x; s is the marker size.
plt.scatter(hw['x'], hw['y'], s=40, c=cValue)

