異常值檢測:
阿新 • • 發佈:2018-12-14
通過四分位數與四分位距(IQR,即 Tukey 方法)來進行異常值檢測:
def detect_outliers(df, n, features):
    """Return index labels of rows that are outliers in more than ``n`` features.

    Outliers are found per feature with Tukey's fences: a value is an
    outlier when it lies below ``Q1 - 1.5 * IQR`` or above
    ``Q3 + 1.5 * IQR``, where ``Q1``/``Q3`` are the 25th/75th percentiles
    of that column and ``IQR = Q3 - Q1``.

    Args:
        df: pandas DataFrame holding the observations.
        n: Threshold count; a row is reported only when it is flagged in
            strictly more than ``n`` of the given features.
        features: Iterable of column labels of ``df`` to inspect.

    Returns:
        A list of index labels of ``df``, in order of first detection.
    """
    outlier_indices = []

    for col in features:
        # nanpercentile ignores missing values. Plain np.percentile yields
        # NaN quartiles as soon as the column contains one NaN, which makes
        # both comparisons below False for every row and silently hides all
        # outliers of that feature.
        q1, q3 = np.nanpercentile(df[col], [25, 75])
        outlier_step = 1.5 * (q3 - q1)

        # Rows whose value is NaN compare False on both sides and are
        # therefore never flagged.
        is_outlier = (df[col] < q1 - outlier_step) | (df[col] > q3 + outlier_step)
        outlier_indices.extend(df[is_outlier].index)

    # Count how many features flagged each row; keep rows flagged more than n times.
    hits_per_row = Counter(outlier_indices)
    return [idx for idx, hits in hits_per_row.items() if hits > n]