【pandas】 DataFrame的常用方法
阿新 • • 發佈:2019-01-29
____tz_zs
引數 axis
axis=0 對每一列(column)進行操作:沿著行索引方向由上往下計算,每一列得到一個結果
axis=1 對每一行(row)進行操作:沿著列標籤方向由左往右計算,每一行得到一個結果
引數 skipna
計算中,pandas 會預設排除NaN值,設定 skipna=False 將不再排除 NaN 值
.
#!/usr/bin/python2.7
# -*- coding:utf-8 -*-
"""
@author: tz_zs
"""
import numpy as np
import pandas as pd
data = [[1, 2, np.nan], [2, np.nan, 3], [7, 8, 9], [3, 4, 5]]
date_range = pd.date_range(start="20180701", periods=4)
df = pd.DataFrame(data=data, index=date_range, columns=['a', 'b', 'c'])
print df
"""
a b c
2018-07-01 1 2.0 NaN
2018-07-02 2 NaN 3.0
2018-07-03 7 8.0 9.0
2018-07-04 3 4.0 5.0
"""
.
sum
求和 df.sum()
# 對每一列求和,預設排除NaN值
print df.sum()
"""
a 13.0
b 14.0
c 17.0
dtype: float64
"""
# 對每一行求和,預設排除NaN值
print df.sum(axis=1)
"""
2018-07-01 3.0
2018-07-02 5.0
2018-07-03 24.0
2018-07-04 12.0
Freq: D, dtype: float64
"""
# 設定 skipna=False 將不再排除 NaN 值
print df.sum(skipna=False)
"""
a 13.0
b NaN
c NaN
dtype: float64
"""
print df.sum(axis=1,skipna=False)
"""
2018-07-01 NaN
2018-07-02 NaN
2018-07-03 24.0
2018-07-04 12.0
Freq: D, dtype: float64
"""
.
mean
求平均 df.mean
# 對每一行求平均值
print df.mean(axis=1)
"""
2018-07-01 1.5
2018-07-02 2.5
2018-07-03 8.0
2018-07-04 4.0
Freq: D, dtype: float64
"""
print df.mean(axis=1, skipna=False)
"""
2018-07-01 NaN
2018-07-02 NaN
2018-07-03 8.0
2018-07-04 4.0
Freq: D, dtype: float64
"""
.
max、min
最大最小值 df.max、df.min
# 最大最小值
print df.max()
print df.min()
"""
a 7.0
b 8.0
c 9.0
dtype: float64
a 1.0
b 2.0
c 3.0
dtype: float64
"""
.
idxmax、idxmin
最大最小值的索引 df.idxmax、df.idxmin
# 返回每一列中最大值的索引
print df.idxmax()
# 返回每一列中最小值的索引
print df.idxmin()
"""
a 2018-07-03
b 2018-07-03
c 2018-07-03
dtype: datetime64[ns]
a 2018-07-01
b 2018-07-01
c 2018-07-02
dtype: datetime64[ns]
"""
.
cumsum
累加 df.cumsum
# 對每一列累加
print df.cumsum()
"""
a b c
2018-07-01 1.0 2.0 NaN
2018-07-02 3.0 NaN 3.0
2018-07-03 10.0 10.0 12.0
2018-07-04 13.0 14.0 17.0
"""
.
median
中位數 df.median
print df.median(axis=1) # 中位數
"""
2018-07-01 1.5
2018-07-02 2.5
2018-07-03 8.0
2018-07-04 4.0
Freq: D, dtype: float64
"""
.
mad
根據平均值計算平均絕對離差 df.mad
print df.mad() # 根據平均值計算平均絕對離差
"""
a 1.875000
b 2.222222
c 2.222222
dtype: float64
"""
.
std
標準差 df.std
print df.std() # 標準差
"""
a 2.629956
b 3.055050
c 3.055050
dtype: float64
"""
.
var
方差 df.var
print df.var() # 方差
"""
a 6.916667
b 9.333333
c 9.333333
dtype: float64
"""
.
diff
一階差分 df.diff
print df.diff() # 計算一階差分
"""
a b c
2018-07-01 NaN NaN NaN
2018-07-02 1.0 NaN NaN
2018-07-03 5.0 NaN 6.0
2018-07-04 -4.0 -4.0 -4.0
"""
.
pct_change
百分數變化 df.pct_change
print df.pct_change() # 計算百分數變化(在列上計算)
"""
a b c
2018-07-01 NaN NaN NaN
2018-07-02 1.000000 NaN NaN
2018-07-03 2.500000 3.0 2.000000
2018-07-04 -0.571429 -0.5 -0.444444
"""
print df.pct_change(axis=1) # 計算百分數變化(在行上計算)
"""
a b c
2018-07-01 NaN 1.000000 NaN
2018-07-02 NaN NaN 0.500
2018-07-03 NaN 0.142857 0.125
2018-07-04 NaN 0.333333 0.250
"""
.
corr
計算列與列之間的成對相關性;計算時會排除 NA / null 值(僅略過缺失的觀測值,而不是略過含有 NA / null 值的整列)。
DataFrame.corr(method='pearson', min_periods=1)
引數:
method:
- pearson:皮爾遜相關係數
- kendall:肯德爾等級相關係數
- spearman:斯皮爾曼等級相關係數
min_periods:為獲取有效結果,每對列所需的最小觀察資料量
返回:
關於原始DataFrame列與列之間相關性的DataFrame物件。
.
#!/usr/bin/python2.7
# -*- coding:utf-8 -*-
"""
@author: tz_zs
"""
import pandas as pd
list_l = [[1, 3, 3, 5, ], [11, 7, 15, 13], [4, 2, 7, 9]]
index = ["2018-07-02", "2018-07-03", "2018-07-04"]
col = ['a', 'b', 'c', 'd']
df = pd.DataFrame(list_l, index=index, columns=col)
print(df)
"""
a b c d
2018-07-02 1 3 3 5
2018-07-03 11 7 15 13
2018-07-04 4 2 7 9
"""
df_corr = df.corr()
print(df_corr)
print(type(df_corr))
"""
a b c d
a 1.000000 0.883852 0.999322 0.974355
b 0.883852 1.000000 0.866025 0.755929
c 0.999322 0.866025 1.000000 0.981981
d 0.974355 0.755929 0.981981 1.000000
<class 'pandas.core.frame.DataFrame'>
"""
.
#!/usr/bin/python2.7
# -*- coding:utf-8 -*-
"""
@author: tz_zs
"""
import pandas as pd
list_l = [[1, 3, 3, 5, ], [11, 7, 15, 13], [4, 2, 7, None]]
index = ["2018-07-02", "2018-07-03", "2018-07-04"]
col = ['a', 'b', 'c', 'd']
df = pd.DataFrame(list_l, index=index, columns=col)
print(df)
"""
a b c d
2018-07-02 1 3 3 5.0
2018-07-03 11 7 15 13.0
2018-07-04 4 2 7 NaN
"""
df_corr = df.corr()
print(df_corr)
print(type(df_corr))
"""
a b c d
a 1.000000 0.883852 0.999322 1.0
b 0.883852 1.000000 0.866025 1.0
c 0.999322 0.866025 1.000000 1.0
d 1.000000 1.000000 1.000000 1.0
<class 'pandas.core.frame.DataFrame'>
"""
.
end