NumPy 和 pandas 小記

阿新 • • 發佈：2020-11-05

#!/usr/bin/python
# -*- coding:utf-8 -*-

import numpy as np
import pandas as pd


# When a Series is built from a plain list of values, pandas generates a
# default integer index automatically.
s = pd.Series([1, 3, 5, np.nan, 6, 8])
# print(...) with a single argument works on both Python 2 and Python 3.
print(s)

# Build a DataFrame from a NumPy array, using a datetime index for the rows
# and explicit labels for the columns.
dates = pd.date_range('20200901', periods=6)
print(dates)

# list('ABCD') yields the four column labels 'A', 'B', 'C', 'D'; the literal
# must stay on one line, otherwise the string is unterminated.
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD'))
print(df)

# Build a DataFrame from a dict whose values are Series-like objects.
# Scalars ('A', 'B', 'F') are broadcast to the length of the other columns.
df2 = pd.DataFrame({
    'A': 1.,
    'B': pd.Timestamp('20130102'),
    'C': pd.Series(1, index=list(range(4)), dtype='float32'),
    'D': np.array([3] * 4, dtype='int32'),
    'E': pd.Categorical(["test", "train", "test", "train"]),
    'F': 'foo',
})

print(df2)

# Each column of a DataFrame can have its own dtype.
print(df2.dtypes)


# View the first rows (default 5) and the last two rows of the DataFrame.
print(df.head())
print(df.tail(2))


# Show the row index and the column labels.
print(df.index)
print(df.columns)

# Quick statistical summary of the data.
print(df.describe())

print(df)

# Transpose the data.
print(df.T)

# Sort by an axis: axis=1 sorts the column labels, here in descending order.
print(df.sort_index(axis=1, ascending=False))

# Sort rows by the values in column 'B'.
print(df.sort_values(by='B'))

# Select a single column, which yields a Series; equivalent to df.A.
print(df['A'])

# Slicing with [] selects rows.
print(df[0:3])

# Extract one row by its label.
print(df.loc[dates[0]])
print(df.loc[dates[1]])


# Select several columns by label.
print(df.loc[:, ['A', 'B']])

# Label slicing includes BOTH endpoints of the row range.
print(df.loc['2020-09-01':'2020-09-02', ['A', 'B']])

# A single row label reduces the dimension of the returned object (Series).
print(df.loc['2020-09-02', ['A', 'B']])

# Extract a scalar value with .loc ...
print(df.loc[dates[0], 'A'])
print(df.loc[dates[1], 'A'])

# ... or with .at, the dedicated scalar accessor.
print(df.at[dates[0], 'A'])
print(df.at[dates[1], 'A'])

# Select by integer position: row 3 as a Series.
print(df)
print(df.iloc[3])

# The results below are printed explicitly; a bare expression shows nothing
# when this file is run as a script.

# Integer slices (end-exclusive, like Python/NumPy slicing).
print(df.iloc[3:5, 0:2])

# Lists of integer positions.
print(df.iloc[[1, 2, 4], [0, 2]])

# Explicit row slice, all columns.
print(df.iloc[1:3, :])

# Explicit column slice, all rows.
print(df.iloc[:, 1:3])

# Explicit scalar value at row 1, column 1.
print(df.iloc[1, 1])

# -- Boolean indexing --
# Select rows using the values of a single column.
print(df[df.A > 0])

# Select the values of the DataFrame that satisfy the condition; entries
# that do not are returned as NaN.
print(df[df > 0])


# Filter with isin(). Work on a copy so that df itself is left untouched.
df2 = df.copy()
df2['E'] = ['one', 'one', 'two', 'three', 'four', 'three']
print(df2[df2['E'].isin(['two', 'four'])])



# -- Setting values --
# Assigning a Series as a new column aligns the data on the index.
s1 = pd.Series([1, 2, 3, 4, 5, 6], index=pd.date_range('20200901', periods=6))
df['F'] = s1

# Set a value by label.
df.at[dates[0], 'A'] = 0
print(df)

# Set a value by position.
df.iat[0, 1] = 0
print(df)

# Set a whole column from a NumPy array.
df.loc[:, 'D'] = np.array([5] * len(df))
print(df)


# Conditional assignment: every positive value is replaced by its negation.
# Work on a copy so that df itself is left untouched.
df2 = df.copy()
df2[df2 > 0] = -df2
print(df2)


# -- Missing data --
# pandas primarily uses np.nan to represent missing data; by default it is
# excluded from computations.
# reindex() can change, add or remove the index on the given axis and
# returns a copy of the data, i.e. the original is not modified.
df1 = df.reindex(index=dates[0:4], columns=list(df.columns) + ['E'])
df1.loc[dates[0]:dates[1], 'E'] = 1
print(df1)


# Drop every row that contains at least one missing value.
print(df1.dropna(how='any'))


# Fill missing values with 5.
print(df1.fillna(value=5))


# Boolean mask marking where the values are NaN.
print(pd.isna(df1))
 

# -- Operations --
# Descriptive statistics: mean of each column.
print(df.mean())


# The same operation along the other axis, i.e. the mean of each row.
print(df.mean(axis=1))

# Operating on objects of different dimensionality requires alignment first;
# pandas then broadcasts along the specified dimension.
# shift(2) moves the values down two positions, leaving NaN in the first two.
s = pd.Series([1, 3, 5, np.nan, 6, 8], index=dates).shift(2)
print(s)
# Subtract s from every column, aligning on the row index.
print(df.sub(s, axis='index'))


# Use apply() to run a function over the data, column by column.
print(df.apply(np.cumsum))
print(df.apply(lambda x: x.max() - x.min()))


# -- Histogramming --
# Count how often each value occurs in a Series of random integers in [0, 7).
s = pd.Series(np.random.randint(0, 7, size=10))
print(s.value_counts())



# -- String methods --
# The str attribute of a Series bundles a set of string-processing methods,
# as shown below. Note that pattern matching in str generally uses regular
# expressions by default.
s = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat'])
print(s.str.lower())


# -- Merge --
# Concat
# pandas provides various facilities for combining Series and DataFrame
# objects, with several kinds of set logic for the indexes and relational
# algebra functionality for join / merge operations.
df = pd.DataFrame(np.random.randn(10, 4))
# Split the frame into three row slices, then concatenate them back together.
pieces = [df[:3], df[3:7], df[7:]]
print(pd.concat(pieces))