1. 程式人生 > >pandas學習與使用2

pandas學習與使用2

繼續學習pandas庫。上一節主要介紹了Series,這一節主要介紹DataFrame結構的用法。執行環境:Python 2.7。

#!/usr/bin/env python
# coding:utf-8
# pandas 使用DataFrame

import numpy as np
import pandas as pd

# Overview of basic DataFrame inspection.
# NOTE: every print below passes ONE pre-joined string to print(...); that
# form is valid on both Python 2 and Python 3 and emits the same text the
# old `print "label:\n", obj` statements did.

# Index of six dates starting at 2017-12-31.
dates = pd.date_range('20171231', periods=6)
print("dates:\n" + str(dates))

# Sample output of `df` (the values come from np.random.rand, so they differ
# on every run):
#                    a         b         c         d
# 2017-12-31  0.544078  0.389521  0.097052  0.942329
# 2018-01-01  0.474514  0.456605  0.750682  0.683513
# 2018-01-02  0.537973  0.230534  0.216569  0.015208
# 2018-01-03  0.320855  0.295421  0.342874  0.808681
# 2018-01-04  0.649339  0.678842  0.390282  0.692622
# 2018-01-05  0.041877  0.197155  0.384499  0.301309
# `index` supplies the row labels (vertical axis); `columns` supplies the
# column labels (horizontal axis).

df = pd.DataFrame(np.random.rand(6, 4), index=dates, columns=['a', 'b', 'c', 'd'])
print("df:\n" + str(df))
print("dtypes:\n" + str(df.dtypes))
print("columns:\n" + str(df.columns))
print("values:\n" + str(df.values))
# describe() summarises each column: count, mean, std, min, quartiles, max.
print("describe:\n" + str(df.describe()))

# Sample describe() output for the frame shown above:
#               a         b         c         d
# count  6.000000  6.000000  6.000000  6.000000
# mean   0.428106  0.374680  0.363660  0.573944
# std    0.217850  0.177776  0.220942  0.347394
# min    0.041877  0.197155  0.097052  0.015208
# 25%    0.359270  0.246756  0.248145  0.396860
# 50%    0.506243  0.342471  0.363687  0.688067
# 75%    0.542552  0.439834  0.388836  0.779666
# max    0.649339  0.678842  0.750682  0.942329
print("轉置:\n" + str(df.T))

# Sort by labels: axis=1 orders the column labels, ascending=False reverses
# the order.
print("按照索引排序:\n" + str(df.sort_index(axis=1, ascending=False)))
# Sort the rows by the values in column 'a'.
print("按照值排序:\n" + str(df.sort_values(by='a')))
print("______________________")

# A 3x4 frame built from the integers 0..11; row and column labels are left
# to pandas' defaults.
df1 = pd.DataFrame(np.arange(12).reshape((3, 4)))
# print(...) with one pre-joined string works on both Python 2 and 3.
print("df1:\n" + str(df1))
print("df1.dtypes:\n" + str(df1.dtypes))

# A frame built from a dict that mixes scalars with a Series: column 'C' is
# a float Series indexed 0..3, while 'A' and 'B' are given as single values.
df2 = pd.DataFrame({
    'A': 1,
    'B': pd.Timestamp('20180101'),
    'C': pd.Series(1, index=list(range(4)), dtype=float),
})
print("df2:\n" + str(df2))

# Constructing a DataFrame.
# pd.DataFrame() is called below with three kinds of input:
#   1. a list of 1-D numpy arrays (two-dimensional data),
#   2. a list of Series,
#   3. a dict whose values are Series.
# print(...) is always given one pre-joined string so the script parses on
# both Python 2 and Python 3 with the same output.

# 1. Two-dimensional array data
s1 = np.array([1, 2, 3, 4])
s2 = np.array([5, 6, 7, 8])
dataframe1 = pd.DataFrame([s1, s2])
print("dataframe1:\n" + str(dataframe1))

# 2. A list of Series
s1 = pd.Series(np.array([1, 2, 3, 4]))
s2 = pd.Series(np.array([5, 6, 7, 8]))
dataframe2 = pd.DataFrame([s1, s2])
print("dataframe2:\n" + str(dataframe2))

# 3. A dict whose values are Series; the dict keys become the column names.
s1 = pd.Series(np.array([1, 2, 3, 4]))
s2 = pd.Series(np.array([5, 6, 7, 8]))
dataframe3 = pd.DataFrame({"a": s1, "b": s2})
print("dataframe3:\n" + str(dataframe3))

# DataFrame attributes: column labels, shape, index labels and raw values.
print("dataframe3的columns的值:\n" + str(dataframe3.columns))
print("dataframe3的形狀:\n" + str(dataframe3.shape))
print("dataframe3的index的值:\n" + str(list(dataframe3.index)))
print("dataframe3的value的值:\n" + str(dataframe3.values))

# If-then on a DataFrame:
#   df.loc[row_condition, target_column] = value
# .loc is used instead of .ix: .ix was deprecated in pandas 0.20 and removed
# in pandas 1.0, whereas .loc accepts the same boolean-mask / column-label
# pair on old and new pandas alike.
df = pd.DataFrame({"A": [1, 2, 3, 4], "B": [5, 6, 7, 8], "C": [1, 1, 1, 1]})
# print(...) with one pre-joined string works on both Python 2 and 3.
print("修改前:\n" + str(df))
# if df.A > 2 then B = -1
df.loc[df.A > 2, 'B'] = -1
print("修改後:\n" + str(df))

# If-then-else with numpy.where:
#   np.where(condition, value_if_true, value_if_false)
df = pd.DataFrame({"A": [1, 2, 3, 4], "B": [5, 6, 7, 8], "C": [1, 1, 1, 1]})
# New column "then": 1 where A < 3, otherwise 0.
df["then"] = np.where(df.A < 3, 1, 0)
# print(...) with one pre-joined string works on both Python 2 and 3.
print("修改後:\n" + str(df))

# Row selection by indexing the frame directly with a boolean mask: df[mask]
df = pd.DataFrame({"A": [1, 2, 3, 4], "B": [5, 6, 7, 8], "C": [1, 1, 1, 1]})
# Keep only the rows where A >= 2.
df = df[df.A >= 2]
# print(...) with one pre-joined string works on both Python 2 and 3.
print("df[df.A >= 2]:\n" + str(df))

# Row selection with .loc[] and a boolean mask.
df = pd.DataFrame({"A": [1, 2, 3, 4], "B": [5, 6, 7, 8], "C": [1, 1, 1, 1]})
# Keep only the rows where A > 2.
df = df.loc[df.A > 2]
# print(...) with one pre-joined string works on both Python 2 and 3.
print("df.loc[df.A > 2]:\n" + str(df))

# A small seven-row sample frame with string, integer and boolean columns.
df = pd.DataFrame({
    'animal': 'cat dog cat fish dog cat cat'.split(),
    'size': list('SSMMMLL'),
    'weight': [8, 10, 11, 1, 20, 12, 12],
    # First five rows are not adult, last two are.
    'adult': [False] * 5 + [True] * 2,
})
# print(...) with one pre-joined string works on both Python 2 and 3.
print("df:\n" + str(df))