1. Pandas數據構成
1)Series數據構成
>>>import pandas as pd
>>>series1 = pd.Series([1, 2, 3])
>>>series1
0 1
1 2
2 3
2)DataFrame數據構成
>>>import pandas as pd
>>>import numpy as np
>>>dataframe1 = pd.DataFrame(np.arange(12).reshape(3, 4))
>>>dataframe1
0 1 2 3
0 0 1 2 3
1 4 5 6 7
2 8 9 10 11
>>>dataframe1 = pd.DataFrame(np.arange(12).reshape(3, 4), index = ['row0', 'row1', 'row2'], columns = ['col0', 'col1', 'col2', 'col3'])
>>>dataframe1
col0 col1 col2 col3
row0 0 1 2 3
row1 4 5 6 7
row2 8 9 10 11
>>>dataframe1 = pd.DataFrame({'col0':0, 'col1':1, 'col2':[2, 2, 2]})
>>>dataframe1
col0 col1 col2
0 0 1 2
1 0 1 2
2 0 1 2
2. Pandas數據的索引
>>>import pandas as pd
>>>import numpy as np
#Series的索引
>>>series1 = pd.Series([1, 2, 3, 4, 5, 6], index = ['A', 'B', 'C', 'D', 'E', 'F'])
>>>series1
A 1
B 2
C 3
D 4
E 5
F 6
>>>series1['A']
1
>>>series1[0]
1
#DataFrame的索引
>>>dataframe1 = pd.DataFrame(np.arange(12).reshape(3, 4), index = ['row0', 'row1', 'row2'], columns = ['col0', 'col1', 'col2', 'col3'])
>>>dataframe1
col0 col1 col2 col3
row0 0 1 2 3
row1 4 5 6 7
row2 8 9 10 11
>>>dataframe1.col0
row0 0
row1 4
row2 8
>>>dataframe1['col0']
row0 0
row1 4
row2 8
#按標簽進行索引
>>>dataframe1.loc['row0']
col0 0
col1 1
col2 2
col3 3
>>>dataframe1.loc['row0', 'col0']
0
#按位置使用序號索引
>>>dataframe1.iloc[0]
col0 0
col1 1
col2 2
col3 3
>>>dataframe1.iloc[0, 1]
1
#混合索引(標簽與位置混用),不建議使用;.ix已在pandas 1.0中移除,請改用loc或iloc
>>>dataframe1.ix['row0', 0]
0
>>>dataframe1.ix[0, 'col0']
0
3. Pandas數據的插入和修改
>>>import numpy as np
>>>import pandas as pd
>>>dataframe1 = pd.DataFrame(np.arange(12).reshape(3, 4), index = ['row0', 'row1', 'row2'], columns = ['col0', 'col1', 'col2', 'col3'])
>>>dataframe1
col0 col1 col2 col3
row0 0 1 2 3
row1 4 5 6 7
row2 8 9 10 11
#使用loc進行修改
>>>dataframe1.loc['row0', 'col0'] = 111
>>>dataframe1
col0 col1 col2 col3
row0 111 1 2 3
row1 4 5 6 7
row2 8 9 10 11
#使用iloc進行修改
>>>dataframe1.iloc[0, 0] = 222
>>>dataframe1
col0 col1 col2 col3
row0 222 1 2 3
row1 4 5 6 7
row2 8 9 10 11
#使用布爾索引
#先找出dataframe1中col1列數值<5的行,再將這些行的col0列賦值為0
>>>dataframe1.col0[dataframe1.col1 < 5] = 0
>>>dataframe1
col0 col1 col2 col3
row0 0 1 2 3
row1 4 5 6 7
row2 8 9 10 11
#插入一列數據(Series不指定index的情況:Series默認index為0、1、2,與row0~row2無法對齊,結果全為NaN)
>>>dataframe1['col4'] = pd.Series([11, 22, 33])
>>>dataframe1
col0 col1 col2 col3 col4
row0 0 1 2 3 NaN
row1 4 5 6 7 NaN
row2 8 9 10 11 NaN
#插入一列數據,Series使用與dataframe1相同index的情況
>>>dataframe1['col4'] = pd.Series([11, 22, 33], index = ['row0', 'row1', 'row2'])
>>>dataframe1
col0 col1 col2 col3 col4
row0 0 1 2 3 11
row1 4 5 6 7 22
row2 8 9 10 11 33
4. Pandas中nan值的相關操作
>>>dataframe1
col0 col1 col2 col3 col4
row0 0 1 2 3 11
row1 4 5 6 7 22
row2 8 9 10 11 33
>>>dataframe1.iloc[0, 0] = np.nan
>>>dataframe1.iloc[1, 1] = np.nan
>>>dataframe1
col0 col1 col2 col3 col4
row0 NaN 1.0 2 3 11
row1 4.0 NaN 6 7 22
row2 8.0 9.0 10 11 33
#將axis = 0(橫)或者axis = 1(列)中含有'nan'(how = 'any')或者都是'nan'(how = 'all')的橫或者列丟棄
>>>dataframe1.dropna(axis = 0, how = 'any')
col0 col1 col2 col3 col4
row2 8.0 9.0 10 11 33
>>>dataframe1.dropna(axis = 1, how = 'any')
col2 col3 col4
row0 2 3 11
row1 6 7 22
row2 10 11 33
#判斷是否含有nan
#nan值那一項為True
>>>dataframe1.isnull()
col0 col1 col2 col3 col4
row0 True False False False False
row1 False True False False False
row2 False False False False False
#判斷數組中是否含有nan值
>>>np.any(dataframe1.isnull())
True
#將nan值改爲自己想要的值
>>>dataframe1.fillna(value = 0)
col0 col1 col2 col3 col4
row0 0.0 1.0 2 3 11
row1 4.0 0.0 6 7 22
row2 8.0 9.0 10 11 33
5. Pandas中Dataframe數據的合并操作(concat)
>>>dataframe1
col0 col1 col2 col3
row0 1.0 1.0 1.0 1.0
row1 1.0 1.0 1.0 1.0
row2 1.0 1.0 1.0 1.0
>>>dataframe2
col1 col2 col3 col4
row1 1.0 1.0 1.0 1.0
row2 1.0 1.0 1.0 1.0
row3 1.0 1.0 1.0 1.0
#默認join = 'outer', axis = 0(多餘的row和col的數值會使用nan來補充)
>>>pd.concat([dataframe1, dataframe2], join = 'outer')
col0 col1 col2 col3 col4
row0 1.0 1.0 1.0 1.0 NaN
row1 1.0 1.0 1.0 1.0 NaN
row2 1.0 1.0 1.0 1.0 NaN
row1 NaN 1.0 1.0 1.0 1.0
row2 NaN 1.0 1.0 1.0 1.0
row3 NaN 1.0 1.0 1.0 1.0
#join = 'inner'(多餘的row和col會被刪除)
>>>pd.concat([dataframe1, dataframe2], join = 'inner')
col1 col2 col3
row0 1.0 1.0 1.0
row1 1.0 1.0 1.0
row2 1.0 1.0 1.0
row1 1.0 1.0 1.0
row2 1.0 1.0 1.0
row3 1.0 1.0 1.0
#忽略index,系統重新分配
>>>pd.concat([dataframe1, dataframe2], ignore_index = True)
col0 col1 col2 col3 col4
0 1.0 1.0 1.0 1.0 NaN
1 1.0 1.0 1.0 1.0 NaN
2 1.0 1.0 1.0 1.0 NaN
3 NaN 1.0 1.0 1.0 1.0
4 NaN 1.0 1.0 1.0 1.0
5 NaN 1.0 1.0 1.0 1.0
#axis = 1
>>>pd.concat([dataframe1, dataframe2], axis = 1)
col0 col1 col2 col3 col1 col2 col3 col4
row0 1.0 1.0 1.0 1.0 NaN NaN NaN NaN
row1 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0
row2 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0
row3 NaN NaN NaN NaN 1.0 1.0 1.0 1.0
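#按照dataframe1的row index來進行合并,沒有就填充nan
#(join_axes已在pandas 1.0中移除,新版本可改用pd.concat([dataframe1, dataframe2], axis = 1).reindex(dataframe1.index))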
>>>pd.concat([dataframe1, dataframe2], axis = 1, join_axes = [dataframe1.index])
col0 col1 col2 col3 col1 col2 col3 col4
row0 1.0 1.0 1.0 1.0 NaN NaN NaN NaN
row1 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0
row2 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0
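#按照dataframe1的col來進行合并,沒有就填充nan
#(join_axes已在pandas 1.0中移除,新版本可改用pd.concat([dataframe1, dataframe2], axis = 0).reindex(columns = dataframe1.columns))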
>>> pd.concat([dataframe1, dataframe2], join_axes = [dataframe1.columns], axis = 0)
col0 col1 col2 col3
row0 1.0 1.0 1.0 1.0
row1 1.0 1.0 1.0 1.0
row2 1.0 1.0 1.0 1.0
row1 NaN 1.0 1.0 1.0
row2 NaN 1.0 1.0 1.0
row3 NaN 1.0 1.0 1.0
#使用append來進行合并,沒有axis參數,默認上下添加
>>>dataframe1.append(dataframe1, ignore_index=True)
col0 col1 col2 col3
0 1.0 1.0 1.0 1.0
1 1.0 1.0 1.0 1.0
2 1.0 1.0 1.0 1.0
3 1.0 1.0 1.0 1.0
4 1.0 1.0 1.0 1.0
5 1.0 1.0 1.0 1.0
#追加的Series有index的情況,需設置ignore_index = True(不設置的話會報錯)
>>>dataframe1.append(pd.Series(np.arange(4), index = ['col0', 'col1', 'col2', 'col3']), ignore_index=True)
col0 col1 col2 col3
0 1.0 1.0 1.0 1.0
1 1.0 1.0 1.0 1.0
2 1.0 1.0 1.0 1.0
3 0.0 1.0 2.0 3.0
#沒有index
>>>dataframe1.append(pd.Series(np.arange(4)), ignore_index= True)
col0 col1 col2 col3 0 1 2 3
0 1.0 1.0 1.0 1.0 NaN NaN NaN NaN
1 1.0 1.0 1.0 1.0 NaN NaN NaN NaN
2 1.0 1.0 1.0 1.0 NaN NaN NaN NaN
3 NaN NaN NaN NaN 0.0 1.0 2.0 3.0
6. Pandas中DataFrame數據的合并操作(merge)
>>>left
key A B
0 K0 A0 B0
1 K1 A1 B1
2 K2 A2 B2
3 K3 A3 B3
>>>right
key C D
0 K0 C0 D0
1 K1 C1 D1
2 K2 C2 D2
3 K3 C3 D3
#使用Merge按照key(單個Key)進行合并
>>>pd.merge(left, right, on = 'key')
key A B C D
0 K0 A0 B0 C0 D0
1 K1 A1 B1 C1 D1
2 K2 A2 B2 C2 D2
3 K3 A3 B3 C3 D3
#兩個key進行合并(空缺的填充相同的即可)
>>>left
key1 key2 A B
0 K0 K0 A0 B0
1 K0 K1 A1 B1
2 K1 K0 A2 B2
3 K2 K1 A3 B3
>>>right
key1 key2 C D
0 K0 K0 C0 D0
1 K1 K0 C1 D1
2 K1 K0 C2 D2
3 K2 K0 C3 D3
#how = 'inner, outer, left, right',默認how = 'inner'
>>>res = pd.merge(right, left, on = ['key1', 'key2'])
>>>res
key1 key2 C D A B
0 K0 K0 C0 D0 A0 B0
1 K1 K0 C1 D1 A2 B2
2 K1 K0 C2 D2 A2 B2
>>>res = pd.merge(right, left, on = ['key1', 'key2'], how = 'outer')
>>>res
key1 key2 C D A B
0 K0 K0 C0 D0 A0 B0
1 K1 K0 C1 D1 A2 B2
2 K1 K0 C2 D2 A2 B2
3 K2 K0 C3 D3 NaN NaN
4 K0 K1 NaN NaN A1 B1
5 K2 K1 NaN NaN A3 B3
#以右邊的KEY爲準,左邊沒有該Key的值填NaN,有該Key則填相應的值,不計次數
>>>res = pd.merge(left, right, on = ['key1', 'key2'], how = 'right')
>>>res
key1 key2 A B C D
0 K0 K0 A0 B0 C0 D0
1 K1 K0 A2 B2 C1 D1
2 K1 K0 A2 B2 C2 D2
3 K2 K0 NaN NaN C3 D3
>>>dataframe1
col1 col_left
0 0 a
1 1 b
>>>dataframe2
col1 col_right
0 1 2
1 2 2
2 2 2
#indicator參數會告知你哪邊有值哪邊沒值,默認indicator = False;設為True時該列名為'_merge',也可傳入字符串自定義列名
>>>res = pd.merge(dataframe1, dataframe2, on = 'col1', how = 'outer', indicator = 'indicator_label')
>>>res
col1 col_left col_right indicator_label
0 0 a NaN left_only
1 1 b 2.0 both
2 2 NaN 2.0 right_only
3 2 NaN 2.0 right_only
>>>left
A B
K0 A0 B0
K1 A1 B1
K2 A2 B2
>>>right
C D
K0 C0 D0
K2 C2 D2
K3 C3 D3
>>>pd.merge(left, right, left_index=True, right_index=True, how = 'outer')
A B C D
K0 A0 B0 C0 D0
K1 A1 B1 NaN NaN
K2 A2 B2 C2 D2
K3 NaN NaN C3 D3
#合并重復值
>>>boys
k age
0 K0 1
1 K1 2
2 K2 3
>>>girls
k age
0 K0 4
1 K0 5
2 K3 6
#suffixes中的字符串和前面的矩陣要一一對應
>>>pd.merge(boys, girls, on = 'k', suffixes = ['_a', '_b'], how = 'inner')
k age_a age_b
0 K0 1 4
1 K0 1 5
7. Pandas plot圖表
#Pandas中Series的plot
>>>data = pd.Series(np.random.randn(1000), index = np.arange(1000))
>>>data = data.cumsum()
>>>data.plot()
#Pandas中DataFrame數據的plot
#plot的繪圖方法(例如data.plot.scatter,類似plt.scatter),可用的圖表類型如下:
#'bar', 'hist', 'box', 'kde', 'area', 'scatter', 'hexbin', 'pie'
>>> data = pd.DataFrame(np.random.randn(1000, 4), index = np.arange(1000), columns = list("ABCD"))
>>>data = data.cumsum()
>>>data.plot()
#plt.scatter(x = , y = )
#figure1
>>>data = pd.DataFrame(np.random.randn(1000, 4), index = np.arange(1000), columns = list("ABCD"))
>>>data = data.cumsum()
>>>data.plot.scatter()  #注意:scatter必須指定x和y,不帶參數調用會報TypeError
#屬性 color,label
>>>data.plot.scatter(x = 'A', y = 'B')
#添加屬性并且在一個圖上打印兩個圖形
#figure2
>>>ax = data.plot.scatter(x = 'A', y = 'B', color = 'DarkBlue', label = 'Class 1')
>>>data.plot.scatter(x = 'A', y = 'C', color = 'DarkGreen', label = 'Class 2', ax = ax)