Python中Pandas模塊的基本操作

2019-04-14 21:31发布

1. Pandas數據構成 1)Series數據構成 >>>import pandas as pd >>>series1 = pd.Series([1, 2, 3]) >>>series1 0 1 1 2 2 3 2)DataFrame數據構成 >>>import pandas as pd >>>import numpy as np >>>dataframe1 = pd.DataFrame(np.arange(12).reshape(3, 4)) >>>dataframe1 0 1 2 3 0 0 1 2 3 1 4 5 6 7 2 8 9 10 11 >>>dataframe1 = pd.DataFrame(np.arange(12).reshape(3, 4), index = ['row0', 'row1', 'row2'], columns = ['col0', 'col1', 'col2', 'col3']) >>>dataframe1 col0 col1 col2 col3 row0 0 1 2 3 row1 4 5 6 7 row2 8 9 10 11 >>>dataframe1 = pd.DataFrame({'col0':0, 'col1':1, 'col2':[2, 2, 2]}) >>>dataframe1 col0 col1 col2 0 0 1 2 1 0 1 2 2 0 1 2 2. Pandas數據的索引 >>>import pandas as pd >>>import numpy as np #Series的索引 >>>series1 = pd.Series([1, 2, 3, 4, 5, 6], index = ['A', 'B', 'C', 'D', 'E', 'F']) >>>series1 A 1 B 2 C 3 D 4 E 5 F 6 >>>series1['A'] 1 >>>series1[0] 1 #DataFrame的索引 >>>dataframe1 = pd.DataFrame(np.arange(12).reshape(3, 4), index = ['row0', 'row1', 'row2'], columns = ['col0', 'col1', 'col2', 'col3']) >>>dataframe1 col0 col1 col2 col3 row0 0 1 2 3 row1 4 5 6 7 row2 8 9 10 11 >>>dataframe1.col0 row0 0 row1 4 row2 8 >>>dataframe1['col0'] row0 0 row1 4 row2 8 #按標簽進行索引 >>>dataframe1.loc['row0'] col0 0 col1 1 col2 2 col3 3 >>>dataframe1.loc['row0', 'col0'] 0 #按位置使用序號索引 >>>dataframe1.iloc[0] col0 0 col1 1 col2 2 col3 3 >>>dataframe1.iloc[0, 1] 1 #混合索引,不建議使用(.ix已在pandas 1.0中移除,請改用loc或iloc) >>>dataframe1.ix['row0', 0] 0 >>>dataframe1.ix[0, 'col0'] 0 3. 
Pandas數據的插入和修改 >>>import numpy as np >>>import pandas as pd >>>dataframe1 = pd.DataFrame(np.arange(12).reshape(3, 4), index = ['row0', 'row1', 'row2'], columns = ['col0', 'col1', 'col2', 'col3']) >>>dataframe1 col0 col1 col2 col3 row0 0 1 2 3 row1 4 5 6 7 row2 8 9 10 11 #使用loc進行修改 >>>dataframe1.loc['row0', 'col0'] = 111 >>>dataframe1 col0 col1 col2 col3 row0 111 1 2 3 row1 4 5 6 7 row2 8 9 10 11 #使用iloc進行修改 >>>dataframe1.iloc[0, 0] = 222 >>>dataframe1 col0 col1 col2 col3 row0 222 1 2 3 row1 4 5 6 7 row2 8 9 10 11 #使用布爾索引 #找出dataframe1中col1列數值<5的行,再將這些行的col0列賦值為0 >>>dataframe1.col0[dataframe1.col1 < 5] = 0 >>>dataframe1 col0 col1 col2 col3 row0 0 1 2 3 row1 4 5 6 7 row2 8 9 10 11 #插入一列數據(不使用index的情況) >>>dataframe1['col4'] = pd.Series([11, 22, 33]) >>>dataframe1 col0 col1 col2 col3 col4 row0 0 1 2 3 NaN row1 4 5 6 7 NaN row2 8 9 10 11 NaN #插入一列數據,使用index的情況 >>>dataframe1['col4'] = pd.Series([11, 22, 33], index = ['row0', 'row1', 'row2']) >>>dataframe1 col0 col1 col2 col3 col4 row0 0 1 2 3 11 row1 4 5 6 7 22 row2 8 9 10 11 33 4. Pandas中nan值的相關操作 >>>dataframe1 col0 col1 col2 col3 col4 row0 0 1 2 3 11 row1 4 5 6 7 22 row2 8 9 10 11 33 >>>dataframe1.iloc[0, 0] = np.nan >>>dataframe1.iloc[1, 1] = np.nan >>>dataframe1 col0 col1 col2 col3 col4 row0 NaN 1.0 2 3 11 row1 4.0 NaN 6 7 22 row2 8.0 9.0 10 11 33 #將axis = 0(橫)或者axis = 1(列)中含有'nan'(how = 'any')或者都是'nan'(how = 'all')的橫或者列丟棄 >>>dataframe1.dropna(axis = 0, how = 'any') col0 col1 col2 col3 col4 row2 8.0 9.0 10 11 33 >>>dataframe1.dropna(axis = 1, how = 'any') col2 col3 col4 row0 2 3 11 row1 6 7 22 row2 10 11 33 #判斷是否含有nan #nan值那一項為True >>>dataframe1.isnull() col0 col1 col2 col3 col4 row0 True False False False False row1 False True False False False row2 False False False False False #判斷數組中是否含有nan值 >>>np.any(dataframe1.isnull()) True #將nan值改爲自己想要的值 >>>dataframe1.fillna(value = 0) col0 col1 col2 col3 col4 row0 0.0 1.0 2 3 11 row1 4.0 0.0 6 7 22 row2 8.0 9.0 10 11 33 5. 
Pandas中Dataframe數據的合并操作(concat) >>>dataframe1 col0 col1 col2 col3 row0 1.0 1.0 1.0 1.0 row1 1.0 1.0 1.0 1.0 row2 1.0 1.0 1.0 1.0 >>>dataframe2 col1 col2 col3 col4 row1 1.0 1.0 1.0 1.0 row2 1.0 1.0 1.0 1.0 row3 1.0 1.0 1.0 1.0 #默認join = 'outer', axis = 0(多餘的row和col的數值會使用nan來補充) >>>pd.concat([dataframe1, dataframe2], join = 'outer') col0 col1 col2 col3 col4 row0 1.0 1.0 1.0 1.0 NaN row1 1.0 1.0 1.0 1.0 NaN row2 1.0 1.0 1.0 1.0 NaN row1 NaN 1.0 1.0 1.0 1.0 row2 NaN 1.0 1.0 1.0 1.0 row3 NaN 1.0 1.0 1.0 1.0 #join = 'inner'(多餘的row和col會被刪除) >>>pd.concat([dataframe1, dataframe2], join = 'inner') col1 col2 col3 row0 1.0 1.0 1.0 row1 1.0 1.0 1.0 row2 1.0 1.0 1.0 row1 1.0 1.0 1.0 row2 1.0 1.0 1.0 row3 1.0 1.0 1.0 #忽略index,系統重新分配 >>>pd.concat([dataframe1, dataframe2], ignore_index = True) col0 col1 col2 col3 col4 0 1.0 1.0 1.0 1.0 NaN 1 1.0 1.0 1.0 1.0 NaN 2 1.0 1.0 1.0 1.0 NaN 3 NaN 1.0 1.0 1.0 1.0 4 NaN 1.0 1.0 1.0 1.0 5 NaN 1.0 1.0 1.0 1.0 #axis = 1 >>>pd.concat([dataframe1, dataframe2], axis = 1) col0 col1 col2 col3 col1 col2 col3 col4 row0 1.0 1.0 1.0 1.0 NaN NaN NaN NaN row1 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 row2 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 row3 NaN NaN NaN NaN 1.0 1.0 1.0 1.0 #按照dataframe1的row index來進行合并,沒有就填充nan >>>pd.concat([dataframe1, dataframe2], axis = 1, join_axes = [dataframe1.index]) col0 col1 col2 col3 col1 col2 col3 col4 row0 1.0 1.0 1.0 1.0 NaN NaN NaN NaN row1 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 row2 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 #按照dataframe1的col來進行合并,沒有填充nan >>> pd.concat([dataframe1, dataframe2], join_axes = [dataframe1.columns], axis = 0) col0 col1 col2 col3 row0 1.0 1.0 1.0 1.0 row1 1.0 1.0 1.0 1.0 row2 1.0 1.0 1.0 1.0 row1 NaN 1.0 1.0 1.0 row2 NaN 1.0 1.0 1.0 row3 NaN 1.0 1.0 1.0 #使用append來進行合并,沒有axis,默認上下添加 >>>dataframe1.append(dataframe1, ignore_index=True) col0 col1 col2 col3 0 1.0 1.0 1.0 1.0 1 1.0 1.0 1.0 1.0 2 1.0 1.0 1.0 1.0 3 1.0 1.0 1.0 1.0 4 1.0 1.0 1.0 1.0 5 1.0 1.0 1.0 1.0 #有index,ignore_index = True(沒有後面會報錯) 
>>>dataframe1.append(pd.Series(np.arange(4), index = ['col0', 'col1', 'col2', 'col3']), ignore_index=True) col0 col1 col2 col3 0 1.0 1.0 1.0 1.0 1 1.0 1.0 1.0 1.0 2 1.0 1.0 1.0 1.0 3 0.0 1.0 2.0 3.0 #沒有index >>>dataframe1.append(pd.Series(np.arange(4)), ignore_index= True) col0 col1 col2 col3 0 1 2 3 0 1.0 1.0 1.0 1.0 NaN NaN NaN NaN 1 1.0 1.0 1.0 1.0 NaN NaN NaN NaN 2 1.0 1.0 1.0 1.0 NaN NaN NaN NaN 3 NaN NaN NaN NaN 0.0 1.0 2.0 3.0 6. Pandas中DataFrame數據的合并操作(merge) >>>left key A B 0 K0 A0 B0 1 K1 A1 B1 2 K2 A2 B2 3 K3 A3 B3 >>>right key C D 0 K0 C0 D0 1 K1 C1 D1 2 K2 C2 D2 3 K3 C3 D3 #使用Merge按照key(單個Key)進行合并 >>>pd.merge(left, right, on = 'key') key A B C D 0 K0 A0 B0 C0 D0 1 K1 A1 B1 C1 D1 2 K2 A2 B2 C2 D2 3 K3 A3 B3 C3 D3 #兩個key進行合并(空缺的填充相同的即可) >>>left key1 key2 A B 0 K0 K0 A0 B0 1 K0 K1 A1 B1 2 K1 K0 A2 B2 3 K2 K1 A3 B3 >>>right key1 key2 C D 0 K0 K0 C0 D0 1 K1 K0 C1 D1 2 K1 K0 C2 D2 3 K2 K0 C3 D3 #how = 'inner, outer, left, right',默認how = 'inner' >>>res = pd.merge(right, left, on = ['key1', 'key2']) >>>res key1 key2 C D A B 0 K0 K0 C0 D0 A0 B0 1 K1 K0 C1 D1 A2 B2 2 K1 K0 C2 D2 A2 B2 >>>res = pd.merge(right, left, on = ['key1', 'key2'], how = 'outer') >>>res key1 key2 C D A B 0 K0 K0 C0 D0 A0 B0 1 K1 K0 C1 D1 A2 B2 2 K1 K0 C2 D2 A2 B2 3 K2 K0 C3 D3 NaN NaN 4 K0 K1 NaN NaN A1 B1 5 K2 K1 NaN NaN A3 B3 #以右邊的KEY爲準,左邊沒有該Key的值填NaN,有該Key則填相應的值,不計次數 >>>res = pd.merge(left, right, on = ['key1', 'key2'], how = 'right') >>>res key1 key2 A B C D 0 K0 K0 A0 B0 C0 D0 1 K1 K0 A2 B2 C1 D1 2 K1 K0 A2 B2 C2 D2 3 K2 K0 NaN NaN C3 D3 >>>dataframe1 col1 col_left 0 0 a 1 1 b >>>dataframe2 col1 col_right 0 1 2 1 2 2 2 2 2 #indicator參數會告知你哪邊有值哪邊沒值,默認indicator = False >>>res = pd.merge(dataframe1, dataframe2, on = 'col1', how = 'outer', indicator = 'indicator_label') >>>res col1 col_left col_right indicator_label 0 0 a NaN left_only 1 1 b 2.0 both 2 2 NaN 2.0 right_only 3 2 NaN 2.0 right_only >>>left A B K0 A0 B0 K1 A1 B1 K2 A2 B2 >>>right C D K0 C0 D0 K2 C2 D2 K3 C3 D3 >>>pd.merge(left, right, left_index=True, 
right_index=True, how = 'outer') A B C D K0 A0 B0 C0 D0 K1 A1 B1 NaN NaN K2 A2 B2 C2 D2 K3 NaN NaN C3 D3 #合并重復值 >>>boys k age 0 K0 1 1 K1 2 2 K2 3 >>>girls k age 0 K0 4 1 K0 5 2 K3 6 #suffixes中的字符串和前面的矩陣要一一對應 >>>pd.merge(boys, girls, on = 'k', suffixes = ['_a', '_b'], how = 'inner') k age_a age_b 0 K0 1 4 1 K0 1 5 7. Pandas plot圖表 #Pandas中Series的plot >>>data = pd.Series(np.random.randn(1000), index = np.arange(1000)) >>>data = data.cumsum() >>>data.plot()   #Pandas中DataFrame數據的plot #plotmethod(plt.scatter) #'bar', 'hist', 'box', 'kde', 'area', 'scatter', 'hexbin', 'pie' >>> data = pd.DataFrame(np.random.randn(1000, 4), index = np.arange(1000), columns = list("ABCD")) >>>data = data.cumsum() >>>data.plot()   #plt.scatter(x = , y = ) #figure1 >>>data = pd.DataFrame(np.random.randn(1000, 4), index = np.arange(1000), columns = list("ABCD")) >>>data = data.cumsum() >>>data.plot.scatter() #屬性 color,label >>>data.plot.scatter(x = 'A', y = 'B') #添加屬性并且在一個圖上打印兩個圖形 #figure2 >>>ax = data.plot.scatter(x = 'A', y = 'B', color = 'DarkBlue', label = 'Class 1') >>>data.plot.scatter(x = 'A', y = 'C', color = 'DarkGreen', label = 'Class 2', ax = ax)