Python练手，pandas

zhltom · 发表于 2018-8-5 11:21:55

'''　　
http://pandas.pydata.org/pandas-docs/stable/10min.html
　　

　　
numpy的主要数据结构是ndarray
　　
pandas的主要数据结构是Series、DataFrame
　　
'''
　　

　　

　　
import pandas as pd
　　
import numpy as np
　　
import matplotlib.pyplot as plt
　　

　　
# Build a 6x4 integer frame holding 101..124, with row labels 0..5 and
# column labels A..D.
df1 = pd.DataFrame(
    np.arange(101, 125).reshape(6, 4),
    index=range(6),
    columns=list('ABCD'),
)
print(df1)
#      A    B    C    D
# 0  101  102  103  104
# 1  105  106  107  108
# 2  109  110  111  112
# 3  113  114  115  116
# 4  117  118  119  120
# 5  121  122  123  124
# Sample trade records used by most of the examples that follow.
#
# Note: keep the DataFrame index and the index of every Series column
# consistent. The positional index of a DataFrame always starts at 0; what is
# set here are the index *labels*. If custom labels are used and a column is
# supplied as a Series, that Series must carry the same labels, otherwise the
# rows end up misaligned.
#
# The column order is pinned explicitly (alphabetical) so that the positional
# examples (iloc / iat) and the sample outputs in this file do not depend on
# how a given pandas version orders dict keys.
df2 = pd.DataFrame(
    {
        'custID': ['C0001', 'C0002', 'C0004', 'C0004', 'C0004', 'C0003'],
        'accountID': pd.Series(
            [
                '6214C000101',
                '6214C000201',
                '6214C000401',
                '6214C000403',
                '6214C000402',
                '6214C000301',
            ],
            index=range(6),
            dtype='str',
        ),
        'tradeDate': pd.Series(
            [
                '2018-01-18 14:00:00',
                '2018-01-18 14:00:00',
                '2018-01-18 14:00:01',
                '2018-01-18 14:00:03',
                '2018-01-18 14:00:02',
                '2018-01-18 14:00:00',
            ],
            index=range(6),
            dtype='str',
        ),
        'tradeAmt': pd.Series(
            [100.0, 100.0, 101.0, 103.0, 102.0, 100.0],
            index=range(6),
            dtype='float',
        ),
        # A scalar is broadcast to every row.
        'tradeDesc': 'xxxxxx',
        'mark': pd.Categorical(["row1", "row2", "row3", "row4", "row5", "row6"]),
    },
    index=range(6),
    columns=['accountID', 'custID', 'mark', 'tradeAmt', 'tradeDate', 'tradeDesc'],
)
print(df2)
#      accountID custID  mark  tradeAmt            tradeDate tradeDesc
# 0  6214C000101  C0001  row1     100.0  2018-01-18 14:00:00    xxxxxx
# 1  6214C000201  C0002  row2     100.0  2018-01-18 14:00:00    xxxxxx
# 2  6214C000401  C0004  row3     101.0  2018-01-18 14:00:01    xxxxxx
# 3  6214C000403  C0004  row4     103.0  2018-01-18 14:00:03    xxxxxx
# 4  6214C000402  C0004  row5     102.0  2018-01-18 14:00:02    xxxxxx
# 5  6214C000301  C0003  row6     100.0  2018-01-18 14:00:00    xxxxxx
# --- Inspecting a DataFrame -------------------------------------------------
# Sample outputs below were captured by the original author; exact formatting
# may vary between pandas versions.

# Per-column dtypes.
print(df2.dtypes)
# accountID      object
# custID         object
# mark         category
# tradeAmt      float64
# tradeDate      object
# tradeDesc      object
# dtype: object

# Row labels.
print(df2.index)
# RangeIndex(start=0, stop=6, step=1)

# Column labels.
print(df2.columns)
# Index(['accountID', 'custID', 'mark', 'tradeAmt', 'tradeDate', 'tradeDesc'], dtype='object')

# Underlying values as a 2-D array.
print(df2.values)
# [['6214C000101' 'C0001' 'row1' 100.0 '2018-01-18 14:00:00' 'xxxxxx']
#  ['6214C000201' 'C0002' 'row2' 100.0 '2018-01-18 14:00:00' 'xxxxxx']
#  ['6214C000401' 'C0004' 'row3' 101.0 '2018-01-18 14:00:01' 'xxxxxx']
#  ['6214C000403' 'C0004' 'row4' 103.0 '2018-01-18 14:00:03' 'xxxxxx']
#  ['6214C000402' 'C0004' 'row5' 102.0 '2018-01-18 14:00:02' 'xxxxxx']
#  ['6214C000301' 'C0003' 'row6' 100.0 '2018-01-18 14:00:00' 'xxxxxx']]

# First two rows.
print(df2.head(2))
#      accountID custID  mark  tradeAmt            tradeDate tradeDesc
# 0  6214C000101  C0001  row1     100.0  2018-01-18 14:00:00    xxxxxx
# 1  6214C000201  C0002  row2     100.0  2018-01-18 14:00:00    xxxxxx

# Last two rows.
print(df2.tail(2))
#      accountID custID  mark  tradeAmt            tradeDate tradeDesc
# 4  6214C000402  C0004  row5     102.0  2018-01-18 14:00:02    xxxxxx
# 5  6214C000301  C0003  row6     100.0  2018-01-18 14:00:00    xxxxxx

# Summary statistics; only numeric columns are reported, non-numeric columns
# are left out of the output.
print(df2.describe())
#          tradeAmt
# count    6.000000
# mean   101.000000
# std      1.264911
# min    100.000000
# 25%    100.000000
# 50%    100.500000
# 75%    101.750000
# max    103.000000

# Transpose: rows become columns and vice versa.
print(df2.T)
#                              0                    1                    2  \
# accountID          6214C000101          6214C000201          6214C000401
# custID                   C0001                C0002                C0004
# mark                      row1                 row2                 row3
# tradeAmt                   100                  100                  101
# tradeDate  2018-01-18 14:00:00  2018-01-18 14:00:00  2018-01-18 14:00:01
# tradeDesc               xxxxxx               xxxxxx               xxxxxx
#
#                              3                    4                    5
# accountID          6214C000403          6214C000402          6214C000301
# custID                   C0004                C0004                C0003
# mark                      row4                 row5                 row6
# tradeAmt                   103                  102                  100
# tradeDate  2018-01-18 14:00:03  2018-01-18 14:00:02  2018-01-18 14:00:00
# tradeDesc               xxxxxx               xxxxxx               xxxxxx
print('------------------------------------------------------------------------------------')

# --- Sorting ----------------------------------------------------------------

# Sort by the values of one column, descending.
print(df2.sort_values(by='tradeDate', ascending=False))
#      accountID custID  mark  tradeAmt            tradeDate tradeDesc
# 3  6214C000403  C0004  row4     103.0  2018-01-18 14:00:03    xxxxxx
# 4  6214C000402  C0004  row5     102.0  2018-01-18 14:00:02    xxxxxx
# 2  6214C000401  C0004  row3     101.0  2018-01-18 14:00:01    xxxxxx
# 0  6214C000101  C0001  row1     100.0  2018-01-18 14:00:00    xxxxxx
# 1  6214C000201  C0002  row2     100.0  2018-01-18 14:00:00    xxxxxx
# 5  6214C000301  C0003  row6     100.0  2018-01-18 14:00:00    xxxxxx

# Sort by several columns: custID ascending, then tradeDate descending.
print(df2.sort_values(by=['custID', 'tradeDate'], ascending=[True, False]))
#      accountID custID  mark  tradeAmt            tradeDate tradeDesc
# 0  6214C000101  C0001  row1     100.0  2018-01-18 14:00:00    xxxxxx
# 1  6214C000201  C0002  row2     100.0  2018-01-18 14:00:00    xxxxxx
# 5  6214C000301  C0003  row6     100.0  2018-01-18 14:00:00    xxxxxx
# 3  6214C000403  C0004  row4     103.0  2018-01-18 14:00:03    xxxxxx
# 4  6214C000402  C0004  row5     102.0  2018-01-18 14:00:02    xxxxxx
# 2  6214C000401  C0004  row3     101.0  2018-01-18 14:00:01    xxxxxx

# Sort by the row labels (axis=0), descending.
print(df2.sort_index(axis=0, ascending=False))
#      accountID custID  mark  tradeAmt            tradeDate tradeDesc
# 5  6214C000301  C0003  row6     100.0  2018-01-18 14:00:00    xxxxxx
# 4  6214C000402  C0004  row5     102.0  2018-01-18 14:00:02    xxxxxx
# 3  6214C000403  C0004  row4     103.0  2018-01-18 14:00:03    xxxxxx
# 2  6214C000401  C0004  row3     101.0  2018-01-18 14:00:01    xxxxxx
# 1  6214C000201  C0002  row2     100.0  2018-01-18 14:00:00    xxxxxx
# 0  6214C000101  C0001  row1     100.0  2018-01-18 14:00:00    xxxxxx

# Sort by the column labels (axis=1), ascending, i.e. alphabetically by name.
print(df2.sort_index(axis=1, ascending=True))
#      accountID custID  mark  tradeAmt            tradeDate tradeDesc
# 0  6214C000101  C0001  row1     100.0  2018-01-18 14:00:00    xxxxxx
# 1  6214C000201  C0002  row2     100.0  2018-01-18 14:00:00    xxxxxx
# 2  6214C000401  C0004  row3     101.0  2018-01-18 14:00:01    xxxxxx
# 3  6214C000403  C0004  row4     103.0  2018-01-18 14:00:03    xxxxxx
# 4  6214C000402  C0004  row5     102.0  2018-01-18 14:00:02    xxxxxx
# 5  6214C000301  C0003  row6     100.0  2018-01-18 14:00:00    xxxxxx

print('------------------------------------------------------------------------------------')
# --- Selection --------------------------------------------------------------
# iloc selects by integer position, loc selects by label.
# iat  selects a single cell by integer position, at selects it by label.
#
# NOTE: the positional column examples below follow the order of df2.columns;
# the sample outputs were captured with the columns in alphabetical order
# (accountID, custID, mark, tradeAmt, tradeDate, tradeDesc).

# A single column, returned as a Series.
print(df2['custID'])
# 0    C0001
# 1    C0002
# 2    C0004
# 3    C0004
# 4    C0004
# 5    C0003
# Name: custID, dtype: object

# Slicing with [] selects rows by position.
print(df2[0:4])
#      accountID custID  mark  tradeAmt            tradeDate tradeDesc
# 0  6214C000101  C0001  row1     100.0  2018-01-18 14:00:00    xxxxxx
# 1  6214C000201  C0002  row2     100.0  2018-01-18 14:00:00    xxxxxx
# 2  6214C000401  C0004  row3     101.0  2018-01-18 14:00:01    xxxxxx
# 3  6214C000403  C0004  row4     103.0  2018-01-18 14:00:03    xxxxxx

print(df2[1:4])
#      accountID custID  mark  tradeAmt            tradeDate tradeDesc
# 1  6214C000201  C0002  row2     100.0  2018-01-18 14:00:00    xxxxxx
# 2  6214C000401  C0004  row3     101.0  2018-01-18 14:00:01    xxxxxx
# 3  6214C000403  C0004  row4     103.0  2018-01-18 14:00:03    xxxxxx

# loc looks up by row/column *labels*, not by position.
print(df2.loc[1, 'accountID'])
# 6214C000201

# The 4th row (position 3), returned as a Series.
print(df2.iloc[3])
# accountID            6214C000403
# custID                     C0004
# mark                        row4
# tradeAmt                     103
# tradeDate    2018-01-18 14:00:03
# tradeDesc                 xxxxxx
# Name: 3, dtype: object

# The cell at 4th row, 5th column.
print(df2.iloc[3, 4])
# 2018-01-18 14:00:03

# Positions 3 up to (but not including) 4, i.e. only the 4th row.
print(df2.iloc[3:4])
#      accountID custID  mark  tradeAmt            tradeDate tradeDesc
# 3  6214C000403  C0004  row4     103.0  2018-01-18 14:00:03    xxxxxx

# 4th and 5th rows, 2nd and 3rd columns (slice form).
print(df2.iloc[3:5, 1:3])
#   custID  mark
# 3  C0004  row4
# 4  C0004  row5

# The same selection using explicit position lists.
print(df2.iloc[[3, 4], [1, 2]])
#   custID  mark
# 3  C0004  row4
# 4  C0004  row5

# 4th and 5th rows, all columns.
print(df2.iloc[3:5, :])
#      accountID custID  mark  tradeAmt            tradeDate tradeDesc
# 3  6214C000403  C0004  row4     103.0  2018-01-18 14:00:03    xxxxxx
# 4  6214C000402  C0004  row5     102.0  2018-01-18 14:00:02    xxxxxx

# All rows, 2nd and 3rd columns.
print(df2.iloc[:, 1:3])
#   custID  mark
# 0  C0001  row1
# 1  C0002  row2
# 2  C0004  row3
# 3  C0004  row4
# 4  C0004  row5
# 5  C0003  row6

# Boolean filtering: keep rows whose tradeAmt exceeds 101.0.
print(df2[df2.tradeAmt > 101.0])
#      accountID custID  mark  tradeAmt            tradeDate tradeDesc
# 3  6214C000403  C0004  row4     103.0  2018-01-18 14:00:03    xxxxxx
# 4  6214C000402  C0004  row5     102.0  2018-01-18 14:00:02    xxxxxx
print('------------------------------------------------------------------------------------')

# --- Updating values --------------------------------------------------------
# Work on a copy so df2 itself stays untouched for the later examples.
df3 = df2.copy()

# Replace a whole column.
df3["custID"] = ["NEW"] * len(df3)

# Update through loc (label-based): every row of the 'tradeAmt' column.
df3.loc[:, 'tradeAmt'] = range(len(df3))

# Update a single cell through at (label-based): row label 1.
df3.at[1, 'accountID'] = '==========='

# Update a single cell through iat (position-based). The column position is
# resolved from its name so the example hits 'accountID' regardless of the
# column order of df2.
df3.iat[0, df3.columns.get_loc('accountID')] = '+++++++++++'

# Rows matching a condition can also be selected and negated in one go, but
# only when every column is numeric:
#   df3[df3.tradeDate == '2018-01-18 14:00:03'] = -df3
print(df3)
# (sample output from the original author; how tradeAmt is displayed may
# differ between pandas versions)
#      accountID custID  mark  tradeAmt            tradeDate tradeDesc
# 0  +++++++++++    NEW  row1         0  2018-01-18 14:00:00    xxxxxx
# 1  ===========    NEW  row2         1  2018-01-18 14:00:00    xxxxxx
# 2  6214C000401    NEW  row3         2  2018-01-18 14:00:01    xxxxxx
# 3  6214C000403    NEW  row4         3  2018-01-18 14:00:03    xxxxxx
# 4  6214C000402    NEW  row5         4  2018-01-18 14:00:02    xxxxxx
# 5  6214C000301    NEW  row6         5  2018-01-18 14:00:00    xxxxxx
print('------------------------------------------------------------------------------------')

# --- Reindexing and missing data --------------------------------------------
# Extract a subset: rows labelled 0..3 and three chosen columns.
df4 = df2.reindex(index=range(4), columns=['custID', 'accountID', 'tradeAmt'])

# Assigning to an existing column updates it ...
df4.loc[0:1, 'tradeAmt'] = 200
# ... while assigning to a missing column creates it; rows that were not
# assigned are left as NaN.
df4.loc[0:1, 'newColumn'] = 1
print(df4)
#   custID    accountID  tradeAmt  newColumn
# 0  C0001  6214C000101     200.0        1.0
# 1  C0002  6214C000201     200.0        1.0
# 2  C0004  6214C000401     101.0        NaN
# 3  C0004  6214C000403     103.0        NaN

# Drop every row that contains at least one missing value.
print(df4.dropna(how='any'))
#   custID    accountID  tradeAmt  newColumn
# 0  C0001  6214C000101     200.0        1.0
# 1  C0002  6214C000201     200.0        1.0

# Fill missing values with a constant.
print(df4.fillna(value=999))
#   custID    accountID  tradeAmt  newColumn
# 0  C0001  6214C000101     200.0        1.0
# 1  C0002  6214C000201     200.0        1.0
# 2  C0004  6214C000401     101.0      999.0
# 3  C0004  6214C000403     103.0      999.0

# Boolean mask marking the missing values.
print(pd.isnull(df4))
#   custID  accountID  tradeAmt  newColumn
# 0  False      False     False      False
# 1  False      False     False      False
# 2  False      False     False       True
# 3  False      False     False       True
print('------------------------------------------------------------------------------------')

# --- Basic statistics and shifting ------------------------------------------
print(df2)
#      accountID custID  mark  tradeAmt            tradeDate tradeDesc
# 0  6214C000101  C0001  row1     100.0  2018-01-18 14:00:00    xxxxxx
# 1  6214C000201  C0002  row2     100.0  2018-01-18 14:00:00    xxxxxx
# 2  6214C000401  C0004  row3     101.0  2018-01-18 14:00:01    xxxxxx
# 3  6214C000403  C0004  row4     103.0  2018-01-18 14:00:03    xxxxxx
# 4  6214C000402  C0004  row5     102.0  2018-01-18 14:00:02    xxxxxx
# 5  6214C000301  C0003  row6     100.0  2018-01-18 14:00:00    xxxxxx

# Column means. numeric_only=True restricts the reduction to numeric columns;
# without it, recent pandas versions raise on the string columns instead of
# silently skipping them.
print(df2.mean(numeric_only=True))
# tradeAmt    101.0
# dtype: float64

# shift(2) moves the values down by two rows; the vacated leading rows
# become NaN.
s = pd.Series([1, 3, 5, np.nan, 6, 8], index=range(6)).shift(2)
print(s)
# 0    NaN
# 1    NaN
# 2    1.0
# 3    3.0
# 4    5.0
# 5    NaN
# dtype: float64

# The same operation applied to every column of a DataFrame.
print(df2.shift(2))
#      accountID custID  mark  tradeAmt            tradeDate tradeDesc
# 0          NaN    NaN   NaN       NaN                  NaN       NaN
# 1          NaN    NaN   NaN       NaN                  NaN       NaN
# 2  6214C000101  C0001  row1     100.0  2018-01-18 14:00:00    xxxxxx
# 3  6214C000201  C0002  row2     100.0  2018-01-18 14:00:00    xxxxxx
# 4  6214C000401  C0004  row3     101.0  2018-01-18 14:00:01    xxxxxx
# 5  6214C000403  C0004  row4     103.0  2018-01-18 14:00:03    xxxxxx
print('------------------------------------------------------------------------------------')

# --- apply ------------------------------------------------------------------
# apply calls the function once per column; a lambda or a named function both
# work.
print(df2.apply(lambda x: max(x)))
# accountID            6214C000403
# custID                     C0004
# mark                        row6
# tradeAmt                     103
# tradeDate    2018-01-18 14:00:03
# tradeDesc                 xxxxxx
# dtype: object

print('------------------------------------------------------------------------------------')

# --- value_counts -----------------------------------------------------------
# Occurrences of each distinct value, similar to SQL "GROUP BY ... COUNT(*)".
print(df2["custID"].value_counts())
# (sample output from the original author; layout varies by pandas version)
# C0004    3
# C0001    1
# C0002    1
# C0003    1
# Name: custID, dtype: int64

print('------------------------------------------------------------------------------------')

# --- String methods ---------------------------------------------------------
# The .str accessor applies string operations element-wise; here upper-casing.
print(df2["mark"].str.upper())
# 0    ROW1
# 1    ROW2
# 2    ROW3
# 3    ROW4
# 4    ROW5
# 5    ROW6
# Name: mark, dtype: object
print('------------------------------------------------------------------------------------')

# --- Concatenation ----------------------------------------------------------
# 9x3 frame of random normal values (the numbers below are one sample run).
df5 = pd.DataFrame(np.random.randn(9, 3))
print(df5)
#           0         1         2
# 0  1.303158 -0.125934 -0.205285
# 1  0.760388 -1.004298  1.143800
# 2  2.063722  0.229955  0.020368
# 3 -2.024974  0.307957 -0.579090
# 4 -1.571883  0.260561 -0.884209
# 5  2.465572 -1.001873  1.243028
# 6  0.025388 -0.372608  1.431214
# 7 -0.079416 -0.401075 -0.973337
# 8 -1.088755 -1.947188 -1.100827

# Cut out a head, a middle and a tail slice to glue back together.
pieces = [df5[:2], df5[5:6], df5[7:]]
print(pieces)
# Printing the list shows each piece separately, so the column header is
# repeated once per piece:
# [          0         1         2
# 0  1.303158 -0.125934 -0.205285
# 1  0.760388 -1.004298  1.143800,           0         1         2
# 5  2.465572 -1.001873  1.243028,           0         1         2
# 7 -0.079416 -0.401075 -0.973337
# 8 -1.088755 -1.947188 -1.100827]

# Stack the pieces row-wise; the original row labels are kept.
print(pd.concat(pieces))
#           0         1         2
# 0  1.303158 -0.125934 -0.205285
# 1  0.760388 -1.004298  1.143800
# 5  2.465572 -1.001873  1.243028
# 7 -0.079416 -0.401075 -0.973337
# 8 -1.088755 -1.947188 -1.100827
print('------------------------------------------------------------------------------------')

# --- Merging (SQL-style joins) ----------------------------------------------
df_left = pd.DataFrame({'key': ['001', '002', '007'], 'val': ['999', '1', '2']})
df_right = pd.DataFrame({
    'key': ['001', '002', '009'],
    'key2': ['001', '002', '009'],
    'val': ['999', '3', '4'],
})

print(df_left)
#    key  val
# 0  001  999
# 1  002    1
# 2  007    2
print(df_right)
#    key key2  val
# 0  001  001  999
# 1  002  002    3
# 2  009  009    4

# Inner join on a shared column; clashing column names get _x / _y suffixes.
print(pd.merge(df_left, df_right, how='inner', on='key'))
#    key val_x key2 val_y
# 0  001   999  001   999
# 1  002     1  002     3

# Inner join on differently named columns.
print(pd.merge(df_left, df_right, how='inner', left_on='key', right_on='key2'))
#   key_x val_x key_y key2 val_y
# 0   001   999   001  001   999
# 1   002     1   002  002     3

# Inner join on several columns at once.
print(pd.merge(df_left, df_right, how='inner', on=['key', 'val']))
#    key  val key2
# 0  001  999  001

# Left outer join: every left row is kept, unmatched right columns are NaN.
print(pd.merge(df_left, df_right, how='left', on='key'))
#    key val_x key2 val_y
# 0  001   999  001   999
# 1  002     1  002     3
# 2  007     2  NaN   NaN

# Right outer join: every right row is kept, unmatched left columns are NaN.
print(pd.merge(df_left, df_right, how='right', on='key'))
#    key val_x key2 val_y
# 0  001   999  001   999
# 1  002     1  002     3
# 2  009   NaN  009     4
print('------------------------------------------------------------------------------------')

# --- Appending rows ---------------------------------------------------------
# pd.concat is used for appending: DataFrame.append was deprecated and later
# removed from pandas.

# Take the first three rows of the frame and append them to the frame itself.
# ignore_index=True discards the labels of the inputs and renumbers the
# result 0..n-1.
print(pd.concat([df2, df2[:3]], ignore_index=True))
#      accountID custID  mark  tradeAmt            tradeDate tradeDesc
# 0  6214C000101  C0001  row1     100.0  2018-01-18 14:00:00    xxxxxx
# 1  6214C000201  C0002  row2     100.0  2018-01-18 14:00:00    xxxxxx
# 2  6214C000401  C0004  row3     101.0  2018-01-18 14:00:01    xxxxxx
# 3  6214C000403  C0004  row4     103.0  2018-01-18 14:00:03    xxxxxx
# 4  6214C000402  C0004  row5     102.0  2018-01-18 14:00:02    xxxxxx
# 5  6214C000301  C0003  row6     100.0  2018-01-18 14:00:00    xxxxxx
# 6  6214C000101  C0001  row1     100.0  2018-01-18 14:00:00    xxxxxx  (appended)
# 7  6214C000201  C0002  row2     100.0  2018-01-18 14:00:00    xxxxxx  (appended)
# 8  6214C000401  C0004  row3     101.0  2018-01-18 14:00:01    xxxxxx  (appended)

# ignore_index=False keeps the labels of the appended slice; note that index
# labels are allowed to repeat.
print(pd.concat([df2, df2[:3]], ignore_index=False))
#      accountID custID  mark  tradeAmt            tradeDate tradeDesc
# 0  6214C000101  C0001  row1     100.0  2018-01-18 14:00:00    xxxxxx
# 1  6214C000201  C0002  row2     100.0  2018-01-18 14:00:00    xxxxxx
# 2  6214C000401  C0004  row3     101.0  2018-01-18 14:00:01    xxxxxx
# 3  6214C000403  C0004  row4     103.0  2018-01-18 14:00:03    xxxxxx
# 4  6214C000402  C0004  row5     102.0  2018-01-18 14:00:02    xxxxxx
# 5  6214C000301  C0003  row6     100.0  2018-01-18 14:00:00    xxxxxx
# 0  6214C000101  C0001  row1     100.0  2018-01-18 14:00:00    xxxxxx  (appended)
# 1  6214C000201  C0002  row2     100.0  2018-01-18 14:00:00    xxxxxx  (appended)
# 2  6214C000401  C0004  row3     101.0  2018-01-18 14:00:01    xxxxxx  (appended)
print('------------------------------------------------------------------------------------')

# --- MultiIndex, stack and unstack ------------------------------------------
# zip() pairs up the elements of the two lists position by position; list()
# materialises those pairs as a list of tuples.
tuples = list(zip(
    ['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'],
    ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two'],
))
# Two-level row index built from the tuples.
index = pd.MultiIndex.from_tuples(tuples, names=['first', 'second'])

# Random values, so the numbers below are just one sample run.
df6 = pd.DataFrame(np.random.randn(8, 2), index=index, columns=['A', 'B'])
print(df6)
#                      A         B
# first second
# bar   one    -0.101234 -0.956210
#       two    -0.480354  1.308950
# baz   one     0.943706  0.976480
#       two    -0.788852 -1.556547
# foo   one     0.997527 -0.337391
#       two    -0.191448 -0.083129
# qux   one    -0.919527 -0.414051
#       two    -0.579727  1.595290

# stack() turns the tabular layout into a "stacked" one: the column labels
# are appended to the row labels as an additional, innermost index level.
stacked = df6.stack()
print(stacked)
# first  second
# bar    one     A   -0.101234
#                B   -0.956210
#        two     A   -0.480354
#                B    1.308950
# baz    one     A    0.943706
#                B    0.976480
#        two     A   -0.788852
#                B   -1.556547
# foo    one     A    0.997527
#                B   -0.337391
#        two     A   -0.191448
#                B   -0.083129
# qux    one     A   -0.919527
#                B   -0.414051
#        two     A   -0.579727
#                B    1.595290
# dtype: float64

# A stacked Series can be addressed level by level, much like a nested array.
# A single .loc lookup with the full key tuple fetches the scalar directly.
print(stacked.loc[("bar", "one", "A")])
# -0.101233870095

# unstack() reverses stack(): the innermost row level becomes columns again.
unstacked = stacked.unstack()
print(unstacked)
#                      A         B
# first second
# bar   one    -0.101234 -0.956210
#       two    -0.480354  1.308950
# baz   one     0.943706  0.976480
#       two    -0.788852 -1.556547
# foo   one     0.997527 -0.337391
#       two    -0.191448 -0.083129
# qux   one    -0.919527 -0.414051
#       two    -0.579727  1.595290

# Further row levels can be moved to the columns as well; here level 0
# ('first').
unstacked_unstacked_0 = unstacked.unstack(0)
print(unstacked_unstacked_0)
#                A                                       B
# first        bar       baz       foo       qux       bar       baz       foo       qux
# second
# one    -0.101234  0.943706  0.997527 -0.919527 -0.956210  0.976480 -0.337391 -0.414051
# two    -0.480354 -0.788852 -0.191448 -0.579727  1.308950 -1.556547 -0.083129  1.595290

# ... and here level 1 ('second').
unstacked_unstacked_1 = unstacked.unstack(1)
print(unstacked_unstacked_1)
#                A                   B
# second       one       two       one       two
# first
# bar    -0.101234 -0.480354 -0.956210  1.308950
# baz     0.943706 -0.788852  0.976480 -1.556547
# foo     0.997527 -0.191448 -0.337391 -0.083129
# qux    -0.919527 -0.579727 -0.414051  1.595290
print('------------------------------------------------------------------------------------')

# --- Pivot tables -----------------------------------------------------------
# Three label columns (A, B, C) plus two random value columns (D, E); the
# numbers below are one sample run.
df7 = pd.DataFrame({
    'A': ['one', 'one', 'two', 'three'] * 3,
    'B': ['A', 'B', 'C'] * 4,
    'C': ['foo', 'foo', 'foo', 'bar', 'bar', 'bar'] * 2,
    'D': np.random.randn(12),
    'E': np.random.randn(12),
})

print(df7)
#         A  B    C         D         E
# 0     one  A  foo -0.516297 -0.860641
# 1     one  B  foo -1.560483 -1.647366
# 2     two  C  foo  1.124756  0.329971
# 3   three  A  bar -0.312954  0.040263
# 4     one  B  bar -1.355079  0.358829
# 5     one  C  bar  0.749617  0.978513
# 6     two  A  foo -2.173830  0.434789
# 7   three  B  foo -1.070213  0.641253
# 8     one  C  foo -0.515032  0.127273
# 9     one  A  bar -1.408970  0.025128
# 10    two  B  bar -0.390044  0.060392
# 11  three  C  bar  0.067667  0.676595

# Pivot table: rows keyed by (A, B), one column per distinct value of C,
# cells taken from D. Combinations that do not occur show up as NaN.
print(pd.pivot_table(df7, values='D', index=['A', 'B'], columns=['C']))
# C             bar       foo
# A     B
# one   A -1.408970 -0.516297
#       B -1.355079 -1.560483
#       C  0.749617 -0.515032
# three A -0.312954       NaN
#       B       NaN -1.070213
#       C  0.067667       NaN
# two   A       NaN -2.173830
#       B -0.390044       NaN
#       C       NaN  1.124756
print('------------------------------------------------------------------------------------')

# --- Time series ------------------------------------------------------------
# Sample outputs below were captured by the original author; the dtype
# (int32 vs. int64) and the spelling of the frequency codes depend on the
# platform and pandas version.

# Ten timestamps one minute apart; the result is a DatetimeIndex.
rng = pd.date_range('1/1/2012', periods=10, freq='min')
print(rng)
# DatetimeIndex(['2012-01-01 00:00:00', '2012-01-01 00:01:00',
#                '2012-01-01 00:02:00', '2012-01-01 00:03:00',
#                '2012-01-01 00:04:00', '2012-01-01 00:05:00',
#                '2012-01-01 00:06:00', '2012-01-01 00:07:00',
#                '2012-01-01 00:08:00', '2012-01-01 00:09:00'],
#               dtype='datetime64[ns]', freq='T')

# A Series indexed by those timestamps, i.e. time-series data.
ts = pd.Series(range(10), index=rng)
print(ts)
# 2012-01-01 00:00:00    0
# 2012-01-01 00:01:00    1
# 2012-01-01 00:02:00    2
# 2012-01-01 00:03:00    3
# 2012-01-01 00:04:00    4
# 2012-01-01 00:05:00    5
# 2012-01-01 00:06:00    6
# 2012-01-01 00:07:00    7
# 2012-01-01 00:08:00    8
# 2012-01-01 00:09:00    9
# Freq: T, dtype: int32

# resample() regroups time-series data into new time buckets; here the
# values are summed per 5-minute bucket.
print(ts.resample('5min').sum())
# 2012-01-01 00:00:00    10
# 2012-01-01 00:05:00    35
# Freq: 5T, dtype: int32

# Attach the UTC time zone to the (so far zone-less) timestamps.
ts_utc = ts.tz_localize('UTC')
print(ts_utc)
# 2012-01-01 00:00:00+00:00    0
# 2012-01-01 00:01:00+00:00    1
# 2012-01-01 00:02:00+00:00    2
# 2012-01-01 00:03:00+00:00    3
# 2012-01-01 00:04:00+00:00    4
# 2012-01-01 00:05:00+00:00    5
# 2012-01-01 00:06:00+00:00    6
# 2012-01-01 00:07:00+00:00    7
# 2012-01-01 00:08:00+00:00    8
# 2012-01-01 00:09:00+00:00    9
# Freq: T, dtype: int32

# Convert the zone-aware timestamps to another time zone.
print(ts_utc.tz_convert('US/Eastern'))
# 2011-12-31 19:00:00-05:00    0
# 2011-12-31 19:01:00-05:00    1
# 2011-12-31 19:02:00-05:00    2
# 2011-12-31 19:03:00-05:00    3
# 2011-12-31 19:04:00-05:00    4
# 2011-12-31 19:05:00-05:00    5
# 2011-12-31 19:06:00-05:00    6
# 2011-12-31 19:07:00-05:00    7
# 2011-12-31 19:08:00-05:00    8
# 2011-12-31 19:09:00-05:00    9
# Freq: T, dtype: int32

# Convert timestamps to periods; labels are shown only down to the unit of
# the index frequency (minutes here).
print(ts.to_period())
# 2012-01-01 00:00    0
# 2012-01-01 00:01    1
# 2012-01-01 00:02    2
# 2012-01-01 00:03    3
# 2012-01-01 00:04    4
# 2012-01-01 00:05    5
# 2012-01-01 00:06    6
# 2012-01-01 00:07    7
# 2012-01-01 00:08    8
# 2012-01-01 00:09    9
# Freq: T, dtype: int32

# ... and back from periods to full timestamps.
print(ts.to_period().to_timestamp())
# 2012-01-01 00:00:00    0
# 2012-01-01 00:01:00    1
# 2012-01-01 00:02:00    2
# 2012-01-01 00:03:00    3
# 2012-01-01 00:04:00    4
# 2012-01-01 00:05:00    5
# 2012-01-01 00:06:00    6
# 2012-01-01 00:07:00    7
# 2012-01-01 00:08:00    8
# 2012-01-01 00:09:00    9
# Freq: T, dtype: int32
print('------------------------------------------------------------------------------------')

# --- Categorical data -------------------------------------------------------
df = pd.DataFrame({"id": [1, 2, 3, 4, 5, 6], "raw_grade": ['a', 'b', 'b', 'a', 'a', 'e']})
# New column holding the same values as a category dtype (a category is a
# label drawn from a fixed set of values).
df["grade"] = df["raw_grade"].astype("category")

print(df["grade"])
# 0    a
# 1    b
# 2    b
# 3    a
# 4    a
# 5    e
# Name: grade, dtype: category

# Give the existing categories (a, b, e) meaningful names. rename_categories
# returns a new Series; assigning to .cat.categories in place is no longer
# supported by current pandas.
df["grade"] = df["grade"].cat.rename_categories(["very good", "good", "very bad"])
# Redefine the full set of categories (and their order); this replaces the
# previous set and may introduce categories that no row uses yet.
df["grade"] = df["grade"].cat.set_categories(["very bad", "bad", "medium", "good", "very good"])
print(df["grade"])
# 0    very good
# 1         good
# 2         good
# 3    very good
# 4    very good
# 5     very bad
# Name: grade, dtype: category
# Categories (5, object): [very bad, bad, medium, good, very good]

# Row count per category. observed=False keeps the categories that have no
# rows (count 0) in the result.
print(df.groupby("grade", observed=False).size())
# grade
# very bad     1
# bad          0
# medium       0
# good         2
# very good    3
# dtype: int64
print('------------------------------------------------------------------------------------')

# --- Plotting ---------------------------------------------------------------
# 1000 daily timestamps starting 2000-01-01, each with a random value.
ts = pd.Series(np.random.randn(1000), index=pd.date_range('1/1/2000', periods=1000))

# Running (cumulative) sum of the random values.
ts = ts.cumsum()

print(ts)
# Some environments already display the figure at this step ...
ts.plot()
# ... others need matplotlib.pyplot's show() to open the figure window.
plt.show()
# The figure is a single curve: x axis = the 1000 days, y axis = the
# cumulative sum on each day.

# Four columns of random values sharing the same daily index.
df = pd.DataFrame(np.random.randn(1000, 4), index=ts.index, columns=['A', 'B', 'C', 'D'])
# Cumulative sum per column.
df = df.cumsum()
df.plot()
plt.show()
# The figure has four curves: x axis = the 1000 days, y axis = the cumulative
# sum of each column on each day.

账号		自动登录	找回密码
密码			立即注册

大疆运维招人啦，

C++ :try 语句块和异常处理

C++的多态

Red Hat RHCE 8 (EX294) Cert Guide

Java/C++ 区别：看完这一篇，就够用！

别再用过时库了！这 13 个顶级 C++ 库才是

c++ size_t 和 int 的区别

[经验分享] Python练手，pandas

浏览过的版块

扫码加入运维网微信交流群