Creating arrays
import numpy as np

np.array([10, 11, 12, 13])
# [10 11 12 13]

np.array([10, 11, 12, 13, 14, 15]).reshape([2, 3])
# [[10 11 12]
#  [13 14 15]]

np.array([[1, 2], [3, 4]])
# [[1 2]
#  [3 4]]

np.arange(4)     # [0 1 2 3]
np.arange(2, 6)  # [2 3 4 5]

np.arange(4).reshape([2, 2])
# [[0 1]
#  [2 3]]

np.random.random([2, 3])
# [[ 0.00136044  0.46854718  0.59149907]
#  [ 0.75636339  0.18204628  0.53191402]]
Computations
arr = np.array([10, 11, 12, 13, 14, 15]).reshape([2, 3])
# [[10 11 12]
#  [13 14 15]]

# Sum
np.sum(arr, axis=0)  # [23 25 27]
np.sum(arr, axis=1)  # [33 42]

# Minimum
np.min(arr, axis=0)  # [10 11 12]
np.min(arr, axis=1)  # [10 13]

# Maximum
np.max(arr, axis=0)  # [13 14 15]
np.max(arr, axis=1)  # [12 15]

# Index of the minimum/maximum value (flattened index)
np.argmin(arr)  # 0
np.argmax(arr)  # 5

# Mean
arr.mean()       # 12.5, same as np.mean(arr)
np.average(arr)  # 12.5

# Cumulative sum
np.cumsum(arr)  # [10 21 33 46 60 75]

# Differences between adjacent elements
np.diff(arr)
# [[1 1]
#  [1 1]]

# Clipping: values below 11 become 11, values above 14 become 14, the rest are unchanged
np.clip(arr, 11, 14)
# [[11 11 12]
#  [13 14 14]]
Indexing
arr = np.arange(3, 15).reshape([3, 4])
# [[ 3  4  5  6]
#  [ 7  8  9 10]
#  [11 12 13 14]]

arr[1, 1]      # 8, same as arr[1][1]
arr[:, 1]      # [ 4  8 12]
arr[1, :]      # [ 7  8  9 10]
arr[1, 1:3]    # [8 9]
arr.flatten()  # [ 3  4  5  6  7  8  9 10 11 12 13 14]

for i in arr.flat:  # arr.flat is an iterator over every element
    print(i)        # prints one value per line
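Besides slicing, NumPy arrays also support boolean-mask indexing; a brief sketch on the same arr (the thresholds here are just illustrative values):

mask = arr > 9      # element-wise comparison yields a boolean array
arr[mask]           # [10 11 12 13 14] -- elements where the mask is True
arr[arr % 2 == 0]   # [ 4  6  8 10 12 14] -- even elements only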
Merging
A = np.array([1, 1, 1])
B = np.array([2, 2, 2])

np.vstack((A, B))  # vertical stack
# [[1 1 1]
#  [2 2 2]]

np.hstack((A, B))  # horizontal stack
# [1 1 1 2 2 2]
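For 2-D arrays, np.concatenate with an explicit axis is the general form of vstack/hstack; a small sketch with hypothetical arrays C and D:

C = np.arange(4).reshape([2, 2])
D = np.arange(4, 8).reshape([2, 2])

np.concatenate((C, D), axis=0)  # same as np.vstack for 2-D arrays
# [[0 1]
#  [2 3]
#  [4 5]
#  [6 7]]

np.concatenate((C, D), axis=1)  # same as np.hstack for 2-D arrays
# [[0 1 4 5]
#  [2 3 6 7]]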
Splitting
arr = np.arange(12).reshape([3, 4])
# [[ 0  1  2  3]
#  [ 4  5  6  7]
#  [ 8  9 10 11]]

np.split(arr, 2, axis=1)  # split into 2 equal parts along the columns
# [array([[0, 1],
#         [4, 5],
#         [8, 9]]),
#  array([[ 2,  3],
#         [ 6,  7],
#         [10, 11]])]
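np.split raises an error when the axis does not divide evenly; np.array_split allows unequal parts. A short sketch on the same arr:

np.array_split(arr, 3, axis=1)  # 4 columns into 3 parts: sizes 2, 1, 1
# [array([[0, 1],
#         [4, 5],
#         [8, 9]]),
#  array([[ 2],
#         [ 6],
#         [10]]),
#  array([[ 3],
#         [ 7],
#         [11]])]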
Pandas
Import
import pandas as pd
API
Creating Series and DataFrames
pd.Series([1, 3, 6, np.nan, 44, 1])
# 0     1.0
# 1     3.0
# 2     6.0
# 3     NaN
# 4    44.0
# 5     1.0
# dtype: float64

pd.date_range('20171108', periods=6)
# DatetimeIndex(['2017-11-08', '2017-11-09', '2017-11-10',
#                '2017-11-11', '2017-11-12', '2017-11-13'],
#               dtype='datetime64[ns]', freq='D')

dates = pd.date_range('20171108', periods=6)
pd.DataFrame(np.random.randn(6, 4), index=dates, columns=['a', 'b', 'c', 'd'])
#                    a         b         c         d
# 2017-11-08  0.644350  1.122020 -1.263401  0.163371
# 2017-11-09  0.573329 -0.242054 -0.342220  1.070905
# 2017-11-10  0.714291 -0.721509 -2.298672 -0.513572
# 2017-11-11 -0.614927  2.010482 -1.369179 -0.901276
# 2017-11-12  0.709672 -0.430620  1.070244 -2.308874
# 2017-11-13  1.284080  1.169807  1.668942  0.859300

pd.DataFrame({
    'A': 1.,
    'B': pd.Timestamp('20171108'),
    'C': pd.Series(1, index=list(range(4)), dtype='float32'),
    'D': np.array([3] * 4, dtype='int32'),
    'E': pd.Categorical(['test', 'train', 'test', 'train']),
    'F': 'foo'
})
#      A          B    C  D      E    F
# 0  1.0 2017-11-08  1.0  3   test  foo
# 1  1.0 2017-11-08  1.0  3  train  foo
# 2  1.0 2017-11-08  1.0  3   test  foo
# 3  1.0 2017-11-08  1.0  3  train  foo
Selection
datas = pd.DataFrame({
    'A': 1.,
    'B': pd.Timestamp('20171108'),
    'C': pd.Series(1, index=list(range(4)), dtype='float32'),
    'D': np.array([3] * 4, dtype='int32'),
    'E': pd.Categorical(['test', 'train', 'test', 'train']),
    'F': 'foo'
})
#      A          B    C  D      E    F
# 0  1.0 2017-11-08  1.0  3   test  foo
# 1  1.0 2017-11-08  1.0  3  train  foo
# 2  1.0 2017-11-08  1.0  3   test  foo
# 3  1.0 2017-11-08  1.0  3  train  foo

datas.A  # same as datas['A']
# 0    1.0
# 1    1.0
# 2    1.0
# 3    1.0
# Name: A, dtype: float64

datas[0:3]
#      A          B    C  D      E    F
# 0  1.0 2017-11-08  1.0  3   test  foo
# 1  1.0 2017-11-08  1.0  3  train  foo
# 2  1.0 2017-11-08  1.0  3   test  foo

datas.loc[0]  # if the index were dates like '2017-11-08', you would write datas.loc['20171108']
# A                      1
# B    2017-11-08 00:00:00
# C                      1
# D                      3
# E                   test
# F                    foo
# Name: 0, dtype: object

datas.loc[:, ['A', 'B']]
#      A          B
# 0  1.0 2017-11-08
# 1  1.0 2017-11-08
# 2  1.0 2017-11-08
# 3  1.0 2017-11-08

datas.loc[[1, 3], ['A', 'B']]
#      A          B
# 1  1.0 2017-11-08
# 3  1.0 2017-11-08

# iloc selects by integer position, loc selects by index label,
# and ix was a hybrid of the two (accepts either labels or positions; deprecated in modern pandas):
# datas.iloc[1]
# datas.ix[1]
# datas.ix['20171108']  # when the index label is 2017-11-08

datas[datas.E == 'test']
#      A          B    C  D     E    F
# 0  1.0 2017-11-08  1.0  3  test  foo
# 2  1.0 2017-11-08  1.0  3  test  foo

datas.index    # Int64Index([0, 1, 2, 3], dtype='int64')
datas.columns  # Index([u'A', u'B', u'C', u'D', u'E', u'F'], dtype='object')
datas.values
# array([[1.0, Timestamp('2017-11-08 00:00:00'), 1.0, 3, 'test', 'foo'],
#        [1.0, Timestamp('2017-11-08 00:00:00'), 1.0, 3, 'train', 'foo'],
#        [1.0, Timestamp('2017-11-08 00:00:00'), 1.0, 3, 'test', 'foo'],
#        [1.0, Timestamp('2017-11-08 00:00:00'), 1.0, 3, 'train', 'foo']],
#       dtype=object)
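To make the loc/iloc distinction concrete, here is a small sketch using a frame with a date index (the frame df and its values are hypothetical, not from the original):

df = pd.DataFrame(np.arange(8).reshape(4, 2),
                  index=pd.date_range('20171108', periods=4),
                  columns=['x', 'y'])

df.loc['20171109']  # by label: the row whose index is 2017-11-09
# x    2
# y    3
# Name: 2017-11-09 00:00:00, dtype: int64

df.iloc[1]          # by position: the second row -- the same row here
# x    2
# y    3
# Name: 2017-11-09 00:00:00, dtype: int64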
Sorting
datas.sort_index(axis=1, ascending=False)  # sort the columns in descending order
#      F      E  D    C          B    A
# 0  foo   test  3  1.0 2017-11-08  1.0
# 1  foo  train  3  1.0 2017-11-08  1.0
# 2  foo   test  3  1.0 2017-11-08  1.0
# 3  foo  train  3  1.0 2017-11-08  1.0

datas.sort_index(axis=0, ascending=False)  # sort the rows in descending order
#      A          B    C  D      E    F
# 3  1.0 2017-11-08  1.0  3  train  foo
# 2  1.0 2017-11-08  1.0  3   test  foo
# 1  1.0 2017-11-08  1.0  3  train  foo
# 0  1.0 2017-11-08  1.0  3   test  foo

datas.sort_values(by='E')  # sort by the values in column E
#      A          B    C  D      E    F
# 0  1.0 2017-11-08  1.0  3   test  foo
# 2  1.0 2017-11-08  1.0  3   test  foo
# 1  1.0 2017-11-08  1.0  3  train  foo
# 3  1.0 2017-11-08  1.0  3  train  foo
Handling missing values
datas = pd.DataFrame({
    'A': pd.Series([1, 5, 'test', 'foo'], index=list(range(4))),
    'B': pd.Series([np.nan, 1, np.nan, 'test'], index=list(range(4))),
    'C': pd.Series(1, index=list(range(4)), dtype='float32'),
})
#       A     B    C
# 0     1   NaN  1.0
# 1     5     1  1.0
# 2  test   NaN  1.0
# 3   foo  test  1.0

datas.dropna(axis=0, how='any')
# axis=0 drops rows that contain NaN; axis=1 drops columns that contain NaN.
# how is 'any' or 'all' (default 'any'):
#   'any': drop the row if it contains at least one NaN
#   'all': drop the row only if every value in it is NaN
#       A     B    C
# 1     5     1  1.0
# 3   foo  test  1.0

datas.fillna(value=0)  # replace NaN with 0
#       A     B    C
# 0     1     0  1.0
# 1     5     1  1.0
# 2  test     0  1.0
# 3   foo  test  1.0

datas.isnull()
#        A      B      C
# 0  False   True  False
# 1  False  False  False
# 2  False   True  False
# 3  False  False  False

# When the data is very large, or you only want to know whether any NaN exists at all:
np.any(datas.isnull()) == True  # returns True if any value is NaN
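A quick sketch of the how='all' case, using a hypothetical two-row frame whose second row is entirely NaN:

nan_df = pd.DataFrame({'A': [1, np.nan], 'B': [2, np.nan]})
#      A    B
# 0  1.0  2.0
# 1  NaN  NaN

nan_df.dropna(how='all')  # only row 1 is all-NaN, so only it is dropped
#      A    B
# 0  1.0  2.0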
Import and export
pd.read_csv('***.csv', delimiter=',', encoding='utf-8', names=['test1', 'test2', 'test3'])
# first argument: the file to read
# delimiter: the CSV field separator
# encoding: the file encoding
# names: sets the column names
#         test1 test2    test3
# 0  2017-11-18   ABC  51315.0
# 1  2017-11-19   DEF   5659.0
# 2  2017-11-20   GHI   1599.0
# 3  2017-11-21   JKL   2224.0

datas.to_csv('**.csv')  # write a DataFrame out to a CSV file
Merging
concat
datas1 = pd.DataFrame(np.ones((3, 4)) * 0, columns=['a', 'b', 'c', 'd'])
#      a    b    c    d
# 0  0.0  0.0  0.0  0.0
# 1  0.0  0.0  0.0  0.0
# 2  0.0  0.0  0.0  0.0

datas2 = pd.DataFrame(np.ones((3, 4)) * 1, columns=['a', 'b', 'c', 'd'])
#      a    b    c    d
# 0  1.0  1.0  1.0  1.0
# 1  1.0  1.0  1.0  1.0
# 2  1.0  1.0  1.0  1.0

datas3 = pd.DataFrame(np.ones((3, 4)) * 2, columns=['a', 'b', 'c', 'd'])
#      a    b    c    d
# 0  2.0  2.0  2.0  2.0
# 1  2.0  2.0  2.0  2.0
# 2  2.0  2.0  2.0  2.0

pd.concat([datas1, datas2, datas3], axis=0, ignore_index=True)
#      a    b    c    d
# 0  0.0  0.0  0.0  0.0
# 1  0.0  0.0  0.0  0.0
# 2  0.0  0.0  0.0  0.0
# 3  1.0  1.0  1.0  1.0
# 4  1.0  1.0  1.0  1.0
# 5  1.0  1.0  1.0  1.0
# 6  2.0  2.0  2.0  2.0
# 7  2.0  2.0  2.0  2.0
# 8  2.0  2.0  2.0  2.0

pd.concat([datas1, datas2, datas3], axis=1)
#      a    b    c    d    a    b    c    d    a    b    c    d
# 0  0.0  0.0  0.0  0.0  1.0  1.0  1.0  1.0  2.0  2.0  2.0  2.0
# 1  0.0  0.0  0.0  0.0  1.0  1.0  1.0  1.0  2.0  2.0  2.0  2.0
# 2  0.0  0.0  0.0  0.0  1.0  1.0  1.0  1.0  2.0  2.0  2.0  2.0
Selected concat parameters
In concat, the join parameter defaults to 'outer'.
datas1 = pd.DataFrame(np.ones((3, 4)) * 0, columns=['a', 'b', 'c', 'd'], index=[1, 2, 3])
#      a    b    c    d
# 1  0.0  0.0  0.0  0.0
# 2  0.0  0.0  0.0  0.0
# 3  0.0  0.0  0.0  0.0

datas2 = pd.DataFrame(np.ones((3, 4)) * 1, columns=['b', 'c', 'd', 'e'], index=[2, 3, 4])
#      b    c    d    e
# 2  1.0  1.0  1.0  1.0
# 3  1.0  1.0  1.0  1.0
# 4  1.0  1.0  1.0  1.0

pd.concat([datas1, datas2], join='outer')  # union of the columns; missing cells become NaN
#      a    b    c    d    e
# 1  0.0  0.0  0.0  0.0  NaN
# 2  0.0  0.0  0.0  0.0  NaN
# 3  0.0  0.0  0.0  0.0  NaN
# 2  NaN  1.0  1.0  1.0  1.0
# 3  NaN  1.0  1.0  1.0  1.0
# 4  NaN  1.0  1.0  1.0  1.0

pd.concat([datas1, datas2], join='inner')  # intersection of the columns
#      b    c    d
# 1  0.0  0.0  0.0
# 2  0.0  0.0  0.0
# 3  0.0  0.0  0.0
# 2  1.0  1.0  1.0
# 3  1.0  1.0  1.0
# 4  1.0  1.0  1.0

pd.concat([datas1, datas2], axis=1, join_axes=[datas2.index])  # keep only datas2's index
#      a    b    c    d    b    c    d    e
# 2  0.0  0.0  0.0  0.0  1.0  1.0  1.0  1.0
# 3  0.0  0.0  0.0  0.0  1.0  1.0  1.0  1.0
# 4  NaN  NaN  NaN  NaN  1.0  1.0  1.0  1.0

# Without join_axes, the result covers the union of both indexes:
#      a    b    c    d    b    c    d    e
# 1  0.0  0.0  0.0  0.0  NaN  NaN  NaN  NaN
# 2  0.0  0.0  0.0  0.0  1.0  1.0  1.0  1.0
# 3  0.0  0.0  0.0  0.0  1.0  1.0  1.0  1.0
# 4  NaN  NaN  NaN  NaN  1.0  1.0  1.0  1.0
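Note that join_axes was removed in pandas 1.0; on newer versions the same result can be obtained by concatenating and then reindexing. A sketch:

# Equivalent to join_axes=[datas2.index] on modern pandas:
pd.concat([datas1, datas2], axis=1).reindex(datas2.index)
#      a    b    c    d    b    c    d    e
# 2  0.0  0.0  0.0  0.0  1.0  1.0  1.0  1.0
# 3  0.0  0.0  0.0  0.0  1.0  1.0  1.0  1.0
# 4  NaN  NaN  NaN  NaN  1.0  1.0  1.0  1.0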
append
datas1 = pd.DataFrame(np.ones((3, 4)) * 0, columns=['a', 'b', 'c', 'd'])
#      a    b    c    d
# 0  0.0  0.0  0.0  0.0
# 1  0.0  0.0  0.0  0.0
# 2  0.0  0.0  0.0  0.0

datas2 = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
# a    1
# b    2
# c    3
# d    4
# dtype: int64

datas1.append(datas2, ignore_index=True)
#      a    b    c    d
# 0  0.0  0.0  0.0  0.0
# 1  0.0  0.0  0.0  0.0
# 2  0.0  0.0  0.0  0.0
# 3  1.0  2.0  3.0  4.0
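Note that DataFrame.append was removed in pandas 2.0; the same result can be produced with pd.concat. A sketch (to_frame().T turns the Series into a one-row frame):

pd.concat([datas1, datas2.to_frame().T], ignore_index=True)
#      a    b    c    d
# 0  0.0  0.0  0.0  0.0
# 1  0.0  0.0  0.0  0.0
# 2  0.0  0.0  0.0  0.0
# 3  1.0  2.0  3.0  4.0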
merge
left = pd.DataFrame({
    'key': ['k0', 'k1', 'k2', 'k3'],
    'A': ['A0', 'A1', 'A2', 'A3'],
    'B': ['B0', 'B1', 'B2', 'B3']
})
#     A   B key
# 0  A0  B0  k0
# 1  A1  B1  k1
# 2  A2  B2  k2
# 3  A3  B3  k3

right = pd.DataFrame({
    'key': ['k0', 'k1', 'k2', 'k3'],
    'C': ['C0', 'C1', 'C2', 'C3'],
    'D': ['D0', 'D1', 'D2', 'D3']
})
#     C   D key
# 0  C0  D0  k0
# 1  C1  D1  k1
# 2  C2  D2  k2
# 3  C3  D3  k3

pd.merge(left, right, on='key')
#     A   B key   C   D
# 0  A0  B0  k0  C0  D0
# 1  A1  B1  k1  C1  D1
# 2  A2  B2  k2  C2  D2
# 3  A3  B3  k3  C3  D3
left = pd.DataFrame({
    'key1': ['k0', 'k0', 'k1', 'k2'],
    'key2': ['k0', 'k1', 'k0', 'k1'],
    'A': ['A0', 'A1', 'A2', 'A3'],
    'B': ['B0', 'B1', 'B2', 'B3']
})
#     A   B key1 key2
# 0  A0  B0   k0   k0
# 1  A1  B1   k0   k1
# 2  A2  B2   k1   k0
# 3  A3  B3   k2   k1

right = pd.DataFrame({
    'key1': ['k0', 'k1', 'k1', 'k2'],
    'key2': ['k0', 'k0', 'k0', 'k0'],
    'C': ['C0', 'C1', 'C2', 'C3'],
    'D': ['D0', 'D1', 'D2', 'D3']
})
#     C   D key1 key2
# 0  C0  D0   k0   k0
# 1  C1  D1   k1   k0
# 2  C2  D2   k1   k0
# 3  C3  D3   k2   k0

pd.merge(left, right, on=['key1', 'key2'], how='inner')  # how defaults to 'inner'
#     A   B key1 key2   C   D
# 0  A0  B0   k0   k0  C0  D0
# 1  A2  B2   k1   k0  C1  D1
# 2  A2  B2   k1   k0  C2  D2

pd.merge(left, right, on=['key1', 'key2'], how='outer')
#      A    B key1 key2    C    D
# 0   A0   B0   k0   k0   C0   D0
# 1   A1   B1   k0   k1  NaN  NaN
# 2   A2   B2   k1   k0   C1   D1
# 3   A2   B2   k1   k0   C2   D2
# 4   A3   B3   k2   k1  NaN  NaN
# 5  NaN  NaN   k2   k0   C3   D3

pd.merge(left, right, on=['key1', 'key2'], how='right')
#      A    B key1 key2   C   D
# 0   A0   B0   k0   k0  C0  D0
# 1   A2   B2   k1   k0  C1  D1
# 2   A2   B2   k1   k0  C2  D2
# 3  NaN  NaN   k2   k0  C3  D3

pd.merge(left, right, on=['key1', 'key2'], how='left')
#     A   B key1 key2    C    D
# 0  A0  B0   k0   k0   C0   D0
# 1  A1  B1   k0   k1  NaN  NaN
# 2  A2  B2   k1   k0   C1   D1
# 3  A2  B2   k1   k0   C2   D2
# 4  A3  B3   k2   k1  NaN  NaN
matplotlib
Import
import matplotlib.pyplot as plt
API
plot
data = pd.Series(np.random.randn(1000))  # 1000 random numbers
data = data.cumsum()                     # cumulative sum

# Because this is already a pandas object it can be plotted directly;
# the same plot could also be drawn with plt.plot(x_values, y_values).
data.plot()

plt.rcParams['font.sans-serif'] = ['SimHei']  # so Chinese labels display correctly
plt.rcParams['axes.unicode_minus'] = False    # so minus signs display correctly

# linewidth: width of the line
# linestyle: '-' solid, '--' dashed, '-.' dash-dot, ':' dotted, 'None' draws nothing
plt.plot([1, 50, 100], [1, 4, 9], linewidth=2.5, linestyle='--', label='lalala')
plt.legend(loc='upper left')        # without this, the label above will not be shown
plt.plot([1, 100, 200], [1, 7, 9])  # a third data series

plt.title('Demo')   # title
plt.xlabel('xxx')   # x-axis label
plt.ylabel('yyy')   # y-axis label
plt.text(60, 10, u'说明文字')  # annotation at (60, 10); Chinese text, hence the SimHei setting above
plt.show()          # display the figure
# 1000 rows by 4 columns of random numbers; row index 0-999, column names A B C D
data = pd.DataFrame(np.random.randn(1000, 4), index=np.arange(1000), columns=list('ABCD'))
data = data.cumsum()  # cumulative sum
data.plot()           # one line per column
plt.show()
Other chart types
Bar chart
plt.bar(x, height, width=0.8)  # x positions, bar heights, bar width (x was called left in matplotlib < 2.0)
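A minimal runnable sketch (the x positions and heights below are made-up illustration values):

x = np.arange(5)
heights = [3, 7, 2, 5, 4]                  # hypothetical bar heights
plt.bar(x, heights, width=0.8)             # draw one bar per x position
plt.xticks(x, ['a', 'b', 'c', 'd', 'e'])   # label the bars
plt.show()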