Creating arrays
import numpy as np

np.array([10, 11, 12, 13])
# [10 11 12 13]

np.array([10, 11, 12, 13, 14, 15]).reshape([2, 3])
# [[10 11 12]
#  [13 14 15]]

np.array([[1, 2], [3, 4]])
# [[1 2]
#  [3 4]]

np.arange(4)
# [0 1 2 3]

np.arange(2, 6)
# [2 3 4 5]

np.arange(4).reshape([2, 2])
# [[0 1]
#  [2 3]]

np.random.random([2, 3])
# [[ 0.00136044  0.46854718  0.59149907]
#  [ 0.75636339  0.18204628  0.53191402]]
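Beyond np.array, np.arange, and np.random.random, a few other constructors come up constantly. A brief sketch (the shapes below are arbitrary):

np.zeros((2, 3))      # 2x3 array filled with 0.0
np.ones((2, 3))       # 2x3 array filled with 1.0 (np.ones reappears in the pandas examples below)
np.linspace(0, 1, 5)  # [0.   0.25 0.5  0.75 1.  ] -- 5 evenly spaced values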
Aggregation
arr = np.array([10, 11, 12, 13, 14, 15]).reshape([2, 3])
# [[10 11 12]
#  [13 14 15]]

# Sums
np.sum(arr, axis=0)  # [23 25 27]
np.sum(arr, axis=1)  # [33 42]

# Minimum
np.min(arr, axis=0)  # [10 11 12]
np.min(arr, axis=1)  # [10 13]

# Maximum
np.max(arr, axis=0)  # [13 14 15]
np.max(arr, axis=1)  # [12 15]

# Index of the minimum/maximum value
np.argmin(arr)  # 0 (an index into the flattened array)
np.argmax(arr)  # 5 (an index into the flattened array)

# Mean
arr.mean()       # same as np.mean(arr) -> 12.5
np.average(arr)  # 12.5

# Cumulative sum
np.cumsum(arr)  # [10 21 33 46 60 75]

# Differences between adjacent elements
np.diff(arr)
# [[1 1]
#  [1 1]]

# Clipping
np.clip(arr, 11, 14)
# [[11 11 12]
#  [13 14 14]]
# Values below 11 become 11, values above 14 become 14, everything else is unchanged.
Indexing
arr = np.arange(3, 15).reshape([3, 4])
# [[ 3  4  5  6]
#  [ 7  8  9 10]
#  [11 12 13 14]]

arr[1, 1]    # same as arr[1][1] -> 8
arr[:, 1]    # [ 4  8 12]
arr[1, :]    # [ 7  8  9 10]
arr[1, 1:3]  # [8 9]

arr.flatten()  # [ 3  4  5  6  7  8  9 10 11 12 13 14]

for i in arr.flat:  # arr.flat is an iterator over the elements
    print(i)        # prints one value per line
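The examples above index by position. NumPy also accepts a boolean mask as an index, which is the idiomatic way to filter; a small sketch using the same arr (the conditions are arbitrary):

arr[arr > 8]       # [ 9 10 11 12 13 14]
arr[arr % 2 == 0]  # [ 4  6  8 10 12 14]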
Stacking
A = np.array([1, 1, 1])
B = np.array([2, 2, 2])

np.vstack((A, B))
# [[1 1 1]
#  [2 2 2]]

np.hstack((A, B))
# [1 1 1 2 2 2]
Splitting
arr = np.arange(12).reshape([3, 4])
# [[ 0  1  2  3]
#  [ 4  5  6  7]
#  [ 8  9 10 11]]

np.split(arr, 2, axis=1)
# [array([[0, 1],
#         [4, 5],
#         [8, 9]]),
#  array([[ 2,  3],
#         [ 6,  7],
#         [10, 11]])]
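np.split requires the axis length to divide evenly, otherwise it raises an error; np.array_split relaxes that. A small sketch with the same arr:

np.split(arr, 3, axis=0)        # three 1x4 sub-arrays
# np.split(arr, 3, axis=1)      # ValueError: 4 columns cannot be split into 3 equal parts
np.array_split(arr, 3, axis=1)  # sub-arrays with 2, 1 and 1 columns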
Pandas
Import
import pandas as pd
API
Creating Series and DataFrames
pd.Series([1, 3, 6, np.nan, 44, 1])
# 0     1.0
# 1     3.0
# 2     6.0
# 3     NaN
# 4    44.0
# 5     1.0
# dtype: float64

pd.date_range('20171108', periods=6)
# DatetimeIndex(['2017-11-08', '2017-11-09', '2017-11-10', '2017-11-11',
#                '2017-11-12', '2017-11-13'],
#               dtype='datetime64[ns]', freq='D')

dates = pd.date_range('20171108', periods=6)
pd.DataFrame(np.random.randn(6, 4), index=dates, columns=['a', 'b', 'c', 'd'])
#                    a         b         c         d
# 2017-11-08  0.644350  1.122020 -1.263401  0.163371
# 2017-11-09  0.573329 -0.242054 -0.342220  1.070905
# 2017-11-10  0.714291 -0.721509 -2.298672 -0.513572
# 2017-11-11 -0.614927  2.010482 -1.369179 -0.901276
# 2017-11-12  0.709672 -0.430620  1.070244 -2.308874
# 2017-11-13  1.284080  1.169807  1.668942  0.859300

pd.DataFrame({
    'A': 1.,
    'B': pd.Timestamp('20171108'),
    'C': pd.Series(1, index=list(range(4)), dtype='float32'),
    'D': np.array([3] * 4, dtype='int32'),
    'E': pd.Categorical(['test', 'train', 'test', 'train']),
    'F': 'foo'
})
#      A          B    C  D      E    F
# 0  1.0 2017-11-08  1.0  3   test  foo
# 1  1.0 2017-11-08  1.0  3  train  foo
# 2  1.0 2017-11-08  1.0  3   test  foo
# 3  1.0 2017-11-08  1.0  3  train  foo
Selecting data
datas = pd.DataFrame({
    'A': 1.,
    'B': pd.Timestamp('20171108'),
    'C': pd.Series(1, index=list(range(4)), dtype='float32'),
    'D': np.array([3] * 4, dtype='int32'),
    'E': pd.Categorical(['test', 'train', 'test', 'train']),
    'F': 'foo'
})
#      A          B    C  D      E    F
# 0  1.0 2017-11-08  1.0  3   test  foo
# 1  1.0 2017-11-08  1.0  3  train  foo
# 2  1.0 2017-11-08  1.0  3   test  foo
# 3  1.0 2017-11-08  1.0  3  train  foo

datas.A  # same as datas['A']
# 0    1.0
# 1    1.0
# 2    1.0
# 3    1.0
# Name: A, dtype: float64

datas[0:3]
#      A          B    C  D      E    F
# 0  1.0 2017-11-08  1.0  3   test  foo
# 1  1.0 2017-11-08  1.0  3  train  foo
# 2  1.0 2017-11-08  1.0  3   test  foo

datas.loc[0]  # if the index held dates such as '2017-11-08', you would write datas.loc['20171108']
# A                      1
# B    2017-11-08 00:00:00
# C                      1
# D                      3
# E                   test
# F                    foo
# Name: 0, dtype: object

datas.loc[:, ['A', 'B']]
#      A          B
# 0  1.0 2017-11-08
# 1  1.0 2017-11-08
# 2  1.0 2017-11-08
# 3  1.0 2017-11-08

datas.loc[[1, 3], ['A', 'B']]
#      A          B
# 1  1.0 2017-11-08
# 3  1.0 2017-11-08

# iloc selects by integer position, loc selects by index label,
# and ix was a hybrid that accepted either (e.g. ix[1], or ix['20171108']
# when the index holds dates).

datas[datas.E == 'test']
#      A          B    C  D     E    F
# 0  1.0 2017-11-08  1.0  3  test  foo
# 2  1.0 2017-11-08  1.0  3  test  foo

datas.index    # Int64Index([0, 1, 2, 3], dtype='int64')
datas.columns  # Index([u'A', u'B', u'C', u'D', u'E', u'F'], dtype='object')
datas.values
# array([[1.0, Timestamp('2017-11-08 00:00:00'), 1.0, 3, 'test', 'foo'],
#        [1.0, Timestamp('2017-11-08 00:00:00'), 1.0, 3, 'train', 'foo'],
#        [1.0, Timestamp('2017-11-08 00:00:00'), 1.0, 3, 'test', 'foo'],
#        [1.0, Timestamp('2017-11-08 00:00:00'), 1.0, 3, 'train', 'foo']],
#       dtype=object)
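Note that ix was later deprecated and removed from pandas; loc and iloc cover its uses today. A short sketch against the same datas frame:

datas.iloc[1]         # second row, selected by position
datas.iloc[1:3, 0:2]  # rows 1-2 of columns A and B, by position
datas.loc[1, 'E']     # row labelled 1, column 'E' -> 'train'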
Sorting
datas.sort_index(axis=1, ascending=False)  # sort the columns
#      F      E  D    C          B    A
# 0  foo   test  3  1.0 2017-11-08  1.0
# 1  foo  train  3  1.0 2017-11-08  1.0
# 2  foo   test  3  1.0 2017-11-08  1.0
# 3  foo  train  3  1.0 2017-11-08  1.0

datas.sort_index(axis=0, ascending=False)  # sort the rows
#      A          B    C  D      E    F
# 3  1.0 2017-11-08  1.0  3  train  foo
# 2  1.0 2017-11-08  1.0  3   test  foo
# 1  1.0 2017-11-08  1.0  3  train  foo
# 0  1.0 2017-11-08  1.0  3   test  foo

datas.sort_values(by='E')
#      A          B    C  D      E    F
# 0  1.0 2017-11-08  1.0  3   test  foo
# 2  1.0 2017-11-08  1.0  3   test  foo
# 1  1.0 2017-11-08  1.0  3  train  foo
# 3  1.0 2017-11-08  1.0  3  train  foo
Handling missing values
datas = pd.DataFrame({
    'A': pd.Series([1, 5, 'test', 'foo'], index=list(range(4))),
    'B': pd.Series([np.nan, 1, np.nan, 'test'], index=list(range(4))),
    'C': pd.Series(1, index=list(range(4)), dtype='float32'),
})
#       A     B    C
# 0     1   NaN  1.0
# 1     5     1  1.0
# 2  test   NaN  1.0
# 3   foo  test  1.0

datas.dropna(axis=0, how='any')
# axis=0 drops rows that contain NaN; axis=1 drops columns instead.
# how is 'any' (the default) or 'all':
#   'any' drops a row as soon as it contains a single NaN;
#   'all' drops a row only when every value in it is NaN.
#      A     B    C
# 1    5     1  1.0
# 3  foo  test  1.0

datas.fillna(value=0)
#       A     B    C
# 0     1     0  1.0
# 1     5     1  1.0
# 2  test     0  1.0
# 3   foo  test  1.0

datas.isnull()
#        A      B      C
# 0  False   True  False
# 1  False  False  False
# 2  False   True  False
# 3  False  False  False

# With a very large frame, to check only whether any NaN exists at all:
np.any(datas.isnull()) == True  # True when at least one value is NaN
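To see the how='any' / how='all' difference, the frame needs a row that is entirely NaN. A small sketch with a made-up frame:

df = pd.DataFrame({'A': [1, np.nan, np.nan],
                   'B': [2, 5, np.nan]})
#      A    B
# 0  1.0  2.0
# 1  NaN  5.0
# 2  NaN  NaN
df.dropna(how='any')  # keeps only row 0
df.dropna(how='all')  # keeps rows 0 and 1; only the all-NaN row 2 is dropped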
Importing and exporting
pd.read_csv('***.csv', delimiter=',', encoding='utf-8',
            names=['test1', 'test2', 'test3'])
# argument 1: the file to read
# delimiter: the CSV field separator
# encoding: the file encoding
# names: the column names to use
#         test1 test2    test3
# 0  2017-11-18   ABC  51315.0
# 1  2017-11-19   DEF   5659.0
# 2  2017-11-20   GHI   1599.0
# 3  2017-11-21   JKL   2224.0

datas.to_csv('**.csv')
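Note that to_csv also writes the row index as an extra leading column by default; passing index=False suppresses it. A short sketch (the file name is made up):

datas.to_csv('out.csv', index=False)  # write without the row index
pd.read_csv('out.csv')                # reads back without an 'Unnamed: 0' column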
Merging
concat
datas1 = pd.DataFrame(np.ones((3, 4)) * 0, columns=['a', 'b', 'c', 'd'])
#      a    b    c    d
# 0  0.0  0.0  0.0  0.0
# 1  0.0  0.0  0.0  0.0
# 2  0.0  0.0  0.0  0.0

datas2 = pd.DataFrame(np.ones((3, 4)) * 1, columns=['a', 'b', 'c', 'd'])
#      a    b    c    d
# 0  1.0  1.0  1.0  1.0
# 1  1.0  1.0  1.0  1.0
# 2  1.0  1.0  1.0  1.0

datas3 = pd.DataFrame(np.ones((3, 4)) * 2, columns=['a', 'b', 'c', 'd'])
#      a    b    c    d
# 0  2.0  2.0  2.0  2.0
# 1  2.0  2.0  2.0  2.0
# 2  2.0  2.0  2.0  2.0

pd.concat([datas1, datas2, datas3], axis=0, ignore_index=True)
#      a    b    c    d
# 0  0.0  0.0  0.0  0.0
# 1  0.0  0.0  0.0  0.0
# 2  0.0  0.0  0.0  0.0
# 3  1.0  1.0  1.0  1.0
# 4  1.0  1.0  1.0  1.0
# 5  1.0  1.0  1.0  1.0
# 6  2.0  2.0  2.0  2.0
# 7  2.0  2.0  2.0  2.0
# 8  2.0  2.0  2.0  2.0

pd.concat([datas1, datas2, datas3], axis=1)
#      a    b    c    d    a    b    c    d    a    b    c    d
# 0  0.0  0.0  0.0  0.0  1.0  1.0  1.0  1.0  2.0  2.0  2.0  2.0
# 1  0.0  0.0  0.0  0.0  1.0  1.0  1.0  1.0  2.0  2.0  2.0  2.0
# 2  0.0  0.0  0.0  0.0  1.0  1.0  1.0  1.0  2.0  2.0  2.0  2.0
Selected concat parameters
In concat, the join parameter defaults to 'outer'.
datas1 = pd.DataFrame(np.ones((3, 4)) * 0, columns=['a', 'b', 'c', 'd'], index=[1, 2, 3])
#      a    b    c    d
# 1  0.0  0.0  0.0  0.0
# 2  0.0  0.0  0.0  0.0
# 3  0.0  0.0  0.0  0.0

datas2 = pd.DataFrame(np.ones((3, 4)) * 1, columns=['b', 'c', 'd', 'e'], index=[2, 3, 4])
#      b    c    d    e
# 2  1.0  1.0  1.0  1.0
# 3  1.0  1.0  1.0  1.0
# 4  1.0  1.0  1.0  1.0

pd.concat([datas1, datas2], join='outer')
#      a    b    c    d    e
# 1  0.0  0.0  0.0  0.0  NaN
# 2  0.0  0.0  0.0  0.0  NaN
# 3  0.0  0.0  0.0  0.0  NaN
# 2  NaN  1.0  1.0  1.0  1.0
# 3  NaN  1.0  1.0  1.0  1.0
# 4  NaN  1.0  1.0  1.0  1.0

pd.concat([datas1, datas2], join='inner')
#      b    c    d
# 1  0.0  0.0  0.0
# 2  0.0  0.0  0.0
# 3  0.0  0.0  0.0
# 2  1.0  1.0  1.0
# 3  1.0  1.0  1.0
# 4  1.0  1.0  1.0

pd.concat([datas1, datas2], axis=1, join_axes=[datas2.index])
#      a    b    c    d    b    c    d    e
# 2  0.0  0.0  0.0  0.0  1.0  1.0  1.0  1.0
# 3  0.0  0.0  0.0  0.0  1.0  1.0  1.0  1.0
# 4  NaN  NaN  NaN  NaN  1.0  1.0  1.0  1.0

# Without join_axes:
#      a    b    c    d    b    c    d    e
# 1  0.0  0.0  0.0  0.0  NaN  NaN  NaN  NaN
# 2  0.0  0.0  0.0  0.0  1.0  1.0  1.0  1.0
# 3  0.0  0.0  0.0  0.0  1.0  1.0  1.0  1.0
# 4  NaN  NaN  NaN  NaN  1.0  1.0  1.0  1.0
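join_axes was deprecated in pandas 0.25 and removed in 1.0; on current versions the same result is obtained with reindex. A sketch using the frames above:

pd.concat([datas1, datas2], axis=1).reindex(datas2.index)
# same output as the join_axes example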
append
datas1 = pd.DataFrame(np.ones((3, 4)) * 0, columns=['a', 'b', 'c', 'd'])
#      a    b    c    d
# 0  0.0  0.0  0.0  0.0
# 1  0.0  0.0  0.0  0.0
# 2  0.0  0.0  0.0  0.0

datas2 = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
# a    1
# b    2
# c    3
# d    4
# dtype: int64

datas1.append(datas2, ignore_index=True)
#      a    b    c    d
# 0  0.0  0.0  0.0  0.0
# 1  0.0  0.0  0.0  0.0
# 2  0.0  0.0  0.0  0.0
# 3  1.0  2.0  3.0  4.0
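DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat replaces it. A sketch reproducing the result above:

pd.concat([datas1, datas2.to_frame().T], ignore_index=True)
# datas2.to_frame().T turns the Series into a one-row DataFrame
# with columns a, b, c, d, which concat appends as row 3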
merge
left = pd.DataFrame({
    'key': ['k0', 'k1', 'k2', 'k3'],
    'A': ['A0', 'A1', 'A2', 'A3'],
    'B': ['B0', 'B1', 'B2', 'B3']
})
#     A   B key
# 0  A0  B0  k0
# 1  A1  B1  k1
# 2  A2  B2  k2
# 3  A3  B3  k3

right = pd.DataFrame({
    'key': ['k0', 'k1', 'k2', 'k3'],
    'C': ['C0', 'C1', 'C2', 'C3'],
    'D': ['D0', 'D1', 'D2', 'D3']
})
#     C   D key
# 0  C0  D0  k0
# 1  C1  D1  k1
# 2  C2  D2  k2
# 3  C3  D3  k3

pd.merge(left, right, on='key')
#     A   B key   C   D
# 0  A0  B0  k0  C0  D0
# 1  A1  B1  k1  C1  D1
# 2  A2  B2  k2  C2  D2
# 3  A3  B3  k3  C3  D3
left = pd.DataFrame({
    'key1': ['k0', 'k0', 'k1', 'k2'],
    'key2': ['k0', 'k1', 'k0', 'k1'],
    'A': ['A0', 'A1', 'A2', 'A3'],
    'B': ['B0', 'B1', 'B2', 'B3']
})
#     A   B key1 key2
# 0  A0  B0   k0   k0
# 1  A1  B1   k0   k1
# 2  A2  B2   k1   k0
# 3  A3  B3   k2   k1

right = pd.DataFrame({
    'key1': ['k0', 'k1', 'k1', 'k2'],
    'key2': ['k0', 'k0', 'k0', 'k0'],
    'C': ['C0', 'C1', 'C2', 'C3'],
    'D': ['D0', 'D1', 'D2', 'D3']
})
#     C   D key1 key2
# 0  C0  D0   k0   k0
# 1  C1  D1   k1   k0
# 2  C2  D2   k1   k0
# 3  C3  D3   k2   k0

pd.merge(left, right, on=['key1', 'key2'], how='inner')  # how defaults to 'inner'
#     A   B key1 key2   C   D
# 0  A0  B0   k0   k0  C0  D0
# 1  A2  B2   k1   k0  C1  D1
# 2  A2  B2   k1   k0  C2  D2

pd.merge(left, right, on=['key1', 'key2'], how='outer')
#      A    B key1 key2    C    D
# 0   A0   B0   k0   k0   C0   D0
# 1   A1   B1   k0   k1  NaN  NaN
# 2   A2   B2   k1   k0   C1   D1
# 3   A2   B2   k1   k0   C2   D2
# 4   A3   B3   k2   k1  NaN  NaN
# 5  NaN  NaN   k2   k0   C3   D3

pd.merge(left, right, on=['key1', 'key2'], how='right')
#      A    B key1 key2   C   D
# 0   A0   B0   k0   k0  C0  D0
# 1   A2   B2   k1   k0  C1  D1
# 2   A2   B2   k1   k0  C2  D2
# 3  NaN  NaN   k2   k0  C3  D3

pd.merge(left, right, on=['key1', 'key2'], how='left')
#     A   B key1 key2    C    D
# 0  A0  B0   k0   k0   C0   D0
# 1  A1  B1   k0   k1  NaN  NaN
# 2  A2  B2   k1   k0   C1   D1
# 3  A2  B2   k1   k0   C2   D2
# 4  A3  B3   k2   k1  NaN  NaN
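When the two frames share a non-key column name, merge disambiguates it with suffixes ('_x' and '_y' by default). A small sketch with made-up frames:

boys = pd.DataFrame({'k': ['k0', 'k1'], 'age': [1, 2]})
girls = pd.DataFrame({'k': ['k0', 'k1'], 'age': [4, 5]})
pd.merge(boys, girls, on='k', suffixes=['_boy', '_girl'])
#     k  age_boy  age_girl
# 0  k0        1         4
# 1  k1        2         5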
matplotlib
Import
import matplotlib.pyplot as plt
API
plot
data = pd.Series(np.random.randn(1000))  # 1000 random numbers
data = data.cumsum()                     # cumulative sum

# A pandas object carries its data with it, so it can plot itself directly.
# Alternatively, select columns with DataFrame.plot(x='col1', y='col2'),
# or pass the values to pyplot yourself: plt.plot([x0, x1, ...], [y0, y1, ...]).
data.plot()

plt.rcParams['font.sans-serif'] = ['SimHei']  # so Chinese labels render correctly
plt.rcParams['axes.unicode_minus'] = False    # so the minus sign renders correctly

# linewidth: width of the line
# linestyle: line style ('-' solid, '--' dashed, '-.' dash-dot, ':' dotted, 'None' draws nothing)
plt.plot([1, 50, 100], [1, 4, 9], linewidth=2.5, linestyle='--', label='lalala')
plt.legend(loc='upper left')        # without this, the label above is not shown
plt.plot([1, 100, 200], [1, 7, 9])  # a third data set

plt.title('Demo')              # title
plt.xlabel('xxx')              # x-axis label
plt.ylabel('yyy')              # y-axis label
plt.text(60, 10, u'说明文字')  # free-standing annotation text (Chinese, renderable thanks to SimHei above)
plt.show()                     # display the figure
# 1000 rows by 4 columns of random numbers, with row index 0..999
# and columns A, B, C, D
data = pd.DataFrame(np.random.randn(1000, 4), index=np.arange(1000), columns=list('ABCD'))
data = data.cumsum()  # cumulative sum
data.plot()
plt.show()
Other plot types
Bar charts
plt.bar(x, height, width=0.8)
# In older matplotlib versions the first parameter was named left rather than x.
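A minimal runnable sketch (the numpy and pyplot imports from earlier sections are assumed; the labels and heights are made up):

x = np.arange(5)
heights = [3, 7, 1, 5, 4]
plt.bar(x, heights, width=0.8)
plt.xticks(x, ['a', 'b', 'c', 'd', 'e'])  # label the bars
plt.show()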