NumPy & Pandas & Matplotlib: Selected API Operations


Summary: in concat, the join parameter defaults to 'outer'.

Creating arrays

np.array([10, 11, 12, 13])
# [10 11 12 13]


np.array([10, 11, 12, 13, 14, 15]).reshape([2,3])
# [
#   [10 11 12]
#   [13 14 15]
# ]


np.array([[1, 2], [3, 4]])
# [
#   [1 2]
#   [3 4]
# ]


np.arange(4)
# [0 1 2 3]


np.arange(2, 6)
# [2 3 4 5]


np.arange(4).reshape([2,2])
# [
#   [0 1]
#   [2 3]
# ]


np.random.random([2,3])
# [
#   [ 0.00136044  0.46854718  0.59149907]
#   [ 0.75636339  0.18204628  0.53191402]
# ]

Computations

arr = np.array([10, 11, 12, 13, 14, 15]).reshape([2,3])
# [
#   [10 11 12]
#   [13 14 15]
# ]


# Sum
np.sum(arr, axis=0)  # down each column
# [23 25 27]
np.sum(arr, axis=1)  # across each row
# [33 42]


# Minimum
np.min(arr, axis=0)
# [10 11 12]
np.min(arr, axis=1)
# [10 13]


# Maximum
np.max(arr, axis=0)
# [13 14 15]
np.max(arr, axis=1)
# [12 15]


# Index of the minimum/maximum value
np.argmin(arr)
# 0 (the index into the flattened array)
np.argmax(arr)
# 5 (the index into the flattened array)
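
Since argmin/argmax index into the flattened array, np.unravel_index can recover the 2-D position (a small sketch):

np.unravel_index(np.argmax(arr), arr.shape)
# (1, 2) -> arr[1, 2] == 15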


# Mean
arr.mean()
# np.mean(arr)
# 12.5
np.average(arr)
# 12.5


# Cumulative sum
np.cumsum(arr)
# [10 21 33 46 60 75]


# Differences between adjacent elements
np.diff(arr)
# [
#   [1 1]
#   [1 1]
# ]

# Clip (bound values to a range)
np.clip(arr, 11, 14)
# [
#   [11 11 12]
#   [13 14 14]
# ]
# Values below 11 become 11, values above 14 become 14, the rest are unchanged

Indexing

arr = np.arange(3, 15).reshape([3,4])
# [
#   [ 3  4  5  6]
#   [ 7  8  9 10]
#   [11 12 13 14]
# ]

arr[1, 1]
# arr[1][1]
# 8

arr[:, 1]
# [ 4 8 12]

arr[1, :]
# [ 7 8 9 10]

arr[1, 1:3]
# [8 9]

arr.flatten()
# [ 3  4  5  6  7  8  9 10 11 12 13 14]

for i in arr.flat:
  print(i)
# Prints one value per line; arr.flat is an iterator over the array
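
A related note: arr.flatten() always returns a new copy, while arr.ravel() returns a view of the original array when possible:

arr.ravel()
# [ 3  4  5  6  7  8  9 10 11 12 13 14]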

Merging

A = np.array([1, 1, 1])
B = np.array([2, 2, 2])

np.vstack((A, B))
# [
#   [1 1 1]
#   [2 2 2]
# ]
np.hstack((A, B))
# [1 1 1 2 2 2]
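
For 1-D arrays, np.concatenate behaves like hstack; to stack A and B as columns instead, add an axis first (a small sketch):

np.concatenate((A, B))
# [1 1 1 2 2 2]
np.hstack((A[:, np.newaxis], B[:, np.newaxis]))
# [
#   [1 2]
#   [1 2]
#   [1 2]
# ]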

Splitting

arr = np.arange(12).reshape([3,4])
# [
#   [ 0  1  2  3]
#   [ 4  5  6  7]
#   [ 8  9 10 11]
# ]

np.split(arr, 2, axis=1)
# [array([[0, 1],
#         [4, 5],
#         [8, 9]]),
#  array([[ 2,  3],
#         [ 6,  7],
#         [10, 11]])]
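
Note that np.split raises an error when the axis cannot be divided evenly; np.array_split tolerates unequal pieces:

np.array_split(arr, 3, axis=1)
# [array([[0, 1],
#         [4, 5],
#         [8, 9]]),
#  array([[ 2],
#         [ 6],
#         [10]]),
#  array([[ 3],
#         [ 7],
#         [11]])]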

Pandas

Import

import pandas as pd

API

Creating Series and DataFrames

pd.Series([1, 3, 6, np.nan, 44, 1])
# 0     1.0
# 1     3.0
# 2     6.0
# 3     NaN
# 4    44.0
# 5     1.0
# dtype: float64


pd.date_range('20171108', periods=6)
# DatetimeIndex(
#   ['2017-11-08', '2017-11-09', '2017-11-10', '2017-11-11','2017-11-12', '2017-11-13'],
#   dtype='datetime64[ns]', 
#   freq='D'
# )


dates = pd.date_range('20171108', periods=6)
pd.DataFrame(np.random.randn(6, 4), index=dates, columns=['a', 'b', 'c', 'd'])
#                    a         b         c         d
# 2017-11-08  0.644350  1.122020 -1.263401  0.163371
# 2017-11-09  0.573329 -0.242054 -0.342220  1.070905
# 2017-11-10  0.714291 -0.721509 -2.298672 -0.513572
# 2017-11-11 -0.614927  2.010482 -1.369179 -0.901276
# 2017-11-12  0.709672 -0.430620  1.070244 -2.308874
# 2017-11-13  1.284080  1.169807  1.668942  0.859300


pd.DataFrame({
  'A': 1.,
  'B': pd.Timestamp('20171108'),
  'C': pd.Series(1, index=list(range(4)), dtype='float32'),
  'D': np.array([3] * 4, dtype='int32'),
  'E': pd.Categorical(['test', 'train', 'test', 'train']),
  'F': 'foo'
})
#      A          B    C  D      E    F
# 0  1.0 2017-11-08  1.0  3   test  foo
# 1  1.0 2017-11-08  1.0  3  train  foo
# 2  1.0 2017-11-08  1.0  3   test  foo
# 3  1.0 2017-11-08  1.0  3  train  foo

Selecting data

datas = pd.DataFrame({
  'A': 1.,
  'B': pd.Timestamp('20171108'),
  'C': pd.Series(1, index=list(range(4)), dtype='float32'),
  'D': np.array([3] * 4, dtype='int32'),
  'E': pd.Categorical(['test', 'train', 'test', 'train']),
  'F': 'foo'
})
#     A          B    C  D      E    F
# 0  1.0 2017-11-08  1.0  3   test  foo
# 1  1.0 2017-11-08  1.0  3  train  foo
# 2  1.0 2017-11-08  1.0  3   test  foo
# 3  1.0 2017-11-08  1.0  3  train  foo

datas.A
# datas['A']
# 0    1.0
# 1    1.0
# 2    1.0
# 3    1.0
# Name: A, dtype: float64

datas[0:3]
#      A          B    C  D      E    F
# 0  1.0 2017-11-08  1.0  3   test  foo
# 1  1.0 2017-11-08  1.0  3  train  foo
# 2  1.0 2017-11-08  1.0  3   test  foo

datas.loc[0]
# When the index is a date like '2017-11-08', use datas.loc['20171108']
# A                      1
# B    2017-11-08 00:00:00
# C                      1
# D                      3
# E                   test
# F                    foo
# Name: 0, dtype: object

datas.loc[:,['A', 'B']]
#      A          B
# 0  1.0 2017-11-08
# 1  1.0 2017-11-08
# 2  1.0 2017-11-08
# 3  1.0 2017-11-08

datas.loc[[1, 3],['A', 'B']]
#      A          B
# 1  1.0 2017-11-08
# 3  1.0 2017-11-08

# iloc selects by integer position, loc selects by index label,
# and ix (now deprecated) accepted both
# datas.iloc[1]
# datas.ix[1]
# When the index is a date like 2017-11-08: datas.ix['20171108']
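
A minimal sketch of the two current accessors on this DataFrame (whose labels happen to be the integers 0-3):

datas.iloc[1]  # by integer position
# datas.loc[1] selects by label and returns the same row here
# A                      1
# B    2017-11-08 00:00:00
# C                      1
# D                      3
# E                  train
# F                    foo
# Name: 1, dtype: object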

datas[datas.E == 'test']
#      A          B    C  D     E    F
# 0  1.0 2017-11-08  1.0  3  test  foo
# 2  1.0 2017-11-08  1.0  3  test  foo

datas.index
# Int64Index([0, 1, 2, 3], dtype='int64')

datas.columns
# Index(['A', 'B', 'C', 'D', 'E', 'F'], dtype='object')

datas.values
# array(
# [
#   [1.0, Timestamp('2017-11-08 00:00:00'), 1.0, 3, 'test', 'foo'],
#   [1.0, Timestamp('2017-11-08 00:00:00'), 1.0, 3, 'train', 'foo'],
#   [1.0, Timestamp('2017-11-08 00:00:00'), 1.0, 3, 'test', 'foo'],
#   [1.0, Timestamp('2017-11-08 00:00:00'), 1.0, 3, 'train', 'foo']
# ],
# dtype=object)

Sorting

datas.sort_index(axis=1, ascending=False)  # sort the columns in descending order
#      F      E  D    C          B    A
# 0  foo   test  3  1.0 2017-11-08  1.0
# 1  foo  train  3  1.0 2017-11-08  1.0
# 2  foo   test  3  1.0 2017-11-08  1.0
# 3  foo  train  3  1.0 2017-11-08  1.0

datas.sort_index(axis=0, ascending=False)  # sort the rows by index in descending order
#      A          B    C  D      E    F
# 3  1.0 2017-11-08  1.0  3  train  foo
# 2  1.0 2017-11-08  1.0  3   test  foo
# 1  1.0 2017-11-08  1.0  3  train  foo
# 0  1.0 2017-11-08  1.0  3   test  foo

datas.sort_values(by='E')
#      A          B    C  D      E    F
# 0  1.0 2017-11-08  1.0  3   test  foo
# 2  1.0 2017-11-08  1.0  3   test  foo
# 1  1.0 2017-11-08  1.0  3  train  foo
# 3  1.0 2017-11-08  1.0  3  train  foo

Handling missing values

datas = pd.DataFrame({
  'A': pd.Series([1, 5, 'test', 'foo'], index=list(range(4))),
  'B': pd.Series([np.nan, 1, np.nan, 'test'], index=list(range(4))),
  'C': pd.Series(1, index=list(range(4)), dtype='float32'),
})
#       A     B    C
# 0     1   NaN  1.0
# 1     5     1  1.0
# 2  test   NaN  1.0
# 3   foo  test  1.0

datas.dropna(axis=0, how='any')
# With axis=1, columns containing NaN are dropped instead
# how is 'any' or 'all'; the default is 'any'
# 'any': drop the row if it contains at least one NaN
# 'all': drop the row only if every value in it is NaN
#      A     B    C
# 1    5     1  1.0
# 3  foo  test  1.0
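
To illustrate the comments above, the axis=1 and how='all' variants on the same datas (only column B contains NaN, and no row is entirely NaN):

datas.dropna(axis=1, how='any')
#       A    C
# 0     1  1.0
# 1     5  1.0
# 2  test  1.0
# 3   foo  1.0

datas.dropna(axis=0, how='all')
# returns the DataFrame unchanged, since no row consists entirely of NaN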

datas.fillna(value=0)
#       A     B    C
# 0     1     0  1.0
# 1     5     1  1.0
# 2  test     0  1.0
# 3   foo  test  1.0

datas.isnull()
#        A      B      C
# 0  False   True  False
# 1  False  False  False
# 2  False   True  False
# 3  False  False  False

# For very large data, or when you only need to know whether any NaN exists:
# np.any(datas.isnull()) == True
# returns True when at least one value is NaN

Import and export

pd.read_csv('***.csv', delimiter=',', encoding='utf-8', names=['test1','test2','test3'])
# First argument: the file to read
# delimiter: the field separator of the CSV file
# encoding: the file encoding
# names: the column names to assign

#           test1     test2          test3
# 0    2017-11-18       ABC        51315.0
# 1    2017-11-19       DEF         5659.0
# 2    2017-11-20       GHI         1599.0
# 3    2017-11-21       JKL         2224.0

datas.to_csv('**.csv')
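
A common variant, so that the row index is not written out as an extra column:

datas.to_csv('**.csv', index=False)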


Merging

concat

datas1 = pd.DataFrame(np.ones((3, 4)) * 0, columns=['a', 'b', 'c', 'd'])
#      a    b    c    d
# 0  0.0  0.0  0.0  0.0
# 1  0.0  0.0  0.0  0.0
# 2  0.0  0.0  0.0  0.0

datas2 = pd.DataFrame(np.ones((3, 4)) * 1, columns=['a', 'b', 'c', 'd'])
#      a    b    c    d
# 0  1.0  1.0  1.0  1.0
# 1  1.0  1.0  1.0  1.0
# 2  1.0  1.0  1.0  1.0

datas3 = pd.DataFrame(np.ones((3, 4)) * 2, columns=['a', 'b', 'c', 'd'])
#      a    b    c    d
# 0  2.0  2.0  2.0  2.0
# 1  2.0  2.0  2.0  2.0
# 2  2.0  2.0  2.0  2.0

pd.concat([datas1, datas2, datas3], axis=0, ignore_index=True)
#      a    b    c    d
# 0  0.0  0.0  0.0  0.0
# 1  0.0  0.0  0.0  0.0
# 2  0.0  0.0  0.0  0.0
# 3  1.0  1.0  1.0  1.0
# 4  1.0  1.0  1.0  1.0
# 5  1.0  1.0  1.0  1.0
# 6  2.0  2.0  2.0  2.0
# 7  2.0  2.0  2.0  2.0
# 8  2.0  2.0  2.0  2.0

pd.concat([datas1, datas2, datas3], axis=1)
#      a    b    c    d    a    b    c    d    a    b    c    d
# 0  0.0  0.0  0.0  0.0  1.0  1.0  1.0  1.0  2.0  2.0  2.0  2.0
# 1  0.0  0.0  0.0  0.0  1.0  1.0  1.0  1.0  2.0  2.0  2.0  2.0
# 2  0.0  0.0  0.0  0.0  1.0  1.0  1.0  1.0  2.0  2.0  2.0  2.0

Selected concat parameters

In concat, the join parameter defaults to 'outer'.

datas1 = pd.DataFrame(np.ones((3, 4)) * 0, columns=['a', 'b', 'c', 'd'], index=[1, 2, 3])
#      a    b    c    d
# 1  0.0  0.0  0.0  0.0
# 2  0.0  0.0  0.0  0.0
# 3  0.0  0.0  0.0  0.0

datas2 = pd.DataFrame(np.ones((3, 4)) * 1, columns=['b', 'c', 'd', 'e'], index=[2, 3, 4])
#      b    c    d    e
# 2  1.0  1.0  1.0  1.0
# 3  1.0  1.0  1.0  1.0
# 4  1.0  1.0  1.0  1.0

pd.concat([datas1, datas2], join='outer')
#      a    b    c    d    e
# 1  0.0  0.0  0.0  0.0  NaN
# 2  0.0  0.0  0.0  0.0  NaN
# 3  0.0  0.0  0.0  0.0  NaN
# 2  NaN  1.0  1.0  1.0  1.0
# 3  NaN  1.0  1.0  1.0  1.0
# 4  NaN  1.0  1.0  1.0  1.0

pd.concat([datas1, datas2], join='inner')
#      b    c    d
# 1  0.0  0.0  0.0
# 2  0.0  0.0  0.0
# 3  0.0  0.0  0.0
# 2  1.0  1.0  1.0
# 3  1.0  1.0  1.0
# 4  1.0  1.0  1.0

pd.concat([datas1, datas2], axis=1, join_axes=[datas2.index])
#      a    b    c    d    b    c    d    e
# 2  0.0  0.0  0.0  0.0  1.0  1.0  1.0  1.0
# 3  0.0  0.0  0.0  0.0  1.0  1.0  1.0  1.0
# 4  NaN  NaN  NaN  NaN  1.0  1.0  1.0  1.0
# Without join_axes, the result would be:
#      a    b    c    d    b    c    d    e
# 1  0.0  0.0  0.0  0.0  NaN  NaN  NaN  NaN
# 2  0.0  0.0  0.0  0.0  1.0  1.0  1.0  1.0
# 3  0.0  0.0  0.0  0.0  1.0  1.0  1.0  1.0
# 4  NaN  NaN  NaN  NaN  1.0  1.0  1.0  1.0
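
Note: join_axes was removed in pandas 1.0; in newer versions the same result can be obtained with reindex:

pd.concat([datas1, datas2], axis=1).reindex(datas2.index)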

append

datas1 = pd.DataFrame(np.ones((3, 4)) * 0, columns=['a', 'b', 'c', 'd'])
#      a    b    c    d
# 0  0.0  0.0  0.0  0.0
# 1  0.0  0.0  0.0  0.0
# 2  0.0  0.0  0.0  0.0

datas2 = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
# a    1
# b    2
# c    3
# d    4
# dtype: int64

datas1.append(datas2, ignore_index=True)
#      a    b    c    d
# 0  0.0  0.0  0.0  0.0
# 1  0.0  0.0  0.0  0.0
# 2  0.0  0.0  0.0  0.0
# 3  1.0  2.0  3.0  4.0
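
Note: DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat is the replacement. A sketch that reproduces the result above:

pd.concat([datas1, datas2.to_frame().T], ignore_index=True)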

merge

left = pd.DataFrame({
  'key': ['k0', 'k1', 'k2', 'k3'],
  'A': ['A0', 'A1', 'A2', 'A3'],
  'B': ['B0', 'B1', 'B2', 'B3']
})
#     A   B key
# 0  A0  B0  k0
# 1  A1  B1  k1
# 2  A2  B2  k2
# 3  A3  B3  k3

right = pd.DataFrame({
  'key': ['k0', 'k1', 'k2', 'k3'],
  'C': ['C0', 'C1', 'C2', 'C3'],
  'D': ['D0', 'D1', 'D2', 'D3']
})
#     C   D key
# 0  C0  D0  k0
# 1  C1  D1  k1
# 2  C2  D2  k2
# 3  C3  D3  k3

pd.merge(left, right, on='key')
#     A   B key   C   D
# 0  A0  B0  k0  C0  D0
# 1  A1  B1  k1  C1  D1
# 2  A2  B2  k2  C2  D2
# 3  A3  B3  k3  C3  D3

left = pd.DataFrame({
  'key1': ['k0', 'k0', 'k1', 'k2'],
  'key2': ['k0', 'k1', 'k0', 'k1'],
  'A': ['A0', 'A1', 'A2', 'A3'],
  'B': ['B0', 'B1', 'B2', 'B3']
})
#     A   B key1 key2
# 0  A0  B0   k0   k0
# 1  A1  B1   k0   k1
# 2  A2  B2   k1   k0
# 3  A3  B3   k2   k1

right = pd.DataFrame({
  'key1': ['k0', 'k1', 'k1', 'k2'],
  'key2': ['k0', 'k0', 'k0', 'k0'],
  'C': ['C0', 'C1', 'C2', 'C3'],
  'D': ['D0', 'D1', 'D2', 'D3']
})
#     C   D key1 key2
# 0  C0  D0   k0   k0
# 1  C1  D1   k1   k0
# 2  C2  D2   k1   k0
# 3  C3  D3   k2   k0

pd.merge(left, right, on=['key1', 'key2'], how='inner')
# how defaults to 'inner'
#     A   B key1 key2   C   D
# 0  A0  B0   k0   k0  C0  D0
# 1  A2  B2   k1   k0  C1  D1
# 2  A2  B2   k1   k0  C2  D2

pd.merge(left, right, on=['key1', 'key2'], how='outer')
#      A    B key1 key2    C    D
# 0   A0   B0   k0   k0   C0   D0
# 1   A1   B1   k0   k1  NaN  NaN
# 2   A2   B2   k1   k0   C1   D1
# 3   A2   B2   k1   k0   C2   D2
# 4   A3   B3   k2   k1  NaN  NaN
# 5  NaN  NaN   k2   k0   C3   D3

pd.merge(left, right, on=['key1', 'key2'], how='right')
#      A    B key1 key2   C   D 
# 0   A0   B0   k0   k0  C0  D0 
# 1   A2   B2   k1   k0  C1  D1 
# 2   A2   B2   k1   k0  C2  D2 
# 3  NaN  NaN   k2   k0  C3  D3

pd.merge(left, right, on=['key1', 'key2'], how='left')
#     A   B key1 key2    C    D
# 0  A0  B0   k0   k0   C0   D0
# 1  A1  B1   k0   k1  NaN  NaN
# 2  A2  B2   k1   k0   C1   D1
# 3  A2  B2   k1   k0   C2   D2
# 4  A3  B3   k2   k1  NaN  NaN

Matplotlib

Import

import matplotlib.pyplot as plt

API

plot

data = pd.Series(np.random.randn(1000))  # 1000 random numbers
data = data.cumsum()  # cumulative sum
# A pandas object already holds its data, so you can call plot on it directly.
# It can also be written as plt.plot(x, y) or plt.plot([xxx, xxx], [yyy, yyy])
data.plot()
plt.rcParams['font.sans-serif'] = ['SimHei']  # display Chinese labels correctly
plt.rcParams['axes.unicode_minus'] = False  # display minus signs correctly
# linewidth: width of the line
# linestyle: line style ('-' solid, '--' dashed, '-.' dash-dot, ':' dotted, None draws nothing)
plt.plot([1,50,100],[1,4,9], linewidth=2.5, linestyle='--', label='lalala')
plt.legend(loc='upper left')  # without this, the label above is not shown
plt.plot([1,100,200],[1,7,9])  # a third line
plt.title('Demo')  # title
plt.xlabel('xxx')  # x-axis label
plt.ylabel('yyy')  # y-axis label
plt.text(60, 10, u'说明文字')  # annotation text drawn at (60, 10)
plt.show()  # render the figure


# 1000 rows x 4 columns of random numbers; the index runs 0-999, the columns are A B C D
data = pd.DataFrame(np.random.randn(1000, 4),
          index=np.arange(1000),
          columns=list('ABCD'))
data = data.cumsum()  # cumulative sum
data.plot()
plt.show()


Other chart types

Bar chart

plt.bar(left, height, width=0.8)
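
A minimal runnable sketch (note that in current Matplotlib the first parameter is named x rather than left):

import numpy as np
import matplotlib.pyplot as plt

x = np.arange(5)                      # bar positions
height = np.random.randint(1, 10, 5)  # bar heights
plt.bar(x, height, width=0.8)
plt.show()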

That's all for this article. I hope it helps with your study or work, and thanks for supporting 码农网.
