• numpy常用函数学习



    目录
    numpy常用函数学习
    点乘法
    线性预测
    线性拟合
    裁剪、压缩和累乘
    相关性
    多项式拟合
    提取符号数组
    杂项
    点乘法
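    点乘(矩阵乘法)本身是数学运算,但是在numpy中使用的时候略坑。numpy的矩阵乘法写作a.dot(b)或numpy.dot(a,b)(注意a*b是带广播的逐元素相乘,不是矩阵乘法),要求a、b的形状满足 (M×N)·(N×L)=(M×L)。这里看的是数组实际的形状,而不是打印出来的样子:a.reshape()只返回一个新形状的数组,不会修改原数组;要原地改变数组的形状,必须使用a.resize()或者给a.shape赋值这两种方法(或者把reshape的返回值重新赋给变量)。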
    代码如下:

    >>> import numpy as np
    >>> a=np.array([[1,2,3,4],[5,6,7,8]])
    >>> a
    array([[1, 2, 3, 4],
    [5, 6, 7, 8]])
    >>> b=np.array([[9],[9]])
    >>> b
    array([[9],
    [9]])
    >>> a*b
    array([[ 9, 18, 27, 36],
    [45, 54, 63, 72]])
    >>> a.dot(b)
    Traceback (most recent call last):
    File "<pyshell#6>", line 1, in <module>
    a.dot(b)
    ValueError: shapes (2,4) and (2,1) not aligned: 4 (dim 1) != 2 (dim 0)
    >>> c=np.array([[9],[10]])
    >>> a*c
    array([[ 9, 18, 27, 36],
    [50, 60, 70, 80]])
    >>> d=np.array([[10,20,30,40],[50,60,70,80]])
    >>> a.dot(d)
    Traceback (most recent call last):
    File "<pyshell#10>", line 1, in <module>
    a.dot(d)
    ValueError: shapes (2,4) and (2,4) not aligned: 4 (dim 1) != 2 (dim 0)

    >>> d.reshape(4,2)
    array([[10, 20],
    [30, 40],
    [50, 60],
    [70, 80]])
    >>> a.dot(d)
    Traceback (most recent call last):
    File "<pyshell#23>", line 1, in <module>
    a.dot(d)
    ValueError: shapes (2,4) and (2,4) not aligned: 4 (dim 1) != 2 (dim 0)
    >>> d
    array([[10, 20, 30, 40],
    [50, 60, 70, 80]])
    >>> d.resize(4,2)
    >>> a.dot(d)
    array([[ 500, 600],
    [1140, 1400]])
    >>> a
    array([[1, 2, 3, 4],
    [5, 6, 7, 8]])
    >>> e=np.array([7,8,9,10])
    >>> e.shape=(4,1)
    >>> a.dot(e)
    array([[ 90],
    [226]])

    线性预测
    通过最小二乘法对已有数据拟合出函数,并预测未知数据。
    最小二乘法:在假定函数结构(这里假设我们知道结果是y=ax+b)的情况下,通过已知结果(x,y)求取未知变量(a,b)。
    具体求取原理参考:https://baijiahao.baidu.com/s?id=1613474944612061421&wfr=spider&for=pc
    预测例子:

    import datetime as dt
    import numpy as np
    import pandas as pd
    import matplotlib.pyplot as mp
    import matplotlib.dates as md


    def dmy2ymd(dmy):
        """Convert a ``DD-MM-YYYY`` date field to an ISO ``YYYY-MM-DD`` string.

        Intended as a ``np.loadtxt`` converter. Depending on the NumPy version
        and the ``encoding`` argument, converters are handed either ``bytes``
        or ``str``, so both are accepted here.

        Args:
            dmy: Raw date field, e.g. ``b'28-01-2011'`` or ``'28-01-2011'``.

        Returns:
            The same date formatted as ``'2011-01-28'``.

        Raises:
            ValueError: If the field is not a valid ``DD-MM-YYYY`` date.
        """
        # str(<str>, encoding=...) raises TypeError, so decode only real bytes.
        if isinstance(dmy, (bytes, bytearray)):
            dmy = dmy.decode('utf-8')
        date = dt.datetime.strptime(dmy, '%d-%m-%Y').date()
        return date.strftime('%Y-%m-%d')

    # Load the date (column 1) and the closing price (column 6) of every row.
    dates, closing_prices = np.loadtxt(
        '../../data/aapl.csv', delimiter=',',
        usecols=(1, 6), unpack=True,
        dtype='M8[D], f8', converters={1: dmy2ymd})
    # Window length: every prediction is built from 2 * N consecutive prices.
    N = 5
    pred_prices = np.zeros(
        closing_prices.size - 2 * N + 1)
    for i in range(pred_prices.size):
        # Row j of `a` holds the N prices starting at index i + j.
        a = np.zeros((N, N))
        for j in range(N):
            a[j, ] = closing_prices[i + j:i + j + N]
        # `b` holds the price that followed each row of `a`.
        b = closing_prices[i + N:i + N * 2]
        # lstsq returns (solution, residuals, rank, singular values); only the
        # solution is needed. rcond=None selects the documented default cutoff
        # explicitly and avoids the FutureWarning of older NumPy releases.
        x = np.linalg.lstsq(a, b, rcond=None)[0]
        # Apply the fitted coefficients to the latest N prices.
        pred_prices[i] = b.dot(x)
    mp.figure('Linear Prediction',
              facecolor='lightgray')
    mp.title('Linear Prediction', fontsize=20)
    mp.xlabel('Date', fontsize=14)
    mp.ylabel('Price', fontsize=14)
    ax = mp.gca()
    # Major ticks on every Monday.
    ax.xaxis.set_major_locator(md.WeekdayLocator(
        byweekday=md.MO))
    # Minor ticks on every day.
    ax.xaxis.set_minor_locator(md.DayLocator())
    # Label format of the major ticks.
    ax.xaxis.set_major_formatter(md.DateFormatter(
        '%d %b %Y'))
    mp.tick_params(labelsize=10)
    mp.grid(linestyle=':')
    dates = dates.astype(md.datetime.datetime)
    mp.plot(dates, closing_prices, 'o-', c='lightgray',
            label='Closing Price')
    # Append the next business day so the last prediction has an x position.
    dates = np.append(dates,
                      dates[-1] + pd.tseries.offsets.BDay())
    mp.plot(dates[2 * N:], pred_prices, 'o-',
            c='orangered', linewidth=3,
            label='Predicted Price')
    mp.legend()
    mp.gcf().autofmt_xdate()
    mp.show()

    线性拟合
    原理同上:通过最小二乘法对已有数据拟合出函数,并预测未知数据。

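    记 $y_i$ 为实际值,$y_i'$ 为预测值,$y_i - y_i'$ 为误差:
    $$k x_i + b = y_i', \qquad \text{误差平方} = (y_i - y_i')^2, \qquad i = 1, 2, \dots, n$$
    总误差是 $k$ 和 $b$ 的函数:
    $$E = f(k, b) = \sum_{i=1}^{n} (y_i - y_i')^2$$
    找到合适的k和b,使E取得最小,由此,k和b所确定的直线为拟合直线。
    写成矩阵形式(下式中的 a、x、b 对应 lstsq 的参数和返回值,其中的 b 是实际值向量,不是截距):
    $$\underbrace{\begin{bmatrix} x_1 & 1 \\ x_2 & 1 \\ \vdots & \vdots \\ x_n & 1 \end{bmatrix}}_{a} \underbrace{\begin{bmatrix} k \\ b \end{bmatrix}}_{x} = \begin{bmatrix} y_1' \\ y_2' \\ \vdots \\ y_n' \end{bmatrix} \approx \underbrace{\begin{bmatrix} y_1 \\ y_2 \\ \vdots \\ y_n \end{bmatrix}}_{b}$$
    最小二乘法的方法:
    x = np.linalg.lstsq(a, b)[0]
    即对于模型 $y = kx + b$:
    $$k x_i + b = y_i', \qquad \text{误差} = y_i - y_i', \qquad i = 1, 2, \dots, n$$
    $$\text{loss} = f(k, b) = [y_1 - (k x_1 + b)]^2 + [y_2 - (k x_2 + b)]^2 + \dots + [y_n - (k x_n + b)]^2$$
    求使 loss 取得最小值的 $k$ 和 $b$。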

    趋势线示例:

    # -*- coding: utf-8 -*-
    from __future__ import unicode_literals
    import datetime as dt
    import numpy as np
    import matplotlib.pyplot as mp
    import matplotlib.dates as md

    def dmy2ymd(dmy):
        """Convert a ``DD-MM-YYYY`` date field to an ISO ``YYYY-MM-DD`` string.

        Intended as a ``np.loadtxt`` converter. Depending on the NumPy version
        and the ``encoding`` argument, converters are handed either ``bytes``
        or ``str``, so both are accepted here.

        Args:
            dmy: Raw date field, e.g. ``b'28-01-2011'`` or ``'28-01-2011'``.

        Returns:
            The same date formatted as ``'2011-01-28'``.

        Raises:
            ValueError: If the field is not a valid ``DD-MM-YYYY`` date.
        """
        # str(<str>, encoding=...) raises TypeError, so decode only real bytes.
        if isinstance(dmy, (bytes, bytearray)):
            dmy = dmy.decode('utf-8')
        date = dt.datetime.strptime(dmy, '%d-%m-%Y').date()
        return date.strftime('%Y-%m-%d')
    # Load date (column 1) and open/high/low/close prices (columns 3-6).
    # The target list spans two lines, so it is parenthesised to stay one
    # statement.
    # NOTE(review): the backslashes of this Windows path were lost in the
    # published text; the directory split below is a best-guess
    # reconstruction. Point it at your local copy of aapl.csv.
    (dates, opening_prices, highest_prices,
     lowest_prices, closing_prices) = np.loadtxt(
        r'C:\Users\Cs\Desktop\数据分析\DS+ML\DS\data\aapl.csv',
        delimiter=',', usecols=(1, 3, 4, 5, 6),
        unpack=True, dtype='M8[D], f8, f8, f8, f8',
        converters={1: dmy2ymd})
    # Trend point of each day: mean of the high, low and closing prices.
    trend_points = (highest_prices+lowest_prices+closing_prices)/3
    days = dates.astype(int)
    # np.column_stack joins 1-D arrays as the columns of a 2-D array:
    #     >>> a=[1,2,3];b=[11,22,33];np.column_stack((a,b))
    #     array([[ 1, 11],
    #            [ 2, 22],
    #            [ 3, 33]])
    # np.row_stack() does the opposite (stacks them as rows).
    # np.ones_like() builds an array of ones with the same shape as its
    # argument.
    a = np.column_stack((days, np.ones_like(days)))
    # Least-squares solution [k, b] of a @ [k, b] ~= trend_points.
    # lstsq returns (solution, residuals, rank, singular values); rcond=None
    # selects the documented default cutoff and avoids the FutureWarning of
    # older NumPy releases.
    # TODO: look up the formula for fitting with more than one variable.
    x = np.linalg.lstsq(a, trend_points, rcond=None)[0]
    # Fitted value (y') for every day.
    trend_line = days*x[0]+x[1]
    mp.figure('Candlestick', facecolor='lightgray')
    mp.title('Candlestick', fontsize=20)
    mp.xlabel('Date', fontsize=14)
    mp.ylabel('Price', fontsize=14)
    ax = mp.gca()
    # Major ticks on every Monday.
    ax.xaxis.set_major_locator(md.WeekdayLocator(
        byweekday=md.MO))
    # Minor ticks on every day.
    ax.xaxis.set_minor_locator(md.DayLocator())
    # Label format of the major ticks.
    ax.xaxis.set_major_formatter(md.DateFormatter(
        '%d %b %Y'))
    mp.tick_params(labelsize=10)
    mp.grid(linestyle=':')
    dates = dates.astype(md.datetime.datetime)
    # Mask of rising days (close above open).
    rise = closing_prices - opening_prices >= 0.01
    # Mask of falling days (close below open).
    fall = opening_prices - closing_prices >= 0.01
    # Fill colours, one RGB triple per day.
    fc = np.zeros(dates.size, dtype='3f4')
    fc[rise], fc[fall] = (1, 1, 1), (0, 0.5, 0)
    # Edge colours, one RGB triple per day.
    ec = np.zeros(dates.size, dtype='3f4')
    ec[rise], ec[fall] = (1, 0, 0), (0, 0.5, 0)
    # Zero-width bars from low to high.
    mp.bar(dates, highest_prices - lowest_prices, 0,
           lowest_prices, color=fc, edgecolor=ec)
    # Bars from open to close.
    mp.bar(dates, closing_prices - opening_prices, 0.8,
           opening_prices, color=fc, edgecolor=ec)
    mp.plot(dates, trend_line)
    # Auto-format the date labels on the horizontal axis.
    mp.gcf().autofmt_xdate()
    mp.show()

    裁剪、压缩和累乘
    ndarray.clip(min=下限, max=上限)
    将调用数组中小于和大于下限和上限的元素替换为下限和上限,返回裁剪后的数组,调用数组保持不变。
    ndarray.compress(条件)
    返回由调用数组中满足条件的元素组成的新数组。
    ndarray.prod()
    返回调用数组中所有元素的乘积——累乘。
    ndarray.cumprod()
    返回调用数组中所有元素执行累乘的过程数组。

    import numpy as np
    # Sample data shared by every demo below.
    a = np.array([10, 20, 30, 40, 50])
    print(a)
    # Clamp every element into the closed range [15, 45].
    b = np.clip(a, 15, 45)
    print(b)
    # Keep only the elements that already lie inside [15, 45].
    c = a.compress((a >= 15) & (a <= 45))
    print(c)
    # Product of all elements.
    d = np.prod(a)
    print(d)
    # Running product: element i is the product of a[0..i].
    e = np.cumprod(a)
    print(e)
    def jiecheng(n):
        """Return ``n!`` (the factorial of ``n``), computed recursively.

        Args:
            n: A non-negative integer.

        Returns:
            The product ``1 * 2 * ... * n``; ``0!`` and ``1!`` are both 1.

        Raises:
            ValueError: If ``n`` is negative.
        """
        if n < 0:
            raise ValueError('factorial is not defined for negative values')
        # Stop at n <= 1 so that jiecheng(0) terminates instead of recursing
        # past the base case.
        return 1 if n <= 1 else n * jiecheng(n - 1)
    n = 5
    # Factorial via the recursive helper.
    print(jiecheng(n))
    # Factorial via an explicit loop; the result is printed once, after the
    # loop has finished.
    jc = 1
    for i in range(2, n + 1):
        jc *= i
    print(jc)
    # Factorial via NumPy: product of the integers 2..n.
    print(np.arange(2, n + 1).prod())
    结果:
    [10 20 30 40 50]
    [15 20 30 40 45]
    [20 30 40]
    12000000
    [ 10 200 6000 240000 12000000]
    120
    120
    120

    相关性

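    相关性:
    相关系数:a 对 b 的相关系数等于 b 对 a 的相关系数
    $$\frac{\mathrm{cov}_{ab}}{\mathrm{std}_a \cdot \mathrm{std}_b} = \frac{\mathrm{cov}_{ba}}{\mathrm{std}_b \cdot \mathrm{std}_a}$$
    协方差矩阵:
    $$\begin{bmatrix} \mathrm{var}_a & \mathrm{cov}_{ab} \\ \mathrm{cov}_{ba} & \mathrm{var}_b \end{bmatrix}$$
    标准差矩阵:
    $$\begin{bmatrix} \mathrm{std}_a \cdot \mathrm{std}_a & \mathrm{std}_a \cdot \mathrm{std}_b \\ \mathrm{std}_b \cdot \mathrm{std}_a & \mathrm{std}_b \cdot \mathrm{std}_b \end{bmatrix}$$
    相关性矩阵=协方差矩阵/标准差矩阵(逐元素相除,等号右边仍是一个矩阵):
    $$\text{相关性} = \begin{bmatrix} \dfrac{\mathrm{var}_a}{\mathrm{std}_a \cdot \mathrm{std}_a} & \dfrac{\mathrm{cov}_{ab}}{\mathrm{std}_a \cdot \mathrm{std}_b} \\ \dfrac{\mathrm{cov}_{ba}}{\mathrm{std}_b \cdot \mathrm{std}_a} & \dfrac{\mathrm{var}_b}{\mathrm{std}_b \cdot \mathrm{std}_b} \end{bmatrix}$$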

    numpy.cov(a, b)->相关矩阵的分子矩阵(协方差矩阵)
    numpy.corrcoef(a, b)->相关性矩阵
    手动和自动计算的例:

    import datetime as dt
    import numpy as np
    import matplotlib.pyplot as mp
    import matplotlib.dates as md

    def dmy2ymd(dmy):
        """Convert a ``DD-MM-YYYY`` date field to an ISO ``YYYY-MM-DD`` string.

        Intended as a ``np.loadtxt`` converter. Depending on the NumPy version
        and the ``encoding`` argument, converters are handed either ``bytes``
        or ``str``, so both are accepted here.

        Args:
            dmy: Raw date field, e.g. ``b'28-01-2011'`` or ``'28-01-2011'``.

        Returns:
            The same date formatted as ``'2011-01-28'``.

        Raises:
            ValueError: If the field is not a valid ``DD-MM-YYYY`` date.
        """
        # str(<str>, encoding=...) raises TypeError, so decode only real bytes.
        if isinstance(dmy, (bytes, bytearray)):
            dmy = dmy.decode('utf-8')
        date = dt.datetime.strptime(
            dmy, '%d-%m-%Y').date()
        return date.strftime('%Y-%m-%d')

    # Closing prices (column 6) of both stocks; the dates (column 1) are
    # read from the BHP file only.
    dates, bhp_closing_prices = np.loadtxt(
        '../../data/bhp.csv', delimiter=',', usecols=(1, 6), unpack=True,
        dtype='M8[D], f8', converters={1: dmy2ymd})
    vale_closing_prices = np.loadtxt(
        '../../data/vale.csv', delimiter=',', usecols=(6), unpack=True)
    # Day-over-day relative change: (p[t+1] - p[t]) / p[t].
    bhp_returns = np.diff(bhp_closing_prices) / bhp_closing_prices[:-1]
    vale_returns = np.diff(vale_closing_prices) / vale_closing_prices[:-1]

    # --- Method 1: everything by hand (sample statistics, divisor n - 1) ---
    ave_a = bhp_returns.mean()
    dev_a = bhp_returns - ave_a
    var_a = np.sum(dev_a * dev_a) / (dev_a.size - 1)
    std_a = np.sqrt(var_a)
    ave_b = vale_returns.mean()
    dev_b = vale_returns - ave_b
    var_b = np.sum(dev_b * dev_b) / (dev_b.size - 1)
    std_b = np.sqrt(var_b)
    cov_ab = np.sum(dev_a * dev_b) / (dev_a.size - 1)
    cov_ba = np.sum(dev_b * dev_a) / (dev_b.size - 1)
    # Correlation matrix assembled element by element.
    corr_row_a = [var_a / (std_a * std_a), cov_ab / (std_a * std_b)]
    corr_row_b = [cov_ba / (std_b * std_a), var_b / (std_b * std_b)]
    corr = np.array([corr_row_a, corr_row_b])
    print(corr)

    # --- Method 2: np.cov for the numerator, hand-built denominator ---
    # Numerator of the correlation matrix: the covariance matrix.
    covs = np.cov(bhp_returns, vale_returns)
    # Denominator of the correlation matrix: products of standard deviations.
    stds = np.array([
        [std_a * std_a, std_a * std_b],
        [std_b * std_a, std_b * std_b]])
    corr = covs / stds
    print(corr)

    # --- Method 3: np.corrcoef does it all ---
    corr = np.corrcoef(bhp_returns, vale_returns)
    print(corr)

    # Plot both return series over time.
    mp.figure('Correlation Of Returns', facecolor='lightgray')
    mp.title('Correlation Of Returns', fontsize=20)
    mp.xlabel('Date', fontsize=14)
    mp.ylabel('Returns', fontsize=14)
    ax = mp.gca()
    # Major ticks on Mondays, minor ticks daily.
    ax.xaxis.set_major_locator(md.WeekdayLocator(byweekday=md.MO))
    ax.xaxis.set_minor_locator(md.DayLocator())
    ax.xaxis.set_major_formatter(md.DateFormatter('%d %b %Y'))
    mp.tick_params(labelsize=10)
    mp.grid(linestyle=':')
    dates = dates.astype(md.datetime.datetime)
    # np.diff yields one element fewer than the prices, hence dates[:-1].
    mp.plot(dates[:-1], bhp_returns, c='orangered', label='BHP')
    mp.plot(dates[:-1], vale_returns, c='dodgerblue', label='VALE')
    mp.legend()
    mp.gcf().autofmt_xdate()
    mp.show()
    结果:
    [[1. 0.67841747]
    [0.67841747 1. ]]
    [[1. 0.67841747]
    [0.67841747 1. ]]
    [[1. 0.67841747]
    [0.67841747 1. ]]

    结果解读:
    在相关性矩阵中,主对角线上的元素是1,代表每个随机变量关于其自身一定是最强的正相关;副对角线上的元素为去除了分散性以后的净相关性指标——相关系数。相关系数介于[-1,1],正负代表了相关性的方向,绝对值表示了相关性的强弱。

    多项式拟合
    $$y = p_0 x^n + p_1 x^{n-1} + p_2 x^{n-2} + \dots + p_n = f(x)$$
    $$y_1' = f(x_1) \to y_1$$
    $$y_2' = f(x_2) \to y_2$$
    $$\dots$$
    $$y_n' = f(x_n) \to y_n$$
    $$(y_1 - y_1')^2 + (y_2 - y_2')^2 + \dots + (y_n - y_n')^2 = \text{loss}(p_0, \dots, p_n)$$
    求使 loss 取得最小值的 $p_0, \dots, p_n$。
    X = [x1, x2, …, xn] - 自变量
    Y = [y1, y2, …, yn] - 实际函数值
    Y’= [y1’,y2’,…,yn’] - 拟合函数值
    P = [p0, p1, …, pn] - 多项式函数中的系数
    Q = [q0, q1, …, qn-1] - 多项式函数导函数的系数
    np.polyfit(X, Y, 最高次幂)->P
    np.polyval(P, X)->Y’
    np.polyder(P)->Q
    y = 4x^3 + 3x^2 + 2x + 1, P=[4,3,2,1]
    dy/dx = 12x^2 + 6x + 2, Q=[12, 6, 2]
    4x^3 + 3x^2 + 2x + 1 = 0的根:np.roots(P)(f(x)=0的解)
    np.polysub(P1, P2)->两个多项式函数的差函数的系数
    y = 4x^3 + 3x^2 + 2x + 1, P1=[4,3,2,1]
    y = 5x^4 + x, P2=[5, 0, 0, 1, 0]
    y = -5x^4 + 4x^3 + 3x^2 + x + 1
    np.polysub(P1, P2)->[-5, 4, 3, 1, 1]
    np.polyfit(X, Y, 最高次幂)->P得到一个函数,赋予变量才可以得到值
    np.roots(P)(f(x)=0的解)
    np.polysub(P1, P2)->两个多项式函数的差函数的系数
    np.polyval(p, days) 对曲线求值
    【polyfit】多项式曲线拟合
    【polyval】多项式曲线求值
    np.polyder(P) 对多项式P求导

    # -*- coding: utf-8 -*-
    from __future__ import unicode_literals
    import datetime as dt
    import numpy as np
    import matplotlib.pyplot as mp
    import matplotlib.dates as md


    def dmy2ymd(dmy):
        """Convert a ``DD-MM-YYYY`` date field to an ISO ``YYYY-MM-DD`` string.

        Intended as a ``np.loadtxt`` converter. Depending on the NumPy version
        and the ``encoding`` argument, converters are handed either ``bytes``
        or ``str``, so both are accepted here.

        Args:
            dmy: Raw date field, e.g. ``b'28-01-2011'`` or ``'28-01-2011'``.

        Returns:
            The same date formatted as ``'2011-01-28'``.

        Raises:
            ValueError: If the field is not a valid ``DD-MM-YYYY`` date.
        """
        # str(<str>, encoding=...) raises TypeError, so decode only real bytes.
        if isinstance(dmy, (bytes, bytearray)):
            dmy = dmy.decode('utf-8')
        date = dt.datetime.strptime(
            dmy, '%d-%m-%Y').date()
        return date.strftime('%Y-%m-%d')


    # NOTE(review): the backslashes of these Windows paths were lost in the
    # published text (``data\bhp.csv`` had even become ``datahp.csv``). The
    # directory split below is a best-guess reconstruction. Point both paths
    # at your local copies of bhp.csv and vale.csv.
    dates, bhp_closing_prices = np.loadtxt(
        r'C:\Users\Cs\Desktop\数据分析\DS+ML\DS\data\bhp.csv', delimiter=',',
        usecols=(1, 6), unpack=True,
        dtype='M8[D], f8', converters={1: dmy2ymd})
    vale_closing_prices = np.loadtxt(
        r'C:\Users\Cs\Desktop\数据分析\DS+ML\DS\data\vale.csv', delimiter=',',
        usecols=(6), unpack=True)
    diff_closing_prices = bhp_closing_prices - vale_closing_prices
    # Convert the dates to integers so they can be used in arithmetic.
    days = dates.astype(int)
    print(dates)
    # Fit a 4th-degree polynomial to the price difference.
    p = np.polyfit(days, diff_closing_prices, 4)
    # Evaluate the fitted curve on every day.
    poly_closing_prices = np.polyval(p, days)
    # Coefficients of the derivative of the fitted polynomial.
    q = np.polyder(p)
    # Points where the derivative is zero (stationary points of the fit).
    roots_x = np.roots(q)
    # np.roots may return complex roots; only real ones correspond to a date,
    # so drop the rest before converting them to integers further down.
    roots_x = roots_x[np.isreal(roots_x)].real
    # Value of the fitted polynomial at those stationary points.
    roots_y = np.polyval(p, roots_x)
    mp.figure('Polynomial Fitting', facecolor='lightgray')
    mp.title('Polynomial Fitting', fontsize=20)
    mp.xlabel('Date', fontsize=14)
    mp.ylabel('Difference Price', fontsize=14)
    ax = mp.gca()
    ax.xaxis.set_major_locator(md.WeekdayLocator(
        byweekday=md.MO))
    ax.xaxis.set_minor_locator(md.DayLocator())
    ax.xaxis.set_major_formatter(md.DateFormatter(
        '%d %b %Y'))
    mp.tick_params(labelsize=10)
    mp.grid(linestyle=':')
    dates = dates.astype(md.datetime.datetime)
    mp.plot(dates, poly_closing_prices, c='limegreen',
            linewidth=3, label='Polynomial Fitting')
    mp.scatter(dates, diff_closing_prices, c='dodgerblue',
               alpha=0.5, s=60, label='Difference Price')
    # Convert the stationary points back to dates for plotting.
    roots_x = roots_x.astype(int).astype(
        'M8[D]').astype(md.datetime.datetime)
    mp.scatter(roots_x, roots_y, marker='^', s=80,
               c='orangered', label='Peek', zorder=4)
    mp.legend()
    mp.gcf().autofmt_xdate()
    mp.show()

    提取符号数组
    将数组的正负提取出来单独作为一个数组:
    两种方法:

    np.sign(源数组)->符号数组
    + -> 1
    - -> -1
    0 -> 0
    np.piecewise(源数组, 条件序列, 取值序列)->目标数组
    针对源数组中的每一个元素,检测其是否符合条件序列中的每一个条件,符合哪个条件就用取值系列中与之对应的值,表示该元素,放到目标数组中返回。
    条件序列: [a < 0, a == 0, a > 0]
    取值序列: [-1, 0, 1]
    # -*- coding: utf-8 -*-
    from __future__ import unicode_literals
    import numpy as np
    # Sample values compared against the reference value 60.
    a = np.array([70, 80, 60, 30, 40])
    print(a)
    # Signed distance of every element from 60.
    b = np.subtract(a, 60)
    print(b)
    # Method 1: np.sign maps negative / zero / positive to -1 / 0 / 1.
    c = np.sign(b)
    print(c)
    # Method 2: np.piecewise picks, for each element, the value paired with
    # the condition that element satisfies.
    conditions = [a < 60, a == 60, a > 60]
    values = [-1, 0, 1]
    d = np.piecewise(a, conditions, values)
    print(d)

    例子2(没啥意义,和例子一差不多):

    # -*- coding: utf-8 -*-
    from __future__ import unicode_literals
    import datetime as dt
    import numpy as np
    import matplotlib.pyplot as mp
    import matplotlib.dates as md


    def dmy2ymd(dmy):
        """Convert a ``DD-MM-YYYY`` date field to an ISO ``YYYY-MM-DD`` string.

        Intended as a ``np.loadtxt`` converter. Depending on the NumPy version
        and the ``encoding`` argument, converters are handed either ``bytes``
        or ``str``, so both are accepted here.

        Args:
            dmy: Raw date field, e.g. ``b'28-01-2011'`` or ``'28-01-2011'``.

        Returns:
            The same date formatted as ``'2011-01-28'``.

        Raises:
            ValueError: If the field is not a valid ``DD-MM-YYYY`` date.
        """
        # str(<str>, encoding=...) raises TypeError, so decode only real bytes.
        if isinstance(dmy, (bytes, bytearray)):
            dmy = dmy.decode('utf-8')
        date = dt.datetime.strptime(
            dmy, '%d-%m-%Y').date()
        return date.strftime('%Y-%m-%d')


    # Load date (column 1), closing price (column 6) and volume (column 7).
    # NOTE(review): the backslashes of this Windows path were lost in the
    # published text (``data\bhp.csv`` had even become ``datahp.csv``). The
    # directory split below is a best-guess reconstruction. Point it at your
    # local copy of bhp.csv.
    dates, closing_prices, volumes = np.loadtxt(
        r'C:\Users\Cs\Desktop\数据分析\DS+ML\DS\data\bhp.csv', delimiter=',',
        usecols=(1, 6, 7), unpack=True,
        dtype='M8[D], f8, f8', converters={1: dmy2ymd})
    # Day-over-day change of the closing price (one element fewer than the
    # input).
    diff_closing_prices = np.diff(closing_prices)
    # Sign of each change: -1 for a fall, 0 for no change, 1 for a rise.
    # Equivalent to: sign_closing_prices = np.sign(diff_closing_prices)
    sign_closing_prices = np.piecewise(
        diff_closing_prices, [
            diff_closing_prices < 0,
            diff_closing_prices == 0,
            diff_closing_prices > 0], [-1, 0, 1])
    print(volumes)
    # Signed volume per day; volumes[1:] lines up with the np.diff output.
    obvs = volumes[1:] * sign_closing_prices
    print(obvs)
    mp.figure('On-Balance Volume', facecolor='lightgray')
    mp.title('On-Balance Volume', fontsize=20)
    mp.xlabel('Date', fontsize=14)
    mp.ylabel('OBV', fontsize=14)
    ax = mp.gca()
    ax.xaxis.set_major_locator(md.WeekdayLocator(
        byweekday=md.MO))
    ax.xaxis.set_minor_locator(md.DayLocator())
    ax.xaxis.set_major_formatter(md.DateFormatter(
        '%d %b %Y'))
    mp.tick_params(labelsize=10)
    mp.grid(axis='y', linestyle=':')
    # Drop the first date so the x values match the length of obvs.
    dates = dates[1:].astype(md.datetime.datetime)
    mp.bar(dates, obvs, 1.0, color='dodgerblue',
           edgecolor='white', label='OBV')
    mp.legend()
    mp.gcf().autofmt_xdate()
    mp.show()
    杂项
    numpy.diff(a, n=1,axis=-1)
    沿着指定轴计算n阶离散差分(一阶差分即相邻元素之差:out[i] = a[i+1] - a[i])
    参数:
    a:输入矩阵
    n:可选,代表要执行几次差值
    axis:默认是最后一个
    示例:

    >>> a=np.arange(2,14)
    >>> a.shape=(3,4)
    >>> a
    array([[ 2, 3, 4, 5],
    [ 6, 7, 8, 9],
    [10, 11, 12, 13]])
    >>> np.diff(a)
    array([[1, 1, 1],
    [1, 1, 1],
    [1, 1, 1]])

  • 相关阅读:
    struts2标签具体解释
    证明N={1,2,...,n,...}最高万元 黄晓宁
    Oracle 11g 环境,使用utl_smtp创建一个存储过程来发送邮件
    通过非暴力促进国内操作系统的可行性分析
    pig 的chararry不能用于比较的类型可以comparison operator
    Tair LDB基于Prefixkey找到如何提取一系列性能优化项目key的prefix_size
    hdu 5073 Galaxy(2014acm鞍山亚洲分部 D)
    乐趣与你rabbitMQ 源代码
    我们的空间是它圆——基于Poicare对宇宙的模型
    自己主动旋转木马,自己主动作为幻灯片标题类似或图片
  • 原文地址:https://www.cnblogs.com/aibabel/p/11011704.html
Copyright © 2020-2023  润新知