Auto Byte

Science AI

# 目标函数的经典优化算法介绍

GitHub 链接：https://github.com/ManuelGonzalezRivero/3dbabove

• 随机梯度下降法
• 动量算法
• RMSProp

1. 初始化参数（θ，学习率）
2. 计算每个θ处的梯度
3. 更新参数
4. 重复步骤 2 和 3，直到代价值稳定

import numpy as np
import matplotlib.pyplot as plt  # BUG FIX: plt was used below but never imported


def minimaFunction(theta):
    """Objective function J(theta) = cos(3*pi*theta) / theta.

    Works element-wise on a scalar or a numpy array.
    """
    return np.cos(3 * np.pi * theta) / theta


def minimaFunctionDerivative(theta):
    """Analytic derivative dJ/dtheta of minimaFunction (product rule)."""
    const1 = 3 * np.pi
    const2 = const1 * theta
    return -(const1 * np.sin(const2) / theta) - np.cos(const2) / theta**2


# Plot J(theta) and its (scaled) derivative over theta in [0.1, 2.1).
theta = np.arange(.1, 2.1, .01)
Jtheta = minimaFunction(theta)
dJtheta = minimaFunctionDerivative(theta)
plt.plot(theta, Jtheta, label=r'$J(\theta)$')
# Derivative divided by 30 so both curves fit on one axis.
plt.plot(theta, dJtheta / 30, label=r'$dJ(\theta)/30$')
plt.legend()
axes = plt.gca()
#axes.set_ylim([-10,10])
plt.ylabel(r'$J(\theta),dJ(\theta)/30$')
plt.xlabel(r'$\theta$')
plt.title(r'$J(\theta),dJ(\theta)/30$ vs $\theta$')
plt.show()

import numpy as np
# NOTE: the original snippet also imported matplotlib.pyplot and
# matplotlib.animation, but no plotting code appears in this snippet,
# so the unused imports are dropped.


def optimize(iterations, oF, dOF, params, learningRate):
    """Minimize an objective by plain gradient descent.

    Arguments:
        - iterations - the number of gradient-descent steps to take
        - oF - the objective function (unused here; kept for interface parity)
        - dOF - the derivative function of the objective function
        - params - the initial (scalar) parameter value
        - learningRate - the step size

    Return:
        - numpy array of the parameter value at each step, including the
          initial value (length iterations + 1)
    """
    oParams = [params]
    for i in range(iterations):
        # Gradient at the current parameter value.
        dParams = dOF(params)
        # Gradient-descent update: theta <- theta - lr * dJ(theta).
        params = params - learningRate * dParams
        oParams.append(params)
    # BUG FIX: in the original, this return sat inside the loop, so the
    # function returned after a single iteration.
    return np.array(oParams)


def minimaFunction(theta):
    """Objective function J(theta) = cos(3*pi*theta) / theta."""
    return np.cos(3 * np.pi * theta) / theta


def minimaFunctionDerivative(theta):
    """Analytic derivative dJ/dtheta of minimaFunction."""
    const1 = 3 * np.pi
    const2 = const1 * theta
    return -(const1 * np.sin(const2) / theta) - np.cos(const2) / theta**2


theta = .6
iterations = 45
learningRate = .0007
optimizedParameters = optimize(iterations,
                               minimaFunction,
                               minimaFunctionDerivative,
                               theta,
                               learningRate)

SGD 也适用于多变量参数空间的情况。我们可以将二维函数绘制成等高线图。在这里你可以看到 SGD 对一个不对称的碗形函数同样有效。

import numpy as np
import scipy.stats
# NOTE: unused matplotlib imports from the original snippet are dropped;
# no plotting code appears in this snippet.


def minimaFunction(params):
    """Negative scaled bivariate normal surface J(x, y) = -40 * N(x;0,3) * N(y;0,.5).

    BUG FIX: the original called matplotlib.mlab.bivariate_normal, which was
    removed from matplotlib in 3.1.  With zero correlation the bivariate pdf
    is the product of the two marginal pdfs, computed here with scipy.
    """
    X, Y = params
    sigma11, sigma12, mu11, mu12 = (3.0, .5, 0.0, 0.0)
    Z1 = scipy.stats.norm.pdf(X, mu11, sigma11) * scipy.stats.norm.pdf(Y, mu12, sigma12)
    Z = Z1
    return -40 * Z


def minimaFunctionDerivative(params):
    """Gradient surrogate (dX, dY) used to drive the descent.

    NOTE(review): relative to the exact gradient of minimaFunction this
    omits the -40 factor and the other marginal's pdf — kept as in the
    original article so the optimization trajectory is unchanged.
    """
    X, Y = params
    sigma11, sigma12, mu11, mu12 = (3.0, .5, 0.0, 0.0)
    dZ1X = -scipy.stats.norm.pdf(X, mu11, sigma11) * (mu11 - X) / sigma11**2
    dZ1Y = -scipy.stats.norm.pdf(Y, mu12, sigma12) * (mu12 - Y) / sigma12**2
    return (dZ1X, dZ1Y)


def optimize(iterations, oF, dOF, params, learningRate, beta):
    """Minimize a 2-D objective by plain SGD.

    Arguments:
        - iterations - the number of descent steps to take
        - oF - the objective function (unused here; kept for interface parity)
        - dOF - the derivative function of the objective function
        - params - tuple (x, y) of initial parameter values
        - learningRate - the step size
        - beta - unused in plain SGD; accepted for parity with the
          momentum/RMSProp variants

    Return:
        - list of (x, y) tuples, one per step, including the initial point
          (length iterations + 1)
    """
    oParams = [params]
    for i in range(iterations):
        # Gradient at the current point.
        dParams = dOF(params)
        # SGD update applied component-wise: p <- p - lr * dp.
        params = tuple([par - learningRate * dPar for dPar, par in zip(dParams, params)])
        oParams.append(params)
    # BUG FIX: in the original, this return sat inside the loop, so the
    # function returned after a single iteration.
    return oParams


iterations = 100
learningRate = 1
beta = .9
x, y = 4.0, 1.0
params = (x, y)
optimizedParameters = optimize(iterations,
                               minimaFunction,
                               minimaFunctionDerivative,
                               params,
                               learningRate,
                               beta)

β 和 ν_dw 值允许用户对 dJ(θ) 的前一个值和当前值进行加权，从而确定新的 θ 值。人们通常选择 β 的值来构造指数加权移动平均，如下所示：

β 参数的常用选择是 0.9。选择一个等于 1-1/t 的 β 值，相当于让用户更侧重于 ν_dw 最近的 t 个值。这种简单的改变可以使优化过程产生显著的效果！我们现在可以使用更大的学习率，并在尽可能短的时间内收敛！

import numpy as np
import scipy.stats
# NOTE: unused matplotlib imports from the original snippet are dropped;
# no plotting code appears in this snippet.


def minimaFunction(params):
    """Negative scaled bivariate normal surface J(x, y) = -40 * N(x;0,3) * N(y;0,.5).

    BUG FIX: the original called matplotlib.mlab.bivariate_normal, which was
    removed from matplotlib in 3.1.  With zero correlation the bivariate pdf
    is the product of the two marginal pdfs, computed here with scipy.
    """
    X, Y = params
    sigma11, sigma12, mu11, mu12 = (3.0, .5, 0.0, 0.0)
    Z1 = scipy.stats.norm.pdf(X, mu11, sigma11) * scipy.stats.norm.pdf(Y, mu12, sigma12)
    Z = Z1
    return -40 * Z


def minimaFunctionDerivative(params):
    """Gradient surrogate (dX, dY) used to drive the descent.

    NOTE(review): relative to the exact gradient of minimaFunction this
    omits the -40 factor and the other marginal's pdf — kept as in the
    original article so the optimization trajectory is unchanged.
    """
    X, Y = params
    sigma11, sigma12, mu11, mu12 = (3.0, .5, 0.0, 0.0)
    dZ1X = -scipy.stats.norm.pdf(X, mu11, sigma11) * (mu11 - X) / sigma11**2
    dZ1Y = -scipy.stats.norm.pdf(Y, mu12, sigma12) * (mu12 - Y) / sigma12**2
    return (dZ1X, dZ1Y)


def optimize(iterations, oF, dOF, params, learningRate, beta):
    """Minimize a 2-D objective by SGD with momentum.

    Arguments:
        - iterations - the number of descent steps to take
        - oF - the objective function (unused here; kept for interface parity)
        - dOF - the derivative function of the objective function
        - params - tuple (x, y) of initial parameter values
        - learningRate - the step size
        - beta - exponential-moving-average coefficient for the momentum term

    Return:
        - list of (x, y) tuples, one per step, including the initial point
          (length iterations + 1)
    """
    oParams = [params]
    vdw = (0.0, 0.0)
    for i in range(iterations):
        # Gradient at the current point.
        dParams = dOF(params)
        # Momentum update: vdw <- beta*vdw + (1-beta)*dp
        # (BUG FIX: the original comment said (1.0+beta); the code is correct).
        vdw = tuple([vDW * beta + (1.0 - beta) * dPar for dPar, vDW in zip(dParams, vdw)])
        # Parameter update uses the smoothed gradient: p <- p - lr * vdw.
        params = tuple([par - learningRate * dPar for dPar, par in zip(vdw, params)])
        oParams.append(params)
    # BUG FIX: in the original, this return sat inside the loop, so the
    # function returned after a single iteration.
    return oParams


iterations = 100
learningRate = 5.3
beta = .9
x, y = 4.0, 1.0
params = (x, y)
optimizedParameters = optimize(iterations,
                               minimaFunction,
                               minimaFunctionDerivative,
                               params,
                               learningRate,
                               beta)

RMSProp

import numpy as np
import scipy.stats
# NOTE: unused matplotlib imports from the original snippet are dropped;
# no plotting code appears in this snippet.


def minimaFunction(params):
    """Negative scaled bivariate normal surface J(x, y) = -40 * N(x;0,3) * N(y;0,.5).

    BUG FIX: the original called matplotlib.mlab.bivariate_normal, which was
    removed from matplotlib in 3.1.  With zero correlation the bivariate pdf
    is the product of the two marginal pdfs, computed here with scipy.
    """
    X, Y = params
    sigma11, sigma12, mu11, mu12 = (3.0, .5, 0.0, 0.0)
    Z1 = scipy.stats.norm.pdf(X, mu11, sigma11) * scipy.stats.norm.pdf(Y, mu12, sigma12)
    Z = Z1
    return -40 * Z


def minimaFunctionDerivative(params):
    """Gradient surrogate (dX, dY) used to drive the descent.

    NOTE(review): relative to the exact gradient of minimaFunction this
    omits the -40 factor and the other marginal's pdf — kept as in the
    original article so the optimization trajectory is unchanged.
    """
    X, Y = params
    sigma11, sigma12, mu11, mu12 = (3.0, .5, 0.0, 0.0)
    dZ1X = -scipy.stats.norm.pdf(X, mu11, sigma11) * (mu11 - X) / sigma11**2
    dZ1Y = -scipy.stats.norm.pdf(Y, mu12, sigma12) * (mu12 - Y) / sigma12**2
    return (dZ1X, dZ1Y)


def optimize(iterations, oF, dOF, params, learningRate, beta):
    """Minimize a 2-D objective with RMSProp.

    Arguments:
        - iterations - the number of descent steps to take
        - oF - the objective function (unused here; kept for interface parity)
        - dOF - the derivative function of the objective function
        - params - tuple (x, y) of initial parameter values
        - learningRate - the step size
        - beta - exponential-moving-average coefficient for the squared gradients

    Return:
        - list of (x, y) tuples, one per step, including the initial point
          (length iterations + 1)
    """
    oParams = [params]
    sdw = (0.0, 0.0)
    eps = 10**(-7)  # guards against division by zero when sdw is tiny
    for i in range(iterations):
        # Gradient at the current point.
        dParams = dOF(params)
        # RMS accumulator: sdw <- beta*sdw + (1-beta)*dp**2
        # (BUG FIX: the original comment said (1.0+beta); the code is correct).
        sdw = tuple([sDW * beta + (1.0 - beta) * dPar**2 for dPar, sDW in zip(dParams, sdw)])
        # Parameter update scaled by the root of the RMS accumulator:
        # p <- p - lr * dp / (sqrt(sdw) + eps).
        params = tuple([par - learningRate * dPar / ((sDW**.5) + eps)
                        for sDW, par, dPar in zip(sdw, params, dParams)])
        oParams.append(params)
    # BUG FIX: in the original, this return sat inside the loop, so the
    # function returned after a single iteration.
    return oParams


iterations = 10
learningRate = .3
beta = .9
x, y = 5.0, 1.0
params = (x, y)
optimizedParameters = optimize(iterations,
                               minimaFunction,
                               minimaFunctionDerivative,
                               params,
                               learningRate,
                               beta)

import numpy as np
import scipy.stats
# NOTE: unused matplotlib imports from the original snippet are dropped;
# no plotting code appears in this snippet.


def minimaFunction(params):
    """Negative scaled bivariate normal surface J(x, y) = -40 * N(x;0,3) * N(y;0,.5).

    BUG FIX: the original called matplotlib.mlab.bivariate_normal, which was
    removed from matplotlib in 3.1.  With zero correlation the bivariate pdf
    is the product of the two marginal pdfs, computed here with scipy.
    """
    X, Y = params
    sigma11, sigma12, mu11, mu12 = (3.0, .5, 0.0, 0.0)
    Z1 = scipy.stats.norm.pdf(X, mu11, sigma11) * scipy.stats.norm.pdf(Y, mu12, sigma12)
    Z = Z1
    return -40 * Z


def minimaFunctionDerivative(params):
    """Gradient surrogate (dX, dY) used to drive the descent.

    NOTE(review): relative to the exact gradient of minimaFunction this
    omits the -40 factor and the other marginal's pdf — kept as in the
    original article so the optimization trajectory is unchanged.
    """
    X, Y = params
    sigma11, sigma12, mu11, mu12 = (3.0, .5, 0.0, 0.0)
    dZ1X = -scipy.stats.norm.pdf(X, mu11, sigma11) * (mu11 - X) / sigma11**2
    dZ1Y = -scipy.stats.norm.pdf(Y, mu12, sigma12) * (mu12 - Y) / sigma12**2
    return (dZ1X, dZ1Y)


def optimize(iterations, oF, dOF, params, learningRate, beta1, beta2):
    """Minimize a 2-D objective with ADAM (momentum + RMSProp + bias correction).

    Arguments:
        - iterations - the number of descent steps to take
        - oF - the objective function (unused here; kept for interface parity)
        - dOF - the derivative function of the objective function
        - params - tuple (x, y) of initial parameter values
        - learningRate - the step size
        - beta1 - moving-average coefficient for the momentum component
        - beta2 - moving-average coefficient for the RMSProp component

    Return:
        - list of (x, y) tuples, one per step, including the initial point
          (length iterations + 1)
    """
    oParams = [params]
    vdw = (0.0, 0.0)
    sdw = (0.0, 0.0)
    eps = 10**(-7)  # guards against division by zero when sdw is tiny
    for i in range(iterations):
        # Gradient at the current point.
        dParams = dOF(params)
        # Momentum accumulator: vdw <- beta1*vdw + (1-beta1)*dp.
        vdw = tuple([vDW * beta1 + (1.0 - beta1) * dPar for dPar, vDW in zip(dParams, vdw)])
        # RMS accumulator: sdw <- beta2*sdw + (1-beta2)*dp**2.
        sdw = tuple([sDW * beta2 + (1.0 - beta2) * dPar**2.0 for dPar, sDW in zip(dParams, sdw)])
        # Bias correction compensates for the zero initialization of the
        # accumulators during the first steps.
        vdwCorr = tuple([vDW / (1.0 - beta1**(i + 1.0)) for vDW in vdw])
        sdwCorr = tuple([sDW / (1.0 - beta2**(i + 1.0)) for sDW in sdw])
        # ADAM update: p <- p - lr * vdwCorr / (sqrt(sdwCorr) + eps).
        # BUG FIX: the original unpacked the zip as (sdwCORR, vdwCORR, par),
        # swapping the momentum and RMS terms so that the update divided the
        # RMS estimate by the square root of the momentum estimate.
        params = tuple([par - learningRate * vdwCORR / ((sdwCORR**.5) + eps)
                        for vdwCORR, sdwCORR, par in zip(vdwCorr, sdwCorr, params)])
        oParams.append(params)
    # BUG FIX: in the original, this return sat inside the loop, so the
    # function returned after a single iteration.
    return oParams


iterations = 100
learningRate = .1
beta1 = .9
beta2 = .999
x, y = 5.0, 1.0
params = (x, y)
optimizedParameters = optimize(iterations,
                               minimaFunction,
                               minimaFunctionDerivative,
                               params,
                               learningRate,
                               beta1,
                               beta2)

– SGD: 100 次迭代

– SGD+Momentum: 50 次迭代

– RMSProp: 10 次迭代