Auto Byte

Science AI

# 从最小二乘到DNN：六段代码了解深度学习简史

``floyd run --data emilwallner/datasets/mnist/1:mnist --tensorboard --mode jupyter``

# y = mx + b
# m is slope, b is y-intercept
def compute_error_for_line_given_points(b, m, coordinates):
    """Return the mean squared error of the line y = m*x + b over the points.

    b: y-intercept of the candidate line.
    m: slope of the candidate line.
    coordinates: iterable of (x, y) pairs.

    Returns 0.0 for an empty point set (nothing to be wrong about).
    """
    if not coordinates:
        return 0.0
    # Sum of squared vertical distances between each point and the line.
    total_error = sum((y - (m * x + b)) ** 2 for x, y in coordinates)
    # Mean squared error — the quantity Legendre's least-squares method minimizes.
    return total_error / float(len(coordinates))

Legendre 的手动尝试减少误差率的方法比较费时。一个世纪后，来自荷兰的诺贝尔奖获得者 Peter Debye 创造了一种解决方案（1909, Debye）。

Peter Debye 注意到最小值左侧的斜率是负值，右侧是正值。因此，只要知道任意 X 处的斜率，就可以沿斜率下降的方向逐步逼近使 Y 最小的那个 X。

Debye 的数学片段也已经翻译成了 Python：

current_x = 0.5  # the algorithm starts at x=0.5
learning_rate = 0.01  # step size multiplier
num_iterations = 60  # the number of times to train the function

# The derivative of the error function (x**4 = the power of 4 or x^4).
def slope_at_given_x_value(x):
    """Return the slope 5x^4 - 6x^2 of the error curve at x."""
    return 5 * x**4 - 6 * x**2

# Move X to the right or left depending on the slope of the error function:
# a negative slope pushes x right, a positive slope pushes it left, so x
# walks downhill toward the local minimum.
for i in range(num_iterations):
    previous_x = current_x
    current_x += -learning_rate * slope_at_given_x_value(previous_x)
    print(previous_x)

print("The local minimum occurs at %f" % current_x)

# Price of wheat/kg and the average price of bread.
wheat_and_bread = [[0.5, 5], [0.6, 5.5], [0.8, 6], [1.1, 6.8], [1.4, 7]]

def step_gradient(b_current, m_current, points, learningRate):
    """Perform one gradient-descent step for the line y = m*x + b.

    b_current, m_current: current intercept and slope.
    points: list of (x, y) pairs.
    learningRate: step size multiplier.

    Returns [new_b, new_m] after moving both parameters one step
    against the gradient of the mean squared error.
    """
    # Gradients must start at zero each step (the original snippet
    # accumulated into uninitialized names, raising NameError).
    b_gradient = 0.0
    m_gradient = 0.0
    N = float(len(points))
    for x, y in points:
        # Partial derivatives of MSE with respect to b and m.
        b_gradient += -(2 / N) * (y - ((m_current * x) + b_current))
        m_gradient += -(2 / N) * x * (y - ((m_current * x) + b_current))
    new_b = b_current - (learningRate * b_gradient)
    new_m = m_current - (learningRate * m_gradient)
    return [new_b, new_m]

def gradient_descent_runner(points, starting_b, starting_m, learning_rate, num_iterations):
    """Run num_iterations gradient steps and return the fitted [b, m]."""
    b = starting_b
    m = starting_m
    for i in range(num_iterations):
        b, m = step_gradient(b, m, points, learning_rate)
    return [b, m]

from random import choice
from numpy import array, dot, random

# Heaviside step: fire (1) when the weighted sum is non-negative.
# (The original named this `1_or_0`, which is not a legal Python
# identifier — names cannot start with a digit.)
unit_step = lambda x: 0 if x < 0 else 1

# OR truth table; the constant third input acts as the bias term.
training_data = [
    (array([0, 0, 1]), 0),
    (array([0, 1, 1]), 1),
    (array([1, 0, 1]), 1),
    (array([1, 1, 1]), 1),
]

weights = random.rand(3)
errors = []
learning_rate = 0.2
num_iterations = 100

# Perceptron learning rule: nudge the weights toward each
# randomly sampled example by (truth - prediction) * input.
for i in range(num_iterations):
    example, truth = choice(training_data)
    result = dot(weights, example)
    error = truth - unit_step(result)
    errors.append(error)
    weights += learning_rate * error * example

# Report the trained perceptron's output on every row of the truth table.
for x, _ in training_data:
    result = dot(x, weights)
    print("{}: {} -> {}".format(x[:2], result, unit_step(result)))

Minsky 和 Papert 的书发表仅一年后，一个芬兰硕士生发现了利用多层感知机解非线性问题的理论 (Linnainmaa, 1970)。由于当时主流对感知机持批评态度，对 AI 的投资枯竭了十多年。这就是众所周知的 AI 的第一个冬天。

Minsky 和 Papert 对感知机的有力批评在于 XOR（异或）问题的讨论上。XOR 的逻辑与 OR 类似，区别在于：当两个输入均为真时（1 & 1），OR 返回真（1），而 XOR 返回假（0）。

「我们为类神经元单元组成的网络加入了新的学习过程，反向传播。反向传播不断重复调整网络连接的权重，以最小化网络的实际输出向量和期望输出向量之间的差异。经过权重调整之后，网络内部的隐藏单元（不包括输入层和输出层）可以表示任务范围的重要特征，并通过单元间的相互作用捕捉任务中的规律。神经网络创建有用的新特征的能力使其区别于早先提出的更简单的反向传播方法，比如感知机收敛过程（perceptron-convergence procedure）。」Nature 323, 533 - 536 (09 October 1986)

import numpy as np

# XOR truth table; the constant third column is the bias input.
X_XOR = np.array([[0, 0, 1], [0, 1, 1], [1, 0, 1], [1, 1, 1]])
y_truth = np.array([[0], [1], [1], [0]])

np.random.seed(1)  # fixed seed keeps the run deterministic
# Weights initialized uniformly in [-1, 1): 3 inputs -> 4 hidden -> 1 output.
syn_0 = 2 * np.random.random((3, 4)) - 1
syn_1 = 2 * np.random.random((4, 1)) - 1

def sigmoid(x):
    """Logistic activation, squashing x into (0, 1)."""
    output = 1 / (1 + np.exp(-x))
    return output

def sigmoid_output_to_derivative(output):
    """Derivative of the sigmoid expressed via its own output."""
    return output * (1 - output)

# Train with plain backpropagation: forward pass, then propagate the
# output error back through the hidden layer and update both weight
# matrices by the full (unscaled) gradient.
for j in range(60000):
    layer_1 = sigmoid(np.dot(X_XOR, syn_0))
    layer_2 = sigmoid(np.dot(layer_1, syn_1))
    error = layer_2 - y_truth
    layer_2_delta = error * sigmoid_output_to_derivative(layer_2)
    layer_1_error = layer_2_delta.dot(syn_1.T)
    layer_1_delta = layer_1_error * sigmoid_output_to_derivative(layer_1)
    syn_1 -= layer_1.T.dot(layer_2_delta)
    syn_0 -= X_XOR.T.dot(layer_1_delta)

print("Output After Training: \n", layer_2)

• GPU>Nvidia Tesla K80。该硬件通常用作图形处理，比起 CPUs，在深度学习中通常 GPU 要快 50-200 倍。
• CUDA>GPUs 的低级编程语言。
• CuDNN>Nvidia 优化 CUDA 的库。
• TFlearn>TensorFlow 的一个前端框架。

TFlearn 的实现：

from __future__ import division, print_function, absolute_import

import tflearn
from tflearn.layers.core import dropout, fully_connected
from tensorflow.examples.tutorials.mnist import input_data
from tflearn.layers.conv import conv_2d, max_pool_2d
from tflearn.layers.normalization import local_response_normalization
from tflearn.layers.estimator import regression

# Data loading and preprocessing: MNIST as flat 784-vectors with one-hot labels,
# reshaped to 28x28x1 images for the convolutional layers.
mnist = input_data.read_data_sets("/data/", one_hot=True)
X, Y, testX, testY = (mnist.train.images, mnist.train.labels,
                      mnist.test.images, mnist.test.labels)
X = X.reshape([-1, 28, 28, 1])
testX = testX.reshape([-1, 28, 28, 1])

# Building convolutional network: two conv/pool/normalization stages,
# two tanh fully-connected layers with dropout, then a 10-way softmax.
network = tflearn.input_data(shape=[None, 28, 28, 1], name='input')
network = conv_2d(network, 32, 3, activation='relu', regularizer="L2")
network = max_pool_2d(network, 2)
network = local_response_normalization(network)
network = conv_2d(network, 64, 3, activation='relu', regularizer="L2")
network = max_pool_2d(network, 2)
network = local_response_normalization(network)
network = fully_connected(network, 128, activation='tanh')
network = dropout(network, 0.8)
network = fully_connected(network, 256, activation='tanh')
network = dropout(network, 0.8)
network = fully_connected(network, 10, activation='softmax')
network = regression(network, optimizer='adam', learning_rate=0.01,
                     loss='categorical_crossentropy', name='target')

# Training for 20 epochs, validating against the held-out test split.
model = tflearn.DNN(network, tensorboard_verbose=0)
model.fit({'input': X}, {'target': Y}, n_epoch=20,
          validation_set=({'input': testX}, {'target': testY}),
          snapshot_step=100, show_metric=True, run_id='convnet_mnist')