# 《神经网络和深度学习》系列文章七：实现我们的神经网络来分类数字

## 目录

1、使用神经网络识别手写数字

• 感知机
• Sigmoid（S型）神经元
• 神经网络的结构
• 用简单的网络结构解决手写数字识别
• 通过梯度下降法学习参数
• 实现我们的网络来分类数字
• 关于深度学习

2、反向传播算法是如何工作的

3、改进神经网络的学习方法

4、神经网络能够计算任意函数的视觉证明

5、为什么深度神经网络的训练是困难的

### 实现我们的神经网络来分类数字

[1] 如前所述，MNIST数据集基于NIST（美国国家标准与技术研究院）收集的两个数据集。为了构建MNIST，Yann LeCun、Corinna Cortes和Christopher J. C. Burges对NIST数据集进行了精简，并将其整理成一种更方便的格式。更多细节请看这个链接。我的仓库中的数据集采用了一种更便于在Python中加载和操作MNIST数据的格式。我从蒙特利尔大学的LISA机器学习实验室获得了这种特殊格式的数据（链接）。

class Network(object):
    """A feedforward neural network with randomly initialised parameters."""

    def __init__(self, sizes):
        """Create the network's biases and weights.

        ``sizes`` lists the number of neurons in each layer, input layer
        first.  For example ``[2, 3, 1]`` describes a three-layer network
        with 2 input neurons, 3 hidden neurons and 1 output neuron.

        Biases and weights are drawn from ``np.random.randn`` (standard
        normal distribution).  No biases are created for the first layer:
        ``self.biases[i]`` has shape ``(sizes[i + 1], 1)`` and
        ``self.weights[i]`` has shape ``(sizes[i + 1], sizes[i])``.
        """
        self.num_layers = len(sizes)
        self.sizes = sizes
        # One column vector of biases per non-input layer.
        self.biases = [np.random.randn(y, 1) for y in sizes[1:]]
        # One matrix per pair of adjacent layers; rows index the later layer.
        self.weights = [np.random.randn(y, x)
                        for x, y in zip(sizes[:-1], sizes[1:])]

net = Network([2, 3, 1])

#### 练习

def sigmoid(z):
    """Return the logistic sigmoid ``1 / (1 + exp(-z))`` of ``z``.

    ``z`` is passed straight to ``np.exp``, so it may be a scalar or a
    numpy array; for an array the function is applied elementwise.
    """
    decay = np.exp(-z)
    return 1.0 / (1.0 + decay)

def feedforward(self, a):
    """Return the output of the network if "a" is input.

    The input is propagated layer by layer: for each (bias, weight) pair
    the activation becomes ``sigmoid(np.dot(w, a) + b)``.  The activation
    produced by the last pair is returned.
    """
    for b, w in zip(self.biases, self.weights):
        a = sigmoid(np.dot(w, a) + b)
    return a

def SGD(self, training_data, epochs, mini_batch_size, eta,
        test_data=None):
    """Train the neural network using mini-batch stochastic
    gradient descent.

    The "training_data" is a list of tuples "(x, y)" representing the
    training inputs and the desired outputs.  It is shuffled IN PLACE at
    the start of every epoch.  "epochs" is the number of passes over the
    data, "mini_batch_size" the number of examples per update step, and
    "eta" the learning rate.

    If "test_data" is provided then the network will be evaluated
    against the test data after each epoch, and partial progress printed
    out.  This is useful for tracking progress, but slows things down
    substantially.
    """
    if test_data:
        n_test = len(test_data)
    n = len(training_data)
    # range() and print(...) with one argument behave the same on
    # Python 2 and Python 3, unlike xrange and the print statement.
    for j in range(epochs):
        random.shuffle(training_data)
        mini_batches = [
            training_data[k:k + mini_batch_size]
            for k in range(0, n, mini_batch_size)]
        for mini_batch in mini_batches:
            self.update_mini_batch(mini_batch, eta)
        # Progress is reported once per epoch, after all mini-batches.
        if test_data:
            print("Epoch {0}: {1} / {2}".format(
                j, self.evaluate(test_data), n_test))
        else:
            print("Epoch {0} complete".format(j))

training_data是一个由元组(x, y)组成的列表，表示训练输入和对应的期望输出。变量epochs和mini_batch_size分别是你期望的训练迭代期（epoch）数量和取样时所用的小批量数据的大小。eta是学习率η。如果提供了可选参数test_data，那么程序将会在每个训练迭代期结束之后评估网络，并输出我们的局部进展。这对于跟踪进展是有用的，但是会大幅度降低速度。

def update_mini_batch(self, mini_batch, eta):
    """Update the network's weights and biases by applying gradient
    descent using backpropagation to a single mini batch.

    The "mini_batch" is a list of tuples "(x, y)", and "eta" is the
    learning rate.  The gradients returned by ``self.backprop`` for each
    example are summed, and every parameter is then moved by
    ``eta / len(mini_batch)`` times the summed gradient.  An empty
    mini batch leaves the parameters untouched.
    """
    batch_size = len(mini_batch)
    if batch_size == 0:
        # Nothing to average over; avoid dividing by zero below.
        return
    nabla_b = [np.zeros(b.shape) for b in self.biases]
    nabla_w = [np.zeros(w.shape) for w in self.weights]
    for x, y in mini_batch:
        delta_nabla_b, delta_nabla_w = self.backprop(x, y)
        nabla_b = [nb + dnb for nb, dnb in zip(nabla_b, delta_nabla_b)]
        nabla_w = [nw + dnw for nw, dnw in zip(nabla_w, delta_nabla_w)]
    # float() keeps this a true division on Python 2 even when eta is
    # an int (3 / 10 would otherwise truncate to 0).
    step = eta / float(batch_size)
    self.weights = [w - step * nw
                    for w, nw in zip(self.weights, nabla_w)]
    self.biases = [b - step * nb
                   for b, nb in zip(self.biases, nabla_b)]

delta_nabla_b, delta_nabla_w = self.backprop(x, y)

""" network.py \~~~~~~~~~~  A module to implement the stochastic gradient descent learning algorithm for a feedforward neural network.  Gradients are calculated using backpropagation.  Note that I have focused on making the code simple, easily readable, and easily modifiable.  It is not optimized, and omits many desirable features. """#### Libraries# Standard libraryimport random# Third-party librariesimport numpy as npclass Network(object):def __init__(self, sizes):    """The list sizes contains the number of neurons in the respective layers of the network.  For example, if the    list was [2, 3, 1] then it would be a three-layer network,   with the first layer containing 2 neurons, the second layer 3 neurons, and the third layer 1 neuron.  The biases and      weights for the network are initialized randomly, using a    Gaussian distribution with mean 0, and variance 1.  Note thatthe first layer is assumed to be an input layer, and by      convention we won't set any biases for those neurons, since  biases are only ever used in computing the outputs from later layers."""                self.num_layers = len(sizes)    self.sizes = sizes    self.biases = [np.random.randn(y, 1) for y in sizes[1:]]    self.weights = [np.random.randn(y, x) for x, y in zip(sizes[:-1], sizes[1:])]       def feedforward(self, a):        """Return the output of the network if a is input."""             for b, w in zip(self.biases, self.weights):        a = sigmoid(np.dot(w, a)+b)    return a    def SGD(self, training_data, epochs, mini_batch_size, eta,   test_data=None):        """Train the neural network using mini-batch stochastic      gradient descent.  The training_data is a list of tuples (x, y) representing the training inputs and the desired  outputs.  The other non-optional parameters are              self-explanatory. If test_data is provided then the      network will be evaluated against the test data after each   epoch, and partial progress printed out.  
This is useful for tracking progress, but slows things down substantially."""    if test_data: n_test = len(test_data)    n = len(training_data)    for j in xrange(epochs):        random.shuffle(training_data)        mini_batches = [training_data[k:k+mini_batch_size] for k in xrange(0, n, mini_batch_size)]        for mini_batch in mini_batches:            self.update_mini_batch(mini_batch, eta)                if test_data:                    print "Epoch {0}: {1} / {2}".format(j, self.evaluate(test_data), n_test)                else:                    print "Epoch {0} complete".format(j)                      def update_mini_batch(self, mini_batch, eta):"""Update the network's weights and biases by applying       gradient descent using backpropagation to a single mini batch. The mini_batch is a list of tuples (x, y), and     eta is the learning rate."""    nabla_b = [np.zeros(b.shape) for b in self.biases]    nabla_w = [np.zeros(w.shape) for w in self.weights]    for x, y in mini_batch:        delta_nabla_b, delta_nabla_w = self.backprop(x, y)        nabla_b = [nb+dnb for nb, dnb in zip(nabla_b, delta_nabla_b)]        nabla_w = [nw+dnw for nw, dnw in zip(nabla_w, delta_nabla_w)]        self.weights = [w-(eta/len(mini_batch))*nw for w, nw in zip(self.weights, nabla_w)]        self.biases = [b-(eta/len(mini_batch))*nb for b, nb in zip(self.biases, nabla_b)]        def backprop(self, x, y):"""Return a tuple (nabla_b, nabla_w) representing the gradient for the cost function C_x.  
nabla_b and nabla_w are layer-by-layer lists of numpy arrays, similarto self.biases and self.weights."""     nabla_b = [np.zeros(b.shape) for b in self.biases]    nabla_w = [np.zeros(w.shape) for w in self.weights]    # feedforward    activation = x    activations = [x]    # list to store all the activations, layer by layer    zs = []    # list to store all the z vectors, layer by layer    for b, w in zip(self.biases, self.weights):    z = np.dot(w, activation)+b        zs.append(z)        activation = sigmoid(z)        activations.append(activation)    # backward pass    delta = self.cost_derivative(activations[-1], y) * \    sigmoid_prime(zs[-1])    nabla_b[-1] = delta    nabla_w[-1] = np.dot(delta, activations[-2].transpose())    # Note that the variable l in the loop below is used a little    # differently to the notation in Chapter 2 of the book.  Here,    # l = 1 means the last layer of neurons, l = 2 is the    # second-last layer, and so on. It's a renumbering of the    # scheme in the book, used here to take advantage of the fact    # that Python can use negative indices in lists.    for l in xrange(2, self.num_layers):    z = zs[-l]     sp = sigmoid_prime(z)     delta = np.dot(self.weights[-l+1].transpose(), delta) *  sp     nabla_b[-l] = delta     nabla_w[-l]= np.dot(delta, activations[-l-1].transpose())            return (nabla_b, nabla_w)    def evaluate(self, test_data):"""Return the number of test inputs for which the neural     network outputs the correct result. 
Note that the neural     network's output is assumed to be the index of whichever     neuron in the final layer has the highest activation."""             test_results = [(np.argmax(self.feedforward(x)), y)  for (x, y) in test_data]     return sum(int(x == y) for (x, y) in test_results)      def cost_derivative(self, output_activations, y):     """Return the vector of partial derivatives \partial C_x /    \partial a for the output activations."""            return (output_activations-y)#### Miscellaneous functionsdef sigmoid(z):    """The sigmoid function."""     return 1.0/(1.0+np.exp(-z))def sigmoid_prime(z):     """Derivative of the sigmoid function."""    return sigmoid(z)*(1-sigmoid(z))

>>> import network
>>> net = network.Network([784, 30, 10])

>>> net.SGD(training_data, 30, 10, 3.0, test_data=test_data)

>>> net = network.Network([784, 100, 10])
>>> net.SGD(training_data, 30, 10, 3.0, test_data=test_data)

>>> net = network.Network([784, 100, 10])
>>> net.SGD(training_data, 30, 10, 0.001, test_data=test_data)

>>> net = network.Network([784, 30, 10])
>>> net.SGD(training_data, 30, 10, 100.0, test_data=test_data)