Auto Byte

Science AI

# 利用TensorFlow和神经网络来处理文本分类问题

1. TensorFlow 如何工作
2. 机器学习模型是什么
3. 神经网络是什么
4. 神经网络怎样进行学习
5. 如何处理数据并且把它们传输给神经网络的输入
6. 怎样运行模型并且得到预测结果

### 1. TensorFlow 概览

TensorFlow 是最流行的开源 AI 库之一。它的高计算效率，丰富的开发资源使它被企业和个人开发者广泛采用。在我看来，学习 TensorFlow 的最好的方法就是使用它的官网教程（https://www.tensorflow.org/）。在这个网站上，你可以浏览「getting started」教程。

``````#import the library
import tensorflow as tf
#build the graph and name as my_graph
my_graph = tf.Graph()
#tf.Session encapsulate the environment for my_graph
with my_graph.as_default():
x = tf.constant([1,3,6])
y = tf.constant([1,1,1])
#run it by fetches
result = sess.run(fetches=op)
#print it
print(result)``````

### 3. 神经网络

#### 神经网络结构

f(x) = max(0,x)（输出是 x 或 0，无论 x 多大）

``````# Network Parameters
n_hidden_1 = 10        # 1st layer number of features
n_hidden_2 = 5         # 2nd layer number of features
n_input = total_words  # Words in vocab
n_classes = 3          # Categories: graphics, space and baseball
def multilayer_perceptron(input_tensor, weights, biases):
layer_1_multiplication = tf.matmul(input_tensor, weights['h1'])
# Hidden layer with RELU activation
layer_2_multiplication = tf.matmul(layer_1_activation, weights['h2'])
# Output layer with linear activation
out_layer_multiplication = tf.matmul(layer_2_activation, weights['out'])

### 4. 神经网络是如何训练的

``````# Construct model
prediction = multilayer_perceptron(input_tensor, weights, biases)
# Define loss
entropy_loss = tf.nn.softmax_cross_entropy_with_logits(logits=prediction, labels=output_tensor)
loss = tf.reduce_mean(entropy_loss)``````

``````learning_rate = 0.001
# Construct model
prediction = multilayer_perceptron(input_tensor, weights, biases)
# Define loss
entropy_loss = tf.nn.softmax_cross_entropy_with_logits(logits=prediction, labels=output_tensor)
loss = tf.reduce_mean(entropy_loss)

### 5. 数据操作

``````import numpy as np    #numpy is a package for scientific computing
from collections import Counter
vocab = Counter()
text = "Hi from Brazil"
#Get all words
for word in text.split(' '):
vocab[word]+=1

#Convert words to indexes
def get_word_2_index(vocab):
word2index = {}
for i,word in enumerate(vocab):
word2index[word] = i

return word2index
#Now we have an index
word2index = get_word_2_index(vocab)
total_words = len(vocab)
#This is how we create a numpy array (our matrix)
matrix = np.zeros((total_words),dtype=float)
#Now we fill the values
for word in text.split():
matrix[word2index[word]] += 1
print(matrix)
>>> [ 1.  1.  1.]``````

Python 中的 Counter() 是一个哈希表。当输入是「Hi from Brazil」时，矩阵是 [1 ,1, 1]。如果输入不同，比如「Hi」，矩阵会得到不同的结果：

``````matrix = np.zeros((total_words),dtype=float)
text = "Hi"
for word in text.split():
matrix[word2index[word.lower()]] += 1
print(matrix)
>>> [ 1.  0.  0.]``````

### 6. 运行模型，获得结果

``````from sklearn.datasets import fetch_20newsgroups
categories = ["comp.graphics","sci.space","rec.sport.baseball"]
newsgroups_train = fetch_20newsgroups(subset='train', categories=categories)
newsgroups_test = fetch_20newsgroups(subset='test', categories=categories)``````

``````n_input = total_words # Words in vocab
n_classes = 3         # Categories: graphics, sci.space and baseball
input_tensor = tf.placeholder(tf.float32,[None, n_input],name="input")
output_tensor = tf.placeholder(tf.float32,[None, n_classes],name="output")``````

``````training_epochs = 10
# Launch the graph
with tf.Session() as sess:
sess.run(init) #inits the variables (normal distribution, remember?)
# Training cycle
for epoch in range(training_epochs):
avg_cost = 0.
total_batch = int(len(newsgroups_train.data)/batch_size)
# Loop over all batches
for i in range(total_batch):
batch_x,batch_y = get_batch(newsgroups_train,i,batch_size)
# Run optimization op (backprop) and cost op (to get loss value)
c,_ = sess.run([loss,optimizer], feed_dict={input_tensor: batch_x, output_tensor:batch_y})``````

``````    # Test model
index_prediction = tf.argmax(prediction, 1)
index_correct = tf.argmax(output_tensor, 1)
correct_prediction = tf.equal(index_prediction, index_correct)
# Calculate accuracy
accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))
total_test_data = len(newsgroups_test.target)
batch_x_test,batch_y_test = get_batch(newsgroups_test,0,total_test_data)
print("Accuracy:", accuracy.eval({input_tensor: batch_x_test, output_tensor: batch_y_test}))``````

### 结论

``````# if you didn't download the twenty_newsgroups datasets, it will run with error
# this logging can help to solve the error
import logging
logging.basicConfig()``````

``````import pandas as pd
import numpy as np
import tensorflow as tf
from collections import Counter
from sklearn.datasets import fetch_20newsgroups
# if you didn't download the twenty_newsgroups datasets, it will run with error
# this logging can help to solve the error
import logging
logging.basicConfig()

categories = ["comp.graphics","sci.space","rec.sport.baseball"]
newsgroups_train = fetch_20newsgroups(subset='train', categories=categories)
newsgroups_test = fetch_20newsgroups(subset='test', categories=categories)

print('total texts in train:',len(newsgroups_train.data))
print('total texts in test:',len(newsgroups_test.data))

vocab = Counter()
for text in newsgroups_train.data:
for word in text.split(' '):
vocab[word.lower()]+=1

for text in newsgroups_test.data:
for word in text.split(' '):
vocab[word.lower()]+=1

total_words = len(vocab)
def get_word_2_index(vocab):
word2index = {}
for i,word in enumerate(vocab):
word2index[word.lower()] = i

return word2index

word2index = get_word_2_index(vocab)

def get_batch(df,i,batch_size):
batches = []
results = []
texts = df.data[i*batch_size:i*batch_size+batch_size]
categories = df.target[i*batch_size:i*batch_size+batch_size]
for text in texts:
layer = np.zeros(total_words,dtype=float)
for word in text.split(' '):
layer[word2index[word.lower()]] += 1

batches.append(layer)

for category in categories:
y = np.zeros((3),dtype=float)
if category == 0:
y[0] = 1.
elif category == 1:
y[1] = 1.
else:
y[2] = 1.
results.append(y)

return np.array(batches),np.array(results)

# Parameters
learning_rate = 0.01
training_epochs = 10
batch_size = 150
display_step = 1

# Network Parameters
n_hidden_1 = 100      # 1st layer number of features
n_hidden_2 = 100       # 2nd layer number of features
n_input = total_words # Words in vocab
n_classes = 3         # Categories: graphics, sci.space and baseball

input_tensor = tf.placeholder(tf.float32,[None, n_input],name="input")
output_tensor = tf.placeholder(tf.float32,[None, n_classes],name="output")

def multilayer_perceptron(input_tensor, weights, biases):
layer_1_multiplication = tf.matmul(input_tensor, weights['h1'])

# Hidden layer with RELU activation
layer_2_multiplication = tf.matmul(layer_1, weights['h2'])

# Output layer
out_layer_multiplication = tf.matmul(layer_2, weights['out'])

# Store layers weight & bias
weights = {
'h1': tf.Variable(tf.random_normal([n_input, n_hidden_1])),
'h2': tf.Variable(tf.random_normal([n_hidden_1, n_hidden_2])),
'out': tf.Variable(tf.random_normal([n_hidden_2, n_classes]))
}
biases = {
'b1': tf.Variable(tf.random_normal([n_hidden_1])),
'b2': tf.Variable(tf.random_normal([n_hidden_2])),
'out': tf.Variable(tf.random_normal([n_classes]))
}

# Construct model
prediction = multilayer_perceptron(input_tensor, weights, biases)

# Define loss and optimizer
loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=prediction, labels=output_tensor))

# Initializing the variables
init = tf.initialize_all_variables()

# Launch the graph
with tf.Session() as sess:
sess.run(init)

# Training cycle
for epoch in range(training_epochs):
avg_cost = 0.
total_batch = int(len(newsgroups_train.data)/batch_size)
# Loop over all batches
for i in range(total_batch):
batch_x,batch_y = get_batch(newsgroups_train,i,batch_size)
# Run optimization op (backprop) and cost op (to get loss value)
c,_ = sess.run([loss,optimizer], feed_dict={input_tensor: batch_x,output_tensor:batch_y})
# Compute average loss
avg_cost += c / total_batch
# Display logs per epoch step
if epoch % display_step == 0:
print("Epoch:", '%04d' % (epoch+1), "loss=", \
"{:.9f}".format(avg_cost))
print("Optimization Finished!")

# Test model
correct_prediction = tf.equal(tf.argmax(prediction, 1), tf.argmax(output_tensor, 1))
# Calculate accuracy
accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))
total_test_data = len(newsgroups_test.target)
batch_x_test,batch_y_test = get_batch(newsgroups_test,0,total_test_data)
print("Accuracy:", accuracy.eval({input_tensor: batch_x_test, output_tensor: batch_y_test}))``````

[1] https://stats.stackexchange.com/questions/63152/what-does-the-hidden-layer-in-a-neural-network-compute

[2] http://stackoverflow.com/questions/2480650/role-of-bias-in-neural-networks