GitHub: https://github.com/yjfiejd/Text_Classification_NN/blob/master/text_classification_6.28.py
# -*- coding:utf8 -*-
# @TIME : 2018/6/28 10:34 PM
# @Author : Allen
# @File : text_classification_6.28.py
import nltk
from nltk.stem.lancaster import LancasterStemmer
import os
import json
import datetime
stemmer = LancasterStemmer()
# three intent classes
training_data = []
training_data.append({"class":"greeting", "sentence":"how are you?"})
training_data.append({"class":"greeting", "sentence":"how is your day"})
training_data.append({"class":"greeting", "sentence":"good day"})
training_data.append({"class":"greeting", "sentence":"how is it going today?"})
training_data.append({"class":"goodbye", "sentence":"have a nice day"})
training_data.append({"class":"goodbye", "sentence":"see you later"})
training_data.append({"class":"goodbye", "sentence":"have a nice day"})
training_data.append({"class":"goodbye", "sentence":"talk to you soon"})
training_data.append({"class":"sandwich", "sentence":"make me a sandwich"})
training_data.append({"class":"sandwich", "sentence":"can you make a sandwich"})
training_data.append({"class":"sandwich", "sentence":"having a sandwich today"})
training_data.append({"class":"sandwich", "sentence":"what's for lunch?"})
# preprocessing
words = []
classes = []
documents = []
ignore_words = ['?']
for pattern in training_data:
    w = nltk.word_tokenize(pattern['sentence'])
    words.extend(w)
    documents.append((w, pattern['class']))
    if pattern['class'] not in classes:
        classes.append(pattern['class'])
words = [stemmer.stem(w.lower()) for w in words if w not in ignore_words]
words = list(set(words))
classes = list(set(classes))
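# Note: LancasterStemmer stems aggressively, which is why the vocabulary in the
# recorded output contains truncated stems such as 'ar' (are), 'hav' (have/having),
# 'lat' (later), 'mak' (make), 'nic' (nice) and 'yo' (your).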
print (len(documents), "documents")
print (len(classes), "classes", classes)
print (len(words), "unique stemmed words", words)
print("************")
print(documents)
print(classes)
print(words)
print("************")
# bag of words
training = []
output = []
output_empty = [0] * len(classes)
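# output_empty is a template one-hot row of length len(classes); the loop below
# sets the position of each document's class to 1 (e.g. with
# classes == ['greeting', 'sandwich', 'goodbye'] as in the recorded run, a
# 'greeting' sentence becomes [1, 0, 0]).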
for doc in documents:
    bag = []
    pattern_words = doc[0]
    pattern_words = [stemmer.stem(word.lower()) for word in pattern_words]
    for w in words:
        bag.append(1 if w in pattern_words else 0)
    training.append(bag)
    output_row = list(output_empty)
    output_row[classes.index(doc[1])] = 1
    output.append(output_row)
i = 0
w = documents[i][0]
print(w)
print([stemmer.stem(word.lower()) for word in w])
print(training[i])
print(output[i])
print("******下面的代码,实现了词袋处理并将输入句子转换为0、1数组******")
# Use sigmoid as the activation function and keep adjusting the weights until the error drops to an acceptable level
import numpy as np
import time
def sigmoid(x):
    output = 1/(1 + np.exp(-x))
    return output

def sigmoid_output_to_derivative(output):
    return output*(1-output)
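# Why output*(1-output)? For the logistic function s(x) = 1/(1+exp(-x)) the
# derivative is s'(x) = s(x)*(1-s(x)), so it can be computed from the activation
# alone without re-evaluating the exponential. Illustrative sanity check
# (not part of the original script):
#   eps = 1e-6
#   (sigmoid(0.5+eps) - sigmoid(0.5-eps)) / (2*eps)    # ~0.2350
#   sigmoid_output_to_derivative(sigmoid(0.5))         # same value, ~0.2350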
def clean_up_sentence(sentence):
    sentence_words = nltk.word_tokenize(sentence)
    sentence_words = [stemmer.stem(word.lower()) for word in sentence_words]
    return sentence_words
def bow(sentence, words, show_details=False):
    sentence_words = clean_up_sentence(sentence)
    bag = [0] * len(words)
    for s in sentence_words:
        for i, w in enumerate(words):
            if w == s:
                bag[i] = 1
                if show_details:
                    print("found in bag: %s" % w)
    return (np.array(bag))
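# Illustrative usage (hypothetical call; the exact 0/1 positions depend on the
# order of `words`, which varies between runs because the vocabulary is built
# from a set):
#   bow("make me a sandwich", words)
#   -> array of length len(words) with 1s at the indices of the stems
#      'mak', 'me', 'a' and 'sandwich', and 0s everywhere else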
def think(sentence, show_details=False):
    x = bow(sentence.lower(), words, show_details)
    if show_details:
        print ("sentence:", sentence, "\n bow:", x)
    # input layer is our bag of words
    l0 = x
    # matrix multiplication of input and hidden layer
    l1 = sigmoid(np.dot(l0, synapse_0))
    # output layer
    l2 = sigmoid(np.dot(l1, synapse_1))
    return l2
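# Shape check for the forward pass in the recorded run (26 stemmed words,
# 20 hidden neurons, 3 classes):
#   l0: (26,) -> l1 = sigmoid(l0 . synapse_0): (20,) -> l2 = sigmoid(l1 . synapse_1): (3,)
# so think() returns one score per class, in the same order as `classes`.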
print("******实现神经网络的训练函数来调整突触的权重******")
def train(X, y, hidden_neurons=10, alpha=1, epochs=50000, dropout=False, dropout_percent=0.5):
    print ("Training with %s neurons, alpha:%s, dropout:%s %s" % (hidden_neurons, str(alpha), dropout, dropout_percent if dropout else '') )
    print ("Input matrix: %sx%s Output matrix: %sx%s" % (len(X), len(X[0]), 1, len(classes)) )
    np.random.seed(1)

    last_mean_error = 1
    # randomly initialize our weights with mean 0
    synapse_0 = 2*np.random.random((len(X[0]), hidden_neurons)) - 1
    synapse_1 = 2*np.random.random((hidden_neurons, len(classes))) - 1

    prev_synapse_0_weight_update = np.zeros_like(synapse_0)
    prev_synapse_1_weight_update = np.zeros_like(synapse_1)

    synapse_0_direction_count = np.zeros_like(synapse_0)
    synapse_1_direction_count = np.zeros_like(synapse_1)

    for j in range(epochs+1):
        # Feed forward through layers 0, 1, and 2
        layer_0 = X
        layer_1 = sigmoid(np.dot(layer_0, synapse_0))
        if (dropout):
            layer_1 *= np.random.binomial([np.ones((len(X), hidden_neurons))], 1-dropout_percent)[0] * (1.0/(1-dropout_percent))
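        # Inverted dropout: each hidden unit is kept with probability
        # 1-dropout_percent and the survivors are scaled by 1/(1-dropout_percent),
        # so the expected activation matches the plain forward pass used by
        # think() at prediction time.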
        layer_2 = sigmoid(np.dot(layer_1, synapse_1))

        # how much did we miss the target value?
        layer_2_error = y - layer_2

        if (j % 10000) == 0 and j > 5000:
            # if this 10k checkpoint's error is greater than the last one, break out
            if np.mean(np.abs(layer_2_error)) < last_mean_error:
                print ("delta after "+str(j)+" iterations:" + str(np.mean(np.abs(layer_2_error))) )
                last_mean_error = np.mean(np.abs(layer_2_error))
            else:
                print ("break:", np.mean(np.abs(layer_2_error)), ">", last_mean_error )
                break

        # in what direction is the target value?
        # were we really sure? if so, don't change too much.
        layer_2_delta = layer_2_error * sigmoid_output_to_derivative(layer_2)

        # how much did each l1 value contribute to the l2 error (according to the weights)?
        layer_1_error = layer_2_delta.dot(synapse_1.T)

        # in what direction is the target l1?
        # were we really sure? if so, don't change too much.
        layer_1_delta = layer_1_error * sigmoid_output_to_derivative(layer_1)

        synapse_1_weight_update = (layer_1.T.dot(layer_2_delta))
        synapse_0_weight_update = (layer_0.T.dot(layer_1_delta))
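        # Gradient-descent bookkeeping: with squared error E = 1/2*sum((y - layer_2)**2),
        # layer_2_delta is -dE/dz2 and layer_1_delta is -dE/dz1, so adding
        # alpha * layer.T.dot(delta) to the weights below moves them downhill on E.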
        if (j > 0):
            synapse_0_direction_count += np.abs(((synapse_0_weight_update > 0)+0) - ((prev_synapse_0_weight_update > 0) + 0))
            synapse_1_direction_count += np.abs(((synapse_1_weight_update > 0)+0) - ((prev_synapse_1_weight_update > 0) + 0))

        synapse_1 += alpha * synapse_1_weight_update
        synapse_0 += alpha * synapse_0_weight_update

        prev_synapse_0_weight_update = synapse_0_weight_update
        prev_synapse_1_weight_update = synapse_1_weight_update

    now = datetime.datetime.now()

    # persist synapses
    synapse = {'synapse0': synapse_0.tolist(), 'synapse1': synapse_1.tolist(),
               'datetime': now.strftime("%Y-%m-%d %H:%M"),
               'words': words,
               'classes': classes
               }
    synapse_file = "synapses.json"

    with open(synapse_file, 'w') as outfile:
        json.dump(synapse, outfile, indent=4, sort_keys=True)
    # print ("saved synapses to:", synapse_file)
print("******隐层中只使用了20个神经元,因此比较容易进行调节******")
X = np.array(training)
y = np.array(output)
start_time = time.time()
train(X, y, hidden_neurons=20, alpha=0.1, epochs=100000, dropout=False, dropout_percent=0.2)
elapsed_time = time.time() - start_time
print ("processing time:", elapsed_time, "seconds")
print("******预测一个句子属于某个分类的概率******")
# probability threshold
ERROR_THRESHOLD = 0.2
# load our calculated synapse values
synapse_file = 'synapses.json'
with open(synapse_file) as data_file:
    synapse = json.load(data_file)
synapse_0 = np.asarray(synapse['synapse0'])
synapse_1 = np.asarray(synapse['synapse1'])
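# Note: synapse_0 and synapse_1 are module-level names here; think() reads them
# as globals, so the classifications below use the weights just loaded from synapses.json.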
def classify(sentence, show_details=False):
    results = think(sentence, show_details)
    results = [[i, r] for i, r in enumerate(results) if r > ERROR_THRESHOLD]
    results.sort(key=lambda x: x[1], reverse=True)
    return_results = [[classes[r[0]], r[1]] for r in results]
    print ("%s \n classification: %s" % (sentence, return_results))
    return return_results
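# Note: classify() keeps every class whose score from think() exceeds
# ERROR_THRESHOLD (0.2 here) and sorts them by score, so a sentence can match
# zero, one, or several classes.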
classify("sudo make me a sandwich")
classify("how are you today?")
classify("talk to you tomorrow")
classify("who are you?")
classify("make me some lunch")
classify("how was your lunch today?")
print()
classify("good day", show_details=True)
# Program output:
# 12 documents
# 3 classes ['greeting', 'sandwich', 'goodbye']
# 26 unique stemmed words ['me', 'can', 'lunch', 'soon', 'good', 'to', 'for', 'see', 'a', 'is', 'what', 'day', "'s", 'going', 'ar', 'lat', 'today', 'it', 'hav', 'you', 'talk', 'yo', 'nic', 'sandwich', 'mak', 'how']
# ************
# [(['how', 'are', 'you', '?'], 'greeting'), (['how', 'is', 'your', 'day'], 'greeting'), (['good', 'day'], 'greeting'), (['how', 'is', 'it', 'going', 'today', '?'], 'greeting'), (['have', 'a', 'nice', 'day'], 'goodbye'), (['see', 'you', 'later'], 'goodbye'), (['have', 'a', 'nice', 'day'], 'goodbye'), (['talk', 'to', 'you', 'soon'], 'goodbye'), (['make', 'me', 'a', 'sandwich'], 'sandwich'), (['can', 'you', 'make', 'a', 'sandwich'], 'sandwich'), (['having', 'a', 'sandwich', 'today'], 'sandwich'), (['what', "'s", 'for', 'lunch', '?'], 'sandwich')]
# ['greeting', 'sandwich', 'goodbye']
# ['me', 'can', 'lunch', 'soon', 'good', 'to', 'for', 'see', 'a', 'is', 'what', 'day', "'s", 'going', 'ar', 'lat', 'today', 'it', 'hav', 'you', 'talk', 'yo', 'nic', 'sandwich', 'mak', 'how']
# ************
# ['how', 'are', 'you', '?']
# ['how', 'ar', 'you', '?']
# [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1]
# [1, 0, 0]
# ****** The code below implements the bag of words and converts an input sentence into a 0/1 array ******
# ****** Implement the neural network training function that adjusts the synapse weights ******
# ****** Only 20 neurons are used in the hidden layer, so it is easy to tune ******
# Training with 20 neurons, alpha:0.1, dropout:False
# Input matrix: 12x26 Output matrix: 1x3
# delta after 10000 iterations:0.006316297034425907
# delta after 20000 iterations:0.0043193475426743615
# delta after 30000 iterations:0.003467478418933735
# delta after 40000 iterations:0.0029698374148845906
# delta after 50000 iterations:0.0026348137914361477
# delta after 60000 iterations:0.0023899968815955614
# delta after 70000 iterations:0.0022012384383537555
# delta after 80000 iterations:0.0020500796559913266
# delta after 90000 iterations:0.0019255650109882149
# delta after 100000 iterations:0.00182073232414893
# processing time: 8.264868974685669 seconds
# ****** Predict the probability that a sentence belongs to each class ******
# sudo make me a sandwich
# classification: [['sandwich', 0.998725720350513]]
# how are you today?
# classification: [['greeting', 0.9991704815542843]]
# talk to you tomorrow
# classification: [['goodbye', 0.9916537770480427]]
# who are you?
# classification: [['greeting', 0.852891610128995]]
# make me some lunch
# classification: [['sandwich', 0.9756042223385346]]
# how was your lunch today?
# classification: [['greeting', 0.9854377884742492]]
#
# found in bag: good
# found in bag: day
# sentence: good day
# bow: [0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
# good day
# classification: [['greeting', 0.9966108820436639]]
#
# Process finished with exit code 0