GitHub: https://github.com/yjfiejd/Text_Classification_NN/blob/master/text_classification_6.28.py
# -*- coding:utf8 -*-
# @TIME : 2018/6/28 10:34 PM
# @Author : Allen
# @File : text_classification_6.28.py
import nltk
from nltk.stem.lancaster import LancasterStemmer
import os
import json
import datetime
stemmer = LancasterStemmer()
# three intent classes
training_data = []
training_data.append({"class":"greeting", "sentence":"how are you?"})
training_data.append({"class":"greeting", "sentence":"how is your day"})
training_data.append({"class":"greeting", "sentence":"good day"})
training_data.append({"class":"greeting", "sentence":"how is it going today?"})
training_data.append({"class":"goodbye", "sentence":"have a nice day"})
training_data.append({"class":"goodbye", "sentence":"see you later"})
training_data.append({"class":"goodbye", "sentence":"have a nice day"})
training_data.append({"class":"goodbye", "sentence":"talk to you soon"})
training_data.append({"class":"sandwich", "sentence":"make me a sandwich"})
training_data.append({"class":"sandwich", "sentence":"can you make a sandwich"})
training_data.append({"class":"sandwich", "sentence":"having a sandwich today"})
training_data.append({"class":"sandwich", "sentence":"what's for lunch?"})
# preprocessing
words = []
classes = []
documents = []
ignore_words = ['?']
for pattern in training_data:
    w = nltk.word_tokenize(pattern['sentence'])
    words.extend(w)
    documents.append((w, pattern['class']))
    if pattern['class'] not in classes:
        classes.append(pattern['class'])
words = [stemmer.stem(w.lower()) for w in words if w not in ignore_words]
words = list(set(words))
classes = list(set(classes))
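# Note: LancasterStemmer stems aggressively, which is why the vocabulary in the
# recorded output contains truncated stems such as 'ar' (are), 'hav' (have/having),
# 'lat' (later), 'mak' (make), 'nic' (nice) and 'yo' (your).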
print (len(documents), "documents")
print (len(classes), "classes", classes)
print (len(words), "unique stemmed words", words)
print("************")
print(documents)
print(classes)
print(words)
print("************")
# bag of words
training = []
output = []
output_empty = [0] * len(classes)
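# output_empty is a template one-hot row of length len(classes); the loop below
# sets the position of each document's class to 1 (e.g. with
# classes == ['greeting', 'sandwich', 'goodbye'] as in the recorded run, a
# 'greeting' sentence becomes [1, 0, 0]).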
for doc in documents:
    bag = []
    pattern_words = doc[0]
    pattern_words = [stemmer.stem(word.lower()) for word in pattern_words]
    for w in words:
        bag.append(1 if w in pattern_words else 0)
    training.append(bag)
    output_row = list(output_empty)
    output_row[classes.index(doc[1])] = 1
    output.append(output_row)
i = 0
w = documents[i][0]
print(w)
print([stemmer.stem(word.lower()) for word in w])
print(training[i])
print(output[i])
print("******下面的代码,实现了词袋处理并将输入句子转换为0、1数组******")
# Use sigmoid as the activation function and keep adjusting the weights until the error drops to an acceptable level
import numpy as np
import time
def sigmoid(x):
    output = 1/(1 + np.exp(-x))
    return output

def sigmoid_output_to_derivative(output):
    return output*(1-output)
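# Why output*(1-output)? For the logistic function s(x) = 1/(1+exp(-x)) the
# derivative is s'(x) = s(x)*(1-s(x)), so it can be computed from the activation
# alone without re-evaluating the exponential. Illustrative sanity check
# (not part of the original script):
#   eps = 1e-6
#   (sigmoid(0.5+eps) - sigmoid(0.5-eps)) / (2*eps)    # ~0.2350
#   sigmoid_output_to_derivative(sigmoid(0.5))         # same value, ~0.2350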
def clean_up_sentence(sentence):
    sentence_words = nltk.word_tokenize(sentence)
    sentence_words = [stemmer.stem(word.lower()) for word in sentence_words]
    return sentence_words
def bow(sentence, words, show_details=False):
    sentence_words = clean_up_sentence(sentence)
    bag = [0] * len(words)
    for s in sentence_words:
        for i, w in enumerate(words):
            if w == s:
                bag[i] = 1
                if show_details:
                    print("found in bag: %s" % w)
    return (np.array(bag))
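# Illustrative usage (hypothetical call; the exact 0/1 positions depend on the
# order of `words`, which varies between runs because the vocabulary is built
# from a set):
#   bow("make me a sandwich", words)
#   -> array of length len(words) with 1s at the indices of the stems
#      'mak', 'me', 'a' and 'sandwich', and 0s everywhere else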
def think(sentence, show_details=False):
    x = bow(sentence.lower(), words, show_details)
    if show_details:
        print ("sentence:", sentence, "\n bow:", x)
    # input layer is our bag of words
    l0 = x
    # matrix multiplication of input and hidden layer
    l1 = sigmoid(np.dot(l0, synapse_0))
    # output layer
    l2 = sigmoid(np.dot(l1, synapse_1))
    return l2
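# Shape check for the forward pass in the recorded run (26 stemmed words,
# 20 hidden neurons, 3 classes):
#   l0: (26,) -> l1 = sigmoid(l0 . synapse_0): (20,) -> l2 = sigmoid(l1 . synapse_1): (3,)
# so think() returns one score per class, in the same order as `classes`.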
print("******实现神经网络的训练函数来调整突触的权重******")
def train(X, y, hidden_neurons=10, alpha=1, epochs=50000, dropout=False, dropout_percent=0.5):
    print ("Training with %s neurons, alpha:%s, dropout:%s %s" % (hidden_neurons, str(alpha), dropout, dropout_percent if dropout else '') )
    print ("Input matrix: %sx%s Output matrix: %sx%s" % (len(X), len(X[0]), 1, len(classes)) )
    np.random.seed(1)

    last_mean_error = 1
    # randomly initialize our weights with mean 0
    synapse_0 = 2*np.random.random((len(X[0]), hidden_neurons)) - 1
    synapse_1 = 2*np.random.random((hidden_neurons, len(classes))) - 1

    prev_synapse_0_weight_update = np.zeros_like(synapse_0)
    prev_synapse_1_weight_update = np.zeros_like(synapse_1)

    synapse_0_direction_count = np.zeros_like(synapse_0)
    synapse_1_direction_count = np.zeros_like(synapse_1)

    for j in range(epochs+1):
        # Feed forward through layers 0, 1, and 2
        layer_0 = X
        layer_1 = sigmoid(np.dot(layer_0, synapse_0))
        if (dropout):
            layer_1 *= np.random.binomial([np.ones((len(X), hidden_neurons))], 1-dropout_percent)[0] * (1.0/(1-dropout_percent))
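        # Inverted dropout: each hidden unit is kept with probability
        # 1-dropout_percent and the survivors are scaled by 1/(1-dropout_percent),
        # so the expected activation matches the plain forward pass used by
        # think() at prediction time.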
        layer_2 = sigmoid(np.dot(layer_1, synapse_1))

        # how much did we miss the target value?
        layer_2_error = y - layer_2

        if (j % 10000) == 0 and j > 5000:
            # if this 10k checkpoint's error is greater than the last one, break out
            if np.mean(np.abs(layer_2_error)) < last_mean_error:
                print ("delta after "+str(j)+" iterations:" + str(np.mean(np.abs(layer_2_error))) )
                last_mean_error = np.mean(np.abs(layer_2_error))
            else:
                print ("break:", np.mean(np.abs(layer_2_error)), ">", last_mean_error )
                break

        # in what direction is the target value?
        # were we really sure? if so, don't change too much.
        layer_2_delta = layer_2_error * sigmoid_output_to_derivative(layer_2)

        # how much did each l1 value contribute to the l2 error (according to the weights)?
        layer_1_error = layer_2_delta.dot(synapse_1.T)

        # in what direction is the target l1?
        # were we really sure? if so, don't change too much.
        layer_1_delta = layer_1_error * sigmoid_output_to_derivative(layer_1)

        synapse_1_weight_update = (layer_1.T.dot(layer_2_delta))
        synapse_0_weight_update = (layer_0.T.dot(layer_1_delta))
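        # Gradient-descent bookkeeping: with squared error E = 1/2*sum((y - layer_2)**2),
        # layer_2_delta is -dE/dz2 and layer_1_delta is -dE/dz1, so adding
        # alpha * layer.T.dot(delta) to the weights below moves them downhill on E.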
        if (j > 0):
            synapse_0_direction_count += np.abs(((synapse_0_weight_update > 0)+0) - ((prev_synapse_0_weight_update > 0) + 0))
            synapse_1_direction_count += np.abs(((synapse_1_weight_update > 0)+0) - ((prev_synapse_1_weight_update > 0) + 0))

        synapse_1 += alpha * synapse_1_weight_update
        synapse_0 += alpha * synapse_0_weight_update

        prev_synapse_0_weight_update = synapse_0_weight_update
        prev_synapse_1_weight_update = synapse_1_weight_update

    now = datetime.datetime.now()

    # persist synapses
    synapse = {'synapse0': synapse_0.tolist(), 'synapse1': synapse_1.tolist(),
               'datetime': now.strftime("%Y-%m-%d %H:%M"),
               'words': words,
               'classes': classes
               }
    synapse_file = "synapses.json"

    with open(synapse_file, 'w') as outfile:
        json.dump(synapse, outfile, indent=4, sort_keys=True)
    # print ("saved synapses to:", synapse_file)
print("******隐层中只使用了20个神经元,因此比较容易进行调节******")
X = np.array(training)
y = np.array(output)
start_time = time.time()
train(X, y, hidden_neurons=20, alpha=0.1, epochs=100000, dropout=False, dropout_percent=0.2)
elapsed_time = time.time() - start_time
print ("processing time:", elapsed_time, "seconds")
print("******预测一个句子属于某个分类的概率******")
# probability threshold
ERROR_THRESHOLD = 0.2
# load our calculated synapse values
synapse_file = 'synapses.json'
with open(synapse_file) as data_file:
    synapse = json.load(data_file)
synapse_0 = np.asarray(synapse['synapse0'])
synapse_1 = np.asarray(synapse['synapse1'])
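# Note: synapse_0 and synapse_1 are module-level names here; think() reads them
# as globals, so the classifications below use the weights just loaded from synapses.json.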
def classify(sentence, show_details=False):
    results = think(sentence, show_details)
    results = [[i, r] for i, r in enumerate(results) if r > ERROR_THRESHOLD]
    results.sort(key=lambda x: x[1], reverse=True)
    return_results = [[classes[r[0]], r[1]] for r in results]
    print ("%s \n classification: %s" % (sentence, return_results))
    return return_results
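# Note: classify() keeps every class whose score from think() exceeds
# ERROR_THRESHOLD (0.2 here) and sorts them by score, so a sentence can match
# zero, one, or several classes.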
classify("sudo make me a sandwich")
classify("how are you today?")
classify("talk to you tomorrow")
classify("who are you?")
classify("make me some lunch")
classify("how was your lunch today?")
print()
classify("good day", show_details=True)
# Program output:
# 12 documents
# 3 classes ['greeting', 'sandwich', 'goodbye']
# 26 unique stemmed words ['me', 'can', 'lunch', 'soon', 'good', 'to', 'for', 'see', 'a', 'is', 'what', 'day', "'s", 'going', 'ar', 'lat', 'today', 'it', 'hav', 'you', 'talk', 'yo', 'nic', 'sandwich', 'mak', 'how']
# ************
# [(['how', 'are', 'you', '?'], 'greeting'), (['how', 'is', 'your', 'day'], 'greeting'), (['good', 'day'], 'greeting'), (['how', 'is', 'it', 'going', 'today', '?'], 'greeting'), (['have', 'a', 'nice', 'day'], 'goodbye'), (['see', 'you', 'later'], 'goodbye'), (['have', 'a', 'nice', 'day'], 'goodbye'), (['talk', 'to', 'you', 'soon'], 'goodbye'), (['make', 'me', 'a', 'sandwich'], 'sandwich'), (['can', 'you', 'make', 'a', 'sandwich'], 'sandwich'), (['having', 'a', 'sandwich', 'today'], 'sandwich'), (['what', "'s", 'for', 'lunch', '?'], 'sandwich')]
# ['greeting', 'sandwich', 'goodbye']
# ['me', 'can', 'lunch', 'soon', 'good', 'to', 'for', 'see', 'a', 'is', 'what', 'day', "'s", 'going', 'ar', 'lat', 'today', 'it', 'hav', 'you', 'talk', 'yo', 'nic', 'sandwich', 'mak', 'how']
# ************
# ['how', 'are', 'you', '?']
# ['how', 'ar', 'you', '?']
# [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1]
# [1, 0, 0]
# ****** The code below implements the bag of words and converts an input sentence into a 0/1 array ******
# ****** Implement the neural network training function that adjusts the synapse weights ******
# ****** Only 20 neurons are used in the hidden layer, so it is easy to tune ******
# Training with 20 neurons, alpha:0.1, dropout:False
# Input matrix: 12x26 Output matrix: 1x3
# delta after 10000 iterations:0.006316297034425907
# delta after 20000 iterations:0.0043193475426743615
# delta after 30000 iterations:0.003467478418933735
# delta after 40000 iterations:0.0029698374148845906
# delta after 50000 iterations:0.0026348137914361477
# delta after 60000 iterations:0.0023899968815955614
# delta after 70000 iterations:0.0022012384383537555
# delta after 80000 iterations:0.0020500796559913266
# delta after 90000 iterations:0.0019255650109882149
# delta after 100000 iterations:0.00182073232414893
# processing time: 8.264868974685669 seconds
# ****** Predict the probability that a sentence belongs to each class ******
# sudo make me a sandwich
# classification: [['sandwich', 0.998725720350513]]
# how are you today?
# classification: [['greeting', 0.9991704815542843]]
# talk to you tomorrow
# classification: [['goodbye', 0.9916537770480427]]
# who are you?
# classification: [['greeting', 0.852891610128995]]
# make me some lunch
# classification: [['sandwich', 0.9756042223385346]]
# how was your lunch today?
# classification: [['greeting', 0.9854377884742492]]
#
# found in bag: good
# found in bag: day
# sentence: good day
# bow: [0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
# good day
# classification: [['greeting', 0.9966108820436639]]
#
# Process finished with exit code 0